All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/2] f2fs: support revoking atomic written pages
@ 2015-12-29  3:12 ` Chao Yu
  0 siblings, 0 replies; 28+ messages in thread
From: Chao Yu @ 2015-12-29  3:12 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-f2fs-devel, linux-kernel

f2fs supports atomic write with the following semantics:
1. open db file
2. ioctl start atomic write
3. (write db file) * n
4. ioctl commit atomic write
5. close db file

With this flow we can avoid file becoming corrupted when abnormal power
cut, because we hold data of transaction in referenced pages linked in
inmem_pages list of inode, but without setting them dirty, so these data
won't be persisted unless we commit them in step 4.

But we should still hold journal db file in memory by using volatile write,
because our semantics of 'atomic write support' is not full, in step 4, we
could fail to submit all dirty data of the transaction; once partial dirty
data was committed in storage, db file should be corrupted, in this case,
we should use journal db to recover the original data in db file.

So this patch tries to improve atomic write flow, it adds a flow to support
revoking these partial submitted data of transaction when inner error
occurs. After this patch, the journal db file could be deprecated to reduce
memory footprint.

If revoking fails, EAGAIN will be reported to the user, suggesting a retry
of the current transaction.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
---
 fs/f2fs/data.c              |   1 +
 fs/f2fs/f2fs.h              |   4 +-
 fs/f2fs/file.c              |   2 +-
 fs/f2fs/recovery.c          |   2 +-
 fs/f2fs/segment.c           | 116 +++++++++++++++++++++++++++++++-------------
 fs/f2fs/segment.h           |   1 +
 include/trace/events/f2fs.h |   1 +
 7 files changed, 90 insertions(+), 37 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d506a0e..7175d33 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1052,6 +1052,7 @@ int do_write_data_page(struct f2fs_io_info *fio)
 		return err;
 
 	fio->blk_addr = dn.data_blkaddr;
+	fio->old_blkaddr = dn.data_blkaddr;
 
 	/* This page is already truncated */
 	if (fio->blk_addr == NULL_ADDR) {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7fbfee9..9ba6a09 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -679,6 +679,7 @@ enum page_type {
 	META_FLUSH,
 	INMEM,		/* the below types are used by tracepoints only. */
 	INMEM_DROP,
+	INMEM_REVOKE,
 	IPU,
 	OPU,
 };
@@ -688,6 +689,7 @@ struct f2fs_io_info {
 	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
 	int rw;			/* contains R/RS/W/WS with REQ_META/REQ_PRIO */
 	block_t blk_addr;	/* block address to be written */
+	block_t old_blkaddr;	/* old block address before Cow */
 	struct page *page;	/* page to be written */
 	struct page *encrypted_page;	/* encrypted page */
 };
@@ -1804,7 +1806,7 @@ void write_node_page(unsigned int, struct f2fs_io_info *);
 void write_data_page(struct dnode_of_data *, struct f2fs_io_info *);
 void rewrite_data_page(struct f2fs_io_info *);
 void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
-				block_t, block_t, unsigned char, bool);
+				block_t, block_t, unsigned char, bool, bool);
 void allocate_data_block(struct f2fs_sb_info *, struct page *,
 		block_t, block_t *, struct f2fs_summary *, int);
 void f2fs_wait_on_page_writeback(struct page *, enum page_type);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index cfe7f13..91d5abd 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -885,7 +885,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
 
 		get_node_info(sbi, dn.nid, &ni);
 		f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
-				ni.version, true);
+				ni.version, true, false);
 		f2fs_put_dnode(&dn);
 	} else {
 		struct page *psrc, *pdst;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 589b20b..581544d 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -467,7 +467,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 
 			/* write dummy data page */
 			f2fs_replace_block(sbi, &dn, src, dest,
-							ni.version, false);
+						ni.version, false, false);
 			recovered++;
 		}
 	}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 733f876..2145741f 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -191,24 +191,48 @@ void register_inmem_page(struct inode *inode, struct page *page)
 	trace_f2fs_register_inmem_page(page, INMEM);
 }
 
-static void __revoke_inmem_pages(struct inode *inode,
-							struct list_head *head)
+static int __revoke_inmem_pages(struct inode *inode,
+				struct list_head *head, bool drop, bool recover)
 {
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct inmem_pages *cur, *tmp;
+	int err = 0;
 
 	list_for_each_entry_safe(cur, tmp, head, list) {
-		trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
+		struct page *page = cur->page;
+
+		if (drop)
+			trace_f2fs_commit_inmem_page(page, INMEM_DROP);
+
+		lock_page(page);
 
-		lock_page(cur->page);
-		ClearPageUptodate(cur->page);
-		set_page_private(cur->page, 0);
-		ClearPagePrivate(cur->page);
-		f2fs_put_page(cur->page, 1);
+		if (recover) {
+			struct dnode_of_data dn;
+			struct node_info ni;
+
+			trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
+
+			set_new_dnode(&dn, inode, NULL, NULL, 0);
+			if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
+				err = -EAGAIN;
+				goto next;
+			}
+			get_node_info(sbi, dn.nid, &ni);
+			f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+					cur->old_addr, ni.version, true, true);
+			f2fs_put_dnode(&dn);
+		}
+next:
+		ClearPageUptodate(page);
+		set_page_private(page, 0);
+	ClearPagePrivate(page);
+		f2fs_put_page(page, 1);
 
 		list_del(&cur->list);
 		kmem_cache_free(inmem_entry_slab, cur);
 		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
 	}
+	return err;
 }
 
 void drop_inmem_pages(struct inode *inode)
@@ -216,11 +240,12 @@ void drop_inmem_pages(struct inode *inode)
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 
 	mutex_lock(&fi->inmem_lock);
-	__revoke_inmem_pages(inode, &fi->inmem_pages);
+	__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
 	mutex_unlock(&fi->inmem_lock);
 }
 
-static int __commit_inmem_pages(struct inode *inode)
+static int __commit_inmem_pages(struct inode *inode,
+					struct list_head *revoke_list)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -235,34 +260,39 @@ static int __commit_inmem_pages(struct inode *inode)
 	int err = 0;
 
 	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
-		lock_page(cur->page);
-		if (cur->page->mapping == inode->i_mapping) {
-			set_page_dirty(cur->page);
-			f2fs_wait_on_page_writeback(cur->page, DATA);
-			if (clear_page_dirty_for_io(cur->page))
+		struct page *page = cur->page;
+
+		lock_page(page);
+		if (page->mapping == inode->i_mapping) {
+			trace_f2fs_commit_inmem_page(page, INMEM);
+
+			set_page_dirty(page);
+			f2fs_wait_on_page_writeback(page, DATA);
+			if (clear_page_dirty_for_io(page))
 				inode_dec_dirty_pages(inode);
-			trace_f2fs_commit_inmem_page(cur->page, INMEM);
-			fio.page = cur->page;
+
+			fio.page = page;
 			err = do_write_data_page(&fio);
 			if (err) {
-				unlock_page(cur->page);
+				unlock_page(page);
 				break;
 			}
-			clear_cold_data(cur->page);
+
+			/* record old blkaddr for revoking */
+			cur->old_addr = fio.old_blkaddr;
+
+			clear_cold_data(page);
 			submit_bio = true;
 		}
+		unlock_page(page);
+		list_move_tail(&cur->list, revoke_list);
+	}
 
-		set_page_private(cur->page, 0);
-		ClearPagePrivate(cur->page);
-		f2fs_put_page(cur->page, 1);
+	f2fs_submit_merged_bio(sbi, DATA, WRITE);
 
-		list_del(&cur->list);
-		kmem_cache_free(inmem_entry_slab, cur);
-		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
-	}
+	if (!err)
+		__revoke_inmem_pages(inode, revoke_list, false, false);
 
-	if (submit_bio)
-		f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	return err;
 }
 
@@ -270,13 +300,29 @@ int commit_inmem_pages(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
-	int err = 0;
+	struct list_head revoke_list;
+	int err;
 
+	INIT_LIST_HEAD(&revoke_list);
 	f2fs_balance_fs(sbi);
 	f2fs_lock_op(sbi);
 
 	mutex_lock(&fi->inmem_lock);
-	err = __commit_inmem_pages(inode);
+	err = __commit_inmem_pages(inode, &revoke_list);
+	if (err) {
+		int ret;
+		/*
+		 * try to revoke all committed pages, but still we could fail
+		 * to revoke due to no memory or other reason, so if that
+		 * happened, return EAGAIN to user.
+		 */
+		ret = __revoke_inmem_pages(inode, &revoke_list, false, true);
+		if (ret)
+			err = ret;
+
+		/* drop all uncommitted pages */
+		__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+	}
 	mutex_unlock(&fi->inmem_lock);
 
 	f2fs_unlock_op(sbi);
@@ -1357,7 +1403,7 @@ void rewrite_data_page(struct f2fs_io_info *fio)
 static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
 				struct f2fs_summary *sum,
 				block_t old_blkaddr, block_t new_blkaddr,
-				bool recover_curseg)
+				bool recover_curseg, bool recover_newaddr)
 {
 	struct sit_info *sit_i = SIT_I(sbi);
 	struct curseg_info *curseg;
@@ -1400,7 +1446,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
 	curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
 	__add_sum_entry(sbi, type, sum);
 
-	if (!recover_curseg)
+	if (!recover_curseg || recover_newaddr)
 		update_sit_entry(sbi, new_blkaddr, 1);
 	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
 		update_sit_entry(sbi, old_blkaddr, -1);
@@ -1424,13 +1470,15 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
 
 void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
 				block_t old_addr, block_t new_addr,
-				unsigned char version, bool recover_curseg)
+				unsigned char version, bool recover_curseg,
+				bool recover_newaddr)
 {
 	struct f2fs_summary sum;
 
 	set_summary(&sum, dn->nid, dn->ofs_in_node, version);
 
-	__f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg);
+	__f2fs_replace_block(sbi, &sum, old_addr, new_addr,
+					recover_curseg, recover_newaddr);
 
 	dn->data_blkaddr = new_addr;
 	set_data_blkaddr(dn);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index ee44d34..5146eb7 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -191,6 +191,7 @@ struct segment_allocation {
 struct inmem_pages {
 	struct list_head list;
 	struct page *page;
+	block_t old_addr;		/* for revoking when fail to commit */
 };
 
 struct sit_info {
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index a1b4888..851f158 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -52,6 +52,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 		{ META_FLUSH,	"META_FLUSH" },				\
 		{ INMEM,	"INMEM" },				\
 		{ INMEM_DROP,	"INMEM_DROP" },				\
+		{ INMEM_REVOKE,	"INMEM_REVOKE" },			\
 		{ IPU,		"IN-PLACE" },				\
 		{ OPU,		"OUT-OF-PLACE" })
 
-- 
2.6.3



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH 2/2] f2fs: support revoking atomic written pages
@ 2015-12-29  3:12 ` Chao Yu
  0 siblings, 0 replies; 28+ messages in thread
From: Chao Yu @ 2015-12-29  3:12 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel

f2fs supports atomic write with the following semantics:
1. open db file
2. ioctl start atomic write
3. (write db file) * n
4. ioctl commit atomic write
5. close db file

With this flow we can avoid file becoming corrupted when abnormal power
cut, because we hold data of transaction in referenced pages linked in
inmem_pages list of inode, but without setting them dirty, so these data
won't be persisted unless we commit them in step 4.

But we should still hold journal db file in memory by using volatile write,
because our semantics of 'atomic write support' is not full, in step 4, we
could fail to submit all dirty data of the transaction; once partial dirty
data was committed in storage, db file should be corrupted, in this case,
we should use journal db to recover the original data in db file.

So this patch tries to improve atomic write flow, it adds a flow to support
revoking these partial submitted data of transaction when inner error
occurs. After this patch, the journal db file could be deprecated to reduce
memory footprint.

If revoking fails, EAGAIN will be reported to the user, suggesting a retry
of the current transaction.

Signed-off-by: Chao Yu <chao2.yu@samsung.com>
---
 fs/f2fs/data.c              |   1 +
 fs/f2fs/f2fs.h              |   4 +-
 fs/f2fs/file.c              |   2 +-
 fs/f2fs/recovery.c          |   2 +-
 fs/f2fs/segment.c           | 116 +++++++++++++++++++++++++++++++-------------
 fs/f2fs/segment.h           |   1 +
 include/trace/events/f2fs.h |   1 +
 7 files changed, 90 insertions(+), 37 deletions(-)

diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index d506a0e..7175d33 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -1052,6 +1052,7 @@ int do_write_data_page(struct f2fs_io_info *fio)
 		return err;
 
 	fio->blk_addr = dn.data_blkaddr;
+	fio->old_blkaddr = dn.data_blkaddr;
 
 	/* This page is already truncated */
 	if (fio->blk_addr == NULL_ADDR) {
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 7fbfee9..9ba6a09 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -679,6 +679,7 @@ enum page_type {
 	META_FLUSH,
 	INMEM,		/* the below types are used by tracepoints only. */
 	INMEM_DROP,
+	INMEM_REVOKE,
 	IPU,
 	OPU,
 };
@@ -688,6 +689,7 @@ struct f2fs_io_info {
 	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
 	int rw;			/* contains R/RS/W/WS with REQ_META/REQ_PRIO */
 	block_t blk_addr;	/* block address to be written */
+	block_t old_blkaddr;	/* old block address before Cow */
 	struct page *page;	/* page to be written */
 	struct page *encrypted_page;	/* encrypted page */
 };
@@ -1804,7 +1806,7 @@ void write_node_page(unsigned int, struct f2fs_io_info *);
 void write_data_page(struct dnode_of_data *, struct f2fs_io_info *);
 void rewrite_data_page(struct f2fs_io_info *);
 void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *,
-				block_t, block_t, unsigned char, bool);
+				block_t, block_t, unsigned char, bool, bool);
 void allocate_data_block(struct f2fs_sb_info *, struct page *,
 		block_t, block_t *, struct f2fs_summary *, int);
 void f2fs_wait_on_page_writeback(struct page *, enum page_type);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index cfe7f13..91d5abd 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -885,7 +885,7 @@ static int __exchange_data_block(struct inode *inode, pgoff_t src,
 
 		get_node_info(sbi, dn.nid, &ni);
 		f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr,
-				ni.version, true);
+				ni.version, true, false);
 		f2fs_put_dnode(&dn);
 	} else {
 		struct page *psrc, *pdst;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 589b20b..581544d 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -467,7 +467,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
 
 			/* write dummy data page */
 			f2fs_replace_block(sbi, &dn, src, dest,
-							ni.version, false);
+						ni.version, false, false);
 			recovered++;
 		}
 	}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 733f876..2145741f 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -191,24 +191,48 @@ void register_inmem_page(struct inode *inode, struct page *page)
 	trace_f2fs_register_inmem_page(page, INMEM);
 }
 
-static void __revoke_inmem_pages(struct inode *inode,
-							struct list_head *head)
+static int __revoke_inmem_pages(struct inode *inode,
+				struct list_head *head, bool drop, bool recover)
 {
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct inmem_pages *cur, *tmp;
+	int err = 0;
 
 	list_for_each_entry_safe(cur, tmp, head, list) {
-		trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
+		struct page *page = cur->page;
+
+		if (drop)
+			trace_f2fs_commit_inmem_page(page, INMEM_DROP);
+
+		lock_page(page);
 
-		lock_page(cur->page);
-		ClearPageUptodate(cur->page);
-		set_page_private(cur->page, 0);
-		ClearPagePrivate(cur->page);
-		f2fs_put_page(cur->page, 1);
+		if (recover) {
+			struct dnode_of_data dn;
+			struct node_info ni;
+
+			trace_f2fs_commit_inmem_page(page, INMEM_REVOKE);
+
+			set_new_dnode(&dn, inode, NULL, NULL, 0);
+			if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) {
+				err = -EAGAIN;
+				goto next;
+			}
+			get_node_info(sbi, dn.nid, &ni);
+			f2fs_replace_block(sbi, &dn, dn.data_blkaddr,
+					cur->old_addr, ni.version, true, true);
+			f2fs_put_dnode(&dn);
+		}
+next:
+		ClearPageUptodate(page);
+		set_page_private(page, 0);
+	ClearPagePrivate(page);
+		f2fs_put_page(page, 1);
 
 		list_del(&cur->list);
 		kmem_cache_free(inmem_entry_slab, cur);
 		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
 	}
+	return err;
 }
 
 void drop_inmem_pages(struct inode *inode)
@@ -216,11 +240,12 @@ void drop_inmem_pages(struct inode *inode)
 	struct f2fs_inode_info *fi = F2FS_I(inode);
 
 	mutex_lock(&fi->inmem_lock);
-	__revoke_inmem_pages(inode, &fi->inmem_pages);
+	__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
 	mutex_unlock(&fi->inmem_lock);
 }
 
-static int __commit_inmem_pages(struct inode *inode)
+static int __commit_inmem_pages(struct inode *inode,
+					struct list_head *revoke_list)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
@@ -235,34 +260,39 @@ static int __commit_inmem_pages(struct inode *inode)
 	int err = 0;
 
 	list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
-		lock_page(cur->page);
-		if (cur->page->mapping == inode->i_mapping) {
-			set_page_dirty(cur->page);
-			f2fs_wait_on_page_writeback(cur->page, DATA);
-			if (clear_page_dirty_for_io(cur->page))
+		struct page *page = cur->page;
+
+		lock_page(page);
+		if (page->mapping == inode->i_mapping) {
+			trace_f2fs_commit_inmem_page(page, INMEM);
+
+			set_page_dirty(page);
+			f2fs_wait_on_page_writeback(page, DATA);
+			if (clear_page_dirty_for_io(page))
 				inode_dec_dirty_pages(inode);
-			trace_f2fs_commit_inmem_page(cur->page, INMEM);
-			fio.page = cur->page;
+
+			fio.page = page;
 			err = do_write_data_page(&fio);
 			if (err) {
-				unlock_page(cur->page);
+				unlock_page(page);
 				break;
 			}
-			clear_cold_data(cur->page);
+
+			/* record old blkaddr for revoking */
+			cur->old_addr = fio.old_blkaddr;
+
+			clear_cold_data(page);
 			submit_bio = true;
 		}
+		unlock_page(page);
+		list_move_tail(&cur->list, revoke_list);
+	}
 
-		set_page_private(cur->page, 0);
-		ClearPagePrivate(cur->page);
-		f2fs_put_page(cur->page, 1);
+	f2fs_submit_merged_bio(sbi, DATA, WRITE);
 
-		list_del(&cur->list);
-		kmem_cache_free(inmem_entry_slab, cur);
-		dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
-	}
+	if (!err)
+		__revoke_inmem_pages(inode, revoke_list, false, false);
 
-	if (submit_bio)
-		f2fs_submit_merged_bio(sbi, DATA, WRITE);
 	return err;
 }
 
@@ -270,13 +300,29 @@ int commit_inmem_pages(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct f2fs_inode_info *fi = F2FS_I(inode);
-	int err = 0;
+	struct list_head revoke_list;
+	int err;
 
+	INIT_LIST_HEAD(&revoke_list);
 	f2fs_balance_fs(sbi);
 	f2fs_lock_op(sbi);
 
 	mutex_lock(&fi->inmem_lock);
-	err = __commit_inmem_pages(inode);
+	err = __commit_inmem_pages(inode, &revoke_list);
+	if (err) {
+		int ret;
+		/*
+		 * try to revoke all committed pages, but still we could fail
+		 * to revoke due to no memory or other reason, so if that
+		 * happened, return EAGAIN to user.
+		 */
+		ret = __revoke_inmem_pages(inode, &revoke_list, false, true);
+		if (ret)
+			err = ret;
+
+		/* drop all uncommitted pages */
+		__revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+	}
 	mutex_unlock(&fi->inmem_lock);
 
 	f2fs_unlock_op(sbi);
@@ -1357,7 +1403,7 @@ void rewrite_data_page(struct f2fs_io_info *fio)
 static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
 				struct f2fs_summary *sum,
 				block_t old_blkaddr, block_t new_blkaddr,
-				bool recover_curseg)
+				bool recover_curseg, bool recover_newaddr)
 {
 	struct sit_info *sit_i = SIT_I(sbi);
 	struct curseg_info *curseg;
@@ -1400,7 +1446,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
 	curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr);
 	__add_sum_entry(sbi, type, sum);
 
-	if (!recover_curseg)
+	if (!recover_curseg || recover_newaddr)
 		update_sit_entry(sbi, new_blkaddr, 1);
 	if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
 		update_sit_entry(sbi, old_blkaddr, -1);
@@ -1424,13 +1470,15 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi,
 
 void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
 				block_t old_addr, block_t new_addr,
-				unsigned char version, bool recover_curseg)
+				unsigned char version, bool recover_curseg,
+				bool recover_newaddr)
 {
 	struct f2fs_summary sum;
 
 	set_summary(&sum, dn->nid, dn->ofs_in_node, version);
 
-	__f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg);
+	__f2fs_replace_block(sbi, &sum, old_addr, new_addr,
+					recover_curseg, recover_newaddr);
 
 	dn->data_blkaddr = new_addr;
 	set_data_blkaddr(dn);
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index ee44d34..5146eb7 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -191,6 +191,7 @@ struct segment_allocation {
 struct inmem_pages {
 	struct list_head list;
 	struct page *page;
+	block_t old_addr;		/* for revoking when fail to commit */
 };
 
 struct sit_info {
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index a1b4888..851f158 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -52,6 +52,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD);
 		{ META_FLUSH,	"META_FLUSH" },				\
 		{ INMEM,	"INMEM" },				\
 		{ INMEM_DROP,	"INMEM_DROP" },				\
+		{ INMEM_REVOKE,	"INMEM_REVOKE" },			\
 		{ IPU,		"IN-PLACE" },				\
 		{ OPU,		"OUT-OF-PLACE" })
 
-- 
2.6.3



------------------------------------------------------------------------------

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
  2015-12-29  3:12 ` Chao Yu
@ 2015-12-30  0:05   ` Jaegeuk Kim
  -1 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2015-12-30  0:05 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-f2fs-devel, linux-kernel

Hi Chao,

On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> f2fs support atomic write with following semantics:
> 1. open db file
> 2. ioctl start atomic write
> 3. (write db file) * n
> 4. ioctl commit atomic write
> 5. close db file
> 
> With this flow we can avoid file becoming corrupted when abnormal power
> cut, because we hold data of transaction in referenced pages linked in
> inmem_pages list of inode, but without setting them dirty, so these data
> won't be persisted unless we commit them in step 4.
> 
> But we should still hold journal db file in memory by using volatile write,
> because our semantics of 'atomic write support' is not full, in step 4, we
> could be fail to submit all dirty data of transaction, once partial dirty
> data was committed in storage, db file should be corrupted, in this case,
> we should use journal db to recover the original data in db file.

Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
since database should get its error literally.

So, the only thing that we need to do is keeping journal data for further db
recovery. But, unfortunately, it seems that something is missing in the
current implementation.

So simply how about this?

A possible flow would be:
1. write journal data to volatile space
2. write db data to atomic space
3. in the error case, call ioc_abort_volatile_writes for both journal and db
 - flush/fsync journal data to disk
 - drop atomic data, and will be recovered by database with journal

>From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 29 Dec 2015 15:46:33 -0800
Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write

There are two rules to handle aborting volatile or atomic writes.

1. drop atomic writes
 - we don't need to keep any stale db data.

2. write journal data
 - we should keep the journal data with fsync for db recovery.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 91f576a..d16438a 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 	if (ret)
 		return ret;
 
-	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
-	commit_inmem_pages(inode, true);
+	if (f2fs_is_atomic_file(inode)) {
+		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+		commit_inmem_pages(inode, true);
+	}
+	if (f2fs_is_volatile_file(inode)) {
+		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+		ret = commit_inmem_pages(inode, false);
+		if (!ret)
+			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+	}
 
 	mnt_drop_write_file(filp);
 	return ret;
-- 
2.6.3


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
@ 2015-12-30  0:05   ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2015-12-30  0:05 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> f2fs support atomic write with following semantics:
> 1. open db file
> 2. ioctl start atomic write
> 3. (write db file) * n
> 4. ioctl commit atomic write
> 5. close db file
> 
> With this flow we can avoid file becoming corrupted when abnormal power
> cut, because we hold data of transaction in referenced pages linked in
> inmem_pages list of inode, but without setting them dirty, so these data
> won't be persisted unless we commit them in step 4.
> 
> But we should still hold journal db file in memory by using volatile write,
> because our semantics of 'atomic write support' is not full, in step 4, we
> could be fail to submit all dirty data of transaction, once partial dirty
> data was committed in storage, db file should be corrupted, in this case,
> we should use journal db to recover the original data in db file.

Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
since database should get its error literally.

So, the only thing that we need to do is keeping journal data for further db
recovery. But, unfortunately, it seems that something is missing in the
current implementation.

So simply how about this?

A possible flow would be:
1. write journal data to volatile space
2. write db data to atomic space
3. in the error case, call ioc_abort_volatile_writes for both journal and db
 - flush/fsync journal data to disk
 - drop atomic data, and will be recovered by database with journal

>From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
From: Jaegeuk Kim <jaegeuk@kernel.org>
Date: Tue, 29 Dec 2015 15:46:33 -0800
Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write

There are two rules to handle aborting volatile or atomic writes.

1. drop atomic writes
 - we don't need to keep any stale db data.

2. write journal data
 - we should keep the journal data with fsync for db recovery.

Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
---
 fs/f2fs/file.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 91f576a..d16438a 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 	if (ret)
 		return ret;
 
-	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
-	commit_inmem_pages(inode, true);
+	if (f2fs_is_atomic_file(inode)) {
+		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+		commit_inmem_pages(inode, true);
+	}
+	if (f2fs_is_volatile_file(inode)) {
+		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+		ret = commit_inmem_pages(inode, false);
+		if (!ret)
+			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+	}
 
 	mnt_drop_write_file(filp);
 	return ret;
-- 
2.6.3


------------------------------------------------------------------------------

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* RE: [PATCH 2/2] f2fs: support revoking atomic written pages
  2015-12-30  0:05   ` Jaegeuk Kim
  (?)
@ 2015-12-30  1:34   ` Chao Yu
  2015-12-30 15:35     ` [f2fs-dev] " Chao Yu
  2015-12-30 19:41     ` Jaegeuk Kim
  -1 siblings, 2 replies; 28+ messages in thread
From: Chao Yu @ 2015-12-30  1:34 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-f2fs-devel, linux-kernel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Wednesday, December 30, 2015 8:05 AM
> To: Chao Yu
> Cc: linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH 2/2] f2fs: support revoking atomic written pages
> 
> Hi Chao,
> 
> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > f2fs support atomic write with following semantics:
> > 1. open db file
> > 2. ioctl start atomic write
> > 3. (write db file) * n
> > 4. ioctl commit atomic write
> > 5. close db file
> >
> > With this flow we can avoid file becoming corrupted when abnormal power
> > cut, because we hold data of transaction in referenced pages linked in
> > inmem_pages list of inode, but without setting them dirty, so these data
> > won't be persisted unless we commit them in step 4.
> >
> > But we should still hold journal db file in memory by using volatile write,
> > because our semantics of 'atomic write support' is not full, in step 4, we
> > could be fail to submit all dirty data of transaction, once partial dirty
> > data was committed in storage, db file should be corrupted, in this case,
> > we should use journal db to recover the original data in db file.
> 
> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> since database should get its error literally.
> 
> So, the only thing that we need to do is keeping journal data for further db
> recovery.

IMO, if we really support *atomic* interface, we don't need any journal data
kept by user, because f2fs already have it in its storage since we always
trigger OPU for pages written in atomic-write opened file, f2fs can easily try
to revoke (replace old to new in metadata) when any failure exist in atomic
write process.

But in current design, we still hold journal data in memory for recovering for
*rare* failure case. I think there are several issues:
a) most of time, we are in concurrent scenario, so if large number of journal
db files were opened simultaneously, we are under big memory pressure.
b) If we are out of memory, reclaimer tries to write page of journal db into
disk, it will destroy db file.
c) Though, we have journal db file, we will face failure of recovering db file
from journal db due to ENOMEM or EIO, then db file will be corrupted.
d) Recovery flow will make data page dirty, triggering both data stream and
metadata stream, there should be more IOs than in inner revoking in
atomic-interface.
e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
recover, checkpoint will persist the corrupt data in db file, following abnormal
power-cut will leave that data in disk.

With the revoking-supported design, we cannot solve all of the above issues — we
will still face the same issue as in c) — but it will be a big improvement if we
can apply this in our interface, since it provides a way to fix issues a), b)
and d). And also for the e) case, we try to rescue the data at the first
opportunity, as our revoking operation would be protected by f2fs_lock_op to
avoid checkpoint + power-cut.

If you don't want to have a big change in this interface or the recovery flow,
how about keeping them both, and adding a mount option to control the inner
recovery flow?

How do you think? :)

Thanks,

> But, unfortunately, it seems that something is missing in the
> current implementation.
> 
> So simply how about this?
> 
> A possible flow would be:
> 1. write journal data to volatile space
> 2. write db data to atomic space
> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
>  - flush/fsync journal data to disk
>  - drop atomic data, and will be recovered by database with journal
> 
> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> From: Jaegeuk Kim <jaegeuk@kernel.org>
> Date: Tue, 29 Dec 2015 15:46:33 -0800
> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> 
> There are two rules to handle aborting volatile or atomic writes.
> 
> 1. drop atomic writes
>  - we don't need to keep any stale db data.
> 
> 2. write journal data
>  - we should keep the journal data with fsync for db recovery.
> 
> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> ---
>  fs/f2fs/file.c | 13 ++++++++++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> index 91f576a..d16438a 100644
> --- a/fs/f2fs/file.c
> +++ b/fs/f2fs/file.c
> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
>  	if (ret)
>  		return ret;
> 
> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> -	commit_inmem_pages(inode, true);
> +	if (f2fs_is_atomic_file(inode)) {
> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> +		commit_inmem_pages(inode, true);
> +	}
> +	if (f2fs_is_volatile_file(inode)) {
> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> +		ret = commit_inmem_pages(inode, false);
> +		if (!ret)
> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> +	}
> 
>  	mnt_drop_write_file(filp);
>  	return ret;
> --
> 2.6.3



^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2015-12-30  1:34   ` Chao Yu
@ 2015-12-30 15:35     ` Chao Yu
  2015-12-30 19:43       ` Jaegeuk Kim
  2015-12-30 19:41     ` Jaegeuk Kim
  1 sibling, 1 reply; 28+ messages in thread
From: Chao Yu @ 2015-12-30 15:35 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel

On 12/30/15 9:34 AM, Chao Yu wrote:
> Hi Jaegeuk,
> 
>> -----Original Message-----
>> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
>> Sent: Wednesday, December 30, 2015 8:05 AM
>> To: Chao Yu
>> Cc: linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
>> Subject: Re: [PATCH 2/2] f2fs: support revoking atomic written pages
>>
>> Hi Chao,
>>
>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
>>> f2fs support atomic write with following semantics:
>>> 1. open db file
>>> 2. ioctl start atomic write
>>> 3. (write db file) * n
>>> 4. ioctl commit atomic write
>>> 5. close db file
>>>
>>> With this flow we can avoid file becoming corrupted when abnormal power
>>> cut, because we hold data of transaction in referenced pages linked in
>>> inmem_pages list of inode, but without setting them dirty, so these data
>>> won't be persisted unless we commit them in step 4.
>>>
>>> But we should still hold journal db file in memory by using volatile write,
>>> because our semantics of 'atomic write support' is not full, in step 4, we
>>> could be fail to submit all dirty data of transaction, once partial dirty
>>> data was committed in storage, db file should be corrupted, in this case,
>>> we should use journal db to recover the original data in db file.
>>
>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
>> since database should get its error literally.
>>
>> So, the only thing that we need to do is keeping journal data for further db
>> recovery.
> 
> IMO, if we really support *atomic* interface, we don't need any journal data
> kept by user, because f2fs already have it in its storage since we always
> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> to revoke (replace old to new in metadata) when any failure exist in atomic
> write process.
> 
> But in current design, we still hold journal data in memory for recovering for
> *rare* failure case. I think there are several issues:
> a) most of time, we are in concurrent scenario, so if large number of journal
> db files were opened simultaneously, we are under big memory pressure.
> b) If we are out of memory, reclaimer tries to write page of journal db into
> disk, it will destroy db file.
> c) Though, we have journal db file, we will face failure of recovering db file
> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> d) Recovery flow will make data page dirty, triggering both data stream and
> metadata stream, there should be more IOs than in inner revoking in
> atomic-interface.
> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> recover, checkpoint will persist the corrupt data in db file, following abnormal
> power-cut will leave that data in disk.
> 
> With revoking supported design, we can not solve all above issues, we will still
> face the same issue like c), but it will be a big improve if we can apply this
> in our interface, since it provide a way to fix the issue a) b) d). And also for
> e) case, we try to rescue data in first time that our revoking operation would be
> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> 
> If you don't want to have a big change in this interface or recovery flow, how
> about keep them both, and add a mount option to control inner recovery flow?

Or introduce F2FS_IOC_COMMIT_ATOMIC_WRITE_V2 for a revoking-supported interface?

> 
> How do you think? :)
> 
> Thanks,
> 
>> But, unfortunately, it seems that something is missing in the
>> current implementation.
>>
>> So simply how about this?
>>
>> A possible flow would be:
>> 1. write journal data to volatile space
>> 2. write db data to atomic space
>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
>>  - flush/fsync journal data to disk
>>  - drop atomic data, and will be recovered by database with journal
>>
>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
>> From: Jaegeuk Kim <jaegeuk@kernel.org>
>> Date: Tue, 29 Dec 2015 15:46:33 -0800
>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
>>
>> There are two rules to handle aborting volatile or atomic writes.
>>
>> 1. drop atomic writes
>>  - we don't need to keep any stale db data.
>>
>> 2. write journal data
>>  - we should keep the journal data with fsync for db recovery.
>>
>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
>> ---
>>  fs/f2fs/file.c | 13 ++++++++++---
>>  1 file changed, 10 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>> index 91f576a..d16438a 100644
>> --- a/fs/f2fs/file.c
>> +++ b/fs/f2fs/file.c
>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
>>  	if (ret)
>>  		return ret;
>>
>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
>> -	commit_inmem_pages(inode, true);
>> +	if (f2fs_is_atomic_file(inode)) {
>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
>> +		commit_inmem_pages(inode, true);
>> +	}
>> +	if (f2fs_is_volatile_file(inode)) {
>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
>> +		ret = commit_inmem_pages(inode, false);

Do any more inmem pages exist here? Shouldn't these pages have been released above?

Thanks,

>> +		if (!ret)
>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
>> +	}
>>
>>  	mnt_drop_write_file(filp);
>>  	return ret;
>> --
>> 2.6.3
> 
> 
> 
> ------------------------------------------------------------------------------
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
  2015-12-30  1:34   ` Chao Yu
  2015-12-30 15:35     ` [f2fs-dev] " Chao Yu
@ 2015-12-30 19:41     ` Jaegeuk Kim
  2015-12-31  9:16       ` Chao Yu
  1 sibling, 1 reply; 28+ messages in thread
From: Jaegeuk Kim @ 2015-12-30 19:41 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-f2fs-devel, linux-kernel

Hello,

On Wed, Dec 30, 2015 at 09:34:40AM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Wednesday, December 30, 2015 8:05 AM
> > To: Chao Yu
> > Cc: linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > Subject: Re: [PATCH 2/2] f2fs: support revoking atomic written pages
> > 
> > Hi Chao,
> > 
> > On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > f2fs support atomic write with following semantics:
> > > 1. open db file
> > > 2. ioctl start atomic write
> > > 3. (write db file) * n
> > > 4. ioctl commit atomic write
> > > 5. close db file
> > >
> > > With this flow we can avoid file becoming corrupted when abnormal power
> > > cut, because we hold data of transaction in referenced pages linked in
> > > inmem_pages list of inode, but without setting them dirty, so these data
> > > won't be persisted unless we commit them in step 4.
> > >
> > > But we should still hold journal db file in memory by using volatile write,
> > > because our semantics of 'atomic write support' is not full, in step 4, we
> > > could be fail to submit all dirty data of transaction, once partial dirty
> > > data was committed in storage, db file should be corrupted, in this case,
> > > we should use journal db to recover the original data in db file.
> > 
> > Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > since database should get its error literally.
> > 
> > So, the only thing that we need to do is keeping journal data for further db
> > recovery.
> 
> IMO, if we really support *atomic* interface, we don't need any journal data
> kept by user, because f2fs already have it in its storage since we always
> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> to revoke (replace old to new in metadata) when any failure exist in atomic
> write process.

Yeah, so current design does not fully support atomic writes. IOWs, volatile
writes for journal files should be used together to minimize sqlite change as
much as possible.

> But in current design, we still hold journal data in memory for recovering for
> *rare* failure case. I think there are several issues:
> a) most of time, we are in concurrent scenario, so if large number of journal
> db files were opened simultaneously, we are under big memory pressure.

In current android, I've seen that this is not a big concern. Even there is
memory pressure, f2fs flushes volatile pages.

> b) If we are out of memory, reclaimer tries to write page of journal db into
> disk, it will destroy db file.

I don't understand. Could you elaborate why journal writes can corrupt db?

> c) Though, we have journal db file, we will face failure of recovering db file
> from journal db due to ENOMEM or EIO, then db file will be corrupted.

Do you mean the failure of recovering db with a complete journal?
Why do we have to handle that? That's a database stuff, IMO.

> d) Recovery flow will make data page dirty, triggering both data stream and
> metadata stream, there should be more IOs than in inner revoking in
> atomic-interface.

Well, do you mean there is no need to recover db after revoking?

> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> recover, checkpoint will persist the corrupt data in db file, following abnormal
> power-cut will leave that data in disk.

Yes, in that case, database should recover corrupted db with its journal file.

> With revoking supported design, we can not solve all above issues, we will still
> face the same issue like c), but it will be a big improve if we can apply this
> in our interface, since it provide a way to fix the issue a) b) d). And also for
> e) case, we try to rescue data in first time that our revoking operation would be
> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> 
> If you don't want to have a big change in this interface or recovery flow, how
> about keep them both, and add a mount option to control inner recovery flow?

Hmm, okay. I believe the current design is fine for sqlite in android.
For other databases, I can understand that they can use atomic_write without
journal control, which is a sort of stand-alone atomic_write.

It'd better to add a new ioctl for that, but before adding it, can we find
any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
Then, I expect that we can define a more appropriate and powerful ioctl.

Thanks,

> 
> How do you think? :)
> 
> Thanks,
> 
> > But, unfortunately, it seems that something is missing in the
> > current implementation.
> > 
> > So simply how about this?
> > 
> > A possible flow would be:
> > 1. write journal data to volatile space
> > 2. write db data to atomic space
> > 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> >  - flush/fsync journal data to disk
> >  - drop atomic data, and will be recovered by database with journal
> > 
> > From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > From: Jaegeuk Kim <jaegeuk@kernel.org>
> > Date: Tue, 29 Dec 2015 15:46:33 -0800
> > Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > 
> > There are two rules to handle aborting volatile or atomic writes.
> > 
> > 1. drop atomic writes
> >  - we don't need to keep any stale db data.
> > 
> > 2. write journal data
> >  - we should keep the journal data with fsync for db recovery.
> > 
> > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > ---
> >  fs/f2fs/file.c | 13 ++++++++++---
> >  1 file changed, 10 insertions(+), 3 deletions(-)
> > 
> > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > index 91f576a..d16438a 100644
> > --- a/fs/f2fs/file.c
> > +++ b/fs/f2fs/file.c
> > @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> >  	if (ret)
> >  		return ret;
> > 
> > -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > -	commit_inmem_pages(inode, true);
> > +	if (f2fs_is_atomic_file(inode)) {
> > +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > +		commit_inmem_pages(inode, true);
> > +	}
> > +	if (f2fs_is_volatile_file(inode)) {
> > +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > +		ret = commit_inmem_pages(inode, false);
> > +		if (!ret)
> > +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > +	}
> > 
> >  	mnt_drop_write_file(filp);
> >  	return ret;
> > --
> > 2.6.3
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2015-12-30 15:35     ` [f2fs-dev] " Chao Yu
@ 2015-12-30 19:43       ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2015-12-30 19:43 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

On Wed, Dec 30, 2015 at 11:35:20PM +0800, Chao Yu wrote:
> On 12/30/15 9:34 AM, Chao Yu wrote:
> > Hi Jaegeuk,
> > 
> >> -----Original Message-----
> >> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> >> Sent: Wednesday, December 30, 2015 8:05 AM
> >> To: Chao Yu
> >> Cc: linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> >> Subject: Re: [PATCH 2/2] f2fs: support revoking atomic written pages
> >>
> >> Hi Chao,
> >>
> >> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> >>> f2fs support atomic write with following semantics:
> >>> 1. open db file
> >>> 2. ioctl start atomic write
> >>> 3. (write db file) * n
> >>> 4. ioctl commit atomic write
> >>> 5. close db file
> >>>
> >>> With this flow we can avoid file becoming corrupted when abnormal power
> >>> cut, because we hold data of transaction in referenced pages linked in
> >>> inmem_pages list of inode, but without setting them dirty, so these data
> >>> won't be persisted unless we commit them in step 4.
> >>>
> >>> But we should still hold journal db file in memory by using volatile write,
> >>> because our semantics of 'atomic write support' is not full, in step 4, we
> >>> could be fail to submit all dirty data of transaction, once partial dirty
> >>> data was committed in storage, db file should be corrupted, in this case,
> >>> we should use journal db to recover the original data in db file.
> >>
> >> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> >> since database should get its error literally.
> >>
> >> So, the only thing that we need to do is keeping journal data for further db
> >> recovery.
> > 
> > IMO, if we really support *atomic* interface, we don't need any journal data
> > kept by user, because f2fs already have it in its storage since we always
> > trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > to revoke (replace old to new in metadata) when any failure exist in atomic
> > write process.
> > 
> > But in current design, we still hold journal data in memory for recovering for
> > *rare* failure case. I think there are several issues:
> > a) most of time, we are in concurrent scenario, so if large number of journal
> > db files were opened simultaneously, we are under big memory pressure.
> > b) If we are out of memory, reclaimer tries to write page of journal db into
> > disk, it will destroy db file.
> > c) Though, we have journal db file, we will face failure of recovering db file
> > from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > d) Recovery flow will make data page dirty, triggering both data stream and
> > metadata stream, there should be more IOs than in inner revoking in
> > atomic-interface.
> > e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > recover, checkpoint will persist the corrupt data in db file, following abnormal
> > power-cut will leave that data in disk.
> > 
> > With revoking supported design, we can not solve all above issues, we will still
> > face the same issue like c), but it will be a big improve if we can apply this
> > in our interface, since it provide a way to fix the issue a) b) d). And also for
> > e) case, we try to rescue data in first time that our revoking operation would be
> > protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > 
> > If you don't want to have a big change in this interface or recovery flow, how
> > about keep them both, and add a mount option to control inner recovery flow?
> 
> Or introduce F2FS_IOC_COMMIT_ATOMIC_WRITE_V2 for revoking supported interface?
> 
> > 
> > How do you think? :)
> > 
> > Thanks,
> > 
> >> But, unfortunately, it seems that something is missing in the
> >> current implementation.
> >>
> >> So simply how about this?
> >>
> >> A possible flow would be:
> >> 1. write journal data to volatile space
> >> 2. write db data to atomic space
> >> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> >>  - flush/fsync journal data to disk
> >>  - drop atomic data, and will be recovered by database with journal
> >>
> >> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> >> From: Jaegeuk Kim <jaegeuk@kernel.org>
> >> Date: Tue, 29 Dec 2015 15:46:33 -0800
> >> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> >>
> >> There are two rules to handle aborting volatile or atomic writes.
> >>
> >> 1. drop atomic writes
> >>  - we don't need to keep any stale db data.
> >>
> >> 2. write journal data
> >>  - we should keep the journal data with fsync for db recovery.
> >>
> >> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> >> ---
> >>  fs/f2fs/file.c | 13 ++++++++++---
> >>  1 file changed, 10 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> >> index 91f576a..d16438a 100644
> >> --- a/fs/f2fs/file.c
> >> +++ b/fs/f2fs/file.c
> >> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> >>  	if (ret)
> >>  		return ret;
> >>
> >> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> >> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> >> -	commit_inmem_pages(inode, true);
> >> +	if (f2fs_is_atomic_file(inode)) {
> >> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> >> +		commit_inmem_pages(inode, true);
> >> +	}
> >> +	if (f2fs_is_volatile_file(inode)) {
> >> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> >> +		ret = commit_inmem_pages(inode, false);
> 
> Any more inmem pages exist here? Shouldn't these page have been released above?

Oh, this should be like:

if (f2fs_is_volatile_file(inode)) {
	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
	ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
}

Thanks,

> 
> Thanks,
> 
> >> +		if (!ret)
> >> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> >> +	}
> >>
> >>  	mnt_drop_write_file(filp);
> >>  	return ret;
> >> --
> >> 2.6.3
> > 
> > 
> > 
> > ------------------------------------------------------------------------------
> > _______________________________________________
> > Linux-f2fs-devel mailing list
> > Linux-f2fs-devel@lists.sourceforge.net
> > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH 2/2] f2fs: support revoking atomic written pages
  2015-12-30 19:41     ` Jaegeuk Kim
@ 2015-12-31  9:16       ` Chao Yu
  2016-01-01  3:50           ` Jaegeuk Kim
  0 siblings, 1 reply; 28+ messages in thread
From: Chao Yu @ 2015-12-31  9:16 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-f2fs-devel, linux-kernel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Thursday, December 31, 2015 3:41 AM
> To: Chao Yu
> Cc: linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> Subject: Re: [PATCH 2/2] f2fs: support revoking atomic written pages
> 
> Hello,
> 
> On Wed, Dec 30, 2015 at 09:34:40AM +0800, Chao Yu wrote:
> > Hi Jaegeuk,
> >
> > > -----Original Message-----
> > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > Sent: Wednesday, December 30, 2015 8:05 AM
> > > To: Chao Yu
> > > Cc: linux-f2fs-devel@lists.sourceforge.net; linux-kernel@vger.kernel.org
> > > Subject: Re: [PATCH 2/2] f2fs: support revoking atomic written pages
> > >
> > > Hi Chao,
> > >
> > > On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > f2fs support atomic write with following semantics:
> > > > 1. open db file
> > > > 2. ioctl start atomic write
> > > > 3. (write db file) * n
> > > > 4. ioctl commit atomic write
> > > > 5. close db file
> > > >
> > > > With this flow we can avoid file becoming corrupted when abnormal power
> > > > cut, because we hold data of transaction in referenced pages linked in
> > > > inmem_pages list of inode, but without setting them dirty, so these data
> > > > won't be persisted unless we commit them in step 4.
> > > >
> > > > But we should still hold journal db file in memory by using volatile write,
> > > > because our semantics of 'atomic write support' is not full, in step 4, we
> > > > could be fail to submit all dirty data of transaction, once partial dirty
> > > > data was committed in storage, db file should be corrupted, in this case,
> > > > we should use journal db to recover the original data in db file.
> > >
> > > Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > since database should get its error literally.
> > >
> > > So, the only thing that we need to do is keeping journal data for further db
> > > recovery.
> >
> > IMO, if we really support *atomic* interface, we don't need any journal data
> > kept by user, because f2fs already have it in its storage since we always
> > trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > to revoke (replace old to new in metadata) when any failure exist in atomic
> > write process.
> 
> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> writes for journal files should be used together to minimize sqlite change as
> much as possible.
> 
> > But in current design, we still hold journal data in memory for recovering for
> > *rare* failure case. I think there are several issues:
> > a) most of time, we are in concurrent scenario, so if large number of journal
> > db files were opened simultaneously, we are under big memory pressure.
> 
> In current android, I've seen that this is not a big concern. Even there is
> memory pressure, f2fs flushes volatile pages.

When I change to redirtying all volatile pages in ->writepage, android seems to
go into an infinite loop when running the recovery flow of the f2fs data
partition at startup.

if (f2fs_is_volatile_file(inode))
	goto redirty_out;

I didn't dig into the details, but I think there may be a little risk in this design.

> 
> > b) If we are out of memory, reclaimer tries to write page of journal db into
> > disk, it will destroy db file.
> 
> I don't understand. Could you elaborate why journal writes can corrupt db?

Normally, we keep the pages of the journal in memory, but a partial page of the
journal may be written out to the device by the reclaimer when we are out of
memory. So this journal may have valid data in its log head, but with corrupted
data following it; then after an abnormal power-cut, recovering with this
journal before a transaction will destroy the db. Right?

> 
> > c) Though, we have journal db file, we will face failure of recovering db file
> > from journal db due to ENOMEM or EIO, then db file will be corrupted.
> 
> Do you mean the failure of recovering db with a complete journal?
> Why do we have to handle that? That's a database stuff, IMO.

Yes, it is just listed to indicate that we will face the same issue, which is
hard to handle in both the original design and the new design, so the inner
revoking failure issue would not be a weak point or flaw of the new design.

> 
> > d) Recovery flow will make data page dirty, triggering both data stream and
> > metadata stream, there should be more IOs than in inner revoking in
> > atomic-interface.
> 
> Well, do you mean there is no need to recover db after revoking?

Yes, revoking has the same effect as the recovery of sqlite, so after
revoking, recovery is not needed.

One more case is that the user can send a command to abort the current
transaction; it would happen before the atomic_commit operation, which could
easily be handled with the abort_commit ioctl.

> 
> > e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > recover, checkpoint will persist the corrupt data in db file, following abnormal
> > power-cut will leave that data in disk.
> 
> Yes, in that case, database should recover corrupted db with its journal file.

The journal could be corrupted, as I described in b).

> 
> > With revoking supported design, we can not solve all above issues, we will still
> > face the same issue like c), but it will be a big improve if we can apply this
> > in our interface, since it provide a way to fix the issue a) b) d). And also for
> > e) case, we try to rescue data in first time that our revoking operation would be
> > protected by f2fs_lock_op to avoid checkpoint + power-cut.
> >
> > If you don't want to have a big change in this interface or recovery flow, how
> > about keep them both, and add a mount option to control inner recovery flow?
> 
> Hmm, okay. I believe the current design is fine for sqlite in android.

I believe the new design will improve the memory usage and error handling of
sqlite in android, and I hope this can be applied. But I can understand if you
are concerned about risk control and backward compatibility, since this
change affects all atomic-related ioctls.

> For other databases, I can understand that they can use atomic_write without
> journal control, which is a sort of stand-alone atomic_write.
> 
> It'd better to add a new ioctl for that, but before adding it, can we find
> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)

Do you mean we should investigate this first, or that we can only start when
there is a clear commercial demand?

> Then, I expect that we can define a more appropriate and powerful ioctl.

Agreed :)

Thanks,

> 
> Thanks,
> 
> >
> > How do you think? :)
> >
> > Thanks,
> >
> > > But, unfortunately, it seems that something is missing in the
> > > current implementation.
> > >
> > > So simply how about this?
> > >
> > > A possible flow would be:
> > > 1. write journal data to volatile space
> > > 2. write db data to atomic space
> > > 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > >  - flush/fsync journal data to disk
> > >  - drop atomic data, and will be recovered by database with journal
> > >
> > > From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > >
> > > There are two rules to handle aborting volatile or atomic writes.
> > >
> > > 1. drop atomic writes
> > >  - we don't need to keep any stale db data.
> > >
> > > 2. write journal data
> > >  - we should keep the journal data with fsync for db recovery.
> > >
> > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > ---
> > >  fs/f2fs/file.c | 13 ++++++++++---
> > >  1 file changed, 10 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > index 91f576a..d16438a 100644
> > > --- a/fs/f2fs/file.c
> > > +++ b/fs/f2fs/file.c
> > > @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> > >  	if (ret)
> > >  		return ret;
> > >
> > > -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > -	commit_inmem_pages(inode, true);
> > > +	if (f2fs_is_atomic_file(inode)) {
> > > +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > +		commit_inmem_pages(inode, true);
> > > +	}
> > > +	if (f2fs_is_volatile_file(inode)) {
> > > +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > +		ret = commit_inmem_pages(inode, false);
> > > +		if (!ret)
> > > +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > +	}
> > >
> > >  	mnt_drop_write_file(filp);
> > >  	return ret;
> > > --
> > > 2.6.3
> >


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
  2015-12-31  9:16       ` Chao Yu
@ 2016-01-01  3:50           ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2016-01-01  3:50 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-f2fs-devel, linux-kernel

Hi Chao,

...

> > > > On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > f2fs support atomic write with following semantics:
> > > > > 1. open db file
> > > > > 2. ioctl start atomic write
> > > > > 3. (write db file) * n
> > > > > 4. ioctl commit atomic write
> > > > > 5. close db file
> > > > >
> > > > > With this flow we can avoid file becoming corrupted when abnormal power
> > > > > cut, because we hold data of transaction in referenced pages linked in
> > > > > inmem_pages list of inode, but without setting them dirty, so these data
> > > > > won't be persisted unless we commit them in step 4.
> > > > >
> > > > > But we should still hold journal db file in memory by using volatile write,
> > > > > because our semantics of 'atomic write support' is not full, in step 4, we
> > > > > could be fail to submit all dirty data of transaction, once partial dirty
> > > > > data was committed in storage, db file should be corrupted, in this case,
> > > > > we should use journal db to recover the original data in db file.
> > > >
> > > > Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > since database should get its error literally.
> > > >
> > > > So, the only thing that we need to do is keeping journal data for further db
> > > > recovery.
> > >
> > > IMO, if we really support *atomic* interface, we don't need any journal data
> > > kept by user, because f2fs already have it in its storage since we always
> > > trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > > to revoke (replace old to new in metadata) when any failure exist in atomic
> > > write process.
> > 
> > Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > writes for journal files should be used together to minimize sqlite change as
> > much as possible.
> > 
> > > But in current design, we still hold journal data in memory for recovering for
> > > *rare* failure case. I think there are several issues:
> > > a) most of time, we are in concurrent scenario, so if large number of journal
> > > db files were opened simultaneously, we are under big memory pressure.
> > 
> > In current android, I've seen that this is not a big concern. Even there is
> > memory pressure, f2fs flushes volatile pages.
> 
> When I change to redirty all volatile pages in ->writepage, android seems to go
> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> 
> if (f2fs_is_volatile_file(inode))
> 	goto redirty_out;

Where did you put this? It doesn't flush at all? Why?
Practically, the peak amount of journal writes depend on how many transactions
are processing concurrently.
I mean, in-memory pages are dropped at the end of every transaction.
You can check the number of pages through f2fs_stat on your phone.

> I didn't dig details, but I think there may be a little risk for this design.
> 
> > 
> > > b) If we are out of memory, reclaimer tries to write page of journal db into
> > > disk, it will destroy db file.
> > 
> > I don't understand. Could you elaborate why journal writes can corrupt db?
> 
> Normally, we keep pages of journal in memory, but a partial page in journal
> will be written out to the device by the reclaimer when out of memory. So this journal
> may have valid data in its log head, but with corrupted data, then after an
> abnormal power-cut, recovery with this journal before a transaction will
> destroy db. Right?

Just think about sqlite without this feature.
Broken journal is pretty normal case for sqlite.

> > 
> > > c) Though, we have journal db file, we will face failure of recovering db file
> > > from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > 
> > Do you mean the failure of recovering db with a complete journal?
> > Why do we have to handle that? That's a database stuff, IMO.
> 
> Yes, just listed to indicate that we will face the same issue, which is hard to
> handle both in the original design and the new design, so the inner revoking failure
> issue would not be a weak point or flaw of the new design.
> 
> > 
> > > d) Recovery flow will make data page dirty, triggering both data stream and
> > > metadata stream, there should be more IOs than in inner revoking in
> > > atomic-interface.
> > 
> > Well, do you mean there is no need to recover db after revoking?
> 
> Yes, revoking has the same effect as the recovery of sqlite, so after
> revoking, recovery is not needed.

Logically, it doesn't make sense. If there is a valid journal file, it should
redo the previous transaction. No?

> One more case is that user can send a command to abort current transaction,
> it should be happened before atomic_commit operation, which could easily
> handle with abort_commit ioctl.
> 
> > 
> > > e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > > recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > power-cut will leave that data in disk.
> > 
> > Yes, in that case, database should recover corrupted db with its journal file.
> 
> Journal could be corrupted as I described in b).

Okay, so what I'm thinking is like this.
It seems there are two corruption cases after journal writes.

1. power cut during atomic writes
 - broken journal file and clean db file -> give up
 - luckily, valid journal file and clean db file -> recover db

2. error during atomic writes
 a. power-cut before abort completion
  - broken journal file and broken db file -> revoking is needed!

 b. after abort
  - valid journal file and broken db file -> recover db (likewise plain sqlite)

Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
But, I think, even if revoking is done, we should notify an error to abort and
recover db by 2.b.

Something like this after successful revoking.

1. power cut during atomic writes
 - broken journal file and clean db file -> give up
 - luckily, valid journal file and clean db file -> recover db

2. error during atomic writes w/ revoking
 a. power-cut before abort completion
  - broken journal file and clean db file -> give up
  - luckily, valid journal file and clean db file -> recover db

 b. after abort
  - valid journal file and clean db file -> recover db

Let me verify these scenarios first. :)

Thanks,

> > 
> > > With revoking supported design, we can not solve all above issues, we will still
> > > face the same issue like c), but it will be a big improve if we can apply this
> > > in our interface, since it provide a way to fix the issue a) b) d). And also for
> > > e) case, we try to rescue data in first time that our revoking operation would be
> > > protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > >
> > > If you don't want to have a big change in this interface or recovery flow, how
> > > about keep them both, and add a mount option to control inner recovery flow?
> > 
> > Hmm, okay. I believe the current design is fine for sqlite in android.
> 
> I believe new design will enhance in memory usage and error handling of sqlite
> in android, and hope this can be applied. But, I can understand that if you
> were considering risk control and backward compatibility, since this
> change affects all atomic-related ioctls.
> 
> > For other databases, I can understand that they can use atomic_write without
> > journal control, which is a sort of stand-alone atomic_write.
> > 
> > It'd better to add a new ioctl for that, but before adding it, can we find
> > any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> 
> You mean investigating or we can only start when there is a clear commercial
> demand ?
> 
> > Then, I expect that we can define a more appropriate and powerful ioctl.
> 
> Agreed :)
> 
> Thanks,
> 
> > 
> > Thanks,
> > 
> > >
> > > How do you think? :)
> > >
> > > Thanks,
> > >
> > > > But, unfortunately, it seems that something is missing in the
> > > > current implementation.
> > > >
> > > > So simply how about this?
> > > >
> > > > A possible flow would be:
> > > > 1. write journal data to volatile space
> > > > 2. write db data to atomic space
> > > > 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > > >  - flush/fsync journal data to disk
> > > >  - drop atomic data, and will be recovered by database with journal
> > > >
> > > > From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > >
> > > > There are two rules to handle aborting volatile or atomic writes.
> > > >
> > > > 1. drop atomic writes
> > > >  - we don't need to keep any stale db data.
> > > >
> > > > 2. write journal data
> > > >  - we should keep the journal data with fsync for db recovery.
> > > >
> > > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > ---
> > > >  fs/f2fs/file.c | 13 ++++++++++---
> > > >  1 file changed, 10 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > index 91f576a..d16438a 100644
> > > > --- a/fs/f2fs/file.c
> > > > +++ b/fs/f2fs/file.c
> > > > @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> > > >  	if (ret)
> > > >  		return ret;
> > > >
> > > > -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > -	commit_inmem_pages(inode, true);
> > > > +	if (f2fs_is_atomic_file(inode)) {
> > > > +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > +		commit_inmem_pages(inode, true);
> > > > +	}
> > > > +	if (f2fs_is_volatile_file(inode)) {
> > > > +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > +		ret = commit_inmem_pages(inode, false);
> > > > +		if (!ret)
> > > > +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > +	}
> > > >
> > > >  	mnt_drop_write_file(filp);
> > > >  	return ret;
> > > > --
> > > > 2.6.3
> > >

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
@ 2016-01-01  3:50           ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2016-01-01  3:50 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

...

> > > > On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > f2fs support atomic write with following semantics:
> > > > > 1. open db file
> > > > > 2. ioctl start atomic write
> > > > > 3. (write db file) * n
> > > > > 4. ioctl commit atomic write
> > > > > 5. close db file
> > > > >
> > > > > With this flow we can avoid file becoming corrupted when abnormal power
> > > > > cut, because we hold data of transaction in referenced pages linked in
> > > > > inmem_pages list of inode, but without setting them dirty, so these data
> > > > > won't be persisted unless we commit them in step 4.
> > > > >
> > > > > But we should still hold journal db file in memory by using volatile write,
> > > > > because our semantics of 'atomic write support' is not full, in step 4, we
> > > > > could be fail to submit all dirty data of transaction, once partial dirty
> > > > > data was committed in storage, db file should be corrupted, in this case,
> > > > > we should use journal db to recover the original data in db file.
> > > >
> > > > Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > since database should get its error literally.
> > > >
> > > > So, the only thing that we need to do is keeping journal data for further db
> > > > recovery.
> > >
> > > IMO, if we really support *atomic* interface, we don't need any journal data
> > > kept by user, because f2fs already have it in its storage since we always
> > > trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > > to revoke (replace old to new in metadata) when any failure exist in atomic
> > > write process.
> > 
> > Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > writes for journal files should be used together to minimize sqlite change as
> > much as possible.
> > 
> > > But in current design, we still hold journal data in memory for recovering for
> > > *rare* failure case. I think there are several issues:
> > > a) most of time, we are in concurrent scenario, so if large number of journal
> > > db files were opened simultaneously, we are under big memory pressure.
> > 
> > In current android, I've seen that this is not a big concern. Even there is
> > memory pressure, f2fs flushes volatile pages.
> 
> When I change to redirty all volatile pages in ->writepage, android seems to go
> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> 
> if (f2fs_is_volatile_file(inode))
> 	goto redirty_out;

Where did you put this? It doesn't flush at all? Why?
Practically, the peak amount of journal writes depend on how many transactions
are processing concurrently.
I mean, in-memory pages are dropped at the end of every transaction.
You can check the number of pages through f2fs_stat on your phone.

> I didn't dig details, but I think there may be a little risk for this design.
> 
> > 
> > > b) If we are out of memory, reclaimer tries to write page of journal db into
> > > disk, it will destroy db file.
> > 
> > I don't understand. Could you elaborate why journal writes can corrupt db?
> 
> Normally, we keep pages of journal in memory, but a partial page in journal
> will be written out to the device by the reclaimer when out of memory. So this journal
> may have valid data in its log head, but with corrupted data, then after an
> abnormal power-cut, recovery with this journal before a transaction will
> destroy db. Right?

Just think about sqlite without this feature.
Broken journal is pretty normal case for sqlite.

> > 
> > > c) Though, we have journal db file, we will face failure of recovering db file
> > > from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > 
> > Do you mean the failure of recovering db with a complete journal?
> > Why do we have to handle that? That's a database stuff, IMO.
> 
> Yes, just listed to indicate that we will face the same issue, which is hard to
> handle both in the original design and the new design, so the inner revoking failure
> issue would not be a weak point or flaw of the new design.
> 
> > 
> > > d) Recovery flow will make data page dirty, triggering both data stream and
> > > metadata stream, there should be more IOs than in inner revoking in
> > > atomic-interface.
> > 
> > Well, do you mean there is no need to recover db after revoking?
> 
> Yes, revoking has the same effect as the recovery of sqlite, so after
> revoking, recovery is not needed.

Logically, it doesn't make sense. If there is a valid journal file, it should
redo the previous transaction. No?

> One more case is that user can send a command to abort current transaction,
> it should be happened before atomic_commit operation, which could easily
> handle with abort_commit ioctl.
> 
> > 
> > > e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > > recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > power-cut will leave that data in disk.
> > 
> > Yes, in that case, database should recover corrupted db with its journal file.
> 
> Journal could be corrupted as I described in b).

Okay, so what I'm thinking is like this.
It seems there are two corruption cases after journal writes.

1. power cut during atomic writes
 - broken journal file and clean db file -> give up
 - luckily, valid journal file and clean db file -> recover db

2. error during atomic writes
 a. power-cut before abort completion
  - broken journal file and broken db file -> revoking is needed!

 b. after abort
  - valid journal file and broken db file -> recover db (likewise plain sqlite)

Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
But, I think, even if revoking is done, we should notify an error to abort and
recover db by 2.b.

Something like this after successful revoking.

1. power cut during atomic writes
 - broken journal file and clean db file -> give up
 - luckily, valid journal file and clean db file -> recover db

2. error during atomic writes w/ revoking
 a. power-cut before abort completion
  - broken journal file and clean db file -> give up
  - luckily, valid journal file and clean db file -> recover db

 b. after abort
  - valid journal file and clean db file -> recover db

Let me verify these scenarios first. :)

Thanks,

> > 
> > > With revoking supported design, we can not solve all above issues, we will still
> > > face the same issue like c), but it will be a big improve if we can apply this
> > > in our interface, since it provide a way to fix the issue a) b) d). And also for
> > > e) case, we try to rescue data in first time that our revoking operation would be
> > > protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > >
> > > If you don't want to have a big change in this interface or recovery flow, how
> > > about keep them both, and add a mount option to control inner recovery flow?
> > 
> > Hmm, okay. I believe the current design is fine for sqlite in android.
> 
> I believe new design will enhance in memory usage and error handling of sqlite
> in android, and hope this can be applied. But, I can understand that if you
> were considering risk control and backward compatibility, since this
> change affects all atomic-related ioctls.
> 
> > For other databases, I can understand that they can use atomic_write without
> > journal control, which is a sort of stand-alone atomic_write.
> > 
> > It'd better to add a new ioctl for that, but before adding it, can we find
> > any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> 
> You mean investigating or we can only start when there is a clear commercial
> demand ?
> 
> > Then, I expect that we can define a more appropriate and powerful ioctl.
> 
> Agreed :)
> 
> Thanks,
> 
> > 
> > Thanks,
> > 
> > >
> > > How do you think? :)
> > >
> > > Thanks,
> > >
> > > > But, unfortunately, it seems that something is missing in the
> > > > current implementation.
> > > >
> > > > So simply how about this?
> > > >
> > > > A possible flow would be:
> > > > 1. write journal data to volatile space
> > > > 2. write db data to atomic space
> > > > 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > > >  - flush/fsync journal data to disk
> > > >  - drop atomic data, and will be recovered by database with journal
> > > >
> > > > From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > >
> > > > There are two rules to handle aborting volatile or atomic writes.
> > > >
> > > > 1. drop atomic writes
> > > >  - we don't need to keep any stale db data.
> > > >
> > > > 2. write journal data
> > > >  - we should keep the journal data with fsync for db recovery.
> > > >
> > > > Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > ---
> > > >  fs/f2fs/file.c | 13 ++++++++++---
> > > >  1 file changed, 10 insertions(+), 3 deletions(-)
> > > >
> > > > diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > index 91f576a..d16438a 100644
> > > > --- a/fs/f2fs/file.c
> > > > +++ b/fs/f2fs/file.c
> > > > @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> > > >  	if (ret)
> > > >  		return ret;
> > > >
> > > > -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > -	commit_inmem_pages(inode, true);
> > > > +	if (f2fs_is_atomic_file(inode)) {
> > > > +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > +		commit_inmem_pages(inode, true);
> > > > +	}
> > > > +	if (f2fs_is_volatile_file(inode)) {
> > > > +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > +		ret = commit_inmem_pages(inode, false);
> > > > +		if (!ret)
> > > > +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > +	}
> > > >
> > > >  	mnt_drop_write_file(filp);
> > > >  	return ret;
> > > > --
> > > > 2.6.3
> > >

------------------------------------------------------------------------------

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-01-01  3:50           ` Jaegeuk Kim
@ 2016-01-01 12:13             ` Chao Yu
  -1 siblings, 0 replies; 28+ messages in thread
From: Chao Yu @ 2016-01-01 12:13 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> Hi Chao,
> 
> ...
> 
>>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
>>>>>> f2fs support atomic write with following semantics:
>>>>>> 1. open db file
>>>>>> 2. ioctl start atomic write
>>>>>> 3. (write db file) * n
>>>>>> 4. ioctl commit atomic write
>>>>>> 5. close db file
>>>>>>
>>>>>> With this flow we can avoid file becoming corrupted when abnormal power
>>>>>> cut, because we hold data of transaction in referenced pages linked in
>>>>>> inmem_pages list of inode, but without setting them dirty, so these data
>>>>>> won't be persisted unless we commit them in step 4.
>>>>>>
>>>>>> But we should still hold journal db file in memory by using volatile write,
>>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
>>>>>> could be fail to submit all dirty data of transaction, once partial dirty
>>>>>> data was committed in storage, db file should be corrupted, in this case,
>>>>>> we should use journal db to recover the original data in db file.
>>>>>
>>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
>>>>> since database should get its error literally.
>>>>>
>>>>> So, the only thing that we need to do is keeping journal data for further db
>>>>> recovery.
>>>>
>>>> IMO, if we really support *atomic* interface, we don't need any journal data
>>>> kept by user, because f2fs already have it in its storage since we always
>>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
>>>> to revoke (replace old to new in metadata) when any failure exist in atomic
>>>> write process.
>>>
>>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
>>> writes for journal files should be used together to minimize sqlite change as
>>> much as possible.
>>>
>>>> But in current design, we still hold journal data in memory for recovering for
>>>> *rare* failure case. I think there are several issues:
>>>> a) most of time, we are in concurrent scenario, so if large number of journal
>>>> db files were opened simultaneously, we are under big memory pressure.
>>>
>>> In current android, I've seen that this is not a big concern. Even there is
>>> memory pressure, f2fs flushes volatile pages.
>>
>> When I change to redirty all volatile pages in ->writepage, android seems to go
>> into an infinite loop when doing recovery flow of f2fs data partition in startup.
>>
>> if (f2fs_is_volatile_file(inode))
>> 	goto redirty_out;
> 
> Where did you put this? It doesn't flush at all? Why?

Original place in ->writepage, just remove two other conditions.

To avoid potential random writeback of dirty pages in the journal, which
causes unpredictable corruption in the journal.

> Practically, the peak amount of journal writes depend on how many transactions
> are processing concurrently.
> I mean, in-memory pages are dropped at the end of every transaction.
> You can check the number of pages through f2fs_stat on your phone.
> 
>> I didn't dig details, but I think there may be a little risk for this design.
>>
>>>
>>>> b) If we are out of memory, reclaimer tries to write page of journal db into
>>>> disk, it will destroy db file.
>>>
>>> I don't understand. Could you elaborate why journal writes can corrupt db?
>>
>> Normally, we keep pages of journal in memory, but a partial page in journal
>> will be written out to the device by the reclaimer when out of memory. So this journal
>> may have valid data in its log head, but with corrupted data, then after an
>> abnormal power-cut, recovery with this journal before a transaction will
>> destroy db. Right?
> 
> Just think about sqlite without this feature.
> Broken journal is pretty normal case for sqlite.

Maybe, if it is caused by a bug or design issue of software, no matter whether in the db
system or the filesystem, we should try our best to fix it to avoid generating broken journals.

> 
>>>
>>>> c) Though, we have journal db file, we will face failure of recovering db file
>>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
>>>
>>> Do you mean the failure of recovering db with a complete journal?
>>> Why do we have to handle that? That's a database stuff, IMO.
>>
>> Yes, just listed to indicate that we will face the same issue, which is hard to
>> handle both in the original design and the new design, so the inner revoking failure
>> issue would not be a weak point or flaw of the new design.
>>
>>>
>>>> d) Recovery flow will make data page dirty, triggering both data stream and
>>>> metadata stream, there should be more IOs than in inner revoking in
>>>> atomic-interface.
>>>
>>> Well, do you mean there is no need to recover db after revoking?
>>
>> Yes, revoking has the same effect as the recovery of sqlite, so after
>> revoking, recovery is not needed.
> 
> Logically, it doesn't make sense. If there is a valid journal file, it should
> redo the previous transaction. No?

As we know, in sqlite, before we commit a transaction, we will use the journal to
record the original data of pages which will be updated in the following transaction, so
afterwards, if a) an abnormal power-cut, b) a commit error, or c) a redo command was
triggered by the user, we will recover the db with the journal.

Ideally, if we support the atomic write interface, it should always return one of two
statuses: success or failure. On success, the transaction was
committed; otherwise, it looks like nothing happened, and the user will be told the
transaction failed. Then, journals in sqlite could no longer be used —
eventually no journal, no recovery.

The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
revoking in commit interface since it could destroy db file permanently w/o
journal. IMO, some optimization could be done for these cases:
1. ENOMEM: enable retrying or mark accessed flag in page in advance.
2. ENOSPC: preallocate blocks for node blocks and data blocks.

These optimizations couldn't guarantee no failure in revoking operation
completely, luckily, those are not common cases, and they also happen in sqlite
w/o atomic feature.

One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
guess we can optimize DB like:
1. reflink db to db.ref
2. do transaction in db.ref
   - failed, rm db.ref
   - power-cut rm db.ref
3. rename db.ref to db

> 
>> One more case is that user can send a command to abort current transaction,
>> it should be happened before atomic_commit operation, which could easily
>> handle with abort_commit ioctl.
>>
>>>
>>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
>>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
>>>> power-cut will leave that data in disk.
>>>
>>> Yes, in that case, database should recover corrupted db with its journal file.
>>
>> Journal could be corrupted as I described in b).
> 
> Okay, so what I'm thinking is like this.
> It seems there are two corruption cases after journal writes.
> 
> 1. power cut during atomic writes
>  - broken journal file and clean db file -> give up
>  - luckily, valid journal file and clean db file -> recover db
> 
> 2. error during atomic writes
>  a. power-cut before abort completion
>   - broken journal file and broken db file -> revoking is needed!
> 
>  b. after abort
>   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> 
> Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> But, I think, even if revoking is done, we should notify an error to abort and
> recover db by 2.b.
> 
> Something like this after successful revoking.
> 
> 1. power cut during atomic writes
>  - broken journal file and clean db file -> give up
>  - luckily, valid journal file and clean db file -> recover db
> 
> 2. error during atomic writes w/ revoking
>  a. power-cut before abort completion
>   - broken journal file and clean db file -> give up
>   - luckily, valid journal file and clean db file -> recover db
> 
>  b. after abort
>   - valid journal file and clean db file -> recover db

That's right.

> 
> Let me verify these scenarios first. :)

OK. :)

Thanks,

> 
> Thanks,
> 
>>>
>>>> With revoking supported design, we can not solve all above issues, we will still
>>>> face the same issue like c), but it will be a big improve if we can apply this
>>>> in our interface, since it provide a way to fix the issue a) b) d). And also for
>>>> e) case, we try to rescue data in first time that our revoking operation would be
>>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
>>>>
>>>> If you don't want to have a big change in this interface or recovery flow, how
>>>> about keep them both, and add a mount option to control inner recovery flow?
>>>
>>> Hmm, okay. I believe the current design is fine for sqlite in android.
>>
>> I believe new design will enhance in memory usage and error handling of sqlite
>> in android, and hope this can be applied. But, I can understand that if you
>> were considering risk control and backward compatibility, since this
>> change affects all atomic-related ioctls.
>>
>>> For other databases, I can understand that they can use atomic_write without
>>> journal control, which is a sort of stand-alone atomic_write.
>>>
>>> It'd better to add a new ioctl for that, but before adding it, can we find
>>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
>>
>> You mean investigating or we can only start when there is a clear commercial
>> demand ?
>>
>>> Then, I expect that we can define a more appropriate and powerful ioctl.
>>
>> Agreed :)
>>
>> Thanks,
>>
>>>
>>> Thanks,
>>>
>>>>
>>>> How do you think? :)
>>>>
>>>> Thanks,
>>>>
>>>>> But, unfortunately, it seems that something is missing in the
>>>>> current implementation.
>>>>>
>>>>> So simply how about this?
>>>>>
>>>>> A possible flow would be:
>>>>> 1. write journal data to volatile space
>>>>> 2. write db data to atomic space
>>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
>>>>>  - flush/fsync journal data to disk
>>>>>  - drop atomic data, and will be recovered by database with journal
>>>>>
>>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
>>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
>>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
>>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
>>>>>
>>>>> There are two rules to handle aborting volatile or atomic writes.
>>>>>
>>>>> 1. drop atomic writes
>>>>>  - we don't need to keep any stale db data.
>>>>>
>>>>> 2. write journal data
>>>>>  - we should keep the journal data with fsync for db recovery.
>>>>>
>>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
>>>>> ---
>>>>>  fs/f2fs/file.c | 13 ++++++++++---
>>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>>>> index 91f576a..d16438a 100644
>>>>> --- a/fs/f2fs/file.c
>>>>> +++ b/fs/f2fs/file.c
>>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
>>>>>  	if (ret)
>>>>>  		return ret;
>>>>>
>>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
>>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
>>>>> -	commit_inmem_pages(inode, true);
>>>>> +	if (f2fs_is_atomic_file(inode)) {
>>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
>>>>> +		commit_inmem_pages(inode, true);
>>>>> +	}
>>>>> +	if (f2fs_is_volatile_file(inode)) {
>>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
>>>>> +		ret = commit_inmem_pages(inode, false);
>>>>> +		if (!ret)
>>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
>>>>> +	}
>>>>>
>>>>>  	mnt_drop_write_file(filp);
>>>>>  	return ret;
>>>>> --
>>>>> 2.6.3
>>>>
> 
> ------------------------------------------------------------------------------
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
@ 2016-01-01 12:13             ` Chao Yu
  0 siblings, 0 replies; 28+ messages in thread
From: Chao Yu @ 2016-01-01 12:13 UTC (permalink / raw)
  To: Jaegeuk Kim; +Cc: linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> Hi Chao,
> 
> ...
> 
>>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
>>>>>> f2fs support atomic write with following semantics:
>>>>>> 1. open db file
>>>>>> 2. ioctl start atomic write
>>>>>> 3. (write db file) * n
>>>>>> 4. ioctl commit atomic write
>>>>>> 5. close db file
>>>>>>
>>>>>> With this flow we can avoid file becoming corrupted when abnormal power
>>>>>> cut, because we hold data of transaction in referenced pages linked in
>>>>>> inmem_pages list of inode, but without setting them dirty, so these data
>>>>>> won't be persisted unless we commit them in step 4.
>>>>>>
>>>>>> But we should still hold journal db file in memory by using volatile write,
>>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
>>>>>> could be fail to submit all dirty data of transaction, once partial dirty
>>>>>> data was committed in storage, db file should be corrupted, in this case,
>>>>>> we should use journal db to recover the original data in db file.
>>>>>
>>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
>>>>> since database should get its error literally.
>>>>>
>>>>> So, the only thing that we need to do is keeping journal data for further db
>>>>> recovery.
>>>>
>>>> IMO, if we really support *atomic* interface, we don't need any journal data
>>>> kept by user, because f2fs already have it in its storage since we always
>>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
>>>> to revoke (replace old to new in metadata) when any failure exist in atomic
>>>> write process.
>>>
>>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
>>> writes for journal files should be used together to minimize sqlite change as
>>> much as possible.
>>>
>>>> But in current design, we still hold journal data in memory for recovering for
>>>> *rare* failure case. I think there are several issues:
>>>> a) most of time, we are in concurrent scenario, so if large number of journal
>>>> db files were opened simultaneously, we are under big memory pressure.
>>>
>>> In current android, I've seen that this is not a big concern. Even there is
>>> memory pressure, f2fs flushes volatile pages.
>>
>> When I change to redirty all volatile pages in ->writepage, android seems go
>> into an infinite loop when doing recovery flow of f2fs data partition in startup.
>>
>> if (f2fs_is_volatile_file(inode))
>> 	goto redirty_out;
> 
> Where did you put this? It doesn't flush at all? Why?

Original place in ->writepage, just remove two other conditions.

To avoid potential random writeback of dirty pages in the journal, which
could cause unpredictable corruption of the journal.

> Practically, the peak amount of journal writes depend on how many transactions
> are processing concurrently.
> I mean, in-memory pages are dropped at the end of every transaction.
> You can check the number of pages through f2fs_stat on your phone.
> 
>> I didn't dig details, but I think there may be a little risk for this design.
>>
>>>
>>>> b) If we are out of memory, reclaimer tries to write page of journal db into
>>>> disk, it will destroy db file.
>>>
>>> I don't understand. Could you elaborate why journal writes can corrupt db?
>>
>> Normally, we keep pages of journal in memory, but partial page in journal
>> will be write out to device by reclaimer when out of memory. So this journal
>> may have valid data in its log head, but with corrupted data, then after
>> abnormal powe-cut, recovery with this journal before a transaction will
>> destroy db. Right?
> 
> Just think about sqlite without this feature.
> Broken journal is pretty normal case for sqlite.

Maybe, but if it is caused by a bug or a design issue in the software — whether in
the db system or the filesystem — we should try our best to fix it to avoid generating broken journals.

> 
>>>
>>>> c) Though, we have journal db file, we will face failure of recovering db file
>>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
>>>
>>> Do you mean the failure of recovering db with a complete journal?
>>> Why do we have to handle that? That's a database stuff, IMO.
>>
>> Yes, just list for indicating we will face the same issue which is hard to 
>> handle both in original design and new design, so the inner revoking failure
>> issue would not be a weak point or flaw of new design.
>>
>>>
>>>> d) Recovery flow will make data page dirty, triggering both data stream and
>>>> metadata stream, there should be more IOs than in inner revoking in
>>>> atomic-interface.
>>>
>>> Well, do you mean there is no need to recover db after revoking?
>>
>> Yes, revoking make the same effect like the recovery of sqlite, so after
>> revoking, recovery is no need.
> 
> Logically, it doesn't make sense. If there is a valid journal file, it should
> redo the previous transaction. No?

As we know, in sqlite, before we commit a transaction, we use the journal to
record the original data of the pages which will be updated in the following transaction, so
that if a) an abnormal power-cut, b) a commit error, or c) a redo command
triggered by the user occurs, we can recover the db with the journal.

Ideally, if we support an atomic write interface, it should always return one of two
statuses: success or failure. If it succeeds, the transaction was
committed; otherwise, it looks like nothing happened, and the user will be told
the transaction failed. Then, the journals in sqlite would no longer be needed —
eventually no journal, no recovery.

The only thing we should be concerned about is an internal failure (e.g. ENOMEM, ENOSPC) of
revoking in the commit interface, since it could destroy the db file permanently w/o a
journal. IMO, some optimization could be done for these cases:
1. ENOMEM: enable retrying or mark accessed flag in page in advance.
2. ENOSPC: preallocate blocks for node blocks and data blocks.

These optimizations couldn't completely guarantee that the revoking operation
never fails; luckily, those are not common cases, and they also happen in sqlite
w/o the atomic feature.

One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
guess we can optimize DB like:
1. reflink db to db.ref
2. do transaction in db.ref
   - failed, rm db.ref
   - on power-cut, rm db.ref
3. rename db.ref to db

> 
>> One more case is that user can send a command to abort current transaction,
>> it should be happened before atomic_commit operation, which could easily
>> handle with abort_commit ioctl.
>>
>>>
>>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
>>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
>>>> power-cut will leave that data in disk.
>>>
>>> Yes, in that case, database should recover corrupted db with its journal file.
>>
>> Journal could be corrupted as I descripted in b).
> 
> Okay, so what I'm thinking is like this.
> It seems there are two corruption cases after journal writes.
> 
> 1. power cut during atomic writes
>  - broken journal file and clean db file -> give up
>  - luckily, valid journal file and clean db file -> recover db
> 
> 2. error during atomic writes
>  a. power-cut before abort completion
>   - broken journal file and broken db file -> revoking is needed!
> 
>  b. after abort
>   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> 
> Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> But, I think, even if revoking is done, we should notify an error to abort and
> recover db by 2.b.
> 
> Something like this after successful revoking.
> 
> 1. power cut during atomic writes
>  - broken journal file and clean db file -> give up
>  - luckily, valid journal file and clean db file -> recover db
> 
> 2. error during atomic writes w/ revoking
>  a. power-cut before abort completion
>   - broken journal file and clean db file -> give up
>   - luckily, valid journal file and clean db file -> recover db
> 
>  b. after abort
>   - valid journal file and clean db file -> recover db

That's right.

> 
> Let me verify these scenarios first. :)

OK. :)

Thanks,

> 
> Thanks,
> 
>>>
>>>> With revoking supported design, we can not solve all above issues, we will still
>>>> face the same issue like c), but it will be a big improve if we can apply this
>>>> in our interface, since it provide a way to fix the issue a) b) d). And also for
>>>> e) case, we try to rescue data in first time that our revoking operation would be
>>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
>>>>
>>>> If you don't want to have a big change in this interface or recovery flow, how
>>>> about keep them both, and add a mount option to control inner recovery flow?
>>>
>>> Hmm, okay. I believe the current design is fine for sqlite in android.
>>
>> I believe new design will enhance in memory usage and error handling of sqlite
>> in android, and hope this can be applied. But, I can understand that if you
>> were considerring about risk control and backward compatibility, since this
>> change affects all atomic related ioctls.
>>
>>> For other databases, I can understand that they can use atomic_write without
>>> journal control, which is a sort of stand-alone atomic_write.
>>>
>>> It'd better to add a new ioctl for that, but before adding it, can we find
>>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
>>
>> You mean investigating or we can only start when there is a clear commercial
>> demand ?
>>
>>> Then, I expect that we can define a more appropriate and powerful ioctl.
>>
>> Agreed :)
>>
>> Thanks,
>>
>>>
>>> Thanks,
>>>
>>>>
>>>> How do you think? :)
>>>>
>>>> Thanks,
>>>>
>>>>> But, unfortunately, it seems that something is missing in the
>>>>> current implementation.
>>>>>
>>>>> So simply how about this?
>>>>>
>>>>> A possible flow would be:
>>>>> 1. write journal data to volatile space
>>>>> 2. write db data to atomic space
>>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
>>>>>  - flush/fsync journal data to disk
>>>>>  - drop atomic data, and will be recovered by database with journal
>>>>>
>>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
>>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
>>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
>>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
>>>>>
>>>>> There are two rules to handle aborting volatile or atomic writes.
>>>>>
>>>>> 1. drop atomic writes
>>>>>  - we don't need to keep any stale db data.
>>>>>
>>>>> 2. write journal data
>>>>>  - we should keep the journal data with fsync for db recovery.
>>>>>
>>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
>>>>> ---
>>>>>  fs/f2fs/file.c | 13 ++++++++++---
>>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
>>>>>
>>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
>>>>> index 91f576a..d16438a 100644
>>>>> --- a/fs/f2fs/file.c
>>>>> +++ b/fs/f2fs/file.c
>>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
>>>>>  	if (ret)
>>>>>  		return ret;
>>>>>
>>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
>>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
>>>>> -	commit_inmem_pages(inode, true);
>>>>> +	if (f2fs_is_atomic_file(inode)) {
>>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
>>>>> +		commit_inmem_pages(inode, true);
>>>>> +	}
>>>>> +	if (f2fs_is_volatile_file(inode)) {
>>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
>>>>> +		ret = commit_inmem_pages(inode, false);
>>>>> +		if (!ret)
>>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
>>>>> +	}
>>>>>
>>>>>  	mnt_drop_write_file(filp);
>>>>>  	return ret;
>>>>> --
>>>>> 2.6.3
>>>>
> 
> ------------------------------------------------------------------------------
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> 

------------------------------------------------------------------------------

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-01-01 12:13             ` Chao Yu
  (?)
@ 2016-01-08 12:05             ` Chao Yu
  2016-01-08 19:43                 ` Jaegeuk Kim
  -1 siblings, 1 reply; 28+ messages in thread
From: Chao Yu @ 2016-01-08 12:05 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

Any progress on this patch?

Thanks,

> -----Original Message-----
> From: Chao Yu [mailto:chao@kernel.org]
> Sent: Friday, January 01, 2016 8:14 PM
> To: Jaegeuk Kim
> Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> 
> Hi Jaegeuk,
> 
> On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > Hi Chao,
> >
> > ...
> >
> >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> >>>>>> f2fs support atomic write with following semantics:
> >>>>>> 1. open db file
> >>>>>> 2. ioctl start atomic write
> >>>>>> 3. (write db file) * n
> >>>>>> 4. ioctl commit atomic write
> >>>>>> 5. close db file
> >>>>>>
> >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> >>>>>> cut, because we hold data of transaction in referenced pages linked in
> >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> >>>>>> won't be persisted unless we commit them in step 4.
> >>>>>>
> >>>>>> But we should still hold journal db file in memory by using volatile write,
> >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> >>>>>> data was committed in storage, db file should be corrupted, in this case,
> >>>>>> we should use journal db to recover the original data in db file.
> >>>>>
> >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> >>>>> since database should get its error literally.
> >>>>>
> >>>>> So, the only thing that we need to do is keeping journal data for further db
> >>>>> recovery.
> >>>>
> >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> >>>> kept by user, because f2fs already have it in its storage since we always
> >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> >>>> write process.
> >>>
> >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> >>> writes for journal files should be used together to minimize sqlite change as
> >>> much as possible.
> >>>
> >>>> But in current design, we still hold journal data in memory for recovering for
> >>>> *rare* failure case. I think there are several issues:
> >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> >>>> db files were opened simultaneously, we are under big memory pressure.
> >>>
> >>> In current android, I've seen that this is not a big concern. Even there is
> >>> memory pressure, f2fs flushes volatile pages.
> >>
> >> When I change to redirty all volatile pages in ->writepage, android seems go
> >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> >>
> >> if (f2fs_is_volatile_file(inode))
> >> 	goto redirty_out;
> >
> > Where did you put this? It doesn't flush at all? Why?
> 
> Original place in ->writepage, just remove two other conditions.
> 
> To avoid potential random writebacking of dirty page in journal which
> cause unpredicted corrupting in journal.
> 
> > Practically, the peak amount of journal writes depend on how many transactions
> > are processing concurrently.
> > I mean, in-memory pages are dropped at the end of every transaction.
> > You can check the number of pages through f2fs_stat on your phone.
> >
> >> I didn't dig details, but I think there may be a little risk for this design.
> >>
> >>>
> >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> >>>> disk, it will destroy db file.
> >>>
> >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> >>
> >> Normally, we keep pages of journal in memory, but partial page in journal
> >> will be write out to device by reclaimer when out of memory. So this journal
> >> may have valid data in its log head, but with corrupted data, then after
> >> abnormal powe-cut, recovery with this journal before a transaction will
> >> destroy db. Right?
> >
> > Just think about sqlite without this feature.
> > Broken journal is pretty normal case for sqlite.
> 
> Maybe, if it is caused by bug or design issue of software, no matter db system
> or filesystem, we should try our best to fix it to avoid generating broken journals.
> 
> >
> >>>
> >>>> c) Though, we have journal db file, we will face failure of recovering db file
> >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> >>>
> >>> Do you mean the failure of recovering db with a complete journal?
> >>> Why do we have to handle that? That's a database stuff, IMO.
> >>
> >> Yes, just list for indicating we will face the same issue which is hard to
> >> handle both in original design and new design, so the inner revoking failure
> >> issue would not be a weak point or flaw of new design.
> >>
> >>>
> >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> >>>> metadata stream, there should be more IOs than in inner revoking in
> >>>> atomic-interface.
> >>>
> >>> Well, do you mean there is no need to recover db after revoking?
> >>
> >> Yes, revoking make the same effect like the recovery of sqlite, so after
> >> revoking, recovery is no need.
> >
> > Logically, it doesn't make sense. If there is a valid journal file, it should
> > redo the previous transaction. No?
> 
> As we know, in sqlite, before we commit a transaction, we will use journal to
> record original data of pages which will be updated in following transaction, so
> in following if a) abnormal power-cut, b) commit error, c) redo command was
> triggered by user, we will recover db with journal.
> 
> Ideally, if we support atomic write interface, in there should always return two
> status in atomic write interface: success or fail. If success, transaction was
> committed, otherwise, it looks like nothing happened, user will be told
> transaction was failed. Then, journals in sqlite could no longer be used,
> eventually no journal, no recovery.
> 
> The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> revoking in commit interface since it could destroy db file permanently w/o
> journal. IMO, some optimization could be done for these cases:
> 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> 
> These optimizations couldn't guarantee no failure in revoking operation
> completely, luckily, those are not common cases, and they also happen in sqlite
> w/o atomic feature.
> 
> One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> guess we can optimize DB like:
> 1. reflink db to db.ref
> 2. do transaction in db.ref
>    - failed, rm db.ref
>    - power-cut rm db.ref
> 3. rename db.ref to db
> 
> >
> >> One more case is that user can send a command to abort current transaction,
> >> it should be happened before atomic_commit operation, which could easily
> >> handle with abort_commit ioctl.
> >>
> >>>
> >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> >>>> power-cut will leave that data in disk.
> >>>
> >>> Yes, in that case, database should recover corrupted db with its journal file.
> >>
> >> Journal could be corrupted as I descripted in b).
> >
> > Okay, so what I'm thinking is like this.
> > It seems there are two corruption cases after journal writes.
> >
> > 1. power cut during atomic writes
> >  - broken journal file and clean db file -> give up
> >  - luckily, valid journal file and clean db file -> recover db
> >
> > 2. error during atomic writes
> >  a. power-cut before abort completion
> >   - broken journal file and broken db file -> revoking is needed!
> >
> >  b. after abort
> >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> >
> > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > But, I think, even if revoking is done, we should notify an error to abort and
> > recover db by 2.b.
> >
> > Something like this after successful revoking.
> >
> > 1. power cut during atomic writes
> >  - broken journal file and clean db file -> give up
> >  - luckily, valid journal file and clean db file -> recover db
> >
> > 2. error during atomic writes w/ revoking
> >  a. power-cut before abort completion
> >   - broken journal file and clean db file -> give up
> >   - luckily, valid journal file and clean db file -> recover db
> >
> >  b. after abort
> >   - valid journal file and clean db file -> recover db
> 
> That's right.
> 
> >
> > Let me verify these scenarios first. :)
> 
> OK. :)
> 
> Thanks,
> 
> >
> > Thanks,
> >
> >>>
> >>>> With revoking supported design, we can not solve all above issues, we will still
> >>>> face the same issue like c), but it will be a big improve if we can apply this
> >>>> in our interface, since it provide a way to fix the issue a) b) d). And also for
> >>>> e) case, we try to rescue data in first time that our revoking operation would be
> >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> >>>>
> >>>> If you don't want to have a big change in this interface or recovery flow, how
> >>>> about keep them both, and add a mount option to control inner recovery flow?
> >>>
> >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> >>
> >> I believe new design will enhance in memory usage and error handling of sqlite
> >> in android, and hope this can be applied. But, I can understand that if you
> >> were considerring about risk control and backward compatibility, since this
> >> change affects all atomic related ioctls.
> >>
> >>> For other databases, I can understand that they can use atomic_write without
> >>> journal control, which is a sort of stand-alone atomic_write.
> >>>
> >>> It'd better to add a new ioctl for that, but before adding it, can we find
> >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> >>
> >> You mean investigating or we can only start when there is a clear commercial
> >> demand ?
> >>
> >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> >>
> >> Agreed :)
> >>
> >> Thanks,
> >>
> >>>
> >>> Thanks,
> >>>
> >>>>
> >>>> How do you think? :)
> >>>>
> >>>> Thanks,
> >>>>
> >>>>> But, unfortunately, it seems that something is missing in the
> >>>>> current implementation.
> >>>>>
> >>>>> So simply how about this?
> >>>>>
> >>>>> A possible flow would be:
> >>>>> 1. write journal data to volatile space
> >>>>> 2. write db data to atomic space
> >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> >>>>>  - flush/fsync journal data to disk
> >>>>>  - drop atomic data, and will be recovered by database with journal
> >>>>>
> >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> >>>>>
> >>>>> There are two rules to handle aborting volatile or atomic writes.
> >>>>>
> >>>>> 1. drop atomic writes
> >>>>>  - we don't need to keep any stale db data.
> >>>>>
> >>>>> 2. write journal data
> >>>>>  - we should keep the journal data with fsync for db recovery.
> >>>>>
> >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> >>>>> ---
> >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> >>>>>
> >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> >>>>> index 91f576a..d16438a 100644
> >>>>> --- a/fs/f2fs/file.c
> >>>>> +++ b/fs/f2fs/file.c
> >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> >>>>>  	if (ret)
> >>>>>  		return ret;
> >>>>>
> >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> >>>>> -	commit_inmem_pages(inode, true);
> >>>>> +	if (f2fs_is_atomic_file(inode)) {
> >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> >>>>> +		commit_inmem_pages(inode, true);
> >>>>> +	}
> >>>>> +	if (f2fs_is_volatile_file(inode)) {
> >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> >>>>> +		ret = commit_inmem_pages(inode, false);
> >>>>> +		if (!ret)
> >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> >>>>> +	}
> >>>>>
> >>>>>  	mnt_drop_write_file(filp);
> >>>>>  	return ret;
> >>>>> --
> >>>>> 2.6.3
> >>>>
> >
> > ------------------------------------------------------------------------------
> > _______________________________________________
> > Linux-f2fs-devel mailing list
> > Linux-f2fs-devel@lists.sourceforge.net
> > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> >
> 
> ------------------------------------------------------------------------------
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-01-08 12:05             ` [f2fs-dev] " Chao Yu
@ 2016-01-08 19:43                 ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2016-01-08 19:43 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> Any progress on this patch?

Swamped. Will do.

Thanks,

> 
> Thanks,
> 
> > -----Original Message-----
> > From: Chao Yu [mailto:chao@kernel.org]
> > Sent: Friday, January 01, 2016 8:14 PM
> > To: Jaegeuk Kim
> > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > 
> > Hi Jaegeuk,
> > 
> > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > Hi Chao,
> > >
> > > ...
> > >
> > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > >>>>>> f2fs support atomic write with following semantics:
> > >>>>>> 1. open db file
> > >>>>>> 2. ioctl start atomic write
> > >>>>>> 3. (write db file) * n
> > >>>>>> 4. ioctl commit atomic write
> > >>>>>> 5. close db file
> > >>>>>>
> > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > >>>>>> won't be persisted unless we commit them in step 4.
> > >>>>>>
> > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > >>>>>> we should use journal db to recover the original data in db file.
> > >>>>>
> > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > >>>>> since database should get its error literally.
> > >>>>>
> > >>>>> So, the only thing that we need to do is keeping journal data for further db
> > >>>>> recovery.
> > >>>>
> > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > >>>> kept by user, because f2fs already have it in its storage since we always
> > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > >>>> write process.
> > >>>
> > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > >>> writes for journal files should be used together to minimize sqlite change as
> > >>> much as possible.
> > >>>
> > >>>> But in current design, we still hold journal data in memory for recovering for
> > >>>> *rare* failure case. I think there are several issues:
> > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > >>>> db files were opened simultaneously, we are under big memory pressure.
> > >>>
> > >>> In current android, I've seen that this is not a big concern. Even there is
> > >>> memory pressure, f2fs flushes volatile pages.
> > >>
> > >> When I change to redirty all volatile pages in ->writepage, android seems go
> > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > >>
> > >> if (f2fs_is_volatile_file(inode))
> > >> 	goto redirty_out;
> > >
> > > Where did you put this? It doesn't flush at all? Why?
> > 
> > Original place in ->writepage, just remove two other conditions.
> > 
> > To avoid potential random writebacking of dirty page in journal which
> > cause unpredicted corrupting in journal.
> > 
> > > Practically, the peak amount of journal writes depend on how many transactions
> > > are processing concurrently.
> > > I mean, in-memory pages are dropped at the end of every transaction.
> > > You can check the number of pages through f2fs_stat on your phone.
> > >
> > >> I didn't dig details, but I think there may be a little risk for this design.
> > >>
> > >>>
> > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > >>>> disk, it will destroy db file.
> > >>>
> > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > >>
> > >> Normally, we keep pages of journal in memory, but partial page in journal
> > >> will be write out to device by reclaimer when out of memory. So this journal
> > >> may have valid data in its log head, but with corrupted data, then after
> > >> abnormal power-cut, recovery with this journal before a transaction will
> > >> destroy db. Right?
> > >
> > > Just think about sqlite without this feature.
> > > Broken journal is pretty normal case for sqlite.
> > 
> > Maybe, if it is caused by bug or design issue of software, no matter db system
> > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > 
> > >
> > >>>
> > >>>> c) Though, we have journal db file, we will face failure of recovering db file
> > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > >>>
> > >>> Do you mean the failure of recovering db with a complete journal?
> > >>> Why do we have to handle that? That's a database stuff, IMO.
> > >>
> > >> Yes, just list for indicating we will face the same issue which is hard to
> > >> handle both in original design and new design, so the inner revoking failure
> > >> issue would not be a weak point or flaw of new design.
> > >>
> > >>>
> > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > >>>> metadata stream, there should be more IOs than in inner revoking in
> > >>>> atomic-interface.
> > >>>
> > >>> Well, do you mean there is no need to recover db after revoking?
> > >>
> > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > >> revoking, recovery is no need.
> > >
> > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > redo the previous transaction. No?
> > 
> > As we know, in sqlite, before we commit a transaction, we will use journal to
> > record original data of pages which will be updated in following transaction, so
> > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > triggered by user, we will recover db with journal.
> > 
> > Ideally, if we support atomic write interface, in there should always return two
> > status in atomic write interface: success or fail. If success, transaction was
> > committed, otherwise, it looks like nothing happened, user will be told
> > transaction was failed. Then, journals in sqlite could no longer be used,
> > eventually no journal, no recovery.
> > 
> > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > revoking in commit interface since it could destroy db file permanently w/o
> > journal. IMO, some optimization could be done for these cases:
> > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > 
> > These optimizations couldn't guarantee no failure in revoking operation
> > completely, luckily, those are not common cases, and they also happen in sqlite
> > w/o atomic feature.
> > 
> > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > guess we can optimize DB like:
> > 1. reflink db to db.ref
> > 2. do transaction in db.ref
> >    - failed, rm db.ref
> >    - power-cut rm db.ref
> > 3. rename db.ref to db
> > 
> > >
> > >> One more case is that user can send a command to abort current transaction,
> > >> it should be happened before atomic_commit operation, which could easily
> > >> handle with abort_commit ioctl.
> > >>
> > >>>
> > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > >>>> power-cut will leave that data in disk.
> > >>>
> > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > >>
> > >> Journal could be corrupted as I described in b).
> > >
> > > Okay, so what I'm thinking is like this.
> > > It seems there are two corruption cases after journal writes.
> > >
> > > 1. power cut during atomic writes
> > >  - broken journal file and clean db file -> give up
> > >  - luckily, valid journal file and clean db file -> recover db
> > >
> > > 2. error during atomic writes
> > >  a. power-cut before abort completion
> > >   - broken journal file and broken db file -> revoking is needed!
> > >
> > >  b. after abort
> > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > >
> > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > But, I think, even if revoking is done, we should notify an error to abort and
> > > recover db by 2.b.
> > >
> > > Something like this after successful revoking.
> > >
> > > 1. power cut during atomic writes
> > >  - broken journal file and clean db file -> give up
> > >  - luckily, valid journal file and clean db file -> recover db
> > >
> > > 2. error during atomic writes w/ revoking
> > >  a. power-cut before abort completion
> > >   - broken journal file and clean db file -> give up
> > >   - luckily, valid journal file and clean db file -> recover db
> > >
> > >  b. after abort
> > >   - valid journal file and clean db file -> recover db
> > 
> > That's right.
> > 
> > >
> > > Let me verify these scenarios first. :)
> > 
> > OK. :)
> > 
> > Thanks,
> > 
> > >
> > > Thanks,
> > >
> > >>>
> > >>>> With revoking supported design, we can not solve all above issues, we will still
> > >>>> face the same issue like c), but it will be a big improve if we can apply this
> > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also for
> > >>>> e) case, we try to rescue data in first time that our revoking operation would be
> > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > >>>>
> > >>>> If you don't want to have a big change in this interface or recovery flow, how
> > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > >>>
> > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > >>
> > >> I believe new design will enhance in memory usage and error handling of sqlite
> > >> in android, and hope this can be applied. But, I can understand that if you
> > >> were considering risk control and backward compatibility, since this
> > >> change affects all atomic related ioctls.
> > >>
> > >>> For other databases, I can understand that they can use atomic_write without
> > >>> journal control, which is a sort of stand-alone atomic_write.
> > >>>
> > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > >>
> > >> You mean investigating or we can only start when there is a clear commercial
> > >> demand ?
> > >>
> > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > >>
> > >> Agreed :)
> > >>
> > >> Thanks,
> > >>
> > >>>
> > >>> Thanks,
> > >>>
> > >>>>
> > >>>> How do you think? :)
> > >>>>
> > >>>> Thanks,
> > >>>>
> > >>>>> But, unfortunately, it seems that something is missing in the
> > >>>>> current implementation.
> > >>>>>
> > >>>>> So simply how about this?
> > >>>>>
> > >>>>> A possible flow would be:
> > >>>>> 1. write journal data to volatile space
> > >>>>> 2. write db data to atomic space
> > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > >>>>>  - flush/fsync journal data to disk
> > >>>>>  - drop atomic data, and will be recovered by database with journal
> > >>>>>
> > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > >>>>>
> > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > >>>>>
> > >>>>> 1. drop atomic writes
> > >>>>>  - we don't need to keep any stale db data.
> > >>>>>
> > >>>>> 2. write journal data
> > >>>>>  - we should keep the journal data with fsync for db recovery.
> > >>>>>
> > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > >>>>> ---
> > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > >>>>>
> > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > >>>>> index 91f576a..d16438a 100644
> > >>>>> --- a/fs/f2fs/file.c
> > >>>>> +++ b/fs/f2fs/file.c
> > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> > >>>>>  	if (ret)
> > >>>>>  		return ret;
> > >>>>>
> > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > >>>>> -	commit_inmem_pages(inode, true);
> > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > >>>>> +		commit_inmem_pages(inode, true);
> > >>>>> +	}
> > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > >>>>> +		ret = commit_inmem_pages(inode, false);
> > >>>>> +		if (!ret)
> > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > >>>>> +	}
> > >>>>>
> > >>>>>  	mnt_drop_write_file(filp);
> > >>>>>  	return ret;
> > >>>>> --
> > >>>>> 2.6.3
> > >>>>
> > >
> > > ------------------------------------------------------------------------------
> > > _______________________________________________
> > > Linux-f2fs-devel mailing list
> > > Linux-f2fs-devel@lists.sourceforge.net
> > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > >
> > 
> > ------------------------------------------------------------------------------
> > _______________________________________________
> > Linux-f2fs-devel mailing list
> > Linux-f2fs-devel@lists.sourceforge.net
> > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
@ 2016-01-08 19:43                 ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2016-01-08 19:43 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> Any progress on this patch?

Swamped. Will do.

Thanks,

> 
> Thanks,
> 
> > -----Original Message-----
> > From: Chao Yu [mailto:chao@kernel.org]
> > Sent: Friday, January 01, 2016 8:14 PM
> > To: Jaegeuk Kim
> > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > 
> > Hi Jaegeuk,
> > 
> > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > Hi Chao,
> > >
> > > ...
> > >
> > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > >>>>>> f2fs support atomic write with following semantics:
> > >>>>>> 1. open db file
> > >>>>>> 2. ioctl start atomic write
> > >>>>>> 3. (write db file) * n
> > >>>>>> 4. ioctl commit atomic write
> > >>>>>> 5. close db file
> > >>>>>>
> > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > >>>>>> won't be persisted unless we commit them in step 4.
> > >>>>>>
> > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > >>>>>> we should use journal db to recover the original data in db file.
> > >>>>>
> > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > >>>>> since database should get its error literally.
> > >>>>>
> > >>>>> So, the only thing that we need to do is keeping journal data for further db
> > >>>>> recovery.
> > >>>>
> > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > >>>> kept by user, because f2fs already have it in its storage since we always
> > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > >>>> write process.
> > >>>
> > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > >>> writes for journal files should be used together to minimize sqlite change as
> > >>> much as possible.
> > >>>
> > >>>> But in current design, we still hold journal data in memory for recovering for
> > >>>> *rare* failure case. I think there are several issues:
> > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > >>>> db files were opened simultaneously, we are under big memory pressure.
> > >>>
> > >>> In current android, I've seen that this is not a big concern. Even there is
> > >>> memory pressure, f2fs flushes volatile pages.
> > >>
> > >> When I change to redirty all volatile pages in ->writepage, android seems go
> > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > >>
> > >> if (f2fs_is_volatile_file(inode))
> > >> 	goto redirty_out;
> > >
> > > Where did you put this? It doesn't flush at all? Why?
> > 
> > Original place in ->writepage, just remove two other conditions.
> > 
> > To avoid potential random writebacking of dirty page in journal which
> > cause unpredicted corrupting in journal.
> > 
> > > Practically, the peak amount of journal writes depend on how many transactions
> > > are processing concurrently.
> > > I mean, in-memory pages are dropped at the end of every transaction.
> > > You can check the number of pages through f2fs_stat on your phone.
> > >
> > >> I didn't dig details, but I think there may be a little risk for this design.
> > >>
> > >>>
> > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > >>>> disk, it will destroy db file.
> > >>>
> > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > >>
> > >> Normally, we keep pages of journal in memory, but partial page in journal
> > >> will be write out to device by reclaimer when out of memory. So this journal
> > >> may have valid data in its log head, but with corrupted data, then after
> > >> abnormal power-cut, recovery with this journal before a transaction will
> > >> destroy db. Right?
> > >
> > > Just think about sqlite without this feature.
> > > Broken journal is pretty normal case for sqlite.
> > 
> > Maybe, if it is caused by bug or design issue of software, no matter db system
> > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > 
> > >
> > >>>
> > >>>> c) Though, we have journal db file, we will face failure of recovering db file
> > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > >>>
> > >>> Do you mean the failure of recovering db with a complete journal?
> > >>> Why do we have to handle that? That's a database stuff, IMO.
> > >>
> > >> Yes, just list for indicating we will face the same issue which is hard to
> > >> handle both in original design and new design, so the inner revoking failure
> > >> issue would not be a weak point or flaw of new design.
> > >>
> > >>>
> > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > >>>> metadata stream, there should be more IOs than in inner revoking in
> > >>>> atomic-interface.
> > >>>
> > >>> Well, do you mean there is no need to recover db after revoking?
> > >>
> > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > >> revoking, recovery is no need.
> > >
> > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > redo the previous transaction. No?
> > 
> > As we know, in sqlite, before we commit a transaction, we will use journal to
> > record original data of pages which will be updated in following transaction, so
> > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > triggered by user, we will recover db with journal.
> > 
> > Ideally, if we support atomic write interface, in there should always return two
> > status in atomic write interface: success or fail. If success, transaction was
> > committed, otherwise, it looks like nothing happened, user will be told
> > transaction was failed. Then, journals in sqlite could no longer be used,
> > eventually no journal, no recovery.
> > 
> > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > revoking in commit interface since it could destroy db file permanently w/o
> > journal. IMO, some optimization could be done for these cases:
> > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > 
> > These optimizations couldn't guarantee no failure in revoking operation
> > completely, luckily, those are not common cases, and they also happen in sqlite
> > w/o atomic feature.
> > 
> > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > guess we can optimize DB like:
> > 1. reflink db to db.ref
> > 2. do transaction in db.ref
> >    - failed, rm db.ref
> >    - power-cut rm db.ref
> > 3. rename db.ref to db
> > 
> > >
> > >> One more case is that user can send a command to abort current transaction,
> > >> it should be happened before atomic_commit operation, which could easily
> > >> handle with abort_commit ioctl.
> > >>
> > >>>
> > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > >>>> power-cut will leave that data in disk.
> > >>>
> > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > >>
> > >> Journal could be corrupted as I described in b).
> > >
> > > Okay, so what I'm thinking is like this.
> > > It seems there are two corruption cases after journal writes.
> > >
> > > 1. power cut during atomic writes
> > >  - broken journal file and clean db file -> give up
> > >  - luckily, valid journal file and clean db file -> recover db
> > >
> > > 2. error during atomic writes
> > >  a. power-cut before abort completion
> > >   - broken journal file and broken db file -> revoking is needed!
> > >
> > >  b. after abort
> > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > >
> > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > But, I think, even if revoking is done, we should notify an error to abort and
> > > recover db by 2.b.
> > >
> > > Something like this after successful revoking.
> > >
> > > 1. power cut during atomic writes
> > >  - broken journal file and clean db file -> give up
> > >  - luckily, valid journal file and clean db file -> recover db
> > >
> > > 2. error during atomic writes w/ revoking
> > >  a. power-cut before abort completion
> > >   - broken journal file and clean db file -> give up
> > >   - luckily, valid journal file and clean db file -> recover db
> > >
> > >  b. after abort
> > >   - valid journal file and clean db file -> recover db
> > 
> > That's right.
> > 
> > >
> > > Let me verify these scenarios first. :)
> > 
> > OK. :)
> > 
> > Thanks,
> > 
> > >
> > > Thanks,
> > >
> > >>>
> > >>>> With revoking supported design, we can not solve all above issues, we will still
> > >>>> face the same issue like c), but it will be a big improve if we can apply this
> > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also for
> > >>>> e) case, we try to rescue data in first time that our revoking operation would be
> > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > >>>>
> > >>>> If you don't want to have a big change in this interface or recovery flow, how
> > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > >>>
> > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > >>
> > >> I believe new design will enhance in memory usage and error handling of sqlite
> > >> in android, and hope this can be applied. But, I can understand that if you
> > >> were considering risk control and backward compatibility, since this
> > >> change affects all atomic related ioctls.
> > >>
> > >>> For other databases, I can understand that they can use atomic_write without
> > >>> journal control, which is a sort of stand-alone atomic_write.
> > >>>
> > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > >>
> > >> You mean investigating or we can only start when there is a clear commercial
> > >> demand ?
> > >>
> > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > >>
> > >> Agreed :)
> > >>
> > >> Thanks,
> > >>
> > >>>
> > >>> Thanks,
> > >>>
> > >>>>
> > >>>> How do you think? :)
> > >>>>
> > >>>> Thanks,
> > >>>>
> > >>>>> But, unfortunately, it seems that something is missing in the
> > >>>>> current implementation.
> > >>>>>
> > >>>>> So simply how about this?
> > >>>>>
> > >>>>> A possible flow would be:
> > >>>>> 1. write journal data to volatile space
> > >>>>> 2. write db data to atomic space
> > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > >>>>>  - flush/fsync journal data to disk
> > >>>>>  - drop atomic data, and will be recovered by database with journal
> > >>>>>
> > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > >>>>>
> > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > >>>>>
> > >>>>> 1. drop atomic writes
> > >>>>>  - we don't need to keep any stale db data.
> > >>>>>
> > >>>>> 2. write journal data
> > >>>>>  - we should keep the journal data with fsync for db recovery.
> > >>>>>
> > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > >>>>> ---
> > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > >>>>>
> > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > >>>>> index 91f576a..d16438a 100644
> > >>>>> --- a/fs/f2fs/file.c
> > >>>>> +++ b/fs/f2fs/file.c
> > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> > >>>>>  	if (ret)
> > >>>>>  		return ret;
> > >>>>>
> > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > >>>>> -	commit_inmem_pages(inode, true);
> > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > >>>>> +		commit_inmem_pages(inode, true);
> > >>>>> +	}
> > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > >>>>> +		ret = commit_inmem_pages(inode, false);
> > >>>>> +		if (!ret)
> > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > >>>>> +	}
> > >>>>>
> > >>>>>  	mnt_drop_write_file(filp);
> > >>>>>  	return ret;
> > >>>>> --
> > >>>>> 2.6.3
> > >>>>
> > >
> > > ------------------------------------------------------------------------------
> > > _______________________________________________
> > > Linux-f2fs-devel mailing list
> > > Linux-f2fs-devel@lists.sourceforge.net
> > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > >
> > 
> > ------------------------------------------------------------------------------
> > _______________________________________________
> > Linux-f2fs-devel mailing list
> > Linux-f2fs-devel@lists.sourceforge.net
> > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

------------------------------------------------------------------------------
Site24x7 APM Insight: Get Deep Visibility into Application Performance
APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
Monitor end-to-end web transactions and take corrective actions now
Troubleshoot faster and improve end-user experience. Signup Now!
http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-01-08 19:43                 ` Jaegeuk Kim
  (?)
@ 2016-01-13  1:17                 ` Jaegeuk Kim
  2016-01-13  5:05                     ` Chao Yu
  -1 siblings, 1 reply; 28+ messages in thread
From: Jaegeuk Kim @ 2016-01-13  1:17 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

I just injected -EIO for one page among two pages in total into database file.
Then, I tested valid and invalid journal file to see how sqlite recovers the
transaction.

Interestingly, if journal is valid, database file is recovered, as I could see
the transaction result even after it shows EIO.
But, in the invalid journal case, somehow it drops database changes.
I'm not sure it was because I just skip second page write of database file tho.
(I added random bytes into journal pages.)
I'll break the database file with more random bytes likewise what I did for
journal.

Thanks,

On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > Hi Jaegeuk,
> > 
> > Any progress on this patch?
> 
> Swamped. Will do.
> 
> Thanks,
> 
> > 
> > Thanks,
> > 
> > > -----Original Message-----
> > > From: Chao Yu [mailto:chao@kernel.org]
> > > Sent: Friday, January 01, 2016 8:14 PM
> > > To: Jaegeuk Kim
> > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > 
> > > Hi Jaegeuk,
> > > 
> > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > Hi Chao,
> > > >
> > > > ...
> > > >
> > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > >>>>>> f2fs support atomic write with following semantics:
> > > >>>>>> 1. open db file
> > > >>>>>> 2. ioctl start atomic write
> > > >>>>>> 3. (write db file) * n
> > > >>>>>> 4. ioctl commit atomic write
> > > >>>>>> 5. close db file
> > > >>>>>>
> > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > >>>>>> won't be persisted unless we commit them in step 4.
> > > >>>>>>
> > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> > > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > >>>>>> we should use journal db to recover the original data in db file.
> > > >>>>>
> > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > >>>>> since database should get its error literally.
> > > >>>>>
> > > >>>>> So, the only thing that we need to do is keeping journal data for further db
> > > >>>>> recovery.
> > > >>>>
> > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > >>>> write process.
> > > >>>
> > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > >>> writes for journal files should be used together to minimize sqlite change as
> > > >>> much as possible.
> > > >>>
> > > >>>> But in current design, we still hold journal data in memory for recovering for
> > > >>>> *rare* failure case. I think there are several issues:
> > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > >>>
> > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > >>> memory pressure, f2fs flushes volatile pages.
> > > >>
> > > >> When I change to redirty all volatile pages in ->writepage, android seems to go
> > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > >>
> > > >> if (f2fs_is_volatile_file(inode))
> > > >> 	goto redirty_out;
> > > >
> > > > Where did you put this? It doesn't flush at all? Why?
> > > 
> > > Original place in ->writepage, just remove two other conditions.
> > > 
> > > To avoid potential random writebacking of dirty page in journal which
> > > cause unpredicted corrupting in journal.
> > > 
> > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > are processing concurrently.
> > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > You can check the number of pages through f2fs_stat on your phone.
> > > >
> > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > >>
> > > >>>
> > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > >>>> disk, it will destroy db file.
> > > >>>
> > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > >>
> > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > >> may have valid data in its log head, but with corrupted data, then after
> > > >> abnormal power-cut, recovery with this journal before a transaction will
> > > >> destroy db. Right?
> > > >
> > > > Just think about sqlite without this feature.
> > > > Broken journal is pretty normal case for sqlite.
> > > 
> > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > 
> > > >
> > > >>>
> > > >>>> c) Though, we have journal db file, we will face failure of recovering db file
> > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > >>>
> > > >>> Do you mean the failure of recovering db with a complete journal?
> > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > >>
> > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > >> handle both in original design and new design, so the inner revoking failure
> > > >> issue would not be a weak point or flaw of new design.
> > > >>
> > > >>>
> > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > >>>> atomic-interface.
> > > >>>
> > > >>> Well, do you mean there is no need to recover db after revoking?
> > > >>
> > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > >> revoking, recovery is no need.
> > > >
> > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > redo the previous transaction. No?
> > > 
> > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > record original data of pages which will be updated in following transaction, so
> > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > triggered by user, we will recover db with journal.
> > > 
> > > Ideally, if we support atomic write interface, in there should always return two
> > > status in atomic write interface: success or fail. If success, transaction was
> > > committed, otherwise, it looks like nothing happened, user will be told
> > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > eventually no journal, no recovery.
> > > 
> > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > revoking in commit interface since it could destroy db file permanently w/o
> > > journal. IMO, some optimization could be done for these cases:
> > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > 
> > > These optimizations couldn't guarantee no failure in revoking operation
> > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > w/o atomic feature.
> > > 
> > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > guess we can optimize DB like:
> > > 1. reflink db to db.ref
> > > 2. do transaction in db.ref
> > >    - failed, rm db.ref
> > >    - power-cut rm db.ref
> > > 3. rename db.ref to db
> > > 
> > > >
> > > >> One more case is that user can send a command to abort current transaction,
> > > >> it should be happened before atomic_commit operation, which could easily
> > > >> handle with abort_commit ioctl.
> > > >>
> > > >>>
> > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > >>>> power-cut will leave that data in disk.
> > > >>>
> > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > >>
> > > >> Journal could be corrupted as I described in b).
> > > >
> > > > Okay, so what I'm thinking is like this.
> > > > It seems there are two corruption cases after journal writes.
> > > >
> > > > 1. power cut during atomic writes
> > > >  - broken journal file and clean db file -> give up
> > > >  - luckily, valid journal file and clean db file -> recover db
> > > >
> > > > 2. error during atomic writes
> > > >  a. power-cut before abort completion
> > > >   - broken journal file and broken db file -> revoking is needed!
> > > >
> > > >  b. after abort
> > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > >
> > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > recover db by 2.b.
> > > >
> > > > Something like this after successful revoking.
> > > >
> > > > 1. power cut during atomic writes
> > > >  - broken journal file and clean db file -> give up
> > > >  - luckily, valid journal file and clean db file -> recover db
> > > >
> > > > 2. error during atomic writes w/ revoking
> > > >  a. power-cut before abort completion
> > > >   - broken journal file and clean db file -> give up
> > > >   - luckily, valid journal file and clean db file -> recover db
> > > >
> > > >  b. after abort
> > > >   - valid journal file and clean db file -> recover db
> > > 
> > > That's right.
> > > 
> > > >
> > > > Let me verify these scenarios first. :)
> > > 
> > > OK. :)
> > > 
> > > Thanks,
> > > 
> > > >
> > > > Thanks,
> > > >
> > > >>>
> > > >>>> With revoking supported design, we can not solve all above issues, we will still
> > > >>>> face the same issue like c), but it will be a big improve if we can apply this
> > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also for
> > > >>>> e) case, we try to rescue data in first time that our revoking operation would be
> > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > >>>>
> > > >>>> If you don't want to have a big change in this interface or recovery flow, how
> > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > >>>
> > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > >>
> > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > >> in android, and hope this can be applied. But, I can understand that if you
> > > >> were considering risk control and backward compatibility, since this
> > > >> change affects all atomic related ioctls.
> > > >>
> > > >>> For other databases, I can understand that they can use atomic_write without
> > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > >>>
> > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > >>
> > > >> You mean investigating or we can only start when there is a clear commercial
> > > >> demand ?
> > > >>
> > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > >>
> > > >> Agreed :)
> > > >>
> > > >> Thanks,
> > > >>
> > > >>>
> > > >>> Thanks,
> > > >>>
> > > >>>>
> > > >>>> How do you think? :)
> > > >>>>
> > > >>>> Thanks,
> > > >>>>
> > > >>>>> But, unfortunately, it seems that something is missing in the
> > > >>>>> current implementation.
> > > >>>>>
> > > >>>>> So simply how about this?
> > > >>>>>
> > > >>>>> A possible flow would be:
> > > >>>>> 1. write journal data to volatile space
> > > >>>>> 2. write db data to atomic space
> > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > > >>>>>  - flush/fsync journal data to disk
> > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > >>>>>
> > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > >>>>>
> > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > >>>>>
> > > >>>>> 1. drop atomic writes
> > > >>>>>  - we don't need to keep any stale db data.
> > > >>>>>
> > > >>>>> 2. write journal data
> > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > >>>>>
> > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > >>>>> ---
> > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > >>>>>
> > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > >>>>> index 91f576a..d16438a 100644
> > > >>>>> --- a/fs/f2fs/file.c
> > > >>>>> +++ b/fs/f2fs/file.c
> > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> > > >>>>>  	if (ret)
> > > >>>>>  		return ret;
> > > >>>>>
> > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > >>>>> -	commit_inmem_pages(inode, true);
> > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > >>>>> +		commit_inmem_pages(inode, true);
> > > >>>>> +	}
> > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > >>>>> +		if (!ret)
> > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > >>>>> +	}
> > > >>>>>
> > > >>>>>  	mnt_drop_write_file(filp);
> > > >>>>>  	return ret;
> > > >>>>> --
> > > >>>>> 2.6.3
> > > >>>>
> > > >
> > > > ------------------------------------------------------------------------------
> > > > _______________________________________________
> > > > Linux-f2fs-devel mailing list
> > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > >
> > > 
> > > ------------------------------------------------------------------------------
> > > _______________________________________________
> > > Linux-f2fs-devel mailing list
> > > Linux-f2fs-devel@lists.sourceforge.net
> > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> 
> ------------------------------------------------------------------------------
> Site24x7 APM Insight: Get Deep Visibility into Application Performance
> APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> Monitor end-to-end web transactions and take corrective actions now
> Troubleshoot faster and improve end-user experience. Signup Now!
> http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> _______________________________________________
> Linux-f2fs-devel mailing list
> Linux-f2fs-devel@lists.sourceforge.net
> https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-01-13  1:17                 ` [f2fs-dev] " Jaegeuk Kim
@ 2016-01-13  5:05                     ` Chao Yu
  0 siblings, 0 replies; 28+ messages in thread
From: Chao Yu @ 2016-01-13  5:05 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Wednesday, January 13, 2016 9:18 AM
> To: Chao Yu
> Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> 
> Hi Chao,
> 
> I just injected -EIO for one page among two pages in total into database file.
> Then, I tested valid and invalid journal file to see how sqlite recovers the
> transaction.
> 
> Interestingly, if journal is valid, database file is recovered, as I could see
> the transaction result even after it shows EIO.
> But, in the invalid journal case, somehow it drops database changes.

If journal has valid data in its header and corrupted data in its body, sqlite will
recover db file from corrupted journal file, then db file will be corrupted.
So what you mean is: after recovery, the db file is still fine? Or does sqlite fail to
recover because it drops the journal data since the header of the journal is not valid?

Thanks,

> I'm not sure it was because I just skip second page write of database file tho.
> (I added random bytes into journal pages.)
> I'll break the database file with more random bytes likewise what I did for
> journal.
> 
> Thanks,
> 
> On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > Hi Jaegeuk,
> > >
> > > Any progress on this patch?
> >
> > Swamped. Will do.
> >
> > Thanks,
> >
> > >
> > > Thanks,
> > >
> > > > -----Original Message-----
> > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > To: Jaegeuk Kim
> > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > >
> > > > Hi Jaegeuk,
> > > >
> > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > Hi Chao,
> > > > >
> > > > > ...
> > > > >
> > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > >>>>>> f2fs support atomic write with following semantics:
> > > > >>>>>> 1. open db file
> > > > >>>>>> 2. ioctl start atomic write
> > > > >>>>>> 3. (write db file) * n
> > > > >>>>>> 4. ioctl commit atomic write
> > > > >>>>>> 5. close db file
> > > > >>>>>>
> > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > >>>>>>
> > > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> > > > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > >>>>>
> > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > >>>>> since database should get its error literally.
> > > > >>>>>
> > > > >>>>> So, the only thing that we need to do is keeping journal data for further db
> > > > >>>>> recovery.
> > > > >>>>
> > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > > >>>> write process.
> > > > >>>
> > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > >>> writes for journal files should be used together to minimize sqlite change as
> > > > >>> much as possible.
> > > > >>>
> > > > >>>> But in current design, we still hold journal data in memory for recovering for
> > > > >>>> *rare* failure case. I think there are several issues:
> > > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > >>>
> > > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > >>
> > > > >> When I change to redirty all volatile pages in ->writepage, android seems to go
> > > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > > >>
> > > > >> if (f2fs_is_volatile_file(inode))
> > > > >> 	goto redirty_out;
> > > > >
> > > > > Where did you put this? It doesn't flush at all? Why?
> > > >
> > > > Original place in ->writepage, just remove two other conditions.
> > > >
> > > > To avoid potential random writebacking of dirty page in journal which
> > > > cause unpredicted corrupting in journal.
> > > >
> > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > are processing concurrently.
> > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > >
> > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > >>
> > > > >>>
> > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > > >>>> disk, it will destroy db file.
> > > > >>>
> > > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > > >>
> > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > >> abnormal power-cut, recovery with this journal before a transaction will
> > > > >> destroy db. Right?
> > > > >
> > > > > Just think about sqlite without this feature.
> > > > > Broken journal is pretty normal case for sqlite.
> > > >
> > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > >
> > > > >
> > > > >>>
> > > > >>>> c) Though, we have journal db file, we will face failure of recovering db file
> > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > >>>
> > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > >>
> > > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > > >> handle both in original design and new design, so the inner revoking failure
> > > > >> issue would not be a weak point or flaw of new design.
> > > > >>
> > > > >>>
> > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > >>>> atomic-interface.
> > > > >>>
> > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > >>
> > > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > > >> revoking, recovery is no need.
> > > > >
> > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > redo the previous transaction. No?
> > > >
> > > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > > record original data of pages which will be updated in following transaction, so
> > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > triggered by user, we will recover db with journal.
> > > >
> > > > Ideally, if we support atomic write interface, in there should always return two
> > > > status in atomic write interface: success or fail. If success, transaction was
> > > > committed, otherwise, it looks like nothing happened, user will be told
> > > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > > eventually no journal, no recovery.
> > > >
> > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > journal. IMO, some optimization could be done for these cases:
> > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > >
> > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > w/o atomic feature.
> > > >
> > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > > guess we can optimize DB like:
> > > > 1. reflink db to db.ref
> > > > 2. do transaction in db.ref
> > > >    - failed, rm db.ref
> > > >    - power-cut rm db.ref
> > > > 3. rename db.ref to db
> > > >
> > > > >
> > > > >> One more case is that user can send a command to abort current transaction,
> > > > >> it should be happened before atomic_commit operation, which could easily
> > > > >> handle with abort_commit ioctl.
> > > > >>
> > > > >>>
> > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > > >>>> power-cut will leave that data in disk.
> > > > >>>
> > > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > > >>
> > > > >> Journal could be corrupted as I described in b).
> > > > >
> > > > > Okay, so what I'm thinking is like this.
> > > > > It seems there are two corruption cases after journal writes.
> > > > >
> > > > > 1. power cut during atomic writes
> > > > >  - broken journal file and clean db file -> give up
> > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > >
> > > > > 2. error during atomic writes
> > > > >  a. power-cut before abort completion
> > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > >
> > > > >  b. after abort
> > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > >
> > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > > recover db by 2.b.
> > > > >
> > > > > Something like this after successful revoking.
> > > > >
> > > > > 1. power cut during atomic writes
> > > > >  - broken journal file and clean db file -> give up
> > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > >
> > > > > 2. error during atomic writes w/ revoking
> > > > >  a. power-cut before abort completion
> > > > >   - broken journal file and clean db file -> give up
> > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > >
> > > > >  b. after abort
> > > > >   - valid journal file and clean db file -> recover db
> > > >
> > > > That's right.
> > > >
> > > > >
> > > > > Let me verify these scenarios first. :)
> > > >
> > > > OK. :)
> > > >
> > > > Thanks,
> > > >
> > > > >
> > > > > Thanks,
> > > > >
> > > > >>>
> > > > >>>> With revoking supported design, we can not solve all above issues, we will still
> > > > >>>> face the same issue like c), but it will be a big improve if we can apply this
> > > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also for
> > > > >>>> e) case, we try to rescue data in first time that our revoking operation would be
> > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > >>>>
> > > > >>>> If you don't want to have a big change in this interface or recovery flow, how
> > > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > > >>>
> > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > >>
> > > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > > >> in android, and hope this can be applied. But, I can understand that if you
> > > > >> were considering risk control and backward compatibility, since this
> > > > >> change affects all atomic related ioctls.
> > > > >>
> > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > >>>
> > > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > >>
> > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > >> demand ?
> > > > >>
> > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > >>
> > > > >> Agreed :)
> > > > >>
> > > > >> Thanks,
> > > > >>
> > > > >>>
> > > > >>> Thanks,
> > > > >>>
> > > > >>>>
> > > > >>>> How do you think? :)
> > > > >>>>
> > > > >>>> Thanks,
> > > > >>>>
> > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > >>>>> current implementation.
> > > > >>>>>
> > > > >>>>> So simply how about this?
> > > > >>>>>
> > > > >>>>> A possible flow would be:
> > > > >>>>> 1. write journal data to volatile space
> > > > >>>>> 2. write db data to atomic space
> > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > > > >>>>>  - flush/fsync journal data to disk
> > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > >>>>>
> > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > >>>>>
> > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > >>>>>
> > > > >>>>> 1. drop atomic writes
> > > > >>>>>  - we don't need to keep any stale db data.
> > > > >>>>>
> > > > >>>>> 2. write journal data
> > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > >>>>>
> > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > >>>>> ---
> > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > >>>>>
> > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > >>>>> index 91f576a..d16438a 100644
> > > > >>>>> --- a/fs/f2fs/file.c
> > > > >>>>> +++ b/fs/f2fs/file.c
> > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> > > > >>>>>  	if (ret)
> > > > >>>>>  		return ret;
> > > > >>>>>
> > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > >>>>> +	}
> > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > >>>>> +		if (!ret)
> > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > >>>>> +	}
> > > > >>>>>
> > > > >>>>>  	mnt_drop_write_file(filp);
> > > > >>>>>  	return ret;
> > > > >>>>> --
> > > > >>>>> 2.6.3
> > > > >>>>
> > > > >
> > > > > ------------------------------------------------------------------------------
> > > > > _______________________________________________
> > > > > Linux-f2fs-devel mailing list
> > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > >
> > > >
> > > > ------------------------------------------------------------------------------
> > > > _______________________________________________
> > > > Linux-f2fs-devel mailing list
> > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> >
> > ------------------------------------------------------------------------------
> > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > Monitor end-to-end web transactions and take corrective actions now
> > Troubleshoot faster and improve end-user experience. Signup Now!
> > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > _______________________________________________
> > Linux-f2fs-devel mailing list
> > Linux-f2fs-devel@lists.sourceforge.net
> > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
@ 2016-01-13  5:05                     ` Chao Yu
  0 siblings, 0 replies; 28+ messages in thread
From: Chao Yu @ 2016-01-13  5:05 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Wednesday, January 13, 2016 9:18 AM
> To: Chao Yu
> Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> 
> Hi Chao,
> 
> I just injected -EIO for one page among two pages in total into database file.
> Then, I tested valid and invalid journal file to see how sqlite recovers the
> transaction.
> 
> Interestingly, if journal is valid, database file is recovered, as I could see
> the transaction result even after it shows EIO.
> But, in the invalid journal case, somehow it drops database changes.

If the journal has valid data in its header but corrupted data in its body, sqlite will
recover the db file from the corrupted journal file, so the db file will become corrupted.
So do you mean that, after recovery, the db file is still fine? Or that sqlite fails to
recover because it drops the journal data, since the journal header is not valid?

Thanks,

> I'm not sure it was because I just skip second page write of database file tho.
> (I added random bytes into journal pages.)
> I'll break the database file with more random bytes likewise what I did for
> journal.
> 
> Thanks,
> 
> On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > Hi Jaegeuk,
> > >
> > > Any progress on this patch?
> >
> > Swamped. Will do.
> >
> > Thanks,
> >
> > >
> > > Thanks,
> > >
> > > > -----Original Message-----
> > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > To: Jaegeuk Kim
> > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > >
> > > > Hi Jaegeuk,
> > > >
> > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > Hi Chao,
> > > > >
> > > > > ...
> > > > >
> > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > >>>>>> f2fs support atomic write with following semantics:
> > > > >>>>>> 1. open db file
> > > > >>>>>> 2. ioctl start atomic write
> > > > >>>>>> 3. (write db file) * n
> > > > >>>>>> 4. ioctl commit atomic write
> > > > >>>>>> 5. close db file
> > > > >>>>>>
> > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > >>>>>>
> > > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> > > > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > >>>>>
> > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > >>>>> since database should get its error literally.
> > > > >>>>>
> > > > >>>>> So, the only thing that we need to do is keeping journal data for further db
> > > > >>>>> recovery.
> > > > >>>>
> > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > > >>>> write process.
> > > > >>>
> > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > >>> writes for journal files should be used together to minimize sqlite change as
> > > > >>> much as possible.
> > > > >>>
> > > > >>>> But in current design, we still hold journal data in memory for recovering for
> > > > >>>> *rare* failure case. I think there are several issues:
> > > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > >>>
> > > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > >>
> > > > >> When I change to redirty all volatile pages in ->writepage, android seems go
> > > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > > >>
> > > > >> if (f2fs_is_volatile_file(inode))
> > > > >> 	goto redirty_out;
> > > > >
> > > > > Where did you put this? It doesn't flush at all? Why?
> > > >
> > > > Original place in ->writepage, just remove two other conditions.
> > > >
> > > > To avoid potential random writebacking of dirty page in journal which
> > > > cause unpredicted corrupting in journal.
> > > >
> > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > are processing concurrently.
> > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > >
> > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > >>
> > > > >>>
> > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > > >>>> disk, it will destroy db file.
> > > > >>>
> > > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > > >>
> > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > >> abnormal power-cut, recovery with this journal before a transaction will
> > > > >> destroy db. Right?
> > > > >
> > > > > Just think about sqlite without this feature.
> > > > > Broken journal is pretty normal case for sqlite.
> > > >
> > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > >
> > > > >
> > > > >>>
> > > > >>>> c) Though, we have journal db file, we will face failure of recovering db file
> > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > >>>
> > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > >>
> > > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > > >> handle both in original design and new design, so the inner revoking failure
> > > > >> issue would not be a weak point or flaw of new design.
> > > > >>
> > > > >>>
> > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > >>>> atomic-interface.
> > > > >>>
> > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > >>
> > > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > > >> revoking, recovery is no need.
> > > > >
> > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > redo the previous transaction. No?
> > > >
> > > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > > record original data of pages which will be updated in following transaction, so
> > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > triggered by user, we will recover db with journal.
> > > >
> > > > Ideally, if we support atomic write interface, in there should always return two
> > > > status in atomic write interface: success or fail. If success, transaction was
> > > > committed, otherwise, it looks like nothing happened, user will be told
> > > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > > eventually no journal, no recovery.
> > > >
> > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > journal. IMO, some optimization could be done for these cases:
> > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > >
> > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > w/o atomic feature.
> > > >
> > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > > guess we can optimize DB like:
> > > > 1. reflink db to db.ref
> > > > 2. do transaction in db.ref
> > > >    - failed, rm db.ref
> > > >    - power-cut rm db.ref
> > > > 3. rename db.ref to db
> > > >
> > > > >
> > > > >> One more case is that user can send a command to abort current transaction,
> > > > >> it should be happened before atomic_commit operation, which could easily
> > > > >> handle with abort_commit ioctl.
> > > > >>
> > > > >>>
> > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > > >>>> power-cut will leave that data in disk.
> > > > >>>
> > > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > > >>
> > > > >> Journal could be corrupted as I descripted in b).
> > > > >
> > > > > Okay, so what I'm thinking is like this.
> > > > > It seems there are two corruption cases after journal writes.
> > > > >
> > > > > 1. power cut during atomic writes
> > > > >  - broken journal file and clean db file -> give up
> > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > >
> > > > > 2. error during atomic writes
> > > > >  a. power-cut before abort completion
> > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > >
> > > > >  b. after abort
> > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > >
> > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > > recover db by 2.b.
> > > > >
> > > > > Something like this after successful revoking.
> > > > >
> > > > > 1. power cut during atomic writes
> > > > >  - broken journal file and clean db file -> give up
> > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > >
> > > > > 2. error during atomic writes w/ revoking
> > > > >  a. power-cut before abort completion
> > > > >   - broken journal file and clean db file -> give up
> > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > >
> > > > >  b. after abort
> > > > >   - valid journal file and clean db file -> recover db
> > > >
> > > > That's right.
> > > >
> > > > >
> > > > > Let me verify these scenarios first. :)
> > > >
> > > > OK. :)
> > > >
> > > > Thanks,
> > > >
> > > > >
> > > > > Thanks,
> > > > >
> > > > >>>
> > > > >>>> With revoking supported design, we can not solve all above issues, we will still
> > > > >>>> face the same issue like c), but it will be a big improve if we can apply this
> > > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also for
> > > > >>>> e) case, we try to rescue data in first time that our revoking operation would be
> > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > >>>>
> > > > >>>> If you don't want to have a big change in this interface or recovery flow, how
> > > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > > >>>
> > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > >>
> > > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > > >> in android, and hope this can be applied. But, I can understand that if you
> > > > >> were considerring about risk control and backward compatibility, since this
> > > > >> change affects all atomic related ioctls.
> > > > >>
> > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > >>>
> > > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > >>
> > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > >> demand ?
> > > > >>
> > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > >>
> > > > >> Agreed :)
> > > > >>
> > > > >> Thanks,
> > > > >>
> > > > >>>
> > > > >>> Thanks,
> > > > >>>
> > > > >>>>
> > > > >>>> How do you think? :)
> > > > >>>>
> > > > >>>> Thanks,
> > > > >>>>
> > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > >>>>> current implementation.
> > > > >>>>>
> > > > >>>>> So simply how about this?
> > > > >>>>>
> > > > >>>>> A possible flow would be:
> > > > >>>>> 1. write journal data to volatile space
> > > > >>>>> 2. write db data to atomic space
> > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > > > >>>>>  - flush/fsync journal data to disk
> > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > >>>>>
> > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > >>>>>
> > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > >>>>>
> > > > >>>>> 1. drop atomic writes
> > > > >>>>>  - we don't need to keep any stale db data.
> > > > >>>>>
> > > > >>>>> 2. write journal data
> > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > >>>>>
> > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > >>>>> ---
> > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > >>>>>
> > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > >>>>> index 91f576a..d16438a 100644
> > > > >>>>> --- a/fs/f2fs/file.c
> > > > >>>>> +++ b/fs/f2fs/file.c
> > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> > > > >>>>>  	if (ret)
> > > > >>>>>  		return ret;
> > > > >>>>>
> > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > >>>>> +	}
> > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > >>>>> +		if (!ret)
> > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > >>>>> +	}
> > > > >>>>>
> > > > >>>>>  	mnt_drop_write_file(filp);
> > > > >>>>>  	return ret;
> > > > >>>>> --
> > > > >>>>> 2.6.3
> > > > >>>>
> > > > >
> > > > > ------------------------------------------------------------------------------
> > > > > _______________________________________________
> > > > > Linux-f2fs-devel mailing list
> > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > >
> > > >
> > > > ------------------------------------------------------------------------------
> > > > _______________________________________________
> > > > Linux-f2fs-devel mailing list
> > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> >
> > ------------------------------------------------------------------------------
> > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > Monitor end-to-end web transactions and take corrective actions now
> > Troubleshoot faster and improve end-user experience. Signup Now!
> > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > _______________________________________________
> > Linux-f2fs-devel mailing list
> > Linux-f2fs-devel@lists.sourceforge.net
> > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel


------------------------------------------------------------------------------
Site24x7 APM Insight: Get Deep Visibility into Application Performance
APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
Monitor end-to-end web transactions and take corrective actions now
Troubleshoot faster and improve end-user experience. Signup Now!
http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-01-13  5:05                     ` Chao Yu
@ 2016-01-15  0:03                       ` Jaegeuk Kim
  -1 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2016-01-15  0:03 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

On Wed, Jan 13, 2016 at 01:05:01PM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Wednesday, January 13, 2016 9:18 AM
> > To: Chao Yu
> > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > 
> > Hi Chao,
> > 
> > I just injected -EIO for one page among two pages in total into database file.
> > Then, I tested valid and invalid journal file to see how sqlite recovers the
> > transaction.
> > 
> > Interestingly, if journal is valid, database file is recovered, as I could see
> > the transaction result even after it shows EIO.
> > But, in the invalid journal case, somehow it drops database changes.
> 
> If journal has valid data in its header and corrupted data in its body, sqlite will
> recover db file from corrupted journal file, then db file will be corrupted.
> So what you mean is: after recovery, db file still be fine? or sqlite fails to
> recover due to drop data in journal since the header of journal is not valid?

In the above case, I think I made a broken journal header. At the same time, I
broke the database file too, but I could see that the database file was recovered,
as if rolled back. I couldn't find any corruption in the database.

Okay, I'll test again by corrupting journal body with valid header.

Thanks,

> 
> Thanks,
> 
> > I'm not sure it was because I just skip second page write of database file tho.
> > (I added random bytes into journal pages.)
> > I'll break the database file with more random bytes likewise what I did for
> > journal.
> > 
> > Thanks,
> > 
> > On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > > Hi Jaegeuk,
> > > >
> > > > Any progress on this patch?
> > >
> > > Swamped. Will do.
> > >
> > > Thanks,
> > >
> > > >
> > > > Thanks,
> > > >
> > > > > -----Original Message-----
> > > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > > To: Jaegeuk Kim
> > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > >
> > > > > Hi Jaegeuk,
> > > > >
> > > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > > Hi Chao,
> > > > > >
> > > > > > ...
> > > > > >
> > > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > >>>>>> f2fs support atomic write with following semantics:
> > > > > >>>>>> 1. open db file
> > > > > >>>>>> 2. ioctl start atomic write
> > > > > >>>>>> 3. (write db file) * n
> > > > > >>>>>> 4. ioctl commit atomic write
> > > > > >>>>>> 5. close db file
> > > > > >>>>>>
> > > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > > >>>>>>
> > > > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> > > > > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > > > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > > >>>>>
> > > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > > >>>>> since database should get its error literally.
> > > > > >>>>>
> > > > > >>>>> So, the only thing that we need to do is keeping journal data for further db
> > > > > >>>>> recovery.
> > > > > >>>>
> > > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > > > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > > > >>>> write process.
> > > > > >>>
> > > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > > >>> writes for journal files should be used together to minimize sqlite change as
> > > > > >>> much as possible.
> > > > > >>>
> > > > > >>>> But in current design, we still hold journal data in memory for recovering for
> > > > > >>>> *rare* failure case. I think there are several issues:
> > > > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > > >>>
> > > > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > > >>
> > > > > >> When I change to redirty all volatile pages in ->writepage, android seems go
> > > > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > > > >>
> > > > > >> if (f2fs_is_volatile_file(inode))
> > > > > >> 	goto redirty_out;
> > > > > >
> > > > > > Where did you put this? It doesn't flush at all? Why?
> > > > >
> > > > > Original place in ->writepage, just remove two other conditions.
> > > > >
> > > > > To avoid potential random writebacking of dirty page in journal which
> > > > > cause unpredicted corrupting in journal.
> > > > >
> > > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > > are processing concurrently.
> > > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > > >
> > > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > > >>
> > > > > >>>
> > > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > > > >>>> disk, it will destroy db file.
> > > > > >>>
> > > > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > > > >>
> > > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > > >> abnormal power-cut, recovery with this journal before a transaction will
> > > > > >> destroy db. Right?
> > > > > >
> > > > > > Just think about sqlite without this feature.
> > > > > > Broken journal is pretty normal case for sqlite.
> > > > >
> > > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > > >
> > > > > >
> > > > > >>>
> > > > > >>>> c) Though, we have journal db file, we will face failure of recovering db file
> > > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > > >>>
> > > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > > >>
> > > > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > > > >> handle both in original design and new design, so the inner revoking failure
> > > > > >> issue would not be a weak point or flaw of new design.
> > > > > >>
> > > > > >>>
> > > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > > >>>> atomic-interface.
> > > > > >>>
> > > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > > >>
> > > > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > > > >> revoking, recovery is no need.
> > > > > >
> > > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > > redo the previous transaction. No?
> > > > >
> > > > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > > > record original data of pages which will be updated in following transaction, so
> > > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > > triggered by user, we will recover db with journal.
> > > > >
> > > > > Ideally, if we support atomic write interface, in there should always return two
> > > > > status in atomic write interface: success or fail. If success, transaction was
> > > > > committed, otherwise, it looks like nothing happened, user will be told
> > > > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > > > eventually no journal, no recovery.
> > > > >
> > > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > > journal. IMO, some optimization could be done for these cases:
> > > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > > >
> > > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > > w/o atomic feature.
> > > > >
> > > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > > > guess we can optimize DB like:
> > > > > 1. reflink db to db.ref
> > > > > 2. do transaction in db.ref
> > > > >    - failed, rm db.ref
> > > > >    - power-cut rm db.ref
> > > > > 3. rename db.ref to db
> > > > >
> > > > > >
> > > > > >> One more case is that user can send a command to abort current transaction,
> > > > > >> it should be happened before atomic_commit operation, which could easily
> > > > > >> handle with abort_commit ioctl.
> > > > > >>
> > > > > >>>
> > > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > > > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > > > >>>> power-cut will leave that data in disk.
> > > > > >>>
> > > > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > > > >>
> > > > > >> Journal could be corrupted as I descripted in b).
> > > > > >
> > > > > > Okay, so what I'm thinking is like this.
> > > > > > It seems there are two corruption cases after journal writes.
> > > > > >
> > > > > > 1. power cut during atomic writes
> > > > > >  - broken journal file and clean db file -> give up
> > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > >
> > > > > > 2. error during atomic writes
> > > > > >  a. power-cut before abort completion
> > > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > > >
> > > > > >  b. after abort
> > > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > > >
> > > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > > > recover db by 2.b.
> > > > > >
> > > > > > Something like this after successful revoking.
> > > > > >
> > > > > > 1. power cut during atomic writes
> > > > > >  - broken journal file and clean db file -> give up
> > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > >
> > > > > > 2. error during atomic writes w/ revoking
> > > > > >  a. power-cut before abort completion
> > > > > >   - broken journal file and clean db file -> give up
> > > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > > >
> > > > > >  b. after abort
> > > > > >   - valid journal file and clean db file -> recover db
> > > > >
> > > > > That's right.
> > > > >
> > > > > >
> > > > > > Let me verify these scenarios first. :)
> > > > >
> > > > > OK. :)
> > > > >
> > > > > Thanks,
> > > > >
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > >>>
> > > > > >>>> With revoking supported design, we can not solve all above issues, we will still
> > > > > >>>> face the same issue like c), but it will be a big improve if we can apply this
> > > > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also for
> > > > > >>>> e) case, we try to rescue data in first time that our revoking operation would be
> > > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > > >>>>
> > > > > >>>> If you don't want to have a big change in this interface or recovery flow, how
> > > > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > > > >>>
> > > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > > >>
> > > > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > > > >> in android, and hope this can be applied. But, I can understand that if you
> > > > > >> were considerring about risk control and backward compatibility, since this
> > > > > >> change affects all atomic related ioctls.
> > > > > >>
> > > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > > >>>
> > > > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > > >>
> > > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > > > >> demand?
> > > > > >>
> > > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > > >>
> > > > > >> Agreed :)
> > > > > >>
> > > > > >> Thanks,
> > > > > >>
> > > > > >>>
> > > > > >>> Thanks,
> > > > > >>>
> > > > > >>>>
> > > > > >>>> How do you think? :)
> > > > > >>>>
> > > > > >>>> Thanks,
> > > > > >>>>
> > > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > > >>>>> current implementation.
> > > > > >>>>>
> > > > > >>>>> So simply how about this?
> > > > > >>>>>
> > > > > >>>>> A possible flow would be:
> > > > > >>>>> 1. write journal data to volatile space
> > > > > >>>>> 2. write db data to atomic space
> > > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > > > > >>>>>  - flush/fsync journal data to disk
> > > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > > >>>>>
> > > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > > >>>>>
> > > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > > >>>>>
> > > > > >>>>> 1. drop atomic writes
> > > > > >>>>>  - we don't need to keep any stale db data.
> > > > > >>>>>
> > > > > >>>>> 2. write journal data
> > > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > > >>>>>
> > > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > >>>>> ---
> > > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > > >>>>>
> > > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > > >>>>> index 91f576a..d16438a 100644
> > > > > >>>>> --- a/fs/f2fs/file.c
> > > > > >>>>> +++ b/fs/f2fs/file.c
> > > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> > > > > >>>>>  	if (ret)
> > > > > >>>>>  		return ret;
> > > > > >>>>>
> > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > > >>>>> +	}
> > > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > > >>>>> +		if (!ret)
> > > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > > >>>>> +	}
> > > > > >>>>>
> > > > > >>>>>  	mnt_drop_write_file(filp);
> > > > > >>>>>  	return ret;
> > > > > >>>>> --
> > > > > >>>>> 2.6.3
> > > > > >>>>
> > > > > >
> > > > > > ------------------------------------------------------------------------------
> > > > > > _______________________________________________
> > > > > > Linux-f2fs-devel mailing list
> > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > >
> > > > >
> > > > > ------------------------------------------------------------------------------
> > > > > _______________________________________________
> > > > > Linux-f2fs-devel mailing list
> > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > >
> > > ------------------------------------------------------------------------------
> > > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > > Monitor end-to-end web transactions and take corrective actions now
> > > Troubleshoot faster and improve end-user experience. Signup Now!
> > > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > > _______________________________________________
> > > Linux-f2fs-devel mailing list
> > > Linux-f2fs-devel@lists.sourceforge.net
> > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
@ 2016-01-15  0:03                       ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2016-01-15  0:03 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

On Wed, Jan 13, 2016 at 01:05:01PM +0800, Chao Yu wrote:
> Hi Jaegeuk,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Wednesday, January 13, 2016 9:18 AM
> > To: Chao Yu
> > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > 
> > Hi Chao,
> > 
> > I just injected -EIO for one page among two pages in total into database file.
> > Then, I tested valid and invalid journal file to see how sqlite recovers the
> > transaction.
> > 
> > Interestingly, if journal is valid, database file is recovered, as I could see
> > the transaction result even after it shows EIO.
> > But, in the invalid journal case, somehow it drops database changes.
> 
> If journal has valid data in its header and corrupted data in its body, sqlite will
> recover db file from corrupted journal file, then db file will be corrupted.
> So what you mean is: after recovery, db file still be fine? or sqlite fails to
> recover due to drop data in journal since the header of journal is not valid?

In the above case, I think I made broken journal header. At the same time, I
broke database file too, but I could see that database file is recovered
likewise roll-back. I couldn't find corruption of database.

Okay, I'll test again by corrupting journal body with valid header.

Thanks,

> 
> Thanks,
> 
> > I'm not sure it was because I just skip second page write of database file tho.
> > (I added random bytes into journal pages.)
> > I'll break the database file with more random bytes likewise what I did for
> > journal.
> > 
> > Thanks,
> > 
> > On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > > Hi Jaegeuk,
> > > >
> > > > Any progress on this patch?
> > >
> > > Swamped. Will do.
> > >
> > > Thanks,
> > >
> > > >
> > > > Thanks,
> > > >
> > > > > -----Original Message-----
> > > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > > To: Jaegeuk Kim
> > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > >
> > > > > Hi Jaegeuk,
> > > > >
> > > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > > Hi Chao,
> > > > > >
> > > > > > ...
> > > > > >
> > > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > >>>>>> f2fs support atomic write with following semantics:
> > > > > >>>>>> 1. open db file
> > > > > >>>>>> 2. ioctl start atomic write
> > > > > >>>>>> 3. (write db file) * n
> > > > > >>>>>> 4. ioctl commit atomic write
> > > > > >>>>>> 5. close db file
> > > > > >>>>>>
> > > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > > >>>>>>
> > > > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> > > > > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > > > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > > >>>>>
> > > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > > >>>>> since database should get its error literally.
> > > > > >>>>>
> > > > > >>>>> So, the only thing that we need to do is keeping journal data for further db
> > > > > >>>>> recovery.
> > > > > >>>>
> > > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > > > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > > > >>>> write process.
> > > > > >>>
> > > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > > >>> writes for journal files should be used together to minimize sqlite change as
> > > > > >>> much as possible.
> > > > > >>>
> > > > > >>>> But in current design, we still hold journal data in memory for recovering for
> > > > > >>>> *rare* failure case. I think there are several issues:
> > > > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > > >>>
> > > > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > > >>
> > > > > > >> When I change to redirty all volatile pages in ->writepage, android seems to go
> > > > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > > > >>
> > > > > >> if (f2fs_is_volatile_file(inode))
> > > > > >> 	goto redirty_out;
> > > > > >
> > > > > > Where did you put this? It doesn't flush at all? Why?
> > > > >
> > > > > Original place in ->writepage, just remove two other conditions.
> > > > >
> > > > > To avoid potential random writebacking of dirty page in journal which
> > > > > cause unpredicted corrupting in journal.
> > > > >
> > > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > > are processing concurrently.
> > > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > > >
> > > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > > >>
> > > > > >>>
> > > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > > > >>>> disk, it will destroy db file.
> > > > > >>>
> > > > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > > > >>
> > > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > > > >> abnormal power-cut, recovery with this journal before a transaction will
> > > > > >> destroy db. Right?
> > > > > >
> > > > > > Just think about sqlite without this feature.
> > > > > > Broken journal is pretty normal case for sqlite.
> > > > >
> > > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > > >
> > > > > >
> > > > > >>>
> > > > > >>>> c) Though, we have journal db file, we will face failure of recovering db file
> > > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > > >>>
> > > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > > >>
> > > > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > > > >> handle both in original design and new design, so the inner revoking failure
> > > > > >> issue would not be a weak point or flaw of new design.
> > > > > >>
> > > > > >>>
> > > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > > >>>> atomic-interface.
> > > > > >>>
> > > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > > >>
> > > > > > >> Yes, revoking has the same effect as the recovery of sqlite, so after
> > > > > > >> revoking, recovery is not needed.
> > > > > >
> > > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > > redo the previous transaction. No?
> > > > >
> > > > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > > > record original data of pages which will be updated in following transaction, so
> > > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > > triggered by user, we will recover db with journal.
> > > > >
> > > > > Ideally, if we support atomic write interface, in there should always return two
> > > > > status in atomic write interface: success or fail. If success, transaction was
> > > > > committed, otherwise, it looks like nothing happened, and the user will be told
> > > > > the transaction failed. Then, journals in sqlite could no longer be used,
> > > > > eventually no journal, no recovery.
> > > > >
> > > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > > journal. IMO, some optimization could be done for these cases:
> > > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > > >
> > > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > > w/o atomic feature.
> > > > >
> > > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > > > guess we can optimize DB like:
> > > > > 1. reflink db to db.ref
> > > > > 2. do transaction in db.ref
> > > > >    - failed, rm db.ref
> > > > >    - power-cut rm db.ref
> > > > > 3. rename db.ref to db
> > > > >
> > > > > >
> > > > > >> One more case is that user can send a command to abort current transaction,
> > > > > >> it should be happened before atomic_commit operation, which could easily
> > > > > >> handle with abort_commit ioctl.
> > > > > >>
> > > > > >>>
> > > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write &
> > > > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > > > >>>> power-cut will leave that data in disk.
> > > > > >>>
> > > > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > > > >>
> > > > > > >> Journal could be corrupted as I described in b).
> > > > > >
> > > > > > Okay, so what I'm thinking is like this.
> > > > > > It seems there are two corruption cases after journal writes.
> > > > > >
> > > > > > 1. power cut during atomic writes
> > > > > >  - broken journal file and clean db file -> give up
> > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > >
> > > > > > 2. error during atomic writes
> > > > > >  a. power-cut before abort completion
> > > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > > >
> > > > > >  b. after abort
> > > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > > >
> > > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > > > recover db by 2.b.
> > > > > >
> > > > > > Something like this after successful revoking.
> > > > > >
> > > > > > 1. power cut during atomic writes
> > > > > >  - broken journal file and clean db file -> give up
> > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > >
> > > > > > 2. error during atomic writes w/ revoking
> > > > > >  a. power-cut before abort completion
> > > > > >   - broken journal file and clean db file -> give up
> > > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > > >
> > > > > >  b. after abort
> > > > > >   - valid journal file and clean db file -> recover db
> > > > >
> > > > > That's right.
> > > > >
> > > > > >
> > > > > > Let me verify these scenarios first. :)
> > > > >
> > > > > OK. :)
> > > > >
> > > > > Thanks,
> > > > >
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > >>>
> > > > > >>>> With revoking supported design, we can not solve all above issues, we will still
> > > > > >>>> face the same issue like c), but it will be a big improve if we can apply this
> > > > > > >>>> in our interface, since it provides a way to fix issues a), b) and d). And also for
> > > > > >>>> e) case, we try to rescue data in first time that our revoking operation would be
> > > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > > >>>>
> > > > > >>>> If you don't want to have a big change in this interface or recovery flow, how
> > > > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > > > >>>
> > > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > > >>
> > > > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > > > >> in android, and hope this can be applied. But, I can understand that if you
> > > > > > >> were considering risk control and backward compatibility, since this
> > > > > >> change affects all atomic related ioctls.
> > > > > >>
> > > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > > >>>
> > > > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > > >>
> > > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > > > >> demand?
> > > > > >>
> > > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > > >>
> > > > > >> Agreed :)
> > > > > >>
> > > > > >> Thanks,
> > > > > >>
> > > > > >>>
> > > > > >>> Thanks,
> > > > > >>>
> > > > > >>>>
> > > > > >>>> How do you think? :)
> > > > > >>>>
> > > > > >>>> Thanks,
> > > > > >>>>
> > > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > > >>>>> current implementation.
> > > > > >>>>>
> > > > > >>>>> So simply how about this?
> > > > > >>>>>
> > > > > >>>>> A possible flow would be:
> > > > > >>>>> 1. write journal data to volatile space
> > > > > >>>>> 2. write db data to atomic space
> > > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > > > > >>>>>  - flush/fsync journal data to disk
> > > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > > >>>>>
> > > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > > >>>>>
> > > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > > >>>>>
> > > > > >>>>> 1. drop atomic writes
> > > > > >>>>>  - we don't need to keep any stale db data.
> > > > > >>>>>
> > > > > >>>>> 2. write journal data
> > > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > > >>>>>
> > > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > >>>>> ---
> > > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > > >>>>>
> > > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > > >>>>> index 91f576a..d16438a 100644
> > > > > >>>>> --- a/fs/f2fs/file.c
> > > > > >>>>> +++ b/fs/f2fs/file.c
> > > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
> > > > > >>>>>  	if (ret)
> > > > > >>>>>  		return ret;
> > > > > >>>>>
> > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > > >>>>> +	}
> > > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > > >>>>> +		if (!ret)
> > > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > > >>>>> +	}
> > > > > >>>>>
> > > > > >>>>>  	mnt_drop_write_file(filp);
> > > > > >>>>>  	return ret;
> > > > > >>>>> --
> > > > > >>>>> 2.6.3
> > > > > >>>>
> > > > > >
> > > > > > ------------------------------------------------------------------------------
> > > > > > _______________________________________________
> > > > > > Linux-f2fs-devel mailing list
> > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > >
> > > > >
> > > > > ------------------------------------------------------------------------------
> > > > > _______________________________________________
> > > > > Linux-f2fs-devel mailing list
> > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > >
> > > ------------------------------------------------------------------------------
> > > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > > Monitor end-to-end web transactions and take corrective actions now
> > > Troubleshoot faster and improve end-user experience. Signup Now!
> > > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > > _______________________________________________
> > > Linux-f2fs-devel mailing list
> > > Linux-f2fs-devel@lists.sourceforge.net
> > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

------------------------------------------------------------------------------
Site24x7 APM Insight: Get Deep Visibility into Application Performance
APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
Monitor end-to-end web transactions and take corrective actions now
Troubleshoot faster and improve end-user experience. Signup Now!
http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-01-15  0:03                       ` Jaegeuk Kim
  (?)
@ 2016-02-01 10:04                       ` Chao Yu
  2016-02-02  2:36                           ` Jaegeuk Kim
  -1 siblings, 1 reply; 28+ messages in thread
From: Chao Yu @ 2016-02-01 10:04 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel

Ping,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Friday, January 15, 2016 8:03 AM
> To: Chao Yu
> Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> 
> Hi Chao,
> 
> On Wed, Jan 13, 2016 at 01:05:01PM +0800, Chao Yu wrote:
> > Hi Jaegeuk,
> >
> > > -----Original Message-----
> > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > Sent: Wednesday, January 13, 2016 9:18 AM
> > > To: Chao Yu
> > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > >
> > > Hi Chao,
> > >
> > > I just injected -EIO for one page among two pages in total into database file.
> > > Then, I tested valid and invalid journal file to see how sqlite recovers the
> > > transaction.
> > >
> > > Interestingly, if journal is valid, database file is recovered, as I could see
> > > the transaction result even after it shows EIO.
> > > But, in the invalid journal case, somehow it drops database changes.
> >
> > If journal has valid data in its header and corrupted data in its body, sqlite will
> > recover db file from corrupted journal file, then db file will be corrupted.
> > So what you mean is: after recovery, db file still be fine? or sqlite fails to
> > recover due to drop data in journal since the header of journal is not valid?
> 
> In the above case, I think I made broken journal header. At the same time, I
> broke database file too, but I could see that database file is recovered
> likewise roll-back. I couldn't find corruption of database.
> 
> Okay, I'll test again by corrupting journal body with valid header.
> 
> Thanks,
> 
> >
> > Thanks,
> >
> > > I'm not sure it was because I just skip second page write of database file tho.
> > > (I added random bytes into journal pages.)
> > > I'll break the database file with more random bytes likewise what I did for
> > > journal.
> > >
> > > Thanks,
> > >
> > > On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > > > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > > > Hi Jaegeuk,
> > > > >
> > > > > Any progress on this patch?
> > > >
> > > > Swamped. Will do.
> > > >
> > > > Thanks,
> > > >
> > > > >
> > > > > Thanks,
> > > > >
> > > > > > -----Original Message-----
> > > > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > > > To: Jaegeuk Kim
> > > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > > >
> > > > > > Hi Jaegeuk,
> > > > > >
> > > > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > > > Hi Chao,
> > > > > > >
> > > > > > > ...
> > > > > > >
> > > > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > > >>>>>> f2fs support atomic write with following semantics:
> > > > > > >>>>>> 1. open db file
> > > > > > >>>>>> 2. ioctl start atomic write
> > > > > > >>>>>> 3. (write db file) * n
> > > > > > >>>>>> 4. ioctl commit atomic write
> > > > > > >>>>>> 5. close db file
> > > > > > >>>>>>
> > > > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > > > >>>>>>
> > > > > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > > > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> > > > > > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > > > > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > > > >>>>>
> > > > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > > > >>>>> since database should get its error literally.
> > > > > > >>>>>
> > > > > > >>>>> So, the only thing that we need to do is keeping journal data for further db
> > > > > > >>>>> recovery.
> > > > > > >>>>
> > > > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > > > > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > > > > >>>> write process.
> > > > > > >>>
> > > > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > > > >>> writes for journal files should be used together to minimize sqlite change as
> > > > > > >>> much as possible.
> > > > > > >>>
> > > > > > >>>> But in current design, we still hold journal data in memory for recovering for
> > > > > > >>>> *rare* failure case. I think there are several issues:
> > > > > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > > > >>>
> > > > > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > > > >>
> > > > > > >> When I change to redirty all volatile pages in ->writepage, android seems to go
> > > > > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > > > > >>
> > > > > > >> if (f2fs_is_volatile_file(inode))
> > > > > > >> 	goto redirty_out;
> > > > > > >
> > > > > > > Where did you put this? It doesn't flush at all? Why?
> > > > > >
> > > > > > Original place in ->writepage, just remove two other conditions.
> > > > > >
> > > > > > To avoid potential random writebacking of dirty page in journal which
> > > > > > cause unpredicted corrupting in journal.
> > > > > >
> > > > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > > > are processing concurrently.
> > > > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > > > >
> > > > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > > > >>
> > > > > > >>>
> > > > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > > > > >>>> disk, it will destroy db file.
> > > > > > >>>
> > > > > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > > > > >>
> > > > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > > > >> abnormal powe-cut, recovery with this journal before a transaction will
> > > > > > >> destroy db. Right?
> > > > > > >
> > > > > > > Just think about sqlite without this feature.
> > > > > > > Broken journal is pretty normal case for sqlite.
> > > > > >
> > > > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > > > >
> > > > > > >
> > > > > > >>>
> > > > > > >>>> c) Though, we have journal db file, we will face failure of recovering db file
> > > > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > > > >>>
> > > > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > > > >>
> > > > > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > > > > >> handle both in original design and new design, so the inner revoking failure
> > > > > > >> issue would not be a weak point or flaw of new design.
> > > > > > >>
> > > > > > >>>
> > > > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > > > >>>> atomic-interface.
> > > > > > >>>
> > > > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > > > >>
> > > > > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > > > > >> revoking, recovery is not needed.
> > > > > > >
> > > > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > > > redo the previous transaction. No?
> > > > > >
> > > > > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > > > > record original data of pages which will be updated in following transaction, so
> > > > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > > > triggered by user, we will recover db with journal.
> > > > > >
> > > > > > Ideally, if we support atomic write interface, in there should always return two
> > > > > > status in atomic write interface: success or fail. If success, transaction was
> > > > > > committed, otherwise, it looks like nothing happened, user will be told
> > > > > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > > > > eventually no journal, no recovery.
> > > > > >
> > > > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > > > journal. IMO, some optimization could be done for these cases:
> > > > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > > > >
> > > > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > > > w/o atomic feature.
> > > > > >
> > > > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > > > > guess we can optimize DB like:
> > > > > > 1. reflink db to db.ref
> > > > > > 2. do transaction in db.ref
> > > > > >    - failed, rm db.ref
> > > > > >    - power-cut rm db.ref
> > > > > > 3. rename db.ref to db
> > > > > >
> > > > > > >
> > > > > > >> One more case is that user can send a command to abort current transaction,
> > > > > > >> it should be happened before atomic_commit operation, which could easily
> > > > > > >> handle with abort_commit ioctl.
> > > > > > >>
> > > > > > >>>
> > > > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write
> &
> > > > > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > > > > >>>> power-cut will leave that data in disk.
> > > > > > >>>
> > > > > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > > > > >>
> > > > > > >> Journal could be corrupted as I described in b).
> > > > > > >
> > > > > > > Okay, so what I'm thinking is like this.
> > > > > > > It seems there are two corruption cases after journal writes.
> > > > > > >
> > > > > > > 1. power cut during atomic writes
> > > > > > >  - broken journal file and clean db file -> give up
> > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > >
> > > > > > > 2. error during atomic writes
> > > > > > >  a. power-cut before abort completion
> > > > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > > > >
> > > > > > >  b. after abort
> > > > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > > > >
> > > > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > > > > recover db by 2.b.
> > > > > > >
> > > > > > > Something like this after successful revoking.
> > > > > > >
> > > > > > > 1. power cut during atomic writes
> > > > > > >  - broken journal file and clean db file -> give up
> > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > >
> > > > > > > 2. error during atomic writes w/ revoking
> > > > > > >  a. power-cut before abort completion
> > > > > > >   - broken journal file and clean db file -> give up
> > > > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > > > >
> > > > > > >  b. after abort
> > > > > > >   - valid journal file and clean db file -> recover db
> > > > > >
> > > > > > That's right.
> > > > > >
> > > > > > >
> > > > > > > Let me verify these scenarios first. :)
> > > > > >
> > > > > > OK. :)
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > > >
> > > > > > > Thanks,
> > > > > > >
> > > > > > >>>
> > > > > > >>>> With revoking supported design, we can not solve all above issues, we will still
> > > > > > >>>> face the same issue like c), but it will be a big improve if we can apply this
> > > > > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also
> for
> > > > > > >>>> e) case, we try to rescue data in first time that our revoking operation would
> be
> > > > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > > > >>>>
> > > > > > >>>> If you don't want to have a big change in this interface or recovery flow, how
> > > > > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > > > > >>>
> > > > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > > > >>
> > > > > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > > > > >> in android, and hope this can be applied. But, I can understand that if you
> > > > > > >> were considering risk control and backward compatibility, since this
> > > > > > >> change affects all atomic related ioctls.
> > > > > > >>
> > > > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > > > >>>
> > > > > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > > > >>
> > > > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > > > >> demand ?
> > > > > > >>
> > > > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > > > >>
> > > > > > >> Agreed :)
> > > > > > >>
> > > > > > >> Thanks,
> > > > > > >>
> > > > > > >>>
> > > > > > >>> Thanks,
> > > > > > >>>
> > > > > > >>>>
> > > > > > >>>> How do you think? :)
> > > > > > >>>>
> > > > > > >>>> Thanks,
> > > > > > >>>>
> > > > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > > > >>>>> current implementation.
> > > > > > >>>>>
> > > > > > >>>>> So simply how about this?
> > > > > > >>>>>
> > > > > > >>>>> A possible flow would be:
> > > > > > >>>>> 1. write journal data to volatile space
> > > > > > >>>>> 2. write db data to atomic space
> > > > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > > > > > >>>>>  - flush/fsync journal data to disk
> > > > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > > > >>>>>
> > > > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > > > >>>>>
> > > > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > > > >>>>>
> > > > > > >>>>> 1. drop atomic writes
> > > > > > >>>>>  - we don't need to keep any stale db data.
> > > > > > >>>>>
> > > > > > >>>>> 2. write journal data
> > > > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > > > >>>>>
> > > > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > >>>>> ---
> > > > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > > > >>>>>
> > > > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > > > >>>>> index 91f576a..d16438a 100644
> > > > > > >>>>> --- a/fs/f2fs/file.c
> > > > > > >>>>> +++ b/fs/f2fs/file.c
> > > > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file
> *filp)
> > > > > > >>>>>  	if (ret)
> > > > > > >>>>>  		return ret;
> > > > > > >>>>>
> > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > > > >>>>> +	}
> > > > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > > > >>>>> +		if (!ret)
> > > > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > > > >>>>> +	}
> > > > > > >>>>>
> > > > > > >>>>>  	mnt_drop_write_file(filp);
> > > > > > >>>>>  	return ret;
> > > > > > >>>>> --
> > > > > > >>>>> 2.6.3
> > > > > > >>>>
> > > > > > >
> > > > > > > ------------------------------------------------------------------------------
> > > > > > > _______________________________________________
> > > > > > > Linux-f2fs-devel mailing list
> > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > > >
> > > > > >
> > > > > > ------------------------------------------------------------------------------
> > > > > > _______________________________________________
> > > > > > Linux-f2fs-devel mailing list
> > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > >
> > > > ------------------------------------------------------------------------------
> > > > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > > > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > > > Monitor end-to-end web transactions and take corrective actions now
> > > > Troubleshoot faster and improve end-user experience. Signup Now!
> > > > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > > > _______________________________________________
> > > > Linux-f2fs-devel mailing list
> > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-02-01 10:04                       ` [f2fs-dev] " Chao Yu
@ 2016-02-02  2:36                           ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2016-02-02  2:36 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

On Mon, Feb 01, 2016 at 06:04:13PM +0800, Chao Yu wrote:
> Ping,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Friday, January 15, 2016 8:03 AM
> > To: Chao Yu
> > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > 
> > Hi Chao,
> > 
> > On Wed, Jan 13, 2016 at 01:05:01PM +0800, Chao Yu wrote:
> > > Hi Jaegeuk,
> > >
> > > > -----Original Message-----
> > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > Sent: Wednesday, January 13, 2016 9:18 AM
> > > > To: Chao Yu
> > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > >
> > > > Hi Chao,
> > > >
> > > > I just injected -EIO for one page among two pages in total into database file.
> > > > Then, I tested valid and invalid journal file to see how sqlite recovers the
> > > > transaction.
> > > >
> > > > Interestingly, if journal is valid, database file is recovered, as I could see
> > > > the transaction result even after it shows EIO.
> > > > But, in the invalid journal case, somehow it drops database changes.
> > >
> > > If journal has valid data in its header and corrupted data in its body, sqlite will
> > > recover db file from corrupted journal file, then db file will be corrupted.
> > > So what you mean is: after recovery, db file still be fine? or sqlite fails to
> > > recover due to drop data in journal since the header of journal is not valid?
> > 
> > In the above case, I think I made broken journal header. At the same time, I
> > broke database file too, but I could see that database file is recovered
> > likewise roll-back. I couldn't find corruption of database.
> > 
> > Okay, I'll test again by corrupting journal body with valid header.

Hmm, it's quite difficult to produce any corruption case.

I tried the below tests, but in all the cases, sqlite did rollback successfully.

 - -EIO for one db write with valid header + valid body in journal
 - -EIO for one db write with valid header + invalid body in journal
 - -EIO for one db write with invalid header + valid body in journal

Note that, I checked both integrity_check and table contents after each tests.

I suspect that journal uses checksums to validate its contents?

Thanks,

> > 
> > Thanks,
> > 
> > >
> > > Thanks,
> > >
> > > > I'm not sure it was because I just skip second page write of database file tho.
> > > > (I added random bytes into journal pages.)
> > > > I'll break the database file with more random bytes likewise what I did for
> > > > journal.
> > > >
> > > > Thanks,
> > > >
> > > > On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > > > > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > > > > Hi Jaegeuk,
> > > > > >
> > > > > > Any progress on this patch?
> > > > >
> > > > > Swamped. Will do.
> > > > >
> > > > > Thanks,
> > > > >
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > > > > To: Jaegeuk Kim
> > > > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > > > >
> > > > > > > Hi Jaegeuk,
> > > > > > >
> > > > > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > > > > Hi Chao,
> > > > > > > >
> > > > > > > > ...
> > > > > > > >
> > > > > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > > > >>>>>> f2fs support atomic write with following semantics:
> > > > > > > >>>>>> 1. open db file
> > > > > > > >>>>>> 2. ioctl start atomic write
> > > > > > > >>>>>> 3. (write db file) * n
> > > > > > > >>>>>> 4. ioctl commit atomic write
> > > > > > > >>>>>> 5. close db file
> > > > > > > >>>>>>
> > > > > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > > > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > > > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > > > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > > > > >>>>>>
> > > > > > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > > > > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> > > > > > > >>>>>> could fail to submit all dirty data of transaction, once partial dirty
> > > > > > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > > > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > > > > >>>>>
> > > > > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > > > > >>>>> since database should get its error literally.
> > > > > > > >>>>>
> > > > > > > >>>>> So, the only thing that we need to do is keeping journal data for further db
> > > > > > > >>>>> recovery.
> > > > > > > >>>>
> > > > > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > > > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > > > > > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > > > > > >>>> write process.
> > > > > > > >>>
> > > > > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > > > > >>> writes for journal files should be used together to minimize sqlite change as
> > > > > > > >>> much as possible.
> > > > > > > >>>
> > > > > > > >>>> But in current design, we still hold journal data in memory for recovering for
> > > > > > > >>>> *rare* failure case. I think there are several issues:
> > > > > > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > > > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > > > > >>>
> > > > > > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > > > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > > > > >>
> > > > > > > >> When I change to redirty all volatile pages in ->writepage, android seems go
> > > > > > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > > > > > >>
> > > > > > > >> if (f2fs_is_volatile_file(inode))
> > > > > > > >> 	goto redirty_out;
> > > > > > > >
> > > > > > > > Where did you put this? It doesn't flush at all? Why?
> > > > > > >
> > > > > > > Original place in ->writepage, just remove two other conditions.
> > > > > > >
> > > > > > > To avoid potential random writebacking of dirty page in journal which
> > > > > > > cause unpredicted corrupting in journal.
> > > > > > >
> > > > > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > > > > are processing concurrently.
> > > > > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > > > > >
> > > > > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > > > > >>
> > > > > > > >>>
> > > > > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > > > > > >>>> disk, it will destroy db file.
> > > > > > > >>>
> > > > > > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > > > > > >>
> > > > > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > > > > >> abnormal power-cut, recovery with this journal before a transaction will
> > > > > > > >> destroy db. Right?
> > > > > > > >
> > > > > > > > Just think about sqlite without this feature.
> > > > > > > > Broken journal is pretty normal case for sqlite.
> > > > > > >
> > > > > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > > > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > > > > >
> > > > > > > >
> > > > > > > >>>
> > > > > > > >>>> c) Though, we have journal db file, we will face failure of recovering db file
> > > > > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > > > > >>>
> > > > > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > > > > >>
> > > > > > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > > > > > >> handle both in original design and new design, so the inner revoking failure
> > > > > > > >> issue would not be a weak point or flaw of new design.
> > > > > > > >>
> > > > > > > >>>
> > > > > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > > > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > > > > >>>> atomic-interface.
> > > > > > > >>>
> > > > > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > > > > >>
> > > > > > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > > > > > >> revoking, recovery is not needed.
> > > > > > > >
> > > > > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > > > > redo the previous transaction. No?
> > > > > > >
> > > > > > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > > > > > record original data of pages which will be updated in following transaction, so
> > > > > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > > > > triggered by user, we will recover db with journal.
> > > > > > >
> > > > > > > Ideally, if we support atomic write interface, in there should always return two
> > > > > > > status in atomic write interface: success or fail. If success, transaction was
> > > > > > > committed, otherwise, it looks like nothing happened, user will be told
> > > > > > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > > > > > eventually no journal, no recovery.
> > > > > > >
> > > > > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > > > > journal. IMO, some optimization could be done for these cases:
> > > > > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > > > > >
> > > > > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > > > > w/o atomic feature.
> > > > > > >
> > > > > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > > > > > guess we can optimize DB like:
> > > > > > > 1. reflink db to db.ref
> > > > > > > 2. do transaction in db.ref
> > > > > > >    - failed, rm db.ref
> > > > > > >    - power-cut rm db.ref
> > > > > > > 3. rename db.ref to db
> > > > > > >
> > > > > > > >
> > > > > > > >> One more case is that user can send a command to abort current transaction,
> > > > > > > >> it should be happened before atomic_commit operation, which could easily
> > > > > > > >> handle with abort_commit ioctl.
> > > > > > > >>
> > > > > > > >>>
> > > > > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write
> > &
> > > > > > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > > > > > >>>> power-cut will leave that data in disk.
> > > > > > > >>>
> > > > > > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > > > > > >>
> > > > > > > >> Journal could be corrupted as I described in b).
> > > > > > > >
> > > > > > > > Okay, so what I'm thinking is like this.
> > > > > > > > It seems there are two corruption cases after journal writes.
> > > > > > > >
> > > > > > > > 1. power cut during atomic writes
> > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > >
> > > > > > > > 2. error during atomic writes
> > > > > > > >  a. power-cut before abort completion
> > > > > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > > > > >
> > > > > > > >  b. after abort
> > > > > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > > > > >
> > > > > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > > > > > recover db by 2.b.
> > > > > > > >
> > > > > > > > Something like this after successful revoking.
> > > > > > > >
> > > > > > > > 1. power cut during atomic writes
> > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > >
> > > > > > > > 2. error during atomic writes w/ revoking
> > > > > > > >  a. power-cut before abort completion
> > > > > > > >   - broken journal file and clean db file -> give up
> > > > > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > > > > >
> > > > > > > >  b. after abort
> > > > > > > >   - valid journal file and clean db file -> recover db
> > > > > > >
> > > > > > > That's right.
> > > > > > >
> > > > > > > >
> > > > > > > > Let me verify these scenarios first. :)
> > > > > > >
> > > > > > > OK. :)
> > > > > > >
> > > > > > > Thanks,
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > >
> > > > > > > >>>
> > > > > > > >>>> With revoking supported design, we can not solve all above issues, we will still
> > > > > > > >>>> face the same issue like c), but it will be a big improve if we can apply this
> > > > > > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also
> > for
> > > > > > > >>>> e) case, we try to rescue data in first time that our revoking operation would
> > be
> > > > > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > > > > >>>>
> > > > > > > >>>> If you don't want to have a big change in this interface or recovery flow, how
> > > > > > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > > > > > >>>
> > > > > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > > > > >>
> > > > > > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > > > > > >> in android, and hope this can be applied. But, I can understand that if you
> > > > > > > >> were considering risk control and backward compatibility, since this
> > > > > > > >> change affects all atomic related ioctls.
> > > > > > > >>
> > > > > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > > > > >>>
> > > > > > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > > > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > > > > >>
> > > > > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > > > > >> demand ?
> > > > > > > >>
> > > > > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > > > > >>
> > > > > > > >> Agreed :)
> > > > > > > >>
> > > > > > > >> Thanks,
> > > > > > > >>
> > > > > > > >>>
> > > > > > > >>> Thanks,
> > > > > > > >>>
> > > > > > > >>>>
> > > > > > > >>>> How do you think? :)
> > > > > > > >>>>
> > > > > > > >>>> Thanks,
> > > > > > > >>>>
> > > > > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > > > > >>>>> current implementation.
> > > > > > > >>>>>
> > > > > > > >>>>> So simply how about this?
> > > > > > > >>>>>
> > > > > > > >>>>> A possible flow would be:
> > > > > > > >>>>> 1. write journal data to volatile space
> > > > > > > >>>>> 2. write db data to atomic space
> > > > > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > > > > > > >>>>>  - flush/fsync journal data to disk
> > > > > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > > > > >>>>>
> > > > > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > > > > >>>>>
> > > > > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > > > > >>>>>
> > > > > > > >>>>> 1. drop atomic writes
> > > > > > > >>>>>  - we don't need to keep any stale db data.
> > > > > > > >>>>>
> > > > > > > >>>>> 2. write journal data
> > > > > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > > > > >>>>>
> > > > > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > >>>>> ---
> > > > > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > > > > >>>>>
> > > > > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > > > > >>>>> index 91f576a..d16438a 100644
> > > > > > > >>>>> --- a/fs/f2fs/file.c
> > > > > > > >>>>> +++ b/fs/f2fs/file.c
> > > > > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file
> > *filp)
> > > > > > > >>>>>  	if (ret)
> > > > > > > >>>>>  		return ret;
> > > > > > > >>>>>
> > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > > > > >>>>> +	}
> > > > > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > > > > >>>>> +		if (!ret)
> > > > > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > > > > >>>>> +	}
> > > > > > > >>>>>
> > > > > > > >>>>>  	mnt_drop_write_file(filp);
> > > > > > > >>>>>  	return ret;
> > > > > > > >>>>> --
> > > > > > > >>>>> 2.6.3
> > > > > > > >>>>
> > > > > > > >
> > > > > > > > ------------------------------------------------------------------------------
> > > > > > > > _______________________________________________
> > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > > > >
> > > > > > >
> > > > > > > ------------------------------------------------------------------------------
> > > > > > > _______________________________________________
> > > > > > > Linux-f2fs-devel mailing list
> > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > >
> > > > > ------------------------------------------------------------------------------
> > > > > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > > > > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > > > > Monitor end-to-end web transactions and take corrective actions now
> > > > > Troubleshoot faster and improve end-user experience. Signup Now!
> > > > > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > > > > _______________________________________________
> > > > > Linux-f2fs-devel mailing list
> > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
@ 2016-02-02  2:36                           ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2016-02-02  2:36 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

On Mon, Feb 01, 2016 at 06:04:13PM +0800, Chao Yu wrote:
> Ping,
> 
> > -----Original Message-----
> > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > Sent: Friday, January 15, 2016 8:03 AM
> > To: Chao Yu
> > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > 
> > Hi Chao,
> > 
> > On Wed, Jan 13, 2016 at 01:05:01PM +0800, Chao Yu wrote:
> > > Hi Jaegeuk,
> > >
> > > > -----Original Message-----
> > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > Sent: Wednesday, January 13, 2016 9:18 AM
> > > > To: Chao Yu
> > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > >
> > > > Hi Chao,
> > > >
> > > > I just injected -EIO for one page among two pages in total into database file.
> > > > Then, I tested valid and invalid journal file to see how sqlite recovers the
> > > > transaction.
> > > >
> > > > Interestingly, if journal is valid, database file is recovered, as I could see
> > > > the transaction result even after it shows EIO.
> > > > But, in the invalid journal case, somehow it drops database changes.
> > >
> > > If journal has valid data in its header and corrupted data in its body, sqlite will
> > > recover db file from corrupted journal file, then db file will be corrupted.
> > > So what you mean is: after recovery, db file still be fine? or sqlite fails to
> > > recover due to drop data in journal since the header of journal is not valid?
> > 
> > In the above case, I think I made broken journal header. At the same time, I
> > broke database file too, but I could see that database file is recovered
> > likewise roll-back. I couldn't find corruption of database.
> > 
> > Okay, I'll test again by corrupting journal body with valid header.

Hmm, it's quite difficult to produce any corruption case.

I tried the below tests, but in all the cases, sqlite did rollback successfully.

 - -EIO for one db write with valid header + valid body in journal
 - -EIO for one db write with valid header + invalid body in journal
 - -EIO for one db write with invalid header + valid body in journal

Note that I checked both integrity_check and table contents after each test.

I suspect that the journal uses checksums to validate its contents?

Thanks,

> > 
> > Thanks,
> > 
> > >
> > > Thanks,
> > >
> > > > I'm not sure it was because I just skip second page write of database file tho.
> > > > (I added random bytes into journal pages.)
> > > > I'll break the database file with more random bytes likewise what I did for
> > > > journal.
> > > >
> > > > Thanks,
> > > >
> > > > On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > > > > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > > > > Hi Jaegeuk,
> > > > > >
> > > > > > Any progress on this patch?
> > > > >
> > > > > Swamped. Will do.
> > > > >
> > > > > Thanks,
> > > > >
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > > > -----Original Message-----
> > > > > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > > > > To: Jaegeuk Kim
> > > > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > > > >
> > > > > > > Hi Jaegeuk,
> > > > > > >
> > > > > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > > > > Hi Chao,
> > > > > > > >
> > > > > > > > ...
> > > > > > > >
> > > > > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > > > >>>>>> f2fs support atomic write with following semantics:
> > > > > > > >>>>>> 1. open db file
> > > > > > > >>>>>> 2. ioctl start atomic write
> > > > > > > >>>>>> 3. (write db file) * n
> > > > > > > >>>>>> 4. ioctl commit atomic write
> > > > > > > >>>>>> 5. close db file
> > > > > > > >>>>>>
> > > > > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > > > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > > > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > > > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > > > > >>>>>>
> > > > > > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > > > > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4, we
> > > > > > > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > > > > > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > > > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > > > > >>>>>
> > > > > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > > > > >>>>> since database should get its error literally.
> > > > > > > >>>>>
> > > > > > > >>>>> So, the only thing that we need to do is keeping journal data for further db
> > > > > > > >>>>> recovery.
> > > > > > > >>>>
> > > > > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > > > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily try
> > > > > > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > > > > > >>>> write process.
> > > > > > > >>>
> > > > > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > > > > >>> writes for journal files should be used together to minimize sqlite change as
> > > > > > > >>> much as possible.
> > > > > > > >>>
> > > > > > > >>>> But in current design, we still hold journal data in memory for recovering for
> > > > > > > >>>> *rare* failure case. I think there are several issues:
> > > > > > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > > > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > > > > >>>
> > > > > > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > > > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > > > > >>
> > > > > > > >> When I change to redirty all volatile pages in ->writepage, android seems go
> > > > > > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > > > > > >>
> > > > > > > >> if (f2fs_is_volatile_file(inode))
> > > > > > > >> 	goto redirty_out;
> > > > > > > >
> > > > > > > > Where did you put this? It doesn't flush at all? Why?
> > > > > > >
> > > > > > > Original place in ->writepage, just remove two other conditions.
> > > > > > >
> > > > > > > To avoid potential random writebacking of dirty page in journal which
> > > > > > > cause unpredicted corrupting in journal.
> > > > > > >
> > > > > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > > > > are processing concurrently.
> > > > > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > > > > >
> > > > > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > > > > >>
> > > > > > > >>>
> > > > > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > > > > > >>>> disk, it will destroy db file.
> > > > > > > >>>
> > > > > > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > > > > > >>
> > > > > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > > > > >> abnormal powe-cut, recovery with this journal before a transaction will
> > > > > > > >> destroy db. Right?
> > > > > > > >
> > > > > > > > Just think about sqlite without this feature.
> > > > > > > > Broken journal is pretty normal case for sqlite.
> > > > > > >
> > > > > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > > > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > > > > >
> > > > > > > >
> > > > > > > >>>
> > > > > > > >>>> c) Though, we have journal db file, we will face failure of recovering db file
> > > > > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > > > > >>>
> > > > > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > > > > >>
> > > > > > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > > > > > >> handle both in original design and new design, so the inner revoking failure
> > > > > > > >> issue would not be a weak point or flaw of new design.
> > > > > > > >>
> > > > > > > >>>
> > > > > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > > > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > > > > >>>> atomic-interface.
> > > > > > > >>>
> > > > > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > > > > >>
> > > > > > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > > > > > >> revoking, recovery is no need.
> > > > > > > >
> > > > > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > > > > redo the previous transaction. No?
> > > > > > >
> > > > > > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > > > > > record original data of pages which will be updated in following transaction, so
> > > > > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > > > > triggered by user, we will recover db with journal.
> > > > > > >
> > > > > > > Ideally, if we support atomic write interface, in there should always return two
> > > > > > > status in atomic write interface: success or fail. If success, transaction was
> > > > > > > committed, otherwise, it looks like nothing happened, user will be told
> > > > > > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > > > > > eventually no journal, no recovery.
> > > > > > >
> > > > > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > > > > journal. IMO, some optimization could be done for these cases:
> > > > > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > > > > >
> > > > > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > > > > w/o atomic feature.
> > > > > > >
> > > > > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > > > > > guess we can optimize DB like:
> > > > > > > 1. reflink db to db.ref
> > > > > > > 2. do transaction in db.ref
> > > > > > >    - failed, rm db.ref
> > > > > > >    - power-cut rm db.ref
> > > > > > > 3. rename db.ref to db
> > > > > > >
> > > > > > > >
> > > > > > > >> One more case is that user can send a command to abort current transaction,
> > > > > > > >> it should be happened before atomic_commit operation, which could easily
> > > > > > > >> handle with abort_commit ioctl.
> > > > > > > >>
> > > > > > > >>>
> > > > > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write
> > &
> > > > > > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > > > > > >>>> power-cut will leave that data in disk.
> > > > > > > >>>
> > > > > > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > > > > > >>
> > > > > > > >> Journal could be corrupted as I descripted in b).
> > > > > > > >
> > > > > > > > Okay, so what I'm thinking is like this.
> > > > > > > > It seems there are two corruption cases after journal writes.
> > > > > > > >
> > > > > > > > 1. power cut during atomic writes
> > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > >
> > > > > > > > 2. error during atomic writes
> > > > > > > >  a. power-cut before abort completion
> > > > > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > > > > >
> > > > > > > >  b. after abort
> > > > > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > > > > >
> > > > > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > > > > > recover db by 2.b.
> > > > > > > >
> > > > > > > > Something like this after successful revoking.
> > > > > > > >
> > > > > > > > 1. power cut during atomic writes
> > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > >
> > > > > > > > 2. error during atomic writes w/ revoking
> > > > > > > >  a. power-cut before abort completion
> > > > > > > >   - broken journal file and clean db file -> give up
> > > > > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > > > > >
> > > > > > > >  b. after abort
> > > > > > > >   - valid journal file and clean db file -> recover db
> > > > > > >
> > > > > > > That's right.
> > > > > > >
> > > > > > > >
> > > > > > > > Let me verify these scenarios first. :)
> > > > > > >
> > > > > > > OK. :)
> > > > > > >
> > > > > > > Thanks,
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > >
> > > > > > > >>>
> > > > > > > >>>> With revoking supported design, we can not solve all above issues, we will still
> > > > > > > >>>> face the same issue like c), but it will be a big improve if we can apply this
> > > > > > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also
> > for
> > > > > > > >>>> e) case, we try to rescue data in first time that our revoking operation would
> > be
> > > > > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > > > > >>>>
> > > > > > > >>>> If you don't want to have a big change in this interface or recovery flow, how
> > > > > > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > > > > > >>>
> > > > > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > > > > >>
> > > > > > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > > > > > >> in android, and hope this can be applied. But, I can understand that if you
> > > > > > > >> were considerring about risk control and backward compatibility, since this
> > > > > > > >> change affects all atomic related ioctls.
> > > > > > > >>
> > > > > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > > > > >>>
> > > > > > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > > > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > > > > >>
> > > > > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > > > > >> demand ?
> > > > > > > >>
> > > > > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > > > > >>
> > > > > > > >> Agreed :)
> > > > > > > >>
> > > > > > > >> Thanks,
> > > > > > > >>
> > > > > > > >>>
> > > > > > > >>> Thanks,
> > > > > > > >>>
> > > > > > > >>>>
> > > > > > > >>>> How do you think? :)
> > > > > > > >>>>
> > > > > > > >>>> Thanks,
> > > > > > > >>>>
> > > > > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > > > > >>>>> current implementation.
> > > > > > > >>>>>
> > > > > > > >>>>> So simply how about this?
> > > > > > > >>>>>
> > > > > > > >>>>> A possible flow would be:
> > > > > > > >>>>> 1. write journal data to volatile space
> > > > > > > >>>>> 2. write db data to atomic space
> > > > > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and db
> > > > > > > >>>>>  - flush/fsync journal data to disk
> > > > > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > > > > >>>>>
> > > > > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > > > > >>>>>
> > > > > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > > > > >>>>>
> > > > > > > >>>>> 1. drop atomic writes
> > > > > > > >>>>>  - we don't need to keep any stale db data.
> > > > > > > >>>>>
> > > > > > > >>>>> 2. write journal data
> > > > > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > > > > >>>>>
> > > > > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > >>>>> ---
> > > > > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > > > > >>>>>
> > > > > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > > > > >>>>> index 91f576a..d16438a 100644
> > > > > > > >>>>> --- a/fs/f2fs/file.c
> > > > > > > >>>>> +++ b/fs/f2fs/file.c
> > > > > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file
> > *filp)
> > > > > > > >>>>>  	if (ret)
> > > > > > > >>>>>  		return ret;
> > > > > > > >>>>>
> > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > > > > >>>>> +	}
> > > > > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > > > > >>>>> +		if (!ret)
> > > > > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > > > > >>>>> +	}
> > > > > > > >>>>>
> > > > > > > >>>>>  	mnt_drop_write_file(filp);
> > > > > > > >>>>>  	return ret;
> > > > > > > >>>>> --
> > > > > > > >>>>> 2.6.3
> > > > > > > >>>>
> > > > > > > >
> > > > > > > > ------------------------------------------------------------------------------
> > > > > > > > _______________________________________________
> > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > > > >
> > > > > > >
> > > > > > > ------------------------------------------------------------------------------
> > > > > > > _______________________________________________
> > > > > > > Linux-f2fs-devel mailing list
> > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > >
> > > > > ------------------------------------------------------------------------------
> > > > > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > > > > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > > > > Monitor end-to-end web transactions and take corrective actions now
> > > > > Troubleshoot faster and improve end-user experience. Signup Now!
> > > > > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > > > > _______________________________________________
> > > > > Linux-f2fs-devel mailing list
> > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

------------------------------------------------------------------------------
Site24x7 APM Insight: Get Deep Visibility into Application Performance
APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
Monitor end-to-end web transactions and take corrective actions now
Troubleshoot faster and improve end-user experience. Signup Now!
http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-02-02  2:36                           ` Jaegeuk Kim
  (?)
@ 2016-02-02 10:19                           ` Chao Yu
  2016-02-06  4:17                               ` Jaegeuk Kim
  -1 siblings, 1 reply; 28+ messages in thread
From: Chao Yu @ 2016-02-02 10:19 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Tuesday, February 02, 2016 10:37 AM
> To: Chao Yu
> Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> 
> Hi Chao,
> 
> On Mon, Feb 01, 2016 at 06:04:13PM +0800, Chao Yu wrote:
> > Ping,
> >
> > > -----Original Message-----
> > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > Sent: Friday, January 15, 2016 8:03 AM
> > > To: Chao Yu
> > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > >
> > > Hi Chao,
> > >
> > > On Wed, Jan 13, 2016 at 01:05:01PM +0800, Chao Yu wrote:
> > > > Hi Jaegeuk,
> > > >
> > > > > -----Original Message-----
> > > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > > Sent: Wednesday, January 13, 2016 9:18 AM
> > > > > To: Chao Yu
> > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > >
> > > > > Hi Chao,
> > > > >
> > > > > I just injected -EIO for one page among two pages in total into database file.
> > > > > Then, I tested valid and invalid journal file to see how sqlite recovers the
> > > > > transaction.
> > > > >
> > > > > Interestingly, if journal is valid, database file is recovered, as I could see
> > > > > the transaction result even after it shows EIO.
> > > > > But, in the invalid journal case, somehow it drops database changes.
> > > >
> > > > If journal has valid data in its header and corrupted data in its body, sqlite will
> > > > recover db file from corrupted journal file, then db file will be corrupted.
> > > > So what you mean is: after recovery, db file still be fine? or sqlite fails to
> > > > recover due to drop data in journal since the header of journal is not valid?
> > >
> > > In the above case, I think I made broken journal header. At the same time, I
> > > broke database file too, but I could see that database file is recovered
> > > likewise roll-back. I couldn't find corruption of database.
> > >
> > > Okay, I'll test again by corrupting journal body with valid header.
> 
> Hmm, it's quite difficult to produce any corruption case.
> 
> I tried the below tests, but in all the cases, sqlite did rollback successfully.

As you saw a valid db file in the end, I suspect that:
a) the db file was recovered by f2fs: after we fail in the atomic commit,
   if a checkpoint isn't triggered to persist partial pages of the
   transaction, the db file will be recovered to the last transaction by
   f2fs after an abnormal power-cut.
b) or the db file was recovered by sqlite: sqlite will try to do the
   revoking after it detects a failure of the atomic commit. Similarly,
   the db file will be recovered.

> 
>  - -EIO for one db write with valid header + valid body in journal
>  - -EIO for one db write with valid header + invalid body in journal
>  - -EIO for one db write with invalid header + valid body in journal
> 
> Note that, I checked both integrity_check and table contents after each tests.
> 
> I suspect that journal uses checksums to validate its contents?

Yes, there is one checksum after each 4K-size journal page.

IMO, it's better to just destroy the last one or two journal pages to make
a corrupted journal file. For example, if there are 10 pages in the journal,
let kworker write back pages [0-7], which include partial old pages of the
transaction and the journal header, and hold pages [8-9] in memory; then on
disk, pages [8-9] will be invalid to sqlite due to a wrong checksum, while
the other pages will be judged as valid for recovery. Note that pages after
the first invalid page will also be judged as invalid by sqlite.

Thanks,

> 
> Thanks,
> 
> > >
> > > Thanks,
> > >
> > > >
> > > > Thanks,
> > > >
> > > > > I'm not sure it was because I just skip second page write of database file tho.
> > > > > (I added random bytes into journal pages.)
> > > > > I'll break the database file with more random bytes likewise what I did for
> > > > > journal.
> > > > >
> > > > > Thanks,
> > > > >
> > > > > On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > > > > > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > > > > > Hi Jaegeuk,
> > > > > > >
> > > > > > > Any progress on this patch?
> > > > > >
> > > > > > Swamped. Will do.
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > > >
> > > > > > > Thanks,
> > > > > > >
> > > > > > > > -----Original Message-----
> > > > > > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > > > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > > > > > To: Jaegeuk Kim
> > > > > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > > > > >
> > > > > > > > Hi Jaegeuk,
> > > > > > > >
> > > > > > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > > > > > Hi Chao,
> > > > > > > > >
> > > > > > > > > ...
> > > > > > > > >
> > > > > > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > > > > >>>>>> f2fs support atomic write with following semantics:
> > > > > > > > >>>>>> 1. open db file
> > > > > > > > >>>>>> 2. ioctl start atomic write
> > > > > > > > >>>>>> 3. (write db file) * n
> > > > > > > > >>>>>> 4. ioctl commit atomic write
> > > > > > > > >>>>>> 5. close db file
> > > > > > > > >>>>>>
> > > > > > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > > > > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > > > > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > > > > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > > > > > >>>>>>
> > > > > > > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > > > > > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4,
> we
> > > > > > > > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > > > > > > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > > > > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > > > > > >>>>>
> > > > > > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > > > > > >>>>> since database should get its error literally.
> > > > > > > > >>>>>
> > > > > > > > >>>>> So, the only thing that we need to do is keeping journal data for further
> db
> > > > > > > > >>>>> recovery.
> > > > > > > > >>>>
> > > > > > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > > > > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > > > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily
> try
> > > > > > > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > > > > > > >>>> write process.
> > > > > > > > >>>
> > > > > > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > > > > > >>> writes for journal files should be used together to minimize sqlite change
> as
> > > > > > > > >>> much as possible.
> > > > > > > > >>>
> > > > > > > > >>>> But in current design, we still hold journal data in memory for recovering
> for
> > > > > > > > >>>> *rare* failure case. I think there are several issues:
> > > > > > > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > > > > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > > > > > >>>
> > > > > > > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > > > > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > > > > > >>
> > > > > > > > >> When I change to redirty all volatile pages in ->writepage, android seems go
> > > > > > > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > > > > > > >>
> > > > > > > > >> if (f2fs_is_volatile_file(inode))
> > > > > > > > >> 	goto redirty_out;
> > > > > > > > >
> > > > > > > > > Where did you put this? It doesn't flush at all? Why?
> > > > > > > >
> > > > > > > > Original place in ->writepage, just remove two other conditions.
> > > > > > > >
> > > > > > > > To avoid potential random writebacking of dirty page in journal which
> > > > > > > > cause unpredicted corrupting in journal.
> > > > > > > >
> > > > > > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > > > > > are processing concurrently.
> > > > > > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > > > > > >
> > > > > > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > > > > > >>
> > > > > > > > >>>
> > > > > > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > > > > > > >>>> disk, it will destroy db file.
> > > > > > > > >>>
> > > > > > > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > > > > > > >>
> > > > > > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > > > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > > > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > > > > > >> abnormal powe-cut, recovery with this journal before a transaction will
> > > > > > > > >> destroy db. Right?
> > > > > > > > >
> > > > > > > > > Just think about sqlite without this feature.
> > > > > > > > > Broken journal is pretty normal case for sqlite.
> > > > > > > >
> > > > > > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > > > > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > > > > > >
> > > > > > > > >
> > > > > > > > >>>
> > > > > > > > >>>> c) Though, we have journal db file, we will face failure of recovering db
> file
> > > > > > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > > > > > >>>
> > > > > > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > > > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > > > > > >>
> > > > > > > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > > > > > > >> handle both in original design and new design, so the inner revoking failure
> > > > > > > > >> issue would not be a weak point or flaw of new design.
> > > > > > > > >>
> > > > > > > > >>>
> > > > > > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > > > > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > > > > > >>>> atomic-interface.
> > > > > > > > >>>
> > > > > > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > > > > > >>
> > > > > > > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > > > > > > >> revoking, recovery is no need.
> > > > > > > > >
> > > > > > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > > > > > redo the previous transaction. No?
> > > > > > > >
> > > > > > > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > > > > > > record original data of pages which will be updated in following transaction,
> so
> > > > > > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > > > > > triggered by user, we will recover db with journal.
> > > > > > > >
> > > > > > > > Ideally, if we support atomic write interface, in there should always return two
> > > > > > > > status in atomic write interface: success or fail. If success, transaction was
> > > > > > > > committed, otherwise, it looks like nothing happened, user will be told
> > > > > > > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > > > > > > eventually no journal, no recovery.
> > > > > > > >
> > > > > > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > > > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > > > > > journal. IMO, some optimization could be done for these cases:
> > > > > > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > > > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > > > > > >
> > > > > > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > > > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > > > > > w/o atomic feature.
> > > > > > > >
> > > > > > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > > > > > > guess we can optimize DB like:
> > > > > > > > 1. reflink db to db.ref
> > > > > > > > 2. do transaction in db.ref
> > > > > > > >    - failed, rm db.ref
> > > > > > > >    - power-cut rm db.ref
> > > > > > > > 3. rename db.ref to db
> > > > > > > >
> > > > > > > > >
> > > > > > > > >> One more case is that user can send a command to abort current transaction,
> > > > > > > > >> it should be happened before atomic_commit operation, which could easily
> > > > > > > > >> handle with abort_commit ioctl.
> > > > > > > > >>
> > > > > > > > >>>
> > > > > > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write
> > > &
> > > > > > > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > > > > > > >>>> power-cut will leave that data in disk.
> > > > > > > > >>>
> > > > > > > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > > > > > > >>
> > > > > > > > >> Journal could be corrupted as I descripted in b).
> > > > > > > > >
> > > > > > > > > Okay, so what I'm thinking is like this.
> > > > > > > > > It seems there are two corruption cases after journal writes.
> > > > > > > > >
> > > > > > > > > 1. power cut during atomic writes
> > > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > > >
> > > > > > > > > 2. error during atomic writes
> > > > > > > > >  a. power-cut before abort completion
> > > > > > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > > > > > >
> > > > > > > > >  b. after abort
> > > > > > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > > > > > >
> > > > > > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > > > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > > > > > > recover db by 2.b.
> > > > > > > > >
> > > > > > > > > Something like this after successful revoking.
> > > > > > > > >
> > > > > > > > > 1. power cut during atomic writes
> > > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > > >
> > > > > > > > > 2. error during atomic writes w/ revoking
> > > > > > > > >  a. power-cut before abort completion
> > > > > > > > >   - broken journal file and clean db file -> give up
> > > > > > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > > > > > >
> > > > > > > > >  b. after abort
> > > > > > > > >   - valid journal file and clean db file -> recover db
> > > > > > > >
> > > > > > > > That's right.
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Let me verify these scenarios first. :)
> > > > > > > >
> > > > > > > > OK. :)
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > >
> > > > > > > > >>>
> > > > > > > > >>>> With revoking supported design, we can not solve all above issues, we will
> still
> > > > > > > > >>>> face the same issue like c), but it will be a big improve if we can apply
> this
> > > > > > > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also
> > > for
> > > > > > > > >>>> e) case, we try to rescue data in first time that our revoking operation
> would
> > > be
> > > > > > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > > > > > >>>>
> > > > > > > > >>>> If you don't want to have a big change in this interface or recovery flow,
> how
> > > > > > > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > > > > > > >>>
> > > > > > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > > > > > >>
> > > > > > > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > > > > > > >> in android, and hope this can be applied. But, I can understand that if you
> > > > > > > > >> were considerring about risk control and backward compatibility, since this
> > > > > > > > >> change affects all atomic related ioctls.
> > > > > > > > >>
> > > > > > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > > > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > > > > > >>>
> > > > > > > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > > > > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > > > > > >>
> > > > > > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > > > > > >> demand ?
> > > > > > > > >>
> > > > > > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > > > > > >>
> > > > > > > > >> Agreed :)
> > > > > > > > >>
> > > > > > > > >> Thanks,
> > > > > > > > >>
> > > > > > > > >>>
> > > > > > > > >>> Thanks,
> > > > > > > > >>>
> > > > > > > > >>>>
> > > > > > > > >>>> How do you think? :)
> > > > > > > > >>>>
> > > > > > > > >>>> Thanks,
> > > > > > > > >>>>
> > > > > > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > > > > > >>>>> current implementation.
> > > > > > > > >>>>>
> > > > > > > > >>>>> So simply how about this?
> > > > > > > > >>>>>
> > > > > > > > >>>>> A possible flow would be:
> > > > > > > > >>>>> 1. write journal data to volatile space
> > > > > > > > >>>>> 2. write db data to atomic space
> > > > > > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and
> db
> > > > > > > > >>>>>  - flush/fsync journal data to disk
> > > > > > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > > > > > >>>>>
> > > > > > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > > > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > > > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > > > > > >>>>>
> > > > > > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > > > > > >>>>>
> > > > > > > > >>>>> 1. drop atomic writes
> > > > > > > > >>>>>  - we don't need to keep any stale db data.
> > > > > > > > >>>>>
> > > > > > > > >>>>> 2. write journal data
> > > > > > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > > > > > >>>>>
> > > > > > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > > >>>>> ---
> > > > > > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > > > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > > > > > >>>>>
> > > > > > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > > > > > >>>>> index 91f576a..d16438a 100644
> > > > > > > > >>>>> --- a/fs/f2fs/file.c
> > > > > > > > >>>>> +++ b/fs/f2fs/file.c
> > > > > > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file
> > > *filp)
> > > > > > > > >>>>>  	if (ret)
> > > > > > > > >>>>>  		return ret;
> > > > > > > > >>>>>
> > > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > > > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > > > > > >>>>> +	}
> > > > > > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > > > > > >>>>> +		if (!ret)
> > > > > > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > > > > > >>>>> +	}
> > > > > > > > >>>>>
> > > > > > > > >>>>>  	mnt_drop_write_file(filp);
> > > > > > > > >>>>>  	return ret;
> > > > > > > > >>>>> --
> > > > > > > > >>>>> 2.6.3
> > > > > > > > >>>>
> > > > > > > > >
> > > > > > > > > ------------------------------------------------------------------------------
> > > > > > > > > _______________________________________________
> > > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > > > > >
> > > > > > > >
> > > > > > > > ------------------------------------------------------------------------------
> > > > > > > > _______________________________________________
> > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > >
> > > > > > ------------------------------------------------------------------------------
> > > > > > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > > > > > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > > > > > Monitor end-to-end web transactions and take corrective actions now
> > > > > > Troubleshoot faster and improve end-user experience. Signup Now!
> > > > > > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > > > > > _______________________________________________
> > > > > > Linux-f2fs-devel mailing list
> > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-02-02 10:19                           ` [f2fs-dev] " Chao Yu
@ 2016-02-06  4:17                               ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2016-02-06  4:17 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

On Tue, Feb 02, 2016 at 06:19:06PM +0800, Chao Yu wrote:
> > > > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > > > Sent: Wednesday, January 13, 2016 9:18 AM
> > > > > > To: Chao Yu
> > > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > > >
> > > > > > Hi Chao,
> > > > > >
> > > > > > I just injected -EIO for one page among two pages in total into database file.
> > > > > > Then, I tested valid and invalid journal file to see how sqlite recovers the
> > > > > > transaction.
> > > > > >
> > > > > > Interestingly, if journal is valid, database file is recovered, as I could see
> > > > > > the transaction result even after it shows EIO.
> > > > > > But, in the invalid journal case, somehow it drops database changes.
> > > > >
> > > > > If journal has valid data in its header and corrupted data in its body, sqlite will
> > > > > recover db file from corrupted journal file, then db file will be corrupted.
> > > > > So what you mean is: after recovery, db file still be fine? or sqlite fails to
> > > > > recover due to drop data in journal since the header of journal is not valid?
> > > >
> > > > In the above case, I think I made broken journal header. At the same time, I
> > > > broke database file too, but I could see that database file is recovered
> > > > likewise roll-back. I couldn't find corruption of database.
> > > >
> > > > Okay, I'll test again by corrupting journal body with valid header.
> > 
> > Hmm, it's quite difficult to produce any corruption case.
> > 
> > I tried the below tests, but in all the cases, sqlite did rollback successfully.
> 
> As you saw valid db file at final, I suspect that:
> a) db file was recovered by f2fs: after we fail in atomic commit, if
>    checkpoint isn't be triggered to persist partial pages of one
>    transaction, db file will be recovered to last transaction after an
>    abnormal power-cut by f2fs.
> b) or db file was recovered by sqlite: sqlite will try to do the
>    revoking after it detects failure of atomic commit. Similarly, db
>    file will be recovered.
> 
> > 
> >  - -EIO for one db write with valid header + valid body in journal
> >  - -EIO for one db write with valid header + invalid body in journal
> >  - -EIO for one db write with invalid header + valid body in journal
> > 
> > Note that, I checked both integrity_check and table contents after each tests.
> > 
> > I suspect that journal uses checksums to validate its contents?
> 
> Yes, there is one checksum after each 4K-size journal page.
> 
> IMO, it's better to just destroy last one or two journal pages to make
> corrupted journal file. For example, if there are 10 pages in journal, let
> kworker writebacks [0-7] pages include partial old pages of transaction
> and journal header, and holds [8-9] pages in memory, so in disk, [8-9]
> pages were invalid to sqlite due to wrong checksum, and other pages will
> be judged as valid for recovery. Note that, pages after first invalid
> page were also be judged as invalid by sqlite.

Hmm, I couldn't find out the exact scenario to corrupt db finally.
But, when I took a look at the below document, I could agree that it is
a possible scenario.

https://www.sqlite.org/howtocorrupt.html

If possible, could you rebase the patches based on the latest dev-test?
I want to review the patch seriously.

Thanks,

> 
> Thanks,
> 
> > 
> > Thanks,
> > 
> > > >
> > > > Thanks,
> > > >
> > > > >
> > > > > Thanks,
> > > > >
> > > > > > I'm not sure it was because I just skip second page write of database file tho.
> > > > > > (I added random bytes into journal pages.)
> > > > > > I'll break the database file with more random bytes likewise what I did for
> > > > > > journal.
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > > On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > > > > > > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > > > > > > Hi Jaegeuk,
> > > > > > > >
> > > > > > > > Any progress on this patch?
> > > > > > >
> > > > > > > Swamped. Will do.
> > > > > > >
> > > > > > > Thanks,
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > >
> > > > > > > > > -----Original Message-----
> > > > > > > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > > > > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > > > > > > To: Jaegeuk Kim
> > > > > > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > > > > > >
> > > > > > > > > Hi Jaegeuk,
> > > > > > > > >
> > > > > > > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > > > > > > Hi Chao,
> > > > > > > > > >
> > > > > > > > > > ...
> > > > > > > > > >
> > > > > > > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > > > > > >>>>>> f2fs support atomic write with following semantics:
> > > > > > > > > >>>>>> 1. open db file
> > > > > > > > > >>>>>> 2. ioctl start atomic write
> > > > > > > > > >>>>>> 3. (write db file) * n
> > > > > > > > > >>>>>> 4. ioctl commit atomic write
> > > > > > > > > >>>>>> 5. close db file
> > > > > > > > > >>>>>>
> > > > > > > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > > > > > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > > > > > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > > > > > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > > > > > > >>>>>>
> > > > > > > > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > > > > > > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4,
> > we
> > > > > > > > > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > > > > > > > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > > > > > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > > > > > > >>>>> since database should get its error literally.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> So, the only thing that we need to do is keeping journal data for further
> > db
> > > > > > > > > >>>>> recovery.
> > > > > > > > > >>>>
> > > > > > > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > > > > > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > > > > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily
> > try
> > > > > > > > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > > > > > > > >>>> write process.
> > > > > > > > > >>>
> > > > > > > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > > > > > > >>> writes for journal files should be used together to minimize sqlite change
> > as
> > > > > > > > > >>> much as possible.
> > > > > > > > > >>>
> > > > > > > > > >>>> But in current design, we still hold journal data in memory for recovering
> > for
> > > > > > > > > >>>> *rare* failure case. I think there are several issues:
> > > > > > > > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > > > > > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > > > > > > >>>
> > > > > > > > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > > > > > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > > > > > > >>
> > > > > > > > > >> When I change to redirty all volatile pages in ->writepage, android seems go
> > > > > > > > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > > > > > > > >>
> > > > > > > > > >> if (f2fs_is_volatile_file(inode))
> > > > > > > > > >> 	goto redirty_out;
> > > > > > > > > >
> > > > > > > > > > Where did you put this? It doesn't flush at all? Why?
> > > > > > > > >
> > > > > > > > > Original place in ->writepage, just remove two other conditions.
> > > > > > > > >
> > > > > > > > > To avoid potential random writebacking of dirty page in journal which
> > > > > > > > > cause unpredicted corrupting in journal.
> > > > > > > > >
> > > > > > > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > > > > > > are processing concurrently.
> > > > > > > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > > > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > > > > > > >
> > > > > > > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > > > > > > >>
> > > > > > > > > >>>
> > > > > > > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > > > > > > > >>>> disk, it will destroy db file.
> > > > > > > > > >>>
> > > > > > > > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > > > > > > > >>
> > > > > > > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > > > > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > > > > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > > > > > > >> abnormal powe-cut, recovery with this journal before a transaction will
> > > > > > > > > >> destroy db. Right?
> > > > > > > > > >
> > > > > > > > > > Just think about sqlite without this feature.
> > > > > > > > > > Broken journal is pretty normal case for sqlite.
> > > > > > > > >
> > > > > > > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > > > > > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >>>
> > > > > > > > > >>>> c) Though, we have journal db file, we will face failure of recovering db
> > file
> > > > > > > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > > > > > > >>>
> > > > > > > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > > > > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > > > > > > >>
> > > > > > > > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > > > > > > > >> handle both in original design and new design, so the inner revoking failure
> > > > > > > > > >> issue would not be a weak point or flaw of new design.
> > > > > > > > > >>
> > > > > > > > > >>>
> > > > > > > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > > > > > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > > > > > > >>>> atomic-interface.
> > > > > > > > > >>>
> > > > > > > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > > > > > > >>
> > > > > > > > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > > > > > > > >> revoking, recovery is no need.
> > > > > > > > > >
> > > > > > > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > > > > > > redo the previous transaction. No?
> > > > > > > > >
> > > > > > > > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > > > > > > > record original data of pages which will be updated in following transaction,
> > so
> > > > > > > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > > > > > > triggered by user, we will recover db with journal.
> > > > > > > > >
> > > > > > > > > Ideally, if we support atomic write interface, in there should always return two
> > > > > > > > > status in atomic write interface: success or fail. If success, transaction was
> > > > > > > > > committed, otherwise, it looks like nothing happened, user will be told
> > > > > > > > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > > > > > > > eventually no journal, no recovery.
> > > > > > > > >
> > > > > > > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > > > > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > > > > > > journal. IMO, some optimization could be done for these cases:
> > > > > > > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > > > > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > > > > > > >
> > > > > > > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > > > > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > > > > > > w/o atomic feature.
> > > > > > > > >
> > > > > > > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > > > > > > > guess we can optimize DB like:
> > > > > > > > > 1. reflink db to db.ref
> > > > > > > > > 2. do transaction in db.ref
> > > > > > > > >    - failed, rm db.ref
> > > > > > > > >    - power-cut rm db.ref
> > > > > > > > > 3. rename db.ref to db
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >> One more case is that user can send a command to abort current transaction,
> > > > > > > > > >> it should be happened before atomic_commit operation, which could easily
> > > > > > > > > >> handle with abort_commit ioctl.
> > > > > > > > > >>
> > > > > > > > > >>>
> > > > > > > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write
> > > > &
> > > > > > > > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > > > > > > > >>>> power-cut will leave that data in disk.
> > > > > > > > > >>>
> > > > > > > > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > > > > > > > >>
> > > > > > > > > >> Journal could be corrupted as I descripted in b).
> > > > > > > > > >
> > > > > > > > > > Okay, so what I'm thinking is like this.
> > > > > > > > > > It seems there are two corruption cases after journal writes.
> > > > > > > > > >
> > > > > > > > > > 1. power cut during atomic writes
> > > > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > > > >
> > > > > > > > > > 2. error during atomic writes
> > > > > > > > > >  a. power-cut before abort completion
> > > > > > > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > > > > > > >
> > > > > > > > > >  b. after abort
> > > > > > > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > > > > > > >
> > > > > > > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > > > > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > > > > > > > recover db by 2.b.
> > > > > > > > > >
> > > > > > > > > > Something like this after successful revoking.
> > > > > > > > > >
> > > > > > > > > > 1. power cut during atomic writes
> > > > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > > > >
> > > > > > > > > > 2. error during atomic writes w/ revoking
> > > > > > > > > >  a. power-cut before abort completion
> > > > > > > > > >   - broken journal file and clean db file -> give up
> > > > > > > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > > > > > > >
> > > > > > > > > >  b. after abort
> > > > > > > > > >   - valid journal file and clean db file -> recover db
> > > > > > > > >
> > > > > > > > > That's right.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Let me verify these scenarios first. :)
> > > > > > > > >
> > > > > > > > > OK. :)
> > > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks,
> > > > > > > > > >
> > > > > > > > > >>>
> > > > > > > > > >>>> With revoking supported design, we can not solve all above issues, we will
> > still
> > > > > > > > > >>>> face the same issue like c), but it will be a big improve if we can apply
> > this
> > > > > > > > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also
> > > > for
> > > > > > > > > >>>> e) case, we try to rescue data in first time that our revoking operation
> > would
> > > > be
> > > > > > > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > > > > > > >>>>
> > > > > > > > > >>>> If you don't want to have a big change in this interface or recovery flow,
> > how
> > > > > > > > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > > > > > > > >>>
> > > > > > > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > > > > > > >>
> > > > > > > > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > > > > > > > >> in android, and hope this can be applied. But, I can understand that if you
> > > > > > > > > >> were considerring about risk control and backward compatibility, since this
> > > > > > > > > >> change affects all atomic related ioctls.
> > > > > > > > > >>
> > > > > > > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > > > > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > > > > > > >>>
> > > > > > > > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > > > > > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > > > > > > >>
> > > > > > > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > > > > > > >> demand ?
> > > > > > > > > >>
> > > > > > > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > > > > > > >>
> > > > > > > > > >> Agreed :)
> > > > > > > > > >>
> > > > > > > > > >> Thanks,
> > > > > > > > > >>
> > > > > > > > > >>>
> > > > > > > > > >>> Thanks,
> > > > > > > > > >>>
> > > > > > > > > >>>>
> > > > > > > > > >>>> How do you think? :)
> > > > > > > > > >>>>
> > > > > > > > > >>>> Thanks,
> > > > > > > > > >>>>
> > > > > > > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > > > > > > >>>>> current implementation.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> So simply how about this?
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> A possible flow would be:
> > > > > > > > > >>>>> 1. write journal data to volatile space
> > > > > > > > > >>>>> 2. write db data to atomic space
> > > > > > > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and
> > db
> > > > > > > > > >>>>>  - flush/fsync journal data to disk
> > > > > > > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > > > > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > > > > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> 1. drop atomic writes
> > > > > > > > > >>>>>  - we don't need to keep any stale db data.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> 2. write journal data
> > > > > > > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > > > >>>>> ---
> > > > > > > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > > > > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > > > > > > >>>>> index 91f576a..d16438a 100644
> > > > > > > > > >>>>> --- a/fs/f2fs/file.c
> > > > > > > > > >>>>> +++ b/fs/f2fs/file.c
> > > > > > > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file
> > > > *filp)
> > > > > > > > > >>>>>  	if (ret)
> > > > > > > > > >>>>>  		return ret;
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > > > > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > > > > > > >>>>> +	}
> > > > > > > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > > > > > > >>>>> +		if (!ret)
> > > > > > > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > > > > > > >>>>> +	}
> > > > > > > > > >>>>>
> > > > > > > > > >>>>>  	mnt_drop_write_file(filp);
> > > > > > > > > >>>>>  	return ret;
> > > > > > > > > >>>>> --
> > > > > > > > > >>>>> 2.6.3
> > > > > > > > > >>>>
> > > > > > > > > >
> > > > > > > > > > ------------------------------------------------------------------------------
> > > > > > > > > > _______________________________________________
> > > > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > ------------------------------------------------------------------------------
> > > > > > > > > _______________________________________________
> > > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > > >
> > > > > > > ------------------------------------------------------------------------------
> > > > > > > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > > > > > > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > > > > > > Monitor end-to-end web transactions and take corrective actions now
> > > > > > > Troubleshoot faster and improve end-user experience. Signup Now!
> > > > > > > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > > > > > > _______________________________________________
> > > > > > > Linux-f2fs-devel mailing list
> > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH 2/2] f2fs: support revoking atomic written pages
@ 2016-02-06  4:17                               ` Jaegeuk Kim
  0 siblings, 0 replies; 28+ messages in thread
From: Jaegeuk Kim @ 2016-02-06  4:17 UTC (permalink / raw)
  To: Chao Yu; +Cc: linux-kernel, linux-f2fs-devel

Hi Chao,

On Tue, Feb 02, 2016 at 06:19:06PM +0800, Chao Yu wrote:
> > > > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > > > Sent: Wednesday, January 13, 2016 9:18 AM
> > > > > > To: Chao Yu
> > > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > > >
> > > > > > Hi Chao,
> > > > > >
> > > > > > I just injected -EIO for one page among two pages in total into database file.
> > > > > > Then, I tested valid and invalid journal file to see how sqlite recovers the
> > > > > > transaction.
> > > > > >
> > > > > > Interestingly, if journal is valid, database file is recovered, as I could see
> > > > > > the transaction result even after it shows EIO.
> > > > > > But, in the invalid journal case, somehow it drops database changes.
> > > > >
> > > > > If journal has valid data in its header and corrupted data in its body, sqlite will
> > > > > recover db file from corrupted journal file, then db file will be corrupted.
> > > > > So what you mean is: after recovery, db file still be fine? or sqlite fails to
> > > > > recover due to drop data in journal since the header of journal is not valid?
> > > >
> > > > In the above case, I think I made broken journal header. At the same time, I
> > > > broke database file too, but I could see that database file is recovered
> > > > likewise roll-back. I couldn't find corruption of database.
> > > >
> > > > Okay, I'll test again by corrupting journal body with valid header.
> > 
> > Hmm, it's quite difficult to produce any corruption case.
> > 
> > I tried the below tests, but in all the cases, sqlite did rollback successfully.
> 
> As you saw valid db file at final, I suspect that:
> a) db file was recovered by f2fs: after we fail in atomic commit, if
>    checkpoint isn't be triggered to persist partial pages of one
>    transaction, db file will be recovered to last transaction after an
>    abnormal power-cut by f2fs.
> b) or db file was recovered by sqlite: sqlite will try to do the
>    revoking after it detects failure of atomic commit. Similarly, db
>    file will be recovered.
> 
> > 
> >  - -EIO for one db write with valid header + valid body in journal
> >  - -EIO for one db write with valid header + invalid body in journal
> >  - -EIO for one db write with invalid header + valid body in journal
> > 
> > Note that, I checked both integrity_check and table contents after each tests.
> > 
> > I suspect that journal uses checksums to validate its contents?
> 
> Yes, there is one checksum after each 4K-size journal page.
> 
> IMO, it's better to just destroy last one or two journal pages to make
> corrupted journal file. For example, if there are 10 pages in journal, let
> kworker writebacks [0-7] pages include partial old pages of transaction
> and journal header, and holds [8-9] pages in memory, so in disk, [8-9]
> pages were invalid to sqlite due to wrong checksum, and other pages will
> be judged as valid for recovery. Note that, pages after first invalid
> page were also be judged as invalid by sqlite.

Hmm, I couldn't find out the exact scenario to corrupt db finally.
But, when I took a look at the below document, I could agree that it is
possible scenario.

https://www.sqlite.org/howtocorrupt.html

If possible, could you rebase the patches based on the latest dev-test?
I want to review the patch seriously.

Thanks,

> 
> Thanks,
> 
> > 
> > Thanks,
> > 
> > > >
> > > > Thanks,
> > > >
> > > > >
> > > > > Thanks,
> > > > >
> > > > > > I'm not sure it was because I just skip second page write of database file tho.
> > > > > > (I added random bytes into journal pages.)
> > > > > > I'll break the database file with more random bytes likewise what I did for
> > > > > > journal.
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > > On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > > > > > > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > > > > > > Hi Jaegeuk,
> > > > > > > >
> > > > > > > > Any progress on this patch?
> > > > > > >
> > > > > > > Swamped. Will do.
> > > > > > >
> > > > > > > Thanks,
> > > > > > >
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > >
> > > > > > > > > -----Original Message-----
> > > > > > > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > > > > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > > > > > > To: Jaegeuk Kim
> > > > > > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > > > > > >
> > > > > > > > > Hi Jaegeuk,
> > > > > > > > >
> > > > > > > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > > > > > > Hi Chao,
> > > > > > > > > >
> > > > > > > > > > ...
> > > > > > > > > >
> > > > > > > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > > > > > >>>>>> f2fs support atomic write with following semantics:
> > > > > > > > > >>>>>> 1. open db file
> > > > > > > > > >>>>>> 2. ioctl start atomic write
> > > > > > > > > >>>>>> 3. (write db file) * n
> > > > > > > > > >>>>>> 4. ioctl commit atomic write
> > > > > > > > > >>>>>> 5. close db file
> > > > > > > > > >>>>>>
> > > > > > > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > > > > > > >>>>>> cut, because we hold data of transaction in referenced pages linked in
> > > > > > > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these data
> > > > > > > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > > > > > > >>>>>>
> > > > > > > > > >>>>>> But we should still hold journal db file in memory by using volatile write,
> > > > > > > > > >>>>>> because our semantics of 'atomic write support' is not full, in step 4,
> > we
> > > > > > > > > >>>>>> could be fail to submit all dirty data of transaction, once partial dirty
> > > > > > > > > >>>>>> data was committed in storage, db file should be corrupted, in this case,
> > > > > > > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > > > > > > >>>>> since database should get its error literally.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> So, the only thing that we need to do is keeping journal data for further
> > db
> > > > > > > > > >>>>> recovery.
> > > > > > > > > >>>>
> > > > > > > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal data
> > > > > > > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > > > > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily
> > try
> > > > > > > > > >>>> to revoke (replace old to new in metadata) when any failure exist in atomic
> > > > > > > > > >>>> write process.
> > > > > > > > > >>>
> > > > > > > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > > > > > > >>> writes for journal files should be used together to minimize sqlite change
> > as
> > > > > > > > > >>> much as possible.
> > > > > > > > > >>>
> > > > > > > > > >>>> But in current design, we still hold journal data in memory for recovering
> > for
> > > > > > > > > >>>> *rare* failure case. I think there are several issues:
> > > > > > > > > >>>> a) most of time, we are in concurrent scenario, so if large number of journal
> > > > > > > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > > > > > > >>>
> > > > > > > > > >>> In current android, I've seen that this is not a big concern. Even there is
> > > > > > > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > > > > > > >>
> > > > > > > > > >> When I change to redirty all volatile pages in ->writepage, android seems to go
> > > > > > > > > >> into an infinite loop when doing recovery flow of f2fs data partition in startup.
> > > > > > > > > >>
> > > > > > > > > >> if (f2fs_is_volatile_file(inode))
> > > > > > > > > >> 	goto redirty_out;
> > > > > > > > > >
> > > > > > > > > > Where did you put this? It doesn't flush at all? Why?
> > > > > > > > >
> > > > > > > > > Original place in ->writepage, just remove two other conditions.
> > > > > > > > >
> > > > > > > > > To avoid potential random writebacking of dirty page in journal which
> > > > > > > > > cause unpredicted corrupting in journal.
> > > > > > > > >
> > > > > > > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > > > > > > are processing concurrently.
> > > > > > > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > > > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > > > > > > >
> > > > > > > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > > > > > > >>
> > > > > > > > > >>>
> > > > > > > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal db into
> > > > > > > > > >>>> disk, it will destroy db file.
> > > > > > > > > >>>
> > > > > > > > > >>> I don't understand. Could you elaborate why journal writes can corrupt db?
> > > > > > > > > >>
> > > > > > > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > > > > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > > > > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > > > > > > >> abnormal powe-cut, recovery with this journal before a transaction will
> > > > > > > > > >> destroy db. Right?
> > > > > > > > > >
> > > > > > > > > > Just think about sqlite without this feature.
> > > > > > > > > > Broken journal is pretty normal case for sqlite.
> > > > > > > > >
> > > > > > > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > > > > > > or filesystem, we should try our best to fix it to avoid generating broken journals.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >>>
> > > > > > > > > >>>> c) Though, we have journal db file, we will face failure of recovering db
> > file
> > > > > > > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > > > > > > >>>
> > > > > > > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > > > > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > > > > > > >>
> > > > > > > > > >> Yes, just list for indicating we will face the same issue which is hard to
> > > > > > > > > >> handle both in original design and new design, so the inner revoking failure
> > > > > > > > > >> issue would not be a weak point or flaw of new design.
> > > > > > > > > >>
> > > > > > > > > >>>
> > > > > > > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream and
> > > > > > > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > > > > > > >>>> atomic-interface.
> > > > > > > > > >>>
> > > > > > > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > > > > > > >>
> > > > > > > > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > > > > > > > >> revoking, recovery is no need.
> > > > > > > > > >
> > > > > > > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > > > > > > redo the previous transaction. No?
> > > > > > > > >
> > > > > > > > > As we know, in sqlite, before we commit a transaction, we will use journal to
> > > > > > > > > record original data of pages which will be updated in following transaction,
> > so
> > > > > > > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > > > > > > triggered by user, we will recover db with journal.
> > > > > > > > >
> > > > > > > > > Ideally, if we support atomic write interface, in there should always return two
> > > > > > > > > status in atomic write interface: success or fail. If success, transaction was
> > > > > > > > > committed, otherwise, it looks like nothing happened, user will be told
> > > > > > > > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > > > > > > > eventually no journal, no recovery.
> > > > > > > > >
> > > > > > > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > > > > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > > > > > > journal. IMO, some optimization could be done for these cases:
> > > > > > > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > > > > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > > > > > > >
> > > > > > > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > > > > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > > > > > > w/o atomic feature.
> > > > > > > > >
> > > > > > > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs, I
> > > > > > > > > guess we can optimize DB like:
> > > > > > > > > 1. reflink db to db.ref
> > > > > > > > > 2. do transaction in db.ref
> > > > > > > > >    - failed, rm db.ref
> > > > > > > > >    - power-cut rm db.ref
> > > > > > > > > 3. rename db.ref to db
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >> One more case is that user can send a command to abort current transaction,
> > > > > > > > > >> it should be happened before atomic_commit operation, which could easily
> > > > > > > > > >> handle with abort_commit ioctl.
> > > > > > > > > >>
> > > > > > > > > >>>
> > > > > > > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort write
> > > > &
> > > > > > > > > >>>> recover, checkpoint will persist the corrupt data in db file, following abnormal
> > > > > > > > > >>>> power-cut will leave that data in disk.
> > > > > > > > > >>>
> > > > > > > > > >>> Yes, in that case, database should recover corrupted db with its journal file.
> > > > > > > > > >>
> > > > > > > > > >> Journal could be corrupted as I described in b).
> > > > > > > > > >
> > > > > > > > > > Okay, so what I'm thinking is like this.
> > > > > > > > > > It seems there are two corruption cases after journal writes.
> > > > > > > > > >
> > > > > > > > > > 1. power cut during atomic writes
> > > > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > > > >
> > > > > > > > > > 2. error during atomic writes
> > > > > > > > > >  a. power-cut before abort completion
> > > > > > > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > > > > > > >
> > > > > > > > > >  b. after abort
> > > > > > > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > > > > > > >
> > > > > > > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > > > > > > But, I think, even if revoking is done, we should notify an error to abort and
> > > > > > > > > > recover db by 2.b.
> > > > > > > > > >
> > > > > > > > > > Something like this after successful revoking.
> > > > > > > > > >
> > > > > > > > > > 1. power cut during atomic writes
> > > > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > > > >
> > > > > > > > > > 2. error during atomic writes w/ revoking
> > > > > > > > > >  a. power-cut before abort completion
> > > > > > > > > >   - broken journal file and clean db file -> give up
> > > > > > > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > > > > > > >
> > > > > > > > > >  b. after abort
> > > > > > > > > >   - valid journal file and clean db file -> recover db
> > > > > > > > >
> > > > > > > > > That's right.
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Let me verify these scenarios first. :)
> > > > > > > > >
> > > > > > > > > OK. :)
> > > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > Thanks,
> > > > > > > > > >
> > > > > > > > > >>>
> > > > > > > > > >>>> With revoking supported design, we can not solve all above issues, we will
> > still
> > > > > > > > > >>>> face the same issue like c), but it will be a big improve if we can apply
> > this
> > > > > > > > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And also
> > > > for
> > > > > > > > > >>>> e) case, we try to rescue data in first time that our revoking operation
> > would
> > > > be
> > > > > > > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > > > > > > >>>>
> > > > > > > > > >>>> If you don't want to have a big change in this interface or recovery flow,
> > how
> > > > > > > > > >>>> about keep them both, and add a mount option to control inner recovery flow?
> > > > > > > > > >>>
> > > > > > > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > > > > > > >>
> > > > > > > > > >> I believe new design will enhance in memory usage and error handling of sqlite
> > > > > > > > > >> in android, and hope this can be applied. But, I can understand that if you
> > > > > > > > > >> were considering about risk control and backward compatibility, since this
> > > > > > > > > >> change affects all atomic related ioctls.
> > > > > > > > > >>
> > > > > > > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > > > > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > > > > > > >>>
> > > > > > > > > >>> It'd better to add a new ioctl for that, but before adding it, can we find
> > > > > > > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > > > > > > >>
> > > > > > > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > > > > > > >> demand ?
> > > > > > > > > >>
> > > > > > > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > > > > > > >>
> > > > > > > > > >> Agreed :)
> > > > > > > > > >>
> > > > > > > > > >> Thanks,
> > > > > > > > > >>
> > > > > > > > > >>>
> > > > > > > > > >>> Thanks,
> > > > > > > > > >>>
> > > > > > > > > >>>>
> > > > > > > > > >>>> How do you think? :)
> > > > > > > > > >>>>
> > > > > > > > > >>>> Thanks,
> > > > > > > > > >>>>
> > > > > > > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > > > > > > >>>>> current implementation.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> So simply how about this?
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> A possible flow would be:
> > > > > > > > > >>>>> 1. write journal data to volatile space
> > > > > > > > > >>>>> 2. write db data to atomic space
> > > > > > > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal and
> > db
> > > > > > > > > >>>>>  - flush/fsync journal data to disk
> > > > > > > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > > > > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > > > > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> 1. drop atomic writes
> > > > > > > > > >>>>>  - we don't need to keep any stale db data.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> 2. write journal data
> > > > > > > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > > > >>>>> ---
> > > > > > > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > > > > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > > > > > > >>>>> index 91f576a..d16438a 100644
> > > > > > > > > >>>>> --- a/fs/f2fs/file.c
> > > > > > > > > >>>>> +++ b/fs/f2fs/file.c
> > > > > > > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct file
> > > > *filp)
> > > > > > > > > >>>>>  	if (ret)
> > > > > > > > > >>>>>  		return ret;
> > > > > > > > > >>>>>
> > > > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > > > > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > > > > > > >>>>> +	}
> > > > > > > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > > > > > > >>>>> +		if (!ret)
> > > > > > > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > > > > > > >>>>> +	}
> > > > > > > > > >>>>>
> > > > > > > > > >>>>>  	mnt_drop_write_file(filp);
> > > > > > > > > >>>>>  	return ret;
> > > > > > > > > >>>>> --
> > > > > > > > > >>>>> 2.6.3
> > > > > > > > > >>>>
> > > > > > > > > >
> > > > > > > > > > ------------------------------------------------------------------------------
> > > > > > > > > > _______________________________________________
> > > > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > ------------------------------------------------------------------------------
> > > > > > > > > _______________________________________________
> > > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > > >
> > > > > > > ------------------------------------------------------------------------------
> > > > > > > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > > > > > > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > > > > > > Monitor end-to-end web transactions and take corrective actions now
> > > > > > > Troubleshoot faster and improve end-user experience. Signup Now!
> > > > > > > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > > > > > > _______________________________________________
> > > > > > > Linux-f2fs-devel mailing list
> > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

------------------------------------------------------------------------------
Site24x7 APM Insight: Get Deep Visibility into Application Performance
APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
Monitor end-to-end web transactions and take corrective actions now
Troubleshoot faster and improve end-user experience. Signup Now!
http://pubads.g.doubleclick.net/gampad/clk?id=272487151&iu=/4140

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
  2016-02-06  4:17                               ` Jaegeuk Kim
  (?)
@ 2016-02-06  6:36                               ` Chao Yu
  -1 siblings, 0 replies; 28+ messages in thread
From: Chao Yu @ 2016-02-06  6:36 UTC (permalink / raw)
  To: 'Jaegeuk Kim'; +Cc: linux-kernel, linux-f2fs-devel

Hi Jaegeuk,

> -----Original Message-----
> From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> Sent: Saturday, February 06, 2016 12:18 PM
> To: Chao Yu
> Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> 
> Hi Chao,
> 
> On Tue, Feb 02, 2016 at 06:19:06PM +0800, Chao Yu wrote:
> > > > > > > From: Jaegeuk Kim [mailto:jaegeuk@kernel.org]
> > > > > > > Sent: Wednesday, January 13, 2016 9:18 AM
> > > > > > > To: Chao Yu
> > > > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written pages
> > > > > > >
> > > > > > > Hi Chao,
> > > > > > >
> > > > > > > I just injected -EIO for one page among two pages in total into database file.
> > > > > > > Then, I tested valid and invalid journal file to see how sqlite recovers the
> > > > > > > transaction.
> > > > > > >
> > > > > > > Interestingly, if journal is valid, database file is recovered, as I could see
> > > > > > > the transaction result even after it shows EIO.
> > > > > > > But, in the invalid journal case, somehow it drops database changes.
> > > > > >
> > > > > > If journal has valid data in its header and corrupted data in its body, sqlite will
> > > > > > recover db file from corrupted journal file, then db file will be corrupted.
> > > > > > So what you mean is: after recovery, db file still be fine? or sqlite fails to
> > > > > > recover due to drop data in journal since the header of journal is not valid?
> > > > >
> > > > > In the above case, I think I made broken journal header. At the same time, I
> > > > > broke database file too, but I could see that database file is recovered
> > > > > likewise roll-back. I couldn't find corruption of database.
> > > > >
> > > > > Okay, I'll test again by corrupting journal body with valid header.
> > >
> > > Hmm, it's quite difficult to produce any corruption case.
> > >
> > > I tried the below tests, but in all the cases, sqlite did rollback successfully.
> >
> > As you saw valid db file at final, I suspect that:
> > a) db file was recovered by f2fs: after we fail in atomic commit, if
> >    checkpoint isn't be triggered to persist partial pages of one
> >    transaction, db file will be recovered to last transaction after an
> >    abnormal power-cut by f2fs.
> > b) or db file was recovered by sqlite: sqlite will try to do the
> >    revoking after it detects failure of atomic commit. Similarly, db
> >    file will be recovered.
> >
> > >
> > >  - -EIO for one db write with valid header + valid body in journal
> > >  - -EIO for one db write with valid header + invalid body in journal
> > >  - -EIO for one db write with invalid header + valid body in journal
> > >
> > > Note that, I checked both integrity_check and table contents after each tests.
> > >
> > > I suspect that journal uses checksums to validate its contents?
> >
> > Yes, there is one checksum after each 4K-size journal page.
> >
> > IMO, it's better to just destroy last one or two journal pages to make
> > corrupted journal file. For example, if there are 10 pages in journal, let
> > kworker writebacks [0-7] pages include partial old pages of transaction
> > and journal header, and holds [8-9] pages in memory, so in disk, [8-9]
> > pages were invalid to sqlite due to wrong checksum, and other pages will
> > be judged as valid for recovery. Note that, pages after first invalid
> > page were also be judged as invalid by sqlite.
> 
> Hmm, I couldn't find out the exact scenario to corrupt db finally.
> But, when I took a look at the below document, I could agree that it is
> possible scenario.
> 
> https://www.sqlite.org/howtocorrupt.html
> 
> If possible, could you rebase the patches based on the latest dev-test?
> I want to review the patch seriously.

No problem, please help to review following patches. :)

Thanks,

> 
> Thanks,
> 
> >
> > Thanks,
> >
> > >
> > > Thanks,
> > >
> > > > >
> > > > > Thanks,
> > > > >
> > > > > >
> > > > > > Thanks,
> > > > > >
> > > > > > > I'm not sure it was because I just skip second page write of database file tho.
> > > > > > > (I added random bytes into journal pages.)
> > > > > > > I'll break the database file with more random bytes likewise what I did for
> > > > > > > journal.
> > > > > > >
> > > > > > > Thanks,
> > > > > > >
> > > > > > > On Fri, Jan 08, 2016 at 11:43:06AM -0800, Jaegeuk Kim wrote:
> > > > > > > > On Fri, Jan 08, 2016 at 08:05:52PM +0800, Chao Yu wrote:
> > > > > > > > > Hi Jaegeuk,
> > > > > > > > >
> > > > > > > > > Any progress on this patch?
> > > > > > > >
> > > > > > > > Swamped. Will do.
> > > > > > > >
> > > > > > > > Thanks,
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Thanks,
> > > > > > > > >
> > > > > > > > > > -----Original Message-----
> > > > > > > > > > From: Chao Yu [mailto:chao@kernel.org]
> > > > > > > > > > Sent: Friday, January 01, 2016 8:14 PM
> > > > > > > > > > To: Jaegeuk Kim
> > > > > > > > > > Cc: linux-kernel@vger.kernel.org; linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > > > Subject: Re: [f2fs-dev] [PATCH 2/2] f2fs: support revoking atomic written
> pages
> > > > > > > > > >
> > > > > > > > > > Hi Jaegeuk,
> > > > > > > > > >
> > > > > > > > > > On 1/1/16 11:50 AM, Jaegeuk Kim wrote:
> > > > > > > > > > > Hi Chao,
> > > > > > > > > > >
> > > > > > > > > > > ...
> > > > > > > > > > >
> > > > > > > > > > >>>>> On Tue, Dec 29, 2015 at 11:12:36AM +0800, Chao Yu wrote:
> > > > > > > > > > >>>>>> f2fs support atomic write with following semantics:
> > > > > > > > > > >>>>>> 1. open db file
> > > > > > > > > > >>>>>> 2. ioctl start atomic write
> > > > > > > > > > >>>>>> 3. (write db file) * n
> > > > > > > > > > >>>>>> 4. ioctl commit atomic write
> > > > > > > > > > >>>>>> 5. close db file
> > > > > > > > > > >>>>>>
> > > > > > > > > > >>>>>> With this flow we can avoid file becoming corrupted when abnormal power
> > > > > > > > > > >>>>>> cut, because we hold data of transaction in referenced pages linked
> in
> > > > > > > > > > >>>>>> inmem_pages list of inode, but without setting them dirty, so these
> data
> > > > > > > > > > >>>>>> won't be persisted unless we commit them in step 4.
> > > > > > > > > > >>>>>>
> > > > > > > > > > >>>>>> But we should still hold journal db file in memory by using volatile
> write,
> > > > > > > > > > >>>>>> because our semantics of 'atomic write support' is not full, in step
> 4,
> > > we
> > > > > > > > > > >>>>>> could be fail to submit all dirty data of transaction, once partial
> dirty
> > > > > > > > > > >>>>>> data was committed in storage, db file should be corrupted, in this
> case,
> > > > > > > > > > >>>>>> we should use journal db to recover the original data in db file.
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> Originally, IOC_ABORT_VOLATILE_WRITE was supposed to handle commit failures,
> > > > > > > > > > >>>>> since database should get its error literally.
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> So, the only thing that we need to do is keeping journal data for further
> > > db
> > > > > > > > > > >>>>> recovery.
> > > > > > > > > > >>>>
> > > > > > > > > > >>>> IMO, if we really support *atomic* interface, we don't need any journal
> data
> > > > > > > > > > >>>> kept by user, because f2fs already have it in its storage since we always
> > > > > > > > > > >>>> trigger OPU for pages written in atomic-write opened file, f2fs can easily
> > > try
> > > > > > > > > > >>>> to revoke (replace old to new in metadata) when any failure exist in
> atomic
> > > > > > > > > > >>>> write process.
> > > > > > > > > > >>>
> > > > > > > > > > >>> Yeah, so current design does not fully support atomic writes. IOWs, volatile
> > > > > > > > > > >>> writes for journal files should be used together to minimize sqlite change
> > > as
> > > > > > > > > > >>> much as possible.
> > > > > > > > > > >>>
> > > > > > > > > > >>>> But in current design, we still hold journal data in memory for recovering
> > > for
> > > > > > > > > > >>>> *rare* failure case. I think there are several issues:
> > > > > > > > > > >>>> a) most of time, we are in concurrent scenario, so if large number of
> journal
> > > > > > > > > > >>>> db files were opened simultaneously, we are under big memory pressure.
> > > > > > > > > > >>>
> > > > > > > > > > >>> In current android, I've seen that this is not a big concern. Even there
> is
> > > > > > > > > > >>> memory pressure, f2fs flushes volatile pages.
> > > > > > > > > > >>
> > > > > > > > > > >> When I change to redirty all volatile pages in ->writepage, android seems
> go
> > > > > > > > > > >> into an infinite loop when doing recovery flow of f2fs data partition in
> startup.
> > > > > > > > > > >>
> > > > > > > > > > >> if (f2fs_is_volatile_file(inode))
> > > > > > > > > > >> 	goto redirty_out;
> > > > > > > > > > >
> > > > > > > > > > > Where did you put this? It doesn't flush at all? Why?
> > > > > > > > > >
> > > > > > > > > > Original place in ->writepage, just remove two other conditions.
> > > > > > > > > >
> > > > > > > > > > To avoid potential random writebacking of dirty page in journal which
> > > > > > > > > > causes unpredictable corruption in the journal.
> > > > > > > > > >
> > > > > > > > > > > Practically, the peak amount of journal writes depend on how many transactions
> > > > > > > > > > > are processing concurrently.
> > > > > > > > > > > I mean, in-memory pages are dropped at the end of every transaction.
> > > > > > > > > > > You can check the number of pages through f2fs_stat on your phone.
> > > > > > > > > > >
> > > > > > > > > > >> I didn't dig details, but I think there may be a little risk for this design.
> > > > > > > > > > >>
> > > > > > > > > > >>>
> > > > > > > > > > >>>> b) If we are out of memory, reclaimer tries to write page of journal
> db into
> > > > > > > > > > >>>> disk, it will destroy db file.
> > > > > > > > > > >>>
> > > > > > > > > > >>> I don't understand. Could you elaborate why journal writes can corrupt
> db?
> > > > > > > > > > >>
> > > > > > > > > > >> Normally, we keep pages of journal in memory, but partial page in journal
> > > > > > > > > > >> will be write out to device by reclaimer when out of memory. So this journal
> > > > > > > > > > >> may have valid data in its log head, but with corrupted data, then after
> > > > > > > > > > >> abnormal power-cut, recovery with this journal before a transaction will
> > > > > > > > > > >> destroy db. Right?
> > > > > > > > > > >
> > > > > > > > > > > Just think about sqlite without this feature.
> > > > > > > > > > > Broken journal is pretty normal case for sqlite.
> > > > > > > > > >
> > > > > > > > > > Maybe, if it is caused by bug or design issue of software, no matter db system
> > > > > > > > > > or filesystem, we should try our best to fix it to avoid generating broken
> journals.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >>>
> > > > > > > > > > >>>> c) Though, we have journal db file, we will face failure of recovering
> db
> > > file
> > > > > > > > > > >>>> from journal db due to ENOMEM or EIO, then db file will be corrupted.
> > > > > > > > > > >>>
> > > > > > > > > > >>> Do you mean the failure of recovering db with a complete journal?
> > > > > > > > > > >>> Why do we have to handle that? That's a database stuff, IMO.
> > > > > > > > > > >>
> > > > > > > > > > >> Yes, just list for indicating we will face the same issue which is hard
> to
> > > > > > > > > > >> handle both in original design and new design, so the inner revoking failure
> > > > > > > > > > >> issue would not be a weak point or flaw of new design.
> > > > > > > > > > >>
> > > > > > > > > > >>>
> > > > > > > > > > >>>> d) Recovery flow will make data page dirty, triggering both data stream
> and
> > > > > > > > > > >>>> metadata stream, there should be more IOs than in inner revoking in
> > > > > > > > > > >>>> atomic-interface.
> > > > > > > > > > >>>
> > > > > > > > > > >>> Well, do you mean there is no need to recover db after revoking?
> > > > > > > > > > >>
> > > > > > > > > > >> Yes, revoking make the same effect like the recovery of sqlite, so after
> > > > > > > > > > >> revoking, recovery is no need.
> > > > > > > > > > >
> > > > > > > > > > > Logically, it doesn't make sense. If there is a valid journal file, it should
> > > > > > > > > > > redo the previous transaction. No?
> > > > > > > > > >
> > > > > > > > > > As we know, in sqlite, before we commit a transaction, we will use journal
> to
> > > > > > > > > > record original data of pages which will be updated in following transaction,
> > > so
> > > > > > > > > > in following if a) abnormal power-cut, b) commit error, c) redo command was
> > > > > > > > > > triggered by user, we will recover db with journal.
> > > > > > > > > >
> > > > > > > > > > Ideally, if we support atomic write interface, in there should always return
> two
> > > > > > > > > > status in atomic write interface: success or fail. If success, transaction
> was
> > > > > > > > > > committed, otherwise, it looks like nothing happened, user will be told
> > > > > > > > > > transaction was failed. Then, journals in sqlite could no longer be used,
> > > > > > > > > > eventually no journal, no recovery.
> > > > > > > > > >
> > > > > > > > > > The only thing we should concern is inner failure (e.g. ENOMEM, ENOSPC) of
> > > > > > > > > > revoking in commit interface since it could destroy db file permanently w/o
> > > > > > > > > > journal. IMO, some optimization could be done for these cases:
> > > > > > > > > > 1. ENOMEM: enable retrying or mark accessed flag in page in advance.
> > > > > > > > > > 2. ENOSPC: preallocate blocks for node blocks and data blocks.
> > > > > > > > > >
> > > > > > > > > > These optimizations couldn't guarantee no failure in revoking operation
> > > > > > > > > > completely, luckily, those are not common cases, and they also happen in sqlite
> > > > > > > > > > w/o atomic feature.
> > > > > > > > > >
> > > > > > > > > > One more possible proposal is: if we support reflink feature like ocfs2/xfs,
> I
> > > > > > > > > > guess we can optimize DB like:
> > > > > > > > > > 1. reflink db to db.ref
> > > > > > > > > > 2. do transaction in db.ref
> > > > > > > > > >    - failed, rm db.ref
> > > > > > > > > >    - power-cut rm db.ref
> > > > > > > > > > 3. rename db.ref to db
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > >> One more case is that user can send a command to abort current transaction,
> > > > > > > > > > >> it should be happened before atomic_commit operation, which could easily
> > > > > > > > > > >> handle with abort_commit ioctl.
> > > > > > > > > > >>
> > > > > > > > > > >>>
> > > > > > > > > > >>>> e) Moreover, there should be a hole between 1) commit fail and 2) abort
> write
> > > > > &
> > > > > > > > > > >>>> recover, checkpoint will persist the corrupt data in db file, following
> abnormal
> > > > > > > > > > >>>> power-cut will leave that data in disk.
> > > > > > > > > > >>>
> > > > > > > > > > >>> Yes, in that case, database should recover corrupted db with its journal
> file.
> > > > > > > > > > >>
> > > > > > > > > > >> Journal could be corrupted as I described in b).
> > > > > > > > > > >
> > > > > > > > > > > Okay, so what I'm thinking is like this.
> > > > > > > > > > > It seems there are two corruption cases after journal writes.
> > > > > > > > > > >
> > > > > > > > > > > 1. power cut during atomic writes
> > > > > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > > > > >
> > > > > > > > > > > 2. error during atomic writes
> > > > > > > > > > >  a. power-cut before abort completion
> > > > > > > > > > >   - broken journal file and broken db file -> revoking is needed!
> > > > > > > > > > >
> > > > > > > > > > >  b. after abort
> > > > > > > > > > >   - valid journal file and broken db file -> recover db (likewise plain sqlite)
> > > > > > > > > > >
> > > > > > > > > > > Indeed, in the 2.a. case, we need revoking; I guess that's what you mentioned.
> > > > > > > > > > > But, I think, even if revoking is done, we should notify an error to abort
> and
> > > > > > > > > > > recover db by 2.b.
> > > > > > > > > > >
> > > > > > > > > > > Something like this after successful revoking.
> > > > > > > > > > >
> > > > > > > > > > > 1. power cut during atomic writes
> > > > > > > > > > >  - broken journal file and clean db file -> give up
> > > > > > > > > > >  - luckily, valid journal file and clean db file -> recover db
> > > > > > > > > > >
> > > > > > > > > > > 2. error during atomic writes w/ revoking
> > > > > > > > > > >  a. power-cut before abort completion
> > > > > > > > > > >   - broken journal file and clean db file -> give up
> > > > > > > > > > >   - luckily, valid journal file and clean db file -> recover db
> > > > > > > > > > >
> > > > > > > > > > >  b. after abort
> > > > > > > > > > >   - valid journal file and clean db file -> recover db
> > > > > > > > > >
> > > > > > > > > > That's right.
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Let me verify these scenarios first. :)
> > > > > > > > > >
> > > > > > > > > > OK. :)
> > > > > > > > > >
> > > > > > > > > > Thanks,
> > > > > > > > > >
> > > > > > > > > > >
> > > > > > > > > > > Thanks,
> > > > > > > > > > >
> > > > > > > > > > >>>
> > > > > > > > > > >>>> With revoking supported design, we can not solve all above issues, we
> will
> > > still
> > > > > > > > > > >>>> face the same issue like c), but it will be a big improve if we can apply
> > > this
> > > > > > > > > > >>>> in our interface, since it provide a way to fix the issue a) b) d). And
> also
> > > > > for
> > > > > > > > > > >>>> e) case, we try to rescue data in first time that our revoking operation
> > > would
> > > > > be
> > > > > > > > > > >>>> protected by f2fs_lock_op to avoid checkpoint + power-cut.
> > > > > > > > > > >>>>
> > > > > > > > > > >>>> If you don't want to have a big change in this interface or recovery
> flow,
> > > how
> > > > > > > > > > >>>> about keep them both, and add a mount option to control inner recovery
> flow?
> > > > > > > > > > >>>
> > > > > > > > > > >>> Hmm, okay. I believe the current design is fine for sqlite in android.
> > > > > > > > > > >>
> > > > > > > > > > >> I believe new design will enhance in memory usage and error handling of
> sqlite
> > > > > > > > > > >> in android, and hope this can be applied. But, I can understand that if
> you
> > > > > > > > > > >> were considerring about risk control and backward compatibility, since
> this
> > > > > > > > > > >> change affects all atomic related ioctls.
> > > > > > > > > > >>
> > > > > > > > > > >>> For other databases, I can understand that they can use atomic_write without
> > > > > > > > > > >>> journal control, which is a sort of stand-alone atomic_write.
> > > > > > > > > > >>>
> > > > > > > > > > >>> It'd better to add a new ioctl for that, but before adding it, can we
> find
> > > > > > > > > > >>> any usecase for this feature? (e.g., postgresql, mysql, mariadb, couchdb?)
> > > > > > > > > > >>
> > > > > > > > > > >> You mean investigating or we can only start when there is a clear commercial
> > > > > > > > > > >> demand ?
> > > > > > > > > > >>
> > > > > > > > > > >>> Then, I expect that we can define a more appropriate and powerful ioctl.
> > > > > > > > > > >>
> > > > > > > > > > >> Agreed :)
> > > > > > > > > > >>
> > > > > > > > > > >> Thanks,
> > > > > > > > > > >>
> > > > > > > > > > >>>
> > > > > > > > > > >>> Thanks,
> > > > > > > > > > >>>
> > > > > > > > > > >>>>
> > > > > > > > > > >>>> How do you think? :)
> > > > > > > > > > >>>>
> > > > > > > > > > >>>> Thanks,
> > > > > > > > > > >>>>
> > > > > > > > > > >>>>> But, unfortunately, it seems that something is missing in the
> > > > > > > > > > >>>>> current implementation.
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> So simply how about this?
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> A possible flow would be:
> > > > > > > > > > >>>>> 1. write journal data to volatile space
> > > > > > > > > > >>>>> 2. write db data to atomic space
> > > > > > > > > > >>>>> 3. in the error case, call ioc_abort_volatile_writes for both journal
> and
> > > db
> > > > > > > > > > >>>>>  - flush/fsync journal data to disk
> > > > > > > > > > >>>>>  - drop atomic data, and will be recovered by database with journal
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> From cb33fc8bc30981c370ec70fe68871130109793ec Mon Sep 17 00:00:00 2001
> > > > > > > > > > >>>>> From: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > > > > >>>>> Date: Tue, 29 Dec 2015 15:46:33 -0800
> > > > > > > > > > >>>>> Subject: [PATCH] f2fs: fix f2fs_ioc_abort_volatile_write
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> There are two rules to handle aborting volatile or atomic writes.
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> 1. drop atomic writes
> > > > > > > > > > >>>>>  - we don't need to keep any stale db data.
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> 2. write journal data
> > > > > > > > > > >>>>>  - we should keep the journal data with fsync for db recovery.
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
> > > > > > > > > > >>>>> ---
> > > > > > > > > > >>>>>  fs/f2fs/file.c | 13 ++++++++++---
> > > > > > > > > > >>>>>  1 file changed, 10 insertions(+), 3 deletions(-)
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
> > > > > > > > > > >>>>> index 91f576a..d16438a 100644
> > > > > > > > > > >>>>> --- a/fs/f2fs/file.c
> > > > > > > > > > >>>>> +++ b/fs/f2fs/file.c
> > > > > > > > > > >>>>> @@ -1433,9 +1433,16 @@ static int f2fs_ioc_abort_volatile_write(struct
> file
> > > > > *filp)
> > > > > > > > > > >>>>>  	if (ret)
> > > > > > > > > > >>>>>  		return ret;
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > > > > >>>>> -	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > > > > >>>>> -	commit_inmem_pages(inode, true);
> > > > > > > > > > >>>>> +	if (f2fs_is_atomic_file(inode)) {
> > > > > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
> > > > > > > > > > >>>>> +		commit_inmem_pages(inode, true);
> > > > > > > > > > >>>>> +	}
> > > > > > > > > > >>>>> +	if (f2fs_is_volatile_file(inode)) {
> > > > > > > > > > >>>>> +		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
> > > > > > > > > > >>>>> +		ret = commit_inmem_pages(inode, false);
> > > > > > > > > > >>>>> +		if (!ret)
> > > > > > > > > > >>>>> +			ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
> > > > > > > > > > >>>>> +	}
> > > > > > > > > > >>>>>
> > > > > > > > > > >>>>>  	mnt_drop_write_file(filp);
> > > > > > > > > > >>>>>  	return ret;
> > > > > > > > > > >>>>> --
> > > > > > > > > > >>>>> 2.6.3
> > > > > > > > > > >>>>
> > > > > > > > > > >
> > > > > > > > > > >
> ------------------------------------------------------------------------------
> > > > > > > > > > > _______________________________________________
> > > > > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > >
> ------------------------------------------------------------------------------
> > > > > > > > > > _______________________________________________
> > > > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel
> > > > > > > >
> > > > > > > > ------------------------------------------------------------------------------
> > > > > > > > Site24x7 APM Insight: Get Deep Visibility into Application Performance
> > > > > > > > APM + Mobile APM + RUM: Monitor 3 App instances at just $35/Month
> > > > > > > > Monitor end-to-end web transactions and take corrective actions now
> > > > > > > > Troubleshoot faster and improve end-user experience. Signup Now!
> > > > > > > > http://pubads.g.doubleclick.net/gampad/clk?id=267308311&iu=/4140
> > > > > > > > _______________________________________________
> > > > > > > > Linux-f2fs-devel mailing list
> > > > > > > > Linux-f2fs-devel@lists.sourceforge.net
> > > > > > > > https://lists.sourceforge.net/lists/listinfo/linux-f2fs-devel

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2016-02-06  6:37 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-29  3:12 [PATCH 2/2] f2fs: support revoking atomic written pages Chao Yu
2015-12-29  3:12 ` Chao Yu
2015-12-30  0:05 ` Jaegeuk Kim
2015-12-30  0:05   ` Jaegeuk Kim
2015-12-30  1:34   ` Chao Yu
2015-12-30 15:35     ` [f2fs-dev] " Chao Yu
2015-12-30 19:43       ` Jaegeuk Kim
2015-12-30 19:41     ` Jaegeuk Kim
2015-12-31  9:16       ` Chao Yu
2016-01-01  3:50         ` Jaegeuk Kim
2016-01-01  3:50           ` Jaegeuk Kim
2016-01-01 12:13           ` [f2fs-dev] " Chao Yu
2016-01-01 12:13             ` Chao Yu
2016-01-08 12:05             ` [f2fs-dev] " Chao Yu
2016-01-08 19:43               ` Jaegeuk Kim
2016-01-08 19:43                 ` Jaegeuk Kim
2016-01-13  1:17                 ` [f2fs-dev] " Jaegeuk Kim
2016-01-13  5:05                   ` Chao Yu
2016-01-13  5:05                     ` Chao Yu
2016-01-15  0:03                     ` [f2fs-dev] " Jaegeuk Kim
2016-01-15  0:03                       ` Jaegeuk Kim
2016-02-01 10:04                       ` [f2fs-dev] " Chao Yu
2016-02-02  2:36                         ` Jaegeuk Kim
2016-02-02  2:36                           ` Jaegeuk Kim
2016-02-02 10:19                           ` [f2fs-dev] " Chao Yu
2016-02-06  4:17                             ` Jaegeuk Kim
2016-02-06  4:17                               ` Jaegeuk Kim
2016-02-06  6:36                               ` [f2fs-dev] " Chao Yu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.