From: mfo@canonical.com
To: tytso@mit.edu
Cc: dann.frazier@canonical.com, linux-ext4@vger.kernel.org,
mauricio.foliveira@gmail.com
Subject: [RFC 1/1] ext4: set page writeback on journalled writepage
Date: Sat, 21 Dec 2019 17:26:30 -0300 [thread overview]
Message-ID: <20191221202630.30718-2-mfo@canonical.com> (raw)
In-Reply-To: <20191221202630.30718-1-mfo@canonical.com>
From: Mauricio Faria de Oliveira <mfo@canonical.com>
Work in progress / request for comments to fix issue with
journal consistency errors on unclean shutdown with mmap()
on ext4 data=journal,journal_checksum mode.
Reference:
https://lore.kernel.org/linux-ext4/20190830012236.GC10779@mit.edu/
---
fs/ext4/ext4_jbd2.h | 11 ++++++
fs/ext4/inline.c | 13 +++++++
fs/ext4/inode.c | 82 ++++++++++++++++++++++++++++++++++++++++++---
3 files changed, 102 insertions(+), 4 deletions(-)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a6b9b66dbfad..8b51ca8231b7 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -144,6 +144,17 @@ struct ext4_journal_cb_entry {
/* user data goes here */
};
+/**
+ * struct ext4_journal_cb_entry_simple - Simple structure for callback information.
+ *
+ * This struct is a 'simple' structure on top of the base/seed structure,
+ * which adds a private data pointer to be used by the callback function.
+ */
+struct ext4_journal_cb_entry_simple {
+ struct ext4_journal_cb_entry jce;
+ void *jce_private;
+};
+
/**
* ext4_journal_callback_add: add a function to call after transaction commit
* @handle: active journal transaction handle to register callback on
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 2fec62d764fa..a168fe497d5d 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -565,6 +565,9 @@ static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
+ /* XXX(mfo): deadlock with journal? */
+ if (ext4_should_journal_data(inode))
+ wait_on_page_writeback(page);
ext4_write_lock_xattr(inode, &no_expand);
sem_held = 1;
@@ -693,6 +696,9 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
ret = -ENOMEM;
goto out;
}
+ /* XXX(mfo): deadlock with journal? */
+ if (ext4_should_journal_data(inode))
+ wait_on_page_writeback(page);
*pagep = page;
down_read(&EXT4_I(inode)->xattr_sem);
@@ -808,6 +814,10 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping,
page = grab_cache_page_write_begin(mapping, 0, flags);
if (!page)
return -ENOMEM;
+ /* XXX(mfo): should not deadlock with journal;
+ * this is only called after ext4_journal_stop(). */
+ if (ext4_should_journal_data(inode))
+ wait_on_page_writeback(page);
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
@@ -911,6 +921,9 @@ int ext4_da_write_inline_data_begin(struct address_space *mapping,
ret = -ENOMEM;
goto out_journal;
}
+ /* XXX(mfo): deadlock with journal? */
+ if (ext4_should_journal_data(inode))
+ wait_on_page_writeback(page);
down_read(&EXT4_I(inode)->xattr_sem);
if (!ext4_has_inline_data(inode)) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 28f28de0c1b6..ca31db5f81f0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -139,7 +139,7 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
static void ext4_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
-static int __ext4_journalled_writepage(struct page *page, unsigned int len);
+static int __ext4_journalled_writepage(struct page *page, unsigned int len, bool sync);
static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int pextents);
@@ -1155,6 +1155,15 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
unlock_page(page);
retry_journal:
+
+ /* XXX(mfo): deadlock with journal: fix attempt.
+ * does just wait_on_page_writeback() need (un)lock_page() ? */
+ if (ext4_should_journal_data(inode)) {
+ lock_page(page);
+ wait_on_page_writeback(page);
+ unlock_page(page);
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
if (IS_ERR(handle)) {
put_page(page);
@@ -1170,6 +1179,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
goto retry_grab;
}
/* In case writeback began while the page was unlocked */
+ /* XXX(mfo): this may call wait_on_page_writeback() and deadlock. */
wait_for_stable_page(page);
#ifdef CONFIG_FS_ENCRYPTION
@@ -1841,8 +1851,21 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
return 0;
}
+static void __ext4_jce_finish_page_writeback(struct super_block *sb,
+ struct ext4_journal_cb_entry *jce,
+ int error)
+{
+ struct ext4_journal_cb_entry_simple *jce_simple =
+ (struct ext4_journal_cb_entry_simple *) jce;
+ struct page *page = (struct page *) jce_simple->jce_private;
+
+ end_page_writeback(page);
+ kfree(jce);
+}
+
static int __ext4_journalled_writepage(struct page *page,
- unsigned int len)
+ unsigned int len,
+ bool sync)
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
@@ -1851,6 +1874,7 @@ static int __ext4_journalled_writepage(struct page *page,
int ret = 0, err = 0;
int inline_data = ext4_has_inline_data(inode);
struct buffer_head *inode_bh = NULL;
+ struct ext4_journal_cb_entry_simple *jce = NULL;
ClearPageChecked(page);
@@ -1875,13 +1899,25 @@ static int __ext4_journalled_writepage(struct page *page,
* out from under us.
*/
get_page(page);
+ /* XXX(mfo): do NOT set_page_writeback() here; as the next lock_page()
+ * could deadlock with write_cache_pages() (since it calls lock_page()
+ * and then wait_on_page_writeback()).
+ */
+ //set_page_writeback(page);
unlock_page(page);
+ if (!sync) {
+ jce = kzalloc(sizeof(*jce), GFP_NOFS);
+ if (!jce) {
+ ret = -ENOMEM;
+ goto out_no_pagelock;
+ }
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
ext4_writepage_trans_blocks(inode));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
- put_page(page);
goto out_no_pagelock;
}
BUG_ON(!ext4_handle_valid(handle));
@@ -1906,7 +1942,25 @@ static int __ext4_journalled_writepage(struct page *page,
}
if (ret == 0)
ret = err;
+
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+
+ /* XXX(mfo): only call set_page_writeback() and add callback to end_page_writeback()
+ * on journal commit on the *a*sync case; otherwise msync() blocks until periodic
+ * journal commit happens, waiting on page writeback.
+ * msync() -> ext4_sync_file() -> __filemap_fdatawait_range() -> wait_on_page_writeback()
+ *
+ * Confirm wether the *sync* case does *not* need ext4_handle_sync() ?
+ * as msync() -> ext4_sync_file() already calls ext4_force_commit() for data=journal.
+ */
+ if (!sync) {
+ /* Add journal callback entry to finish page writeback and free itself */
+ set_page_writeback(page);
+ jce->jce_private = page;
+ ext4_journal_callback_add(handle, __ext4_jce_finish_page_writeback,
+ (struct ext4_journal_cb_entry *) jce);
+ }
+
err = ext4_journal_stop(handle);
if (!ret)
ret = err;
@@ -1918,6 +1972,15 @@ static int __ext4_journalled_writepage(struct page *page,
out:
unlock_page(page);
out_no_pagelock:
+ /*
+ * Error leg for handle not yet initialized (jce allocation error)
+ * or failed to. Either way kfree(jce) is ok (it's NULL or valid.)
+ */
+ if (IS_ERR_OR_NULL(handle)) {
+ put_page(page);
+ kfree(jce);
+ }
+
brelse(inode_bh);
return ret;
}
@@ -2029,7 +2092,8 @@ static int ext4_writepage(struct page *page,
* It's mmapped pagecache. Add buffers and journal it. There
* doesn't seem much point in redirtying the page here.
*/
- return __ext4_journalled_writepage(page, len);
+ return __ext4_journalled_writepage(page, len,
+ (wbc->sync_mode == WB_SYNC_ALL));
ext4_io_submit_init(&io_submit, wbc);
io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
@@ -2974,6 +3038,14 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
* of file which has an already mapped buffer.
*/
retry_journal:
+
+ /* XXX(mfo): see comment in non-da ext4_write_begin(). */
+ if (ext4_should_journal_data(inode)) {
+ lock_page(page);
+ wait_on_page_writeback(page);
+ unlock_page(page);
+ }
+
handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
ext4_da_write_credits(inode, pos, len));
if (IS_ERR(handle)) {
@@ -5929,6 +6001,8 @@ vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf)
ext4_bh_unmapped)) {
/* Wait so that we don't change page under IO */
wait_for_stable_page(page);
+ if (ext4_should_journal_data(inode))
+ wait_on_page_writeback(page);
ret = VM_FAULT_LOCKED;
goto out;
}
--
2.17.1
next prev parent reply other threads:[~2019-12-21 20:27 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CALdTtnuRqgZ=By1JQ0yJJYczUPxxYCWPkAey4BjBkmj77q7aaA@mail.gmail.com>
[not found] ` <5FEB4E1B-B21B-418D-801D-81FF7C6C069F@dilger.ca>
[not found] ` <20190829225348.GA13045@xps13.dannf>
2019-08-30 1:22 ` ext4 fsck vs. kernel recovery policy Theodore Y. Ts'o
2019-09-04 14:58 ` dann frazier
2019-12-21 20:26 ` mfo
2019-12-21 20:26 ` mfo [this message]
2020-01-27 11:34 ` Mauricio Faria de Oliveira
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20191221202630.30718-2-mfo@canonical.com \
--to=mfo@canonical.com \
--cc=dann.frazier@canonical.com \
--cc=linux-ext4@vger.kernel.org \
--cc=mauricio.foliveira@gmail.com \
--cc=tytso@mit.edu \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).