All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 1/2] ext4: add a version of convert_inline_data_nolock() for regular files
@ 2017-06-06  0:03 Theodore Ts'o
  2017-06-06  0:03 ` [RFC PATCH 2/2] ext4: fix up ext4_try_to_write_inline_data() Theodore Ts'o
  2017-06-07  3:47 ` [RFC PATCH 1/2] ext4: add a version of convert_inline_data_nolock() for regular files Eric Biggers
  0 siblings, 2 replies; 4+ messages in thread
From: Theodore Ts'o @ 2017-06-06  0:03 UTC (permalink / raw)
  To: Ext4 Developers List; +Cc: gnehzuil.liu, Theodore Ts'o

The current version of ext4_convert_inline_data_nolock() should not be
used for regular files, since it does the conversion via using jbd2,
and this if we are not using data journalling, this is unsafe.  If the
data block is updated after the inode is converted from using inline
data to using a data block, when the journal is replayed, the new
version of the data block can get overwritten by the old version of
the data block.

The ext4_convert_inline_data_nolock() also doesn't handle races with
Direct I/O correctly.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inline.c | 110 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 107 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 8d141c0c8ff9..c9e3a262f27f 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1259,6 +1259,79 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
 }
 
 /*
+ * Only used for regular files, not directories!
+ *
+ * The inode and page must be locked when this function is called.
+ */
+static int reg_convert_inline_data_nolock(handle_t *handle,
+					  struct inode *inode,
+					  struct page *page,
+					  struct ext4_iloc *iloc)
+{
+	int error;
+	struct buffer_head *data_bh = NULL;
+	int inline_size = ext4_get_inline_size(inode);
+
+	if (!(inode->i_state & (I_NEW|I_FREEING)))
+		WARN_ON(!inode_is_locked(inode));
+
+	/* If some one has already done this for us, just exit. */
+	if (!ext4_has_inline_data(inode))
+		return 0;
+
+	if (!PageUptodate(page)) {
+		error = ext4_read_inline_page(inode, page);
+		if (error < 0)
+			return error;
+	}
+
+	/* Avoid races with DIO */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	error = ext4_destroy_inline_data_nolock(handle, inode);
+	if (error)
+		return error;
+
+	error = __block_write_begin(page, 0, inline_size, ext4_get_block);
+	if (error)
+		goto out_restore;
+
+	data_bh = page_buffers(page);
+	BUG_ON(data_bh == NULL);
+
+	if (ext4_should_journal_data(inode)) {
+		lock_buffer(data_bh);
+		error = ext4_journal_get_create_access(handle, data_bh);
+		if (error) {
+			unlock_buffer(data_bh);
+			error = -EIO;
+			goto out_restore;
+		}
+		error = ext4_handle_dirty_metadata(handle, inode, data_bh);
+		unlock_buffer(data_bh);
+	} else {
+		__set_page_dirty_buffers(page);
+		if (ext4_should_order_data(inode))
+			error = ext4_jbd2_inode_add_write(handle, inode);
+	}
+
+out_restore:
+	if (error) {
+		void *kaddr = kmap_atomic(page);
+
+		if (data_bh)
+			ext4_free_blocks(handle, inode, data_bh, 0, 1, 0);
+		ext4_create_inline_data(handle, inode, inline_size);
+		ext4_write_inline_data(inode, iloc, kaddr, 0, inline_size);
+		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+		kunmap_atomic(kaddr);
+	}
+	ext4_inode_resume_unlocked_dio(inode);
+	return error;
+}
+
+/*
  * Try to add the new entry to the inline data.
  * If succeeds, return 0. If not, extended the inline dir and copied data to
  * the new created block.
@@ -1898,7 +1971,21 @@ int ext4_try_to_evict_inline_data(handle_t *handle,
 		goto out;
 	}
 
-	error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+	if (S_ISREG(inode->i_mode)) {
+		struct page *page;
+
+		page = grab_cache_page_write_begin(inode->i_mapping, 0,
+						   AOP_FLAG_NOFS);
+		if (!page) {
+			error = -ENOMEM;
+			goto out;
+		}
+		error = reg_convert_inline_data_nolock(handle, inode,
+						       page, &iloc);
+		unlock_page(page);
+		put_page(page);
+	} else
+		error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
 out:
 	brelse(iloc.bh);
 	return error;
@@ -2007,6 +2094,7 @@ int ext4_convert_inline_data(struct inode *inode)
 	int error, needed_blocks, no_expand;
 	handle_t *handle;
 	struct ext4_iloc iloc;
+	struct page *page = NULL;
 
 	if (!ext4_has_inline_data(inode)) {
 		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
@@ -2020,6 +2108,12 @@ int ext4_convert_inline_data(struct inode *inode)
 	if (error)
 		return error;
 
+	if (S_ISREG(inode->i_mode)) {
+		page = grab_cache_page_write_begin(inode->i_mapping, 0, 0);
+		if (!page)
+			return -ENOMEM;
+	}
+
 	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
 		error = PTR_ERR(handle);
@@ -2027,11 +2121,21 @@ int ext4_convert_inline_data(struct inode *inode)
 	}
 
 	ext4_write_lock_xattr(inode, &no_expand);
-	if (ext4_has_inline_data(inode))
-		error = ext4_convert_inline_data_nolock(handle, inode, &iloc);
+	if (ext4_has_inline_data(inode)) {
+		if (page)
+			error = reg_convert_inline_data_nolock(handle,
+							inode, page, &iloc);
+		else
+			error = ext4_convert_inline_data_nolock(handle, inode,
+								&iloc);
+	}
 	ext4_write_unlock_xattr(inode, &no_expand);
 	ext4_journal_stop(handle);
 out_free:
+	if (page) {
+		unlock_page(page);
+		put_page(page);
+	}
 	brelse(iloc.bh);
 	return error;
 }
-- 
2.11.0.rc0.7.gbe5a750

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [RFC PATCH 2/2] ext4: fix up ext4_try_to_write_inline_data()
  2017-06-06  0:03 [RFC PATCH 1/2] ext4: add a version of convert_inline_data_nolock() for regular files Theodore Ts'o
@ 2017-06-06  0:03 ` Theodore Ts'o
  2017-06-07  3:54   ` Eric Biggers
  2017-06-07  3:47 ` [RFC PATCH 1/2] ext4: add a version of convert_inline_data_nolock() for regular files Eric Biggers
  1 sibling, 1 reply; 4+ messages in thread
From: Theodore Ts'o @ 2017-06-06  0:03 UTC (permalink / raw)
  To: Ext4 Developers List; +Cc: gnehzuil.liu, Theodore Ts'o

There were a number of bugs in ext4_try_to_write_inline_data() and the
ext4_convert_inline_data_to_extent() function (which was only used by
ext4_try_to_write_inline_data).

For ext4_convert_inline_data_to_extent():

* It didn't handle the dioread_nolock case correctly
  * It didn't convert the extent tree entry from unwritten to written.
  * It didn't correctly handle racing DIO reads
* It didn't handle data=journal case correctly -- it doesn't follow
  the block modification correctly by failing to call
  ext4_handle_dirty_metadata() on the data block.

We fix this by eliminating ext4_convert_inline_data_to_extent()
completely, and use reg_convert_inline_data_nolock() since it has been
fixed to be Completely Correct (tm).  :-)

In ext4_try_to_write_inline_data() we need to request write access for
the inode table block before returning a return code of 1, since since
in that case ext4_write_begin() immediately returns and the jbd2 layer
needs to e informed that we might be modifying the inode.

Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/ext4/inline.c | 183 +++++++++++++++----------------------------------------
 1 file changed, 49 insertions(+), 134 deletions(-)

diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index c9e3a262f27f..a2773da52de2 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -24,6 +24,11 @@
 #define EXT4_INLINE_DOTDOT_OFFSET	2
 #define EXT4_INLINE_DOTDOT_SIZE		4
 
+static int reg_convert_inline_data_nolock(handle_t *handle,
+					  struct inode *inode,
+					  struct page *page,
+					  struct ext4_iloc *iloc);
+
 static int ext4_get_inline_size(struct inode *inode)
 {
 	if (EXT4_I(inode)->i_inline_off)
@@ -531,120 +536,6 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
 	return ret >= 0 ? 0 : ret;
 }
 
-static int ext4_convert_inline_data_to_extent(struct address_space *mapping,
-					      struct inode *inode,
-					      unsigned flags)
-{
-	int ret, needed_blocks, no_expand;
-	handle_t *handle = NULL;
-	int retries = 0, sem_held = 0;
-	struct page *page = NULL;
-	unsigned from, to;
-	struct ext4_iloc iloc;
-
-	if (!ext4_has_inline_data(inode)) {
-		/*
-		 * clear the flag so that no new write
-		 * will trap here again.
-		 */
-		ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
-		return 0;
-	}
-
-	needed_blocks = ext4_writepage_trans_blocks(inode);
-
-	ret = ext4_get_inode_loc(inode, &iloc);
-	if (ret)
-		return ret;
-
-retry:
-	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		handle = NULL;
-		goto out;
-	}
-
-	/* We cannot recurse into the filesystem as the transaction is already
-	 * started */
-	flags |= AOP_FLAG_NOFS;
-
-	page = grab_cache_page_write_begin(mapping, 0, flags);
-	if (!page) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ext4_write_lock_xattr(inode, &no_expand);
-	sem_held = 1;
-	/* If some one has already done this for us, just exit. */
-	if (!ext4_has_inline_data(inode)) {
-		ret = 0;
-		goto out;
-	}
-
-	from = 0;
-	to = ext4_get_inline_size(inode);
-	if (!PageUptodate(page)) {
-		ret = ext4_read_inline_page(inode, page);
-		if (ret < 0)
-			goto out;
-	}
-
-	ret = ext4_destroy_inline_data_nolock(handle, inode);
-	if (ret)
-		goto out;
-
-	if (ext4_should_dioread_nolock(inode)) {
-		ret = __block_write_begin(page, from, to,
-					  ext4_get_block_unwritten);
-	} else
-		ret = __block_write_begin(page, from, to, ext4_get_block);
-
-	if (!ret && ext4_should_journal_data(inode)) {
-		ret = ext4_walk_page_buffers(handle, page_buffers(page),
-					     from, to, NULL,
-					     do_journal_get_write_access);
-	}
-
-	if (ret) {
-		unlock_page(page);
-		put_page(page);
-		page = NULL;
-		ext4_orphan_add(handle, inode);
-		ext4_write_unlock_xattr(inode, &no_expand);
-		sem_held = 0;
-		ext4_journal_stop(handle);
-		handle = NULL;
-		ext4_truncate_failed_write(inode);
-		/*
-		 * If truncate failed early the inode might
-		 * still be on the orphan list; we need to
-		 * make sure the inode is removed from the
-		 * orphan list in that case.
-		 */
-		if (inode->i_nlink)
-			ext4_orphan_del(NULL, inode);
-	}
-
-	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-		goto retry;
-
-	if (page)
-		block_commit_write(page, from, to);
-out:
-	if (page) {
-		unlock_page(page);
-		put_page(page);
-	}
-	if (sem_held)
-		ext4_write_unlock_xattr(inode, &no_expand);
-	if (handle)
-		ext4_journal_stop(handle);
-	brelse(iloc.bh);
-	return ret;
-}
-
 /*
  * Try to write data in the inode.
  * If the inode has inline data, check whether the new write can be
@@ -662,13 +553,19 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
 	struct page *page;
 	struct ext4_iloc iloc;
 
-	if (pos + len > ext4_get_max_inline_size(inode))
-		goto convert;
-
 	ret = ext4_get_inode_loc(inode, &iloc);
 	if (ret)
 		return ret;
 
+	page = grab_cache_page_write_begin(mapping, 0, flags);
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (pos + len > ext4_get_max_inline_size(inode))
+		goto convert;
+
 	/*
 	 * The possible write could happen in the inode,
 	 * so try to reserve the space in inode first.
@@ -686,25 +583,38 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
 
 	/* We don't have space in inline inode, so convert it to extent. */
 	if (ret == -ENOSPC) {
-		ext4_journal_stop(handle);
-		brelse(iloc.bh);
-		goto convert;
-	}
+		int no_expand;
 
-	flags |= AOP_FLAG_NOFS;
+		ext4_journal_stop(handle);
+		handle = NULL;
+	convert:
+		if (!ext4_has_inline_data(inode)) {
+			/*
+			 * clear the flag so that no new write
+			 * will trap here again.
+			 */
+			ext4_clear_inode_state(inode,
+					       EXT4_STATE_MAY_INLINE_DATA);
+			ret = 0;
+			goto out;
+		}
+		handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
+					    ext4_writepage_trans_blocks(inode));
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
 
-	page = grab_cache_page_write_begin(mapping, 0, flags);
-	if (!page) {
-		ret = -ENOMEM;
+		ext4_write_lock_xattr(inode, &no_expand);
+		ret = reg_convert_inline_data_nolock(handle, inode,
+						     page, &iloc);
+		ext4_write_unlock_xattr(inode, &no_expand);
 		goto out;
 	}
 
-	*pagep = page;
 	down_read(&EXT4_I(inode)->xattr_sem);
 	if (!ext4_has_inline_data(inode)) {
 		ret = 0;
-		unlock_page(page);
-		put_page(page);
 		goto out_up_read;
 	}
 
@@ -713,19 +623,24 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
 		if (ret < 0)
 			goto out_up_read;
 	}
-
-	ret = 1;
-	handle = NULL;
+	*pagep = page;
+	page = NULL;
+	ret = ext4_journal_get_write_access(handle, iloc.bh);
+	if (ret == 0) {
+		ret = 1;
+		handle = NULL;
+	}
 out_up_read:
 	up_read(&EXT4_I(inode)->xattr_sem);
 out:
 	if (handle)
 		ext4_journal_stop(handle);
+	if (page) {
+		unlock_page(page);
+		put_page(page);
+	}
 	brelse(iloc.bh);
 	return ret;
-convert:
-	return ext4_convert_inline_data_to_extent(mapping,
-						  inode, flags);
 }
 
 int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
-- 
2.11.0.rc0.7.gbe5a750

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [RFC PATCH 1/2] ext4: add a version of convert_inline_data_nolock() for regular files
  2017-06-06  0:03 [RFC PATCH 1/2] ext4: add a version of convert_inline_data_nolock() for regular files Theodore Ts'o
  2017-06-06  0:03 ` [RFC PATCH 2/2] ext4: fix up ext4_try_to_write_inline_data() Theodore Ts'o
@ 2017-06-07  3:47 ` Eric Biggers
  1 sibling, 0 replies; 4+ messages in thread
From: Eric Biggers @ 2017-06-07  3:47 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: Ext4 Developers List, gnehzuil.liu

Hi Ted,

On Mon, Jun 05, 2017 at 08:03:58PM -0400, Theodore Ts'o wrote:
> The current version of ext4_convert_inline_data_nolock() should not be
> used for regular files, since it does the conversion via using jbd2,
> and this if we are not using data journalling, this is unsafe.  If the
> data block is updated after the inode is converted from using inline
> data to using a data block, when the journal is replayed, the new
> version of the data block can get overwritten by the old version of
> the data block.

Thanks for trying to fix this mess!  I'm not sure I know enough to fully review
it, but here are some things I'm questioning...

> 
> The ext4_convert_inline_data_nolock() also doesn't handle races with
> Direct I/O correctly.
> 

Are you sure direct I/O is an issue?   ext4_direct_IO() doesn't support files
with inline data; it returns 0 if 'ext4_has_inline_data(inode)'.

>  /*
> + * Only used for regular files, not directories!
> + *
> + * The inode and page must be locked when this function is called.
> + */
> +static int reg_convert_inline_data_nolock(handle_t *handle,
> +					  struct inode *inode,
> +					  struct page *page,
> +					  struct ext4_iloc *iloc)
> +{
> +	int error;
> +	struct buffer_head *data_bh = NULL;
> +	int inline_size = ext4_get_inline_size(inode);
> +
> +	if (!(inode->i_state & (I_NEW|I_FREEING)))
> +		WARN_ON(!inode_is_locked(inode));

I don't think the inode lock is guaranteed... ext4_convert_inline_data() is
called from ext4_fallocate() and ext4_page_mkwrite() without out.  It seems the
xattr_sem will always be taken for write, though.

> +
> +	/* If some one has already done this for us, just exit. */
> +	if (!ext4_has_inline_data(inode))
> +		return 0;
> +
> +	if (!PageUptodate(page)) {
> +		error = ext4_read_inline_page(inode, page);
> +		if (error < 0)
> +			return error;
> +	}
> +
> +	/* Avoid races with DIO */
> +	ext4_inode_block_unlocked_dio(inode);

What is ext4_inode_block_unlocked_dio() even supposed to do?  It sets
EXT4_STATE_DIOREAD_LOCK but nothing seems to check it.


> +	inode_dio_wait(inode);
> +
> +	error = ext4_destroy_inline_data_nolock(handle, inode);
> +	if (error)
> +		return error;
> +
> +	error = __block_write_begin(page, 0, inline_size, ext4_get_block);
> +	if (error)
> +		goto out_restore;
> +
> +	data_bh = page_buffers(page);
> +	BUG_ON(data_bh == NULL);
> +
> +	if (ext4_should_journal_data(inode)) {
> +		lock_buffer(data_bh);
> +		error = ext4_journal_get_create_access(handle, data_bh);
> +		if (error) {
> +			unlock_buffer(data_bh);
> +			error = -EIO;
> +			goto out_restore;
> +		}
> +		error = ext4_handle_dirty_metadata(handle, inode, data_bh);
> +		unlock_buffer(data_bh);
> +	} else {
> +		__set_page_dirty_buffers(page);
> +		if (ext4_should_order_data(inode))
> +			error = ext4_jbd2_inode_add_write(handle, inode);
> +	}
> +
> +out_restore:
> +	if (error) {
> +		void *kaddr = kmap_atomic(page);

kmap(), not kmap_atomic(), since the stuff in between can block.

>  
> +	if (S_ISREG(inode->i_mode)) {
> +		page = grab_cache_page_write_begin(inode->i_mapping, 0, 0);
> +		if (!page)
> +			return -ENOMEM;
> +	}
> +
>  	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
>  	if (IS_ERR(handle)) {
>  		error = PTR_ERR(handle);
> @@ -2027,11 +2121,21 @@ int ext4_convert_inline_data(struct inode *inode)
>  	}

Doesn't the page lock rank below transaction start?

Eric

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [RFC PATCH 2/2] ext4: fix up ext4_try_to_write_inline_data()
  2017-06-06  0:03 ` [RFC PATCH 2/2] ext4: fix up ext4_try_to_write_inline_data() Theodore Ts'o
@ 2017-06-07  3:54   ` Eric Biggers
  0 siblings, 0 replies; 4+ messages in thread
From: Eric Biggers @ 2017-06-07  3:54 UTC (permalink / raw)
  To: Theodore Ts'o; +Cc: Ext4 Developers List, gnehzuil.liu

On Mon, Jun 05, 2017 at 08:03:59PM -0400, Theodore Ts'o wrote:
> There were a number of bugs in ext4_try_to_write_inline_data() and the
> ext4_convert_inline_data_to_extent() function (which was only used by
> ext4_try_to_write_inline_data).
> 
> For ext4_convert_inline_data_to_extent():
> 
> * It didn't handle the dioread_nolock case correctly
>   * It didn't convert the extent tree entry from unwritten to written.
>   * It didn't correctly handle racing DIO reads
> * It didn't handle data=journal case correctly -- it doesn't follow
>   the block modification correctly by failing to call
>   ext4_handle_dirty_metadata() on the data block.
> 
> We fix this by eliminating ext4_convert_inline_data_to_extent()
> completely, and use reg_convert_inline_data_nolock() since it has been
> fixed to be Completely Correct (tm).  :-)
> 

Is ext4_da_convert_inline_data_to_extent() broken too?

>  /*
>   * Try to write data in the inode.
>   * If the inode has inline data, check whether the new write can be
> @@ -662,13 +553,19 @@ int ext4_try_to_write_inline_data(struct address_space *mapping,
>  	struct page *page;
>  	struct ext4_iloc iloc;
>  
> -	if (pos + len > ext4_get_max_inline_size(inode))
> -		goto convert;
> -
>  	ret = ext4_get_inode_loc(inode, &iloc);
>  	if (ret)
>  		return ret;
>  
> +	page = grab_cache_page_write_begin(mapping, 0, flags);
> +	if (!page) {
> +		ret = -ENOMEM;
> +		goto out;
> +	}
> +

Likewise, doesn't the page lock rank below transaction start?  Also this jumps
to 'out' which looks at 'handle' before it's been initialized.

Eric

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2017-06-07  3:54 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-06  0:03 [RFC PATCH 1/2] ext4: add a version of convert_inline_data_nolock() for regular files Theodore Ts'o
2017-06-06  0:03 ` [RFC PATCH 2/2] ext4: fix up ext4_try_to_write_inline_data() Theodore Ts'o
2017-06-07  3:54   ` Eric Biggers
2017-06-07  3:47 ` [RFC PATCH 1/2] ext4: add a version of convert_inline_data_nolock() for regular files Eric Biggers

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.