All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers
@ 2012-09-04 17:36 Dmitry Monakhov
  2012-09-04 17:36 ` [PATCH 2/4] ext4: serialize unlocked dio reads with truncate Dmitry Monakhov
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Dmitry Monakhov @ 2012-09-04 17:36 UTC (permalink / raw)
  To: linux-ext4; +Cc: jack, Dmitry Monakhov

Inode's block defrag and ext4_change_inode_journal_flag() may
affect nonlocked DIO reads result, so proper synchronization
required.

- add missed inode_dio_wait() calls where appropriate
- recheck ext4_should_dioread_nolock under extra i_dio_count reference.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/ext4.h        |    2 ++
 fs/ext4/ext4_jbd2.h   |    2 ++
 fs/ext4/indirect.c    |   12 ++++++++++++
 fs/ext4/inode.c       |    5 +++++
 fs/ext4/move_extent.c |   10 ++++++++++
 5 files changed, 31 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f9024a6..36e8b84 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1349,6 +1349,8 @@ enum {
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 	EXT4_STATE_NEWENTRY,		/* File just added to dir */
 	EXT4_STATE_DELALLOC_RESERVED,	/* blks already reserved for delalloc */
+	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
+					   nolocking */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 56d258c..318d177 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -399,6 +399,8 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
 		return 0;
 	if (ext4_should_journal_data(inode))
 		return 0;
+	if (ext4_test_inode_state(inode, EXT4_STATE_DIOREAD_LOCK))
+		return 0;
 	return 1;
 }
 
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 830e1b2..ba40309 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -812,10 +812,22 @@ retry:
 			ext4_flush_completed_IO(inode);
 			mutex_unlock(&inode->i_mutex);
 		}
+		/*
+		 * Inode's locking behaviour may change due to number
+		 * of reasons, in order to be shure that nolock dioreads
+		 * is still allowed we have to recheck inode's flags
+		 * while i_dio_count > 0
+		 */
+		atomic_inc(&inode->i_dio_count);
+		if (!unlikely(ext4_should_dioread_nolock(inode))) {
+			inode_dio_done(inode);
+			goto retry;
+		}
 		ret = __blockdev_direct_IO(rw, iocb, inode,
 				 inode->i_sb->s_bdev, iov,
 				 offset, nr_segs,
 				 ext4_get_block, NULL, NULL, 0);
+		inode_dio_done(inode);
 	} else {
 		ret = blockdev_direct_IO(rw, iocb, inode, iov,
 				 offset, nr_segs, ext4_get_block);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d12d30e..58ef61a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4741,6 +4741,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 			return err;
 	}
 
+	/* Wait for all existing dio workers */
+	ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+	inode_dio_wait(inode);
+
 	jbd2_journal_lock_updates(journal);
 
 	/*
@@ -4760,6 +4764,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 	ext4_set_aops(inode);
 
 	jbd2_journal_unlock_updates(journal);
+	ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
 
 	/* Finally we can mark the inode as dirty. */
 
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index c5826c6..a6a4278 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -1213,6 +1213,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
 	if (ret1 < 0)
 		return ret1;
+	/* Protect inodes against DIO workers
+	 * - Disable dio nonlock reads, so all new dio workers will block
+	 *   on i_mutex.
+	 * - wait for existing DIO in flight */
+	ext4_set_inode_state(orig_inode, EXT4_STATE_DIOREAD_LOCK);
+	ext4_set_inode_state(donor_inode, EXT4_STATE_DIOREAD_LOCK);
+	inode_dio_wait(orig_inode);
+	inode_dio_wait(donor_inode);
 
 	/* Protect extent tree against block allocations via delalloc */
 	double_down_write_data_sem(orig_inode, donor_inode);
@@ -1412,6 +1420,8 @@ out:
 		kfree(holecheck_path);
 	}
 	double_up_write_data_sem(orig_inode, donor_inode);
+	ext4_clear_inode_state(orig_inode, EXT4_STATE_DIOREAD_LOCK);
+	ext4_clear_inode_state(donor_inode, EXT4_STATE_DIOREAD_LOCK);
 	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
 
 	if (ret1)
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 2/4] ext4: serialize unlocked dio reads with truncate
  2012-09-04 17:36 [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers Dmitry Monakhov
@ 2012-09-04 17:36 ` Dmitry Monakhov
  2012-09-04 17:36 ` [PATCH 3/4] ext4: endless truncate due to nonlocked dio readers V2 Dmitry Monakhov
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 11+ messages in thread
From: Dmitry Monakhov @ 2012-09-04 17:36 UTC (permalink / raw)
  To: linux-ext4; +Cc: jack, Dmitry Monakhov

Current serialization will works only for DIO which holds
i_mutex, but nonlocked DIO following race is possible:

dio_nolock_read_task            truncate_task
				->ext4_setattr()
				 ->inode_dio_wait()
->ext4_ext_direct_IO
  ->ext4_ind_direct_IO
    ->__blockdev_direct_IO
      ->ext4_get_block
				 ->truncate_setsize()
				 ->ext4_truncate()
				 #alloc truncated blocks
				 #to other inode
      ->submit_io()
     #INFORMATION LEAK

In order to serialize with unlocked DIO reads we have to
rearrange wait sequence
1) update i_size first
2) wait for outstanding DIO requests
3) and only after that truncate inode blocks

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/inode.c |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 58ef61a..709ec5a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4304,7 +4304,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		inode_dio_wait(inode);
 
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4355,6 +4354,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	if (attr->ia_valid & ATTR_SIZE) {
 		if (attr->ia_size != i_size_read(inode))
 			truncate_setsize(inode, attr->ia_size);
+		inode_dio_wait(inode);
 		ext4_truncate(inode);
 	}
 
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 3/4] ext4: endless truncate due to nonlocked dio readers V2
  2012-09-04 17:36 [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers Dmitry Monakhov
  2012-09-04 17:36 ` [PATCH 2/4] ext4: serialize unlocked dio reads with truncate Dmitry Monakhov
@ 2012-09-04 17:36 ` Dmitry Monakhov
  2012-09-04 17:36 ` [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers Dmitry Monakhov
  2012-09-05 15:47 ` [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers Jan Kara
  3 siblings, 0 replies; 11+ messages in thread
From: Dmitry Monakhov @ 2012-09-04 17:36 UTC (permalink / raw)
  To: linux-ext4; +Cc: jack, Dmitry Monakhov

If we have enough aggressive DIO readers, truncate will wait
forever inside inode_dio_wait(). It is reasonable to disable
nonlock DIO read optimization during truncate.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/inode.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 709ec5a..5a75908 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4354,7 +4354,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 	if (attr->ia_valid & ATTR_SIZE) {
 		if (attr->ia_size != i_size_read(inode))
 			truncate_setsize(inode, attr->ia_size);
+		ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
 		inode_dio_wait(inode);
+		ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
 		ext4_truncate(inode);
 	}
 
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers
  2012-09-04 17:36 [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers Dmitry Monakhov
  2012-09-04 17:36 ` [PATCH 2/4] ext4: serialize unlocked dio reads with truncate Dmitry Monakhov
  2012-09-04 17:36 ` [PATCH 3/4] ext4: endless truncate due to nonlocked dio readers V2 Dmitry Monakhov
@ 2012-09-04 17:36 ` Dmitry Monakhov
  2012-09-05 15:49   ` Jan Kara
  2012-09-05 15:47 ` [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers Jan Kara
  3 siblings, 1 reply; 11+ messages in thread
From: Dmitry Monakhov @ 2012-09-04 17:36 UTC (permalink / raw)
  To: linux-ext4; +Cc: jack, Dmitry Monakhov

Jan Kara have spotted interesting issue:
There are  potential data corruption issue with  direct IO overwrites
racing with truncate:
 Like:
  dio write                      truncate_task
  ->ext4_ext_direct_IO
   ->overwrite == 1
    ->down_read(&EXT4_I(inode)->i_data_sem);
    ->mutex_unlock(&inode->i_mutex);
                               ->ext4_setattr()
                                ->inode_dio_wait()
                                ->truncate_setsize()
                                ->ext4_truncate()
                                 ->down_write(&EXT4_I(inode)->i_data_sem);
    ->__blockdev_direct_IO
     ->ext4_get_block
     ->submit_io()
    ->up_read(&EXT4_I(inode)->i_data_sem);
                                 # truncate data blocks, allocate them to
                                 # other inode - bad stuff happens because
                                 # dio is still in flight.

In order to serialize with truncate dio worker should grab extra i_dio_count
reference before drop i_mutex.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/inode.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5a75908..9725acb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3035,6 +3035,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		overwrite = *((int *)iocb->private);
 
 		if (overwrite) {
+			atomic_inc(&inode->i_dio_count);
 			down_read(&EXT4_I(inode)->i_data_sem);
 			mutex_unlock(&inode->i_mutex);
 		}
@@ -3134,6 +3135,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		if (overwrite) {
 			up_read(&EXT4_I(inode)->i_data_sem);
 			mutex_lock(&inode->i_mutex);
+			inode_dio_done(inode);
 		}
 
 		return ret;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers
  2012-09-04 17:36 [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers Dmitry Monakhov
                   ` (2 preceding siblings ...)
  2012-09-04 17:36 ` [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers Dmitry Monakhov
@ 2012-09-05 15:47 ` Jan Kara
  2012-09-05 17:09   ` Dmitry Monakhov
  3 siblings, 1 reply; 11+ messages in thread
From: Jan Kara @ 2012-09-05 15:47 UTC (permalink / raw)
  To: Dmitry Monakhov; +Cc: linux-ext4, jack

On Tue 04-09-12 21:36:51, Dmitry Monakhov wrote:
> Inode's block defrag and ext4_change_inode_journal_flag() may
> affect nonlocked DIO reads result, so proper synchronization
> required.
> 
> - add missed inode_dio_wait() calls where appropriate
> - recheck ext4_should_dioread_nolock under extra i_dio_count reference.
> 
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
...
> diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> index 830e1b2..ba40309 100644
> --- a/fs/ext4/indirect.c
> +++ b/fs/ext4/indirect.c
> @@ -812,10 +812,22 @@ retry:
>  			ext4_flush_completed_IO(inode);
>  			mutex_unlock(&inode->i_mutex);
>  		}
> +		/*
> +		 * Inode's locking behaviour may change due to number
> +		 * of reasons, in order to be shure that nolock dioreads
					      ^^^ sure
> +		 * is still allowed we have to recheck inode's flags
> +		 * while i_dio_count > 0
> +		 */
> +		atomic_inc(&inode->i_dio_count);
> +		if (!unlikely(ext4_should_dioread_nolock(inode))) {
> +			inode_dio_done(inode);
> +			goto retry;
> +		}
  Umm, to make this reliable, you need a smp_mb() between atomic_inc() and
ext4_should_dioread_nolock(). Otherwise the i_state test could be reordered
before the atomic_inc()... Similarly you need the barriers around all other
pairs working with i_dio_count and i_state.

Frankly, I'm not very happy with this solution. It's getting ugly rather
quickly. IMHO dioread_nolock should be just ripped out and replaced with
proper range locking (if we want the scalability). But this is mostly Ted's
decision...

								Honza

>  		ret = __blockdev_direct_IO(rw, iocb, inode,
>  				 inode->i_sb->s_bdev, iov,
>  				 offset, nr_segs,
>  				 ext4_get_block, NULL, NULL, 0);
> +		inode_dio_done(inode);
>  	} else {
>  		ret = blockdev_direct_IO(rw, iocb, inode, iov,
>  				 offset, nr_segs, ext4_get_block);
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index d12d30e..58ef61a 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -4741,6 +4741,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  			return err;
>  	}
>  
> +	/* Wait for all existing dio workers */
> +	ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
> +	inode_dio_wait(inode);
> +
>  	jbd2_journal_lock_updates(journal);
>  
>  	/*
> @@ -4760,6 +4764,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
>  	ext4_set_aops(inode);
>  
>  	jbd2_journal_unlock_updates(journal);
> +	ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
>  
>  	/* Finally we can mark the inode as dirty. */
>  
> diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
> index c5826c6..a6a4278 100644
> --- a/fs/ext4/move_extent.c
> +++ b/fs/ext4/move_extent.c
> @@ -1213,6 +1213,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
>  	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
>  	if (ret1 < 0)
>  		return ret1;
> +	/* Protect inodes against DIO workers
> +	 * - Disable dio nonlock reads, so all new dio workers will block
> +	 *   on i_mutex.
> +	 * - wait for existing DIO in flight */
> +	ext4_set_inode_state(orig_inode, EXT4_STATE_DIOREAD_LOCK);
> +	ext4_set_inode_state(donor_inode, EXT4_STATE_DIOREAD_LOCK);
> +	inode_dio_wait(orig_inode);
> +	inode_dio_wait(donor_inode);
>  
>  	/* Protect extent tree against block allocations via delalloc */
>  	double_down_write_data_sem(orig_inode, donor_inode);
> @@ -1412,6 +1420,8 @@ out:
>  		kfree(holecheck_path);
>  	}
>  	double_up_write_data_sem(orig_inode, donor_inode);
> +	ext4_clear_inode_state(orig_inode, EXT4_STATE_DIOREAD_LOCK);
> +	ext4_clear_inode_state(donor_inode, EXT4_STATE_DIOREAD_LOCK);
>  	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
>  
>  	if (ret1)
> -- 
> 1.7.7.6
> 
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers
  2012-09-04 17:36 ` [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers Dmitry Monakhov
@ 2012-09-05 15:49   ` Jan Kara
  2012-09-05 16:59     ` Dmitry Monakhov
  0 siblings, 1 reply; 11+ messages in thread
From: Jan Kara @ 2012-09-05 15:49 UTC (permalink / raw)
  To: Dmitry Monakhov; +Cc: linux-ext4, jack

On Tue 04-09-12 21:36:54, Dmitry Monakhov wrote:
> Jan Kara have spotted interesting issue:
> There are  potential data corruption issue with  direct IO overwrites
> racing with truncate:
>  Like:
>   dio write                      truncate_task
>   ->ext4_ext_direct_IO
>    ->overwrite == 1
>     ->down_read(&EXT4_I(inode)->i_data_sem);
>     ->mutex_unlock(&inode->i_mutex);
>                                ->ext4_setattr()
>                                 ->inode_dio_wait()
>                                 ->truncate_setsize()
>                                 ->ext4_truncate()
>                                  ->down_write(&EXT4_I(inode)->i_data_sem);
>     ->__blockdev_direct_IO
>      ->ext4_get_block
>      ->submit_io()
>     ->up_read(&EXT4_I(inode)->i_data_sem);
>                                  # truncate data blocks, allocate them to
>                                  # other inode - bad stuff happens because
>                                  # dio is still in flight.
> 
> In order to serialize with truncate dio worker should grab extra i_dio_count
> reference before drop i_mutex.
  Thanks for the patch. You can add:
Reviewed-by: Jan Kara <jack@suse.cz>

								Honza
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> ---
>  fs/ext4/inode.c |    2 ++
>  1 files changed, 2 insertions(+), 0 deletions(-)
> 
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5a75908..9725acb 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -3035,6 +3035,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  		overwrite = *((int *)iocb->private);
>  
>  		if (overwrite) {
> +			atomic_inc(&inode->i_dio_count);
>  			down_read(&EXT4_I(inode)->i_data_sem);
>  			mutex_unlock(&inode->i_mutex);
>  		}
> @@ -3134,6 +3135,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
>  		if (overwrite) {
>  			up_read(&EXT4_I(inode)->i_data_sem);
>  			mutex_lock(&inode->i_mutex);
> +			inode_dio_done(inode);
>  		}
>  
>  		return ret;
> -- 
> 1.7.7.6
> 
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers
  2012-09-05 15:49   ` Jan Kara
@ 2012-09-05 16:59     ` Dmitry Monakhov
  2012-09-05 19:05       ` Jan Kara
  2012-09-06 10:07       ` Dmitry Monakhov
  0 siblings, 2 replies; 11+ messages in thread
From: Dmitry Monakhov @ 2012-09-05 16:59 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-ext4, jack

On Wed, 5 Sep 2012 17:49:20 +0200, Jan Kara <jack@suse.cz> wrote:
> On Tue 04-09-12 21:36:54, Dmitry Monakhov wrote:
> > Jan Kara have spotted interesting issue:
> > There are  potential data corruption issue with  direct IO overwrites
> > racing with truncate:
> >  Like:
> >   dio write                      truncate_task
> >   ->ext4_ext_direct_IO
> >    ->overwrite == 1
> >     ->down_read(&EXT4_I(inode)->i_data_sem);
> >     ->mutex_unlock(&inode->i_mutex);
> >                                ->ext4_setattr()
> >                                 ->inode_dio_wait()
> >                                 ->truncate_setsize()
> >                                 ->ext4_truncate()
> >                                  ->down_write(&EXT4_I(inode)->i_data_sem);
> >     ->__blockdev_direct_IO
> >      ->ext4_get_block
> >      ->submit_io()
> >     ->up_read(&EXT4_I(inode)->i_data_sem);
> >                                  # truncate data blocks, allocate them to
> >                                  # other inode - bad stuff happens because
> >                                  # dio is still in flight.
> > 
> > In order to serialize with truncate dio worker should grab extra i_dio_count
> > reference before drop i_mutex.
>   Thanks for the patch. You can add:
> Reviewed-by: Jan Kara <jack@suse.cz>
I'm Sorry, but unfortunately in two line patch i've done one mistake :( 
because inode_dio_done() should be before i_mutex will be retaken
otherwise following deadlock happen

ext4_setattr                       ext4_direct_io
                                   mutex_unlock
                                   atomic_inc(inode->i_dio_count)
  mutex_lock(i_mutex)
  inode_dio_wait(inode)  ->BLOCK
                        DEADLOCK<- mutex_lock(i_mutex)
                                   inode_dio_done()

So i'll add your review sing to updated version if you don't mind.
> 								Honza
> > Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> > ---
> >  fs/ext4/inode.c |    2 ++
> >  1 files changed, 2 insertions(+), 0 deletions(-)
> > 
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index 5a75908..9725acb 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -3035,6 +3035,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> >  		overwrite = *((int *)iocb->private);
> >  
> >  		if (overwrite) {
> > +			atomic_inc(&inode->i_dio_count);
> >  			down_read(&EXT4_I(inode)->i_data_sem);
> >  			mutex_unlock(&inode->i_mutex);
> >  		}
> > @@ -3134,6 +3135,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> >  		if (overwrite) {
> >  			up_read(&EXT4_I(inode)->i_data_sem);
> >  			mutex_lock(&inode->i_mutex);
> > +			inode_dio_done(inode);
> >  		}
> >  
> >  		return ret;
> > -- 
> > 1.7.7.6
> > 
> -- 
> Jan Kara <jack@suse.cz>
> SUSE Labs, CR
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers
  2012-09-05 15:47 ` [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers Jan Kara
@ 2012-09-05 17:09   ` Dmitry Monakhov
  0 siblings, 0 replies; 11+ messages in thread
From: Dmitry Monakhov @ 2012-09-05 17:09 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-ext4, jack

On Wed, 5 Sep 2012 17:47:19 +0200, Jan Kara <jack@suse.cz> wrote:
> On Tue 04-09-12 21:36:51, Dmitry Monakhov wrote:
> > Inode's block defrag and ext4_change_inode_journal_flag() may
> > affect nonlocked DIO reads result, so proper synchronization
> > required.
> > 
> > - add missed inode_dio_wait() calls where appropriate
> > - recheck ext4_should_dioread_nolock under extra i_dio_count reference.
> > 
> > Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> ...
> > diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
> > index 830e1b2..ba40309 100644
> > --- a/fs/ext4/indirect.c
> > +++ b/fs/ext4/indirect.c
> > @@ -812,10 +812,22 @@ retry:
> >  			ext4_flush_completed_IO(inode);
> >  			mutex_unlock(&inode->i_mutex);
> >  		}
> > +		/*
> > +		 * Inode's locking behaviour may change due to number
> > +		 * of reasons, in order to be shure that nolock dioreads
> 					      ^^^ sure
> > +		 * is still allowed we have to recheck inode's flags
> > +		 * while i_dio_count > 0
> > +		 */
> > +		atomic_inc(&inode->i_dio_count);
> > +		if (!unlikely(ext4_should_dioread_nolock(inode))) {
> > +			inode_dio_done(inode);
> > +			goto retry;
> > +		}
>   Umm, to make this reliable, you need a smp_mb() between atomic_inc() and
> ext4_should_dioread_nolock(). Otherwise the i_state test could be reordered
> before the atomic_inc()... Similarly you need the barriers around all other
> pairs working with i_dio_count and i_state.
Ohh, you right. It looks reasonable to grab down_read(i_data_sem) here.
> 
> Frankly, I'm not very happy with this solution. It's getting ugly rather
> quickly. IMHO dioread_nolock should be just ripped out and replaced with
> proper range locking (if we want the scalability). But this is mostly Ted's
> decision...
> 
> 								Honza
> 
> >  		ret = __blockdev_direct_IO(rw, iocb, inode,
> >  				 inode->i_sb->s_bdev, iov,
> >  				 offset, nr_segs,
> >  				 ext4_get_block, NULL, NULL, 0);
> > +		inode_dio_done(inode);
> >  	} else {
> >  		ret = blockdev_direct_IO(rw, iocb, inode, iov,
> >  				 offset, nr_segs, ext4_get_block);
> > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > index d12d30e..58ef61a 100644
> > --- a/fs/ext4/inode.c
> > +++ b/fs/ext4/inode.c
> > @@ -4741,6 +4741,10 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> >  			return err;
> >  	}
> >  
> > +	/* Wait for all existing dio workers */
> > +	ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
> > +	inode_dio_wait(inode);
> > +
> >  	jbd2_journal_lock_updates(journal);
> >  
> >  	/*
> > @@ -4760,6 +4764,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
> >  	ext4_set_aops(inode);
> >  
> >  	jbd2_journal_unlock_updates(journal);
> > +	ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
> >  
> >  	/* Finally we can mark the inode as dirty. */
> >  
> > diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
> > index c5826c6..a6a4278 100644
> > --- a/fs/ext4/move_extent.c
> > +++ b/fs/ext4/move_extent.c
> > @@ -1213,6 +1213,14 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
> >  	ret1 = mext_inode_double_lock(orig_inode, donor_inode);
> >  	if (ret1 < 0)
> >  		return ret1;
> > +	/* Protect inodes against DIO workers
> > +	 * - Disable dio nonlock reads, so all new dio workers will block
> > +	 *   on i_mutex.
> > +	 * - wait for existing DIO in flight */
> > +	ext4_set_inode_state(orig_inode, EXT4_STATE_DIOREAD_LOCK);
> > +	ext4_set_inode_state(donor_inode, EXT4_STATE_DIOREAD_LOCK);
> > +	inode_dio_wait(orig_inode);
> > +	inode_dio_wait(donor_inode);
> >  
> >  	/* Protect extent tree against block allocations via delalloc */
> >  	double_down_write_data_sem(orig_inode, donor_inode);
> > @@ -1412,6 +1420,8 @@ out:
> >  		kfree(holecheck_path);
> >  	}
> >  	double_up_write_data_sem(orig_inode, donor_inode);
> > +	ext4_clear_inode_state(orig_inode, EXT4_STATE_DIOREAD_LOCK);
> > +	ext4_clear_inode_state(donor_inode, EXT4_STATE_DIOREAD_LOCK);
> >  	ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
> >  
> >  	if (ret1)
> > -- 
> > 1.7.7.6
> > 
> -- 
> Jan Kara <jack@suse.cz>
> SUSE Labs, CR
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers
  2012-09-05 16:59     ` Dmitry Monakhov
@ 2012-09-05 19:05       ` Jan Kara
  2012-09-06 10:07       ` Dmitry Monakhov
  1 sibling, 0 replies; 11+ messages in thread
From: Jan Kara @ 2012-09-05 19:05 UTC (permalink / raw)
  To: Dmitry Monakhov; +Cc: Jan Kara, linux-ext4

On Wed 05-09-12 20:59:09, Dmitry Monakhov wrote:
> On Wed, 5 Sep 2012 17:49:20 +0200, Jan Kara <jack@suse.cz> wrote:
> > On Tue 04-09-12 21:36:54, Dmitry Monakhov wrote:
> > > Jan Kara have spotted interesting issue:
> > > There are  potential data corruption issue with  direct IO overwrites
> > > racing with truncate:
> > >  Like:
> > >   dio write                      truncate_task
> > >   ->ext4_ext_direct_IO
> > >    ->overwrite == 1
> > >     ->down_read(&EXT4_I(inode)->i_data_sem);
> > >     ->mutex_unlock(&inode->i_mutex);
> > >                                ->ext4_setattr()
> > >                                 ->inode_dio_wait()
> > >                                 ->truncate_setsize()
> > >                                 ->ext4_truncate()
> > >                                  ->down_write(&EXT4_I(inode)->i_data_sem);
> > >     ->__blockdev_direct_IO
> > >      ->ext4_get_block
> > >      ->submit_io()
> > >     ->up_read(&EXT4_I(inode)->i_data_sem);
> > >                                  # truncate data blocks, allocate them to
> > >                                  # other inode - bad stuff happens because
> > >                                  # dio is still in flight.
> > > 
> > > In order to serialize with truncate dio worker should grab extra i_dio_count
> > > reference before drop i_mutex.
> >   Thanks for the patch. You can add:
> > Reviewed-by: Jan Kara <jack@suse.cz>
> I'm Sorry, but unfortunately in two line patch i've done one mistake :( 
> because inode_dio_done() should be before i_mutex will be retaken
> otherwise following deadlock happen
> 
> ext4_setattr                       ext4_direct_io
>                                    mutex_unlock
>                                    atomic_inc(inode->i_dio_count)
>   mutex_lock(i_mutex)
>   inode_dio_wait(inode)  ->BLOCK
>                         DEADLOCK<- mutex_lock(i_mutex)
>                                    inode_dio_done()
> 
> So i'll add your review sing to updated version if you don't mind.
  Sure. I should have noticed that as well...

								Honza 

> > > Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> > > ---
> > >  fs/ext4/inode.c |    2 ++
> > >  1 files changed, 2 insertions(+), 0 deletions(-)
> > > 
> > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > index 5a75908..9725acb 100644
> > > --- a/fs/ext4/inode.c
> > > +++ b/fs/ext4/inode.c
> > > @@ -3035,6 +3035,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> > >  		overwrite = *((int *)iocb->private);
> > >  
> > >  		if (overwrite) {
> > > +			atomic_inc(&inode->i_dio_count);
> > >  			down_read(&EXT4_I(inode)->i_data_sem);
> > >  			mutex_unlock(&inode->i_mutex);
> > >  		}
> > > @@ -3134,6 +3135,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> > >  		if (overwrite) {
> > >  			up_read(&EXT4_I(inode)->i_data_sem);
> > >  			mutex_lock(&inode->i_mutex);
> > > +			inode_dio_done(inode);
> > >  		}
> > >  
> > >  		return ret;
> > > -- 
> > > 1.7.7.6
> > > 
> > -- 
> > Jan Kara <jack@suse.cz>
> > SUSE Labs, CR
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers
  2012-09-05 16:59     ` Dmitry Monakhov
  2012-09-05 19:05       ` Jan Kara
@ 2012-09-06 10:07       ` Dmitry Monakhov
  1 sibling, 0 replies; 11+ messages in thread
From: Dmitry Monakhov @ 2012-09-06 10:07 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-ext4, jack

On Wed, 05 Sep 2012 20:59:09 +0400, Dmitry Monakhov <dmonakhov@openvz.org> wrote:
> On Wed, 5 Sep 2012 17:49:20 +0200, Jan Kara <jack@suse.cz> wrote:
> > On Tue 04-09-12 21:36:54, Dmitry Monakhov wrote:
> > > Jan Kara have spotted interesting issue:
> > > There are  potential data corruption issue with  direct IO overwrites
> > > racing with truncate:
> > >  Like:
> > >   dio write                      truncate_task
> > >   ->ext4_ext_direct_IO
> > >    ->overwrite == 1
> > >     ->down_read(&EXT4_I(inode)->i_data_sem);
> > >     ->mutex_unlock(&inode->i_mutex);
> > >                                ->ext4_setattr()
> > >                                 ->inode_dio_wait()
> > >                                 ->truncate_setsize()
> > >                                 ->ext4_truncate()
> > >                                  ->down_write(&EXT4_I(inode)->i_data_sem);
> > >     ->__blockdev_direct_IO
> > >      ->ext4_get_block
> > >      ->submit_io()
> > >     ->up_read(&EXT4_I(inode)->i_data_sem);
> > >                                  # truncate data blocks, allocate them to
> > >                                  # other inode - bad stuff happens because
> > >                                  # dio is still in flight.
> > > 
> > > In order to serialize with truncate dio worker should grab extra i_dio_count
> > > reference before drop i_mutex.
> >   Thanks for the patch. You can add:
> > Reviewed-by: Jan Kara <jack@suse.cz>
> I'm Sorry, but unfortunately in two line patch i've done one mistake :( 
> because inode_dio_done() should be before i_mutex will be retaken
> otherwise following deadlock happen
> 
> ext4_setattr                       ext4_direct_io
>                                    mutex_unlock
>                                    atomic_inc(inode->i_dio_count)
>   mutex_lock(i_mutex)
>   inode_dio_wait(inode)  ->BLOCK
>                         DEADLOCK<- mutex_lock(i_mutex)
>                                    inode_dio_done()
Yeah... This is not just my fault :)
Similar deadlock already exist but happen due to end_io_work

truncate:                          kworker:
ext4_setattr                       ext4_end_io_work
                                    

   mutex_lock(i_mutex)
   inode_dio_wait(inode)  ->BLOCK
                         DEADLOCK<- mutex_trylock()
                                    inode_dio_done()
#TEST_CASE
MNT=/mnt_scrach
unlink $MNT/file
fallocate -l $((1024*1024*1024)) $MNT/file
aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file &
sleep 3
truncate -s 0 $MNT/file
#TEST_CASE_END


> 
> So i'll add your review sing to updated version if you don't mind.
> > 								Honza
> > > Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> > > ---
> > >  fs/ext4/inode.c |    2 ++
> > >  1 files changed, 2 insertions(+), 0 deletions(-)
> > > 
> > > diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> > > index 5a75908..9725acb 100644
> > > --- a/fs/ext4/inode.c
> > > +++ b/fs/ext4/inode.c
> > > @@ -3035,6 +3035,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> > >  		overwrite = *((int *)iocb->private);
> > >  
> > >  		if (overwrite) {
> > > +			atomic_inc(&inode->i_dio_count);
> > >  			down_read(&EXT4_I(inode)->i_data_sem);
> > >  			mutex_unlock(&inode->i_mutex);
> > >  		}
> > > @@ -3134,6 +3135,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
> > >  		if (overwrite) {
> > >  			up_read(&EXT4_I(inode)->i_data_sem);
> > >  			mutex_lock(&inode->i_mutex);
> > > +			inode_dio_done(inode);
> > >  		}
> > >  
> > >  		return ret;
> > > -- 
> > > 1.7.7.6
> > > 
> > -- 
> > Jan Kara <jack@suse.cz>
> > SUSE Labs, CR
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers
  2012-09-04 17:22 Dmitry Monakhov
@ 2012-09-04 17:22 ` Dmitry Monakhov
  0 siblings, 0 replies; 11+ messages in thread
From: Dmitry Monakhov @ 2012-09-04 17:22 UTC (permalink / raw)
  To: linux-ext4; +Cc: jack, Dmitry Monakhov

Jan Kara have spotted interesting issue:
There are potential data corruption issue with direct IO overwrites
racing with truncate:
 Like:
  dio write                      truncate_task
  ->ext4_ext_direct_IO
   ->overwrite == 1
    ->down_read(&EXT4_I(inode)->i_data_sem);
    ->mutex_unlock(&inode->i_mutex);
                               ->ext4_setattr()
                                ->inode_dio_wait()
                                ->truncate_setsize()
                                ->ext4_truncate()
                                 ->down_write(&EXT4_I(inode)->i_data_sem);
    ->__blockdev_direct_IO
     ->ext4_get_block
     ->submit_io()
    ->up_read(&EXT4_I(inode)->i_data_sem);
                                 # truncate data blocks, allocate them to
                                 # other inode - bad stuff happens because
                                 # dio is still in flight.

In order to serialize with truncate dio worker should grab extra i_dio_count
reference before drop i_mutex.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/inode.c |    2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e696f19..46606e3 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3035,6 +3035,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		overwrite = *((int *)iocb->private);
 
 		if (overwrite) {
+			atomic_inc(&inode->i_dio_count);
 			down_read(&EXT4_I(inode)->i_data_sem);
 			mutex_unlock(&inode->i_mutex);
 		}
@@ -3134,6 +3135,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
 		if (overwrite) {
 			up_read(&EXT4_I(inode)->i_data_sem);
 			mutex_lock(&inode->i_mutex);
+			inode_dio_done(inode);
 		}
 
 		return ret;
-- 
1.7.7.6


^ permalink raw reply related	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2012-09-06 10:07 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-04 17:36 [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers Dmitry Monakhov
2012-09-04 17:36 ` [PATCH 2/4] ext4: serialize unlocked dio reads with truncate Dmitry Monakhov
2012-09-04 17:36 ` [PATCH 3/4] ext4: endless truncate due to nonlocked dio readers V2 Dmitry Monakhov
2012-09-04 17:36 ` [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers Dmitry Monakhov
2012-09-05 15:49   ` Jan Kara
2012-09-05 16:59     ` Dmitry Monakhov
2012-09-05 19:05       ` Jan Kara
2012-09-06 10:07       ` Dmitry Monakhov
2012-09-05 15:47 ` [PATCH 1/4] ext4: serialize dio nonlocked reads with defrag workers Jan Kara
2012-09-05 17:09   ` Dmitry Monakhov
  -- strict thread matches above, loose matches on Subject: below --
2012-09-04 17:22 Dmitry Monakhov
2012-09-04 17:22 ` [PATCH 4/4] ext4: serialize truncate with owerwrite DIO workers Dmitry Monakhov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.