All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH v2] ext4: dio take shared inode lock when overwriting preallocated blocks
@ 2022-12-26  6:20 Zhang Yi
  2023-01-02 14:09 ` Jan Kara
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Zhang Yi @ 2022-12-26  6:20 UTC (permalink / raw)
  To: linux-ext4; +Cc: tytso, adilger.kernel, jack, yi.zhang, yi.zhang, yukuai3

From: Zhang Yi <yi.zhang@huawei.com>

In the dio write path, we only take shared inode lock for the case of
aligned overwriting initialized blocks inside EOF. But for overwriting
preallocated blocks, it may only need to split unwritten extents, this
procedure has been protected under i_data_sem lock, it's safe to
release the exclusive inode lock and take shared inode lock.

This could give a significant speed up for multi-threaded writes. Test
on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.

 direct=1
 ioengine=libaio
 iodepth=10
 numjobs=10
 runtime=60
 rw=randwrite
 size=100G

And the test result are:
Before:
 bs=4k       IOPS=11.1k, BW=43.2MiB/s
 bs=16k      IOPS=11.1k, BW=173MiB/s
 bs=64k      IOPS=11.2k, BW=697MiB/s

After:
 bs=4k       IOPS=41.4k, BW=162MiB/s
 bs=16k      IOPS=41.3k, BW=646MiB/s
 bs=64k      IOPS=13.5k, BW=843MiB/s

Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
---
v2->v1:
 - Negate the 'inited' related arguments to 'unwritten'.

 fs/ext4/file.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index a7a597c727e6..21abe95a0ee7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
 	return false;
 }
 
-/* Is IO overwriting allocated and initialized blocks? */
-static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
+/* Is IO overwriting allocated or initialized blocks? */
+static bool ext4_overwrite_io(struct inode *inode,
+			      loff_t pos, loff_t len, bool *unwritten)
 {
 	struct ext4_map_blocks map;
 	unsigned int blkbits = inode->i_blkbits;
@@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
 	blklen = map.m_len;
 
 	err = ext4_map_blocks(NULL, inode, &map, 0);
+	if (err != blklen)
+		return false;
 	/*
 	 * 'err==len' means that all of the blocks have been preallocated,
-	 * regardless of whether they have been initialized or not. To exclude
-	 * unwritten extents, we need to check m_flags.
+	 * regardless of whether they have been initialized or not. We need to
+	 * check m_flags to distinguish the unwritten extents.
 	 */
-	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
+	*unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
+	return true;
 }
 
 static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
@@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
  * - For extending writes case we don't take the shared lock, since it requires
  *   updating inode i_disksize and/or orphan handling with exclusive lock.
  *
- * - shared locking will only be true mostly with overwrites. Otherwise we will
- *   switch to exclusive i_rwsem lock.
+ * - shared locking will only be true mostly with overwrites, including
+ *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
+ *   we protect splitting extents by i_data_sem in ext4_inode_info, so we can
+ *   also release exclusive i_rwsem lock.
+ *
+ * - Otherwise we will switch to exclusive i_rwsem lock.
  */
 static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
-				     bool *ilock_shared, bool *extend)
+				     bool *ilock_shared, bool *extend,
+				     bool *unwritten)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
@@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
 	 * in file_modified().
 	 */
 	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
-	     !ext4_overwrite_io(inode, offset, count))) {
+	     !ext4_overwrite_io(inode, offset, count, unwritten))) {
 		if (iocb->ki_flags & IOCB_NOWAIT) {
 			ret = -EAGAIN;
 			goto out;
@@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	loff_t offset = iocb->ki_pos;
 	size_t count = iov_iter_count(from);
 	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
-	bool extend = false, unaligned_io = false;
+	bool extend = false, unaligned_io = false, unwritten = false;
 	bool ilock_shared = true;
 
 	/*
@@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		return ext4_buffered_write_iter(iocb, from);
 	}
 
-	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
+	ret = ext4_dio_write_checks(iocb, from,
+				    &ilock_shared, &extend, &unwritten);
 	if (ret <= 0)
 		return ret;
 
@@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		ext4_journal_stop(handle);
 	}
 
-	if (ilock_shared)
+	if (ilock_shared && !unwritten)
 		iomap_ops = &ext4_iomap_overwrite_ops;
 	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
 			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
-- 
2.31.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH v2] ext4: dio take shared inode lock when overwriting preallocated blocks
  2022-12-26  6:20 [RFC PATCH v2] ext4: dio take shared inode lock when overwriting preallocated blocks Zhang Yi
@ 2023-01-02 14:09 ` Jan Kara
  2023-02-16 17:41 ` Ritesh Harjani
  2023-02-19  5:40 ` Theodore Ts'o
  2 siblings, 0 replies; 6+ messages in thread
From: Jan Kara @ 2023-01-02 14:09 UTC (permalink / raw)
  To: Zhang Yi; +Cc: linux-ext4, tytso, adilger.kernel, jack, yi.zhang, yukuai3

On Mon 26-12-22 14:20:15, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> In the dio write path, we only take shared inode lock for the case of
> aligned overwriting initialized blocks inside EOF. But for overwriting
> preallocated blocks, it may only need to split unwritten extents, this
> procedure has been protected under i_data_sem lock, it's safe to
> release the exclusive inode lock and take shared inode lock.
> 
> This could give a significant speed up for multi-threaded writes. Test
> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
> 
>  direct=1
>  ioengine=libaio
>  iodepth=10
>  numjobs=10
>  runtime=60
>  rw=randwrite
>  size=100G
> 
> And the test result are:
> Before:
>  bs=4k       IOPS=11.1k, BW=43.2MiB/s
>  bs=16k      IOPS=11.1k, BW=173MiB/s
>  bs=64k      IOPS=11.2k, BW=697MiB/s
> 
> After:
>  bs=4k       IOPS=41.4k, BW=162MiB/s
>  bs=16k      IOPS=41.3k, BW=646MiB/s
>  bs=64k      IOPS=13.5k, BW=843MiB/s
> 
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>

Looks good to me. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
> v2->v1:
>  - Negate the 'inited' related arguments to 'unwritten'.
> 
>  fs/ext4/file.c | 34 ++++++++++++++++++++++------------
>  1 file changed, 22 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index a7a597c727e6..21abe95a0ee7 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
>  	return false;
>  }
>  
> -/* Is IO overwriting allocated and initialized blocks? */
> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
> +/* Is IO overwriting allocated or initialized blocks? */
> +static bool ext4_overwrite_io(struct inode *inode,
> +			      loff_t pos, loff_t len, bool *unwritten)
>  {
>  	struct ext4_map_blocks map;
>  	unsigned int blkbits = inode->i_blkbits;
> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>  	blklen = map.m_len;
>  
>  	err = ext4_map_blocks(NULL, inode, &map, 0);
> +	if (err != blklen)
> +		return false;
>  	/*
>  	 * 'err==len' means that all of the blocks have been preallocated,
> -	 * regardless of whether they have been initialized or not. To exclude
> -	 * unwritten extents, we need to check m_flags.
> +	 * regardless of whether they have been initialized or not. We need to
> +	 * check m_flags to distinguish the unwritten extents.
>  	 */
> -	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
> +	*unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
> +	return true;
>  }
>  
>  static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
>   * - For extending writes case we don't take the shared lock, since it requires
>   *   updating inode i_disksize and/or orphan handling with exclusive lock.
>   *
> - * - shared locking will only be true mostly with overwrites. Otherwise we will
> - *   switch to exclusive i_rwsem lock.
> + * - shared locking will only be true mostly with overwrites, including
> + *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
> + *   we protect splitting extents by i_data_sem in ext4_inode_info, so we can
> + *   also release exclusive i_rwsem lock.
> + *
> + * - Otherwise we will switch to exclusive i_rwsem lock.
>   */
>  static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
> -				     bool *ilock_shared, bool *extend)
> +				     bool *ilock_shared, bool *extend,
> +				     bool *unwritten)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file_inode(file);
> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>  	 * in file_modified().
>  	 */
>  	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
> -	     !ext4_overwrite_io(inode, offset, count))) {
> +	     !ext4_overwrite_io(inode, offset, count, unwritten))) {
>  		if (iocb->ki_flags & IOCB_NOWAIT) {
>  			ret = -EAGAIN;
>  			goto out;
> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	loff_t offset = iocb->ki_pos;
>  	size_t count = iov_iter_count(from);
>  	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
> -	bool extend = false, unaligned_io = false;
> +	bool extend = false, unaligned_io = false, unwritten = false;
>  	bool ilock_shared = true;
>  
>  	/*
> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		return ext4_buffered_write_iter(iocb, from);
>  	}
>  
> -	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
> +	ret = ext4_dio_write_checks(iocb, from,
> +				    &ilock_shared, &extend, &unwritten);
>  	if (ret <= 0)
>  		return ret;
>  
> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		ext4_journal_stop(handle);
>  	}
>  
> -	if (ilock_shared)
> +	if (ilock_shared && !unwritten)
>  		iomap_ops = &ext4_iomap_overwrite_ops;
>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>  			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
> -- 
> 2.31.1
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH v2] ext4: dio take shared inode lock when overwriting preallocated blocks
  2022-12-26  6:20 [RFC PATCH v2] ext4: dio take shared inode lock when overwriting preallocated blocks Zhang Yi
  2023-01-02 14:09 ` Jan Kara
@ 2023-02-16 17:41 ` Ritesh Harjani
  2023-02-17 13:31   ` Zhang Yi
  2023-02-19  5:40 ` Theodore Ts'o
  2 siblings, 1 reply; 6+ messages in thread
From: Ritesh Harjani @ 2023-02-16 17:41 UTC (permalink / raw)
  To: Zhang Yi, linux-ext4
  Cc: tytso, adilger.kernel, jack, yi.zhang, yi.zhang, yukuai3, Brian Foster

Zhang Yi <yi.zhang@huaweicloud.com> writes:

> From: Zhang Yi <yi.zhang@huawei.com>
>
> In the dio write path, we only take shared inode lock for the case of
> aligned overwriting initialized blocks inside EOF. But for overwriting
> preallocated blocks, it may only need to split unwritten extents, this
> procedure has been protected under i_data_sem lock, it's safe to
> release the exclusive inode lock and take shared inode lock.

Ok. One question though. Should we be passing IOMAP_DIO_FORCE_WAIT
(in this case as well) which will wait for the completion of dio
request even if the submitted IO is not synchronous. Like how it's being
done for unaligned overwrites case [1].
What I am mostly curious to know about is, how do we take care of
unwritten
to written conversion without racing which can happen in a
seperate workqueue context and/or are there any zeroing of extents
involved in this scenario which can race with one another?

So, I think in case of a non-aligned write it make sense [1] because it
might involve zeroing of the partial blocks. But in this case as you
said this already happens within i_data_sem lock context, so it won't be
necessary. I still thought it will be worth while to confirm it's indeed
the case or not.

[1]:
https://lore.kernel.org/linux-ext4/20230210145954.277611-1-bfoster@redhat.com/

Oh, one of the patch might run into some patch conflict depending upon
which one gets picked first...

-ritesh


>
> This could give a significant speed up for multi-threaded writes. Test
> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
>
>  direct=1
>  ioengine=libaio
>  iodepth=10
>  numjobs=10
>  runtime=60
>  rw=randwrite
>  size=100G
>
> And the test result are:
> Before:
>  bs=4k       IOPS=11.1k, BW=43.2MiB/s
>  bs=16k      IOPS=11.1k, BW=173MiB/s
>  bs=64k      IOPS=11.2k, BW=697MiB/s
>
> After:
>  bs=4k       IOPS=41.4k, BW=162MiB/s
>  bs=16k      IOPS=41.3k, BW=646MiB/s
>  bs=64k      IOPS=13.5k, BW=843MiB/s
>
> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
> ---
> v2->v1:
>  - Negate the 'inited' related arguments to 'unwritten'.
>
>  fs/ext4/file.c | 34 ++++++++++++++++++++++------------
>  1 file changed, 22 insertions(+), 12 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index a7a597c727e6..21abe95a0ee7 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
>  	return false;
>  }
>
> -/* Is IO overwriting allocated and initialized blocks? */
> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
> +/* Is IO overwriting allocated or initialized blocks? */
> +static bool ext4_overwrite_io(struct inode *inode,
> +			      loff_t pos, loff_t len, bool *unwritten)
>  {
>  	struct ext4_map_blocks map;
>  	unsigned int blkbits = inode->i_blkbits;
> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>  	blklen = map.m_len;
>
>  	err = ext4_map_blocks(NULL, inode, &map, 0);
> +	if (err != blklen)
> +		return false;
>  	/*
>  	 * 'err==len' means that all of the blocks have been preallocated,
> -	 * regardless of whether they have been initialized or not. To exclude
> -	 * unwritten extents, we need to check m_flags.
> +	 * regardless of whether they have been initialized or not. We need to
> +	 * check m_flags to distinguish the unwritten extents.
>  	 */
> -	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
> +	*unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
> +	return true;
>  }
>
>  static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
>   * - For extending writes case we don't take the shared lock, since it requires
>   *   updating inode i_disksize and/or orphan handling with exclusive lock.
>   *
> - * - shared locking will only be true mostly with overwrites. Otherwise we will
> - *   switch to exclusive i_rwsem lock.
> + * - shared locking will only be true mostly with overwrites, including
> + *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
> + *   we protect splitting extents by i_data_sem in ext4_inode_info, so we can
> + *   also release exclusive i_rwsem lock.
> + *
> + * - Otherwise we will switch to exclusive i_rwsem lock.
>   */
>  static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
> -				     bool *ilock_shared, bool *extend)
> +				     bool *ilock_shared, bool *extend,
> +				     bool *unwritten)
>  {
>  	struct file *file = iocb->ki_filp;
>  	struct inode *inode = file_inode(file);
> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>  	 * in file_modified().
>  	 */
>  	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
> -	     !ext4_overwrite_io(inode, offset, count))) {
> +	     !ext4_overwrite_io(inode, offset, count, unwritten))) {
>  		if (iocb->ki_flags & IOCB_NOWAIT) {
>  			ret = -EAGAIN;
>  			goto out;
> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  	loff_t offset = iocb->ki_pos;
>  	size_t count = iov_iter_count(from);
>  	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
> -	bool extend = false, unaligned_io = false;
> +	bool extend = false, unaligned_io = false, unwritten = false;
>  	bool ilock_shared = true;
>
>  	/*
> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		return ext4_buffered_write_iter(iocb, from);
>  	}
>
> -	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
> +	ret = ext4_dio_write_checks(iocb, from,
> +				    &ilock_shared, &extend, &unwritten);
>  	if (ret <= 0)
>  		return ret;
>
> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>  		ext4_journal_stop(handle);
>  	}
>
> -	if (ilock_shared)
> +	if (ilock_shared && !unwritten)
>  		iomap_ops = &ext4_iomap_overwrite_ops;
>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>  			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
> --
> 2.31.1

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH v2] ext4: dio take shared inode lock when overwriting preallocated blocks
  2023-02-16 17:41 ` Ritesh Harjani
@ 2023-02-17 13:31   ` Zhang Yi
  2023-02-18  8:13     ` Ritesh Harjani
  0 siblings, 1 reply; 6+ messages in thread
From: Zhang Yi @ 2023-02-17 13:31 UTC (permalink / raw)
  To: Ritesh Harjani (IBM), linux-ext4
  Cc: tytso, adilger.kernel, jack, yi.zhang, yukuai3, Brian Foster

Hello.

On 2023/2/17 1:41, Ritesh Harjani (IBM) wrote:
> Zhang Yi <yi.zhang@huaweicloud.com> writes:
> 
>> From: Zhang Yi <yi.zhang@huawei.com>
>>
>> In the dio write path, we only take shared inode lock for the case of
>> aligned overwriting initialized blocks inside EOF. But for overwriting
>> preallocated blocks, it may only need to split unwritten extents, this
>> procedure has been protected under i_data_sem lock, it's safe to
>> release the exclusive inode lock and take shared inode lock.
> 
> Ok. One question though. Should we be passing IOMAP_DIO_FORCE_WAIT
> (in this case as well) which will wait for the completion of dio
> request even if the submitted IO is not synchronous. Like how it's being
> done for unaligned overwrites case [1].
> What I am mostly curious to know about is, how do we take care of
> unwritten
> to written conversion without racing which can happen in a
> seperate workqueue context and/or are there any zeroing of extents
> involved in this scenario which can race with one another?
> 
> So, I think in case of a non-aligned write it make sense [1] because it
> might involve zeroing of the partial blocks. But in this case as you
> said this already happens within i_data_sem lock context, so it won't be
> necessary. I still thought it will be worth while to confirm it's indeed
> the case or not.
> 

I'm not quite get your question, do you mean passing IOMAP_DIO_FORCE_WAIT
for the case of unaligned writing to pre-allocated(unwritten) blocks?
IIUC, That's how it's done now if you only merge my patch. And it should be
cautious to slove the conflict if you also want to merge [1] together.

After looking at [1], I think it should be:

           |  pure overwrite       |  write to pre-allocated |
-------------------------------------------------------------|
aligned    | share lock, nowait (1)| share lock, nowait  (3) |
unaligned  | share lock, nowait (2)| excl lock, wait     (4) |

In case(3), each AIO-DIO's unwritten->written conversion do not disturb each
other because of the i_data_sem lock, and the potential zeroing extents(e.g.
ext4_zero_range()) also call inode_dio_wait() to wait DIO complete. So I don't
find any race problem.

Am I missing something? or which case you want to confirm?

Thanks,
Yi.

> [1]:
> https://lore.kernel.org/linux-ext4/20230210145954.277611-1-bfoster@redhat.com/
> 
> Oh, one of the patch might run into some patch conflict depending upon
> which one gets picked first...
> 
> -ritesh
> 
> 
>>
>> This could give a significant speed up for multi-threaded writes. Test
>> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
>>
>>  direct=1
>>  ioengine=libaio
>>  iodepth=10
>>  numjobs=10
>>  runtime=60
>>  rw=randwrite
>>  size=100G
>>
>> And the test result are:
>> Before:
>>  bs=4k       IOPS=11.1k, BW=43.2MiB/s
>>  bs=16k      IOPS=11.1k, BW=173MiB/s
>>  bs=64k      IOPS=11.2k, BW=697MiB/s
>>
>> After:
>>  bs=4k       IOPS=41.4k, BW=162MiB/s
>>  bs=16k      IOPS=41.3k, BW=646MiB/s
>>  bs=64k      IOPS=13.5k, BW=843MiB/s
>>
>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>> ---
>> v2->v1:
>>  - Negate the 'inited' related arguments to 'unwritten'.
>>
>>  fs/ext4/file.c | 34 ++++++++++++++++++++++------------
>>  1 file changed, 22 insertions(+), 12 deletions(-)
>>
>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>> index a7a597c727e6..21abe95a0ee7 100644
>> --- a/fs/ext4/file.c
>> +++ b/fs/ext4/file.c
>> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
>>  	return false;
>>  }
>>
>> -/* Is IO overwriting allocated and initialized blocks? */
>> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>> +/* Is IO overwriting allocated or initialized blocks? */
>> +static bool ext4_overwrite_io(struct inode *inode,
>> +			      loff_t pos, loff_t len, bool *unwritten)
>>  {
>>  	struct ext4_map_blocks map;
>>  	unsigned int blkbits = inode->i_blkbits;
>> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>>  	blklen = map.m_len;
>>
>>  	err = ext4_map_blocks(NULL, inode, &map, 0);
>> +	if (err != blklen)
>> +		return false;
>>  	/*
>>  	 * 'err==len' means that all of the blocks have been preallocated,
>> -	 * regardless of whether they have been initialized or not. To exclude
>> -	 * unwritten extents, we need to check m_flags.
>> +	 * regardless of whether they have been initialized or not. We need to
>> +	 * check m_flags to distinguish the unwritten extents.
>>  	 */
>> -	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
>> +	*unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
>> +	return true;
>>  }
>>
>>  static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
>> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
>>   * - For extending writes case we don't take the shared lock, since it requires
>>   *   updating inode i_disksize and/or orphan handling with exclusive lock.
>>   *
>> - * - shared locking will only be true mostly with overwrites. Otherwise we will
>> - *   switch to exclusive i_rwsem lock.
>> + * - shared locking will only be true mostly with overwrites, including
>> + *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
>> + *   we protect splitting extents by i_data_sem in ext4_inode_info, so we can
>> + *   also release exclusive i_rwsem lock.
>> + *
>> + * - Otherwise we will switch to exclusive i_rwsem lock.
>>   */
>>  static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>> -				     bool *ilock_shared, bool *extend)
>> +				     bool *ilock_shared, bool *extend,
>> +				     bool *unwritten)
>>  {
>>  	struct file *file = iocb->ki_filp;
>>  	struct inode *inode = file_inode(file);
>> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>>  	 * in file_modified().
>>  	 */
>>  	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
>> -	     !ext4_overwrite_io(inode, offset, count))) {
>> +	     !ext4_overwrite_io(inode, offset, count, unwritten))) {
>>  		if (iocb->ki_flags & IOCB_NOWAIT) {
>>  			ret = -EAGAIN;
>>  			goto out;
>> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  	loff_t offset = iocb->ki_pos;
>>  	size_t count = iov_iter_count(from);
>>  	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
>> -	bool extend = false, unaligned_io = false;
>> +	bool extend = false, unaligned_io = false, unwritten = false;
>>  	bool ilock_shared = true;
>>
>>  	/*
>> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  		return ext4_buffered_write_iter(iocb, from);
>>  	}
>>
>> -	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
>> +	ret = ext4_dio_write_checks(iocb, from,
>> +				    &ilock_shared, &extend, &unwritten);
>>  	if (ret <= 0)
>>  		return ret;
>>
>> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>  		ext4_journal_stop(handle);
>>  	}
>>
>> -	if (ilock_shared)
>> +	if (ilock_shared && !unwritten)
>>  		iomap_ops = &ext4_iomap_overwrite_ops;
>>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>>  			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
>> --
>> 2.31.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH v2] ext4: dio take shared inode lock when overwriting preallocated blocks
  2023-02-17 13:31   ` Zhang Yi
@ 2023-02-18  8:13     ` Ritesh Harjani
  0 siblings, 0 replies; 6+ messages in thread
From: Ritesh Harjani @ 2023-02-18  8:13 UTC (permalink / raw)
  To: Zhang Yi, linux-ext4
  Cc: tytso, adilger.kernel, jack, yi.zhang, yukuai3, Brian Foster

Zhang Yi <yi.zhang@huaweicloud.com> writes:

> Hello.
>
> On 2023/2/17 1:41, Ritesh Harjani (IBM) wrote:
>> Zhang Yi <yi.zhang@huaweicloud.com> writes:
>>
>>> From: Zhang Yi <yi.zhang@huawei.com>
>>>
>>> In the dio write path, we only take shared inode lock for the case of
>>> aligned overwriting initialized blocks inside EOF. But for overwriting
>>> preallocated blocks, it may only need to split unwritten extents, this
>>> procedure has been protected under i_data_sem lock, it's safe to
>>> release the exclusive inode lock and take shared inode lock.
>>
>> Ok. One question though. Should we be passing IOMAP_DIO_FORCE_WAIT
>> (in this case as well) which will wait for the completion of dio
>> request even if the submitted IO is not synchronous. Like how it's being
>> done for unaligned overwrites case [1].
>> What I am mostly curious to know about is, how do we take care of
>> unwritten
>> to written conversion without racing which can happen in a
>> seperate workqueue context and/or are there any zeroing of extents
>> involved in this scenario which can race with one another?
>>
>> So, I think in case of a non-aligned write it make sense [1] because it
>> might involve zeroing of the partial blocks. But in this case as you
>> said this already happens within i_data_sem lock context, so it won't be
>> necessary. I still thought it will be worth while to confirm it's indeed
>> the case or not.
>>
>
> I'm not quite get your question, do you mean passing IOMAP_DIO_FORCE_WAIT
> for the case of unaligned writing to pre-allocated(unwritten) blocks?
> IIUC, That's how it's done now if you only merge my patch. And it should be
> cautious to slove the conflict if you also want to merge [1] together.
>
> After looking at [1], I think it should be:
>
>            |  pure overwrite       |  write to pre-allocated |
> -------------------------------------------------------------|
> aligned    | share lock, nowait (1)| share lock, nowait  (3) |
> unaligned  | share lock, nowait (2)| excl lock, wait     (4) |
>
> In case(3), each AIO-DIO's unwritten->written conversion do not disturb each
> other because of the i_data_sem lock, and the potential zeroing extents(e.g.
> ext4_zero_range()) also call inode_dio_wait() to wait DIO complete. So I don't
> find any race problem.

aah yes. Looks like we don't need IOMAP_DIO_FORCE_WAIT then for aligned
unwritten overwrite.
Because as you said conversion will be synchronized with i_data_sem lock
and other's (e.g. ext4_zero_range()) will synchronize using inode_dio_wait().

Make sense. Thanks for confirming it!!

Looks good to me. Feel free to add -

Reviewed-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>

-ritesh

>
> Am I missing something? or which case you want to confirm?
>
> Thanks,
> Yi.
>
>> [1]:
>> https://lore.kernel.org/linux-ext4/20230210145954.277611-1-bfoster@redhat.com/
>>
>> Oh, one of the patch might run into some patch conflict depending upon
>> which one gets picked first...
>>
>> -ritesh
>>
>>
>>>
>>> This could give a significant speed up for multi-threaded writes. Test
>>> on Intel Xeon Gold 6140 and nvme SSD with below fio parameters.
>>>
>>>  direct=1
>>>  ioengine=libaio
>>>  iodepth=10
>>>  numjobs=10
>>>  runtime=60
>>>  rw=randwrite
>>>  size=100G
>>>
>>> And the test result are:
>>> Before:
>>>  bs=4k       IOPS=11.1k, BW=43.2MiB/s
>>>  bs=16k      IOPS=11.1k, BW=173MiB/s
>>>  bs=64k      IOPS=11.2k, BW=697MiB/s
>>>
>>> After:
>>>  bs=4k       IOPS=41.4k, BW=162MiB/s
>>>  bs=16k      IOPS=41.3k, BW=646MiB/s
>>>  bs=64k      IOPS=13.5k, BW=843MiB/s
>>>
>>> Signed-off-by: Zhang Yi <yi.zhang@huawei.com>
>>> ---
>>> v2->v1:
>>>  - Negate the 'inited' related arguments to 'unwritten'.
>>>
>>>  fs/ext4/file.c | 34 ++++++++++++++++++++++------------
>>>  1 file changed, 22 insertions(+), 12 deletions(-)
>>>
>>> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
>>> index a7a597c727e6..21abe95a0ee7 100644
>>> --- a/fs/ext4/file.c
>>> +++ b/fs/ext4/file.c
>>> @@ -202,8 +202,9 @@ ext4_extending_io(struct inode *inode, loff_t offset, size_t len)
>>>  	return false;
>>>  }
>>>
>>> -/* Is IO overwriting allocated and initialized blocks? */
>>> -static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>>> +/* Is IO overwriting allocated or initialized blocks? */
>>> +static bool ext4_overwrite_io(struct inode *inode,
>>> +			      loff_t pos, loff_t len, bool *unwritten)
>>>  {
>>>  	struct ext4_map_blocks map;
>>>  	unsigned int blkbits = inode->i_blkbits;
>>> @@ -217,12 +218,15 @@ static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len)
>>>  	blklen = map.m_len;
>>>
>>>  	err = ext4_map_blocks(NULL, inode, &map, 0);
>>> +	if (err != blklen)
>>> +		return false;
>>>  	/*
>>>  	 * 'err==len' means that all of the blocks have been preallocated,
>>> -	 * regardless of whether they have been initialized or not. To exclude
>>> -	 * unwritten extents, we need to check m_flags.
>>> +	 * regardless of whether they have been initialized or not. We need to
>>> +	 * check m_flags to distinguish the unwritten extents.
>>>  	 */
>>> -	return err == blklen && (map.m_flags & EXT4_MAP_MAPPED);
>>> +	*unwritten = !(map.m_flags & EXT4_MAP_MAPPED);
>>> +	return true;
>>>  }
>>>
>>>  static ssize_t ext4_generic_write_checks(struct kiocb *iocb,
>>> @@ -431,11 +435,16 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
>>>   * - For extending writes case we don't take the shared lock, since it requires
>>>   *   updating inode i_disksize and/or orphan handling with exclusive lock.
>>>   *
>>> - * - shared locking will only be true mostly with overwrites. Otherwise we will
>>> - *   switch to exclusive i_rwsem lock.
>>> + * - shared locking will only be true mostly with overwrites, including
>>> + *   initialized blocks and unwritten blocks. For overwrite unwritten blocks
>>> + *   we protect splitting extents by i_data_sem in ext4_inode_info, so we can
>>> + *   also release exclusive i_rwsem lock.
>>> + *
>>> + * - Otherwise we will switch to exclusive i_rwsem lock.
>>>   */
>>>  static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>>> -				     bool *ilock_shared, bool *extend)
>>> +				     bool *ilock_shared, bool *extend,
>>> +				     bool *unwritten)
>>>  {
>>>  	struct file *file = iocb->ki_filp;
>>>  	struct inode *inode = file_inode(file);
>>> @@ -459,7 +468,7 @@ static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
>>>  	 * in file_modified().
>>>  	 */
>>>  	if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
>>> -	     !ext4_overwrite_io(inode, offset, count))) {
>>> +	     !ext4_overwrite_io(inode, offset, count, unwritten))) {
>>>  		if (iocb->ki_flags & IOCB_NOWAIT) {
>>>  			ret = -EAGAIN;
>>>  			goto out;
>>> @@ -491,7 +500,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>  	loff_t offset = iocb->ki_pos;
>>>  	size_t count = iov_iter_count(from);
>>>  	const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
>>> -	bool extend = false, unaligned_io = false;
>>> +	bool extend = false, unaligned_io = false, unwritten = false;
>>>  	bool ilock_shared = true;
>>>
>>>  	/*
>>> @@ -534,7 +543,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>  		return ext4_buffered_write_iter(iocb, from);
>>>  	}
>>>
>>> -	ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend);
>>> +	ret = ext4_dio_write_checks(iocb, from,
>>> +				    &ilock_shared, &extend, &unwritten);
>>>  	if (ret <= 0)
>>>  		return ret;
>>>
>>> @@ -582,7 +592,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
>>>  		ext4_journal_stop(handle);
>>>  	}
>>>
>>> -	if (ilock_shared)
>>> +	if (ilock_shared && !unwritten)
>>>  		iomap_ops = &ext4_iomap_overwrite_ops;
>>>  	ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
>>>  			   (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
>>> --
>>> 2.31.1

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [RFC PATCH v2] ext4: dio take shared inode lock when overwriting preallocated blocks
  2022-12-26  6:20 [RFC PATCH v2] ext4: dio take shared inode lock when overwriting preallocated blocks Zhang Yi
  2023-01-02 14:09 ` Jan Kara
  2023-02-16 17:41 ` Ritesh Harjani
@ 2023-02-19  5:40 ` Theodore Ts'o
  2 siblings, 0 replies; 6+ messages in thread
From: Theodore Ts'o @ 2023-02-19  5:40 UTC (permalink / raw)
  To: linux-ext4, Zhang Yi
  Cc: Theodore Ts'o, yi.zhang, yukuai3, adilger.kernel, jack

On Mon, 26 Dec 2022 14:20:15 +0800, Zhang Yi wrote:
> From: Zhang Yi <yi.zhang@huawei.com>
> 
> In the dio write path, we only take shared inode lock for the case of
> aligned overwriting initialized blocks inside EOF. But for overwriting
> preallocated blocks, it may only need to split unwritten extents, this
> procedure has been protected under i_data_sem lock, it's safe to
> release the exclusive inode lock and take shared inode lock.
> 
> [...]

Applied, thanks!

[1/1] ext4: dio take shared inode lock when overwriting preallocated blocks
      commit: 240930fb7e6b52229bdee5b1423bfeab0002fed2

Best regards,
-- 
Theodore Ts'o <tytso@mit.edu>

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2023-02-19  5:41 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-12-26  6:20 [RFC PATCH v2] ext4: dio take shared inode lock when overwriting preallocated blocks Zhang Yi
2023-01-02 14:09 ` Jan Kara
2023-02-16 17:41 ` Ritesh Harjani
2023-02-17 13:31   ` Zhang Yi
2023-02-18  8:13     ` Ritesh Harjani
2023-02-19  5:40 ` Theodore Ts'o

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.