Linux-ext4 Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH 0/3] Revert parallel dio reads
@ 2019-08-27  2:05 Joseph Qi
  2019-08-27  2:05 ` [PATCH 1/3] Revert "ext4: remove EXT4_STATE_DIOREAD_LOCK flag" Joseph Qi
                   ` (3 more replies)
  0 siblings, 4 replies; 11+ messages in thread
From: Joseph Qi @ 2019-08-27  2:05 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger, Jan Kara; +Cc: linux-ext4, Dave Chinner

This patch set is trying to revert parallel dio reads feature at present
since it causes significant performance regression in mixed random
read/write scenario.

Joseph Qi (3):
  Revert "ext4: remove EXT4_STATE_DIOREAD_LOCK flag"
  Revert "ext4: fix off-by-one error when writing back pages before dio
    read"
  Revert "ext4: Allow parallel DIO reads"

 fs/ext4/ext4.h        | 17 +++++++++++++++++
 fs/ext4/extents.c     | 19 ++++++++++++++-----
 fs/ext4/inode.c       | 47 +++++++++++++++++++++++++++++++----------------
 fs/ext4/ioctl.c       |  4 ++++
 fs/ext4/move_extent.c |  4 ++++
 fs/ext4/super.c       | 12 +++++++-----
 6 files changed, 77 insertions(+), 26 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 1/3] Revert "ext4: remove EXT4_STATE_DIOREAD_LOCK flag"
  2019-08-27  2:05 [PATCH 0/3] Revert parallel dio reads Joseph Qi
@ 2019-08-27  2:05 ` Joseph Qi
  2019-08-27  2:05 ` [PATCH 2/3] Revert "ext4: fix off-by-one error when writing back pages before dio read" Joseph Qi
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 11+ messages in thread
From: Joseph Qi @ 2019-08-27  2:05 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger, Jan Kara; +Cc: linux-ext4, Dave Chinner

This reverts commit 1d39834fba99 ("ext4: remove EXT4_STATE_DIOREAD_LOCK
flag").
It is related to the following revert 16c54688592c ("ext4: Allow
parallel DIO reads") which causes significant performance regression in
mixed random read/write scenario.

Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
---
 fs/ext4/ext4.h        | 17 +++++++++++++++++
 fs/ext4/extents.c     | 19 ++++++++++++++-----
 fs/ext4/inode.c       |  8 ++++++++
 fs/ext4/ioctl.c       |  4 ++++
 fs/ext4/move_extent.c |  4 ++++
 fs/ext4/super.c       | 12 +++++++-----
 6 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf660aa..1d616d4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1557,6 +1557,8 @@ enum {
 	EXT4_STATE_EXT_MIGRATE,		/* Inode is migrating */
 	EXT4_STATE_DIO_UNWRITTEN,	/* need convert on dio done*/
 	EXT4_STATE_NEWENTRY,		/* File just added to dir */
+	EXT4_STATE_DIOREAD_LOCK,	/* Disable support for dio read
+					   nolocking */
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
 	EXT4_STATE_LUSTRE_EA_INODE,	/* Lustre-style ea_inode */
@@ -3300,6 +3302,21 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh)
 	set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state);
 }
 
+/*
+ * Disable DIO read nolock optimization, so new dioreaders will be forced
+ * to grab i_mutex
+ */
+static inline void ext4_inode_block_unlocked_dio(struct inode *inode)
+{
+	ext4_set_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+	smp_mb();
+}
+static inline void ext4_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb();
+	ext4_clear_inode_state(inode, EXT4_STATE_DIOREAD_LOCK);
+}
+
 #define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
 
 /* For ioend & aio unwritten conversion wait queues */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 92266a2..ded1334 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -4711,6 +4711,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
 
 	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
 	/* Preallocate the range including the unaligned edges */
@@ -4721,7 +4722,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 				 round_down(offset, 1 << blkbits)) >> blkbits,
 				new_size, flags);
 		if (ret)
-			goto out_mutex;
+			goto out_dio;
 
 	}
 
@@ -4745,7 +4746,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		ret = ext4_update_disksize_before_punch(inode, offset, len);
 		if (ret) {
 			up_write(&EXT4_I(inode)->i_mmap_sem);
-			goto out_mutex;
+			goto out_dio;
 		}
 		/* Now release the pages and zero block aligned part of pages */
 		truncate_pagecache_range(inode, start, end - 1);
@@ -4755,10 +4756,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 					     flags);
 		up_write(&EXT4_I(inode)->i_mmap_sem);
 		if (ret)
-			goto out_mutex;
+			goto out_dio;
 	}
 	if (!partial_begin && !partial_end)
-		goto out_mutex;
+		goto out_dio;
 
 	/*
 	 * In worst case we have to writeout two nonadjacent unwritten
@@ -4771,7 +4772,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		ext4_std_error(inode->i_sb, ret);
-		goto out_mutex;
+		goto out_dio;
 	}
 
 	inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -4796,6 +4797,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		ext4_handle_sync(handle);
 
 	ext4_journal_stop(handle);
+out_dio:
+	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	inode_unlock(inode);
 	return ret;
@@ -4883,9 +4886,11 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	}
 
 	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
 	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
+	ext4_inode_resume_unlocked_dio(inode);
 	if (ret)
 		goto out;
 
@@ -5411,6 +5416,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	}
 
 	/* Wait for existing dio to complete */
+	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
 	/*
@@ -5492,6 +5498,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	ext4_journal_stop(handle);
 out_mmap:
 	up_write(&EXT4_I(inode)->i_mmap_sem);
+	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	inode_unlock(inode);
 	return ret;
@@ -5564,6 +5571,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	}
 
 	/* Wait for existing dio to complete */
+	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
 	/*
@@ -5670,6 +5678,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	ext4_journal_stop(handle);
 out_mmap:
 	up_write(&EXT4_I(inode)->i_mmap_sem);
+	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	inode_unlock(inode);
 	return ret;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 420fe3d..0f505f0 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4337,6 +4337,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	}
 
 	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
 	/*
@@ -4414,6 +4415,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	ext4_journal_stop(handle);
 out_dio:
 	up_write(&EXT4_I(inode)->i_mmap_sem);
+	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
 	inode_unlock(inode);
 	return ret;
@@ -5623,7 +5625,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			 * Blocks are going to be removed from the inode. Wait
 			 * for dio in flight.
 			 */
+			ext4_inode_block_unlocked_dio(inode);
 			inode_dio_wait(inode);
+			ext4_inode_resume_unlocked_dio(inode);
 		}
 
 		down_write(&EXT4_I(inode)->i_mmap_sem);
@@ -6138,6 +6142,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		return -EROFS;
 
 	/* Wait for all existing dio workers */
+	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
 	/*
@@ -6153,6 +6158,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		err = filemap_write_and_wait(inode->i_mapping);
 		if (err < 0) {
 			up_write(&EXT4_I(inode)->i_mmap_sem);
+			ext4_inode_resume_unlocked_dio(inode);
 			return err;
 		}
 	}
@@ -6175,6 +6181,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 		if (err < 0) {
 			jbd2_journal_unlock_updates(journal);
 			percpu_up_write(&sbi->s_journal_flag_rwsem);
+			ext4_inode_resume_unlocked_dio(inode);
 			return err;
 		}
 		ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
@@ -6186,6 +6193,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
 
 	if (val)
 		up_write(&EXT4_I(inode)->i_mmap_sem);
+	ext4_inode_resume_unlocked_dio(inode);
 
 	/* Finally we can mark the inode as dirty. */
 
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 442f7ef..bce15d8 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -154,6 +154,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
 		goto err_out;
 
 	/* Wait for all existing dio workers */
+	ext4_inode_block_unlocked_dio(inode);
+	ext4_inode_block_unlocked_dio(inode_bl);
 	inode_dio_wait(inode);
 	inode_dio_wait(inode_bl);
 
@@ -252,6 +254,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
 err_out:
 	up_write(&EXT4_I(inode)->i_mmap_sem);
 journal_err_out:
+	ext4_inode_resume_unlocked_dio(inode);
+	ext4_inode_resume_unlocked_dio(inode_bl);
 	unlock_two_nondirectories(inode, inode_bl);
 	iput(inode_bl);
 	return err;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 30ce3dc..47f5cd0 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -603,6 +603,8 @@
 	lock_two_nondirectories(orig_inode, donor_inode);
 
 	/* Wait for all existing dio workers */
+	ext4_inode_block_unlocked_dio(orig_inode);
+	ext4_inode_block_unlocked_dio(donor_inode);
 	inode_dio_wait(orig_inode);
 	inode_dio_wait(donor_inode);
 
@@ -693,6 +695,8 @@
 	ext4_ext_drop_refs(path);
 	kfree(path);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
+	ext4_inode_resume_unlocked_dio(orig_inode);
+	ext4_inode_resume_unlocked_dio(donor_inode);
 	unlock_two_nondirectories(orig_inode, donor_inode);
 
 	return ret;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4079605..2768a2a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -102,13 +102,15 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
  *   i_data_sem (rw)
  *
  * truncate:
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> i_mmap_rwsem (w) -> page lock
- * sb_start_write -> i_mutex -> i_mmap_sem (w) -> transaction start ->
- *   i_data_sem (rw)
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
+ *   i_mmap_rwsem (w) -> page lock
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
+ *   transaction start -> i_data_sem (rw)
  *
  * direct IO:
- * sb_start_write -> i_mutex -> mmap_sem
- * sb_start_write -> i_mutex -> transaction start -> i_data_sem (rw)
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) ->
+ *   transaction start -> i_data_sem (rw)
  *
  * writepages:
  * transaction start -> page lock(s) -> i_data_sem (rw)
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 2/3] Revert "ext4: fix off-by-one error when writing back pages before dio read"
  2019-08-27  2:05 [PATCH 0/3] Revert parallel dio reads Joseph Qi
  2019-08-27  2:05 ` [PATCH 1/3] Revert "ext4: remove EXT4_STATE_DIOREAD_LOCK flag" Joseph Qi
@ 2019-08-27  2:05 ` Joseph Qi
  2019-08-27  2:05 ` [PATCH 3/3] Revert "ext4: Allow parallel DIO reads" Joseph Qi
  2019-08-27 11:51 ` [PATCH 0/3] Revert parallel dio reads Dave Chinner
  3 siblings, 0 replies; 11+ messages in thread
From: Joseph Qi @ 2019-08-27  2:05 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger, Jan Kara; +Cc: linux-ext4, Dave Chinner

This reverts commit e5465795cac4 ("ext4: fix off-by-one error when
writing back pages before dio read").
It is related to the following revert 16c54688592c ("ext4: Allow
parallel DIO reads") which causes significant performance regression in
mixed random read/write scenario.

Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
---
 fs/ext4/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 0f505f0..16077ec 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3875,7 +3875,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
 	 */
 	inode_lock_shared(inode);
 	ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-					   iocb->ki_pos + count - 1);
+					   iocb->ki_pos + count);
 	if (ret)
 		goto out_unlock;
 	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* [PATCH 3/3] Revert "ext4: Allow parallel DIO reads"
  2019-08-27  2:05 [PATCH 0/3] Revert parallel dio reads Joseph Qi
  2019-08-27  2:05 ` [PATCH 1/3] Revert "ext4: remove EXT4_STATE_DIOREAD_LOCK flag" Joseph Qi
  2019-08-27  2:05 ` [PATCH 2/3] Revert "ext4: fix off-by-one error when writing back pages before dio read" Joseph Qi
@ 2019-08-27  2:05 ` Joseph Qi
  2019-08-27 11:51 ` [PATCH 0/3] Revert parallel dio reads Dave Chinner
  3 siblings, 0 replies; 11+ messages in thread
From: Joseph Qi @ 2019-08-27  2:05 UTC (permalink / raw)
  To: Theodore Ts'o, Andreas Dilger, Jan Kara; +Cc: linux-ext4, Dave Chinner

This reverts commit 16c54688592c ("ext4: Allow parallel DIO reads").

This commit causes significant performance regression in mixed random
read/write scenario. As discussed, it is because current implementation
is incomplete. So revert it at present.

The following data are tested on Intel P3600 NVMe.

fio -name=parallel_dio_reads_test -filename=/mnt/nvme0n1/testfile
-direct=1 -iodepth=1 -thread -rw=randrw -ioengine=psync -bs=$bs
-size=20G -numjobs=8 -runtime=600 -group_reporting

w/ = with parallel dio reads
w/o =  reverting parallel dio reads

bs=4k:
------------------------------------------------------------
    |            READ           |           WRITE          |
------------------------------------------------------------
w/  | 30898KB/s,7724,555.00us   | 30875KB/s,7718,479.70us  |
------------------------------------------------------------
w/o | 117915KB/s,29478,248.18us | 117854KB/s,29463,21.91us |
------------------------------------------------------------

bs=16k:
------------------------------------------------------------
    |            READ           |           WRITE          |
------------------------------------------------------------
w/  | 58961KB/s,3685,835.28us   | 58877KB/s,3679,1335.98us |
------------------------------------------------------------
w/o | 218409KB/s,13650,554.46us | 218257KB/s,13641,29.22us |
------------------------------------------------------------

bs=64k
--------------------------------------------------------------
    |            READ            |           WRITE           |
--------------------------------------------------------------
w/  | 119396KB/s,1865,1759.38us  | 119159KB/s,1861,2532.26us |
--------------------------------------------------------------
w/o | 422815KB/s,6606,1146.05us  | 421619KB/s,6587,60.72us   |
--------------------------------------------,-----------------

bs=512k
--------------------------------------------------------------
    |            READ            |           WRITE           |
--------------------------------------------------------------
w/  | 392973KB/s,767,5046.35us   | 393165KB/s,767,5359.86us  |
--------------------------------------------------------------
w/o | 590266KB/s,1152,4312.01us  | 590554KB/s,1153,2606.82us |
--------------------------------------------------------------

bs=1M
--------------------------------------------------------------
    |            READ            |           WRITE           |
--------------------------------------------------------------
w/  | 487779KB/s,476,8058.55us   | 485592KB/s,474,8630.51us  |
--------------------------------------------------------------
w/o | 593927KB/s,580,7623.63us   | 591265KB/s,577,6163.42us  |
--------------------------------------------------------------

Signed-off-by: Joseph Qi <joseph.qi@linux.alibaba.com>
---
 fs/ext4/inode.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 16077ec..e6b1740 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3863,25 +3863,32 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter)
 
 static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
 {
-	struct address_space *mapping = iocb->ki_filp->f_mapping;
-	struct inode *inode = mapping->host;
-	size_t count = iov_iter_count(iter);
+	int unlocked = 0;
+	struct inode *inode = iocb->ki_filp->f_mapping->host;
 	ssize_t ret;
 
-	/*
-	 * Shared inode_lock is enough for us - it protects against concurrent
-	 * writes & truncates and since we take care of writing back page cache,
-	 * we are protected against page writeback as well.
-	 */
-	inode_lock_shared(inode);
-	ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
-					   iocb->ki_pos + count);
-	if (ret)
-		goto out_unlock;
+	if (ext4_should_dioread_nolock(inode)) {
+		/*
+		 * Nolock dioread optimization may be dynamically disabled
+		 * via ext4_inode_block_unlocked_dio(). Check inode's state
+		 * while holding extra i_dio_count ref.
+		 */
+		inode_dio_begin(inode);
+		smp_mb();
+		if (unlikely(ext4_test_inode_state(inode,
+						   EXT4_STATE_DIOREAD_LOCK)))
+			inode_dio_end(inode);
+		else
+			unlocked = 1;
+	}
+
 	ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
-				   iter, ext4_dio_get_block, NULL, NULL, 0);
-out_unlock:
-	inode_unlock_shared(inode);
+				   iter, ext4_dio_get_block,
+				   NULL, NULL,
+				   unlocked ? 0 : DIO_LOCKING);
+	if (unlocked)
+		inode_dio_end(inode);
+
 	return ret;
 }
 
-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/3] Revert parallel dio reads
  2019-08-27  2:05 [PATCH 0/3] Revert parallel dio reads Joseph Qi
                   ` (2 preceding siblings ...)
  2019-08-27  2:05 ` [PATCH 3/3] Revert "ext4: Allow parallel DIO reads" Joseph Qi
@ 2019-08-27 11:51 ` Dave Chinner
  2019-08-29 10:58   ` Jan Kara
  3 siblings, 1 reply; 11+ messages in thread
From: Dave Chinner @ 2019-08-27 11:51 UTC (permalink / raw)
  To: Joseph Qi; +Cc: Theodore Ts'o, Andreas Dilger, Jan Kara, linux-ext4

On Tue, Aug 27, 2019 at 10:05:49AM +0800, Joseph Qi wrote:
> This patch set is trying to revert parallel dio reads feature at present
> since it causes significant performance regression in mixed random
> read/write scenario.
> 
> Joseph Qi (3):
>   Revert "ext4: remove EXT4_STATE_DIOREAD_LOCK flag"
>   Revert "ext4: fix off-by-one error when writing back pages before dio
>     read"
>   Revert "ext4: Allow parallel DIO reads"
> 
>  fs/ext4/ext4.h        | 17 +++++++++++++++++
>  fs/ext4/extents.c     | 19 ++++++++++++++-----
>  fs/ext4/inode.c       | 47 +++++++++++++++++++++++++++++++----------------
>  fs/ext4/ioctl.c       |  4 ++++
>  fs/ext4/move_extent.c |  4 ++++
>  fs/ext4/super.c       | 12 +++++++-----
>  6 files changed, 77 insertions(+), 26 deletions(-)

Before doing this, you might want to have a chat and co-ordinate
with the folks that are currently trying to port the ext4 direct IO
code to use the iomap infrastructure:

https://lore.kernel.org/linux-ext4/20190827095221.GA1568@poseidon.bobrowski.net/T/#t

That is going to need the shared locking on read and will work just
fine with shared locking on write, too (it's the code that XFS uses
for direct IO). So it might be best here if you work towards shared
locking on the write side rather than just revert the shared locking
on the read side....

Cheers,

Dave,
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/3] Revert parallel dio reads
  2019-08-27 11:51 ` [PATCH 0/3] Revert parallel dio reads Dave Chinner
@ 2019-08-29 10:58   ` Jan Kara
  2019-08-29 19:06     ` Andreas Dilger
  2019-09-10 14:10     ` Ritesh Harjani
  0 siblings, 2 replies; 11+ messages in thread
From: Jan Kara @ 2019-08-29 10:58 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Joseph Qi, Theodore Ts'o, Andreas Dilger, Jan Kara, linux-ext4

On Tue 27-08-19 21:51:18, Dave Chinner wrote:
> On Tue, Aug 27, 2019 at 10:05:49AM +0800, Joseph Qi wrote:
> > This patch set is trying to revert parallel dio reads feature at present
> > since it causes significant performance regression in mixed random
> > read/write scenario.
> > 
> > Joseph Qi (3):
> >   Revert "ext4: remove EXT4_STATE_DIOREAD_LOCK flag"
> >   Revert "ext4: fix off-by-one error when writing back pages before dio
> >     read"
> >   Revert "ext4: Allow parallel DIO reads"
> > 
> >  fs/ext4/ext4.h        | 17 +++++++++++++++++
> >  fs/ext4/extents.c     | 19 ++++++++++++++-----
> >  fs/ext4/inode.c       | 47 +++++++++++++++++++++++++++++++----------------
> >  fs/ext4/ioctl.c       |  4 ++++
> >  fs/ext4/move_extent.c |  4 ++++
> >  fs/ext4/super.c       | 12 +++++++-----
> >  6 files changed, 77 insertions(+), 26 deletions(-)
> 
> Before doing this, you might want to have a chat and co-ordinate
> with the folks that are currently trying to port the ext4 direct IO
> code to use the iomap infrastructure:
> 
> https://lore.kernel.org/linux-ext4/20190827095221.GA1568@poseidon.bobrowski.net/T/#t
> 
> That is going to need the shared locking on read and will work just
> fine with shared locking on write, too (it's the code that XFS uses
> for direct IO). So it might be best here if you work towards shared
> locking on the write side rather than just revert the shared locking
> on the read side....

Yeah, after converting ext4 DIO path to iomap infrastructure, using shared
inode lock for all aligned non-extending DIO writes will be easy so I'd
prefer if we didn't have to redo the iomap conversion patches due to these
reverts.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/3] Revert parallel dio reads
  2019-08-29 10:58   ` Jan Kara
@ 2019-08-29 19:06     ` Andreas Dilger
  2019-08-30 15:35       ` Jan Kara
  2019-09-10 14:10     ` Ritesh Harjani
  1 sibling, 1 reply; 11+ messages in thread
From: Andreas Dilger @ 2019-08-29 19:06 UTC (permalink / raw)
  To: Jan Kara; +Cc: Dave Chinner, Joseph Qi, Theodore Ts'o, linux-ext4

[-- Attachment #1: Type: text/plain, Size: 2078 bytes --]

On Aug 29, 2019, at 4:58 AM, Jan Kara <jack@suse.cz> wrote:
> 
> On Tue 27-08-19 21:51:18, Dave Chinner wrote:
>> On Tue, Aug 27, 2019 at 10:05:49AM +0800, Joseph Qi wrote:
>>> This patch set is trying to revert parallel dio reads feature at present
>>> since it causes significant performance regression in mixed random
>>> read/write scenario.
>>> 
>>> Joseph Qi (3):
>>>  Revert "ext4: remove EXT4_STATE_DIOREAD_LOCK flag"
>>>  Revert "ext4: fix off-by-one error when writing back pages before dio
>>>    read"
>>>  Revert "ext4: Allow parallel DIO reads"
>>> 
>>> fs/ext4/ext4.h        | 17 +++++++++++++++++
>>> fs/ext4/extents.c     | 19 ++++++++++++++-----
>>> fs/ext4/inode.c       | 47 +++++++++++++++++++++++++++++++----------------
>>> fs/ext4/ioctl.c       |  4 ++++
>>> fs/ext4/move_extent.c |  4 ++++
>>> fs/ext4/super.c       | 12 +++++++-----
>>> 6 files changed, 77 insertions(+), 26 deletions(-)
>> 
>> Before doing this, you might want to have a chat and co-ordinate
>> with the folks that are currently trying to port the ext4 direct IO
>> code to use the iomap infrastructure:
>> 
>> https://lore.kernel.org/linux-ext4/20190827095221.GA1568@poseidon.bobrowski.net/T/#t
>> 
>> That is going to need the shared locking on read and will work just
>> fine with shared locking on write, too (it's the code that XFS uses
>> for direct IO). So it might be best here if you work towards shared
>> locking on the write side rather than just revert the shared locking
>> on the read side....
> 
> Yeah, after converting ext4 DIO path to iomap infrastructure, using shared
> inode lock for all aligned non-extending DIO writes will be easy so I'd
> prefer if we didn't have to redo the iomap conversion patches due to these
> reverts.

But if the next kernel is LTS and the iomap implementation isn't in the
current merge window (very unlikely) then we're stuck with this performance
hit for LTS.  It is also unlikely that LTS will take the revert patches if
they have not been landed to master.

Cheers, Andreas






[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/3] Revert parallel dio reads
  2019-08-29 19:06     ` Andreas Dilger
@ 2019-08-30 15:35       ` Jan Kara
  0 siblings, 0 replies; 11+ messages in thread
From: Jan Kara @ 2019-08-30 15:35 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Jan Kara, Dave Chinner, Joseph Qi, Theodore Ts'o, linux-ext4

On Thu 29-08-19 13:06:22, Andreas Dilger wrote:
> On Aug 29, 2019, at 4:58 AM, Jan Kara <jack@suse.cz> wrote:
> > 
> > On Tue 27-08-19 21:51:18, Dave Chinner wrote:
> >> On Tue, Aug 27, 2019 at 10:05:49AM +0800, Joseph Qi wrote:
> >>> This patch set is trying to revert parallel dio reads feature at present
> >>> since it causes significant performance regression in mixed random
> >>> read/write scenario.
> >>> 
> >>> Joseph Qi (3):
> >>>  Revert "ext4: remove EXT4_STATE_DIOREAD_LOCK flag"
> >>>  Revert "ext4: fix off-by-one error when writing back pages before dio
> >>>    read"
> >>>  Revert "ext4: Allow parallel DIO reads"
> >>> 
> >>> fs/ext4/ext4.h        | 17 +++++++++++++++++
> >>> fs/ext4/extents.c     | 19 ++++++++++++++-----
> >>> fs/ext4/inode.c       | 47 +++++++++++++++++++++++++++++++----------------
> >>> fs/ext4/ioctl.c       |  4 ++++
> >>> fs/ext4/move_extent.c |  4 ++++
> >>> fs/ext4/super.c       | 12 +++++++-----
> >>> 6 files changed, 77 insertions(+), 26 deletions(-)
> >> 
> >> Before doing this, you might want to have a chat and co-ordinate
> >> with the folks that are currently trying to port the ext4 direct IO
> >> code to use the iomap infrastructure:
> >> 
> >> https://lore.kernel.org/linux-ext4/20190827095221.GA1568@poseidon.bobrowski.net/T/#t
> >> 
> >> That is going to need the shared locking on read and will work just
> >> fine with shared locking on write, too (it's the code that XFS uses
> >> for direct IO). So it might be best here if you work towards shared
> >> locking on the write side rather than just revert the shared locking
> >> on the read side....
> > 
> > Yeah, after converting ext4 DIO path to iomap infrastructure, using shared
> > inode lock for all aligned non-extending DIO writes will be easy so I'd
> > prefer if we didn't have to redo the iomap conversion patches due to these
> > reverts.
> 
> But if the next kernel is LTS and the iomap implementation isn't in the
> current merge window (very unlikely) then we're stuck with this performance
> hit for LTS.  It is also unlikely that LTS will take the revert patches if
> they have not been landed to master.

I agree this is not great but the regression is there for 3 years, it has
been released in major distribution kernels quite a long time ago, and only
now someone complained. So it doesn't seem many people care about
performance of mixed RW workload when mounted with dioread_nolock (note
that the patches actually improve performance of read-only DIO workload
when not using dioread_nolock as for that case, exclusive lock is replaced
with a shared one).

									Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/3] Revert parallel dio reads
  2019-08-29 10:58   ` Jan Kara
  2019-08-29 19:06     ` Andreas Dilger
@ 2019-09-10 14:10     ` Ritesh Harjani
  2019-09-10 21:57       ` Jan Kara
  1 sibling, 1 reply; 11+ messages in thread
From: Ritesh Harjani @ 2019-09-10 14:10 UTC (permalink / raw)
  To: Jan Kara, Dave Chinner, Theodore Ts'o, hch
  Cc: Joseph Qi, Andreas Dilger, linux-ext4

Hello,

>> Before doing this, you might want to have a chat and co-ordinate
>> with the folks that are currently trying to port the ext4 direct IO
>> code to use the iomap infrastructure:
>>
>> https://lore.kernel.org/linux-ext4/20190827095221.GA1568@poseidon.bobrowski.net/T/#t
>>
>> That is going to need the shared locking on read and will work just
>> fine with shared locking on write, too (it's the code that XFS uses
>> for direct IO). So it might be best here if you work towards shared
>> locking on the write side rather than just revert the shared locking
>> on the read side....
> 
> Yeah, after converting ext4 DIO path to iomap infrastructure, using shared
> inode lock for all aligned non-extending DIO writes will be easy so I'd
> prefer if we didn't have to redo the iomap conversion patches due to these
> reverts.

I was looking into this to see what is required to convert this into 
shared locking for DIO write side.
Why do you say shared inode lock only for *aligned non-extending DIO 
writes*?
non-extending means overwrite case only, which still in today's code is
not under any exclusive inode_lock (except the orphan handling and 
truncate handling in case of error after IO is completed). But even with
current code the perf problem is reported right?

If we see in today's code (including in iomap new code for overwrite 
case where we downgrade the lock).
We call for inode_unlock in case of overwrite and then do the IO, since 
we don't have to allocate any blocks in this case.


	/*
	 * Make all waiters for direct IO properly wait also for extent
	 * conversion. This also disallows race between truncate() and
	 * overwrite DIO as i_dio_count needs to be incremented under
  	 * i_mutex.
	 */
	inode_dio_begin(inode);
	/* If we do a overwrite dio, i_mutex locking can be released */
	overwrite = *((int *)iocb->private);
	if (overwrite)
		inode_unlock(inode);
	
	write data (via __blockdev_direct_IO)

	inode_dio_end(inode);
	/* take i_mutex locking again if we do a ovewrite dio */
	if (overwrite)
		inode_lock(inode);



What can we do next:-

Earlier the DIO reads was not having any inode_locking.
IIUC, the inode_lock_shared() in the DIO reads case was added to
protect the race against reading back uninitialized/stale data for e.g.
in case of truncate or writeback etc.

So as Dave suggested, shouldn't we add the complete shared locking in 
DIO writes cases as well (except for unaligned writes, since it may 
required zeroing of partial blocks).


Could you please help me understand how we can go about it?
I was thinking if we can create uninitialized blocks beyond EOF, so that
any parallel read should only read 0 from that area and we may not 
require exclusive inode_lock. The block creation is anyway protected
with i_data_sem in ext4_map_blocks.


I do see that in case of bufferedIO writes, we use 
ext4_get_block_unwritten for dioread_nolock case. Which should
be true for even writes extending beyond EOF. This will
create uninitialized and mapped blocks only (even beyond EOF).
But all of this happen under exclusive inode_lock() in ext4_file_write_iter.


Whereas in case of DIO writes extending beyond EOF, we pass
EXT4_GET_BLOCKS_CREATE in ext4_map_blocks which may allocate
initialized & mapped blocks.
Do you know why so?


Do you think we can create uninit. blocks in dioread_nolock AIO/non-AIO 
DIO writes cases as well with only shared inode lock mode?
Do you see any problems with this?
Or do you have any other suggestion?


In case of XFS -
I do see it promotes the demotes the shared lock (IOLOCK_XXX) in 
xfs_file_aio_write_checks. But I don't know if after that, whether
it only holds the shared lock while doing the DIO IO?
And how does it protects against the parallel DIO reads which can 
potentially expose any stale data?


Appreciate any help & inputs from FS developers.

-ritesh


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/3] Revert parallel dio reads
  2019-09-10 14:10     ` Ritesh Harjani
@ 2019-09-10 21:57       ` Jan Kara
  2019-09-11 14:20         ` Ritesh Harjani
  0 siblings, 1 reply; 11+ messages in thread
From: Jan Kara @ 2019-09-10 21:57 UTC (permalink / raw)
  To: Ritesh Harjani
  Cc: Jan Kara, Dave Chinner, Theodore Ts'o, hch, Joseph Qi,
	Andreas Dilger, linux-ext4

Hello,

On Tue 10-09-19 19:40:40, Ritesh Harjani wrote:
> > > Before doing this, you might want to have a chat and co-ordinate
> > > with the folks that are currently trying to port the ext4 direct IO
> > > code to use the iomap infrastructure:
> > > 
> > > https://lore.kernel.org/linux-ext4/20190827095221.GA1568@poseidon.bobrowski.net/T/#t
> > > 
> > > That is going to need the shared locking on read and will work just
> > > fine with shared locking on write, too (it's the code that XFS uses
> > > for direct IO). So it might be best here if you work towards shared
> > > locking on the write side rather than just revert the shared locking
> > > on the read side....
> > 
> > Yeah, after converting ext4 DIO path to iomap infrastructure, using shared
> > inode lock for all aligned non-extending DIO writes will be easy so I'd
> > prefer if we didn't have to redo the iomap conversion patches due to these
> > reverts.
> 
> I was looking into this to see what is required to convert this into
> shared locking for DIO write side.
> Why do you say shared inode lock only for *aligned non-extending DIO
> writes*?
> non-extending means overwrite case only, which still in today's code is
> not under any exclusive inode_lock (except the orphan handling and truncate
> handling in case of error after IO is completed). But even with
> current code the perf problem is reported right?

Yes, the problem is even with current code. It is that we first acquire
inode_rwsem in exclusive mode in ext4_file_write_iter() and only later
relax that to no lock later. And this is what is causing low performance
for mixed read-write workload because the exclusive lock has to wait for
all shared holders and vice versa...

> If we see in today's code (including in iomap new code for overwrite case
> where we downgrade the lock).
> We call for inode_unlock in case of overwrite and then do the IO, since we
> don't have to allocate any blocks in this case.
> 
> 
> 	/*
> 	 * Make all waiters for direct IO properly wait also for extent
> 	 * conversion. This also disallows race between truncate() and
> 	 * overwrite DIO as i_dio_count needs to be incremented under
>  	 * i_mutex.
> 	 */
> 	inode_dio_begin(inode);
> 	/* If we do a overwrite dio, i_mutex locking can be released */
> 	overwrite = *((int *)iocb->private);
> 	if (overwrite)
> 		inode_unlock(inode);
> 	
> 	write data (via __blockdev_direct_IO)
> 
> 	inode_dio_end(inode);
> 	/* take i_mutex locking again if we do a ovewrite dio */
> 	if (overwrite)
> 		inode_lock(inode);
> 
> 
> 
> What can we do next:-
> 
> Earlier the DIO reads was not having any inode_locking.
> IIUC, the inode_lock_shared() in the DIO reads case was added to
> protect the race against reading back uninitialized/stale data for e.g.
> in case of truncate or writeback etc.

Not quite. Places that are prone to exposing stale block content (such as
truncate) wait for running DIO (inode_dio_wait()). Just with unlocked read
DIO we had to have special wait mechanisms to disable unlocked DIO for a
while so that we can actually safely drain all running DIOs without new
ones being submitted and then perform e.g. truncate. So it was more a
complexity of the locking mechanism.

> So as Dave suggested, shouldn't we add the complete shared locking in DIO
> writes cases as well (except for unaligned writes, since it may required
> zeroing of partial blocks).
> 
> 
> Could you please help me understand how we can go about it?
> I was thinking if we can create uninitialized blocks beyond EOF, so that
> any parallel read should only read 0 from that area and we may not require
> exclusive inode_lock. The block creation is anyway protected
> with i_data_sem in ext4_map_blocks.

So doing file size changes (i.e., file expansion) without exclusive
inode_lock would be tricky. I wouldn't really like to go there at least
initially. My plan would be to do it similarly to XFS like:

Do a quick check whether IO is going to grow inode size or is unaligned. If
yes, mode = exclusive, else mode = shared. Then lock inode rwsem is
appropriate mode, do ext4_write_checks() (which may find out exclusive lock
is actually needed so in that case mode = exclusive and restart). Then call
iomap code to do direct IO.

> I do see that in case of bufferedIO writes, we use ext4_get_block_unwritten
> for dioread_nolock case. Which should
> be true for even writes extending beyond EOF. This will
> create uninitialized and mapped blocks only (even beyond EOF).
> But all of this happen under exclusive inode_lock() in ext4_file_write_iter.

I guess this is mostly because we don't bother to check in
ext4_write_begin() whether we are extending the file or not and filling
holes needs unwritten extents. Also before DIO reads got changed to be
under shared inode rwsem, the following was possible:

CPU1					CPU2
DIO read from file f offset 4096
  flush cache
					grow 'f' from 4096 to 8192 by write
					  blocks get allocated, page cache
					  dirtied
  map blocks, submit IO
    - reads stale contents

> Whereas in case of DIO writes extending beyond EOF, we pass
> EXT4_GET_BLOCKS_CREATE in ext4_map_blocks which may allocate
> initialized & mapped blocks.
> Do you know why so?

Not using unwritten extents is faster if that is safe. So that's why DIO
code uses them if possible.

> Do you think we can create uninit. blocks in dioread_nolock AIO/non-AIO DIO
> writes cases as well with only shared inode lock mode?
> Do you see any problems with this?
> Or do you have any other suggestion?

Not uninit but unwritten. Yes, we can do that. After all e.g. page writeback
creates extents like this without any inode rwsem protection.

> In case of XFS -
> I do see it promotes the demotes the shared lock (IOLOCK_XXX) in
> xfs_file_aio_write_checks. But I don't know if after that, whether
> it only holds the shared lock while doing the DIO IO?
> And how does it protects against the parallel DIO reads which can
> potentially expose any stale data?

XFS protects DIO submission with shared IOLOCK. Stale data exposure is
solved by using unwritten extents similarly to what ext4 does.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH 0/3] Revert parallel dio reads
  2019-09-10 21:57       ` Jan Kara
@ 2019-09-11 14:20         ` Ritesh Harjani
  0 siblings, 0 replies; 11+ messages in thread
From: Ritesh Harjani @ 2019-09-11 14:20 UTC (permalink / raw)
  To: Jan Kara
  Cc: Dave Chinner, Theodore Ts'o, hch, Joseph Qi, Andreas Dilger,
	linux-ext4

Hello,

On 9/11/19 3:27 AM, Jan Kara wrote:
> Hello,
> 
> On Tue 10-09-19 19:40:40, Ritesh Harjani wrote:
>>>> Before doing this, you might want to have a chat and co-ordinate
>>>> with the folks that are currently trying to port the ext4 direct IO
>>>> code to use the iomap infrastructure:
>>>>
>>>> https://lore.kernel.org/linux-ext4/20190827095221.GA1568@poseidon.bobrowski.net/T/#t
>>>>
>>>> That is going to need the shared locking on read and will work just
>>>> fine with shared locking on write, too (it's the code that XFS uses
>>>> for direct IO). So it might be best here if you work towards shared
>>>> locking on the write side rather than just revert the shared locking
>>>> on the read side....
>>>
>>> Yeah, after converting ext4 DIO path to iomap infrastructure, using shared
>>> inode lock for all aligned non-extending DIO writes will be easy so I'd
>>> prefer if we didn't have to redo the iomap conversion patches due to these
>>> reverts.
>>
>> I was looking into this to see what is required to convert this into
>> shared locking for DIO write side.
>> Why do you say shared inode lock only for *aligned non-extending DIO
>> writes*?
>> non-extending means overwrite case only, which still in today's code is
>> not under any exclusive inode_lock (except the orphan handling and truncate
>> handling in case of error after IO is completed). But even with
>> current code the perf problem is reported right?
> 
> Yes, the problem is even with current code. It is that we first acquire
> inode_rwsem in exclusive mode in ext4_file_write_iter() and only later
> relax that to no lock later. And this is what is causing low performance
> for mixed read-write workload because the exclusive lock has to wait for
> all shared holders and vice versa...

Got it.


> 
>> If we see in today's code (including in iomap new code for overwrite case
>> where we downgrade the lock).
>> We call for inode_unlock in case of overwrite and then do the IO, since we
>> don't have to allocate any blocks in this case.
>>
>>
>> 	/*
>> 	 * Make all waiters for direct IO properly wait also for extent
>> 	 * conversion. This also disallows race between truncate() and
>> 	 * overwrite DIO as i_dio_count needs to be incremented under
>>   	 * i_mutex.
>> 	 */
>> 	inode_dio_begin(inode);
>> 	/* If we do a overwrite dio, i_mutex locking can be released */
>> 	overwrite = *((int *)iocb->private);
>> 	if (overwrite)
>> 		inode_unlock(inode);
>> 	
>> 	write data (via __blockdev_direct_IO)
>>
>> 	inode_dio_end(inode);
>> 	/* take i_mutex locking again if we do a ovewrite dio */
>> 	if (overwrite)
>> 		inode_lock(inode);
>>
>>
>>
>> What can we do next:-
>>
>> Earlier the DIO reads was not having any inode_locking.
>> IIUC, the inode_lock_shared() in the DIO reads case was added to
>> protect the race against reading back uninitialized/stale data for e.g.
>> in case of truncate or writeback etc.
> 
> Not quite. Places that are prone to exposing stale block content (such as
> truncate) wait for running DIO (inode_dio_wait()). Just with unlocked read
> DIO we had to have special wait mechanisms to disable unlocked DIO for a
> while so that we can actually safely drain all running DIOs without new
> ones being submitted and then perform e.g. truncate. So it was more a
> complexity of the locking mechanism.
> 
>> So as Dave suggested, shouldn't we add the complete shared locking in DIO
>> writes cases as well (except for unaligned writes, since it may required
>> zeroing of partial blocks).
>>
>>
>> Could you please help me understand how we can go about it?
>> I was thinking if we can create uninitialized blocks beyond EOF, so that
>> any parallel read should only read 0 from that area and we may not require
>> exclusive inode_lock. The block creation is anyway protected
>> with i_data_sem in ext4_map_blocks.
> 
> So doing file size changes (i.e., file expansion) without exclusive
> inode_lock would be tricky. I wouldn't really like to go there at least

Ok.

Yes, I am looking into this. I agree that the test case reported here
has the performance problem due to the fact that we take exclusive lock
first and then only when we detect it's an overwrite case we downgrade
it.

I am working over the patch to make the changes you suggested. This will
be of course on top of the new iomap patch series.


> initially. My plan would be to do it similarly to XFS like:
 >
> Do a quick check whether IO is going to grow inode size or is unaligned. If
> yes, mode = exclusive, else mode = shared. Then lock inode rwsem is
> appropriate mode, do ext4_write_checks() (which may find out exclusive lock
> is actually needed so in that case mode = exclusive and restart). Then call
> iomap code to do direct IO.

Got it.

> 
>> I do see that in case of bufferedIO writes, we use ext4_get_block_unwritten
>> for dioread_nolock case. Which should
>> be true for even writes extending beyond EOF. This will
>> create uninitialized and mapped blocks only (even beyond EOF).
>> But all of this happen under exclusive inode_lock() in ext4_file_write_iter.
> 
> I guess this is mostly because we don't bother to check in
> ext4_write_begin() whether we are extending the file or not and filling
> holes needs unwritten extents. Also before DIO reads got changed to be
> under shared inode rwsem, the following was possible:
> 
> CPU1					CPU2
> DIO read from file f offset 4096
>    flush cache
> 					grow 'f' from 4096 to 8192 by write
> 					  blocks get allocated, page cache
> 					  dirtied
>    map blocks, submit IO
>      - reads stale contents

Ok. Got it.

> 
>> Whereas in case of DIO writes extending beyond EOF, we pass
>> EXT4_GET_BLOCKS_CREATE in ext4_map_blocks which may allocate
>> initialized & mapped blocks.
>> Do you know why so?
> 
> Not using unwritten extents is faster if that is safe. So that's why DIO
> code uses them if possible.

Thanks for giving that info to me. So it must be due joining/splitting
of unwritten extents which makes it a bit slow.

> 
>> Do you think we can create uninit. blocks in dioread_nolock AIO/non-AIO DIO
>> writes cases as well with only shared inode lock mode?
>> Do you see any problems with this?
>> Or do you have any other suggestion?
> 
> Not uninit but unwritten. Yes, we can do that. After all e.g. page writeback
> creates extents like this without any inode rwsem protection.

hmm, so as I understand you would like to take this step-by-step.
First let's come up with a patch to solve above mentioned issue with 
overwrites case.
With this we can have some APIs for taking inode lock based
on the shared/exclusive mode flags.

Then as you said, we can submit IO to create unwritten extents for DIO 
writes beyond EOF as well. But that will require some careful handling 
in updating inode size. So that should be a second step we should take.


> 
>> In case of XFS -
>> I do see it promotes the demotes the shared lock (IOLOCK_XXX) in
>> xfs_file_aio_write_checks. But I don't know if after that, whether
>> it only holds the shared lock while doing the DIO IO?
>> And how does it protects against the parallel DIO reads which can
>> potentially expose any stale data?
> 
> XFS protects DIO submission with shared IOLOCK. Stale data exposure is
> solved by using unwritten extents similarly to what ext4 does.

Yes, that's what I was also thinking. But I will have to check the 
locking mechanism which XFS uses in carefully updating the inode size 
while in shared IOLOCK mode while using unwritten extents for DIO writes 
beyond EOF. I see some tricks with i_lock(ILOCK)/i_flags_lock & 
i_rwsem(IOLOCK).

Will check once the 1st step w.r.t overwrite is done.


Thanks for your comments. Appreciate it!!

-ritesh


^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, back to index

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-27  2:05 [PATCH 0/3] Revert parallel dio reads Joseph Qi
2019-08-27  2:05 ` [PATCH 1/3] Revert "ext4: remove EXT4_STATE_DIOREAD_LOCK flag" Joseph Qi
2019-08-27  2:05 ` [PATCH 2/3] Revert "ext4: fix off-by-one error when writing back pages before dio read" Joseph Qi
2019-08-27  2:05 ` [PATCH 3/3] Revert "ext4: Allow parallel DIO reads" Joseph Qi
2019-08-27 11:51 ` [PATCH 0/3] Revert parallel dio reads Dave Chinner
2019-08-29 10:58   ` Jan Kara
2019-08-29 19:06     ` Andreas Dilger
2019-08-30 15:35       ` Jan Kara
2019-09-10 14:10     ` Ritesh Harjani
2019-09-10 21:57       ` Jan Kara
2019-09-11 14:20         ` Ritesh Harjani

Linux-ext4 Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-ext4/0 linux-ext4/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-ext4 linux-ext4/ https://lore.kernel.org/linux-ext4 \
		linux-ext4@vger.kernel.org linux-ext4@archiver.kernel.org
	public-inbox-index linux-ext4


Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-ext4


AGPL code for this site: git clone https://public-inbox.org/ public-inbox