* [RFC][PATCH 1/3] ext4: split ext4_file_write into buffered IO and direct IO
2012-04-28 3:39 [RFC][PATCH 0/3] ext4: dio overwrite nolock Zheng Liu
@ 2012-04-28 3:39 ` Zheng Liu
2012-05-02 4:11 ` Tao Ma
2012-04-28 3:39 ` [RFC][PATCH 2/3] ext4: add a new flag for ext4_map_blocks Zheng Liu
2012-04-28 3:39 ` [RFC][PATCH 3/3] ext4: add dio overwrite nolock Zheng Liu
2 siblings, 1 reply; 9+ messages in thread
From: Zheng Liu @ 2012-04-28 3:39 UTC (permalink / raw)
To: linux-ext4; +Cc: Zheng Liu
From: Zheng Liu <wenqing.lz@taobao.com>
ext4_file_buffered/direct_write are defined in order to split buffered IO and
direct IO in ext4. This patch just refactor some stuff in write path.
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
---
fs/ext4/file.c | 66 +++++++++++++++++++++++++++++++++++++------------------
1 files changed, 44 insertions(+), 22 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index cb70f18..e5d6be3 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -89,12 +89,51 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
return 0;
}
+static inline ssize_t
+ext4_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ return generic_file_aio_write(iocb, iov, nr_segs, pos);
+}
+
+static ssize_t
+ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+ int unaligned_aio = 0;
+ ssize_t ret;
+
+ if (!is_sync_kiocb(iocb))
+ unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
+
+ /* Unaligned direct AIO must be serialized; see comment above */
+ if (unaligned_aio) {
+ static unsigned long unaligned_warn_time;
+
+ /* Warn about this once per day */
+ if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
+ ext4_msg(inode->i_sb, KERN_WARNING,
+ "Unaligned AIO/DIO on inode %ld by %s; "
+ "performance will be poor.",
+ inode->i_ino, current->comm);
+ mutex_lock(ext4_aio_mutex(inode));
+ ext4_aiodio_wait(inode);
+ }
+
+ ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+
+ if (unaligned_aio)
+ mutex_unlock(ext4_aio_mutex(inode));
+
+ return ret;
+}
+
static ssize_t
ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
- int unaligned_aio = 0;
int ret;
/*
@@ -114,29 +153,12 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
sbi->s_bitmap_maxbytes - pos);
}
- } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
- !is_sync_kiocb(iocb))) {
- unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
- }
-
- /* Unaligned direct AIO must be serialized; see comment above */
- if (unaligned_aio) {
- static unsigned long unaligned_warn_time;
-
- /* Warn about this once per day */
- if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
- ext4_msg(inode->i_sb, KERN_WARNING,
- "Unaligned AIO/DIO on inode %ld by %s; "
- "performance will be poor.",
- inode->i_ino, current->comm);
- mutex_lock(ext4_aio_mutex(inode));
- ext4_aiodio_wait(inode);
}
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [RFC][PATCH 1/3] ext4: split ext4_file_write into buffered IO and direct IO
2012-04-28 3:39 ` [RFC][PATCH 1/3] ext4: split ext4_file_write into buffered IO and direct IO Zheng Liu
@ 2012-05-02 4:11 ` Tao Ma
2012-05-02 5:50 ` Zheng Liu
0 siblings, 1 reply; 9+ messages in thread
From: Tao Ma @ 2012-05-02 4:11 UTC (permalink / raw)
To: Zheng Liu; +Cc: linux-ext4
Hi zheng,
On 04/28/2012 11:39 AM, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
>
> ext4_file_buffered/direct_write are defined in order to split buffered IO and
> direct IO in ext4. This patch just refactor some stuff in write path.
>
> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> ---
> fs/ext4/file.c | 66 +++++++++++++++++++++++++++++++++++++------------------
> 1 files changed, 44 insertions(+), 22 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index cb70f18..e5d6be3 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -89,12 +89,51 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
> return 0;
> }
>
> +static inline ssize_t
> +ext4_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
> + unsigned long nr_segs, loff_t pos)
> +{
> + return generic_file_aio_write(iocb, iov, nr_segs, pos);
> +}
any reason you wrap generic_file_aio_write with a new function? I didn't
see you use it in the following patch either.
Thanks
Tao
> +
> +static ssize_t
> +ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> + unsigned long nr_segs, loff_t pos)
> +{
> + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> + int unaligned_aio = 0;
> + ssize_t ret;
> +
> + if (!is_sync_kiocb(iocb))
> + unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> +
> + /* Unaligned direct AIO must be serialized; see comment above */
> + if (unaligned_aio) {
> + static unsigned long unaligned_warn_time;
> +
> + /* Warn about this once per day */
> + if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
> + ext4_msg(inode->i_sb, KERN_WARNING,
> + "Unaligned AIO/DIO on inode %ld by %s; "
> + "performance will be poor.",
> + inode->i_ino, current->comm);
> + mutex_lock(ext4_aio_mutex(inode));
> + ext4_aiodio_wait(inode);
> + }
> +
> + ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> +
> + if (unaligned_aio)
> + mutex_unlock(ext4_aio_mutex(inode));
> +
> + return ret;
> +}
> +
> static ssize_t
> ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
> unsigned long nr_segs, loff_t pos)
> {
> struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> - int unaligned_aio = 0;
> int ret;
>
> /*
> @@ -114,29 +153,12 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
> nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
> sbi->s_bitmap_maxbytes - pos);
> }
> - } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
> - !is_sync_kiocb(iocb))) {
> - unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> - }
> -
> - /* Unaligned direct AIO must be serialized; see comment above */
> - if (unaligned_aio) {
> - static unsigned long unaligned_warn_time;
> -
> - /* Warn about this once per day */
> - if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
> - ext4_msg(inode->i_sb, KERN_WARNING,
> - "Unaligned AIO/DIO on inode %ld by %s; "
> - "performance will be poor.",
> - inode->i_ino, current->comm);
> - mutex_lock(ext4_aio_mutex(inode));
> - ext4_aiodio_wait(inode);
> }
>
> - ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> -
> - if (unaligned_aio)
> - mutex_unlock(ext4_aio_mutex(inode));
> + if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
> + ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
> + else
> + ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
>
> return ret;
> }
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC][PATCH 1/3] ext4: split ext4_file_write into buffered IO and direct IO
2012-05-02 4:11 ` Tao Ma
@ 2012-05-02 5:50 ` Zheng Liu
0 siblings, 0 replies; 9+ messages in thread
From: Zheng Liu @ 2012-05-02 5:50 UTC (permalink / raw)
To: Tao Ma; +Cc: linux-ext4
On Wed, May 02, 2012 at 12:11:44PM +0800, Tao Ma wrote:
> Hi zheng,
> On 04/28/2012 11:39 AM, Zheng Liu wrote:
> > From: Zheng Liu <wenqing.lz@taobao.com>
> >
> > ext4_file_buffered/direct_write are defined in order to split buffered IO and
> > direct IO in ext4. This patch just refactor some stuff in write path.
> >
> > Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> > ---
> > fs/ext4/file.c | 66 +++++++++++++++++++++++++++++++++++++------------------
> > 1 files changed, 44 insertions(+), 22 deletions(-)
> >
> > diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> > index cb70f18..e5d6be3 100644
> > --- a/fs/ext4/file.c
> > +++ b/fs/ext4/file.c
> > @@ -89,12 +89,51 @@ ext4_unaligned_aio(struct inode *inode, const struct iovec *iov,
> > return 0;
> > }
> >
> > +static inline ssize_t
> > +ext4_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
> > + unsigned long nr_segs, loff_t pos)
> > +{
> > + return generic_file_aio_write(iocb, iov, nr_segs, pos);
> > +}
> any reason you wrap generic_file_aio_write with a new function? I didn't
> see you use it in the following patch either.
Yes, I don't use it in the following patch. It is defined in order to
be consistent with ext4_file_dio_write and make things clearly.
Regards,
Zheng
>
> Thanks
> Tao
> > +
> > +static ssize_t
> > +ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> > + unsigned long nr_segs, loff_t pos)
> > +{
> > + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> > + int unaligned_aio = 0;
> > + ssize_t ret;
> > +
> > + if (!is_sync_kiocb(iocb))
> > + unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> > +
> > + /* Unaligned direct AIO must be serialized; see comment above */
> > + if (unaligned_aio) {
> > + static unsigned long unaligned_warn_time;
> > +
> > + /* Warn about this once per day */
> > + if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
> > + ext4_msg(inode->i_sb, KERN_WARNING,
> > + "Unaligned AIO/DIO on inode %ld by %s; "
> > + "performance will be poor.",
> > + inode->i_ino, current->comm);
> > + mutex_lock(ext4_aio_mutex(inode));
> > + ext4_aiodio_wait(inode);
> > + }
> > +
> > + ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> > +
> > + if (unaligned_aio)
> > + mutex_unlock(ext4_aio_mutex(inode));
> > +
> > + return ret;
> > +}
> > +
> > static ssize_t
> > ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
> > unsigned long nr_segs, loff_t pos)
> > {
> > struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> > - int unaligned_aio = 0;
> > int ret;
> >
> > /*
> > @@ -114,29 +153,12 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
> > nr_segs = iov_shorten((struct iovec *)iov, nr_segs,
> > sbi->s_bitmap_maxbytes - pos);
> > }
> > - } else if (unlikely((iocb->ki_filp->f_flags & O_DIRECT) &&
> > - !is_sync_kiocb(iocb))) {
> > - unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> > - }
> > -
> > - /* Unaligned direct AIO must be serialized; see comment above */
> > - if (unaligned_aio) {
> > - static unsigned long unaligned_warn_time;
> > -
> > - /* Warn about this once per day */
> > - if (printk_timed_ratelimit(&unaligned_warn_time, 60*60*24*HZ))
> > - ext4_msg(inode->i_sb, KERN_WARNING,
> > - "Unaligned AIO/DIO on inode %ld by %s; "
> > - "performance will be poor.",
> > - inode->i_ino, current->comm);
> > - mutex_lock(ext4_aio_mutex(inode));
> > - ext4_aiodio_wait(inode);
> > }
> >
> > - ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> > -
> > - if (unaligned_aio)
> > - mutex_unlock(ext4_aio_mutex(inode));
> > + if (unlikely(iocb->ki_filp->f_flags & O_DIRECT))
> > + ret = ext4_file_dio_write(iocb, iov, nr_segs, pos);
> > + else
> > + ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> >
> > return ret;
> > }
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* [RFC][PATCH 2/3] ext4: add a new flag for ext4_map_blocks
2012-04-28 3:39 [RFC][PATCH 0/3] ext4: dio overwrite nolock Zheng Liu
2012-04-28 3:39 ` [RFC][PATCH 1/3] ext4: split ext4_file_write into buffered IO and direct IO Zheng Liu
@ 2012-04-28 3:39 ` Zheng Liu
2012-04-28 3:39 ` [RFC][PATCH 3/3] ext4: add dio overwrite nolock Zheng Liu
2 siblings, 0 replies; 9+ messages in thread
From: Zheng Liu @ 2012-04-28 3:39 UTC (permalink / raw)
To: linux-ext4; +Cc: Zheng Liu
From: Zheng Liu <wenqing.lz@taobao.com>
EXT4_GET_BLOCKS_NO_LOCK flag is added to indicate that we don't need to acquire
i_data_sem lock in ext4_map_blocks. Meanwhile, it lets _ext4_get_block do not
start a new journal because when we do a overwrite dio, there is no any
metadata that needs to be modified.
We define a new function called ext4_get_block_write_nolock, which is used in
dio overwrite nolock. In this function, it doesn't try to acquire i_data_sem
lock and doesn't start a new journal as it does a lookup.
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
---
fs/ext4/ext4.h | 2 ++
fs/ext4/inode.c | 46 +++++++++++++++++++++++++++++++++++-----------
2 files changed, 37 insertions(+), 11 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0e01e90..748fda5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -558,6 +558,8 @@ enum {
#define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040
/* Request will not result in inode size update (user for fallocate) */
#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080
+ /* Do not acquire i_data_sem lock */
+#define EXT4_GET_BLOCKS_NO_LOCK 0x0100
/*
* Flags used by ext4_free_blocks
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c77b0bd..c762d4f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -477,7 +477,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* Try to see if we can get the block without requesting a new
* file system block.
*/
- down_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ down_read((&EXT4_I(inode)->i_data_sem));
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
retval = ext4_ext_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -485,7 +486,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
retval = ext4_ind_map_blocks(handle, inode, map, flags &
EXT4_GET_BLOCKS_KEEP_SIZE);
}
- up_read((&EXT4_I(inode)->i_data_sem));
+ if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
+ up_read((&EXT4_I(inode)->i_data_sem));
if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
int ret = check_block_validity(inode, map);
@@ -597,7 +599,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
map.m_lblk = iblock;
map.m_len = bh->b_size >> inode->i_blkbits;
- if (flags && !handle) {
+ if (flags && !handle && !(flags & EXT4_GET_BLOCKS_NO_LOCK)) {
/* Direct IO write... */
if (map.m_len > DIO_MAX_BLOCKS)
map.m_len = DIO_MAX_BLOCKS;
@@ -2751,6 +2753,15 @@ static int ext4_get_block_write(struct inode *inode, sector_t iblock,
EXT4_GET_BLOCKS_IO_CREATE_EXT);
}
+static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
+ inode->i_ino, create);
+ return _ext4_get_block(inode, iblock, bh_result,
+ EXT4_GET_BLOCKS_NO_LOCK);
+}
+
static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
ssize_t size, void *private, int ret,
bool is_async)
@@ -2899,6 +2910,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
loff_t final_size = offset + count;
if (rw == WRITE && final_size <= inode->i_size) {
+ int overwrite = 0;
+
/*
* We could direct write to holes and fallocate.
*
@@ -2919,6 +2932,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
* Here for async case, we allocate an io_end structure to
* hook to the iocb.
*/
+ if (iocb->private)
+ overwrite = *((int *)iocb->private);
iocb->private = NULL;
EXT4_I(inode)->cur_aio_dio = NULL;
if (!is_sync_kiocb(iocb)) {
@@ -2938,13 +2953,22 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
EXT4_I(inode)->cur_aio_dio = iocb->private;
}
- ret = __blockdev_direct_IO(rw, iocb, inode,
- inode->i_sb->s_bdev, iov,
- offset, nr_segs,
- ext4_get_block_write,
- ext4_end_io_dio,
- NULL,
- DIO_LOCKING);
+ if (overwrite)
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_write_nolock,
+ ext4_end_io_dio,
+ NULL,
+ 0);
+ else
+ ret = __blockdev_direct_IO(rw, iocb, inode,
+ inode->i_sb->s_bdev, iov,
+ offset, nr_segs,
+ ext4_get_block_write,
+ ext4_end_io_dio,
+ NULL,
+ DIO_LOCKING);
if (iocb->private)
EXT4_I(inode)->cur_aio_dio = NULL;
/*
@@ -2964,7 +2988,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) {
ext4_free_io_end(iocb->private);
iocb->private = NULL;
- } else if (ret > 0 && ext4_test_inode_state(inode,
+ } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
EXT4_STATE_DIO_UNWRITTEN)) {
int err;
/*
--
1.7.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [RFC][PATCH 3/3] ext4: add dio overwrite nolock
2012-04-28 3:39 [RFC][PATCH 0/3] ext4: dio overwrite nolock Zheng Liu
2012-04-28 3:39 ` [RFC][PATCH 1/3] ext4: split ext4_file_write into buffered IO and direct IO Zheng Liu
2012-04-28 3:39 ` [RFC][PATCH 2/3] ext4: add a new flag for ext4_map_blocks Zheng Liu
@ 2012-04-28 3:39 ` Zheng Liu
2012-05-02 6:59 ` Tao Ma
2012-05-02 15:05 ` Eric Sandeen
2 siblings, 2 replies; 9+ messages in thread
From: Zheng Liu @ 2012-04-28 3:39 UTC (permalink / raw)
To: linux-ext4; +Cc: Zheng Liu
From: Zheng Liu <wenqing.lz@taobao.com>
Aligned and overwrite direct IO can be parallelized. In ext4_file_dio_write,
we first check whether these conditions are satisfied or not. If so, we unlock
the i_mutex and acquire i_data_sem directly. Meanwhile iocb->private is set to
indicate that this is a overwrite dio, and it will be processed in
ext4_ext_direct_IO.
Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
---
fs/ext4/file.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 files changed, 137 insertions(+), 3 deletions(-)
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index e5d6be3..8a5f713 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -100,9 +100,21 @@ static ssize_t
ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
- struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
- int unaligned_aio = 0;
+ struct file *file = iocb->ki_filp;
+ struct address_space * mapping = file->f_mapping;
+ struct inode *inode = file->f_path.dentry->d_inode;
+ struct blk_plug plug;
ssize_t ret;
+ ssize_t written, written_buffered;
+ size_t length = iov_length(iov, nr_segs);
+ size_t ocount; /* original count */
+ size_t count; /* after file limit checks */
+ int unaligned_aio = 0;
+ int overwrite = 0;
+ loff_t *ppos = &iocb->ki_pos;
+ loff_t endbyte;
+
+ BUG_ON(iocb->ki_pos != pos);
if (!is_sync_kiocb(iocb))
unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
@@ -121,7 +133,129 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
ext4_aiodio_wait(inode);
}
- ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ mutex_lock(&inode->i_mutex);
+ blk_start_plug(&plug);
+
+ ocount = 0;
+ ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+ if (ret)
+ goto unlock_out;
+
+ count = ocount;
+ pos = *ppos;
+
+ vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = mapping->backing_dev_info;
+ written = 0;
+
+ ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
+ if (ret)
+ goto out;
+
+ if (count == 0)
+ goto out;
+
+ ret = file_remove_suid(file);
+ if (ret)
+ goto out;
+
+ file_update_time(file);
+
+ iocb->private = NULL;
+
+ if (!unaligned_aio && !file->f_mapping->nrpages &&
+ pos + length < i_size_read(inode) &&
+ ext4_should_dioread_nolock(inode)) {
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ int err;
+ int len;
+
+ map.m_lblk = pos >> blkbits;
+ map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
+ - map.m_lblk;
+ len = map.m_len;
+
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err == len && (!map.m_flags ||
+ map.m_flags & EXT4_MAP_MAPPED)) {
+ overwrite = 1;
+ iocb->private = &overwrite;
+ mutex_unlock(&inode->i_mutex);
+ down_read(&EXT4_I(inode)->i_data_sem);
+ }
+ }
+
+ if (file->f_mapping->nrpages && overwrite) {
+ overwrite = 0;
+ up_read(&EXT4_I(inode)->i_data_sem);
+ mutex_lock(&inode->i_mutex);
+ }
+
+ written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
+ ppos, count, ocount);
+ if (written < 0 || written == count)
+ goto out;
+ /*
+ * direct-io write to a hole: fall through to buffered I/O
+ * for completing the rest of the request.
+ */
+ pos += written;
+ count -= written;
+ written_buffered = generic_file_buffered_write(iocb, iov,
+ nr_segs, pos, ppos, count,
+ written);
+ /*
+ * If generic_file_buffered_write() retuned a synchronous error
+ * then we want to return the number of bytes which were
+ * direct-written, or the error code if that was zero. Note
+ * that this differs from normal direct-io semantics, which
+ * will return -EFOO even if some bytes were written.
+ */
+ if (written_buffered < 0) {
+ ret = written_buffered;
+ goto out;
+ }
+
+ /*
+ * We need to ensure that the page cache pages are written to
+ * disk and invalidated to preserve the expected O_DIRECT
+ * semantics.
+ */
+ endbyte = pos + written_buffered - written - 1;
+ ret = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
+ if (ret == 0) {
+ written = written_buffered;
+ invalidate_mapping_pages(mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+ } else {
+ /*
+ * We don't know how much we wrote, so just return
+ * the number of bytes which were direct-written
+ */
+ }
+
+out:
+ current->backing_dev_info = NULL;
+ ret = written ? written : ret;
+
+unlock_out:
+ if (overwrite)
+ up_read(&EXT4_I(inode)->i_data_sem);
+ else
+ mutex_unlock(&inode->i_mutex);
+
+ if (ret > 0 || ret == -EIOCBQUEUED) {
+ ssize_t err;
+
+ err = generic_write_sync(file, pos, ret);
+ if (err < 0 && ret > 0)
+ ret = err;
+ }
+ blk_finish_plug(&plug);
if (unaligned_aio)
mutex_unlock(ext4_aio_mutex(inode));
--
1.7.1
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [RFC][PATCH 3/3] ext4: add dio overwrite nolock
2012-04-28 3:39 ` [RFC][PATCH 3/3] ext4: add dio overwrite nolock Zheng Liu
@ 2012-05-02 6:59 ` Tao Ma
2012-05-02 8:16 ` Zheng Liu
2012-05-02 15:05 ` Eric Sandeen
1 sibling, 1 reply; 9+ messages in thread
From: Tao Ma @ 2012-05-02 6:59 UTC (permalink / raw)
To: Zheng Liu; +Cc: linux-ext4
On 04/28/2012 11:39 AM, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
>
> Aligned and overwrite direct IO can be parallelized. In ext4_file_dio_write,
> we first check whether these conditions are satisfied or not. If so, we unlock
> the i_mutex and acquire i_data_sem directly. Meanwhile iocb->private is set to
> indicate that this is a overwrite dio, and it will be processed in
> ext4_ext_direct_IO.
>
> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> ---
> fs/ext4/file.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 137 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index e5d6be3..8a5f713 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -100,9 +100,21 @@ static ssize_t
> ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> unsigned long nr_segs, loff_t pos)
> {
> - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> - int unaligned_aio = 0;
> + struct file *file = iocb->ki_filp;
> + struct address_space * mapping = file->f_mapping;
> + struct inode *inode = file->f_path.dentry->d_inode;
> + struct blk_plug plug;
> ssize_t ret;
> + ssize_t written, written_buffered;
> + size_t length = iov_length(iov, nr_segs);
> + size_t ocount; /* original count */
> + size_t count; /* after file limit checks */
> + int unaligned_aio = 0;
> + int overwrite = 0;
> + loff_t *ppos = &iocb->ki_pos;
> + loff_t endbyte;
> +
> + BUG_ON(iocb->ki_pos != pos);
>
> if (!is_sync_kiocb(iocb))
> unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> @@ -121,7 +133,129 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> ext4_aiodio_wait(inode);
> }
>
> - ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> + mutex_lock(&inode->i_mutex);
> + blk_start_plug(&plug);
> +
> + ocount = 0;
> + ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
> + if (ret)
> + goto unlock_out;
> +
> + count = ocount;
> + pos = *ppos;
> +
> + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> +
> + /* We can write back this queue in page reclaim */
> + current->backing_dev_info = mapping->backing_dev_info;
> + written = 0;
> +
> + ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
> + if (ret)
> + goto out;
> +
> + if (count == 0)
> + goto out;
> +
> + ret = file_remove_suid(file);
> + if (ret)
> + goto out;
> +
> + file_update_time(file);
> +
> + iocb->private = NULL;
> +
> + if (!unaligned_aio && !file->f_mapping->nrpages &&
> + pos + length < i_size_read(inode) &&
should be pos + length <= ?
And inode->i_size should be ok since now we have i_mutex held.
> + ext4_should_dioread_nolock(inode)) {
> + struct ext4_map_blocks map;
> + unsigned int blkbits = inode->i_blkbits;
> + int err;
> + int len;
> +
> + map.m_lblk = pos >> blkbits;
> + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
> + - map.m_lblk;
> + len = map.m_len;
> +
> + err = ext4_map_blocks(NULL, inode, &map, 0);
> + if (err == len && (!map.m_flags ||
> + map.m_flags & EXT4_MAP_MAPPED)) {
could you please add some comments about how and why map.m_flags are
checked this way?
> + overwrite = 1;
> + iocb->private = &overwrite;
> + mutex_unlock(&inode->i_mutex);
> + down_read(&EXT4_I(inode)->i_data_sem);
Is there any possibility that the metadata is changed after we dropped
the i_mutex before the down_read?
> + }
> + }
> +
> + if (file->f_mapping->nrpages && overwrite) {
> + overwrite = 0;
> + up_read(&EXT4_I(inode)->i_data_sem);
> + mutex_lock(&inode->i_mutex);
I am not sure whether it could happen. But if it does happen, should we
also change the value in iocb->private?
> + }
> +
> + written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
> + ppos, count, ocount);
> + if (written < 0 || written == count)
> + goto out;
> + /*
> + * direct-io write to a hole: fall through to buffered I/O
> + * for completing the rest of the request.
> + */
> + pos += written;
> + count -= written;
> + written_buffered = generic_file_buffered_write(iocb, iov,
> + nr_segs, pos, ppos, count,
> + written);
If we fall back here, should we re-lock the i_mutex since the buffer
write isn't guaranteed?
Thanks
Tao
> + /*
> + * If generic_file_buffered_write() retuned a synchronous error
> + * then we want to return the number of bytes which were
> + * direct-written, or the error code if that was zero. Note
> + * that this differs from normal direct-io semantics, which
> + * will return -EFOO even if some bytes were written.
> + */
> + if (written_buffered < 0) {
> + ret = written_buffered;
> + goto out;
> + }
> +
> + /*
> + * We need to ensure that the page cache pages are written to
> + * disk and invalidated to preserve the expected O_DIRECT
> + * semantics.
> + */
> + endbyte = pos + written_buffered - written - 1;
> + ret = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
> + if (ret == 0) {
> + written = written_buffered;
> + invalidate_mapping_pages(mapping,
> + pos >> PAGE_CACHE_SHIFT,
> + endbyte >> PAGE_CACHE_SHIFT);
> + } else {
> + /*
> + * We don't know how much we wrote, so just return
> + * the number of bytes which were direct-written
> + */
> + }
> +
> +out:
> + current->backing_dev_info = NULL;
> + ret = written ? written : ret;
> +
> +unlock_out:
> + if (overwrite)
> + up_read(&EXT4_I(inode)->i_data_sem);
> + else
> + mutex_unlock(&inode->i_mutex);
> +
> + if (ret > 0 || ret == -EIOCBQUEUED) {
> + ssize_t err;
> +
> + err = generic_write_sync(file, pos, ret);
> + if (err < 0 && ret > 0)
> + ret = err;
> + }
> + blk_finish_plug(&plug);
>
> if (unaligned_aio)
> mutex_unlock(ext4_aio_mutex(inode));
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC][PATCH 3/3] ext4: add dio overwrite nolock
2012-05-02 6:59 ` Tao Ma
@ 2012-05-02 8:16 ` Zheng Liu
0 siblings, 0 replies; 9+ messages in thread
From: Zheng Liu @ 2012-05-02 8:16 UTC (permalink / raw)
To: Tao Ma; +Cc: linux-ext4
On Wed, May 02, 2012 at 02:59:34PM +0800, Tao Ma wrote:
> On 04/28/2012 11:39 AM, Zheng Liu wrote:
> > From: Zheng Liu <wenqing.lz@taobao.com>
> >
> > Aligned and overwrite direct IO can be parallelized. In ext4_file_dio_write,
> > we first check whether these conditions are satisfied or not. If so, we unlock
> > the i_mutex and acquire i_data_sem directly. Meanwhile iocb->private is set to
> > indicate that this is a overwrite dio, and it will be processed in
> > ext4_ext_direct_IO.
> >
> > Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> > ---
> > fs/ext4/file.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> > 1 files changed, 137 insertions(+), 3 deletions(-)
> >
> > diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> > index e5d6be3..8a5f713 100644
> > --- a/fs/ext4/file.c
> > +++ b/fs/ext4/file.c
> > @@ -100,9 +100,21 @@ static ssize_t
> > ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> > unsigned long nr_segs, loff_t pos)
> > {
> > - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> > - int unaligned_aio = 0;
> > + struct file *file = iocb->ki_filp;
> > + struct address_space * mapping = file->f_mapping;
> > + struct inode *inode = file->f_path.dentry->d_inode;
> > + struct blk_plug plug;
> > ssize_t ret;
> > + ssize_t written, written_buffered;
> > + size_t length = iov_length(iov, nr_segs);
> > + size_t ocount; /* original count */
> > + size_t count; /* after file limit checks */
> > + int unaligned_aio = 0;
> > + int overwrite = 0;
> > + loff_t *ppos = &iocb->ki_pos;
> > + loff_t endbyte;
> > +
> > + BUG_ON(iocb->ki_pos != pos);
> >
> > if (!is_sync_kiocb(iocb))
> > unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> > @@ -121,7 +133,129 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> > ext4_aiodio_wait(inode);
> > }
> >
> > - ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> > + mutex_lock(&inode->i_mutex);
> > + blk_start_plug(&plug);
> > +
> > + ocount = 0;
> > + ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
> > + if (ret)
> > + goto unlock_out;
> > +
> > + count = ocount;
> > + pos = *ppos;
> > +
> > + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> > +
> > + /* We can write back this queue in page reclaim */
> > + current->backing_dev_info = mapping->backing_dev_info;
> > + written = 0;
> > +
> > + ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
> > + if (ret)
> > + goto out;
> > +
> > + if (count == 0)
> > + goto out;
> > +
> > + ret = file_remove_suid(file);
> > + if (ret)
> > + goto out;
> > +
> > + file_update_time(file);
> > +
> > + iocb->private = NULL;
> > +
> > + if (!unaligned_aio && !file->f_mapping->nrpages &&
> > + pos + length < i_size_read(inode) &&
> should be pos + length <= ?
> And inode->i_size should be ok since now we have i_mutex held.
Yes, you are right.
> > + ext4_should_dioread_nolock(inode)) {
> > + struct ext4_map_blocks map;
> > + unsigned int blkbits = inode->i_blkbits;
> > + int err;
> > + int len;
> > +
> > + map.m_lblk = pos >> blkbits;
> > + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
> > + - map.m_lblk;
> > + len = map.m_len;
> > +
> > + err = ext4_map_blocks(NULL, inode, &map, 0);
> > + if (err == len && (!map.m_flags ||
> > + map.m_flags & EXT4_MAP_MAPPED)) {
> could you please add some comments about how and why map.m_flags are
> checked this way?
OK. I will add some comments to describe it in here.
> > + overwrite = 1;
> > + iocb->private = &overwrite;
> > + mutex_unlock(&inode->i_mutex);
> > + down_read(&EXT4_I(inode)->i_data_sem);
> Is there any possibility that the metadata is changed after we dropped
> the i_mutex before the down_read?
Yes, the metadata is possible to be changed after we unlocked i_mutex
before acquire i_data_sem. So I will swap the locking order.
> > + }
> > + }
> > +
> > + if (file->f_mapping->nrpages && overwrite) {
> > + overwrite = 0;
> > + up_read(&EXT4_I(inode)->i_data_sem);
> > + mutex_lock(&inode->i_mutex);
> I am not sure whether it could happen. But if it does happen, should we
> also change the value in iocb->private?
As I said above, if we swap the locking order, I think that it shouldn't
happen. Certainly, I will set 'iocb->private = NULL' to fix it to
ensure that when it does happen, we can make filesystem do right things.
> > + }
> > +
> > + written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
> > + ppos, count, ocount);
> > + if (written < 0 || written == count)
> > + goto out;
> > + /*
> > + * direct-io write to a hole: fall through to buffered I/O
> > + * for completing the rest of the request.
> > + */
> > + pos += written;
> > + count -= written;
> > + written_buffered = generic_file_buffered_write(iocb, iov,
> > + nr_segs, pos, ppos, count,
> > + written);
> If we fall back here, should we re-lock the i_mutex since the buffer
> write isn't guaranteed?
No, we don't need to re-lock i_mutex because dio never falls through to
buffered IO when it is an overwrite. We do a lookup using
ext4_map_blocks to ensure that it never occurs before we actually issue
a dio. I will add a BUG_ON to guarantee that it couldn't happen.
Regards,
Zheng
>
> Thanks
> Tao
> > + /*
> > + * If generic_file_buffered_write() retuned a synchronous error
> > + * then we want to return the number of bytes which were
> > + * direct-written, or the error code if that was zero. Note
> > + * that this differs from normal direct-io semantics, which
> > + * will return -EFOO even if some bytes were written.
> > + */
> > + if (written_buffered < 0) {
> > + ret = written_buffered;
> > + goto out;
> > + }
> > +
> > + /*
> > + * We need to ensure that the page cache pages are written to
> > + * disk and invalidated to preserve the expected O_DIRECT
> > + * semantics.
> > + */
> > + endbyte = pos + written_buffered - written - 1;
> > + ret = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
> > + if (ret == 0) {
> > + written = written_buffered;
> > + invalidate_mapping_pages(mapping,
> > + pos >> PAGE_CACHE_SHIFT,
> > + endbyte >> PAGE_CACHE_SHIFT);
> > + } else {
> > + /*
> > + * We don't know how much we wrote, so just return
> > + * the number of bytes which were direct-written
> > + */
> > + }
> > +
> > +out:
> > + current->backing_dev_info = NULL;
> > + ret = written ? written : ret;
> > +
> > +unlock_out:
> > + if (overwrite)
> > + up_read(&EXT4_I(inode)->i_data_sem);
> > + else
> > + mutex_unlock(&inode->i_mutex);
> > +
> > + if (ret > 0 || ret == -EIOCBQUEUED) {
> > + ssize_t err;
> > +
> > + err = generic_write_sync(file, pos, ret);
> > + if (err < 0 && ret > 0)
> > + ret = err;
> > + }
> > + blk_finish_plug(&plug);
> >
> > if (unaligned_aio)
> > mutex_unlock(ext4_aio_mutex(inode));
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [RFC][PATCH 3/3] ext4: add dio overwrite nolock
2012-04-28 3:39 ` [RFC][PATCH 3/3] ext4: add dio overwrite nolock Zheng Liu
2012-05-02 6:59 ` Tao Ma
@ 2012-05-02 15:05 ` Eric Sandeen
1 sibling, 0 replies; 9+ messages in thread
From: Eric Sandeen @ 2012-05-02 15:05 UTC (permalink / raw)
To: Zheng Liu; +Cc: linux-ext4, Zheng Liu
On 4/27/12 10:39 PM, Zheng Liu wrote:
> From: Zheng Liu <wenqing.lz@taobao.com>
>
> Aligned and overwrite direct IO can be parallelized. In ext4_file_dio_write,
> we first check whether these conditions are satisfied or not. If so, we unlock
> the i_mutex and acquire i_data_sem directly. Meanwhile iocb->private is set to
> indicate that this is a overwrite dio, and it will be processed in
> ext4_ext_direct_IO.
This copies almost 100 lines of generic_file_aio_write() back into
ext4. Do we really need to do this? Copying core code into the
fs can be a maintenance nightmare...
I'll have to think more about the big picture and whether or not it's
possible, but my first reaction is to find a way to leverage or modify
existing IO code rather than pasting it all into ext4 with changes...
-Eric
> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com>
> ---
> fs/ext4/file.c | 140 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-
> 1 files changed, 137 insertions(+), 3 deletions(-)
>
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index e5d6be3..8a5f713 100644
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -100,9 +100,21 @@ static ssize_t
> ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> unsigned long nr_segs, loff_t pos)
> {
> - struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
> - int unaligned_aio = 0;
> + struct file *file = iocb->ki_filp;
> + struct address_space * mapping = file->f_mapping;
> + struct inode *inode = file->f_path.dentry->d_inode;
> + struct blk_plug plug;
> ssize_t ret;
> + ssize_t written, written_buffered;
> + size_t length = iov_length(iov, nr_segs);
> + size_t ocount; /* original count */
> + size_t count; /* after file limit checks */
> + int unaligned_aio = 0;
> + int overwrite = 0;
> + loff_t *ppos = &iocb->ki_pos;
> + loff_t endbyte;
> +
> + BUG_ON(iocb->ki_pos != pos);
>
> if (!is_sync_kiocb(iocb))
> unaligned_aio = ext4_unaligned_aio(inode, iov, nr_segs, pos);
> @@ -121,7 +133,129 @@ ext4_file_dio_write(struct kiocb *iocb, const struct iovec *iov,
> ext4_aiodio_wait(inode);
> }
>
> - ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
> + mutex_lock(&inode->i_mutex);
> + blk_start_plug(&plug);
> +
> + ocount = 0;
> + ret = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
> + if (ret)
> + goto unlock_out;
> +
> + count = ocount;
> + pos = *ppos;
> +
> + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
> +
> + /* We can write back this queue in page reclaim */
> + current->backing_dev_info = mapping->backing_dev_info;
> + written = 0;
> +
> + ret = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
> + if (ret)
> + goto out;
> +
> + if (count == 0)
> + goto out;
> +
> + ret = file_remove_suid(file);
> + if (ret)
> + goto out;
> +
> + file_update_time(file);
> +
> + iocb->private = NULL;
> +
> + if (!unaligned_aio && !file->f_mapping->nrpages &&
> + pos + length < i_size_read(inode) &&
> + ext4_should_dioread_nolock(inode)) {
> + struct ext4_map_blocks map;
> + unsigned int blkbits = inode->i_blkbits;
> + int err;
> + int len;
> +
> + map.m_lblk = pos >> blkbits;
> + map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
> + - map.m_lblk;
> + len = map.m_len;
> +
> + err = ext4_map_blocks(NULL, inode, &map, 0);
> + if (err == len && (!map.m_flags ||
> + map.m_flags & EXT4_MAP_MAPPED)) {
> + overwrite = 1;
> + iocb->private = &overwrite;
> + mutex_unlock(&inode->i_mutex);
> + down_read(&EXT4_I(inode)->i_data_sem);
> + }
> + }
> +
> + if (file->f_mapping->nrpages && overwrite) {
> + overwrite = 0;
> + up_read(&EXT4_I(inode)->i_data_sem);
> + mutex_lock(&inode->i_mutex);
> + }
> +
> + written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
> + ppos, count, ocount);
> + if (written < 0 || written == count)
> + goto out;
> + /*
> + * direct-io write to a hole: fall through to buffered I/O
> + * for completing the rest of the request.
> + */
> + pos += written;
> + count -= written;
> + written_buffered = generic_file_buffered_write(iocb, iov,
> + nr_segs, pos, ppos, count,
> + written);
> + /*
> + * If generic_file_buffered_write() retuned a synchronous error
> + * then we want to return the number of bytes which were
> + * direct-written, or the error code if that was zero. Note
> + * that this differs from normal direct-io semantics, which
> + * will return -EFOO even if some bytes were written.
> + */
> + if (written_buffered < 0) {
> + ret = written_buffered;
> + goto out;
> + }
> +
> + /*
> + * We need to ensure that the page cache pages are written to
> + * disk and invalidated to preserve the expected O_DIRECT
> + * semantics.
> + */
> + endbyte = pos + written_buffered - written - 1;
> + ret = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
> + if (ret == 0) {
> + written = written_buffered;
> + invalidate_mapping_pages(mapping,
> + pos >> PAGE_CACHE_SHIFT,
> + endbyte >> PAGE_CACHE_SHIFT);
> + } else {
> + /*
> + * We don't know how much we wrote, so just return
> + * the number of bytes which were direct-written
> + */
> + }
> +
> +out:
> + current->backing_dev_info = NULL;
> + ret = written ? written : ret;
> +
> +unlock_out:
> + if (overwrite)
> + up_read(&EXT4_I(inode)->i_data_sem);
> + else
> + mutex_unlock(&inode->i_mutex);
> +
> + if (ret > 0 || ret == -EIOCBQUEUED) {
> + ssize_t err;
> +
> + err = generic_write_sync(file, pos, ret);
> + if (err < 0 && ret > 0)
> + ret = err;
> + }
> + blk_finish_plug(&plug);
>
> if (unaligned_aio)
> mutex_unlock(ext4_aio_mutex(inode));
^ permalink raw reply [flat|nested] 9+ messages in thread