All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH 1/2] Btrfs: add noi_version option to disable MS_I_VERSION
@ 2015-06-09 12:04 Liu Bo
  2015-06-09 12:04 ` [RFC PATCH 2/2] Btrfs: improve fsync for nocow file Liu Bo
  0 siblings, 1 reply; 7+ messages in thread
From: Liu Bo @ 2015-06-09 12:04 UTC (permalink / raw)
  To: linux-btrfs

MS_I_VERSION is enabled by default for btrfs, this adds an alternative
option to toggle it off.

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/super.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f2c9f9d..c81a3f1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -324,7 +324,7 @@ enum {
 	Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
 	Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
 	Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
-	Opt_datasum, Opt_treelog, Opt_noinode_cache,
+	Opt_datasum, Opt_treelog, Opt_noinode_cache, Opt_noi_version,
 	Opt_err,
 };
 
@@ -351,6 +351,7 @@ static match_table_t tokens = {
 	{Opt_nossd, "nossd"},
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
+	{Opt_noi_version, "noi_version"},
 	{Opt_notreelog, "notreelog"},
 	{Opt_treelog, "treelog"},
 	{Opt_flushoncommit, "flushoncommit"},
@@ -593,6 +594,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_noacl:
 			root->fs_info->sb->s_flags &= ~MS_POSIXACL;
 			break;
+		case Opt_noi_version:
+			root->fs_info->sb->s_flags &= ~MS_I_VERSION;
+			btrfs_info(root->fs_info, "disable i_version");
+			break;
 		case Opt_notreelog:
 			btrfs_set_and_info(root, NOTREELOG,
 					   "disabling tree log");
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [RFC PATCH 2/2] Btrfs: improve fsync for nocow file
  2015-06-09 12:04 [RFC PATCH 1/2] Btrfs: add noi_version option to disable MS_I_VERSION Liu Bo
@ 2015-06-09 12:04 ` Liu Bo
  2015-06-09 12:56   ` Filipe David Manana
  0 siblings, 1 reply; 7+ messages in thread
From: Liu Bo @ 2015-06-09 12:04 UTC (permalink / raw)
  To: linux-btrfs

If  we're overwriting an allocated file without changing timestamp
and inode version, and the file is with NODATACOW, we don't have any metadata to
commit, thus we can just flush the data device cache and go forward.

However, if there's have any change on extents' disk bytenr, inode size,
timestamp or inode version, we need to go through the normal btrfs_log_inode
path.

Test:
----------------------------
1. sysbench test of
"1 file + 1 thread + bs=4k + size=40k + synchronous I/O mode + randomwrite +
fsync_on_each_write",
2. loop device associated with tmpfs file
3.
  - For btrfs, "-o nodatacow" and "-o noi_version" option
  - For ext4 and xfs, no extra mount options
----------------------------

Results:
----------------------------
- btrfs:
w/o: ~30Mb/sec
w:   ~181Mb/sec

- other filesystems: (both don't enable i_version by default)
ext4:  203Mb/sec
xfs:   212Mb/sec
----------------------------

Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
---
 fs/btrfs/btrfs_inode.h |  2 ++
 fs/btrfs/disk-io.c     |  2 +-
 fs/btrfs/disk-io.h     |  1 +
 fs/btrfs/file.c        | 39 ++++++++++++++++++++++++++++++++++-----
 fs/btrfs/inode.c       |  3 +++
 5 files changed, 41 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0ef5cc1..b36d87a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -44,6 +44,8 @@
 #define BTRFS_INODE_IN_DELALLOC_LIST		9
 #define BTRFS_INODE_READDIO_NEED_LOCK		10
 #define BTRFS_INODE_HAS_PROPS		        11
+#define BTRFS_INODE_NOTIMESTAMP			12
+#define BTRFS_INODE_NOISIZE			13
 /*
  * The following 3 bits are meant only for the btree inode.
  * When any of them is set, it means an error happened while writing an
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 2ef9a4b..de7fd94 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3343,7 +3343,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
  * send an empty flush down to each device in parallel,
  * then wait for them
  */
-static int barrier_all_devices(struct btrfs_fs_info *info)
+int barrier_all_devices(struct btrfs_fs_info *info)
 {
 	struct list_head *head;
 	struct btrfs_device *dev;
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index d4cbfee..2bc91fe 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -60,6 +60,7 @@ void close_ctree(struct btrfs_root *root);
 int write_ctree_super(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root, int max_mirrors);
 struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
+int barrier_all_devices(struct btrfs_fs_info *info);
 int btrfs_commit_super(struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
 					    u64 bytenr);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 23b6e03..861c29f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -519,8 +519,12 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 	 * the disk i_size.  There is no need to log the inode
 	 * at this time.
 	 */
-	if (end_pos > isize)
+	if (end_pos > isize) {
 		i_size_write(inode, end_pos);
+		clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
+	} else {
+		set_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
+	}
 	return 0;
 }
 
@@ -1711,19 +1715,33 @@ out:
 static void update_time_for_write(struct inode *inode)
 {
 	struct timespec now;
+	int sync_it = 0;
 
-	if (IS_NOCMTIME(inode))
+	if (IS_NOCMTIME(inode)) {
+		set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
 		return;
+	}
 
 	now = current_fs_time(inode->i_sb);
-	if (!timespec_equal(&inode->i_mtime, &now))
+	if (!timespec_equal(&inode->i_mtime, &now)) {
 		inode->i_mtime = now;
+		sync_it = S_MTIME;
+	}
 
-	if (!timespec_equal(&inode->i_ctime, &now))
+	if (!timespec_equal(&inode->i_ctime, &now)) {
 		inode->i_ctime = now;
+		sync_it |= S_CTIME;
+	}
 
-	if (IS_I_VERSION(inode))
+	if (IS_I_VERSION(inode)) {
 		inode_inc_iversion(inode);
+		sync_it |= S_VERSION;
+	}
+
+	if (!sync_it)
+		set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
+	else
+		clear_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
 }
 
 static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
@@ -1987,6 +2005,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		goto out;
 	}
 
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
+		if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
+					&BTRFS_I(inode)->runtime_flags) &&
+		    test_and_clear_bit(BTRFS_INODE_NOISIZE,
+					&BTRFS_I(inode)->runtime_flags)) {
+			barrier_all_devices(root->fs_info);
+			mutex_unlock(&inode->i_mutex);
+			goto out;
+		}
+	}
+
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 0020b56..3d230e6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1384,6 +1384,7 @@ out_check:
 
 		btrfs_release_path(path);
 		if (cow_start != (u64)-1) {
+			clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
 			ret = cow_file_range(inode, locked_page,
 					     cow_start, found_key.offset - 1,
 					     page_started, nr_written, 1);
@@ -1426,6 +1427,7 @@ out_check:
 						em->start + em->len - 1, 0);
 			}
 			type = BTRFS_ORDERED_PREALLOC;
+			clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
 		} else {
 			type = BTRFS_ORDERED_NOCOW;
 		}
@@ -1464,6 +1466,7 @@ out_check:
 	}
 
 	if (cow_start != (u64)-1) {
+		clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
 		ret = cow_file_range(inode, locked_page, cow_start, end,
 				     page_started, nr_written, 1);
 		if (ret)
-- 
2.1.0


^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH 2/2] Btrfs: improve fsync for nocow file
  2015-06-09 12:04 ` [RFC PATCH 2/2] Btrfs: improve fsync for nocow file Liu Bo
@ 2015-06-09 12:56   ` Filipe David Manana
  2015-06-10  2:09     ` Liu Bo
  0 siblings, 1 reply; 7+ messages in thread
From: Filipe David Manana @ 2015-06-09 12:56 UTC (permalink / raw)
  To: Liu Bo; +Cc: linux-btrfs

On Tue, Jun 9, 2015 at 1:04 PM, Liu Bo <bo.li.liu@oracle.com> wrote:
> If  we're overwriting an allocated file without changing timestamp
> and inode version, and the file is with NODATACOW, we don't have any metadata to
> commit, thus we can just flush the data device cache and go forward.
>
> However, if there's have any change on extents' disk bytenr, inode size,
> timestamp or inode version, we need to go through the normal btrfs_log_inode
> path.
>
> Test:
> ----------------------------
> 1. sysbench test of
> "1 file + 1 thread + bs=4k + size=40k + synchronous I/O mode + randomwrite +
> fsync_on_each_write",
> 2. loop device associated with tmpfs file
> 3.
>   - For btrfs, "-o nodatacow" and "-o noi_version" option
>   - For ext4 and xfs, no extra mount options
> ----------------------------
>
> Results:
> ----------------------------
> - btrfs:
> w/o: ~30Mb/sec
> w:   ~181Mb/sec
>
> - other filesystems: (both don't enable i_version by default)
> ext4:  203Mb/sec
> xfs:   212Mb/sec
> ----------------------------
>
> Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> ---
>  fs/btrfs/btrfs_inode.h |  2 ++
>  fs/btrfs/disk-io.c     |  2 +-
>  fs/btrfs/disk-io.h     |  1 +
>  fs/btrfs/file.c        | 39 ++++++++++++++++++++++++++++++++++-----
>  fs/btrfs/inode.c       |  3 +++
>  5 files changed, 41 insertions(+), 6 deletions(-)
>
> diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> index 0ef5cc1..b36d87a 100644
> --- a/fs/btrfs/btrfs_inode.h
> +++ b/fs/btrfs/btrfs_inode.h
> @@ -44,6 +44,8 @@
>  #define BTRFS_INODE_IN_DELALLOC_LIST           9
>  #define BTRFS_INODE_READDIO_NEED_LOCK          10
>  #define BTRFS_INODE_HAS_PROPS                  11
> +#define BTRFS_INODE_NOTIMESTAMP                        12
> +#define BTRFS_INODE_NOISIZE                    13
>  /*
>   * The following 3 bits are meant only for the btree inode.
>   * When any of them is set, it means an error happened while writing an
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 2ef9a4b..de7fd94 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -3343,7 +3343,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
>   * send an empty flush down to each device in parallel,
>   * then wait for them
>   */
> -static int barrier_all_devices(struct btrfs_fs_info *info)
> +int barrier_all_devices(struct btrfs_fs_info *info)
>  {
>         struct list_head *head;
>         struct btrfs_device *dev;
> diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> index d4cbfee..2bc91fe 100644
> --- a/fs/btrfs/disk-io.h
> +++ b/fs/btrfs/disk-io.h
> @@ -60,6 +60,7 @@ void close_ctree(struct btrfs_root *root);
>  int write_ctree_super(struct btrfs_trans_handle *trans,
>                       struct btrfs_root *root, int max_mirrors);
>  struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
> +int barrier_all_devices(struct btrfs_fs_info *info);
>  int btrfs_commit_super(struct btrfs_root *root);
>  struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
>                                             u64 bytenr);
> diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> index 23b6e03..861c29f 100644
> --- a/fs/btrfs/file.c
> +++ b/fs/btrfs/file.c
> @@ -519,8 +519,12 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
>          * the disk i_size.  There is no need to log the inode
>          * at this time.
>          */
> -       if (end_pos > isize)
> +       if (end_pos > isize) {
>                 i_size_write(inode, end_pos);
> +               clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> +       } else {
> +               set_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> +       }
>         return 0;
>  }
>
> @@ -1711,19 +1715,33 @@ out:
>  static void update_time_for_write(struct inode *inode)
>  {
>         struct timespec now;
> +       int sync_it = 0;
>
> -       if (IS_NOCMTIME(inode))
> +       if (IS_NOCMTIME(inode)) {
> +               set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
>                 return;
> +       }
>
>         now = current_fs_time(inode->i_sb);
> -       if (!timespec_equal(&inode->i_mtime, &now))
> +       if (!timespec_equal(&inode->i_mtime, &now)) {
>                 inode->i_mtime = now;
> +               sync_it = S_MTIME;
> +       }
>
> -       if (!timespec_equal(&inode->i_ctime, &now))
> +       if (!timespec_equal(&inode->i_ctime, &now)) {
>                 inode->i_ctime = now;
> +               sync_it |= S_CTIME;
> +       }
>
> -       if (IS_I_VERSION(inode))
> +       if (IS_I_VERSION(inode)) {
>                 inode_inc_iversion(inode);
> +               sync_it |= S_VERSION;
> +       }
> +
> +       if (!sync_it)
> +               set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> +       else
> +               clear_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
>  }
>
>  static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
> @@ -1987,6 +2005,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>                 goto out;
>         }
>
> +       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
> +               if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
> +                                       &BTRFS_I(inode)->runtime_flags) &&
> +                   test_and_clear_bit(BTRFS_INODE_NOISIZE,
> +                                       &BTRFS_I(inode)->runtime_flags)) {
> +                       barrier_all_devices(root->fs_info);
> +                       mutex_unlock(&inode->i_mutex);
> +                       goto out;

Hi Liu,

For the non-full sync case, what happens if an IO error happened
during writeback?
I don't see anything here that checks if an IO error happened and
return -EIO to user space if such error happened.
In other words, testing for the bit AS_EIO in the inode->i_mapping->flags.

thanks

> +               }
> +       }
> +
>         /*
>          * ok we haven't committed the transaction yet, lets do a commit
>          */
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 0020b56..3d230e6 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1384,6 +1384,7 @@ out_check:
>
>                 btrfs_release_path(path);
>                 if (cow_start != (u64)-1) {
> +                       clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>                         ret = cow_file_range(inode, locked_page,
>                                              cow_start, found_key.offset - 1,
>                                              page_started, nr_written, 1);
> @@ -1426,6 +1427,7 @@ out_check:
>                                                 em->start + em->len - 1, 0);
>                         }
>                         type = BTRFS_ORDERED_PREALLOC;
> +                       clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>                 } else {
>                         type = BTRFS_ORDERED_NOCOW;
>                 }
> @@ -1464,6 +1466,7 @@ out_check:
>         }
>
>         if (cow_start != (u64)-1) {
> +               clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>                 ret = cow_file_range(inode, locked_page, cow_start, end,
>                                      page_started, nr_written, 1);
>                 if (ret)
> --
> 2.1.0
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH 2/2] Btrfs: improve fsync for nocow file
  2015-06-09 12:56   ` Filipe David Manana
@ 2015-06-10  2:09     ` Liu Bo
  2015-06-10  8:26       ` Filipe David Manana
  0 siblings, 1 reply; 7+ messages in thread
From: Liu Bo @ 2015-06-10  2:09 UTC (permalink / raw)
  To: Filipe David Manana; +Cc: linux-btrfs

On Tue, Jun 09, 2015 at 01:56:41PM +0100, Filipe David Manana wrote:
> On Tue, Jun 9, 2015 at 1:04 PM, Liu Bo <bo.li.liu@oracle.com> wrote:
> > If  we're overwriting an allocated file without changing timestamp
> > and inode version, and the file is with NODATACOW, we don't have any metadata to
> > commit, thus we can just flush the data device cache and go forward.
> >
> > However, if there's have any change on extents' disk bytenr, inode size,
> > timestamp or inode version, we need to go through the normal btrfs_log_inode
> > path.
> >
> > Test:
> > ----------------------------
> > 1. sysbench test of
> > "1 file + 1 thread + bs=4k + size=40k + synchronous I/O mode + randomwrite +
> > fsync_on_each_write",
> > 2. loop device associated with tmpfs file
> > 3.
> >   - For btrfs, "-o nodatacow" and "-o noi_version" option
> >   - For ext4 and xfs, no extra mount options
> > ----------------------------
> >
> > Results:
> > ----------------------------
> > - btrfs:
> > w/o: ~30Mb/sec
> > w:   ~181Mb/sec
> >
> > - other filesystems: (both don't enable i_version by default)
> > ext4:  203Mb/sec
> > xfs:   212Mb/sec
> > ----------------------------
> >
> > Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> > ---
> >  fs/btrfs/btrfs_inode.h |  2 ++
> >  fs/btrfs/disk-io.c     |  2 +-
> >  fs/btrfs/disk-io.h     |  1 +
> >  fs/btrfs/file.c        | 39 ++++++++++++++++++++++++++++++++++-----
> >  fs/btrfs/inode.c       |  3 +++
> >  5 files changed, 41 insertions(+), 6 deletions(-)
> >
> > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> > index 0ef5cc1..b36d87a 100644
> > --- a/fs/btrfs/btrfs_inode.h
> > +++ b/fs/btrfs/btrfs_inode.h
> > @@ -44,6 +44,8 @@
> >  #define BTRFS_INODE_IN_DELALLOC_LIST           9
> >  #define BTRFS_INODE_READDIO_NEED_LOCK          10
> >  #define BTRFS_INODE_HAS_PROPS                  11
> > +#define BTRFS_INODE_NOTIMESTAMP                        12
> > +#define BTRFS_INODE_NOISIZE                    13
> >  /*
> >   * The following 3 bits are meant only for the btree inode.
> >   * When any of them is set, it means an error happened while writing an
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index 2ef9a4b..de7fd94 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -3343,7 +3343,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
> >   * send an empty flush down to each device in parallel,
> >   * then wait for them
> >   */
> > -static int barrier_all_devices(struct btrfs_fs_info *info)
> > +int barrier_all_devices(struct btrfs_fs_info *info)
> >  {
> >         struct list_head *head;
> >         struct btrfs_device *dev;
> > diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> > index d4cbfee..2bc91fe 100644
> > --- a/fs/btrfs/disk-io.h
> > +++ b/fs/btrfs/disk-io.h
> > @@ -60,6 +60,7 @@ void close_ctree(struct btrfs_root *root);
> >  int write_ctree_super(struct btrfs_trans_handle *trans,
> >                       struct btrfs_root *root, int max_mirrors);
> >  struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
> > +int barrier_all_devices(struct btrfs_fs_info *info);
> >  int btrfs_commit_super(struct btrfs_root *root);
> >  struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
> >                                             u64 bytenr);
> > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> > index 23b6e03..861c29f 100644
> > --- a/fs/btrfs/file.c
> > +++ b/fs/btrfs/file.c
> > @@ -519,8 +519,12 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
> >          * the disk i_size.  There is no need to log the inode
> >          * at this time.
> >          */
> > -       if (end_pos > isize)
> > +       if (end_pos > isize) {
> >                 i_size_write(inode, end_pos);
> > +               clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> > +       } else {
> > +               set_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> > +       }
> >         return 0;
> >  }
> >
> > @@ -1711,19 +1715,33 @@ out:
> >  static void update_time_for_write(struct inode *inode)
> >  {
> >         struct timespec now;
> > +       int sync_it = 0;
> >
> > -       if (IS_NOCMTIME(inode))
> > +       if (IS_NOCMTIME(inode)) {
> > +               set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> >                 return;
> > +       }
> >
> >         now = current_fs_time(inode->i_sb);
> > -       if (!timespec_equal(&inode->i_mtime, &now))
> > +       if (!timespec_equal(&inode->i_mtime, &now)) {
> >                 inode->i_mtime = now;
> > +               sync_it = S_MTIME;
> > +       }
> >
> > -       if (!timespec_equal(&inode->i_ctime, &now))
> > +       if (!timespec_equal(&inode->i_ctime, &now)) {
> >                 inode->i_ctime = now;
> > +               sync_it |= S_CTIME;
> > +       }
> >
> > -       if (IS_I_VERSION(inode))
> > +       if (IS_I_VERSION(inode)) {
> >                 inode_inc_iversion(inode);
> > +               sync_it |= S_VERSION;
> > +       }
> > +
> > +       if (!sync_it)
> > +               set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> > +       else
> > +               clear_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> >  }
> >
> >  static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
> > @@ -1987,6 +2005,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
> >                 goto out;
> >         }
> >
> > +       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
> > +               if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
> > +                                       &BTRFS_I(inode)->runtime_flags) &&
> > +                   test_and_clear_bit(BTRFS_INODE_NOISIZE,
> > +                                       &BTRFS_I(inode)->runtime_flags)) {
> > +                       barrier_all_devices(root->fs_info);
> > +                       mutex_unlock(&inode->i_mutex);
> > +                       goto out;
> 
> Hi Liu,
> 
> For the non-full sync case, what happens if an IO error happened
> during writeback?
> I don't see anything here that checks if an IO error happened and
> return -EIO to user space if such error happened.
> In other words, testing for the bit AS_EIO in the inode->i_mapping->flags.

Good point, I missed that part.

Besides that, is the whole "noi_version and nocow" idea acceptable to you?

Thanks,

-liubo

> 
> thanks
> 
> > +               }
> > +       }
> > +
> >         /*
> >          * ok we haven't committed the transaction yet, lets do a commit
> >          */
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 0020b56..3d230e6 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -1384,6 +1384,7 @@ out_check:
> >
> >                 btrfs_release_path(path);
> >                 if (cow_start != (u64)-1) {
> > +                       clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >                         ret = cow_file_range(inode, locked_page,
> >                                              cow_start, found_key.offset - 1,
> >                                              page_started, nr_written, 1);
> > @@ -1426,6 +1427,7 @@ out_check:
> >                                                 em->start + em->len - 1, 0);
> >                         }
> >                         type = BTRFS_ORDERED_PREALLOC;
> > +                       clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >                 } else {
> >                         type = BTRFS_ORDERED_NOCOW;
> >                 }
> > @@ -1464,6 +1466,7 @@ out_check:
> >         }
> >
> >         if (cow_start != (u64)-1) {
> > +               clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >                 ret = cow_file_range(inode, locked_page, cow_start, end,
> >                                      page_started, nr_written, 1);
> >                 if (ret)
> > --
> > 2.1.0
> >
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> 
> -- 
> Filipe David Manana,
> 
> "Reasonable men adapt themselves to the world.
>  Unreasonable men adapt the world to themselves.
>  That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH 2/2] Btrfs: improve fsync for nocow file
  2015-06-10  2:09     ` Liu Bo
@ 2015-06-10  8:26       ` Filipe David Manana
  2015-06-10 12:45         ` Filipe David Manana
  0 siblings, 1 reply; 7+ messages in thread
From: Filipe David Manana @ 2015-06-10  8:26 UTC (permalink / raw)
  To: Liu Bo; +Cc: linux-btrfs

On Wed, Jun 10, 2015 at 3:09 AM, Liu Bo <bo.li.liu@oracle.com> wrote:
> On Tue, Jun 09, 2015 at 01:56:41PM +0100, Filipe David Manana wrote:
>> On Tue, Jun 9, 2015 at 1:04 PM, Liu Bo <bo.li.liu@oracle.com> wrote:
>> > If  we're overwriting an allocated file without changing timestamp
>> > and inode version, and the file is with NODATACOW, we don't have any metadata to
>> > commit, thus we can just flush the data device cache and go forward.
>> >
>> > However, if there's have any change on extents' disk bytenr, inode size,
>> > timestamp or inode version, we need to go through the normal btrfs_log_inode
>> > path.
>> >
>> > Test:
>> > ----------------------------
>> > 1. sysbench test of
>> > "1 file + 1 thread + bs=4k + size=40k + synchronous I/O mode + randomwrite +
>> > fsync_on_each_write",
>> > 2. loop device associated with tmpfs file
>> > 3.
>> >   - For btrfs, "-o nodatacow" and "-o noi_version" option
>> >   - For ext4 and xfs, no extra mount options
>> > ----------------------------
>> >
>> > Results:
>> > ----------------------------
>> > - btrfs:
>> > w/o: ~30Mb/sec
>> > w:   ~181Mb/sec
>> >
>> > - other filesystems: (both don't enable i_version by default)
>> > ext4:  203Mb/sec
>> > xfs:   212Mb/sec
>> > ----------------------------
>> >
>> > Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
>> > ---
>> >  fs/btrfs/btrfs_inode.h |  2 ++
>> >  fs/btrfs/disk-io.c     |  2 +-
>> >  fs/btrfs/disk-io.h     |  1 +
>> >  fs/btrfs/file.c        | 39 ++++++++++++++++++++++++++++++++++-----
>> >  fs/btrfs/inode.c       |  3 +++
>> >  5 files changed, 41 insertions(+), 6 deletions(-)
>> >
>> > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
>> > index 0ef5cc1..b36d87a 100644
>> > --- a/fs/btrfs/btrfs_inode.h
>> > +++ b/fs/btrfs/btrfs_inode.h
>> > @@ -44,6 +44,8 @@
>> >  #define BTRFS_INODE_IN_DELALLOC_LIST           9
>> >  #define BTRFS_INODE_READDIO_NEED_LOCK          10
>> >  #define BTRFS_INODE_HAS_PROPS                  11
>> > +#define BTRFS_INODE_NOTIMESTAMP                        12
>> > +#define BTRFS_INODE_NOISIZE                    13
>> >  /*
>> >   * The following 3 bits are meant only for the btree inode.
>> >   * When any of them is set, it means an error happened while writing an
>> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> > index 2ef9a4b..de7fd94 100644
>> > --- a/fs/btrfs/disk-io.c
>> > +++ b/fs/btrfs/disk-io.c
>> > @@ -3343,7 +3343,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
>> >   * send an empty flush down to each device in parallel,
>> >   * then wait for them
>> >   */
>> > -static int barrier_all_devices(struct btrfs_fs_info *info)
>> > +int barrier_all_devices(struct btrfs_fs_info *info)
>> >  {
>> >         struct list_head *head;
>> >         struct btrfs_device *dev;
>> > diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
>> > index d4cbfee..2bc91fe 100644
>> > --- a/fs/btrfs/disk-io.h
>> > +++ b/fs/btrfs/disk-io.h
>> > @@ -60,6 +60,7 @@ void close_ctree(struct btrfs_root *root);
>> >  int write_ctree_super(struct btrfs_trans_handle *trans,
>> >                       struct btrfs_root *root, int max_mirrors);
>> >  struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
>> > +int barrier_all_devices(struct btrfs_fs_info *info);
>> >  int btrfs_commit_super(struct btrfs_root *root);
>> >  struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
>> >                                             u64 bytenr);
>> > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
>> > index 23b6e03..861c29f 100644
>> > --- a/fs/btrfs/file.c
>> > +++ b/fs/btrfs/file.c
>> > @@ -519,8 +519,12 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
>> >          * the disk i_size.  There is no need to log the inode
>> >          * at this time.
>> >          */
>> > -       if (end_pos > isize)
>> > +       if (end_pos > isize) {
>> >                 i_size_write(inode, end_pos);
>> > +               clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>> > +       } else {
>> > +               set_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>> > +       }
>> >         return 0;
>> >  }
>> >
>> > @@ -1711,19 +1715,33 @@ out:
>> >  static void update_time_for_write(struct inode *inode)
>> >  {
>> >         struct timespec now;
>> > +       int sync_it = 0;
>> >
>> > -       if (IS_NOCMTIME(inode))
>> > +       if (IS_NOCMTIME(inode)) {
>> > +               set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
>> >                 return;
>> > +       }
>> >
>> >         now = current_fs_time(inode->i_sb);
>> > -       if (!timespec_equal(&inode->i_mtime, &now))
>> > +       if (!timespec_equal(&inode->i_mtime, &now)) {
>> >                 inode->i_mtime = now;
>> > +               sync_it = S_MTIME;
>> > +       }
>> >
>> > -       if (!timespec_equal(&inode->i_ctime, &now))
>> > +       if (!timespec_equal(&inode->i_ctime, &now)) {
>> >                 inode->i_ctime = now;
>> > +               sync_it |= S_CTIME;
>> > +       }
>> >
>> > -       if (IS_I_VERSION(inode))
>> > +       if (IS_I_VERSION(inode)) {
>> >                 inode_inc_iversion(inode);
>> > +               sync_it |= S_VERSION;
>> > +       }
>> > +
>> > +       if (!sync_it)
>> > +               set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
>> > +       else
>> > +               clear_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
>> >  }
>> >
>> >  static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
>> > @@ -1987,6 +2005,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>> >                 goto out;
>> >         }
>> >
>> > +       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
>> > +               if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
>> > +                                       &BTRFS_I(inode)->runtime_flags) &&
>> > +                   test_and_clear_bit(BTRFS_INODE_NOISIZE,
>> > +                                       &BTRFS_I(inode)->runtime_flags)) {
>> > +                       barrier_all_devices(root->fs_info);
>> > +                       mutex_unlock(&inode->i_mutex);
>> > +                       goto out;
>>
>> Hi Liu,
>>
>> For the non-full sync case, what happens if an IO error happened
>> during writeback?
>> I don't see anything here that checks if an IO error happened and
>> return -EIO to user space if such error happened.
>> In other words, testing for the bit AS_EIO in the inode->i_mapping->flags.
>
> Good point, I missed that part.
>
> Besides that, is the whole "noi_version and nocow" idea acceptable to you?

Yes.
I haven't tested it however, just eyeballed the patches.

Thanks.

>
> Thanks,
>
> -liubo
>
>>
>> thanks
>>
>> > +               }
>> > +       }
>> > +
>> >         /*
>> >          * ok we haven't committed the transaction yet, lets do a commit
>> >          */
>> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>> > index 0020b56..3d230e6 100644
>> > --- a/fs/btrfs/inode.c
>> > +++ b/fs/btrfs/inode.c
>> > @@ -1384,6 +1384,7 @@ out_check:
>> >
>> >                 btrfs_release_path(path);
>> >                 if (cow_start != (u64)-1) {
>> > +                       clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>> >                         ret = cow_file_range(inode, locked_page,
>> >                                              cow_start, found_key.offset - 1,
>> >                                              page_started, nr_written, 1);
>> > @@ -1426,6 +1427,7 @@ out_check:
>> >                                                 em->start + em->len - 1, 0);
>> >                         }
>> >                         type = BTRFS_ORDERED_PREALLOC;
>> > +                       clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>> >                 } else {
>> >                         type = BTRFS_ORDERED_NOCOW;
>> >                 }
>> > @@ -1464,6 +1466,7 @@ out_check:
>> >         }
>> >
>> >         if (cow_start != (u64)-1) {
>> > +               clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>> >                 ret = cow_file_range(inode, locked_page, cow_start, end,
>> >                                      page_started, nr_written, 1);
>> >                 if (ret)
>> > --
>> > 2.1.0
>> >
>> > --
>> > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>> > the body of a message to majordomo@vger.kernel.org
>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>
>>
>>
>> --
>> Filipe David Manana,
>>
>> "Reasonable men adapt themselves to the world.
>>  Unreasonable men adapt the world to themselves.
>>  That's why all progress depends on unreasonable men."



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH 2/2] Btrfs: improve fsync for nocow file
  2015-06-10  8:26       ` Filipe David Manana
@ 2015-06-10 12:45         ` Filipe David Manana
  2015-06-10 13:38           ` Liu Bo
  0 siblings, 1 reply; 7+ messages in thread
From: Filipe David Manana @ 2015-06-10 12:45 UTC (permalink / raw)
  To: Liu Bo; +Cc: linux-btrfs

On Wed, Jun 10, 2015 at 9:26 AM, Filipe David Manana <fdmanana@gmail.com> wrote:
> On Wed, Jun 10, 2015 at 3:09 AM, Liu Bo <bo.li.liu@oracle.com> wrote:
>> On Tue, Jun 09, 2015 at 01:56:41PM +0100, Filipe David Manana wrote:
>>> On Tue, Jun 9, 2015 at 1:04 PM, Liu Bo <bo.li.liu@oracle.com> wrote:
>>> > If  we're overwriting an allocated file without changing timestamp
>>> > and inode version, and the file is with NODATACOW, we don't have any metadata to
>>> > commit, thus we can just flush the data device cache and go forward.
>>> >
>>> > However, if there's have any change on extents' disk bytenr, inode size,
>>> > timestamp or inode version, we need to go through the normal btrfs_log_inode
>>> > path.
>>> >
>>> > Test:
>>> > ----------------------------
>>> > 1. sysbench test of
>>> > "1 file + 1 thread + bs=4k + size=40k + synchronous I/O mode + randomwrite +
>>> > fsync_on_each_write",
>>> > 2. loop device associated with tmpfs file
>>> > 3.
>>> >   - For btrfs, "-o nodatacow" and "-o noi_version" option
>>> >   - For ext4 and xfs, no extra mount options
>>> > ----------------------------
>>> >
>>> > Results:
>>> > ----------------------------
>>> > - btrfs:
>>> > w/o: ~30Mb/sec
>>> > w:   ~181Mb/sec
>>> >
>>> > - other filesystems: (both don't enable i_version by default)
>>> > ext4:  203Mb/sec
>>> > xfs:   212Mb/sec
>>> > ----------------------------
>>> >
>>> > Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
>>> > ---
>>> >  fs/btrfs/btrfs_inode.h |  2 ++
>>> >  fs/btrfs/disk-io.c     |  2 +-
>>> >  fs/btrfs/disk-io.h     |  1 +
>>> >  fs/btrfs/file.c        | 39 ++++++++++++++++++++++++++++++++++-----
>>> >  fs/btrfs/inode.c       |  3 +++
>>> >  5 files changed, 41 insertions(+), 6 deletions(-)
>>> >
>>> > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
>>> > index 0ef5cc1..b36d87a 100644
>>> > --- a/fs/btrfs/btrfs_inode.h
>>> > +++ b/fs/btrfs/btrfs_inode.h
>>> > @@ -44,6 +44,8 @@
>>> >  #define BTRFS_INODE_IN_DELALLOC_LIST           9
>>> >  #define BTRFS_INODE_READDIO_NEED_LOCK          10
>>> >  #define BTRFS_INODE_HAS_PROPS                  11
>>> > +#define BTRFS_INODE_NOTIMESTAMP                        12
>>> > +#define BTRFS_INODE_NOISIZE                    13
>>> >  /*
>>> >   * The following 3 bits are meant only for the btree inode.
>>> >   * When any of them is set, it means an error happened while writing an
>>> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>>> > index 2ef9a4b..de7fd94 100644
>>> > --- a/fs/btrfs/disk-io.c
>>> > +++ b/fs/btrfs/disk-io.c
>>> > @@ -3343,7 +3343,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
>>> >   * send an empty flush down to each device in parallel,
>>> >   * then wait for them
>>> >   */
>>> > -static int barrier_all_devices(struct btrfs_fs_info *info)
>>> > +int barrier_all_devices(struct btrfs_fs_info *info)
>>> >  {
>>> >         struct list_head *head;
>>> >         struct btrfs_device *dev;
>>> > diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
>>> > index d4cbfee..2bc91fe 100644
>>> > --- a/fs/btrfs/disk-io.h
>>> > +++ b/fs/btrfs/disk-io.h
>>> > @@ -60,6 +60,7 @@ void close_ctree(struct btrfs_root *root);
>>> >  int write_ctree_super(struct btrfs_trans_handle *trans,
>>> >                       struct btrfs_root *root, int max_mirrors);
>>> >  struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
>>> > +int barrier_all_devices(struct btrfs_fs_info *info);
>>> >  int btrfs_commit_super(struct btrfs_root *root);
>>> >  struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
>>> >                                             u64 bytenr);
>>> > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
>>> > index 23b6e03..861c29f 100644
>>> > --- a/fs/btrfs/file.c
>>> > +++ b/fs/btrfs/file.c
>>> > @@ -519,8 +519,12 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
>>> >          * the disk i_size.  There is no need to log the inode
>>> >          * at this time.
>>> >          */
>>> > -       if (end_pos > isize)
>>> > +       if (end_pos > isize) {
>>> >                 i_size_write(inode, end_pos);
>>> > +               clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>>> > +       } else {
>>> > +               set_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>>> > +       }
>>> >         return 0;
>>> >  }
>>> >
>>> > @@ -1711,19 +1715,33 @@ out:
>>> >  static void update_time_for_write(struct inode *inode)
>>> >  {
>>> >         struct timespec now;
>>> > +       int sync_it = 0;
>>> >
>>> > -       if (IS_NOCMTIME(inode))
>>> > +       if (IS_NOCMTIME(inode)) {
>>> > +               set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
>>> >                 return;
>>> > +       }
>>> >
>>> >         now = current_fs_time(inode->i_sb);
>>> > -       if (!timespec_equal(&inode->i_mtime, &now))
>>> > +       if (!timespec_equal(&inode->i_mtime, &now)) {
>>> >                 inode->i_mtime = now;
>>> > +               sync_it = S_MTIME;
>>> > +       }
>>> >
>>> > -       if (!timespec_equal(&inode->i_ctime, &now))
>>> > +       if (!timespec_equal(&inode->i_ctime, &now)) {
>>> >                 inode->i_ctime = now;
>>> > +               sync_it |= S_CTIME;
>>> > +       }
>>> >
>>> > -       if (IS_I_VERSION(inode))
>>> > +       if (IS_I_VERSION(inode)) {
>>> >                 inode_inc_iversion(inode);
>>> > +               sync_it |= S_VERSION;
>>> > +       }
>>> > +
>>> > +       if (!sync_it)
>>> > +               set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
>>> > +       else
>>> > +               clear_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
>>> >  }
>>> >
>>> >  static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
>>> > @@ -1987,6 +2005,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
>>> >                 goto out;
>>> >         }
>>> >
>>> > +       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
>>> > +               if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
>>> > +                                       &BTRFS_I(inode)->runtime_flags) &&
>>> > +                   test_and_clear_bit(BTRFS_INODE_NOISIZE,
>>> > +                                       &BTRFS_I(inode)->runtime_flags)) {
>>> > +                       barrier_all_devices(root->fs_info);
>>> > +                       mutex_unlock(&inode->i_mutex);
>>> > +                       goto out;
>>>
>>> Hi Liu,
>>>
>>> For the non-full sync case, what happens if an IO error happened
>>> during writeback?
>>> I don't see anything here that checks if an IO error happened and
>>> return -EIO to user space if such error happened.
>>> In other words, testing for the bit AS_EIO in the inode->i_mapping->flags.
>>
>> Good point, I missed that part.
>>
>> Besides that, is the whole "noi_version and nocow" idea acceptable to you?
>
> Yes.
> I haven't tested it however, just eyeballed the patches.

I forgot to ask before, but any special reason to use
barrier_all_devices() instead of waiting for the ordered extents in
the given range to get the BTRFS_ORDERED_IO_DONE set?

Using the barrier, wouldn't it wait for all writeback, including those
for other files or for ranges outside the range given to
btrfs_sync_file() (important for msync for e.g.).

thanks

>
> Thanks.
>
>>
>> Thanks,
>>
>> -liubo
>>
>>>
>>> thanks
>>>
>>> > +               }
>>> > +       }
>>> > +
>>> >         /*
>>> >          * ok we haven't committed the transaction yet, lets do a commit
>>> >          */
>>> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
>>> > index 0020b56..3d230e6 100644
>>> > --- a/fs/btrfs/inode.c
>>> > +++ b/fs/btrfs/inode.c
>>> > @@ -1384,6 +1384,7 @@ out_check:
>>> >
>>> >                 btrfs_release_path(path);
>>> >                 if (cow_start != (u64)-1) {
>>> > +                       clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>>> >                         ret = cow_file_range(inode, locked_page,
>>> >                                              cow_start, found_key.offset - 1,
>>> >                                              page_started, nr_written, 1);
>>> > @@ -1426,6 +1427,7 @@ out_check:
>>> >                                                 em->start + em->len - 1, 0);
>>> >                         }
>>> >                         type = BTRFS_ORDERED_PREALLOC;
>>> > +                       clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>>> >                 } else {
>>> >                         type = BTRFS_ORDERED_NOCOW;
>>> >                 }
>>> > @@ -1464,6 +1466,7 @@ out_check:
>>> >         }
>>> >
>>> >         if (cow_start != (u64)-1) {
>>> > +               clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
>>> >                 ret = cow_file_range(inode, locked_page, cow_start, end,
>>> >                                      page_started, nr_written, 1);
>>> >                 if (ret)
>>> > --
>>> > 2.1.0
>>> >
>>> > --
>>> > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
>>> > the body of a message to majordomo@vger.kernel.org
>>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>>
>>>
>>> --
>>> Filipe David Manana,
>>>
>>> "Reasonable men adapt themselves to the world.
>>>  Unreasonable men adapt the world to themselves.
>>>  That's why all progress depends on unreasonable men."
>
>
>
> --
> Filipe David Manana,
>
> "Reasonable men adapt themselves to the world.
>  Unreasonable men adapt the world to themselves.
>  That's why all progress depends on unreasonable men."



-- 
Filipe David Manana,

"Reasonable men adapt themselves to the world.
 Unreasonable men adapt the world to themselves.
 That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [RFC PATCH 2/2] Btrfs: improve fsync for nocow file
  2015-06-10 12:45         ` Filipe David Manana
@ 2015-06-10 13:38           ` Liu Bo
  0 siblings, 0 replies; 7+ messages in thread
From: Liu Bo @ 2015-06-10 13:38 UTC (permalink / raw)
  To: Filipe David Manana; +Cc: linux-btrfs

On Wed, Jun 10, 2015 at 01:45:29PM +0100, Filipe David Manana wrote:
> On Wed, Jun 10, 2015 at 9:26 AM, Filipe David Manana <fdmanana@gmail.com> wrote:
> > On Wed, Jun 10, 2015 at 3:09 AM, Liu Bo <bo.li.liu@oracle.com> wrote:
> >> On Tue, Jun 09, 2015 at 01:56:41PM +0100, Filipe David Manana wrote:
> >>> On Tue, Jun 9, 2015 at 1:04 PM, Liu Bo <bo.li.liu@oracle.com> wrote:
> >>> > If  we're overwriting an allocated file without changing timestamp
> >>> > and inode version, and the file is with NODATACOW, we don't have any metadata to
> >>> > commit, thus we can just flush the data device cache and go forward.
> >>> >
> >>> > However, if there's have any change on extents' disk bytenr, inode size,
> >>> > timestamp or inode version, we need to go through the normal btrfs_log_inode
> >>> > path.
> >>> >
> >>> > Test:
> >>> > ----------------------------
> >>> > 1. sysbench test of
> >>> > "1 file + 1 thread + bs=4k + size=40k + synchronous I/O mode + randomwrite +
> >>> > fsync_on_each_write",
> >>> > 2. loop device associated with tmpfs file
> >>> > 3.
> >>> >   - For btrfs, "-o nodatacow" and "-o noi_version" option
> >>> >   - For ext4 and xfs, no extra mount options
> >>> > ----------------------------
> >>> >
> >>> > Results:
> >>> > ----------------------------
> >>> > - btrfs:
> >>> > w/o: ~30Mb/sec
> >>> > w:   ~181Mb/sec
> >>> >
> >>> > - other filesystems: (both don't enable i_version by default)
> >>> > ext4:  203Mb/sec
> >>> > xfs:   212Mb/sec
> >>> > ----------------------------
> >>> >
> >>> > Signed-off-by: Liu Bo <bo.li.liu@oracle.com>
> >>> > ---
> >>> >  fs/btrfs/btrfs_inode.h |  2 ++
> >>> >  fs/btrfs/disk-io.c     |  2 +-
> >>> >  fs/btrfs/disk-io.h     |  1 +
> >>> >  fs/btrfs/file.c        | 39 ++++++++++++++++++++++++++++++++++-----
> >>> >  fs/btrfs/inode.c       |  3 +++
> >>> >  5 files changed, 41 insertions(+), 6 deletions(-)
> >>> >
> >>> > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> >>> > index 0ef5cc1..b36d87a 100644
> >>> > --- a/fs/btrfs/btrfs_inode.h
> >>> > +++ b/fs/btrfs/btrfs_inode.h
> >>> > @@ -44,6 +44,8 @@
> >>> >  #define BTRFS_INODE_IN_DELALLOC_LIST           9
> >>> >  #define BTRFS_INODE_READDIO_NEED_LOCK          10
> >>> >  #define BTRFS_INODE_HAS_PROPS                  11
> >>> > +#define BTRFS_INODE_NOTIMESTAMP                        12
> >>> > +#define BTRFS_INODE_NOISIZE                    13
> >>> >  /*
> >>> >   * The following 3 bits are meant only for the btree inode.
> >>> >   * When any of them is set, it means an error happened while writing an
> >>> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> >>> > index 2ef9a4b..de7fd94 100644
> >>> > --- a/fs/btrfs/disk-io.c
> >>> > +++ b/fs/btrfs/disk-io.c
> >>> > @@ -3343,7 +3343,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait)
> >>> >   * send an empty flush down to each device in parallel,
> >>> >   * then wait for them
> >>> >   */
> >>> > -static int barrier_all_devices(struct btrfs_fs_info *info)
> >>> > +int barrier_all_devices(struct btrfs_fs_info *info)
> >>> >  {
> >>> >         struct list_head *head;
> >>> >         struct btrfs_device *dev;
> >>> > diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
> >>> > index d4cbfee..2bc91fe 100644
> >>> > --- a/fs/btrfs/disk-io.h
> >>> > +++ b/fs/btrfs/disk-io.h
> >>> > @@ -60,6 +60,7 @@ void close_ctree(struct btrfs_root *root);
> >>> >  int write_ctree_super(struct btrfs_trans_handle *trans,
> >>> >                       struct btrfs_root *root, int max_mirrors);
> >>> >  struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
> >>> > +int barrier_all_devices(struct btrfs_fs_info *info);
> >>> >  int btrfs_commit_super(struct btrfs_root *root);
> >>> >  struct extent_buffer *btrfs_find_tree_block(struct btrfs_fs_info *fs_info,
> >>> >                                             u64 bytenr);
> >>> > diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
> >>> > index 23b6e03..861c29f 100644
> >>> > --- a/fs/btrfs/file.c
> >>> > +++ b/fs/btrfs/file.c
> >>> > @@ -519,8 +519,12 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
> >>> >          * the disk i_size.  There is no need to log the inode
> >>> >          * at this time.
> >>> >          */
> >>> > -       if (end_pos > isize)
> >>> > +       if (end_pos > isize) {
> >>> >                 i_size_write(inode, end_pos);
> >>> > +               clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >>> > +       } else {
> >>> > +               set_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >>> > +       }
> >>> >         return 0;
> >>> >  }
> >>> >
> >>> > @@ -1711,19 +1715,33 @@ out:
> >>> >  static void update_time_for_write(struct inode *inode)
> >>> >  {
> >>> >         struct timespec now;
> >>> > +       int sync_it = 0;
> >>> >
> >>> > -       if (IS_NOCMTIME(inode))
> >>> > +       if (IS_NOCMTIME(inode)) {
> >>> > +               set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> >>> >                 return;
> >>> > +       }
> >>> >
> >>> >         now = current_fs_time(inode->i_sb);
> >>> > -       if (!timespec_equal(&inode->i_mtime, &now))
> >>> > +       if (!timespec_equal(&inode->i_mtime, &now)) {
> >>> >                 inode->i_mtime = now;
> >>> > +               sync_it = S_MTIME;
> >>> > +       }
> >>> >
> >>> > -       if (!timespec_equal(&inode->i_ctime, &now))
> >>> > +       if (!timespec_equal(&inode->i_ctime, &now)) {
> >>> >                 inode->i_ctime = now;
> >>> > +               sync_it |= S_CTIME;
> >>> > +       }
> >>> >
> >>> > -       if (IS_I_VERSION(inode))
> >>> > +       if (IS_I_VERSION(inode)) {
> >>> >                 inode_inc_iversion(inode);
> >>> > +               sync_it |= S_VERSION;
> >>> > +       }
> >>> > +
> >>> > +       if (!sync_it)
> >>> > +               set_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> >>> > +       else
> >>> > +               clear_bit(BTRFS_INODE_NOTIMESTAMP, &BTRFS_I(inode)->runtime_flags);
> >>> >  }
> >>> >
> >>> >  static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
> >>> > @@ -1987,6 +2005,17 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
> >>> >                 goto out;
> >>> >         }
> >>> >
> >>> > +       if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
> >>> > +               if (test_and_clear_bit(BTRFS_INODE_NOTIMESTAMP,
> >>> > +                                       &BTRFS_I(inode)->runtime_flags) &&
> >>> > +                   test_and_clear_bit(BTRFS_INODE_NOISIZE,
> >>> > +                                       &BTRFS_I(inode)->runtime_flags)) {
> >>> > +                       barrier_all_devices(root->fs_info);
> >>> > +                       mutex_unlock(&inode->i_mutex);
> >>> > +                       goto out;
> >>>
> >>> Hi Liu,
> >>>
> >>> For the non-full sync case, what happens if an IO error happened
> >>> during writeback?
> >>> I don't see anything here that checks if an IO error happened and
> >>> return -EIO to user space if such error happened.
> >>> In other words, testing for the bit AS_EIO in the inode->i_mapping->flags.
> >>
> >> Good point, I missed that part.
> >>
> >> Besides that, is the whole "noi_version and nocow" idea acceptable to you?
> >
> > Yes.
> > I haven't tested it however, just eyeballed the patches.
> 
> I forgot to ask before, but any special reason to use
> barrier_all_devices() instead of waiting for the ordered extents in
> the given range to get the BTRFS_ORDERED_IO_DONE set?

That reminds me, yes, both are needed -- flushing/waiting data and
flushing all devices' cache, but IMO we don't have to wait for
BTRFS_ORDERED_IO_DONE but only filemap_fdatawait_range is good enough.

I'll fix that along with data error.

> 
> Using the barrier, wouldn't it wait for all writeback, including those
> for other files or for ranges outside the range given to
> btrfs_sync_file() (important for msync for e.g.).

This barrier is for flushing device cache, I think we have to do that
since we skip log commit part.
(But maybe it's not needed if we specify NOBARRIER option.)

Thanks,

-liubo

> 
> thanks
> 
> >
> > Thanks.
> >
> >>
> >> Thanks,
> >>
> >> -liubo
> >>
> >>>
> >>> thanks
> >>>
> >>> > +               }
> >>> > +       }
> >>> > +
> >>> >         /*
> >>> >          * ok we haven't committed the transaction yet, lets do a commit
> >>> >          */
> >>> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> >>> > index 0020b56..3d230e6 100644
> >>> > --- a/fs/btrfs/inode.c
> >>> > +++ b/fs/btrfs/inode.c
> >>> > @@ -1384,6 +1384,7 @@ out_check:
> >>> >
> >>> >                 btrfs_release_path(path);
> >>> >                 if (cow_start != (u64)-1) {
> >>> > +                       clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >>> >                         ret = cow_file_range(inode, locked_page,
> >>> >                                              cow_start, found_key.offset - 1,
> >>> >                                              page_started, nr_written, 1);
> >>> > @@ -1426,6 +1427,7 @@ out_check:
> >>> >                                                 em->start + em->len - 1, 0);
> >>> >                         }
> >>> >                         type = BTRFS_ORDERED_PREALLOC;
> >>> > +                       clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >>> >                 } else {
> >>> >                         type = BTRFS_ORDERED_NOCOW;
> >>> >                 }
> >>> > @@ -1464,6 +1466,7 @@ out_check:
> >>> >         }
> >>> >
> >>> >         if (cow_start != (u64)-1) {
> >>> > +               clear_bit(BTRFS_INODE_NOISIZE, &BTRFS_I(inode)->runtime_flags);
> >>> >                 ret = cow_file_range(inode, locked_page, cow_start, end,
> >>> >                                      page_started, nr_written, 1);
> >>> >                 if (ret)
> >>> > --
> >>> > 2.1.0
> >>> >
> >>> > --
> >>> > To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> >>> > the body of a message to majordomo@vger.kernel.org
> >>> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> >>>
> >>>
> >>>
> >>> --
> >>> Filipe David Manana,
> >>>
> >>> "Reasonable men adapt themselves to the world.
> >>>  Unreasonable men adapt the world to themselves.
> >>>  That's why all progress depends on unreasonable men."
> >
> >
> >
> > --
> > Filipe David Manana,
> >
> > "Reasonable men adapt themselves to the world.
> >  Unreasonable men adapt the world to themselves.
> >  That's why all progress depends on unreasonable men."
> 
> 
> 
> -- 
> Filipe David Manana,
> 
> "Reasonable men adapt themselves to the world.
>  Unreasonable men adapt the world to themselves.
>  That's why all progress depends on unreasonable men."

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2015-06-10 13:38 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-06-09 12:04 [RFC PATCH 1/2] Btrfs: add noi_version option to disable MS_I_VERSION Liu Bo
2015-06-09 12:04 ` [RFC PATCH 2/2] Btrfs: improve fsync for nocow file Liu Bo
2015-06-09 12:56   ` Filipe David Manana
2015-06-10  2:09     ` Liu Bo
2015-06-10  8:26       ` Filipe David Manana
2015-06-10 12:45         ` Filipe David Manana
2015-06-10 13:38           ` Liu Bo

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.