All of lore.kernel.org
 help / color / mirror / Atom feed
From: Goldwyn Rodrigues <rgoldwyn@suse.de>
To: linux-fsdevel@vger.kernel.org
Cc: linux-btrfs@vger.kernel.org, hch@lst.de, darrick.wong@oracle.com,
	ruansy.fnst@cn.fujitsu.com, Goldwyn Rodrigues <rgoldwyn@suse.com>
Subject: [PATCH 04/13] btrfs: Add a simple buffered iomap write
Date: Fri,  2 Aug 2019 17:00:39 -0500	[thread overview]
Message-ID: <20190802220048.16142-5-rgoldwyn@suse.de> (raw)
In-Reply-To: <20190802220048.16142-1-rgoldwyn@suse.de>

From: Goldwyn Rodrigues <rgoldwyn@suse.com>

Introduce a new btrfs_iomap structure which contains information
about the filesystem between the iomap_begin() and iomap_end() calls.
This contains information about reservations and extent locking.

This one is a long patch. Most of the code is "inspired" by
fs/btrfs/file.c. To keep the size small, all removals are in
following patches.

Signed-off-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
---
 fs/btrfs/Makefile |   2 +-
 fs/btrfs/ctree.h  |   1 +
 fs/btrfs/file.c   |   4 +-
 fs/btrfs/iomap.c  | 381 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 385 insertions(+), 3 deletions(-)
 create mode 100644 fs/btrfs/iomap.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 76a843198bcb..f88e696b0698 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
 	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
 	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
-	   block-rsv.o delalloc-space.o
+	   block-rsv.o delalloc-space.o iomap.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 299e11e6c554..7a4ff524dc77 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -3247,6 +3247,7 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
 loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
 			      struct file *file_out, loff_t pos_out,
 			      loff_t len, unsigned int remap_flags);
+size_t btrfs_buffered_iomap_write(struct kiocb *iocb, struct iov_iter *from);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4466a09f2d98..0707db04d3cc 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1829,7 +1829,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
 		return written;
 
 	pos = iocb->ki_pos;
-	written_buffered = btrfs_buffered_write(iocb, from);
+	written_buffered = btrfs_buffered_iomap_write(iocb, from);
 	if (written_buffered < 0) {
 		err = written_buffered;
 		goto out;
@@ -1966,7 +1966,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		num_written = __btrfs_direct_write(iocb, from);
 	} else {
-		num_written = btrfs_buffered_write(iocb, from);
+		num_written = btrfs_buffered_iomap_write(iocb, from);
 		if (num_written > 0)
 			iocb->ki_pos = pos + num_written;
 		if (clean_page)
diff --git a/fs/btrfs/iomap.c b/fs/btrfs/iomap.c
new file mode 100644
index 000000000000..9eb5e7b7603a
--- /dev/null
+++ b/fs/btrfs/iomap.c
@@ -0,0 +1,381 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * iomap support for BTRFS
+ *
+ * Copyright (c) 2019  SUSE Linux
+ * Author: Goldwyn Rodrigues <rgoldwyn@suse.com>
+ */
+
+#include <linux/iomap.h>
+#include "ctree.h"
+#include "btrfs_inode.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "delalloc-space.h"
+
+struct btrfs_iomap {
+	u64 start;
+	u64 end;
+	bool nocow;
+	int extents_locked;
+	ssize_t reserved_bytes;
+	struct extent_changeset *data_reserved;
+	struct extent_state *cached_state;
+};
+
+
+/*
+ * This function locks the extent and properly waits for data=ordered extents
+ * to finish before allowing the pages to be modified if need.
+ *
+ * The return value:
+ * 1 - the extent is locked
+ * 0 - the extent is not locked, and everything is OK
+ * -EAGAIN - need re-prepare the pages
+ * the other < 0 number - Something wrong happens
+ */
+static noinline int
+lock_and_cleanup_extent(struct btrfs_inode *inode, loff_t pos,
+			 size_t write_bytes,
+			 u64 *lockstart, u64 *lockend,
+			 struct extent_state **cached_state)
+{
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	u64 start_pos;
+	u64 last_pos;
+	int ret = 0;
+
+	start_pos = round_down(pos, fs_info->sectorsize);
+	last_pos = start_pos
+		+ round_up(pos + write_bytes - start_pos,
+			   fs_info->sectorsize) - 1;
+
+	if (start_pos < inode->vfs_inode.i_size) {
+		struct btrfs_ordered_extent *ordered;
+
+		lock_extent_bits(&inode->io_tree, start_pos, last_pos,
+				cached_state);
+		ordered = btrfs_lookup_ordered_range(inode, start_pos,
+						     last_pos - start_pos + 1);
+		if (ordered &&
+		    ordered->file_offset + ordered->len > start_pos &&
+		    ordered->file_offset <= last_pos) {
+			unlock_extent_cached(&inode->io_tree, start_pos,
+					last_pos, cached_state);
+			btrfs_start_ordered_extent(&inode->vfs_inode,
+					ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			return -EAGAIN;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+
+		*lockstart = start_pos;
+		*lockend = last_pos;
+		ret = 1;
+	}
+
+	return ret;
+}
+
+static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+				    size_t *write_bytes)
+{
+	struct btrfs_fs_info *fs_info = inode->root->fs_info;
+	struct btrfs_root *root = inode->root;
+	struct btrfs_ordered_extent *ordered;
+	u64 lockstart, lockend;
+	u64 num_bytes;
+	int ret;
+
+	ret = btrfs_start_write_no_snapshotting(root);
+	if (!ret)
+		return -ENOSPC;
+
+	lockstart = round_down(pos, fs_info->sectorsize);
+	lockend = round_up(pos + *write_bytes,
+			   fs_info->sectorsize) - 1;
+
+	while (1) {
+		lock_extent(&inode->io_tree, lockstart, lockend);
+		ordered = btrfs_lookup_ordered_range(inode, lockstart,
+						     lockend - lockstart + 1);
+		if (!ordered) {
+			break;
+		}
+		unlock_extent(&inode->io_tree, lockstart, lockend);
+		btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	num_bytes = lockend - lockstart + 1;
+	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
+			NULL, NULL, NULL);
+	if (ret <= 0) {
+		ret = 0;
+		btrfs_end_write_no_snapshotting(root);
+	} else {
+		*write_bytes = min_t(size_t, *write_bytes ,
+				     num_bytes - pos + lockstart);
+	}
+
+	unlock_extent(&inode->io_tree, lockstart, lockend);
+
+	return ret;
+}
+
+static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
+					 const u64 start,
+					 const u64 len,
+					 struct extent_state **cached_state)
+{
+	u64 search_start = start;
+	const u64 end = start + len - 1;
+
+	while (search_start < end) {
+		const u64 search_len = end - search_start + 1;
+		struct extent_map *em;
+		u64 em_len;
+		int ret = 0;
+
+		em = btrfs_get_extent(inode, NULL, 0, search_start,
+				      search_len, 0);
+		if (IS_ERR(em))
+			return PTR_ERR(em);
+
+		if (em->block_start != EXTENT_MAP_HOLE)
+			goto next;
+
+		em_len = em->len;
+		if (em->start < search_start)
+			em_len -= search_start - em->start;
+		if (em_len > search_len)
+			em_len = search_len;
+
+		ret = set_extent_bit(&inode->io_tree, search_start,
+				     search_start + em_len - 1,
+				     EXTENT_DELALLOC_NEW,
+				     NULL, cached_state, GFP_NOFS);
+next:
+		search_start = extent_map_end(em);
+		free_extent_map(em);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+
+static void btrfs_buffered_page_done(struct inode *inode, loff_t pos,
+		unsigned copied, struct page *page,
+		struct iomap *iomap)
+{
+	SetPageUptodate(page);
+	ClearPageChecked(page);
+	set_page_dirty(page);
+	get_page(page);
+}
+
+
+static const struct iomap_page_ops btrfs_buffered_page_ops = {
+	.page_done = btrfs_buffered_page_done,
+};
+
+
+static int btrfs_buffered_iomap_begin(struct inode *inode, loff_t pos,
+		loff_t length, unsigned flags, struct iomap *iomap,
+		struct iomap *srcmap)
+{
+	int ret;
+	size_t write_bytes = length;
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	size_t sector_offset = pos & (fs_info->sectorsize - 1);
+	struct btrfs_iomap *bi;
+
+	bi = kzalloc(sizeof(struct btrfs_iomap), GFP_NOFS);
+	if (!bi)
+		return -ENOMEM;
+
+	bi->reserved_bytes = round_up(write_bytes + sector_offset,
+			fs_info->sectorsize);
+
+	/* Reserve data space */
+	ret = btrfs_check_data_free_space(inode, &bi->data_reserved, pos,
+			write_bytes);
+	if (ret < 0) {
+		/*
+		 * Space allocation failed. Let's check if we can
+		 * continue I/O without allocations
+		 */
+		if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+						BTRFS_INODE_PREALLOC)) &&
+				check_can_nocow(BTRFS_I(inode), pos,
+					&write_bytes) > 0) {
+			bi->nocow = true;
+			/*
+			 * our prealloc extent may be smaller than
+			 * write_bytes, so scale down.
+			 */
+			bi->reserved_bytes = round_up(write_bytes +
+					sector_offset,
+					fs_info->sectorsize);
+		} else {
+			goto error;
+		}
+	}
+
+	WARN_ON(bi->reserved_bytes == 0);
+
+	/* We have the data space allocated, reserve the metadata now */
+	ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
+			bi->reserved_bytes);
+	if (ret) {
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		if (!bi->nocow)
+			btrfs_free_reserved_data_space(inode,
+					bi->data_reserved, pos,
+					write_bytes);
+		else
+			btrfs_end_write_no_snapshotting(root);
+		goto error;
+	}
+
+	do {
+		ret = lock_and_cleanup_extent(
+				BTRFS_I(inode), pos, write_bytes, &bi->start,
+				&bi->end, &bi->cached_state);
+	} while (ret == -EAGAIN);
+
+	if (ret < 0) {
+		btrfs_delalloc_release_extents(BTRFS_I(inode),
+					       bi->reserved_bytes, true);
+		goto release;
+	} else {
+		bi->extents_locked = ret;
+	}
+	iomap->private = bi;
+	iomap->length = round_up(write_bytes, fs_info->sectorsize);
+	iomap->offset = round_down(pos, fs_info->sectorsize);
+	iomap->addr = IOMAP_NULL_ADDR;
+	iomap->type = IOMAP_DELALLOC;
+	iomap->bdev = fs_info->fs_devices->latest_bdev;
+	iomap->page_ops = &btrfs_buffered_page_ops;
+	return 0;
+release:
+	if (bi->extents_locked)
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, bi->start,
+				bi->end, &bi->cached_state);
+	if (bi->nocow) {
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		btrfs_end_write_no_snapshotting(root);
+		btrfs_delalloc_release_metadata(BTRFS_I(inode),
+				bi->reserved_bytes, true);
+	} else {
+		btrfs_delalloc_release_space(inode, bi->data_reserved,
+				round_down(pos, fs_info->sectorsize),
+				bi->reserved_bytes, true);
+	}
+	extent_changeset_free(bi->data_reserved);
+
+error:
+	kfree(bi);
+	return ret;
+}
+
+static int btrfs_buffered_iomap_end(struct inode *inode, loff_t pos,
+		loff_t length, ssize_t written, unsigned flags,
+		struct iomap *iomap)
+{
+	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+	struct btrfs_iomap *bi = iomap->private;
+	ssize_t release_bytes = round_down(bi->reserved_bytes - written,
+			1 << fs_info->sb->s_blocksize_bits);
+	unsigned int extra_bits = 0;
+	u64 start_pos = pos & ~((u64) fs_info->sectorsize - 1);
+	u64 num_bytes = round_up(written + pos - start_pos,
+			fs_info->sectorsize);
+	u64 end_of_last_block = start_pos + num_bytes - 1;
+	int ret = 0;
+
+	if (release_bytes > 0) {
+		if (bi->nocow) {
+			btrfs_delalloc_release_metadata(BTRFS_I(inode),
+					release_bytes, true);
+		} else {
+			u64 __pos = round_down(pos + written, fs_info->sectorsize);
+			btrfs_delalloc_release_space(inode, bi->data_reserved,
+					__pos, release_bytes, true);
+		}
+	}
+
+	/*
+	 * The pages may have already been dirty, clear out old accounting so
+	 * we can set things up properly
+	 */
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+			EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
+			EXTENT_DEFRAG, 0, 0, &bi->cached_state);
+
+	if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
+		if (start_pos >= i_size_read(inode) &&
+		    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
+			/*
+			 * There can't be any extents following eof in this case
+			 * so just set the delalloc new bit for the range
+			 * directly.
+			 */
+			extra_bits |= EXTENT_DELALLOC_NEW;
+		} else {
+			ret = btrfs_find_new_delalloc_bytes(BTRFS_I(inode),
+					start_pos, num_bytes,
+					&bi->cached_state);
+			if (ret)
+				goto unlock;
+		}
+	}
+
+	ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+			extra_bits, &bi->cached_state, 0);
+unlock:
+	if (bi->extents_locked)
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+				bi->start, bi->end, &bi->cached_state);
+
+	if (bi->nocow) {
+		struct btrfs_root *root = BTRFS_I(inode)->root;
+		btrfs_end_write_no_snapshotting(root);
+		if (written > 0) {
+			u64 start = round_down(pos, fs_info->sectorsize);
+			u64 end = round_up(pos + written, fs_info->sectorsize) - 1;
+			set_extent_bit(&BTRFS_I(inode)->io_tree, start, end,
+					EXTENT_NORESERVE, NULL, NULL, GFP_NOFS);
+		}
+
+	}
+	btrfs_delalloc_release_extents(BTRFS_I(inode), bi->reserved_bytes,
+			true);
+
+	if (written < fs_info->nodesize)
+		btrfs_btree_balance_dirty(fs_info);
+
+	extent_changeset_free(bi->data_reserved);
+	kfree(bi);
+	return ret;
+}
+
+static const struct iomap_ops btrfs_buffered_iomap_ops = {
+	.iomap_begin            = btrfs_buffered_iomap_begin,
+	.iomap_end              = btrfs_buffered_iomap_end,
+};
+
+size_t btrfs_buffered_iomap_write(struct kiocb *iocb, struct iov_iter *from)
+{
+	ssize_t written;
+	struct inode *inode = file_inode(iocb->ki_filp);
+	written = iomap_file_buffered_write(iocb, from, &btrfs_buffered_iomap_ops);
+	if (written > 0)
+		iocb->ki_pos += written;
+	if (iocb->ki_pos > i_size_read(inode))
+		i_size_write(inode, iocb->ki_pos);
+	return written;
+}
+
-- 
2.16.4


  parent reply	other threads:[~2019-08-02 22:01 UTC|newest]

Thread overview: 61+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-08-02 22:00 [PATCH v2 0/13] Btrfs iomap Goldwyn Rodrigues
2019-08-02 22:00 ` [PATCH 01/13] iomap: Use a IOMAP_COW/srcmap for a read-modify-write I/O Goldwyn Rodrigues
2019-08-03  0:39   ` Darrick J. Wong
2019-08-05  0:06   ` Dave Chinner
2019-08-02 22:00 ` [PATCH 02/13] iomap: Read page from srcmap for IOMAP_COW Goldwyn Rodrigues
2019-08-03  0:23   ` Darrick J. Wong
2019-08-04 23:52   ` Dave Chinner
2019-08-02 22:00 ` [PATCH 03/13] btrfs: Eliminate PagePrivate for btrfs data pages Goldwyn Rodrigues
2019-08-02 22:00 ` Goldwyn Rodrigues [this message]
2019-08-05  0:11   ` [PATCH 04/13] btrfs: Add a simple buffered iomap write Dave Chinner
2019-08-22 15:05     ` Goldwyn Rodrigues
2019-08-02 22:00 ` [PATCH 05/13] btrfs: Add CoW in iomap based writes Goldwyn Rodrigues
2019-08-05  0:13   ` Dave Chinner
2019-08-22 15:01     ` Goldwyn Rodrigues
2019-08-02 22:00 ` [PATCH 06/13] btrfs: remove buffered write code made unnecessary Goldwyn Rodrigues
2019-08-02 22:00 ` [PATCH 07/13] btrfs: basic direct read operation Goldwyn Rodrigues
2019-08-12 12:32   ` RITESH HARJANI
2019-08-22 15:00     ` Goldwyn Rodrigues
2019-08-02 22:00 ` [PATCH 08/13] btrfs: Carve out btrfs_get_extent_map_write() out of btrfs_get_blocks_write() Goldwyn Rodrigues
2019-08-02 22:00 ` [PATCH 09/13] btrfs: Rename __endio_write_update_ordered() to btrfs_update_ordered_extent() Goldwyn Rodrigues
2019-08-02 22:00 ` [PATCH 10/13] iomap: use a function pointer for dio submits Goldwyn Rodrigues
2019-08-03  0:21   ` Darrick J. Wong
2019-08-05 16:08     ` Goldwyn Rodrigues
2019-08-04 23:43   ` Dave Chinner
2019-08-05 16:08     ` Goldwyn Rodrigues
2019-08-05 21:54       ` Dave Chinner
2019-08-08  4:26         ` Gao Xiang
2019-08-08  4:26           ` Gao Xiang
2019-08-08  4:52           ` Gao Xiang
2019-08-08  4:52             ` Gao Xiang
2019-08-08  5:49           ` Eric Biggers
2019-08-08  5:49             ` Eric Biggers
2019-08-08  6:28             ` Gao Xiang
2019-08-08  6:28               ` Gao Xiang
2019-08-08  8:16             ` Dave Chinner
2019-08-08  8:16               ` Dave Chinner
2019-08-08  8:57               ` Gao Xiang
2019-08-08  8:57                 ` Gao Xiang
2019-08-08  9:29               ` Gao Xiang
2019-08-08  9:29                 ` Gao Xiang
2019-08-08 11:21                 ` Gao Xiang
2019-08-08 11:21                   ` Gao Xiang
2019-08-08 13:11                   ` Gao Xiang
2019-08-08 13:11                     ` Gao Xiang
2019-08-09 20:45             ` Matthew Wilcox
2019-08-09 20:45               ` Matthew Wilcox
2019-08-09 23:45               ` Gao Xiang
2019-08-09 23:45                 ` Gao Xiang
2019-08-10  0:31                 ` Eric Biggers
2019-08-10  0:31                   ` Eric Biggers
2019-08-10  0:50                   ` Eric Biggers
2019-08-10  0:50                     ` Eric Biggers
2019-08-10  1:34                     ` Gao Xiang
2019-08-10  1:34                       ` Gao Xiang
2019-08-10  1:13                   ` Gao Xiang
2019-08-10  1:13                     ` Gao Xiang
2019-08-10  0:17               ` Eric Biggers
2019-08-10  0:17                 ` Eric Biggers
2019-08-02 22:00 ` [PATCH 11/13] btrfs: Use iomap_dio_rw for performing direct I/O writes Goldwyn Rodrigues
2019-08-02 22:00 ` [PATCH 12/13] btrfs: Remove btrfs_dio_data and __btrfs_direct_write Goldwyn Rodrigues
2019-08-02 22:00 ` [PATCH 13/13] btrfs: update inode size during bio completion Goldwyn Rodrigues

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20190802220048.16142-5-rgoldwyn@suse.de \
    --to=rgoldwyn@suse.de \
    --cc=darrick.wong@oracle.com \
    --cc=hch@lst.de \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=rgoldwyn@suse.com \
    --cc=ruansy.fnst@cn.fujitsu.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.