All of lore.kernel.org
 help / color / mirror / Atom feed
From: Chandan Rajendra <chandan@linux.vnet.ibm.com>
To: clm@fb.com, jbacik@fb.com, bo.li.liu@oracle.com, dsterba@suse.cz
Cc: Chandan Rajendra <chandan@linux.vnet.ibm.com>,
	linux-btrfs@vger.kernel.org, aneesh.kumar@linux.vnet.ibm.com
Subject: [RFC PATCH V2 1/8] Btrfs: subpagesize-blocksize: Get rid of whole page reads.
Date: Wed, 11 Jun 2014 17:02:14 +0530	[thread overview]
Message-ID: <1402486341-592-2-git-send-email-chandan@linux.vnet.ibm.com> (raw)
In-Reply-To: <1402486341-592-1-git-send-email-chandan@linux.vnet.ibm.com>

Based on original patch from Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>

bio_vec->{bv_offset, bv_len} cannot be relied upon by the end bio functions
to track the file offset range operated on by the bio. Hence this patch adds
two new members to 'struct btrfs_io_bio' to track the file offset range.

This patch also brings back check_page_locked() to reliably unlock pages in
readpage's end bio function.

Signed-off-by: Chandan Rajendra <chandan@linux.vnet.ibm.com>
---
 fs/btrfs/extent_io.c | 200 ++++++++++++++++++++++-----------------------------
 fs/btrfs/volumes.h   |   3 +
 2 files changed, 90 insertions(+), 113 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fbe501d..fa28545 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1943,15 +1943,29 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
  * helper function to set a given page up to date if all the
  * extents in the tree for that page are up to date
  */
-static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
+static void check_page_uptodate(struct extent_io_tree *tree, struct page *page,
+				struct extent_state *cached)
 {
 	u64 start = page_offset(page);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
-	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
+	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, cached))
 		SetPageUptodate(page);
 }
 
 /*
+ * helper function to unlock a page if all the extents in the tree
+ * for that page are unlocked
+ */
+static void check_page_locked(struct extent_io_tree *tree, struct page *page)
+{
+	u64 start = page_offset(page);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+
+	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
+		unlock_page(page);
+	}
+}
+
  * When IO fails, either with EIO or csum verification fails, we
  * try other mirrors that might have a good copy of the data.  This
  * io_failure_record is used to record state as we go through all the
@@ -2173,6 +2187,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	struct bio *bio;
 	struct btrfs_io_bio *btrfs_failed_bio;
 	struct btrfs_io_bio *btrfs_bio;
+	int nr_sectors;
 	int num_copies;
 	int ret;
 	int read_mode;
@@ -2267,7 +2282,8 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	 *	a) deliver good data to the caller
 	 *	b) correct the bad sectors on disk
 	 */
-	if (failed_bio->bi_vcnt > 1) {
+	nr_sectors = btrfs_io_bio(failed_bio)->len >> inode->i_sb->s_blocksize_bits;
+	if (nr_sectors > 1) {
 		/*
 		 * to fulfill b), we need to know the exact failing sectors, as
 		 * we don't want to rewrite any more than the failed ones. thus,
@@ -2314,6 +2330,8 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 	bio->bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 	bio->bi_size = 0;
+	btrfs_io_bio(bio)->start_offset = start;
+	btrfs_io_bio(bio)->len = end - start + 1;
 
 	btrfs_failed_bio = btrfs_io_bio(failed_bio);
 	if (btrfs_failed_bio->csum) {
@@ -2414,18 +2432,6 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 	bio_put(bio);
 }
 
-static void
-endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
-			      int uptodate)
-{
-	struct extent_state *cached = NULL;
-	u64 end = start + len - 1;
-
-	if (uptodate && tree->track_uptodate)
-		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
-	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
-}
-
 /*
  * after a readpage IO is done, we need to:
  * clear the uptodate bits on error
@@ -2440,76 +2446,50 @@ endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
 static void end_bio_extent_readpage(struct bio *bio, int err)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
-	struct bio_vec *bvec = bio->bi_io_vec;
 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
+	struct bio_vec *bvec = bio->bi_io_vec;
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct address_space *mapping;
+	struct extent_state *cached = NULL;
 	struct extent_io_tree *tree;
-	u64 offset = 0;
+	struct btrfs_root *root;
+	struct inode *inode;
+	struct page *page;
 	u64 start;
-	u64 end;
+	u64 offset = 0;
 	u64 len;
-	u64 extent_start = 0;
-	u64 extent_len = 0;
+	int nr_sectors;
 	int mirror;
 	int ret;
 
-	if (err)
-		uptodate = 0;
+	mapping = bio->bi_io_vec->bv_page->mapping;
+	inode = mapping->host;
+	root = BTRFS_I(inode)->root;
+	tree = &BTRFS_I(inode)->io_tree;
 
-	do {
-		struct page *page = bvec->bv_page;
-		struct inode *inode = page->mapping->host;
-
-		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
-			 "mirror=%lu\n", (u64)bio->bi_sector, err,
-			 io_bio->mirror_num);
-		tree = &BTRFS_I(inode)->io_tree;
-
-		/* We always issue full-page reads, but if some block
-		 * in a page fails to read, blk_update_request() will
-		 * advance bv_offset and adjust bv_len to compensate.
-		 * Print a warning for nonzero offsets, and an error
-		 * if they don't add up to a full page.  */
-		if (bvec->bv_offset || bvec->bv_len != PAGE_CACHE_SIZE) {
-			if (bvec->bv_offset + bvec->bv_len != PAGE_CACHE_SIZE)
-				btrfs_err(BTRFS_I(page->mapping->host)->root->fs_info,
-				   "partial page read in btrfs with offset %u and length %u",
-					bvec->bv_offset, bvec->bv_len);
-			else
-				btrfs_info(BTRFS_I(page->mapping->host)->root->fs_info,
-				   "incomplete page read in btrfs with offset %u and "
-				   "length %u",
-					bvec->bv_offset, bvec->bv_len);
-		}
+	start = btrfs_io_bio(bio)->start_offset;
+	len = btrfs_io_bio(bio)->len;
+	mirror = io_bio->mirror_num;
 
-		start = page_offset(page);
-		end = start + bvec->bv_offset + bvec->bv_len - 1;
-		len = bvec->bv_len;
+	nr_sectors = len >> inode->i_sb->s_blocksize_bits;
+	BUG_ON(!nr_sectors);
 
-		if (++bvec <= bvec_end)
-			prefetchw(&bvec->bv_page->flags);
+	do {
+		BUG_ON(bvec > bvec_end);
+		page = bvec->bv_page;
 
-		mirror = io_bio->mirror_num;
-		if (likely(uptodate && tree->ops &&
-			   tree->ops->readpage_end_io_hook)) {
+		if (uptodate) {
 			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
-							      page, start, end,
-							      mirror);
+							page, start,
+							start + root->sectorsize - 1,
+							mirror);
 			if (ret)
 				uptodate = 0;
 			else
 				clean_io_failure(start, page);
 		}
 
-		if (likely(uptodate))
-			goto readpage_ok;
-
-		if (tree->ops && tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(page, mirror);
-			if (!ret && !err &&
-			    test_bit(BIO_UPTODATE, &bio->bi_flags))
-				uptodate = 1;
-		} else {
+		if (!uptodate) {
 			/*
 			 * The generic bio_readpage_error handles errors the
 			 * following way: If possible, new read requests are
@@ -2520,63 +2500,38 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			 * can't handle the error it will return -EIO and we
 			 * remain responsible for that page.
 			 */
-			ret = bio_readpage_error(bio, offset, page, start, end,
-						 mirror);
+			ret = bio_readpage_error(bio, offset, page,
+						start, start + root->sectorsize - 1,
+						mirror);
 			if (ret == 0) {
-				uptodate =
-					test_bit(BIO_UPTODATE, &bio->bi_flags);
+				uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
 					uptodate = 0;
-				continue;
+				goto next_block;
 			}
 		}
-readpage_ok:
-		if (likely(uptodate)) {
-			loff_t i_size = i_size_read(inode);
-			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
-			unsigned offset;
-
-			/* Zero out the end if this page straddles i_size */
-			offset = i_size & (PAGE_CACHE_SIZE-1);
-			if (page->index == end_index && offset)
-				zero_user_segment(page, offset, PAGE_CACHE_SIZE);
-			SetPageUptodate(page);
+
+		if (uptodate) {
+			set_extent_uptodate(tree, start,
+					start + root->sectorsize - 1,
+					&cached, GFP_ATOMIC);
+			check_page_uptodate(tree, page, cached);
 		} else {
 			ClearPageUptodate(page);
 			SetPageError(page);
 		}
-		unlock_page(page);
-		offset += len;
-
-		if (unlikely(!uptodate)) {
-			if (extent_len) {
-				endio_readpage_release_extent(tree,
-							      extent_start,
-							      extent_len, 1);
-				extent_start = 0;
-				extent_len = 0;
-			}
-			endio_readpage_release_extent(tree, start,
-						      end - start + 1, 0);
-		} else if (!extent_len) {
-			extent_start = start;
-			extent_len = end + 1 - start;
-		} else if (extent_start + extent_len == start) {
-			extent_len += end + 1 - start;
-		} else {
-			endio_readpage_release_extent(tree, extent_start,
-						      extent_len, uptodate);
-			extent_start = start;
-			extent_len = end + 1 - start;
-		}
-	} while (bvec <= bvec_end);
 
-	if (extent_len)
-		endio_readpage_release_extent(tree, extent_start, extent_len,
-					      uptodate);
+		unlock_extent(tree, start, start + root->sectorsize - 1);
+		check_page_locked(tree, page);
+next_block:
+		offset += root->sectorsize;
+		start += root->sectorsize;
+		if ((page_offset(page) + PAGE_CACHE_SIZE) == start)
+			++bvec;
+	} while (--nr_sectors);
+
 	if (io_bio->end_io)
 		io_bio->end_io(io_bio, err);
-	bio_put(bio);
 }
 
 /*
@@ -2700,6 +2655,18 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 		else
 			contig = bio_end_sector(bio) == sector;
 
+		if (contig) {
+			/*
+			 * Check whether we are contig in file offsets.
+			 * We should mostly be for readpage/readpages.
+			 * We need to do this because we use btrfs_io_bio
+			 * start_offset and len to unlock in endio routines.
+			 */
+			if ((page_offset(page) + offset) !=
+					(btrfs_io_bio(bio)->start_offset +
+					 btrfs_io_bio(bio)->len))
+				contig = 0;
+		}
 		if (prev_bio_flags != bio_flags || !contig ||
 		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
 		    bio_add_page(bio, page, page_size, offset) < page_size) {
@@ -2709,6 +2676,11 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 				return ret;
 			bio = NULL;
 		} else {
+			/*
+			 * Update btrfs_io_bio len so that we can unlock
+			 * correctly in the end_io callback.
+			 */
+			btrfs_io_bio(bio)->len += page_size;
 			return 0;
 		}
 	}
@@ -2724,6 +2696,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
+	btrfs_io_bio(bio)->start_offset = page_offset(page) + offset;
+	btrfs_io_bio(bio)->len = page_size;
 
 	if (bio_ret)
 		*bio_ret = bio;
@@ -2914,7 +2888,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 		/* the get_extent function already copied into the page */
 		if (test_range_bit(tree, cur, cur_end,
 				   EXTENT_UPTODATE, 1, NULL)) {
-			check_page_uptodate(tree, page);
+			check_page_uptodate(tree, page, NULL);
 			if (!parent_locked)
 				unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 80754f9..fb2dbdc 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -173,6 +173,9 @@ struct btrfs_io_bio {
 	u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
 	u8 *csum_allocated;
 	btrfs_io_bio_end_io_t *end_io;
+	/* Track file offset range operated on by the bio. */
+	u64 start_offset;
+	u64 len;
 	struct bio bio;
 };
 
-- 
1.8.3.1


  reply	other threads:[~2014-06-11 11:33 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-06-11 11:32 [RFC PATCH V2 0/8] Btrfs: Subpagesize-blocksize: Get rid of whole page I/O Chandan Rajendra
2014-06-11 11:32 ` Chandan Rajendra [this message]
2014-06-11 11:32 ` [RFC PATCH V2 2/8] Btrfs: subpagesize-blocksize: Get rid of whole page writes Chandan Rajendra
2014-06-11 11:32 ` [RFC PATCH V2 3/8] Btrfs: subpagesize-blocksize: __btrfs_buffered_write: Reserve/release extents aligned to block size Chandan Rajendra
2014-06-11 11:32 ` [RFC PATCH V2 4/8] Btrfs: subpagesize-blocksize: Define extent_buffer_head Chandan Rajendra
2014-06-11 11:32 ` [RFC PATCH V2 5/8] Btrfs: subpagesize-blocksize: Read tree blocks whose size is <PAGE_CACHE_SIZE Chandan Rajendra
2014-06-11 11:32 ` [RFC PATCH V2 6/8] Btrfs: subpagesize-blocksize: Write only dirty extent buffers belonging to a page Chandan Rajendra
2014-06-11 11:32 ` [RFC PATCH V2 7/8] Btrfs: subpagesize-blocksize: Allow mounting filesystems where sectorsize != PAGE_SIZE Chandan Rajendra
2014-06-11 11:32 ` [RFC PATCH V2 8/8] Btrfs: subpagesize-blocksize: Compute and look up csums based on sectorsized blocks Chandan Rajendra

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1402486341-592-2-git-send-email-chandan@linux.vnet.ibm.com \
    --to=chandan@linux.vnet.ibm.com \
    --cc=aneesh.kumar@linux.vnet.ibm.com \
    --cc=bo.li.liu@oracle.com \
    --cc=clm@fb.com \
    --cc=dsterba@suse.cz \
    --cc=jbacik@fb.com \
    --cc=linux-btrfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.