All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v2] xfs: probe data buffer from page cache for unwritten extents
@ 2012-06-25 12:41 Jeff Liu
  2012-06-25 16:13 ` Mark Tinguely
  2012-06-26  2:38 ` Dave Chinner
  0 siblings, 2 replies; 4+ messages in thread
From: Jeff Liu @ 2012-06-25 12:41 UTC (permalink / raw)
  To: xfs

Hello,

Using the start offset rather than map->br_startoff to calculate the starting page index could
get more accurate data offset in page cache probe routine.
With this refinement, the old max_t() could be able to remove too. 

Thanks Mark for pointing this out!

-Jeff


Cc: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Jie Liu <jeff.liu@oracle.com>

---
 fs/xfs/xfs_file.c |  155 +++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 files changed, 150 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9f7ec15..113190b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
+#include <linux/pagevec.h>
 
 static const struct vm_operations_struct xfs_file_vm_ops;
 
@@ -966,6 +967,108 @@ xfs_vm_page_mkwrite(
 	return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 }
 
+/*
+ * Probe the data buffer offset in page cache for unwritten extents.
+ * Iterate each page to find out if a buffer head state has BH_Unwritten
+ * or BH_Uptodate set.
+ */
+STATIC bool
+xfs_has_unwritten_buffer(
+	struct inode		*inode,
+	struct xfs_bmbt_irec	*map,
+	loff_t			*offset)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+	struct pagevec		pvec;
+	pgoff_t			index;
+	pgoff_t			end;
+	bool			found = false;
+
+	pagevec_init(&pvec, 0);
+
+	index = XFS_FSB_TO_B(mp, XFS_B_TO_FSBT(mp, *offset))
+			     >> PAGE_CACHE_SHIFT;
+	end = XFS_FSB_TO_B(mp, map->br_startoff + map->br_blockcount)
+			   >> PAGE_CACHE_SHIFT;
+	do {
+		unsigned int	i;
+		unsigned	nr_pages;
+		int		want = min_t(pgoff_t, end - index,
+					     (pgoff_t)PAGEVEC_SIZE - 1) + 1;
+		nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
+					  want);
+		if (nr_pages == 0)
+			break;
+
+		for (i = 0; i < nr_pages; i++) {
+			struct page		*page = pvec.pages[i];
+			struct buffer_head	*bh;
+			struct buffer_head	*head;
+			xfs_fileoff_t		last;
+
+			/*
+			 * There is no need to check the following pages
+			 * if the current page offset is out of range.
+			 */
+			if (page->index > end)
+				goto out;
+
+			if (!trylock_page(page))
+				goto out;
+
+			if (!page_has_buffers(page)) {
+				unlock_page(page);
+				continue;
+			}
+
+			last = XFS_B_TO_FSBT(mp,
+					     page->index << PAGE_CACHE_SHIFT);
+			bh = head = page_buffers(page);
+			do {
+				/*
+				 * An extent in XFS_EXT_UNWRITTEN has disk
+				 * blocks already mapped to it, but no data
+				 * has been committed to them yet.  If it has
+				 * dirty data in the page cache it can be
+				 * identified by having BH_Unwritten set in
+				 * each buffers.  Also, the buffer head state
+				 * might be in BH_Uptodate too if the buffer
+				 * writeback procedure was fired, we need to
+				 * examine it as well.
+				 */
+				if ((buffer_unwritten(bh) ||
+				     buffer_uptodate(bh)) &&
+				     *offset <= XFS_FSB_TO_B(mp, last)) {
+					found = true;
+					*offset = XFS_FSB_TO_B(mp, last);
+					unlock_page(page);
+					goto out;
+				}
+				last++;
+			} while ((bh = bh->b_this_page) != head);
+			unlock_page(page);
+		}
+
+		/*
+		 * If the number of probed pages less than our desired,
+		 * there should no more pages mapped, search done.
+		 */
+		if (nr_pages < want)
+			break;
+
+		index = pvec.pages[i - 1]->index + 1;
+		pagevec_release(&pvec);
+	} while (index < end);
+
+out:
+	pagevec_release(&pvec);
+	if (!found)
+		*offset = 0;
+
+	return found;
+}
+
 STATIC loff_t
 xfs_seek_data(
 	struct file		*file,
@@ -1009,19 +1112,61 @@ xfs_seek_data(
 	 * Treat unwritten extent as data extent since it might
 	 * contains dirty data in page cache.
 	 */
-	if (map[0].br_startblock != HOLESTARTBLOCK) {
-		offset = max_t(loff_t, start,
-			       XFS_FSB_TO_B(mp, map[0].br_startoff));
+	if (map[0].br_state == XFS_EXT_NORM &&
+	    !isnullstartblock(map[0].br_startblock)) {
+		offset = start;
 	} else {
+		/*
+		 * Landed in an unwritten extent, try to find out
+		 * the data buffer offset from page cache firstly.
+		 * Treat it as a hole if nothing was found, and
+		 * skip to check the next extent.
+		 */
+		if (map[0].br_startblock == DELAYSTARTBLOCK ||
+		    map[0].br_state == XFS_EXT_UNWRITTEN) {
+			/* Probing page cache start from offset */
+			offset = start;
+			if (xfs_has_unwritten_buffer(inode, &map[0],
+						     &offset))
+				goto out;
+		}
+
+		/*
+		 * Found a hole in map[0] and nothing in map[1].
+		 * Probably means that we are reading after EOF.
+		 */
 		if (nmap == 1) {
 			error = ENXIO;
 			goto out_unlock;
 		}
 
-		offset = max_t(loff_t, start,
-			       XFS_FSB_TO_B(mp, map[1].br_startoff));
+		/*
+		 * We have two mappings, and need to check map[1] to
+		 * see if there is data or not.
+		 */
+		if (map[1].br_state == XFS_EXT_NORM &&
+		    !isnullstartblock(map[1].br_startblock)) {
+			offset = XFS_FSB_TO_B(mp, map[1].br_startoff);
+		} else {
+			if (map[1].br_startblock == DELAYSTARTBLOCK ||
+			    map[1].br_state == XFS_EXT_UNWRITTEN) {
+				/* Probing page cache start from offset */
+				offset = start;
+				if (xfs_has_unwritten_buffer(inode, &map[1],
+							     &offset))
+					goto out;
+			}
+			/*
+			 * xfs_bmapi_read() can handle repeated hole regions,
+			 * hence it should not return two extents both are
+			 * holes.  If the 2nd extent is unwritten, there must
+			 * have data buffer resides in page cache.
+			 */
+			BUG();
+		}
 	}
 
+out:
 	if (offset != file->f_pos)
 		file->f_pos = offset;
 
-- 
1.7.9

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] xfs: probe data buffer from page cache for unwritten extents
  2012-06-25 12:41 [PATCH v2] xfs: probe data buffer from page cache for unwritten extents Jeff Liu
@ 2012-06-25 16:13 ` Mark Tinguely
  2012-06-26  2:38 ` Dave Chinner
  1 sibling, 0 replies; 4+ messages in thread
From: Mark Tinguely @ 2012-06-25 16:13 UTC (permalink / raw)
  To: jeff.liu; +Cc: xfs

On 06/25/12 07:41, Jeff Liu wrote:
> Hello,
>
> Using the start offset rather than map->br_startoff to calculate the starting page index could
> get more accurate data offset in page cache probe routine.
> With this refinement, the old max_t() could be able to remove too.
>
> Thanks Mark for pointing this out!
>
> -Jeff
>
>
> Cc: Mark Tinguely<tinguely@sgi.com>
> Signed-off-by: Jie Liu<jeff.liu@oracle.com>
>
> ---

> +			/*
> +			 * xfs_bmapi_read() can handle repeated hole regions,
> +			 * hence it should not return two extents both are
> +			 * holes.  If the 2nd extent is unwritten, there must
> +			 * have data buffer resides in page cache.
> +			 */
> +			BUG();


Looks great.

I hit the BUG() using a test with the following test:

  hole
  unwritten (treated as a hole)
  unwritten (treated as a hole)
  page of data.

I will send the current version of the test program.

--Mark.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] xfs: probe data buffer from page cache for unwritten extents
  2012-06-25 12:41 [PATCH v2] xfs: probe data buffer from page cache for unwritten extents Jeff Liu
  2012-06-25 16:13 ` Mark Tinguely
@ 2012-06-26  2:38 ` Dave Chinner
  2012-06-26  6:45   ` Jeff Liu
  1 sibling, 1 reply; 4+ messages in thread
From: Dave Chinner @ 2012-06-26  2:38 UTC (permalink / raw)
  To: Jeff Liu; +Cc: xfs

On Mon, Jun 25, 2012 at 08:41:31PM +0800, Jeff Liu wrote:
> Hello,
> 
> Using the start offset rather than map->br_startoff to calculate the starting page index could
> get more accurate data offset in page cache probe routine.
> With this refinement, the old max_t() could be able to remove too. 
....
> +			}
> +			/*
> +			 * xfs_bmapi_read() can handle repeated hole regions,
> +			 * hence it should not return two extents both are
> +			 * holes.  If the 2nd extent is unwritten, there must
> +			 * have data buffer resides in page cache.
> +			 */
> +			BUG();

That's wrong. A hole can be up to 32bits in length. When the hole is
longer than that, you'll get two extents that are holes. Try working
with sparse files that have holes in the order of a 100TB in them...

Also, as I've said before - BUG() does not belong in filesystem code
that can return an error. Shut the filesystem down with an in-memory
corruption error and maybe put an ASSERT(0) there so debug kernels
trip over it. However, no filesystem "can not happen" logic error is
a reason to panic a production machine.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH v2] xfs: probe data buffer from page cache for unwritten extents
  2012-06-26  2:38 ` Dave Chinner
@ 2012-06-26  6:45   ` Jeff Liu
  0 siblings, 0 replies; 4+ messages in thread
From: Jeff Liu @ 2012-06-26  6:45 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Mark Tinguely, xfs

Hi Mark and Dave,

Thanks for both of your comments.
On 06/26/2012 10:38 AM, Dave Chinner wrote:

> On Mon, Jun 25, 2012 at 08:41:31PM +0800, Jeff Liu wrote:
>> Hello,
>>
>> Using the start offset rather than map->br_startoff to calculate the starting page index could
>> get more accurate data offset in page cache probe routine.
>> With this refinement, the old max_t() could be able to remove too. 
> ....
>> +			}
>> +			/*
>> +			 * xfs_bmapi_read() can handle repeated hole regions,
>> +			 * hence it should not return two extents both are
>> +			 * holes.  If the 2nd extent is unwritten, there must
>> +			 * have data buffer resides in page cache.
>> +			 */
>> +			BUG();
> 
> That's wrong. A hole can be up to 32bits in length. When the hole is
> longer than that, you'll get two extents that are holes. Try working
> with sparse files that have holes in the order of a 100TB in them...

I recalled we have verified that xfs_bmapi_read() can handle repeated
hole extents since the extent length in memory is 64bits which is
defined at:
struct xfs_bmbt_irec {
....
xfs_filblks_t   br_blockcount;
};

I can reproduce that issue with Mark's test case, simply by creating a
file with xfs_io -F -f -c "truncate 200M" -c "falloc $((50 << 20)) 50m"
-c "falloc $((100 << 20) 50m" -c "pwrite $((150 << 20)) 50m"

So the file mapping is:
 0-50m          50m-100m                100m-150m       150m-200m
[hole | unwritten_without_data | unwritten_without_data | data]

Current code logic will hit BUG() as the first unwritten extent has no
data buffer.

I have to do xfs_bmap_read() in a loop as before.

> 
> Also, as I've said before - BUG() does not belong in filesystem code
> that can return an error. Shut the filesystem down with an in-memory
> corruption error and maybe put an ASSERT(0) there so debug kernels
> trip over it. However, no filesystem "can not happen" logic error is
> a reason to panic a production machine.

Thanks for this teaching again.


Regards,
-Jeff

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2012-06-26  6:45 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-06-25 12:41 [PATCH v2] xfs: probe data buffer from page cache for unwritten extents Jeff Liu
2012-06-25 16:13 ` Mark Tinguely
2012-06-26  2:38 ` Dave Chinner
2012-06-26  6:45   ` Jeff Liu

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.