All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
@ 2017-04-28 19:46 Darrick J. Wong
  2017-05-01 18:32 ` Brian Foster
  2017-05-02  7:44 ` Christoph Hellwig
  0 siblings, 2 replies; 12+ messages in thread
From: Darrick J. Wong @ 2017-04-28 19:46 UTC (permalink / raw)
  To: xfs; +Cc: Christoph Hellwig, Brian Foster

Currently, the dir2 leaf block getdents function uses a complex state
tracking mechanism to create a shadow copy of the block mappings and
then uses the shadow copy to schedule readahead.  Since the read and
readahead functions are perfectly capable of reading the mappings
themselves, we can tear all that out in favor of a simpler function that
simply keeps pushing the readahead window further out.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
v3: use sliding window to constrain the amount of readahead
v2: fix readahead of more than ra_want
---
 fs/xfs/xfs_dir2_readdir.c |  316 ++++++++++++---------------------------------
 1 file changed, 82 insertions(+), 234 deletions(-)

diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 20b7a5c..d05c1ec 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -243,214 +243,98 @@ xfs_dir2_block_getdents(
 	return 0;
 }
 
-struct xfs_dir2_leaf_map_info {
-	xfs_extlen_t	map_blocks;	/* number of fsbs in map */
-	xfs_dablk_t	map_off;	/* last mapped file offset */
-	int		map_size;	/* total entries in *map */
-	int		map_valid;	/* valid entries in *map */
-	int		nmap;		/* mappings to ask xfs_bmapi */
-	xfs_dir2_db_t	curdb;		/* db for current block */
-	int		ra_current;	/* number of read-ahead blks */
-	int		ra_index;	/* *map index for read-ahead */
-	int		ra_offset;	/* map entry offset for ra */
-	int		ra_want;	/* readahead count wanted */
-	struct xfs_bmbt_irec map[];	/* map vector for blocks */
-};
-
+/*
+ * Read a directory block and initiate readahead for blocks beyond that.
+ * We maintain a sliding readahead window of the remaining space in the
+ * buffer rounded up to the nearest block.
+ */
 STATIC int
 xfs_dir2_leaf_readbuf(
 	struct xfs_da_args	*args,
 	size_t			bufsize,
-	struct xfs_dir2_leaf_map_info *mip,
-	xfs_dir2_off_t		*curoff,
-	struct xfs_buf		**bpp,
-	bool			trim_map)
+	xfs_dir2_off_t		*cur_off,
+	xfs_dablk_t		*ra_blk,
+	struct xfs_buf		**bpp)
 {
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_buf		*bp = NULL;
-	struct xfs_bmbt_irec	*map = mip->map;
+	struct xfs_da_geometry	*geo = args->geo;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
+	struct xfs_bmbt_irec	map;
 	struct blk_plug		plug;
+	xfs_dir2_off_t		new_off;
+	xfs_dablk_t		next_ra;
+	xfs_dablk_t		map_off;
+	xfs_dablk_t		last_da;
+	xfs_extnum_t		idx;
+	bool			found;
+	int			ra_want;
 	int			error = 0;
-	int			length;
-	int			i;
-	int			j;
-	struct xfs_da_geometry	*geo = args->geo;
-
-	/*
-	 * If the caller just finished processing a buffer, it will tell us
-	 * we need to trim that block out of the mapping now it is done.
-	 */
-	if (trim_map) {
-		mip->map_blocks -= geo->fsbcount;
-		/*
-		 * Loop to get rid of the extents for the
-		 * directory block.
-		 */
-		for (i = geo->fsbcount; i > 0; ) {
-			j = min_t(int, map->br_blockcount, i);
-			map->br_blockcount -= j;
-			map->br_startblock += j;
-			map->br_startoff += j;
-			/*
-			 * If mapping is done, pitch it from
-			 * the table.
-			 */
-			if (!map->br_blockcount && --mip->map_valid)
-				memmove(&map[0], &map[1],
-					sizeof(map[0]) * mip->map_valid);
-			i -= j;
-		}
-	}
-
-	/*
-	 * Recalculate the readahead blocks wanted.
-	 */
-	mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1;
-	ASSERT(mip->ra_want >= 0);
-
-	/*
-	 * If we don't have as many as we want, and we haven't
-	 * run out of data blocks, get some more mappings.
-	 */
-	if (1 + mip->ra_want > mip->map_blocks &&
-	    mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) {
-		/*
-		 * Get more bmaps, fill in after the ones
-		 * we already have in the table.
-		 */
-		mip->nmap = mip->map_size - mip->map_valid;
-		error = xfs_bmapi_read(dp, mip->map_off,
-				xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) -
-								mip->map_off,
-				&map[mip->map_valid], &mip->nmap, 0);
 
-		/*
-		 * Don't know if we should ignore this or try to return an
-		 * error.  The trouble with returning errors is that readdir
-		 * will just stop without actually passing the error through.
-		 */
+	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
+		error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK);
 		if (error)
-			goto out;	/* XXX */
-
-		/*
-		 * If we got all the mappings we asked for, set the final map
-		 * offset based on the last bmap value received.  Otherwise,
-		 * we've reached the end.
-		 */
-		if (mip->nmap == mip->map_size - mip->map_valid) {
-			i = mip->map_valid + mip->nmap - 1;
-			mip->map_off = map[i].br_startoff + map[i].br_blockcount;
-		} else
-			mip->map_off = xfs_dir2_byte_to_da(geo,
-							XFS_DIR2_LEAF_OFFSET);
-
-		/*
-		 * Look for holes in the mapping, and eliminate them.  Count up
-		 * the valid blocks.
-		 */
-		for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
-			if (map[i].br_startblock == HOLESTARTBLOCK) {
-				mip->nmap--;
-				length = mip->map_valid + mip->nmap - i;
-				if (length)
-					memmove(&map[i], &map[i + 1],
-						sizeof(map[i]) * length);
-			} else {
-				mip->map_blocks += map[i].br_blockcount;
-				i++;
-			}
-		}
-		mip->map_valid += mip->nmap;
+			goto out;
 	}
 
 	/*
-	 * No valid mappings, so no more data blocks.
+	 * Look for mapped directory blocks at or above the current offset.
+	 * Truncate down to the nearest directory block to start the scanning
+	 * operation.
 	 */
-	if (!mip->map_valid) {
-		*curoff = xfs_dir2_da_to_byte(geo, mip->map_off);
+	last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
+	map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off));
+	found = xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map);
+	if (!found || map.br_startoff >= last_da)
 		goto out;
-	}
+	xfs_trim_extent(&map, map_off, last_da - map_off);
 
-	/*
-	 * Read the directory block starting at the first mapping.
-	 */
-	mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff);
-	error = xfs_dir3_data_read(NULL, dp, map->br_startoff,
-			map->br_blockcount >= geo->fsbcount ?
-			    XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) :
-			    -1, &bp);
-	/*
-	 * Should just skip over the data block instead of giving up.
-	 */
+	/* Read the directory block of that first mapping. */
+	new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
+	if (new_off > *cur_off)
+		*cur_off = new_off;
+	error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, -1, &bp);
 	if (error)
-		goto out;	/* XXX */
-
-	/*
-	 * Adjust the current amount of read-ahead: we just read a block that
-	 * was previously ra.
-	 */
-	if (mip->ra_current)
-		mip->ra_current -= geo->fsbcount;
+		goto out;
 
 	/*
-	 * Do we need more readahead?
-	 * Each loop tries to process 1 full dir blk; last may be partial.
+	 * Start readahead for the next bufsize's worth of dir data blocks.
+	 * We may have already issued readahead for some of that range;
+	 * ra_blk tracks the last block we tried to read(ahead).
 	 */
+	ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
+	if (*ra_blk >= last_da)
+		goto out;
+	else if (*ra_blk == 0)
+		*ra_blk = map.br_startoff;
+	next_ra = map.br_startoff + geo->fsbcount;
+	if (next_ra >= last_da)
+		goto out_no_ra;
+	found = xfs_iext_lookup_extent(dp, ifp, next_ra, &idx, &map);
+	if (!found || map.br_startoff >= last_da)
+		goto out_no_ra;
+	xfs_trim_extent(&map, next_ra, last_da - next_ra);
+ 
+	/* Start ra for each dir (not fs) block that has a mapping. */
 	blk_start_plug(&plug);
-	for (mip->ra_index = mip->ra_offset = i = 0;
-	     mip->ra_want > mip->ra_current && i < mip->map_blocks;
-	     i += geo->fsbcount) {
-		ASSERT(mip->ra_index < mip->map_valid);
-		/*
-		 * Read-ahead a contiguous directory block.
-		 */
-		if (i > mip->ra_current &&
-		    (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
-		    geo->fsbcount) {
-			xfs_dir3_data_readahead(dp,
-				map[mip->ra_index].br_startoff + mip->ra_offset,
-				XFS_FSB_TO_DADDR(dp->i_mount,
-					map[mip->ra_index].br_startblock +
-							mip->ra_offset));
-			mip->ra_current = i;
-		}
-
-		/*
-		 * Read-ahead a non-contiguous directory block.  This doesn't
-		 * use our mapping, but this is a very rare case.
-		 */
-		else if (i > mip->ra_current) {
-			xfs_dir3_data_readahead(dp,
-					map[mip->ra_index].br_startoff +
-							mip->ra_offset, -1);
-			mip->ra_current = i;
-		}
-
-		/*
-		 * Advance offset through the mapping table, processing a full
-		 * dir block even if it is fragmented into several extents.
-		 * But stop if we have consumed all valid mappings, even if
-		 * it's not yet a full directory block.
-		 */
-		for (j = 0;
-		     j < geo->fsbcount && mip->ra_index < mip->map_valid;
-		     j += length ) {
-			/*
-			 * The rest of this extent but not more than a dir
-			 * block.
-			 */
-			length = min_t(int, geo->fsbcount - j,
-					map[mip->ra_index].br_blockcount -
-							mip->ra_offset);
-			mip->ra_offset += length;
-
-			/*
-			 * Advance to the next mapping if this one is used up.
-			 */
-			if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
-				mip->ra_offset = 0;
-				mip->ra_index++;
+	while (ra_want > 0) {
+		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
+		while (ra_want > 0 &&
+		       next_ra < map.br_startoff + map.br_blockcount) {
+			if (next_ra >= last_da) {
+				*ra_blk = last_da;
+				break;
+			} else if (next_ra > *ra_blk) {
+				xfs_dir3_data_readahead(dp, next_ra, -2);
+				*ra_blk = next_ra;
 			}
+			ra_want -= geo->fsbcount;
+			next_ra += geo->fsbcount;
+		}
+		found = xfs_iext_get_extent(ifp, ++idx, &map);
+		if (!found) {
+			*ra_blk = last_da;
+			break;
 		}
 	}
 	blk_finish_plug(&plug);
@@ -458,6 +342,9 @@ xfs_dir2_leaf_readbuf(
 out:
 	*bpp = bp;
 	return error;
+out_no_ra:
+	*ra_blk = last_da;
+	goto out;
 }
 
 /*
@@ -475,14 +362,14 @@ xfs_dir2_leaf_getdents(
 	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
 	xfs_dir2_data_entry_t	*dep;		/* data entry */
 	xfs_dir2_data_unused_t	*dup;		/* unused entry */
-	int			error = 0;	/* error return value */
-	int			length;		/* temporary length value */
-	int			byteoff;	/* offset in current block */
-	xfs_dir2_off_t		curoff;		/* current overall offset */
-	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
 	char			*ptr = NULL;	/* pointer to current data */
-	struct xfs_dir2_leaf_map_info *map_info;
 	struct xfs_da_geometry	*geo = args->geo;
+	xfs_dablk_t		rablk = 0;	/* current readahead block */
+	xfs_dir2_off_t		curoff;		/* current overall offset */
+	int			length;		/* temporary length value */
+	int			byteoff;	/* offset in current block */
+	int			lock_mode;
+	int			error = 0;	/* error return value */
 
 	/*
 	 * If the offset is at or past the largest allowed value,
@@ -492,30 +379,12 @@ xfs_dir2_leaf_getdents(
 		return 0;
 
 	/*
-	 * Set up to bmap a number of blocks based on the caller's
-	 * buffer size, the directory block size, and the filesystem
-	 * block size.
-	 */
-	length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
-	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
-				(length * sizeof(struct xfs_bmbt_irec)),
-			       KM_SLEEP | KM_NOFS);
-	map_info->map_size = length;
-
-	/*
 	 * Inside the loop we keep the main offset value as a byte offset
 	 * in the directory file.
 	 */
 	curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
 
 	/*
-	 * Force this conversion through db so we truncate the offset
-	 * down to get the start of the data block.
-	 */
-	map_info->map_off = xfs_dir2_db_to_da(geo,
-					      xfs_dir2_byte_to_db(geo, curoff));
-
-	/*
 	 * Loop over directory entries until we reach the end offset.
 	 * Get more blocks and readahead as necessary.
 	 */
@@ -527,38 +396,18 @@ xfs_dir2_leaf_getdents(
 		 * current buffer, need to get another one.
 		 */
 		if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
-			int	lock_mode;
-			bool	trim_map = false;
-
 			if (bp) {
-				xfs_trans_brelse(NULL, bp);
+				xfs_trans_brelse(args->trans, bp);
 				bp = NULL;
-				trim_map = true;
 			}
 
 			lock_mode = xfs_ilock_data_map_shared(dp);
-			error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
-						      &curoff, &bp, trim_map);
+			error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
+					&rablk, &bp);
 			xfs_iunlock(dp, lock_mode);
-			if (error || !map_info->map_valid)
+			if (error || !bp)
 				break;
 
-			/*
-			 * Having done a read, we need to set a new offset.
-			 */
-			newoff = xfs_dir2_db_off_to_byte(geo,
-							 map_info->curdb, 0);
-			/*
-			 * Start of the current block.
-			 */
-			if (curoff < newoff)
-				curoff = newoff;
-			/*
-			 * Make sure we're in the right block.
-			 */
-			else if (curoff > newoff)
-				ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
-				       map_info->curdb);
 			hdr = bp->b_addr;
 			xfs_dir3_data_check(dp, bp);
 			/*
@@ -643,7 +492,6 @@ xfs_dir2_leaf_getdents(
 		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
 	else
 		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
-	kmem_free(map_info);
 	if (bp)
 		xfs_trans_brelse(NULL, bp);
 	return error;

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
  2017-04-28 19:46 [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness Darrick J. Wong
@ 2017-05-01 18:32 ` Brian Foster
  2017-05-01 21:50   ` Darrick J. Wong
  2017-05-02  7:44 ` Christoph Hellwig
  1 sibling, 1 reply; 12+ messages in thread
From: Brian Foster @ 2017-05-01 18:32 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs, Christoph Hellwig

On Fri, Apr 28, 2017 at 12:46:52PM -0700, Darrick J. Wong wrote:
> Currently, the dir2 leaf block getdents function uses a complex state
> tracking mechanism to create a shadow copy of the block mappings and
> then uses the shadow copy to schedule readahead.  Since the read and
> readahead functions are perfectly capable of reading the mappings
> themselves, we can tear all that out in favor of a simpler function that
> simply keeps pushing the readahead window further out.
> 
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---
> v3: use sliding window to constrain the amount of readahead
> v2: fix readahead of more than ra_want
> ---

Thanks for the updates. This looks much simpler and seems true to
the current readahead behavior. The code also looks fine to me (one bit
of whitespace damage noted below).

That aside, have you happened to test this against a huge/ugly directory
to verify it works as expected? Note that I don't think in depth
performance analysis is required. Verification of any kind of dir that
is known to benefit from readahead is probably sufficient IMO. Perhaps
dm-delay with a small enough latency to allow us to measure the effect
of readahead could help us here.

>  fs/xfs/xfs_dir2_readdir.c |  316 ++++++++++++---------------------------------
>  1 file changed, 82 insertions(+), 234 deletions(-)
> 
> diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
> index 20b7a5c..d05c1ec 100644
> --- a/fs/xfs/xfs_dir2_readdir.c
> +++ b/fs/xfs/xfs_dir2_readdir.c
> @@ -243,214 +243,98 @@ xfs_dir2_block_getdents(
>  	return 0;
>  }
>  
...
>  	/*
> -	 * Do we need more readahead?
> -	 * Each loop tries to process 1 full dir blk; last may be partial.
> +	 * Start readahead for the next bufsize's worth of dir data blocks.
> +	 * We may have already issued readahead for some of that range;
> +	 * ra_blk tracks the last block we tried to read(ahead).
>  	 */
> +	ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> +	if (*ra_blk >= last_da)
> +		goto out;
> +	else if (*ra_blk == 0)
> +		*ra_blk = map.br_startoff;
> +	next_ra = map.br_startoff + geo->fsbcount;
> +	if (next_ra >= last_da)
> +		goto out_no_ra;
> +	found = xfs_iext_lookup_extent(dp, ifp, next_ra, &idx, &map);
> +	if (!found || map.br_startoff >= last_da)
> +		goto out_no_ra;
> +	xfs_trim_extent(&map, next_ra, last_da - next_ra);
> + 

^ trailing space.

Brian

> +	/* Start ra for each dir (not fs) block that has a mapping. */
>  	blk_start_plug(&plug);
> -	for (mip->ra_index = mip->ra_offset = i = 0;
> -	     mip->ra_want > mip->ra_current && i < mip->map_blocks;
> -	     i += geo->fsbcount) {
> -		ASSERT(mip->ra_index < mip->map_valid);
> -		/*
> -		 * Read-ahead a contiguous directory block.
> -		 */
> -		if (i > mip->ra_current &&
> -		    (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
> -		    geo->fsbcount) {
> -			xfs_dir3_data_readahead(dp,
> -				map[mip->ra_index].br_startoff + mip->ra_offset,
> -				XFS_FSB_TO_DADDR(dp->i_mount,
> -					map[mip->ra_index].br_startblock +
> -							mip->ra_offset));
> -			mip->ra_current = i;
> -		}
> -
> -		/*
> -		 * Read-ahead a non-contiguous directory block.  This doesn't
> -		 * use our mapping, but this is a very rare case.
> -		 */
> -		else if (i > mip->ra_current) {
> -			xfs_dir3_data_readahead(dp,
> -					map[mip->ra_index].br_startoff +
> -							mip->ra_offset, -1);
> -			mip->ra_current = i;
> -		}
> -
> -		/*
> -		 * Advance offset through the mapping table, processing a full
> -		 * dir block even if it is fragmented into several extents.
> -		 * But stop if we have consumed all valid mappings, even if
> -		 * it's not yet a full directory block.
> -		 */
> -		for (j = 0;
> -		     j < geo->fsbcount && mip->ra_index < mip->map_valid;
> -		     j += length ) {
> -			/*
> -			 * The rest of this extent but not more than a dir
> -			 * block.
> -			 */
> -			length = min_t(int, geo->fsbcount - j,
> -					map[mip->ra_index].br_blockcount -
> -							mip->ra_offset);
> -			mip->ra_offset += length;
> -
> -			/*
> -			 * Advance to the next mapping if this one is used up.
> -			 */
> -			if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
> -				mip->ra_offset = 0;
> -				mip->ra_index++;
> +	while (ra_want > 0) {
> +		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
> +		while (ra_want > 0 &&
> +		       next_ra < map.br_startoff + map.br_blockcount) {
> +			if (next_ra >= last_da) {
> +				*ra_blk = last_da;
> +				break;
> +			} else if (next_ra > *ra_blk) {
> +				xfs_dir3_data_readahead(dp, next_ra, -2);
> +				*ra_blk = next_ra;
>  			}
> +			ra_want -= geo->fsbcount;
> +			next_ra += geo->fsbcount;
> +		}
> +		found = xfs_iext_get_extent(ifp, ++idx, &map);
> +		if (!found) {
> +			*ra_blk = last_da;
> +			break;
>  		}
>  	}
>  	blk_finish_plug(&plug);
> @@ -458,6 +342,9 @@ xfs_dir2_leaf_readbuf(
>  out:
>  	*bpp = bp;
>  	return error;
> +out_no_ra:
> +	*ra_blk = last_da;
> +	goto out;
>  }
>  
>  /*
> @@ -475,14 +362,14 @@ xfs_dir2_leaf_getdents(
>  	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
>  	xfs_dir2_data_entry_t	*dep;		/* data entry */
>  	xfs_dir2_data_unused_t	*dup;		/* unused entry */
> -	int			error = 0;	/* error return value */
> -	int			length;		/* temporary length value */
> -	int			byteoff;	/* offset in current block */
> -	xfs_dir2_off_t		curoff;		/* current overall offset */
> -	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
>  	char			*ptr = NULL;	/* pointer to current data */
> -	struct xfs_dir2_leaf_map_info *map_info;
>  	struct xfs_da_geometry	*geo = args->geo;
> +	xfs_dablk_t		rablk = 0;	/* current readahead block */
> +	xfs_dir2_off_t		curoff;		/* current overall offset */
> +	int			length;		/* temporary length value */
> +	int			byteoff;	/* offset in current block */
> +	int			lock_mode;
> +	int			error = 0;	/* error return value */
>  
>  	/*
>  	 * If the offset is at or past the largest allowed value,
> @@ -492,30 +379,12 @@ xfs_dir2_leaf_getdents(
>  		return 0;
>  
>  	/*
> -	 * Set up to bmap a number of blocks based on the caller's
> -	 * buffer size, the directory block size, and the filesystem
> -	 * block size.
> -	 */
> -	length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> -	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
> -				(length * sizeof(struct xfs_bmbt_irec)),
> -			       KM_SLEEP | KM_NOFS);
> -	map_info->map_size = length;
> -
> -	/*
>  	 * Inside the loop we keep the main offset value as a byte offset
>  	 * in the directory file.
>  	 */
>  	curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
>  
>  	/*
> -	 * Force this conversion through db so we truncate the offset
> -	 * down to get the start of the data block.
> -	 */
> -	map_info->map_off = xfs_dir2_db_to_da(geo,
> -					      xfs_dir2_byte_to_db(geo, curoff));
> -
> -	/*
>  	 * Loop over directory entries until we reach the end offset.
>  	 * Get more blocks and readahead as necessary.
>  	 */
> @@ -527,38 +396,18 @@ xfs_dir2_leaf_getdents(
>  		 * current buffer, need to get another one.
>  		 */
>  		if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
> -			int	lock_mode;
> -			bool	trim_map = false;
> -
>  			if (bp) {
> -				xfs_trans_brelse(NULL, bp);
> +				xfs_trans_brelse(args->trans, bp);
>  				bp = NULL;
> -				trim_map = true;
>  			}
>  
>  			lock_mode = xfs_ilock_data_map_shared(dp);
> -			error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
> -						      &curoff, &bp, trim_map);
> +			error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
> +					&rablk, &bp);
>  			xfs_iunlock(dp, lock_mode);
> -			if (error || !map_info->map_valid)
> +			if (error || !bp)
>  				break;
>  
> -			/*
> -			 * Having done a read, we need to set a new offset.
> -			 */
> -			newoff = xfs_dir2_db_off_to_byte(geo,
> -							 map_info->curdb, 0);
> -			/*
> -			 * Start of the current block.
> -			 */
> -			if (curoff < newoff)
> -				curoff = newoff;
> -			/*
> -			 * Make sure we're in the right block.
> -			 */
> -			else if (curoff > newoff)
> -				ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
> -				       map_info->curdb);
>  			hdr = bp->b_addr;
>  			xfs_dir3_data_check(dp, bp);
>  			/*
> @@ -643,7 +492,6 @@ xfs_dir2_leaf_getdents(
>  		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
>  	else
>  		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
> -	kmem_free(map_info);
>  	if (bp)
>  		xfs_trans_brelse(NULL, bp);
>  	return error;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
  2017-05-01 18:32 ` Brian Foster
@ 2017-05-01 21:50   ` Darrick J. Wong
  2017-05-01 23:13     ` Brian Foster
  0 siblings, 1 reply; 12+ messages in thread
From: Darrick J. Wong @ 2017-05-01 21:50 UTC (permalink / raw)
  To: Brian Foster; +Cc: xfs, Christoph Hellwig

On Mon, May 01, 2017 at 02:32:43PM -0400, Brian Foster wrote:
> On Fri, Apr 28, 2017 at 12:46:52PM -0700, Darrick J. Wong wrote:
> > Currently, the dir2 leaf block getdents function uses a complex state
> > tracking mechanism to create a shadow copy of the block mappings and
> > then uses the shadow copy to schedule readahead.  Since the read and
> > readahead functions are perfectly capable of reading the mappings
> > themselves, we can tear all that out in favor of a simpler function that
> > simply keeps pushing the readahead window further out.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> > v3: use sliding window to constrain the amount of readahead
> > v2: fix readahead of more than ra_want
> > ---
> 
> Thanks for the updates. This looks much simpler and seems true to
> the current readahead behavior. The code also looks fine to me (one bit
> of whitespace damage noted below).
> 
> That aside, have you happened to test this against a huge/ugly directory
> to verify it works as expected? Note that I don't think in depth
> performance analysis is required. Verification of any kind of dir that
> is known to benefit from readahead is probably sufficient IMO. Perhaps
> dm-delay with a small enough latency to allow us to measure the effect
> of readahead could help us here.

Yeah, I used xfs/349 to generate a filesystem containing a directory
with a freeindex block, then ls'd the entire directory to see how long
the getdents calls took.  The readahead calls were nearly identical with
similar runtimes.

> >  fs/xfs/xfs_dir2_readdir.c |  316 ++++++++++++---------------------------------
> >  1 file changed, 82 insertions(+), 234 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
> > index 20b7a5c..d05c1ec 100644
> > --- a/fs/xfs/xfs_dir2_readdir.c
> > +++ b/fs/xfs/xfs_dir2_readdir.c
> > @@ -243,214 +243,98 @@ xfs_dir2_block_getdents(
> >  	return 0;
> >  }
> >  
> ...
> >  	/*
> > -	 * Do we need more readahead?
> > -	 * Each loop tries to process 1 full dir blk; last may be partial.
> > +	 * Start readahead for the next bufsize's worth of dir data blocks.
> > +	 * We may have already issued readahead for some of that range;
> > +	 * ra_blk tracks the last block we tried to read(ahead).
> >  	 */
> > +	ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> > +	if (*ra_blk >= last_da)
> > +		goto out;
> > +	else if (*ra_blk == 0)
> > +		*ra_blk = map.br_startoff;
> > +	next_ra = map.br_startoff + geo->fsbcount;
> > +	if (next_ra >= last_da)
> > +		goto out_no_ra;
> > +	found = xfs_iext_lookup_extent(dp, ifp, next_ra, &idx, &map);
> > +	if (!found || map.br_startoff >= last_da)
> > +		goto out_no_ra;
> > +	xfs_trim_extent(&map, next_ra, last_da - next_ra);
> > + 
> 
> ^ trailing space.

(I don't see it...?)

--D

> 
> Brian
> 
> > +	/* Start ra for each dir (not fs) block that has a mapping. */
> >  	blk_start_plug(&plug);
> > -	for (mip->ra_index = mip->ra_offset = i = 0;
> > -	     mip->ra_want > mip->ra_current && i < mip->map_blocks;
> > -	     i += geo->fsbcount) {
> > -		ASSERT(mip->ra_index < mip->map_valid);
> > -		/*
> > -		 * Read-ahead a contiguous directory block.
> > -		 */
> > -		if (i > mip->ra_current &&
> > -		    (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
> > -		    geo->fsbcount) {
> > -			xfs_dir3_data_readahead(dp,
> > -				map[mip->ra_index].br_startoff + mip->ra_offset,
> > -				XFS_FSB_TO_DADDR(dp->i_mount,
> > -					map[mip->ra_index].br_startblock +
> > -							mip->ra_offset));
> > -			mip->ra_current = i;
> > -		}
> > -
> > -		/*
> > -		 * Read-ahead a non-contiguous directory block.  This doesn't
> > -		 * use our mapping, but this is a very rare case.
> > -		 */
> > -		else if (i > mip->ra_current) {
> > -			xfs_dir3_data_readahead(dp,
> > -					map[mip->ra_index].br_startoff +
> > -							mip->ra_offset, -1);
> > -			mip->ra_current = i;
> > -		}
> > -
> > -		/*
> > -		 * Advance offset through the mapping table, processing a full
> > -		 * dir block even if it is fragmented into several extents.
> > -		 * But stop if we have consumed all valid mappings, even if
> > -		 * it's not yet a full directory block.
> > -		 */
> > -		for (j = 0;
> > -		     j < geo->fsbcount && mip->ra_index < mip->map_valid;
> > -		     j += length ) {
> > -			/*
> > -			 * The rest of this extent but not more than a dir
> > -			 * block.
> > -			 */
> > -			length = min_t(int, geo->fsbcount - j,
> > -					map[mip->ra_index].br_blockcount -
> > -							mip->ra_offset);
> > -			mip->ra_offset += length;
> > -
> > -			/*
> > -			 * Advance to the next mapping if this one is used up.
> > -			 */
> > -			if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
> > -				mip->ra_offset = 0;
> > -				mip->ra_index++;
> > +	while (ra_want > 0) {
> > +		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
> > +		while (ra_want > 0 &&
> > +		       next_ra < map.br_startoff + map.br_blockcount) {
> > +			if (next_ra >= last_da) {
> > +				*ra_blk = last_da;
> > +				break;
> > +			} else if (next_ra > *ra_blk) {
> > +				xfs_dir3_data_readahead(dp, next_ra, -2);
> > +				*ra_blk = next_ra;
> >  			}
> > +			ra_want -= geo->fsbcount;
> > +			next_ra += geo->fsbcount;
> > +		}
> > +		found = xfs_iext_get_extent(ifp, ++idx, &map);
> > +		if (!found) {
> > +			*ra_blk = last_da;
> > +			break;
> >  		}
> >  	}
> >  	blk_finish_plug(&plug);
> > @@ -458,6 +342,9 @@ xfs_dir2_leaf_readbuf(
> >  out:
> >  	*bpp = bp;
> >  	return error;
> > +out_no_ra:
> > +	*ra_blk = last_da;
> > +	goto out;
> >  }
> >  
> >  /*
> > @@ -475,14 +362,14 @@ xfs_dir2_leaf_getdents(
> >  	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
> >  	xfs_dir2_data_entry_t	*dep;		/* data entry */
> >  	xfs_dir2_data_unused_t	*dup;		/* unused entry */
> > -	int			error = 0;	/* error return value */
> > -	int			length;		/* temporary length value */
> > -	int			byteoff;	/* offset in current block */
> > -	xfs_dir2_off_t		curoff;		/* current overall offset */
> > -	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
> >  	char			*ptr = NULL;	/* pointer to current data */
> > -	struct xfs_dir2_leaf_map_info *map_info;
> >  	struct xfs_da_geometry	*geo = args->geo;
> > +	xfs_dablk_t		rablk = 0;	/* current readahead block */
> > +	xfs_dir2_off_t		curoff;		/* current overall offset */
> > +	int			length;		/* temporary length value */
> > +	int			byteoff;	/* offset in current block */
> > +	int			lock_mode;
> > +	int			error = 0;	/* error return value */
> >  
> >  	/*
> >  	 * If the offset is at or past the largest allowed value,
> > @@ -492,30 +379,12 @@ xfs_dir2_leaf_getdents(
> >  		return 0;
> >  
> >  	/*
> > -	 * Set up to bmap a number of blocks based on the caller's
> > -	 * buffer size, the directory block size, and the filesystem
> > -	 * block size.
> > -	 */
> > -	length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> > -	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
> > -				(length * sizeof(struct xfs_bmbt_irec)),
> > -			       KM_SLEEP | KM_NOFS);
> > -	map_info->map_size = length;
> > -
> > -	/*
> >  	 * Inside the loop we keep the main offset value as a byte offset
> >  	 * in the directory file.
> >  	 */
> >  	curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
> >  
> >  	/*
> > -	 * Force this conversion through db so we truncate the offset
> > -	 * down to get the start of the data block.
> > -	 */
> > -	map_info->map_off = xfs_dir2_db_to_da(geo,
> > -					      xfs_dir2_byte_to_db(geo, curoff));
> > -
> > -	/*
> >  	 * Loop over directory entries until we reach the end offset.
> >  	 * Get more blocks and readahead as necessary.
> >  	 */
> > @@ -527,38 +396,18 @@ xfs_dir2_leaf_getdents(
> >  		 * current buffer, need to get another one.
> >  		 */
> >  		if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
> > -			int	lock_mode;
> > -			bool	trim_map = false;
> > -
> >  			if (bp) {
> > -				xfs_trans_brelse(NULL, bp);
> > +				xfs_trans_brelse(args->trans, bp);
> >  				bp = NULL;
> > -				trim_map = true;
> >  			}
> >  
> >  			lock_mode = xfs_ilock_data_map_shared(dp);
> > -			error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
> > -						      &curoff, &bp, trim_map);
> > +			error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
> > +					&rablk, &bp);
> >  			xfs_iunlock(dp, lock_mode);
> > -			if (error || !map_info->map_valid)
> > +			if (error || !bp)
> >  				break;
> >  
> > -			/*
> > -			 * Having done a read, we need to set a new offset.
> > -			 */
> > -			newoff = xfs_dir2_db_off_to_byte(geo,
> > -							 map_info->curdb, 0);
> > -			/*
> > -			 * Start of the current block.
> > -			 */
> > -			if (curoff < newoff)
> > -				curoff = newoff;
> > -			/*
> > -			 * Make sure we're in the right block.
> > -			 */
> > -			else if (curoff > newoff)
> > -				ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
> > -				       map_info->curdb);
> >  			hdr = bp->b_addr;
> >  			xfs_dir3_data_check(dp, bp);
> >  			/*
> > @@ -643,7 +492,6 @@ xfs_dir2_leaf_getdents(
> >  		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
> >  	else
> >  		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
> > -	kmem_free(map_info);
> >  	if (bp)
> >  		xfs_trans_brelse(NULL, bp);
> >  	return error;
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
  2017-05-01 21:50   ` Darrick J. Wong
@ 2017-05-01 23:13     ` Brian Foster
  2017-05-01 23:30       ` Darrick J. Wong
  0 siblings, 1 reply; 12+ messages in thread
From: Brian Foster @ 2017-05-01 23:13 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs, Christoph Hellwig

On Mon, May 01, 2017 at 02:50:18PM -0700, Darrick J. Wong wrote:
> On Mon, May 01, 2017 at 02:32:43PM -0400, Brian Foster wrote:
> > On Fri, Apr 28, 2017 at 12:46:52PM -0700, Darrick J. Wong wrote:
> > > Currently, the dir2 leaf block getdents function uses a complex state
> > > tracking mechanism to create a shadow copy of the block mappings and
> > > then uses the shadow copy to schedule readahead.  Since the read and
> > > readahead functions are perfectly capable of reading the mappings
> > > themselves, we can tear all that out in favor of a simpler function that
> > > simply keeps pushing the readahead window further out.
> > > 
> > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > ---
> > > v3: use sliding window to constrain the amount of readahead
> > > v2: fix readahead of more than ra_want
> > > ---
> > 
> > Thanks for the updates. This looks much more simple and seems true to
> > the current readahead behavior. The code also looks fine to me (one bit
> > of whitespace damage noted below).
> > 
> > That aside, have you happened to test this against a huge/ugly directory
> > to verify it works as expected? Note that I don't think in depth
> > performance analysis is required. Verification of any kind of dir that
> > is known to benefit from readahead is probably sufficient IMO. Perhaps
> > dm-delay with a small enough latency to allow us to measure the effect
> > of readahead could help us here.
> 
> Yeah, I used xfs/349 to generate a filesystem containing a directory
> with a freeindex block, then ls'd the entire directory to see how long
> the getdents calls took.  The readahead calls were nearly identical with
> similar runtimes.
> 

Ok..

> > >  fs/xfs/xfs_dir2_readdir.c |  316 ++++++++++++---------------------------------
> > >  1 file changed, 82 insertions(+), 234 deletions(-)
> > > 
> > > diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
> > > index 20b7a5c..d05c1ec 100644
> > > --- a/fs/xfs/xfs_dir2_readdir.c
> > > +++ b/fs/xfs/xfs_dir2_readdir.c
> > > @@ -243,214 +243,98 @@ xfs_dir2_block_getdents(
> > >  	return 0;
> > >  }
> > >  
> > ...
> > >  	/*
> > > -	 * Do we need more readahead?
> > > -	 * Each loop tries to process 1 full dir blk; last may be partial.
> > > +	 * Start readahead for the next bufsize's worth of dir data blocks.
> > > +	 * We may have already issued readahead for some of that range;
> > > +	 * ra_blk tracks the last block we tried to read(ahead).
> > >  	 */
> > > +	ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> > > +	if (*ra_blk >= last_da)
> > > +		goto out;
> > > +	else if (*ra_blk == 0)
> > > +		*ra_blk = map.br_startoff;
> > > +	next_ra = map.br_startoff + geo->fsbcount;
> > > +	if (next_ra >= last_da)
> > > +		goto out_no_ra;
> > > +	found = xfs_iext_lookup_extent(dp, ifp, next_ra, &idx, &map);
> > > +	if (!found || map.br_startoff >= last_da)
> > > +		goto out_no_ra;
> > > +	xfs_trim_extent(&map, next_ra, last_da - next_ra);
> > > + 
> > 
> > ^ trailing space.
> 
> (I don't see it...?)
> 

checkpatch catches it:

$ ./scripts/checkpatch.pl /tmp/patch 
ERROR: trailing whitespace
#227: FILE: fs/xfs/xfs_dir2_readdir.c:317:
+ $

WARNING: please, no spaces at the start of a line
#227: FILE: fs/xfs/xfs_dir2_readdir.c:317:
+ $

total: 1 errors, 1 warnings, 391 lines checked
...

With that fixed:

Reviewed-by: Brian Foster <bfoster@redhat.com>

> --D
> 
> > 
> > Brian
> > 
> > > +	/* Start ra for each dir (not fs) block that has a mapping. */
> > >  	blk_start_plug(&plug);
> > > -	for (mip->ra_index = mip->ra_offset = i = 0;
> > > -	     mip->ra_want > mip->ra_current && i < mip->map_blocks;
> > > -	     i += geo->fsbcount) {
> > > -		ASSERT(mip->ra_index < mip->map_valid);
> > > -		/*
> > > -		 * Read-ahead a contiguous directory block.
> > > -		 */
> > > -		if (i > mip->ra_current &&
> > > -		    (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
> > > -		    geo->fsbcount) {
> > > -			xfs_dir3_data_readahead(dp,
> > > -				map[mip->ra_index].br_startoff + mip->ra_offset,
> > > -				XFS_FSB_TO_DADDR(dp->i_mount,
> > > -					map[mip->ra_index].br_startblock +
> > > -							mip->ra_offset));
> > > -			mip->ra_current = i;
> > > -		}
> > > -
> > > -		/*
> > > -		 * Read-ahead a non-contiguous directory block.  This doesn't
> > > -		 * use our mapping, but this is a very rare case.
> > > -		 */
> > > -		else if (i > mip->ra_current) {
> > > -			xfs_dir3_data_readahead(dp,
> > > -					map[mip->ra_index].br_startoff +
> > > -							mip->ra_offset, -1);
> > > -			mip->ra_current = i;
> > > -		}
> > > -
> > > -		/*
> > > -		 * Advance offset through the mapping table, processing a full
> > > -		 * dir block even if it is fragmented into several extents.
> > > -		 * But stop if we have consumed all valid mappings, even if
> > > -		 * it's not yet a full directory block.
> > > -		 */
> > > -		for (j = 0;
> > > -		     j < geo->fsbcount && mip->ra_index < mip->map_valid;
> > > -		     j += length ) {
> > > -			/*
> > > -			 * The rest of this extent but not more than a dir
> > > -			 * block.
> > > -			 */
> > > -			length = min_t(int, geo->fsbcount - j,
> > > -					map[mip->ra_index].br_blockcount -
> > > -							mip->ra_offset);
> > > -			mip->ra_offset += length;
> > > -
> > > -			/*
> > > -			 * Advance to the next mapping if this one is used up.
> > > -			 */
> > > -			if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
> > > -				mip->ra_offset = 0;
> > > -				mip->ra_index++;
> > > +	while (ra_want > 0) {
> > > +		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
> > > +		while (ra_want > 0 &&
> > > +		       next_ra < map.br_startoff + map.br_blockcount) {
> > > +			if (next_ra >= last_da) {
> > > +				*ra_blk = last_da;
> > > +				break;
> > > +			} else if (next_ra > *ra_blk) {
> > > +				xfs_dir3_data_readahead(dp, next_ra, -2);
> > > +				*ra_blk = next_ra;
> > >  			}
> > > +			ra_want -= geo->fsbcount;
> > > +			next_ra += geo->fsbcount;
> > > +		}
> > > +		found = xfs_iext_get_extent(ifp, ++idx, &map);
> > > +		if (!found) {
> > > +			*ra_blk = last_da;
> > > +			break;
> > >  		}
> > >  	}
> > >  	blk_finish_plug(&plug);
> > > @@ -458,6 +342,9 @@ xfs_dir2_leaf_readbuf(
> > >  out:
> > >  	*bpp = bp;
> > >  	return error;
> > > +out_no_ra:
> > > +	*ra_blk = last_da;
> > > +	goto out;
> > >  }
> > >  
> > >  /*
> > > @@ -475,14 +362,14 @@ xfs_dir2_leaf_getdents(
> > >  	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
> > >  	xfs_dir2_data_entry_t	*dep;		/* data entry */
> > >  	xfs_dir2_data_unused_t	*dup;		/* unused entry */
> > > -	int			error = 0;	/* error return value */
> > > -	int			length;		/* temporary length value */
> > > -	int			byteoff;	/* offset in current block */
> > > -	xfs_dir2_off_t		curoff;		/* current overall offset */
> > > -	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
> > >  	char			*ptr = NULL;	/* pointer to current data */
> > > -	struct xfs_dir2_leaf_map_info *map_info;
> > >  	struct xfs_da_geometry	*geo = args->geo;
> > > +	xfs_dablk_t		rablk = 0;	/* current readahead block */
> > > +	xfs_dir2_off_t		curoff;		/* current overall offset */
> > > +	int			length;		/* temporary length value */
> > > +	int			byteoff;	/* offset in current block */
> > > +	int			lock_mode;
> > > +	int			error = 0;	/* error return value */
> > >  
> > >  	/*
> > >  	 * If the offset is at or past the largest allowed value,
> > > @@ -492,30 +379,12 @@ xfs_dir2_leaf_getdents(
> > >  		return 0;
> > >  
> > >  	/*
> > > -	 * Set up to bmap a number of blocks based on the caller's
> > > -	 * buffer size, the directory block size, and the filesystem
> > > -	 * block size.
> > > -	 */
> > > -	length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> > > -	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
> > > -				(length * sizeof(struct xfs_bmbt_irec)),
> > > -			       KM_SLEEP | KM_NOFS);
> > > -	map_info->map_size = length;
> > > -
> > > -	/*
> > >  	 * Inside the loop we keep the main offset value as a byte offset
> > >  	 * in the directory file.
> > >  	 */
> > >  	curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
> > >  
> > >  	/*
> > > -	 * Force this conversion through db so we truncate the offset
> > > -	 * down to get the start of the data block.
> > > -	 */
> > > -	map_info->map_off = xfs_dir2_db_to_da(geo,
> > > -					      xfs_dir2_byte_to_db(geo, curoff));
> > > -
> > > -	/*
> > >  	 * Loop over directory entries until we reach the end offset.
> > >  	 * Get more blocks and readahead as necessary.
> > >  	 */
> > > @@ -527,38 +396,18 @@ xfs_dir2_leaf_getdents(
> > >  		 * current buffer, need to get another one.
> > >  		 */
> > >  		if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
> > > -			int	lock_mode;
> > > -			bool	trim_map = false;
> > > -
> > >  			if (bp) {
> > > -				xfs_trans_brelse(NULL, bp);
> > > +				xfs_trans_brelse(args->trans, bp);
> > >  				bp = NULL;
> > > -				trim_map = true;
> > >  			}
> > >  
> > >  			lock_mode = xfs_ilock_data_map_shared(dp);
> > > -			error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
> > > -						      &curoff, &bp, trim_map);
> > > +			error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
> > > +					&rablk, &bp);
> > >  			xfs_iunlock(dp, lock_mode);
> > > -			if (error || !map_info->map_valid)
> > > +			if (error || !bp)
> > >  				break;
> > >  
> > > -			/*
> > > -			 * Having done a read, we need to set a new offset.
> > > -			 */
> > > -			newoff = xfs_dir2_db_off_to_byte(geo,
> > > -							 map_info->curdb, 0);
> > > -			/*
> > > -			 * Start of the current block.
> > > -			 */
> > > -			if (curoff < newoff)
> > > -				curoff = newoff;
> > > -			/*
> > > -			 * Make sure we're in the right block.
> > > -			 */
> > > -			else if (curoff > newoff)
> > > -				ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
> > > -				       map_info->curdb);
> > >  			hdr = bp->b_addr;
> > >  			xfs_dir3_data_check(dp, bp);
> > >  			/*
> > > @@ -643,7 +492,6 @@ xfs_dir2_leaf_getdents(
> > >  		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
> > >  	else
> > >  		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
> > > -	kmem_free(map_info);
> > >  	if (bp)
> > >  		xfs_trans_brelse(NULL, bp);
> > >  	return error;
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
  2017-05-01 23:13     ` Brian Foster
@ 2017-05-01 23:30       ` Darrick J. Wong
  2017-05-02 14:11         ` Brian Foster
  0 siblings, 1 reply; 12+ messages in thread
From: Darrick J. Wong @ 2017-05-01 23:30 UTC (permalink / raw)
  To: Brian Foster; +Cc: xfs, Christoph Hellwig

On Mon, May 01, 2017 at 07:13:24PM -0400, Brian Foster wrote:
> On Mon, May 01, 2017 at 02:50:18PM -0700, Darrick J. Wong wrote:
> > On Mon, May 01, 2017 at 02:32:43PM -0400, Brian Foster wrote:
> > > On Fri, Apr 28, 2017 at 12:46:52PM -0700, Darrick J. Wong wrote:
> > > > Currently, the dir2 leaf block getdents function uses a complex state
> > > > tracking mechanism to create a shadow copy of the block mappings and
> > > > then uses the shadow copy to schedule readahead.  Since the read and
> > > > readahead functions are perfectly capable of reading the mappings
> > > > themselves, we can tear all that out in favor of a simpler function that
> > > > simply keeps pushing the readahead window further out.
> > > > 
> > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > ---
> > > > v3: use sliding window to constrain the amount of readahead
> > > > v2: fix readahead of more than ra_want
> > > > ---
> > > 
> > > Thanks for the updates. This looks much more simple and seems true to
> > > the current readahead behavior. The code also looks fine to me (one bit
> > > of whitespace damage noted below).
> > > 
> > > That aside, have you happened to test this against a huge/ugly directory
> > > to verify it works as expected? Note that I don't think in depth
> > > performance analysis is required. Verification of any kind of dir that
> > > is known to benefit from readahead is probably sufficient IMO. Perhaps
> > > dm-delay with a small enough latency to allow us to measure the effect
> > > of readahead could help us here.
> > 
> > Yeah, I used xfs/349 to generate a filesystem containing a directory
> > with a freeindex block, then ls'd the entire directory to see how long
> > the getdents calls took.  The readahead calls were nearly identical with
> > similar runtimes.
> > 
> 
> Ok..
> 
> > > >  fs/xfs/xfs_dir2_readdir.c |  316 ++++++++++++---------------------------------
> > > >  1 file changed, 82 insertions(+), 234 deletions(-)
> > > > 
> > > > diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
> > > > index 20b7a5c..d05c1ec 100644
> > > > --- a/fs/xfs/xfs_dir2_readdir.c
> > > > +++ b/fs/xfs/xfs_dir2_readdir.c
> > > > @@ -243,214 +243,98 @@ xfs_dir2_block_getdents(
> > > >  	return 0;
> > > >  }
> > > >  
> > > ...
> > > >  	/*
> > > > -	 * Do we need more readahead?
> > > > -	 * Each loop tries to process 1 full dir blk; last may be partial.
> > > > +	 * Start readahead for the next bufsize's worth of dir data blocks.
> > > > +	 * We may have already issued readahead for some of that range;
> > > > +	 * ra_blk tracks the last block we tried to read(ahead).
> > > >  	 */
> > > > +	ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> > > > +	if (*ra_blk >= last_da)
> > > > +		goto out;
> > > > +	else if (*ra_blk == 0)
> > > > +		*ra_blk = map.br_startoff;
> > > > +	next_ra = map.br_startoff + geo->fsbcount;
> > > > +	if (next_ra >= last_da)
> > > > +		goto out_no_ra;
> > > > +	found = xfs_iext_lookup_extent(dp, ifp, next_ra, &idx, &map);
> > > > +	if (!found || map.br_startoff >= last_da)
> > > > +		goto out_no_ra;
> > > > +	xfs_trim_extent(&map, next_ra, last_da - next_ra);
> > > > + 
> > > 
> > > ^ trailing space.
> > 
> > (I don't see it...?)
> > 
> 
> checkpatch catches it:
> 
> $ ./scripts/checkpatch.pl /tmp/patch 
> ERROR: trailing whitespace
> #227: FILE: fs/xfs/xfs_dir2_readdir.c:317:
> + $
> 
> WARNING: please, no spaces at the start of a line
> #227: FILE: fs/xfs/xfs_dir2_readdir.c:317:
> + $

<sigh> vi was lying to me, like always.

Reload buffer and *poof* the trailing whitespace appears... <grumble>

Thanks for the review,

--D

> 
> total: 1 errors, 1 warnings, 391 lines checked
> ...
> 
> With that fixed:
> 
> Reviewed-by: Brian Foster <bfoster@redhat.com>
> 
> > --D
> > 
> > > 
> > > Brian
> > > 
> > > > +	/* Start ra for each dir (not fs) block that has a mapping. */
> > > >  	blk_start_plug(&plug);
> > > > -	for (mip->ra_index = mip->ra_offset = i = 0;
> > > > -	     mip->ra_want > mip->ra_current && i < mip->map_blocks;
> > > > -	     i += geo->fsbcount) {
> > > > -		ASSERT(mip->ra_index < mip->map_valid);
> > > > -		/*
> > > > -		 * Read-ahead a contiguous directory block.
> > > > -		 */
> > > > -		if (i > mip->ra_current &&
> > > > -		    (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
> > > > -		    geo->fsbcount) {
> > > > -			xfs_dir3_data_readahead(dp,
> > > > -				map[mip->ra_index].br_startoff + mip->ra_offset,
> > > > -				XFS_FSB_TO_DADDR(dp->i_mount,
> > > > -					map[mip->ra_index].br_startblock +
> > > > -							mip->ra_offset));
> > > > -			mip->ra_current = i;
> > > > -		}
> > > > -
> > > > -		/*
> > > > -		 * Read-ahead a non-contiguous directory block.  This doesn't
> > > > -		 * use our mapping, but this is a very rare case.
> > > > -		 */
> > > > -		else if (i > mip->ra_current) {
> > > > -			xfs_dir3_data_readahead(dp,
> > > > -					map[mip->ra_index].br_startoff +
> > > > -							mip->ra_offset, -1);
> > > > -			mip->ra_current = i;
> > > > -		}
> > > > -
> > > > -		/*
> > > > -		 * Advance offset through the mapping table, processing a full
> > > > -		 * dir block even if it is fragmented into several extents.
> > > > -		 * But stop if we have consumed all valid mappings, even if
> > > > -		 * it's not yet a full directory block.
> > > > -		 */
> > > > -		for (j = 0;
> > > > -		     j < geo->fsbcount && mip->ra_index < mip->map_valid;
> > > > -		     j += length ) {
> > > > -			/*
> > > > -			 * The rest of this extent but not more than a dir
> > > > -			 * block.
> > > > -			 */
> > > > -			length = min_t(int, geo->fsbcount - j,
> > > > -					map[mip->ra_index].br_blockcount -
> > > > -							mip->ra_offset);
> > > > -			mip->ra_offset += length;
> > > > -
> > > > -			/*
> > > > -			 * Advance to the next mapping if this one is used up.
> > > > -			 */
> > > > -			if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
> > > > -				mip->ra_offset = 0;
> > > > -				mip->ra_index++;
> > > > +	while (ra_want > 0) {
> > > > +		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
> > > > +		while (ra_want > 0 &&
> > > > +		       next_ra < map.br_startoff + map.br_blockcount) {
> > > > +			if (next_ra >= last_da) {
> > > > +				*ra_blk = last_da;
> > > > +				break;
> > > > +			} else if (next_ra > *ra_blk) {
> > > > +				xfs_dir3_data_readahead(dp, next_ra, -2);
> > > > +				*ra_blk = next_ra;
> > > >  			}
> > > > +			ra_want -= geo->fsbcount;
> > > > +			next_ra += geo->fsbcount;
> > > > +		}
> > > > +		found = xfs_iext_get_extent(ifp, ++idx, &map);
> > > > +		if (!found) {
> > > > +			*ra_blk = last_da;
> > > > +			break;
> > > >  		}
> > > >  	}
> > > >  	blk_finish_plug(&plug);
> > > > @@ -458,6 +342,9 @@ xfs_dir2_leaf_readbuf(
> > > >  out:
> > > >  	*bpp = bp;
> > > >  	return error;
> > > > +out_no_ra:
> > > > +	*ra_blk = last_da;
> > > > +	goto out;
> > > >  }
> > > >  
> > > >  /*
> > > > @@ -475,14 +362,14 @@ xfs_dir2_leaf_getdents(
> > > >  	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
> > > >  	xfs_dir2_data_entry_t	*dep;		/* data entry */
> > > >  	xfs_dir2_data_unused_t	*dup;		/* unused entry */
> > > > -	int			error = 0;	/* error return value */
> > > > -	int			length;		/* temporary length value */
> > > > -	int			byteoff;	/* offset in current block */
> > > > -	xfs_dir2_off_t		curoff;		/* current overall offset */
> > > > -	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
> > > >  	char			*ptr = NULL;	/* pointer to current data */
> > > > -	struct xfs_dir2_leaf_map_info *map_info;
> > > >  	struct xfs_da_geometry	*geo = args->geo;
> > > > +	xfs_dablk_t		rablk = 0;	/* current readahead block */
> > > > +	xfs_dir2_off_t		curoff;		/* current overall offset */
> > > > +	int			length;		/* temporary length value */
> > > > +	int			byteoff;	/* offset in current block */
> > > > +	int			lock_mode;
> > > > +	int			error = 0;	/* error return value */
> > > >  
> > > >  	/*
> > > >  	 * If the offset is at or past the largest allowed value,
> > > > @@ -492,30 +379,12 @@ xfs_dir2_leaf_getdents(
> > > >  		return 0;
> > > >  
> > > >  	/*
> > > > -	 * Set up to bmap a number of blocks based on the caller's
> > > > -	 * buffer size, the directory block size, and the filesystem
> > > > -	 * block size.
> > > > -	 */
> > > > -	length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> > > > -	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
> > > > -				(length * sizeof(struct xfs_bmbt_irec)),
> > > > -			       KM_SLEEP | KM_NOFS);
> > > > -	map_info->map_size = length;
> > > > -
> > > > -	/*
> > > >  	 * Inside the loop we keep the main offset value as a byte offset
> > > >  	 * in the directory file.
> > > >  	 */
> > > >  	curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
> > > >  
> > > >  	/*
> > > > -	 * Force this conversion through db so we truncate the offset
> > > > -	 * down to get the start of the data block.
> > > > -	 */
> > > > -	map_info->map_off = xfs_dir2_db_to_da(geo,
> > > > -					      xfs_dir2_byte_to_db(geo, curoff));
> > > > -
> > > > -	/*
> > > >  	 * Loop over directory entries until we reach the end offset.
> > > >  	 * Get more blocks and readahead as necessary.
> > > >  	 */
> > > > @@ -527,38 +396,18 @@ xfs_dir2_leaf_getdents(
> > > >  		 * current buffer, need to get another one.
> > > >  		 */
> > > >  		if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
> > > > -			int	lock_mode;
> > > > -			bool	trim_map = false;
> > > > -
> > > >  			if (bp) {
> > > > -				xfs_trans_brelse(NULL, bp);
> > > > +				xfs_trans_brelse(args->trans, bp);
> > > >  				bp = NULL;
> > > > -				trim_map = true;
> > > >  			}
> > > >  
> > > >  			lock_mode = xfs_ilock_data_map_shared(dp);
> > > > -			error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
> > > > -						      &curoff, &bp, trim_map);
> > > > +			error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
> > > > +					&rablk, &bp);
> > > >  			xfs_iunlock(dp, lock_mode);
> > > > -			if (error || !map_info->map_valid)
> > > > +			if (error || !bp)
> > > >  				break;
> > > >  
> > > > -			/*
> > > > -			 * Having done a read, we need to set a new offset.
> > > > -			 */
> > > > -			newoff = xfs_dir2_db_off_to_byte(geo,
> > > > -							 map_info->curdb, 0);
> > > > -			/*
> > > > -			 * Start of the current block.
> > > > -			 */
> > > > -			if (curoff < newoff)
> > > > -				curoff = newoff;
> > > > -			/*
> > > > -			 * Make sure we're in the right block.
> > > > -			 */
> > > > -			else if (curoff > newoff)
> > > > -				ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
> > > > -				       map_info->curdb);
> > > >  			hdr = bp->b_addr;
> > > >  			xfs_dir3_data_check(dp, bp);
> > > >  			/*
> > > > @@ -643,7 +492,6 @@ xfs_dir2_leaf_getdents(
> > > >  		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
> > > >  	else
> > > >  		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
> > > > -	kmem_free(map_info);
> > > >  	if (bp)
> > > >  		xfs_trans_brelse(NULL, bp);
> > > >  	return error;
> > > > --
> > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > the body of a message to majordomo@vger.kernel.org
> > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
  2017-04-28 19:46 [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness Darrick J. Wong
  2017-05-01 18:32 ` Brian Foster
@ 2017-05-02  7:44 ` Christoph Hellwig
  2017-05-02 19:02   ` Darrick J. Wong
  1 sibling, 1 reply; 12+ messages in thread
From: Christoph Hellwig @ 2017-05-02  7:44 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs, Christoph Hellwig, Brian Foster

Hi Darrick,

a few comments below.  Most are cosmetic except for one which is
a minor improvement.

Reviewed-by: Christoph Hellwig <hch@lst.de>

with the cosmetic bits fixed up.

> +	last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
> +	map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off));
> +	found = xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map);
> +	if (!found || map.br_startoff >= last_da)
>  		goto out;

I don't think we need the found variable in this function; all the users
only check it in the next line and then ignore it.  E.g. rewrite this
into

	if (!xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map))
		goto out;
	if (map.br_startoff >= last_da)
		goto out;

> +	ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> +	if (*ra_blk >= last_da)
> +		goto out;
> +	else if (*ra_blk == 0)
> +		*ra_blk = map.br_startoff;

No need for the else here.

> +	next_ra = map.br_startoff + geo->fsbcount;
> +	if (next_ra >= last_da)
> +		goto out_no_ra;
> +	found = xfs_iext_lookup_extent(dp, ifp, next_ra, &idx, &map);

Do we really need a new full lookup here?  This should be the same
or the next map compared to the one the original xfs_iext_lookup_extent
returned.  So just checking if it's in the original map that could
be stashed away or otherwise calling xfs_iext_get_extent would be more
efficient.

> +	while (ra_want > 0) {
> +		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
> +		while (ra_want > 0 &&
> +		       next_ra < map.br_startoff + map.br_blockcount) {
> +			if (next_ra >= last_da) {
> +				*ra_blk = last_da;
> +				break;
> +			} else if (next_ra > *ra_blk) {

No need for the else.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
  2017-05-01 23:30       ` Darrick J. Wong
@ 2017-05-02 14:11         ` Brian Foster
  0 siblings, 0 replies; 12+ messages in thread
From: Brian Foster @ 2017-05-02 14:11 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: xfs, Christoph Hellwig

On Mon, May 01, 2017 at 04:30:49PM -0700, Darrick J. Wong wrote:
> On Mon, May 01, 2017 at 07:13:24PM -0400, Brian Foster wrote:
> > On Mon, May 01, 2017 at 02:50:18PM -0700, Darrick J. Wong wrote:
> > > On Mon, May 01, 2017 at 02:32:43PM -0400, Brian Foster wrote:
> > > > On Fri, Apr 28, 2017 at 12:46:52PM -0700, Darrick J. Wong wrote:
> > > > > Currently, the dir2 leaf block getdents function uses a complex state
> > > > > tracking mechanism to create a shadow copy of the block mappings and
> > > > > then uses the shadow copy to schedule readahead.  Since the read and
> > > > > readahead functions are perfectly capable of reading the mappings
> > > > > themselves, we can tear all that out in favor of a simpler function that
> > > > > simply keeps pushing the readahead window further out.
> > > > > 
> > > > > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > > > > ---
> > > > > v3: use sliding window to constrain the amount of readahead
> > > > > v2: fix readahead of more than ra_want
> > > > > ---
> > > > 
> > > > Thanks for the updates. This looks much more simple and seems true to
> > > > the current readahead behavior. The code also looks fine to me (one bit
> > > > of whitespace damage noted below).
> > > > 
> > > > That aside, have you happened to test this against a huge/ugly directory
> > > > to verify it works as expected? Note that I don't think in depth
> > > > performance analysis is required. Verification of any kind of dir that
> > > > is known to benefit from readahead is probably sufficient IMO. Perhaps
> > > > dm-delay with a small enough latency to allow us to measure the effect
> > > > of readahead could help us here.
> > > 
> > > Yeah, I used xfs/349 to generate a filesystem containing a directory
> > > with a freeindex block, then ls'd the entire directory to see how long
> > > the getdents calls took.  The readahead calls were nearly identical with
> > > similar runtimes.
> > > 
> > 
> > Ok..
> > 

FWIW and out of curiosity, I also ran a little test using a directory
from xfs/349 and dm-delay with a 200ms read delay. I measured an 'xfs_io
-c readdir <dir>' command in the following configs:

	Current code: ~3.55s
	Current code, readahead disabled: ~17.06s
	This patch: ~3.35s

So this also looks roughly equivalent to current performance, if not a
little better.

Brian

> > > > >  fs/xfs/xfs_dir2_readdir.c |  316 ++++++++++++---------------------------------
> > > > >  1 file changed, 82 insertions(+), 234 deletions(-)
> > > > > 
> > > > > diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
> > > > > index 20b7a5c..d05c1ec 100644
> > > > > --- a/fs/xfs/xfs_dir2_readdir.c
> > > > > +++ b/fs/xfs/xfs_dir2_readdir.c
> > > > > @@ -243,214 +243,98 @@ xfs_dir2_block_getdents(
> > > > >  	return 0;
> > > > >  }
> > > > >  
> > > > ...
> > > > >  	/*
> > > > > -	 * Do we need more readahead?
> > > > > -	 * Each loop tries to process 1 full dir blk; last may be partial.
> > > > > +	 * Start readahead for the next bufsize's worth of dir data blocks.
> > > > > +	 * We may have already issued readahead for some of that range;
> > > > > +	 * ra_blk tracks the last block we tried to read(ahead).
> > > > >  	 */
> > > > > +	ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> > > > > +	if (*ra_blk >= last_da)
> > > > > +		goto out;
> > > > > +	else if (*ra_blk == 0)
> > > > > +		*ra_blk = map.br_startoff;
> > > > > +	next_ra = map.br_startoff + geo->fsbcount;
> > > > > +	if (next_ra >= last_da)
> > > > > +		goto out_no_ra;
> > > > > +	found = xfs_iext_lookup_extent(dp, ifp, next_ra, &idx, &map);
> > > > > +	if (!found || map.br_startoff >= last_da)
> > > > > +		goto out_no_ra;
> > > > > +	xfs_trim_extent(&map, next_ra, last_da - next_ra);
> > > > > + 
> > > > 
> > > > ^ trailing space.
> > > 
> > > (I don't see it...?)
> > > 
> > 
> > checkpatch catches it:
> > 
> > $ ./scripts/checkpatch.pl /tmp/patch 
> > ERROR: trailing whitespace
> > #227: FILE: fs/xfs/xfs_dir2_readdir.c:317:
> > + $
> > 
> > WARNING: please, no spaces at the start of a line
> > #227: FILE: fs/xfs/xfs_dir2_readdir.c:317:
> > + $
> 
> <sigh> vi was lying to me, like always.
> 
> Reload buffer and *poof* the trailing whitespace appears... <grumble>
> 
> Thanks for the review,
> 
> --D
> 
> > 
> > total: 1 errors, 1 warnings, 391 lines checked
> > ...
> > 
> > With that fixed:
> > 
> > Reviewed-by: Brian Foster <bfoster@redhat.com>
> > 
> > > --D
> > > 
> > > > 
> > > > Brian
> > > > 
> > > > > +	/* Start ra for each dir (not fs) block that has a mapping. */
> > > > >  	blk_start_plug(&plug);
> > > > > -	for (mip->ra_index = mip->ra_offset = i = 0;
> > > > > -	     mip->ra_want > mip->ra_current && i < mip->map_blocks;
> > > > > -	     i += geo->fsbcount) {
> > > > > -		ASSERT(mip->ra_index < mip->map_valid);
> > > > > -		/*
> > > > > -		 * Read-ahead a contiguous directory block.
> > > > > -		 */
> > > > > -		if (i > mip->ra_current &&
> > > > > -		    (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
> > > > > -		    geo->fsbcount) {
> > > > > -			xfs_dir3_data_readahead(dp,
> > > > > -				map[mip->ra_index].br_startoff + mip->ra_offset,
> > > > > -				XFS_FSB_TO_DADDR(dp->i_mount,
> > > > > -					map[mip->ra_index].br_startblock +
> > > > > -							mip->ra_offset));
> > > > > -			mip->ra_current = i;
> > > > > -		}
> > > > > -
> > > > > -		/*
> > > > > -		 * Read-ahead a non-contiguous directory block.  This doesn't
> > > > > -		 * use our mapping, but this is a very rare case.
> > > > > -		 */
> > > > > -		else if (i > mip->ra_current) {
> > > > > -			xfs_dir3_data_readahead(dp,
> > > > > -					map[mip->ra_index].br_startoff +
> > > > > -							mip->ra_offset, -1);
> > > > > -			mip->ra_current = i;
> > > > > -		}
> > > > > -
> > > > > -		/*
> > > > > -		 * Advance offset through the mapping table, processing a full
> > > > > -		 * dir block even if it is fragmented into several extents.
> > > > > -		 * But stop if we have consumed all valid mappings, even if
> > > > > -		 * it's not yet a full directory block.
> > > > > -		 */
> > > > > -		for (j = 0;
> > > > > -		     j < geo->fsbcount && mip->ra_index < mip->map_valid;
> > > > > -		     j += length ) {
> > > > > -			/*
> > > > > -			 * The rest of this extent but not more than a dir
> > > > > -			 * block.
> > > > > -			 */
> > > > > -			length = min_t(int, geo->fsbcount - j,
> > > > > -					map[mip->ra_index].br_blockcount -
> > > > > -							mip->ra_offset);
> > > > > -			mip->ra_offset += length;
> > > > > -
> > > > > -			/*
> > > > > -			 * Advance to the next mapping if this one is used up.
> > > > > -			 */
> > > > > -			if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
> > > > > -				mip->ra_offset = 0;
> > > > > -				mip->ra_index++;
> > > > > +	while (ra_want > 0) {
> > > > > +		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
> > > > > +		while (ra_want > 0 &&
> > > > > +		       next_ra < map.br_startoff + map.br_blockcount) {
> > > > > +			if (next_ra >= last_da) {
> > > > > +				*ra_blk = last_da;
> > > > > +				break;
> > > > > +			} else if (next_ra > *ra_blk) {
> > > > > +				xfs_dir3_data_readahead(dp, next_ra, -2);
> > > > > +				*ra_blk = next_ra;
> > > > >  			}
> > > > > +			ra_want -= geo->fsbcount;
> > > > > +			next_ra += geo->fsbcount;
> > > > > +		}
> > > > > +		found = xfs_iext_get_extent(ifp, ++idx, &map);
> > > > > +		if (!found) {
> > > > > +			*ra_blk = last_da;
> > > > > +			break;
> > > > >  		}
> > > > >  	}
> > > > >  	blk_finish_plug(&plug);
> > > > > @@ -458,6 +342,9 @@ xfs_dir2_leaf_readbuf(
> > > > >  out:
> > > > >  	*bpp = bp;
> > > > >  	return error;
> > > > > +out_no_ra:
> > > > > +	*ra_blk = last_da;
> > > > > +	goto out;
> > > > >  }
> > > > >  
> > > > >  /*
> > > > > @@ -475,14 +362,14 @@ xfs_dir2_leaf_getdents(
> > > > >  	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
> > > > >  	xfs_dir2_data_entry_t	*dep;		/* data entry */
> > > > >  	xfs_dir2_data_unused_t	*dup;		/* unused entry */
> > > > > -	int			error = 0;	/* error return value */
> > > > > -	int			length;		/* temporary length value */
> > > > > -	int			byteoff;	/* offset in current block */
> > > > > -	xfs_dir2_off_t		curoff;		/* current overall offset */
> > > > > -	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
> > > > >  	char			*ptr = NULL;	/* pointer to current data */
> > > > > -	struct xfs_dir2_leaf_map_info *map_info;
> > > > >  	struct xfs_da_geometry	*geo = args->geo;
> > > > > +	xfs_dablk_t		rablk = 0;	/* current readahead block */
> > > > > +	xfs_dir2_off_t		curoff;		/* current overall offset */
> > > > > +	int			length;		/* temporary length value */
> > > > > +	int			byteoff;	/* offset in current block */
> > > > > +	int			lock_mode;
> > > > > +	int			error = 0;	/* error return value */
> > > > >  
> > > > >  	/*
> > > > >  	 * If the offset is at or past the largest allowed value,
> > > > > @@ -492,30 +379,12 @@ xfs_dir2_leaf_getdents(
> > > > >  		return 0;
> > > > >  
> > > > >  	/*
> > > > > -	 * Set up to bmap a number of blocks based on the caller's
> > > > > -	 * buffer size, the directory block size, and the filesystem
> > > > > -	 * block size.
> > > > > -	 */
> > > > > -	length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> > > > > -	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
> > > > > -				(length * sizeof(struct xfs_bmbt_irec)),
> > > > > -			       KM_SLEEP | KM_NOFS);
> > > > > -	map_info->map_size = length;
> > > > > -
> > > > > -	/*
> > > > >  	 * Inside the loop we keep the main offset value as a byte offset
> > > > >  	 * in the directory file.
> > > > >  	 */
> > > > >  	curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
> > > > >  
> > > > >  	/*
> > > > > -	 * Force this conversion through db so we truncate the offset
> > > > > -	 * down to get the start of the data block.
> > > > > -	 */
> > > > > -	map_info->map_off = xfs_dir2_db_to_da(geo,
> > > > > -					      xfs_dir2_byte_to_db(geo, curoff));
> > > > > -
> > > > > -	/*
> > > > >  	 * Loop over directory entries until we reach the end offset.
> > > > >  	 * Get more blocks and readahead as necessary.
> > > > >  	 */
> > > > > @@ -527,38 +396,18 @@ xfs_dir2_leaf_getdents(
> > > > >  		 * current buffer, need to get another one.
> > > > >  		 */
> > > > >  		if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
> > > > > -			int	lock_mode;
> > > > > -			bool	trim_map = false;
> > > > > -
> > > > >  			if (bp) {
> > > > > -				xfs_trans_brelse(NULL, bp);
> > > > > +				xfs_trans_brelse(args->trans, bp);
> > > > >  				bp = NULL;
> > > > > -				trim_map = true;
> > > > >  			}
> > > > >  
> > > > >  			lock_mode = xfs_ilock_data_map_shared(dp);
> > > > > -			error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
> > > > > -						      &curoff, &bp, trim_map);
> > > > > +			error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
> > > > > +					&rablk, &bp);
> > > > >  			xfs_iunlock(dp, lock_mode);
> > > > > -			if (error || !map_info->map_valid)
> > > > > +			if (error || !bp)
> > > > >  				break;
> > > > >  
> > > > > -			/*
> > > > > -			 * Having done a read, we need to set a new offset.
> > > > > -			 */
> > > > > -			newoff = xfs_dir2_db_off_to_byte(geo,
> > > > > -							 map_info->curdb, 0);
> > > > > -			/*
> > > > > -			 * Start of the current block.
> > > > > -			 */
> > > > > -			if (curoff < newoff)
> > > > > -				curoff = newoff;
> > > > > -			/*
> > > > > -			 * Make sure we're in the right block.
> > > > > -			 */
> > > > > -			else if (curoff > newoff)
> > > > > -				ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
> > > > > -				       map_info->curdb);
> > > > >  			hdr = bp->b_addr;
> > > > >  			xfs_dir3_data_check(dp, bp);
> > > > >  			/*
> > > > > @@ -643,7 +492,6 @@ xfs_dir2_leaf_getdents(
> > > > >  		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
> > > > >  	else
> > > > >  		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
> > > > > -	kmem_free(map_info);
> > > > >  	if (bp)
> > > > >  		xfs_trans_brelse(NULL, bp);
> > > > >  	return error;
> > > > > --
> > > > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > > > the body of a message to majordomo@vger.kernel.org
> > > > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > > --
> > > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > > the body of a message to majordomo@vger.kernel.org
> > > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
  2017-05-02  7:44 ` Christoph Hellwig
@ 2017-05-02 19:02   ` Darrick J. Wong
  0 siblings, 0 replies; 12+ messages in thread
From: Darrick J. Wong @ 2017-05-02 19:02 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs, Brian Foster

On Tue, May 02, 2017 at 12:44:00AM -0700, Christoph Hellwig wrote:
> Hi Darrick,
> 
> a few comments below.  Most are cosmetic except for one which is
> a minor improvement.
> 
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> 
> with the cosmetic bits fixed up.
> 
> > +	last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
> > +	map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off));
> > +	found = xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map);
> > +	if (!found || map.br_startoff >= last_da)
> >  		goto out;
> 
> I don't think we need the found variable in this function, all the users
> only check it in the next line and then ignore it.  E.g. rewrite this
> into
> 
> 	if (!xfs_iext_lookup_extent(dp, ifp, map_off, &idx, &map))
> 		goto out;
> 	if (map.br_startoff >= last_da)
> 		goto out;

Ok.

> > +	ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> > +	if (*ra_blk >= last_da)
> > +		goto out;
> > +	else if (*ra_blk == 0)
> > +		*ra_blk = map.br_startoff;
> 
> No need for the else here.

Ok.

> > +	next_ra = map.br_startoff + geo->fsbcount;
> > +	if (next_ra >= last_da)
> > +		goto out_no_ra;
> > +	found = xfs_iext_lookup_extent(dp, ifp, next_ra, &idx, &map);
> 
> Do we really need a new full lookup here?  This should be the same
> or the next map compared to the one the original xfs_iext_lookup_extent
> returned.  So just checking if it's in the original map that could
> be stashed away or otherwise calling xfs_iext_get_extent would be more
> efficient.

Sure, that could be something like:

	next_ra = map.br_startoff + geo->fsbcount;
	if (next_ra >= last_da)
		goto out_no_ra;
	if (map.br_blockcount < geo->fsbcount &&
	    !xfs_iext_get_extent(ifp, ++idx, &map))
		goto out_no_ra;
	if (map.br_startoff >= last_da)
		goto out_no_ra;
	xfs_trim_extent(&map, next_ra, last_da - next_ra);

> > +	while (ra_want > 0) {
> > +		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
> > +		while (ra_want > 0 &&
> > +		       next_ra < map.br_startoff + map.br_blockcount) {
> > +			if (next_ra >= last_da) {
> > +				*ra_blk = last_da;
> > +				break;
> > +			} else if (next_ra > *ra_blk) {
> 
> No need for the else.

Ok.

--D
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
  2017-04-22 12:15 ` Brian Foster
@ 2017-04-24 21:31   ` Darrick J. Wong
  0 siblings, 0 replies; 12+ messages in thread
From: Darrick J. Wong @ 2017-04-24 21:31 UTC (permalink / raw)
  To: Brian Foster
  Cc: Eric Sandeen, linux-xfs, Carlos Maiolino, billodo, Dave Chinner

On Sat, Apr 22, 2017 at 08:15:33AM -0400, Brian Foster wrote:
> On Tue, Apr 18, 2017 at 05:14:34PM -0700, Darrick J. Wong wrote:
> > Currently, the dir2 leaf block getdents function uses a complex state
> > tracking mechanism to create a shadow copy of the block mappings and
> > then uses the shadow copy to schedule readahead.  Since the read and
> > readahead functions are perfectly capable of reading the mappings
> > themselves, we can tear all that out in favor of a simpler function that
> > simply keeps pushing the readahead window further out.
> > 
> > Inspired-by: Dave Chinner <david@fromorbit.com>
> > Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> > ---
> 
> I attempted to take a look at this yesterday (email has been dead) but
> noticed it didn't apply to for-next (w/ or w/o Eric's fix)..?

It should apply with both Eric and your fixes applied...

> >  fs/xfs/xfs_dir2_readdir.c |  324 ++++++++++++---------------------------------
> >  1 file changed, 87 insertions(+), 237 deletions(-)
> > 
> > diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
> > index 929e8b6..290c610 100644
> > --- a/fs/xfs/xfs_dir2_readdir.c
> > +++ b/fs/xfs/xfs_dir2_readdir.c
> > @@ -243,215 +243,109 @@ xfs_dir2_block_getdents(
> ...
> > +	while (ra_want > 0 && next_ra < last_da) {
> > +		nmap = 1;
> > +		error = xfs_bmapi_read(dp, next_ra, last_da - next_ra,
> > +				&map, &nmap, 0);
> > +		if (error || !nmap)
> > +			break;
> > +		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
> > +		while (map.br_startblock != HOLESTARTBLOCK &&
> > +		       next_ra < map.br_startoff + map.br_blockcount) {
> > +			xfs_dir3_data_readahead(dp, next_ra, -2);
> > +			*ra_blk = next_ra;
> > +			ra_want -= geo->fsbcount;
> > +			next_ra += geo->fsbcount;
> >  		}
> 
> FWIW and not having looked at the rest of the patch, it does look like
> the readahead window can stretch far beyond the expected size if you
> happen to have a large contiguous extent (IOW, the inner loop doesn't
> consider ra_want).

Oops, good catch.  I'll fix that before the next submission.

--D

> 
> Brian
> 
> > +		next_ra = map.br_startoff + map.br_blockcount;
> >  	}
> >  	blk_finish_plug(&plug);
> >  
> > @@ -475,14 +369,14 @@ xfs_dir2_leaf_getdents(
> >  	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
> >  	xfs_dir2_data_entry_t	*dep;		/* data entry */
> >  	xfs_dir2_data_unused_t	*dup;		/* unused entry */
> > -	int			error = 0;	/* error return value */
> > -	int			length;		/* temporary length value */
> > -	int			byteoff;	/* offset in current block */
> > -	xfs_dir2_off_t		curoff;		/* current overall offset */
> > -	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
> >  	char			*ptr = NULL;	/* pointer to current data */
> > -	struct xfs_dir2_leaf_map_info *map_info;
> >  	struct xfs_da_geometry	*geo = args->geo;
> > +	xfs_dablk_t		rablk = 0;	/* current readahead block */
> > +	xfs_dir2_off_t		curoff;		/* current overall offset */
> > +	int			length;		/* temporary length value */
> > +	int			byteoff;	/* offset in current block */
> > +	int			lock_mode;
> > +	int			error = 0;	/* error return value */
> >  
> >  	/*
> >  	 * If the offset is at or past the largest allowed value,
> > @@ -492,30 +386,12 @@ xfs_dir2_leaf_getdents(
> >  		return 0;
> >  
> >  	/*
> > -	 * Set up to bmap a number of blocks based on the caller's
> > -	 * buffer size, the directory block size, and the filesystem
> > -	 * block size.
> > -	 */
> > -	length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> > -	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
> > -				(length * sizeof(struct xfs_bmbt_irec)),
> > -			       KM_SLEEP | KM_NOFS);
> > -	map_info->map_size = length;
> > -
> > -	/*
> >  	 * Inside the loop we keep the main offset value as a byte offset
> >  	 * in the directory file.
> >  	 */
> >  	curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
> >  
> >  	/*
> > -	 * Force this conversion through db so we truncate the offset
> > -	 * down to get the start of the data block.
> > -	 */
> > -	map_info->map_off = xfs_dir2_db_to_da(geo,
> > -					      xfs_dir2_byte_to_db(geo, curoff));
> > -
> > -	/*
> >  	 * Loop over directory entries until we reach the end offset.
> >  	 * Get more blocks and readahead as necessary.
> >  	 */
> > @@ -527,38 +403,13 @@ xfs_dir2_leaf_getdents(
> >  		 * current buffer, need to get another one.
> >  		 */
> >  		if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
> > -			int	lock_mode;
> > -			bool	trim_map = false;
> > -
> > -			if (bp) {
> > -				xfs_trans_brelse(args->trans, bp);
> > -				bp = NULL;
> > -				trim_map = true;
> > -			}
> > -
> >  			lock_mode = xfs_ilock_data_map_shared(dp);
> > -			error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
> > -						      &curoff, &bp, trim_map);
> > +			error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
> > +					&rablk, &bp);
> >  			xfs_iunlock(dp, lock_mode);
> > -			if (error || !map_info->map_valid)
> > +			if (error || !bp)
> >  				break;
> >  
> > -			/*
> > -			 * Having done a read, we need to set a new offset.
> > -			 */
> > -			newoff = xfs_dir2_db_off_to_byte(geo,
> > -							 map_info->curdb, 0);
> > -			/*
> > -			 * Start of the current block.
> > -			 */
> > -			if (curoff < newoff)
> > -				curoff = newoff;
> > -			/*
> > -			 * Make sure we're in the right block.
> > -			 */
> > -			else if (curoff > newoff)
> > -				ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
> > -				       map_info->curdb);
> >  			hdr = bp->b_addr;
> >  			xfs_dir3_data_check(dp, bp);
> >  			/*
> > @@ -643,7 +494,6 @@ xfs_dir2_leaf_getdents(
> >  		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
> >  	else
> >  		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
> > -	kmem_free(map_info);
> >  	if (bp)
> >  		xfs_trans_brelse(args->trans, bp);
> >  	return error;
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
  2017-04-19  0:14 Darrick J. Wong
  2017-04-19  1:34 ` Dave Chinner
@ 2017-04-22 12:15 ` Brian Foster
  2017-04-24 21:31   ` Darrick J. Wong
  1 sibling, 1 reply; 12+ messages in thread
From: Brian Foster @ 2017-04-22 12:15 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Eric Sandeen, linux-xfs, Carlos Maiolino, billodo, Dave Chinner

On Tue, Apr 18, 2017 at 05:14:34PM -0700, Darrick J. Wong wrote:
> Currently, the dir2 leaf block getdents function uses a complex state
> tracking mechanism to create a shadow copy of the block mappings and
> then uses the shadow copy to schedule readahead.  Since the read and
> readahead functions are perfectly capable of reading the mappings
> themselves, we can tear all that out in favor of a simpler function that
> simply keeps pushing the readahead window further out.
> 
> Inspired-by: Dave Chinner <david@fromorbit.com>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
> ---

I attempted to take a look at this yesterday (email has been dead) but
noticed it didn't apply to for-next (w/ or w/o Eric's fix)..?

>  fs/xfs/xfs_dir2_readdir.c |  324 ++++++++++++---------------------------------
>  1 file changed, 87 insertions(+), 237 deletions(-)
> 
> diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
> index 929e8b6..290c610 100644
> --- a/fs/xfs/xfs_dir2_readdir.c
> +++ b/fs/xfs/xfs_dir2_readdir.c
> @@ -243,215 +243,109 @@ xfs_dir2_block_getdents(
...
> +	while (ra_want > 0 && next_ra < last_da) {
> +		nmap = 1;
> +		error = xfs_bmapi_read(dp, next_ra, last_da - next_ra,
> +				&map, &nmap, 0);
> +		if (error || !nmap)
> +			break;
> +		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
> +		while (map.br_startblock != HOLESTARTBLOCK &&
> +		       next_ra < map.br_startoff + map.br_blockcount) {
> +			xfs_dir3_data_readahead(dp, next_ra, -2);
> +			*ra_blk = next_ra;
> +			ra_want -= geo->fsbcount;
> +			next_ra += geo->fsbcount;
>  		}

FWIW and not having looked at the rest of the patch, it does look like
the readahead window can stretch far beyond the expected size if you
happen to have a large contiguous extent (IOW, the inner loop doesn't
consider ra_want).

Brian

> +		next_ra = map.br_startoff + map.br_blockcount;
>  	}
>  	blk_finish_plug(&plug);
>  
> @@ -475,14 +369,14 @@ xfs_dir2_leaf_getdents(
>  	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
>  	xfs_dir2_data_entry_t	*dep;		/* data entry */
>  	xfs_dir2_data_unused_t	*dup;		/* unused entry */
> -	int			error = 0;	/* error return value */
> -	int			length;		/* temporary length value */
> -	int			byteoff;	/* offset in current block */
> -	xfs_dir2_off_t		curoff;		/* current overall offset */
> -	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
>  	char			*ptr = NULL;	/* pointer to current data */
> -	struct xfs_dir2_leaf_map_info *map_info;
>  	struct xfs_da_geometry	*geo = args->geo;
> +	xfs_dablk_t		rablk = 0;	/* current readahead block */
> +	xfs_dir2_off_t		curoff;		/* current overall offset */
> +	int			length;		/* temporary length value */
> +	int			byteoff;	/* offset in current block */
> +	int			lock_mode;
> +	int			error = 0;	/* error return value */
>  
>  	/*
>  	 * If the offset is at or past the largest allowed value,
> @@ -492,30 +386,12 @@ xfs_dir2_leaf_getdents(
>  		return 0;
>  
>  	/*
> -	 * Set up to bmap a number of blocks based on the caller's
> -	 * buffer size, the directory block size, and the filesystem
> -	 * block size.
> -	 */
> -	length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
> -	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
> -				(length * sizeof(struct xfs_bmbt_irec)),
> -			       KM_SLEEP | KM_NOFS);
> -	map_info->map_size = length;
> -
> -	/*
>  	 * Inside the loop we keep the main offset value as a byte offset
>  	 * in the directory file.
>  	 */
>  	curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
>  
>  	/*
> -	 * Force this conversion through db so we truncate the offset
> -	 * down to get the start of the data block.
> -	 */
> -	map_info->map_off = xfs_dir2_db_to_da(geo,
> -					      xfs_dir2_byte_to_db(geo, curoff));
> -
> -	/*
>  	 * Loop over directory entries until we reach the end offset.
>  	 * Get more blocks and readahead as necessary.
>  	 */
> @@ -527,38 +403,13 @@ xfs_dir2_leaf_getdents(
>  		 * current buffer, need to get another one.
>  		 */
>  		if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
> -			int	lock_mode;
> -			bool	trim_map = false;
> -
> -			if (bp) {
> -				xfs_trans_brelse(args->trans, bp);
> -				bp = NULL;
> -				trim_map = true;
> -			}
> -
>  			lock_mode = xfs_ilock_data_map_shared(dp);
> -			error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
> -						      &curoff, &bp, trim_map);
> +			error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
> +					&rablk, &bp);
>  			xfs_iunlock(dp, lock_mode);
> -			if (error || !map_info->map_valid)
> +			if (error || !bp)
>  				break;
>  
> -			/*
> -			 * Having done a read, we need to set a new offset.
> -			 */
> -			newoff = xfs_dir2_db_off_to_byte(geo,
> -							 map_info->curdb, 0);
> -			/*
> -			 * Start of the current block.
> -			 */
> -			if (curoff < newoff)
> -				curoff = newoff;
> -			/*
> -			 * Make sure we're in the right block.
> -			 */
> -			else if (curoff > newoff)
> -				ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
> -				       map_info->curdb);
>  			hdr = bp->b_addr;
>  			xfs_dir3_data_check(dp, bp);
>  			/*
> @@ -643,7 +494,6 @@ xfs_dir2_leaf_getdents(
>  		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
>  	else
>  		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
> -	kmem_free(map_info);
>  	if (bp)
>  		xfs_trans_brelse(args->trans, bp);
>  	return error;
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
  2017-04-19  0:14 Darrick J. Wong
@ 2017-04-19  1:34 ` Dave Chinner
  2017-04-22 12:15 ` Brian Foster
  1 sibling, 0 replies; 12+ messages in thread
From: Dave Chinner @ 2017-04-19  1:34 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Eric Sandeen, Brian Foster, linux-xfs, Carlos Maiolino, billodo

On Tue, Apr 18, 2017 at 05:14:34PM -0700, Darrick J. Wong wrote:
> Currently, the dir2 leaf block getdents function uses a complex state
> tracking mechanism to create a shadow copy of the block mappings and
> then uses the shadow copy to schedule readahead.  Since the read and
> readahead functions are perfectly capable of reading the mappings
> themselves, we can tear all that out in favor of a simpler function that
> simply keeps pushing the readahead window further out.
> 
> Inspired-by: Dave Chinner <david@fromorbit.com>
> Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>

FWIW, here's the patch in progress I had. I hadn't removed any of
the old code, just added a bunch of comments explaining everything
and added the new search loop. It should also handle discontiguous
directory blocks correctly, which using xfs_bmapi_read() directly
won't do...

-Dave.

----

Rework directory readahead to avoid lockdep issues...

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_da_btree.c     |   2 +-
 fs/xfs/xfs_da_btree.h     |   3 +
 fs/xfs/xfs_dir2_readdir.c | 155 +++++++++++++++++++++++++++++++++++++++-------
 3 files changed, 137 insertions(+), 23 deletions(-)

diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 9eec594..35e6aeb 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -2460,7 +2460,7 @@ xfs_buf_map_from_irec(
  *	 0 - if we mapped the block successfully
  *	>0 - positive error number if there was an error.
  */
-static int
+int
 xfs_dabuf_map(
 	struct xfs_inode	*dp,
 	xfs_dablk_t		bno,
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index c824a0a..dddc355 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -188,6 +188,9 @@ int	xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
 xfs_daddr_t	xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
 				xfs_daddr_t mapped_bno, int whichfork,
 				const struct xfs_buf_ops *ops);
+int	xfs_dabuf_map(struct xfs_inode *dp, xfs_dablk_t bno,
+		      xfs_daddr_t mappedbno, int whichfork,
+		      struct xfs_buf_map **map, int *nmaps);
 int	xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
 					  struct xfs_buf *dead_buf);
 
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 50b72f7..726701f 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -270,6 +270,58 @@ xfs_dir2_block_getdents(
 	return 0;
 }
 
+/*
+ * Directory leaf/node format readahead
+ *
+ * Readahead is done on a directory block basis. We can't
+ * hold the ilock across the entire readdir call because filldir can trigger
+ * page faults on user buffers, and that causes potential problems with page
+ * fault processing. There are no known problems, though lockdep gets
+ * *extremely* unhappy with us taking page faults with the ilock held.
+ *
+ * This is because the regular file IO path lock order is:
+ *
+ *	iolock -> page fault -> mmap_sem -> ilock
+ *
+ * and we are effectively under the same lock order constraints with readdir.
+ * The directory dirent data is the "file data" and hence lockdep has trouble
+ * telling the difference between regular file and directory inode contexts,
+ * especially with respect to memory reclaim contexts.
+ *
+ * To avoid this entire class of problem, and to avoid needing to use the iolock
+ * on directories to protect readdir operations from directory modifications, we
+ * can make use of the fact that while we hold the directory buffer lock, the
+ * directory block we are reading cannot be modified. Hence we can serialise
+ * readdir within a data block by grabbing the ilock to stabilise the mapping
+ * and lock out modifications, then read the directory block. Once we have read
+ * the directory block and hold its lock, we can drop the ilock knowing that
+ * any modification to that block will be held off until we drop the buffer
+ * lock.
+ *
+ * We can do this block-by-block lock-map-read on individual blocks because
+ * readdir already has to handle continuation between disjoint syscalls, and so
+ * if we miss an entry due to racing with a modification between block reads,
+ * the result is no different to userspace doing two smaller reads and racing
+ * with the same modification.
+ *
+ * Further, the directory DATA segment contains only dirent data, and none of
+ * the directory indexes. Hence we don't have to care about racing with index
+ * tree updates as index updates only occur once the data buffer has already
+ * been locked into a transaction.
+ *
+ * Hence readahead does not store any state from block read to block read. There
+ * are no cached mappings between readahead calls - we simply map ahead a
+ * certain number of directory blocks and issue readahead on them immediately.
+ * We don't bother trying to keep a sliding window or be smart - we simply pass
+ * back the last offset we issued readahead on and on the next readbuf call we
+ * simply extend out the readahead from that last offset.
+ *
+ * If buffers are modified between the readahead call and when we actually read
+ * them, we don't care due to the fact we map the buffer and read it in a
+ * serialisable manner. If the block is removed from the directory, then it will
+ * be a hole mapping and so we skip over it rather than try to read a stale
+ * buffer.
+ */
 struct xfs_dir2_leaf_map_info {
 	xfs_extlen_t	map_blocks;	/* number of fsbs in map */
 	xfs_dablk_t	map_off;	/* last mapped file offset */
@@ -484,6 +536,59 @@ out:
 }
 
 /*
+ * readahead a number of entire directory blocks.
+ *
+ * To support discontiguous directory blocks, we leave the mapping of the
+ * individual blocks to the readahead code. If it lands in a hole, it will
+ * return the block at the end of the hole for the next pass.
+ */
+STATIC void
+xfs_dir2_leaf_getdents_readahead(
+	struct xfs_inode	*dp,
+	xfs_dablk_t		curoff,
+	int			dirblks)
+{
+	struct xfs_buf_map	map;
+	struct xfs_buf_map	*mapp = &map;
+	int			nmaps = 1;
+	int			error;
+
+
+	while (dirblks > 0) {
+		mapp = &map;
+		nmaps = 1;
+		error = xfs_dabuf_map(dp, curoff, -2, XFS_DATA_FORK,
+				      &mapp, &nmaps);
+		if (error == -1) {
+			/* map points to a hole, skip it */
+			while (--nmaps >= 0)
+				curoff += XFS_BB_TO_FSB(dp->i_mount,
+							mapp[nmaps].bm_len);
+
+			if (mapp != &map)
+				kmem_free(mapp);
+			continue;
+		}
+		if (error)
+			break;
+
+		dirblks--;
+		error = xfs_dir3_data_readahead(dp, curoff, -1);
+		if (error < 0)
+			break;
+
+		/* wind the current offset forwards */
+		while (--nmaps >= 0)
+			curoff += XFS_BB_TO_FSB(dp->i_mount, mapp[nmaps].bm_len);
+		if (curoff >= XFS_DIR2_LEAF_OFFSET)
+			break;
+	}
+
+	if (mapp != &map)
+		kmem_free(mapp);
+}
+
+/*
  * Getdents (readdir) for leaf and node directories.
  * This reads the data blocks only, so is the same for both forms.
  */
@@ -504,7 +609,8 @@ xfs_dir2_leaf_getdents(
 	xfs_dir2_off_t		curoff;		/* current overall offset */
 	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
 	char			*ptr = NULL;	/* pointer to current data */
-	struct xfs_dir2_leaf_map_info *map_info;
+	xfs_dir2_db_t		curdb;		/* db for current block */
+	xfs_dablk_t		map_off;	/* last mapped file offset */
 
 	/*
 	 * If the offset is at or past the largest allowed value,
@@ -516,18 +622,6 @@ xfs_dir2_leaf_getdents(
 	mp = dp->i_mount;
 
 	/*
-	 * Set up to bmap a number of blocks based on the caller's
-	 * buffer size, the directory block size, and the filesystem
-	 * block size.
-	 */
-	length = howmany(bufsize + mp->m_dirblksize,
-				     mp->m_sb.sb_blocksize);
-	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
-				(length * sizeof(struct xfs_bmbt_irec)),
-			       KM_SLEEP | KM_NOFS);
-	map_info->map_size = length;
-
-	/*
 	 * Inside the loop we keep the main offset value as a byte offset
 	 * in the directory file.
 	 */
@@ -537,8 +631,15 @@ xfs_dir2_leaf_getdents(
 	 * Force this conversion through db so we truncate the offset
 	 * down to get the start of the data block.
 	 */
-	map_info->map_off = xfs_dir2_db_to_da(mp,
-					      xfs_dir2_byte_to_db(mp, curoff));
+	map_off = xfs_dir2_db_to_da(mp, xfs_dir2_byte_to_db(mp, curoff));
+
+	/*
+	 * Set up readahead based on the caller's buffer size and the directory
+	 * block size. We double the buffer size because we expect to be called
+	 * again soon to read the next buffer's worth of dirents.
+	 */
+	length = 2 * howmany(bufsize + mp->m_dirblksize, mp->m_dirblksize);
+	xfs_dir2_leaf_getdents_readahead(dp, map_off, length);
 
 	/*
 	 * Loop over directory entries until we reach the end offset.
@@ -552,16 +653,25 @@ xfs_dir2_leaf_getdents(
 		 * current buffer, need to get another one.
 		 */
 		if (!bp || ptr >= (char *)bp->b_addr + mp->m_dirblksize) {
+			if (bp)
+				xfs_trans_brelse(NULL, bp);
 
-			error = xfs_dir2_leaf_readbuf(dp, bufsize, map_info,
-						      &curoff, &bp);
-			if (error || !map_info->map_valid)
+			curdb = xfs_dir2_da_to_db(mp, curoff);
+			error = xfs_dir3_data_read(NULL, dp, curoff, -1, &bp);
+			if (error)
 				break;
+			if (!bp) {
+				/* landed in a hole */
+				/* XXX: need to map and skip hole! */
+				curoff += mp->m_dirblksize;
+				continue;
+			}
 
 			/*
 			 * Having done a read, we need to set a new offset.
 			 */
-			newoff = xfs_dir2_db_off_to_byte(mp, map_info->curdb, 0);
+			newoff = xfs_dir2_db_off_to_byte(mp, curdb, 0);
+
 			/*
 			 * Start of the current block.
 			 */
@@ -571,15 +681,17 @@ xfs_dir2_leaf_getdents(
 			 * Make sure we're in the right block.
 			 */
 			else if (curoff > newoff)
-				ASSERT(xfs_dir2_byte_to_db(mp, curoff) ==
-				       map_info->curdb);
+				ASSERT(xfs_dir2_byte_to_db(mp, curoff) == curdb);
+
 			hdr = bp->b_addr;
 			xfs_dir3_data_check(dp, bp);
+
 			/*
 			 * Find our position in the block.
 			 */
 			ptr = (char *)dp->d_ops->data_entry_p(hdr);
 			byteoff = xfs_dir2_byte_to_off(mp, curoff);
+
 			/*
 			 * Skip past the header.
 			 */
@@ -657,7 +769,6 @@ xfs_dir2_leaf_getdents(
 		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
 	else
 		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
-	kmem_free(map_info);
 	if (bp)
 		xfs_trans_brelse(NULL, bp);
 	return error;


-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness
@ 2017-04-19  0:14 Darrick J. Wong
  2017-04-19  1:34 ` Dave Chinner
  2017-04-22 12:15 ` Brian Foster
  0 siblings, 2 replies; 12+ messages in thread
From: Darrick J. Wong @ 2017-04-19  0:14 UTC (permalink / raw)
  To: Eric Sandeen
  Cc: Brian Foster, linux-xfs, Carlos Maiolino, billodo, Dave Chinner

Currently, the dir2 leaf block getdents function uses a complex state
tracking mechanism to create a shadow copy of the block mappings and
then uses the shadow copy to schedule readahead.  Since the read and
readahead functions are perfectly capable of reading the mappings
themselves, we can tear all that out in favor of a simpler function that
simply keeps pushing the readahead window further out.

Inspired-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 fs/xfs/xfs_dir2_readdir.c |  324 ++++++++++++---------------------------------
 1 file changed, 87 insertions(+), 237 deletions(-)

diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 929e8b6..290c610 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -243,215 +243,109 @@ xfs_dir2_block_getdents(
 	return 0;
 }
 
-struct xfs_dir2_leaf_map_info {
-	xfs_extlen_t	map_blocks;	/* number of fsbs in map */
-	xfs_dablk_t	map_off;	/* last mapped file offset */
-	int		map_size;	/* total entries in *map */
-	int		map_valid;	/* valid entries in *map */
-	int		nmap;		/* mappings to ask xfs_bmapi */
-	xfs_dir2_db_t	curdb;		/* db for current block */
-	int		ra_current;	/* number of read-ahead blks */
-	int		ra_index;	/* *map index for read-ahead */
-	int		ra_offset;	/* map entry offset for ra */
-	int		ra_want;	/* readahead count wanted */
-	struct xfs_bmbt_irec map[];	/* map vector for blocks */
-};
-
+/*
+ * Read a directory block and initiate readahead for blocks beyond that.
+ *
+ * Readahead does not store any state from block read to block read.
+ * There are no cached mappings between readahead calls - we simply read
+ * the requested directory block and issue readahead of subsequent block
+ * offsets immediately.  We don't bother trying to keep a sliding window
+ * or be smart - we simply pass back the last offset we issued readahead
+ * on and on the next readbuf call we simply extend out the readahead
+ * from that last offset.
+ */
 STATIC int
 xfs_dir2_leaf_readbuf(
 	struct xfs_da_args	*args,
 	size_t			bufsize,
-	struct xfs_dir2_leaf_map_info *mip,
-	xfs_dir2_off_t		*curoff,
-	struct xfs_buf		**bpp,
-	bool			trim_map)
+	xfs_dir2_off_t		*cur_off,
+	xfs_dablk_t		*ra_blk,
+	struct xfs_buf		**bpp)
 {
 	struct xfs_inode	*dp = args->dp;
 	struct xfs_buf		*bp = NULL;
-	struct xfs_bmbt_irec	*map = mip->map;
+	struct xfs_da_geometry	*geo = args->geo;
+	struct xfs_bmbt_irec	map;
 	struct blk_plug		plug;
+	xfs_daddr_t		old_daddr = 0;
+	xfs_dir2_off_t		new_off;
+	xfs_dablk_t		next_ra;
+	xfs_dablk_t		map_off;
+	xfs_dablk_t		last_da;
+	int			nmap;
+	int			ra_want;
 	int			error = 0;
-	int			length;
-	int			i;
-	int			j;
-	struct xfs_da_geometry	*geo = args->geo;
-
-	/*
-	 * If the caller just finished processing a buffer, it will tell us
-	 * we need to trim that block out of the mapping now it is done.
-	 */
-	if (trim_map) {
-		mip->map_blocks -= geo->fsbcount;
-		/*
-		 * Loop to get rid of the extents for the
-		 * directory block.
-		 */
-		for (i = geo->fsbcount; i > 0; ) {
-			j = min_t(int, map->br_blockcount, i);
-			map->br_blockcount -= j;
-			map->br_startblock += j;
-			map->br_startoff += j;
-			/*
-			 * If mapping is done, pitch it from
-			 * the table.
-			 */
-			if (!map->br_blockcount && --mip->map_valid)
-				memmove(&map[0], &map[1],
-					sizeof(map[0]) * mip->map_valid);
-			i -= j;
-		}
-	}
-
-	/*
-	 * Recalculate the readahead blocks wanted.
-	 */
-	mip->ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1;
-	ASSERT(mip->ra_want >= 0);
-
-	/*
-	 * If we don't have as many as we want, and we haven't
-	 * run out of data blocks, get some more mappings.
-	 */
-	if (1 + mip->ra_want > mip->map_blocks &&
-	    mip->map_off < xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET)) {
-		/*
-		 * Get more bmaps, fill in after the ones
-		 * we already have in the table.
-		 */
-		mip->nmap = mip->map_size - mip->map_valid;
-		error = xfs_bmapi_read(dp, mip->map_off,
-				xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET) -
-								mip->map_off,
-				&map[mip->map_valid], &mip->nmap, 0);
-
-		/*
-		 * Don't know if we should ignore this or try to return an
-		 * error.  The trouble with returning errors is that readdir
-		 * will just stop without actually passing the error through.
-		 */
-		if (error)
-			goto out;	/* XXX */
 
-		/*
-		 * If we got all the mappings we asked for, set the final map
-		 * offset based on the last bmap value received.  Otherwise,
-		 * we've reached the end.
-		 */
-		if (mip->nmap == mip->map_size - mip->map_valid) {
-			i = mip->map_valid + mip->nmap - 1;
-			mip->map_off = map[i].br_startoff + map[i].br_blockcount;
-		} else
-			mip->map_off = xfs_dir2_byte_to_da(geo,
-							XFS_DIR2_LEAF_OFFSET);
-
-		/*
-		 * Look for holes in the mapping, and eliminate them.  Count up
-		 * the valid blocks.
-		 */
-		for (i = mip->map_valid; i < mip->map_valid + mip->nmap; ) {
-			if (map[i].br_startblock == HOLESTARTBLOCK) {
-				mip->nmap--;
-				length = mip->map_valid + mip->nmap - i;
-				if (length)
-					memmove(&map[i], &map[i + 1],
-						sizeof(map[i]) * length);
-			} else {
-				mip->map_blocks += map[i].br_blockcount;
-				i++;
-			}
-		}
-		mip->map_valid += mip->nmap;
+	/* Flush old buf; remember its daddr for error detection. */
+	if (*bpp) {
+		old_daddr = (*bpp)->b_bn;
+		xfs_trans_brelse(args->trans, *bpp);
+		*bpp = NULL;
 	}
 
 	/*
-	 * No valid mappings, so no more data blocks.
+	 * Look for mapped directory blocks at or above the current
+	 * offset.  We must truncate down to the nearest directory
+	 * block to start the scanning operation.
 	 */
-	if (!mip->map_valid) {
-		*curoff = xfs_dir2_da_to_byte(geo, mip->map_off);
+	last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
+	map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off));
+	do {
+		nmap = 1;
+		error = xfs_bmapi_read(dp, map_off, last_da - map_off,
+				&map, &nmap, 0);
+		if (error || !nmap)
+			goto out;
+		map_off = map.br_startoff + map.br_blockcount;
+	} while (map_off < last_da && map.br_startblock == HOLESTARTBLOCK);
+
+	if (map.br_startblock == HOLESTARTBLOCK)
 		goto out;
-	}
 
-	/*
-	 * Read the directory block starting at the first mapping.
-	 */
-	mip->curdb = xfs_dir2_da_to_db(geo, map->br_startoff);
-	error = xfs_dir3_data_read(args->trans, dp, map->br_startoff,
-			map->br_blockcount >= geo->fsbcount ?
-			    XFS_FSB_TO_DADDR(dp->i_mount, map->br_startblock) :
-			    -1, &bp);
-	/*
-	 * Should just skip over the data block instead of giving up.
-	 */
+	/* Read the directory block of that first mapping. */
+	new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
+	if (new_off > *cur_off)
+		*cur_off = new_off;
+	error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, -1, &bp);
 	if (error)
-		goto out;	/* XXX */
+		goto out;
 
 	/*
-	 * Adjust the current amount of read-ahead: we just read a block that
-	 * was previously ra.
+	 * Make sure we don't just get the same old block back.
 	 */
-	if (mip->ra_current)
-		mip->ra_current -= geo->fsbcount;
+	if (!bp || bp->b_bn == old_daddr) {
+		ASSERT(0);
+		if (bp)
+			xfs_trans_brelse(args->trans, bp);
+		error = -EFSCORRUPTED;
+		goto out;
+	}
 
 	/*
-	 * Do we need more readahead?
-	 * Each loop tries to process 1 full dir blk; last may be partial.
+	 * Do we need more readahead for this call?  Issue ra against
+	 * bufsize's worth of dir blocks or until we hit the end of the
+	 * data section.
 	 */
+	ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog)) - 1;
+	if (*ra_blk == 0)
+		*ra_blk = map.br_startoff;
+	next_ra = *ra_blk + geo->fsbcount;
 	blk_start_plug(&plug);
-	for (mip->ra_index = mip->ra_offset = i = 0;
-	     mip->ra_want > mip->ra_current && i < mip->map_blocks;
-	     i += geo->fsbcount) {
-		ASSERT(mip->ra_index < mip->map_valid);
-		/*
-		 * Read-ahead a contiguous directory block.
-		 */
-		if (i > mip->ra_current &&
-		    (map[mip->ra_index].br_blockcount - mip->ra_offset) >=
-		     geo->fsbcount) {
-			xfs_dir3_data_readahead(dp,
-				map[mip->ra_index].br_startoff + mip->ra_offset,
-				XFS_FSB_TO_DADDR(dp->i_mount,
-					map[mip->ra_index].br_startblock +
-							mip->ra_offset));
-			mip->ra_current = i;
-		}
-
-		/*
-		 * Read-ahead a non-contiguous directory block.  This doesn't
-		 * use our mapping, but this is a very rare case.
-		 */
-		else if (i > mip->ra_current) {
-			xfs_dir3_data_readahead(dp,
-					map[mip->ra_index].br_startoff +
-							mip->ra_offset, -1);
-			mip->ra_current = i;
-		}
-
-		/*
-		 * Advance offset through the mapping table, processing a full
-		 * dir block even if it is fragmented into several extents.
-		 * But stop if we have consumed all valid mappings, even if
-		 * it's not yet a full directory block.
-		 */
-		for (j = 0;
-		     j < geo->fsbcount && mip->ra_index < mip->map_valid;
-		     j += length ) {
-			/*
-			 * The rest of this extent but not more than a dir
-			 * block.
-			 */
-			length = min_t(int, geo->fsbcount - j,
-					map[mip->ra_index].br_blockcount -
-							mip->ra_offset);
-			mip->ra_offset += length;
-
-			/*
-			 * Advance to the next mapping if this one is used up.
-			 */
-			if (mip->ra_offset == map[mip->ra_index].br_blockcount) {
-				mip->ra_offset = 0;
-				mip->ra_index++;
-			}
+	while (ra_want > 0 && next_ra < last_da) {
+		nmap = 1;
+		error = xfs_bmapi_read(dp, next_ra, last_da - next_ra,
+				&map, &nmap, 0);
+		if (error || !nmap)
+			break;
+		next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
+		while (map.br_startblock != HOLESTARTBLOCK &&
+		       next_ra < map.br_startoff + map.br_blockcount) {
+			xfs_dir3_data_readahead(dp, next_ra, -2);
+			*ra_blk = next_ra;
+			ra_want -= geo->fsbcount;
+			next_ra += geo->fsbcount;
 		}
+		next_ra = map.br_startoff + map.br_blockcount;
 	}
 	blk_finish_plug(&plug);
 
@@ -475,14 +369,14 @@ xfs_dir2_leaf_getdents(
 	xfs_dir2_data_hdr_t	*hdr;		/* data block header */
 	xfs_dir2_data_entry_t	*dep;		/* data entry */
 	xfs_dir2_data_unused_t	*dup;		/* unused entry */
-	int			error = 0;	/* error return value */
-	int			length;		/* temporary length value */
-	int			byteoff;	/* offset in current block */
-	xfs_dir2_off_t		curoff;		/* current overall offset */
-	xfs_dir2_off_t		newoff;		/* new curoff after new blk */
 	char			*ptr = NULL;	/* pointer to current data */
-	struct xfs_dir2_leaf_map_info *map_info;
 	struct xfs_da_geometry	*geo = args->geo;
+	xfs_dablk_t		rablk = 0;	/* current readahead block */
+	xfs_dir2_off_t		curoff;		/* current overall offset */
+	int			length;		/* temporary length value */
+	int			byteoff;	/* offset in current block */
+	int			lock_mode;
+	int			error = 0;	/* error return value */
 
 	/*
 	 * If the offset is at or past the largest allowed value,
@@ -492,30 +386,12 @@ xfs_dir2_leaf_getdents(
 		return 0;
 
 	/*
-	 * Set up to bmap a number of blocks based on the caller's
-	 * buffer size, the directory block size, and the filesystem
-	 * block size.
-	 */
-	length = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
-	map_info = kmem_zalloc(offsetof(struct xfs_dir2_leaf_map_info, map) +
-				(length * sizeof(struct xfs_bmbt_irec)),
-			       KM_SLEEP | KM_NOFS);
-	map_info->map_size = length;
-
-	/*
 	 * Inside the loop we keep the main offset value as a byte offset
 	 * in the directory file.
 	 */
 	curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
 
 	/*
-	 * Force this conversion through db so we truncate the offset
-	 * down to get the start of the data block.
-	 */
-	map_info->map_off = xfs_dir2_db_to_da(geo,
-					      xfs_dir2_byte_to_db(geo, curoff));
-
-	/*
 	 * Loop over directory entries until we reach the end offset.
 	 * Get more blocks and readahead as necessary.
 	 */
@@ -527,38 +403,13 @@ xfs_dir2_leaf_getdents(
 		 * current buffer, need to get another one.
 		 */
 		if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
-			int	lock_mode;
-			bool	trim_map = false;
-
-			if (bp) {
-				xfs_trans_brelse(args->trans, bp);
-				bp = NULL;
-				trim_map = true;
-			}
-
 			lock_mode = xfs_ilock_data_map_shared(dp);
-			error = xfs_dir2_leaf_readbuf(args, bufsize, map_info,
-						      &curoff, &bp, trim_map);
+			error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
+					&rablk, &bp);
 			xfs_iunlock(dp, lock_mode);
-			if (error || !map_info->map_valid)
+			if (error || !bp)
 				break;
 
-			/*
-			 * Having done a read, we need to set a new offset.
-			 */
-			newoff = xfs_dir2_db_off_to_byte(geo,
-							 map_info->curdb, 0);
-			/*
-			 * Start of the current block.
-			 */
-			if (curoff < newoff)
-				curoff = newoff;
-			/*
-			 * Make sure we're in the right block.
-			 */
-			else if (curoff > newoff)
-				ASSERT(xfs_dir2_byte_to_db(geo, curoff) ==
-				       map_info->curdb);
 			hdr = bp->b_addr;
 			xfs_dir3_data_check(dp, bp);
 			/*
@@ -643,7 +494,6 @@ xfs_dir2_leaf_getdents(
 		ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
 	else
 		ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
-	kmem_free(map_info);
 	if (bp)
 		xfs_trans_brelse(args->trans, bp);
 	return error;

^ permalink raw reply related	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2017-05-02 19:03 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-28 19:46 [PATCH] xfs: refactor dir2 leaf readahead shadow buffer cleverness Darrick J. Wong
2017-05-01 18:32 ` Brian Foster
2017-05-01 21:50   ` Darrick J. Wong
2017-05-01 23:13     ` Brian Foster
2017-05-01 23:30       ` Darrick J. Wong
2017-05-02 14:11         ` Brian Foster
2017-05-02  7:44 ` Christoph Hellwig
2017-05-02 19:02   ` Darrick J. Wong
  -- strict thread matches above, loose matches on Subject: below --
2017-04-19  0:14 Darrick J. Wong
2017-04-19  1:34 ` Dave Chinner
2017-04-22 12:15 ` Brian Foster
2017-04-24 21:31   ` Darrick J. Wong

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.