All of lore.kernel.org
 help / color / mirror / Atom feed
* minleft fixes
@ 2016-12-13 15:59 Christoph Hellwig
  2016-12-13 15:59 ` [PATCH 1/4] xfs: fix bogus minleft manipulations Christoph Hellwig
                   ` (4 more replies)
  0 siblings, 5 replies; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-13 15:59 UTC (permalink / raw)
  To: linux-xfs; +Cc: eguan, darrick.wong

[now to the correct list address]

This is my attempt to fix the problems with rmap and reflink file system
running out of space during delayed extent conversions.  It turns out
the way minleft has been handled has always been bogus, but the rmap
enablement made it even worse..

This has survived xfstests xfs/109 on a 2k fs for 2 1/2 hour here now,
but Eryu was much better than me at reproducing the problems, so it could
use some further testing.


^ permalink raw reply	[flat|nested] 31+ messages in thread

* [PATCH 1/4] xfs: fix bogus minleft manipulations
  2016-12-13 15:59 minleft fixes Christoph Hellwig
@ 2016-12-13 15:59 ` Christoph Hellwig
  2016-12-14 17:35   ` Brian Foster
  2016-12-15 22:09   ` Dave Chinner
  2016-12-13 15:59 ` [PATCH 2/4] xfs: calculate minleft correctly for bmap allocations Christoph Hellwig
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-13 15:59 UTC (permalink / raw)
  To: linux-xfs; +Cc: eguan, darrick.wong

We can't just set minleft to 0 when we're low on space - that's exactly
what we need minleft for: to protect space in the AG for btree block
allocations when we are low on free space.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_alloc.c      | 24 +++++++-----------------
 fs/xfs/libxfs/xfs_bmap.c       |  3 ---
 fs/xfs/libxfs/xfs_bmap_btree.c | 14 --------------
 3 files changed, 7 insertions(+), 34 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 5050056..91a6c17 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2638,12 +2638,10 @@ xfs_alloc_vextent(
 	xfs_agblock_t	agsize;	/* allocation group size */
 	int		error;
 	int		flags;	/* XFS_ALLOC_FLAG_... locking flags */
-	xfs_extlen_t	minleft;/* minimum left value, temp copy */
 	xfs_mount_t	*mp;	/* mount structure pointer */
 	xfs_agnumber_t	sagno;	/* starting allocation group number */
 	xfs_alloctype_t	type;	/* input allocation type */
 	int		bump_rotor = 0;
-	int		no_min = 0;
 	xfs_agnumber_t	rotorstep = xfs_rotorstep; /* inode32 agf stepper */
 
 	mp = args->mp;
@@ -2672,7 +2670,6 @@ xfs_alloc_vextent(
 		trace_xfs_alloc_vextent_badargs(args);
 		return 0;
 	}
-	minleft = args->minleft;
 
 	switch (type) {
 	case XFS_ALLOCTYPE_THIS_AG:
@@ -2683,9 +2680,7 @@ xfs_alloc_vextent(
 		 */
 		args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
 		args->pag = xfs_perag_get(mp, args->agno);
-		args->minleft = 0;
 		error = xfs_alloc_fix_freelist(args, 0);
-		args->minleft = minleft;
 		if (error) {
 			trace_xfs_alloc_vextent_nofix(args);
 			goto error0;
@@ -2750,9 +2745,7 @@ xfs_alloc_vextent(
 		 */
 		for (;;) {
 			args->pag = xfs_perag_get(mp, args->agno);
-			if (no_min) args->minleft = 0;
 			error = xfs_alloc_fix_freelist(args, flags);
-			args->minleft = minleft;
 			if (error) {
 				trace_xfs_alloc_vextent_nofix(args);
 				goto error0;
@@ -2792,20 +2785,17 @@ xfs_alloc_vextent(
 			 * or switch to non-trylock mode.
 			 */
 			if (args->agno == sagno) {
-				if (no_min == 1) {
+				if (flags == 0) {
 					args->agbno = NULLAGBLOCK;
 					trace_xfs_alloc_vextent_allfailed(args);
 					break;
 				}
-				if (flags == 0) {
-					no_min = 1;
-				} else {
-					flags = 0;
-					if (type == XFS_ALLOCTYPE_START_BNO) {
-						args->agbno = XFS_FSB_TO_AGBNO(mp,
-							args->fsbno);
-						args->type = XFS_ALLOCTYPE_NEAR_BNO;
-					}
+
+				flags = 0;
+				if (type == XFS_ALLOCTYPE_START_BNO) {
+					args->agbno = XFS_FSB_TO_AGBNO(mp,
+						args->fsbno);
+					args->type = XFS_ALLOCTYPE_NEAR_BNO;
 				}
 			}
 			xfs_perag_put(args->pag);
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2760bc3..44773c9 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3812,7 +3812,6 @@ xfs_bmap_btalloc(
 		args.fsbno = 0;
 		args.type = XFS_ALLOCTYPE_FIRST_AG;
 		args.total = ap->minlen;
-		args.minleft = 0;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 		ap->dfops->dop_low = true;
@@ -4344,8 +4343,6 @@ xfs_bmapi_allocate(
 	if (error)
 		return error;
 
-	if (bma->dfops->dop_low)
-		bma->minleft = 0;
 	if (bma->cur)
 		bma->cur->bc_private.b.firstblock = *bma->firstblock;
 	if (bma->blkno == NULLFSBLOCK)
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index d6330c2..9581ee8 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -499,20 +499,6 @@ xfs_bmbt_alloc_block(
 		goto try_another_ag;
 	}
 
-	if (args.fsbno == NULLFSBLOCK && args.minleft) {
-		/*
-		 * Could not find an AG with enough free space to satisfy
-		 * a full btree split.  Try again without minleft and if
-		 * successful activate the lowspace algorithm.
-		 */
-		args.fsbno = 0;
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
-		args.minleft = 0;
-		error = xfs_alloc_vextent(&args);
-		if (error)
-			goto error0;
-		cur->bc_private.b.dfops->dop_low = true;
-	}
 	if (args.fsbno == NULLFSBLOCK) {
 		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
 		*stat = 0;
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 2/4] xfs: calculate minleft correctly for bmap allocations
  2016-12-13 15:59 minleft fixes Christoph Hellwig
  2016-12-13 15:59 ` [PATCH 1/4] xfs: fix bogus minleft manipulations Christoph Hellwig
@ 2016-12-13 15:59 ` Christoph Hellwig
  2016-12-14 18:24   ` Brian Foster
  2016-12-13 15:59 ` [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available Christoph Hellwig
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-13 15:59 UTC (permalink / raw)
  To: linux-xfs; +Cc: eguan, darrick.wong

We need to take into account the worst possible number of bmap and rmap
btree block allocations.  Use the helper we have to calculate it instead
of doing an overly optimistic estimate.  Also move the call to calculate
it into the function that sets the allocation parameters, as we have
more accurate information about the allocation there.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c | 16 ++++++----------
 fs/xfs/libxfs/xfs_bmap.h |  1 -
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 44773c9..2126ecc 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3764,7 +3764,12 @@ xfs_bmap_btalloc(
 		args.alignment = 1;
 		args.minalignslop = 0;
 	}
-	args.minleft = ap->minleft;
+
+	/*
+	 * Ensure we have enough blocks left in the AG to satisfy all possible
+	 * bmap and rmap tree block allocations.
+	 */
+	args.minleft = xfs_bmap_worst_indlen(ap->ip, args.maxlen);
 	args.wasdel = ap->wasdel;
 	args.resv = XFS_AG_RESV_NONE;
 	args.datatype = ap->datatype;
@@ -4572,15 +4577,6 @@ xfs_bmapi_write(
 
 	XFS_STATS_INC(mp, xs_blk_mapw);
 
-	if (*firstblock == NULLFSBLOCK) {
-		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
-			bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-		else
-			bma.minleft = 1;
-	} else {
-		bma.minleft = 0;
-	}
-
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(tp, ip, whichfork);
 		if (error)
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index cecd094..92e6755 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -49,7 +49,6 @@ struct xfs_bmalloca {
 
 	xfs_extlen_t		total;	/* total blocks needed for xaction */
 	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
-	xfs_extlen_t		minleft; /* amount must be left after alloc */
 	bool			eof;	/* set if allocating past last extent */
 	bool			wasdel;	/* replacing a delayed allocation */
 	bool			aeof;	/* allocated space at eof */
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available
  2016-12-13 15:59 minleft fixes Christoph Hellwig
  2016-12-13 15:59 ` [PATCH 1/4] xfs: fix bogus minleft manipulations Christoph Hellwig
  2016-12-13 15:59 ` [PATCH 2/4] xfs: calculate minleft correctly for bmap allocations Christoph Hellwig
@ 2016-12-13 15:59 ` Christoph Hellwig
  2016-12-14 18:24   ` Brian Foster
                     ` (2 more replies)
  2016-12-13 15:59 ` [PATCH 4/4] xfs: don't rely on ->total " Christoph Hellwig
  2016-12-14 10:51 ` minleft fixes Eryu Guan
  4 siblings, 3 replies; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-13 15:59 UTC (permalink / raw)
  To: linux-xfs; +Cc: eguan, darrick.wong

We must decide in xfs_alloc_fix_freelist if we can perform an
allocation from a given AG is possible or not based on the available
space, and should not fail the allocation past that point on a
healthy file system.

But currently we have two additional places that second-guess
xfs_alloc_fix_freelist: xfs_alloc_ag_vextent tries to adjust the
maxlen parameter to remove the reservation before doing the
allocation (but ignores the various minium freespace requirements),
and xfs_alloc_fix_minleft tries to fix up the allocated length
after we've found an extent, but ignores the reservations and also
doesn't take the AGFL into account (and thus fails allocations
for not matching minlen in some cases).

Remove all these later fixups and just correct the maxlen argument
inside xfs_alloc_fix_freelist once we have the AGF buffer locked.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_alloc.c | 78 +++++++++--------------------------------------
 fs/xfs/libxfs/xfs_alloc.h |  2 +-
 2 files changed, 16 insertions(+), 64 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 91a6c17..87328a8 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -365,36 +365,12 @@ xfs_alloc_fix_len(
 		return;
 	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
 	ASSERT(rlen % args->prod == args->mod);
+	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
+		rlen + args->minleft);
 	args->len = rlen;
 }
 
 /*
- * Fix up length if there is too little space left in the a.g.
- * Return 1 if ok, 0 if too little, should give up.
- */
-STATIC int
-xfs_alloc_fix_minleft(
-	xfs_alloc_arg_t	*args)		/* allocation argument structure */
-{
-	xfs_agf_t	*agf;		/* a.g. freelist header */
-	int		diff;		/* free space difference */
-
-	if (args->minleft == 0)
-		return 1;
-	agf = XFS_BUF_TO_AGF(args->agbp);
-	diff = be32_to_cpu(agf->agf_freeblks)
-		- args->len - args->minleft;
-	if (diff >= 0)
-		return 1;
-	args->len += diff;		/* shrink the allocated space */
-	/* casts to (int) catch length underflows */
-	if ((int)args->len >= (int)args->minlen)
-		return 1;
-	args->agbno = NULLAGBLOCK;
-	return 0;
-}
-
-/*
  * Update the two btrees, logically removing from freespace the extent
  * starting at rbno, rlen blocks.  The extent is contained within the
  * actual (current) free extent fbno for flen blocks.
@@ -689,8 +665,6 @@ xfs_alloc_ag_vextent(
 	xfs_alloc_arg_t	*args)	/* argument structure for allocation */
 {
 	int		error=0;
-	xfs_extlen_t	reservation;
-	xfs_extlen_t	oldmax;
 
 	ASSERT(args->minlen > 0);
 	ASSERT(args->maxlen > 0);
@@ -699,20 +673,6 @@ xfs_alloc_ag_vextent(
 	ASSERT(args->alignment > 0);
 
 	/*
-	 * Clamp maxlen to the amount of free space minus any reservations
-	 * that have been made.
-	 */
-	oldmax = args->maxlen;
-	reservation = xfs_ag_resv_needed(args->pag, args->resv);
-	if (args->maxlen > args->pag->pagf_freeblks - reservation)
-		args->maxlen = args->pag->pagf_freeblks - reservation;
-	if (args->maxlen == 0) {
-		args->agbno = NULLAGBLOCK;
-		args->maxlen = oldmax;
-		return 0;
-	}
-
-	/*
 	 * Branch to correct routine based on the type.
 	 */
 	args->wasfromfl = 0;
@@ -731,8 +691,6 @@ xfs_alloc_ag_vextent(
 		/* NOTREACHED */
 	}
 
-	args->maxlen = oldmax;
-
 	if (error || args->agbno == NULLAGBLOCK)
 		return error;
 
@@ -841,9 +799,6 @@ xfs_alloc_ag_vextent_exact(
 	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
 						- args->agbno;
 	xfs_alloc_fix_len(args);
-	if (!xfs_alloc_fix_minleft(args))
-		goto not_found;
-
 	ASSERT(args->agbno + args->len <= tend);
 
 	/*
@@ -1149,11 +1104,6 @@ xfs_alloc_ag_vextent_near(
 		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
 		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
 		args->len = blen;
-		if (!xfs_alloc_fix_minleft(args)) {
-			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-			trace_xfs_alloc_near_nominleft(args);
-			return 0;
-		}
 		blen = args->len;
 		/*
 		 * We are allocating starting at bnew for blen blocks.
@@ -1346,12 +1296,6 @@ xfs_alloc_ag_vextent_near(
 	 */
 	args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
 	xfs_alloc_fix_len(args);
-	if (!xfs_alloc_fix_minleft(args)) {
-		trace_xfs_alloc_near_nominleft(args);
-		xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR);
-		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
-		return 0;
-	}
 	rlen = args->len;
 	(void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment,
 				     args->datatype, ltbnoa, ltlena, &ltnew);
@@ -1553,8 +1497,6 @@ xfs_alloc_ag_vextent_size(
 	}
 	xfs_alloc_fix_len(args);
 
-	if (!xfs_alloc_fix_minleft(args))
-		goto out_nominleft;
 	rlen = args->len;
 	XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0);
 	/*
@@ -2073,10 +2015,19 @@ xfs_alloc_space_available(
 
 	/* do we have enough free space remaining for the allocation? */
 	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
-			  reservation - min_free - args->total);
-	if (available < (int)args->minleft || available <= 0)
+			  reservation - min_free - args->minleft);
+	if (available < (int)args->total)
 		return false;
 
+	/*
+	 * Clamp maxlen to the amount of free space available for the actual
+	 * extent allocation.
+	 */
+	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
+		args->maxlen = available;
+		ASSERT(args->maxlen > 0);
+	}
+
 	return true;
 }
 
@@ -2122,7 +2073,8 @@ xfs_alloc_fix_freelist(
 	}
 
 	need = xfs_alloc_min_freelist(mp, pag);
-	if (!xfs_alloc_space_available(args, need, flags))
+	if (!xfs_alloc_space_available(args, need, flags |
+			XFS_ALLOC_FLAG_CHECK))
 		goto out_agbp_relse;
 
 	/*
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 7c404a6..1d0f48a 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -56,7 +56,7 @@ typedef unsigned int xfs_alloctype_t;
 #define	XFS_ALLOC_FLAG_FREEING	0x00000002  /* indicate caller is freeing extents*/
 #define	XFS_ALLOC_FLAG_NORMAP	0x00000004  /* don't modify the rmapbt */
 #define	XFS_ALLOC_FLAG_NOSHRINK	0x00000008  /* don't shrink the freelist */
-
+#define	XFS_ALLOC_FLAG_CHECK	0x00000010  /* test only, don't modify args */
 
 /*
  * Argument structure for xfs_alloc routines.
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* [PATCH 4/4] xfs: don't rely on ->total in xfs_alloc_space_available
  2016-12-13 15:59 minleft fixes Christoph Hellwig
                   ` (2 preceding siblings ...)
  2016-12-13 15:59 ` [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available Christoph Hellwig
@ 2016-12-13 15:59 ` Christoph Hellwig
  2016-12-14 18:30   ` Brian Foster
  2016-12-14 10:51 ` minleft fixes Eryu Guan
  4 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-13 15:59 UTC (permalink / raw)
  To: linux-xfs; +Cc: eguan, darrick.wong

->total is a bit of an odd parameter passed down to the low-level
allocator all the way from the high-level callers.  It's supposed to
contain the maximum number of blocks to be allocated for the whole
transaction [1].

But in xfs_iomap_write_allocate we only convert existing delayed
allocations and thus only have a minimal block reservation for the
current transaction, so xfs_alloc_space_available can't use it for
the allocation decisions.  Use the maximum of args->total and the
calculated block requirement to make a decision.  We probably should
get rid of args->total eventually and instead apply ->minleft more
broadly, but that will require some extensive changes all over.

[1] which creates lots of confusion as most callers don't decrement it
once doing a first allocation.  But that's for a separate series.
---
 fs/xfs/libxfs/xfs_alloc.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 87328a8..85039e5 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -1998,7 +1998,7 @@ xfs_alloc_space_available(
 	int			flags)
 {
 	struct xfs_perag	*pag = args->pag;
-	xfs_extlen_t		longest;
+	xfs_extlen_t		alloc_len, longest;
 	xfs_extlen_t		reservation; /* blocks that are still reserved */
 	int			available;
 
@@ -2008,15 +2008,16 @@ xfs_alloc_space_available(
 	reservation = xfs_ag_resv_needed(pag, args->resv);
 
 	/* do we have enough contiguous free space for the allocation? */
+	alloc_len = args->minlen + (args->alignment - 1)+ args->minalignslop;
 	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
 			reservation);
-	if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
+	if (longest < alloc_len)
 		return false;
 
 	/* do we have enough free space remaining for the allocation? */
 	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
 			  reservation - min_free - args->minleft);
-	if (available < (int)args->total)
+	if (available < (int)max(args->total, alloc_len))
 		return false;
 
 	/*
-- 
2.1.4


^ permalink raw reply related	[flat|nested] 31+ messages in thread

* Re: minleft fixes
  2016-12-13 15:59 minleft fixes Christoph Hellwig
                   ` (3 preceding siblings ...)
  2016-12-13 15:59 ` [PATCH 4/4] xfs: don't rely on ->total " Christoph Hellwig
@ 2016-12-14 10:51 ` Eryu Guan
  2016-12-15 10:24   ` Eryu Guan
  4 siblings, 1 reply; 31+ messages in thread
From: Eryu Guan @ 2016-12-14 10:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, darrick.wong

On Tue, Dec 13, 2016 at 04:59:23PM +0100, Christoph Hellwig wrote:
> [now to the correct list address]
> 
> This is my attempt to fix the problems with rmap and reflink file system
> running out of space during delayed extent conversions.  It turns out
> the way minleft has been handled has always been bogus, but the rmap
> enablement made it even worse..
> 
> This has survived xfstests xfs/109 on a 2k fs for 2 1/2 hour here now,
> but Eryu was much better than me at reproducing the problems, so it could
> use some further testing.
> 

Test results looked good to me so far. It passed 200 iterations of
xfs/109 with both xfs_2k_reflink and xfs_1k_reflink test configs. 

Thanks,
Eryu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
  2016-12-13 15:59 ` [PATCH 1/4] xfs: fix bogus minleft manipulations Christoph Hellwig
@ 2016-12-14 17:35   ` Brian Foster
  2016-12-14 19:36     ` Christoph Hellwig
  2016-12-15 22:09   ` Dave Chinner
  1 sibling, 1 reply; 31+ messages in thread
From: Brian Foster @ 2016-12-14 17:35 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Tue, Dec 13, 2016 at 04:59:24PM +0100, Christoph Hellwig wrote:
> We can't just set minleft to 0 when we're low on space - that's exactly
> what we need minleft for: to protect space in the AG for btree block
> allocations when we are low on free space.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_alloc.c      | 24 +++++++-----------------
>  fs/xfs/libxfs/xfs_bmap.c       |  3 ---
>  fs/xfs/libxfs/xfs_bmap_btree.c | 14 --------------
>  3 files changed, 7 insertions(+), 34 deletions(-)
> 
...
> diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
> index d6330c2..9581ee8 100644
> --- a/fs/xfs/libxfs/xfs_bmap_btree.c
> +++ b/fs/xfs/libxfs/xfs_bmap_btree.c
> @@ -499,20 +499,6 @@ xfs_bmbt_alloc_block(
>  		goto try_another_ag;
>  	}
>  
> -	if (args.fsbno == NULLFSBLOCK && args.minleft) {
> -		/*
> -		 * Could not find an AG with enough free space to satisfy
> -		 * a full btree split.  Try again without minleft and if
> -		 * successful activate the lowspace algorithm.
> -		 */
> -		args.fsbno = 0;
> -		args.type = XFS_ALLOCTYPE_FIRST_AG;
> -		args.minleft = 0;
> -		error = xfs_alloc_vextent(&args);
> -		if (error)
> -			goto error0;
> -		cur->bc_private.b.dfops->dop_low = true;
> -	}

I'm still trying to grok the minleft/dop_low stuff, but doesn't this
increase the risk of bmbt block allocation failure? E.g., args.minleft
is set to the transaction reservation earlier in the function. AFAIK the
res is not guaranteed to be in a particular AG, but in that case we're
just setting a start AG and ultimately cycling through the AGs. If that
fails, this hunk resets minleft, restarts at fsbno 0 and activates the
sequential agno allocation requirement for subsequent allocs. Doesn't
removing this logic mean we can successfully reserve blocks in a write
transaction that will never ultimately be able to allocate bmbt blocks,
even if the data blocks can all be allocated..?

The other thing to note here is that minleft is initialized to 0 and
only set to the transaction res conditionally, so if firstblock is
already set we won't use minleft at all. It's definitely possible I'm
still missing something about how this was all "supposed" to work
originally, but the logic that's left here seems a bit... erratic. :/

Brian

>  	if (args.fsbno == NULLFSBLOCK) {
>  		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
>  		*stat = 0;
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 2/4] xfs: calculate minleft correctly for bmap allocations
  2016-12-13 15:59 ` [PATCH 2/4] xfs: calculate minleft correctly for bmap allocations Christoph Hellwig
@ 2016-12-14 18:24   ` Brian Foster
  0 siblings, 0 replies; 31+ messages in thread
From: Brian Foster @ 2016-12-14 18:24 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Tue, Dec 13, 2016 at 04:59:25PM +0100, Christoph Hellwig wrote:
> We need to take into account the worst possible number of bmap and rmap
> btree block allocations.  Use the helper we have to calculate it instead
> of doing an overly optimistic estimate.  Also move the call to calculate
> it into the function that sets the allocation parameters, as we have
> more accurate information about the allocation there.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Seems reasonable:

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/libxfs/xfs_bmap.c | 16 ++++++----------
>  fs/xfs/libxfs/xfs_bmap.h |  1 -
>  2 files changed, 6 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 44773c9..2126ecc 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -3764,7 +3764,12 @@ xfs_bmap_btalloc(
>  		args.alignment = 1;
>  		args.minalignslop = 0;
>  	}
> -	args.minleft = ap->minleft;
> +
> +	/*
> +	 * Ensure we have enough blocks left in the AG to satisfy all possible
> +	 * bmap and rmap tree block allocations.
> +	 */
> +	args.minleft = xfs_bmap_worst_indlen(ap->ip, args.maxlen);
>  	args.wasdel = ap->wasdel;
>  	args.resv = XFS_AG_RESV_NONE;
>  	args.datatype = ap->datatype;
> @@ -4572,15 +4577,6 @@ xfs_bmapi_write(
>  
>  	XFS_STATS_INC(mp, xs_blk_mapw);
>  
> -	if (*firstblock == NULLFSBLOCK) {
> -		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
> -			bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
> -		else
> -			bma.minleft = 1;
> -	} else {
> -		bma.minleft = 0;
> -	}
> -
>  	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
>  		error = xfs_iread_extents(tp, ip, whichfork);
>  		if (error)
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index cecd094..92e6755 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -49,7 +49,6 @@ struct xfs_bmalloca {
>  
>  	xfs_extlen_t		total;	/* total blocks needed for xaction */
>  	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
> -	xfs_extlen_t		minleft; /* amount must be left after alloc */
>  	bool			eof;	/* set if allocating past last extent */
>  	bool			wasdel;	/* replacing a delayed allocation */
>  	bool			aeof;	/* allocated space at eof */
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available
  2016-12-13 15:59 ` [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available Christoph Hellwig
@ 2016-12-14 18:24   ` Brian Foster
  2016-12-14 19:37     ` Christoph Hellwig
  2016-12-15 20:41   ` Libor Klepáč
  2016-12-16  0:28   ` Dave Chinner
  2 siblings, 1 reply; 31+ messages in thread
From: Brian Foster @ 2016-12-14 18:24 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Tue, Dec 13, 2016 at 04:59:26PM +0100, Christoph Hellwig wrote:
> We must decide in xfs_alloc_fix_freelist if we can perform an
> allocation from a given AG is possible or not based on the available
> space, and should not fail the allocation past that point on a
> healthy file system.
> 
> But currently we have two additional places that second-guess
> xfs_alloc_fix_freelist: xfs_alloc_ag_vextent tries to adjust the
> maxlen parameter to remove the reservation before doing the
> allocation (but ignores the various minium freespace requirements),
> and xfs_alloc_fix_minleft tries to fix up the allocated length
> after we've found an extent, but ignores the reservations and also
> doesn't take the AGFL into account (and thus fails allocations
> for not matching minlen in some cases).
> 
> Remove all these later fixups and just correct the maxlen argument
> inside xfs_alloc_fix_freelist once we have the AGF buffer locked.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_alloc.c | 78 +++++++++--------------------------------------
>  fs/xfs/libxfs/xfs_alloc.h |  2 +-
>  2 files changed, 16 insertions(+), 64 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 91a6c17..87328a8 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
...
> @@ -2073,10 +2015,19 @@ xfs_alloc_space_available(
>  
>  	/* do we have enough free space remaining for the allocation? */
>  	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
> -			  reservation - min_free - args->total);
> -	if (available < (int)args->minleft || available <= 0)
> +			  reservation - min_free - args->minleft);
> +	if (available < (int)args->total)
>  		return false;
>  
> +	/*
> +	 * Clamp maxlen to the amount of free space available for the actual
> +	 * extent allocation.
> +	 */
> +	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
> +		args->maxlen = available;
> +		ASSERT(args->maxlen > 0);

Nit: could we assert args->maxlen >= args->minlen here as well?
Otherwise looks good to me:

Reviewed-by: Brian Foster <bfoster@redhat.com>

> +	}
> +
>  	return true;
>  }
>  
> @@ -2122,7 +2073,8 @@ xfs_alloc_fix_freelist(
>  	}
>  
>  	need = xfs_alloc_min_freelist(mp, pag);
> -	if (!xfs_alloc_space_available(args, need, flags))
> +	if (!xfs_alloc_space_available(args, need, flags |
> +			XFS_ALLOC_FLAG_CHECK))
>  		goto out_agbp_relse;
>  
>  	/*
> diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
> index 7c404a6..1d0f48a 100644
> --- a/fs/xfs/libxfs/xfs_alloc.h
> +++ b/fs/xfs/libxfs/xfs_alloc.h
> @@ -56,7 +56,7 @@ typedef unsigned int xfs_alloctype_t;
>  #define	XFS_ALLOC_FLAG_FREEING	0x00000002  /* indicate caller is freeing extents*/
>  #define	XFS_ALLOC_FLAG_NORMAP	0x00000004  /* don't modify the rmapbt */
>  #define	XFS_ALLOC_FLAG_NOSHRINK	0x00000008  /* don't shrink the freelist */
> -
> +#define	XFS_ALLOC_FLAG_CHECK	0x00000010  /* test only, don't modify args */
>  
>  /*
>   * Argument structure for xfs_alloc routines.
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 4/4] xfs: don't rely on ->total in xfs_alloc_space_available
  2016-12-13 15:59 ` [PATCH 4/4] xfs: don't rely on ->total " Christoph Hellwig
@ 2016-12-14 18:30   ` Brian Foster
  2016-12-14 19:38     ` Christoph Hellwig
  0 siblings, 1 reply; 31+ messages in thread
From: Brian Foster @ 2016-12-14 18:30 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Tue, Dec 13, 2016 at 04:59:27PM +0100, Christoph Hellwig wrote:
> ->total is a bit of an odd parameter passed down to the low-level
> allocator all the way from the high-level callers.  It's supposed to
> contain the maximum number of blocks to be allocated for the whole
> transaction [1].
> 
> But in xfs_iomap_write_allocate we only convert existing delayed
> allocations and thus only have a minimal block reservation for the
> current transaction, so xfs_alloc_space_available can't use it for
> the allocation decisions.  Use the maximum of args->total and the
> calculated block requirement to make a decision.  We probably should
> get rid of args->total eventually and instead apply ->minleft more
> broadly, but that will require some extensive changes all over.
> 
> [1] which creates lots of confusion as most callers don't decrement it
> once doing a first allocation.  But that's for a separate series.
> ---
>  fs/xfs/libxfs/xfs_alloc.c | 7 ++++---
>  1 file changed, 4 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 87328a8..85039e5 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -1998,7 +1998,7 @@ xfs_alloc_space_available(
>  	int			flags)
>  {
>  	struct xfs_perag	*pag = args->pag;
> -	xfs_extlen_t		longest;
> +	xfs_extlen_t		alloc_len, longest;
>  	xfs_extlen_t		reservation; /* blocks that are still reserved */
>  	int			available;
>  
> @@ -2008,15 +2008,16 @@ xfs_alloc_space_available(
>  	reservation = xfs_ag_resv_needed(pag, args->resv);
>  
>  	/* do we have enough contiguous free space for the allocation? */
> +	alloc_len = args->minlen + (args->alignment - 1)+ args->minalignslop;

Whitespace nit, missing space:				^

Also, is the addition of braces intentional? I believe it is possible
for args->alignment == 0.

Nits aside:

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  	longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
>  			reservation);
> -	if ((args->minlen + args->alignment + args->minalignslop - 1) > longest)
> +	if (longest < alloc_len)
>  		return false;
>  
>  	/* do we have enough free space remaining for the allocation? */
>  	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
>  			  reservation - min_free - args->minleft);
> -	if (available < (int)args->total)
> +	if (available < (int)max(args->total, alloc_len))
>  		return false;
>  
>  	/*
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
  2016-12-14 17:35   ` Brian Foster
@ 2016-12-14 19:36     ` Christoph Hellwig
  2016-12-14 21:51       ` Brian Foster
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-14 19:36 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs, eguan, darrick.wong

On Wed, Dec 14, 2016 at 12:35:07PM -0500, Brian Foster wrote:
> I'm still trying to grok the minleft/dop_low stuff, but doesn't this
> increase the risk of bmbt block allocation failure?

No.  If working correctly it reduces the chance of a failing bmbt block
allocation in transaction to zero, at the expense of potentially
failing whole block allocations while we still have a few blocks left.
But given that the first leads to a fs shutdown and the latter just to
an ENOSPC it seems worthwhile.

> E.g., args.minleft
> is set to the transaction reservation earlier in the function. AFAIK the
> res is not guaranteed to be in a particular AG, but in that case we're
> just setting a start AG and ultimately cycling through the AGs. If that
> fails, this hunk resets minleft, restarts at fsbno 0 and activates the
> sequential agno allocation requirement for subsequent allocs. Doesn't
> removing this logic mean we can successfully reserve blocks in a write
> transaction that will never ultimately be able to allocate bmbt blocks,
> even if the data blocks can all be allocated..?

We only set it to res if no firstblock was set.  The only case where
that is possible during the first bmbt block allocation in a reflink
operation - for actual direct I/O writes, fallocate calls or unwritten
extent conversion we will always have firstblock set from the allocation
of the actual data extent.  And that allocation now has the proper minleft
set after this series so that we are guaranteed we have enough space
in one single AG for the whole allocation.

In the reflink case where firstblock wasn't set the minleft ensures
that we fail the first allocation if we don't have enough blocks
in a single AG.
retryloop to try the allocation 

> The other thing to note here is that minleft is initialized to 0 and
> only set to the transaction res conditionally, so if firstblock is
> already set we won't use minleft at all.

Yes, that's intentional - we already did our minleft check when allocating
the data extent.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available
  2016-12-14 18:24   ` Brian Foster
@ 2016-12-14 19:37     ` Christoph Hellwig
  0 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-14 19:37 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs, eguan, darrick.wong

On Wed, Dec 14, 2016 at 01:24:32PM -0500, Brian Foster wrote:
> Nit: could we assert args->maxlen >= args->minlen here as well?

Sure..

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 4/4] xfs: don't rely on ->total in xfs_alloc_space_available
  2016-12-14 18:30   ` Brian Foster
@ 2016-12-14 19:38     ` Christoph Hellwig
  2016-12-14 21:51       ` Brian Foster
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-14 19:38 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs, eguan, darrick.wong

On Wed, Dec 14, 2016 at 01:30:18PM -0500, Brian Foster wrote:
> >  	/* do we have enough contiguous free space for the allocation? */
> > +	alloc_len = args->minlen + (args->alignment - 1)+ args->minalignslop;
> 
> Whitespace nit, missing space:				^
> 
> Also, is the addition of braces intentional? I believe it is possible
> for args->alignment == 0.

It's mostly for explaining the - 1 to the reader as it took me a while
to figure out what it was for.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
  2016-12-14 19:36     ` Christoph Hellwig
@ 2016-12-14 21:51       ` Brian Foster
       [not found]         ` <20161215143430.GB29477@bfoster.bfoster>
  0 siblings, 1 reply; 31+ messages in thread
From: Brian Foster @ 2016-12-14 21:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Wed, Dec 14, 2016 at 08:36:26PM +0100, Christoph Hellwig wrote:
> On Wed, Dec 14, 2016 at 12:35:07PM -0500, Brian Foster wrote:
> > I'm still trying to grok the minleft/dop_low stuff, but doesn't this
> > increase the risk of bmbt block allocation failure?
> 
> No.  If working correctly it reduces the chance of a failing bmbt block
> allocation in transaction to zero, at the expense of potentially
> failing whole block allocations while we still have a few blocks left.
> But given that the first leads to a fs shutdown and the latter just to
> an ENOSPC it seems worthwhile.
> 

Indeed, I completely agree in principle with the series. It's much
better to leave some free space efficiency on the table for greater
reliability.  I just want to make sure I understand how this change
works and whether it does so correctly...

> > E.g., args.minleft
> > is set to the transaction reservation earlier in the function. AFAIK the
> > res is not guaranteed to be in a particular AG, but in that case we're
> > just setting a start AG and ultimately cycling through the AGs. If that
> > fails, this hunk resets minleft, restarts at fsbno 0 and activates the
> > sequential agno allocation requirement for subsequent allocs. Doesn't
> > removing this logic mean we can successfully reserve blocks in a write
> > transaction that will never ultimately be able to allocate bmbt blocks,
> > even if the data blocks can all be allocated..?
> 
> We only set it to res if no firstblock was set.  The only case where
> that is possible during the first bmbt block allocation in a reflink
> operation - for actual direct I/O writes, fallocate calls or unwritten
> extent conversion we will always have firstblock set from the allocation
> of the actual data extent.  And that allocation now has the proper minleft
> set after this series so that we are guaranteed we have enough space
> in one single AG for the whole allocation.
> 

Ok, I need to stare at that codepath a bit more..

In the meantime, what about the delalloc codepath as well? We reserve
indlen out of the global pool at write time and track the res in the
in-core extents. Writeback comes around later and we try to allocate
real blocks without any need for transaction reservation. Now that we
set 'args.minleft = xfs_bmap_worst_indlen()' in xfs_bmap_btalloc() and
never reset it, what guarantees any particular AG can satisfy that
request based on the global reservation?

I'm basically just a little nervous that we might rely on some of these
minleft hacks to avoid serious failures like in bmbt allocation or
writeback, despite the fact that the broader mechanism may be bogus...

Brian

> In the reflink case where firstblock wasn't set the minleft ensures
> that we fail the first allocation if we don't have enough blocks
> in a single AG.
> retryloop to try the allocation 
> 
> > The other thing to note here is that minleft is initialized to 0 and
> > only set to the transaction res conditionally, so if firstblock is
> > already set we won't use minleft at all.
> 
> Yes, that's intentional - we already did our minleft check when allocating
> the data extent.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 4/4] xfs: don't rely on ->total in xfs_alloc_space_available
  2016-12-14 19:38     ` Christoph Hellwig
@ 2016-12-14 21:51       ` Brian Foster
  2016-12-15  8:55         ` Christoph Hellwig
  0 siblings, 1 reply; 31+ messages in thread
From: Brian Foster @ 2016-12-14 21:51 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Wed, Dec 14, 2016 at 08:38:02PM +0100, Christoph Hellwig wrote:
> On Wed, Dec 14, 2016 at 01:30:18PM -0500, Brian Foster wrote:
> > >  	/* do we have enough contiguous free space for the allocation? */
> > > +	alloc_len = args->minlen + (args->alignment - 1)+ args->minalignslop;
> > 
> > Whitespace nit, missing space:				^
> > 
> > Also, is the addition of braces intentional? I believe it is possible
> > for args->alignment == 0.
> 
> It's mostly for explaining the - 1 to the reader as it took me a while
> to figure out what it was for.

Oh, care to elaborate? :)

Brian

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 4/4] xfs: don't rely on ->total in xfs_alloc_space_available
  2016-12-14 21:51       ` Brian Foster
@ 2016-12-15  8:55         ` Christoph Hellwig
  2016-12-15 12:00           ` Brian Foster
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-15  8:55 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs, eguan, darrick.wong

On Wed, Dec 14, 2016 at 04:51:58PM -0500, Brian Foster wrote:
> On Wed, Dec 14, 2016 at 08:38:02PM +0100, Christoph Hellwig wrote:
> > On Wed, Dec 14, 2016 at 01:30:18PM -0500, Brian Foster wrote:
> > > >  	/* do we have enough contiguous free space for the allocation? */
> > > > +	alloc_len = args->minlen + (args->alignment - 1)+ args->minalignslop;
> > > 
> > > Whitespace nit, missing space:				^
> > > 
> > > Also, is the addition of braces intentional? I believe it is possible
> > > for args->alignment == 0.
> > 
> > It's mostly for explaining the - 1 to the reader as it took me a while
> > to figure out what it was for.
> 
> Oh, care to elaborate? :)

the alignment is the boundary we want to align to, so we'll have to add
alignment - 1 at max to achive it. 

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: minleft fixes
  2016-12-14 10:51 ` minleft fixes Eryu Guan
@ 2016-12-15 10:24   ` Eryu Guan
  0 siblings, 0 replies; 31+ messages in thread
From: Eryu Guan @ 2016-12-15 10:24 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, darrick.wong

On Wed, Dec 14, 2016 at 06:51:01PM +0800, Eryu Guan wrote:
> On Tue, Dec 13, 2016 at 04:59:23PM +0100, Christoph Hellwig wrote:
> > [now to the correct list address]
> > 
> > This is my attempt to fix the problems with rmap and reflink file system
> > running out of space during delayed extent conversions.  It turns out
> > the way minleft has been handled has always been bogus, but the rmap
> > enablement made it even worse..
> > 
> > This has survived xfstests xfs/109 on a 2k fs for 2 1/2 hour here now,
> > but Eryu was much better than me at reproducing the problems, so it could
> > use some further testing.
> > 
> 
> Test results looked good to me so far. It passed 200 iterations of
> xfs/109 with both xfs_2k_reflink and xfs_1k_reflink test configs. 

Results of full auto group runs with different test configs also look
good to me, I didn't notice any regression. Thanks!

Eryu

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 4/4] xfs: don't rely on ->total in xfs_alloc_space_available
  2016-12-15  8:55         ` Christoph Hellwig
@ 2016-12-15 12:00           ` Brian Foster
  0 siblings, 0 replies; 31+ messages in thread
From: Brian Foster @ 2016-12-15 12:00 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Thu, Dec 15, 2016 at 09:55:34AM +0100, Christoph Hellwig wrote:
> On Wed, Dec 14, 2016 at 04:51:58PM -0500, Brian Foster wrote:
> > On Wed, Dec 14, 2016 at 08:38:02PM +0100, Christoph Hellwig wrote:
> > > On Wed, Dec 14, 2016 at 01:30:18PM -0500, Brian Foster wrote:
> > > > >  	/* do we have enough contiguous free space for the allocation? */
> > > > > +	alloc_len = args->minlen + (args->alignment - 1)+ args->minalignslop;
> > > > 
> > > > Whitespace nit, missing space:				^
> > > > 
> > > > Also, is the addition of braces intentional? I believe it is possible
> > > > for args->alignment == 0.
> > > 
> > > It's mostly for explaining the - 1 to the reader as it took me a while
> > > to figure out what it was for.
> > 
> > Oh, care to elaborate? :)
> 
> the alignment is the boundary we want to align to, so we'll have to add
> alignment - 1 at max to achive it. 

Gotcha, so perhaps we should check that alignment != 0?

Brian

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available
  2016-12-13 15:59 ` [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available Christoph Hellwig
  2016-12-14 18:24   ` Brian Foster
@ 2016-12-15 20:41   ` Libor Klepáč
  2016-12-16  8:20     ` Christoph Hellwig
  2016-12-16  0:28   ` Dave Chinner
  2 siblings, 1 reply; 31+ messages in thread
From: Libor Klepáč @ 2016-12-15 20:41 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

Hello,

On úterý 13. prosince 2016 16:59:26 CET Christoph Hellwig wrote:
> ---
>  fs/xfs/libxfs/xfs_alloc.c | 78 ++++++++
+--------------------------------------
>  fs/xfs/libxfs/xfs_alloc.h |  2 +-
>  2 files changed, 16 insertions(+), 64 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 91a6c17..87328a8 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -1149,11 +1104,6 @@ xfs_alloc_ag_vextent_near(
>  		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
>  		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)-
>agf_length));
>  		args->len = blen;
> -		if (!xfs_alloc_fix_minleft(args)) {
> -			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
> -			trace_xfs_alloc_near_nominleft(args);
> -			return 0;
> -		}
>  		blen = args->len;
>  		/*
>  		 * We are allocating starting at bnew for blen blocks.

Doesn't this produce this code?

  		args->len = blen;
  		blen = args->len;


Libor


^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
  2016-12-13 15:59 ` [PATCH 1/4] xfs: fix bogus minleft manipulations Christoph Hellwig
  2016-12-14 17:35   ` Brian Foster
@ 2016-12-15 22:09   ` Dave Chinner
  2016-12-16  8:20     ` Christoph Hellwig
  1 sibling, 1 reply; 31+ messages in thread
From: Dave Chinner @ 2016-12-15 22:09 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Tue, Dec 13, 2016 at 04:59:24PM +0100, Christoph Hellwig wrote:
> We can't just set minleft to 0 when we're low on space - that's exactly
> what we need minleft for: to protect space in the AG for btree block
> allocations when we are low on free space.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
....

The xfs_alloc_vextent() loop changes look fine. The minleft hacks
always troubled me.

> ---
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 2760bc3..44773c9 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -3812,7 +3812,6 @@ xfs_bmap_btalloc(
>  		args.fsbno = 0;
>  		args.type = XFS_ALLOCTYPE_FIRST_AG;
>  		args.total = ap->minlen;
> -		args.minleft = 0;
>  		if ((error = xfs_alloc_vextent(&args)))
>  			return error;
>  		ap->dfops->dop_low = true;

But this looks wrong. when combined with the next patch that sets:

	args.minleft = xfs_bmap_worst_indlen(ap->ip, args.maxlen);

we've got a minleft value that might be for multigigabyte
allocation, but now we are only asking for a total allocation
of minlen, and that might be 1 block. IOWs, shouldn't this "last,
final attempt" code actually do something like this:

		args.maxlen = ap->minlen;
		args.total = ap->minlen;
		args.minleft = xfs_bmap_worst_indlen(ap->ip, args.maxlen);

> +++ b/fs/xfs/libxfs/xfs_bmap_btree.c
> @@ -499,20 +499,6 @@ xfs_bmbt_alloc_block(
>  		goto try_another_ag;
>  	}
>  
> -	if (args.fsbno == NULLFSBLOCK && args.minleft) {
> -		/*
> -		 * Could not find an AG with enough free space to satisfy
> -		 * a full btree split.  Try again without minleft and if
> -		 * successful activate the lowspace algorithm.
> -		 */
> -		args.fsbno = 0;
> -		args.type = XFS_ALLOCTYPE_FIRST_AG;
> -		args.minleft = 0;
> -		error = xfs_alloc_vextent(&args);
> -		if (error)
> -			goto error0;
> -		cur->bc_private.b.dfops->dop_low = true;
> -	}
>  	if (args.fsbno == NULLFSBLOCK) {
>  		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
>  		*stat = 0;

Yes, that's fair enough considering the common in the block that
set args.minleft, and that was a bug fix introduced long after this
fallback was put in place because the fallback could still fail...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available
  2016-12-13 15:59 ` [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available Christoph Hellwig
  2016-12-14 18:24   ` Brian Foster
  2016-12-15 20:41   ` Libor Klepáč
@ 2016-12-16  0:28   ` Dave Chinner
  2016-12-16  8:25     ` Christoph Hellwig
  2 siblings, 1 reply; 31+ messages in thread
From: Dave Chinner @ 2016-12-16  0:28 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Tue, Dec 13, 2016 at 04:59:26PM +0100, Christoph Hellwig wrote:
> We must decide in xfs_alloc_fix_freelist if we can perform an
> allocation from a given AG is possible or not based on the available
> space, and should not fail the allocation past that point on a
> healthy file system.
> 
> But currently we have two additional places that second-guess
> xfs_alloc_fix_freelist: xfs_alloc_ag_vextent tries to adjust the
> maxlen parameter to remove the reservation before doing the
> allocation (but ignores the various minium freespace requirements),
> and xfs_alloc_fix_minleft tries to fix up the allocated length
> after we've found an extent, but ignores the reservations and also
> doesn't take the AGFL into account (and thus fails allocations
> for not matching minlen in some cases).
> 
> Remove all these later fixups and just correct the maxlen argument
> inside xfs_alloc_fix_freelist once we have the AGF buffer locked.

Ok, the concept seems solid....

> @@ -2073,10 +2015,19 @@ xfs_alloc_space_available(
>  
>  	/* do we have enough free space remaining for the allocation? */
>  	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
> -			  reservation - min_free - args->total);
> +			  reservation - min_free - args->minleft);

This is fine, but...

> -	if (available < (int)args->minleft || available <= 0)
> +	if (available < (int)args->total)
>  		return false;

this is where I begin to wonder. The "args->total" logic here just
doesn't read cleanly to me. xfs_bmapi_write() says:

 * An upper bound for the number of blocks to be allocated is supplied to
 * the first call in "total"; if no allocation group has that many free
 * blocks then the call will fail (return NULLFSBLOCK in "firstblock"). 

So if we are asked to allocate 1 block, but the AG doesn't have 10
total blocks free, then allocation will fail. What this is used for
is to chain multiple independent data block allocations together in
a single transaction to attempt to get them all from the one AG.
This is used only by the directory/attr code for ensuring all the
allocations needed to add an entry to the dir/attr tree will succeed.
It's essentially an "external block reservation" as the AGF will be
held locked across the multiple allocations once the first
allocation has been done.

The only time "total" is actually meaningful is the first
allocation in a chain. i.e. when firstblock is null. It's really a
"free blocks required to proceed" parameter , not a length
bound for the current allocation.

However, it's impact is to set a maximum length bound on the
allocation, so I'm left to wonder why this was is being hidden this
inside xfs_alloc_space_available() rather than dealing with it when
setting up args->maxlen/minlen/minleft in xfs_bmap_btalloc()?

i.e args->maxlen must always be less than args->total. And if we are
using minleft to protect against running out of space for
bmbt/rmapbt allocation, then I think it should be args->maxlen +
args->minleft < args->total.

If this can all be done and enforced in xfs_bmap_btalloc(), then we
can get rid of args->total from the allocargs completely...


> +	/*
> +	 * Clamp maxlen to the amount of free space available for the actual
> +	 * extent allocation.
> +	 */
> +	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
> +		args->maxlen = available;
> +		ASSERT(args->maxlen > 0);
> +	}

I'd love to get rid of all these (int) casts, too...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available
  2016-12-15 20:41   ` Libor Klepáč
@ 2016-12-16  8:20     ` Christoph Hellwig
  0 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-16  8:20 UTC (permalink / raw)
  To: Libor Klepáč; +Cc: Christoph Hellwig, linux-xfs

On Thu, Dec 15, 2016 at 09:41:39PM +0100, Libor Klepáč wrote:
> > index 91a6c17..87328a8 100644
> > --- a/fs/xfs/libxfs/xfs_alloc.c
> > +++ b/fs/xfs/libxfs/xfs_alloc.c
> > @@ -1149,11 +1104,6 @@ xfs_alloc_ag_vextent_near(
> >  		XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
> >  		ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)-
> >agf_length));
> >  		args->len = blen;
> > -		if (!xfs_alloc_fix_minleft(args)) {
> > -			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
> > -			trace_xfs_alloc_near_nominleft(args);
> > -			return 0;
> > -		}
> >  		blen = args->len;
> >  		/*
> >  		 * We are allocating starting at bnew for blen blocks.
> 
> Doesn't this produce this code?
> 
>   		args->len = blen;
>   		blen = args->len;

Yes, that second line could be removed, and I'll do that in the next
iteration.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
  2016-12-15 22:09   ` Dave Chinner
@ 2016-12-16  8:20     ` Christoph Hellwig
  2017-01-04  6:32       ` Darrick J. Wong
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-16  8:20 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, linux-xfs, eguan, darrick.wong

On Fri, Dec 16, 2016 at 09:09:38AM +1100, Dave Chinner wrote:
> > @@ -3812,7 +3812,6 @@ xfs_bmap_btalloc(
> >  		args.fsbno = 0;
> >  		args.type = XFS_ALLOCTYPE_FIRST_AG;
> >  		args.total = ap->minlen;
> > -		args.minleft = 0;
> >  		if ((error = xfs_alloc_vextent(&args)))
> >  			return error;
> >  		ap->dfops->dop_low = true;
> 
> But this looks wrong. when combined with the next patch that sets:
> 
> 	args.minleft = xfs_bmap_worst_indlen(ap->ip, args.maxlen);
> 
> we've got a minleft value that might be for multigigabyte
> allocation, but now we are only asking for a total allocation
> of minlen, and that might be 1 block. IOWs, shouldn't this "last,
> final attempt" code actually do something like this:
> 
> 		args.maxlen = ap->minlen;
> 		args.total = ap->minlen;
> 		args.minleft = xfs_bmap_worst_indlen(ap->ip, args.maxlen);

Yes.  And I actually had that in a previous iteration, and it got
dropped at some point.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
       [not found]         ` <20161215143430.GB29477@bfoster.bfoster>
@ 2016-12-16  8:21           ` Christoph Hellwig
  2016-12-19 11:38           ` Christoph Hellwig
  1 sibling, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-16  8:21 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs, eguan, darrick.wong

On Thu, Dec 15, 2016 at 09:34:33AM -0500, Brian Foster wrote:
> FWIW, I was playing with this a bit more and managed to manufacture a
> filesystem layout that this series doesn't handle too well. Emphasis on
> "manufactured" because this might not be a likely real world scenario,
> but either way the current code handles it fine.
> 
> I've attached a metadump of the offending image. mdestore it, mount and
> attempt something like 'dd if=/dev/zero of=/mnt/file' on the root. The
> buffered write looks like it's in a livelock, waiting indefinitely for a
> writeback cycle that will never complete...

Ok, I'll give it a spin.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available
  2016-12-16  0:28   ` Dave Chinner
@ 2016-12-16  8:25     ` Christoph Hellwig
  2016-12-18 23:55       ` Dave Chinner
  0 siblings, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-16  8:25 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, linux-xfs, eguan, darrick.wong

On Fri, Dec 16, 2016 at 11:28:51AM +1100, Dave Chinner wrote:
> >  	/* do we have enough free space remaining for the allocation? */
> >  	available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
> > -			  reservation - min_free - args->total);
> > +			  reservation - min_free - args->minleft);
> 
> This is fine, but...
> 
> > -	if (available < (int)args->minleft || available <= 0)
> > +	if (available < (int)args->total)
> >  		return false;
> 
> this is where I begin to wonder. The "args->total" logic here just
> doesn't read cleanly to me. xfs_bmapi_write() says:

args->total is a complete mess, but the above just rearranged the
deckchairs by moving it to a different place in the equation without
changing the result..

> So if we are asked to allocate 1 block, but the AG doesn't have 10
> total blocks free, then allocation will fail. What this is used for
> is to chain multiple independent data block allocations together in
> a single transaction to attempt to get them all from the one AG.
> This is used only by the directory/attr code for ensuring all the
> allocations needed to add an entry to the dir/attr tree will succeed.
> It's essentially an "external block reservation" as the AGF will be
> held locked across the multiple allocations once the first
> allocation has been done.

Symlink creation also uses it to cover the blocks for the symlink
body.  And unlike all users it actually seems to get the semantics
right by decrementing the used blocks from args->total after each
allocation..

> The only time "total" is actually meaningful is the first
> allocation in a chain. i.e. when firstblock is null. It's really a
> "free blocks required to proceed" parameter , not a length
> bound for the current allocation.

Yes.

> However, it's impact is to set a maximum length bound on the
> allocation, so I'm left to wonder why this was is being hidden this
> inside xfs_alloc_space_available() rather than dealing with it when
> setting up args->maxlen/minlen/minleft in xfs_bmap_btalloc()?
> 
> i.e args->maxlen must always be less than args->total. And if we are
> using minleft to protect against running out of space for
> bmbt/rmapbt allocation, then I think it should be args->maxlen +
> args->minleft < args->total.
> 
> If this can all be done and enforced in xfs_bmap_btalloc(), then we
> can get rid of args->total from the allocargs completely...

My long terms plan was to kill it of in favour of passing a minleft
parameter that's only set on the first call to xfs_bmapi_write.
But I didn't fancy rewriting hairy parts of the dir code while trying
to get an urgent customer escalation fixed..

> 
> 
> > +	/*
> > +	 * Clamp maxlen to the amount of free space available for the actual
> > +	 * extent allocation.
> > +	 */
> > +	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
> > +		args->maxlen = available;
> > +		ASSERT(args->maxlen > 0);
> > +	}
> 
> I'd love to get rid of all these (int) casts, too...

The problem here is that we compare 32-bit signed to 32-bit unsigned
variables.  And given that this is ripe for nasty bugs due to the C
type promotion rules I'd rather be extra careful.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available
  2016-12-16  8:25     ` Christoph Hellwig
@ 2016-12-18 23:55       ` Dave Chinner
  0 siblings, 0 replies; 31+ messages in thread
From: Dave Chinner @ 2016-12-18 23:55 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Fri, Dec 16, 2016 at 09:25:44AM +0100, Christoph Hellwig wrote:
> On Fri, Dec 16, 2016 at 11:28:51AM +1100, Dave Chinner wrote:
> > > +	/*
> > > +	 * Clamp maxlen to the amount of free space available for the actual
> > > +	 * extent allocation.
> > > +	 */
> > > +	if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
> > > +		args->maxlen = available;
> > > +		ASSERT(args->maxlen > 0);
> > > +	}
> > 
> > I'd love to get rid of all these (int) casts, too...
> 
> The problem here is that we compare 32-bit signed to 32-bit unsigned
> variables.  And given that this is ripe for nasty bugs due to the C
> type promotion rules I'd rather be extra careful.

Yes, I know - it's bitten me several times in the past. The
underlying problem (IMO) is using xfs_extlen_t for things that
aren't physical BMBT extent lengths, which are bound to:

#define MAXEXTLEN       ((xfs_extlen_t)0x001fffff)      /* 21 bits */            

but it's grown to be used for everything that needs to store or
operate on the length of an extent.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
       [not found]         ` <20161215143430.GB29477@bfoster.bfoster>
  2016-12-16  8:21           ` Christoph Hellwig
@ 2016-12-19 11:38           ` Christoph Hellwig
  2016-12-20 14:17             ` Brian Foster
  1 sibling, 1 reply; 31+ messages in thread
From: Christoph Hellwig @ 2016-12-19 11:38 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs, eguan, darrick.wong

On Thu, Dec 15, 2016 at 09:34:33AM -0500, Brian Foster wrote:
> FWIW, I was playing with this a bit more and managed to manufacture a
> filesystem layout that this series doesn't handle too well. Emphasis on
> "manufactured" because this might not be a likely real world scenario,
> but either way the current code handles it fine.

It does, although mostly by accident.  I suspect with an even better
manufcatured image you could also drive the current code to it's knees,
e.g. only have one single block free in the first few AGs, and then
a small number just higher than that in a higher AG.

> I've attached a metadump of the offending image. mdestore it, mount and
> attempt something like 'dd if=/dev/zero of=/mnt/file' on the root. The
> buffered write looks like it's in a livelock, waiting indefinitely for a
> writeback cycle that will never complete...

Yeah, that's the loop that keeps going even if it can't allocate any
blocks, which seems generally bogus.  But even without that we'd get
ENOSPC despite not having a reservations. Which is a little easier to
debug, but just as wrong.

The only good way out I can see is to not hand out any more reservations
after we only nave nr_ags * xfs_bmap_worst_indlen(1) available.  I'll
see if I can come up with a patch for that.

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
  2016-12-19 11:38           ` Christoph Hellwig
@ 2016-12-20 14:17             ` Brian Foster
  2016-12-20 21:45               ` Dave Chinner
  0 siblings, 1 reply; 31+ messages in thread
From: Brian Foster @ 2016-12-20 14:17 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, eguan, darrick.wong

On Mon, Dec 19, 2016 at 12:38:26PM +0100, Christoph Hellwig wrote:
> On Thu, Dec 15, 2016 at 09:34:33AM -0500, Brian Foster wrote:
> > FWIW, I was playing with this a bit more and managed to manufacture a
> > filesystem layout that this series doesn't handle too well. Emphasis on
> > "manufactured" because this might not be a likely real world scenario,
> > but either way the current code handles it fine.
> 
> It does, although mostly by accident.  I suspect with an even better
> manufcatured image you could also drive the current code to it's knees,
> e.g. only have one single block free in the first few AGs, and then
> a small number just higher than that in a higher AG.
> 

Perhaps, I certainly wouldn't expect the code in current form to be
perfect. It's hard enough to understand as it is. Just trying to avoid
regressions and properly scope the required fix...

> > I've attached a metadump of the offending image. mdestore it, mount and
> > attempt something like 'dd if=/dev/zero of=/mnt/file' on the root. The
> > buffered write looks like it's in a livelock, waiting indefinitely for a
> > writeback cycle that will never complete...
> 
> Yeah, that's the loop that keeps going even if it can't allocate any
> blocks, which seems generally bogus.  But even without that we'd get
> ENOSPC despite not having a reservations. Which is a little easier to
> debug, but just as wrong.
> 

Indeed.

> The only good way out I can see is to not hand out any more reservations
> after we only nave nr_ags * xfs_bmap_worst_indlen(1) available.  I'll
> see if I can come up with a patch for that.

Hmm, so the idea is to basically find a way we can infer accurate
information about the per-AG state at the time blocks are reserved from
the global pool (i.e., buffered write time) and cut off writes at the
point we can no longer guarantee at least one AG can satisfy the
smallest write..?

If so, that seems reasonable to me in principle. I'd have to think about
it a bit more. The first question that comes to mind is that we'd have
to make sure all allocations honor the minleft heuristic, yes? (Or
perhaps not allow any allocations after this point?) Otherwise, what
prevents the assumption of (available > nr_ags *
xfs_bmap_worst_indlen(1)) from becoming false after the reservation has
been granted but before the physical allocation is attempted at
writeback time? E.g., write/reserve the last available delalloc block,
then chew up the remaining minleft in each AG via sparse inode allocs or
something (for example), then writeback occurs and can't find an AG to
honor minleft (??).

Brian

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
  2016-12-20 14:17             ` Brian Foster
@ 2016-12-20 21:45               ` Dave Chinner
  0 siblings, 0 replies; 31+ messages in thread
From: Dave Chinner @ 2016-12-20 21:45 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs, eguan, darrick.wong

On Tue, Dec 20, 2016 at 09:17:47AM -0500, Brian Foster wrote:
> On Mon, Dec 19, 2016 at 12:38:26PM +0100, Christoph Hellwig wrote:
> > On Thu, Dec 15, 2016 at 09:34:33AM -0500, Brian Foster wrote:
> > > FWIW, I was playing with this a bit more and managed to manufacture a
> > > filesystem layout that this series doesn't handle too well. Emphasis on
> > > "manufactured" because this might not be a likely real world scenario,
> > > but either way the current code handles it fine.
> > 
> > It does, although mostly by accident.  I suspect with an even better
> > manufcatured image you could also drive the current code to it's knees,
> > e.g. only have one single block free in the first few AGs, and then
> > a small number just higher than that in a higher AG.
> > 
> 
> Perhaps, I certainly wouldn't expect the code in current form to be
> perfect. It's hard enough to understand as it is. Just trying to avoid
> regressions and properly scope the required fix...
> 
> > > I've attached a metadump of the offending image. mdestore it, mount and
> > > attempt something like 'dd if=/dev/zero of=/mnt/file' on the root. The
> > > buffered write looks like it's in a livelock, waiting indefinitely for a
> > > writeback cycle that will never complete...
> > 
> > Yeah, that's the loop that keeps going even if it can't allocate any
> > blocks, which seems generally bogus.  But even without that we'd get
> > ENOSPC despite not having a reservations. Which is a little easier to
> > debug, but just as wrong.
> > 
> 
> Indeed.
> 
> > The only good way out I can see is to not hand out any more reservations
> > after we only nave nr_ags * xfs_bmap_worst_indlen(1) available.  I'll
> > see if I can come up with a patch for that.
> 
> Hmm, so the idea is to basically find a way we can infer accurate
> information about the per-AG state at the time blocks are reserved from
> the global pool (i.e., buffered write time) and cut off writes at the
> point we can no longer guarantee at least one AG can satisfy the
> smallest write..?

We already do this for per-AG freelist minimum space requirements.
See XFS_ALLOC_AGFL_RESERVE and the big comment above
xfs_alloc_set_aside().

What's worth noting is that xfs_alloc_set_aside() has a magic number
of "4" added to it, which is supposedly for the bmbt split that
might be needed. This is applied at delalloc space reservation time,
so this would seem to me to be the place to hook into here.

I do see a problem here, though - it's only reserving space for a
single BMBT split from the global free space pool. This is fine for
the AGFL reservations (as they are static and fixed in size), but
maybe this is where we are over-committing the freespace pool...

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
  2016-12-16  8:20     ` Christoph Hellwig
@ 2017-01-04  6:32       ` Darrick J. Wong
  2017-01-04  8:50         ` Christoph Hellwig
  0 siblings, 1 reply; 31+ messages in thread
From: Darrick J. Wong @ 2017-01-04  6:32 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Dave Chinner, linux-xfs, eguan

On Fri, Dec 16, 2016 at 09:20:44AM +0100, Christoph Hellwig wrote:
> On Fri, Dec 16, 2016 at 09:09:38AM +1100, Dave Chinner wrote:
> > > @@ -3812,7 +3812,6 @@ xfs_bmap_btalloc(
> > >  		args.fsbno = 0;
> > >  		args.type = XFS_ALLOCTYPE_FIRST_AG;
> > >  		args.total = ap->minlen;
> > > -		args.minleft = 0;
> > >  		if ((error = xfs_alloc_vextent(&args)))
> > >  			return error;
> > >  		ap->dfops->dop_low = true;
> > 
> > But this looks wrong. when combined with the next patch that sets:
> > 
> > 	args.minleft = xfs_bmap_worst_indlen(ap->ip, args.maxlen);
> > 
> > we've got a minleft value that might be for multigigabyte
> > allocation, but now we are only asking for a total allocation
> > of minlen, and that might be 1 block. IOWs, shouldn't this "last,
> > final attempt" code actually do something like this:
> > 
> > 		args.maxlen = ap->minlen;
> > 		args.total = ap->minlen;
> > 		args.minleft = xfs_bmap_worst_indlen(ap->ip, args.maxlen);
> 
> Yes.  And I actually had that in a previous iteration, and it got
> dropped at some point.

I see the diff hunk in question hasn't changed in the v2 patchset...
Did Dave's suggestion not work?

--D

^ permalink raw reply	[flat|nested] 31+ messages in thread

* Re: [PATCH 1/4] xfs: fix bogus minleft manipulations
  2017-01-04  6:32       ` Darrick J. Wong
@ 2017-01-04  8:50         ` Christoph Hellwig
  0 siblings, 0 replies; 31+ messages in thread
From: Christoph Hellwig @ 2017-01-04  8:50 UTC (permalink / raw)
  To: Darrick J. Wong; +Cc: Dave Chinner, linux-xfs, eguan

> > Yes.  And I actually had that in a previous iteration, and it got
> > dropped at some point.
> 
> I see the diff hunk in question hasn't changed in the v2 patchset...
> Did Dave's suggestion not work?

We don't need it anymore.  The previous iteration set minleft to the
worst case indirect blocks for the requested extent length.  This series
instead sticks to the old minleft value for just a single extent
allocation and thus doesn't need the fallback as the minleft value
can't get any smaller.

^ permalink raw reply	[flat|nested] 31+ messages in thread

end of thread, other threads:[~2017-01-04  8:50 UTC | newest]

Thread overview: 31+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-12-13 15:59 minleft fixes Christoph Hellwig
2016-12-13 15:59 ` [PATCH 1/4] xfs: fix bogus minleft manipulations Christoph Hellwig
2016-12-14 17:35   ` Brian Foster
2016-12-14 19:36     ` Christoph Hellwig
2016-12-14 21:51       ` Brian Foster
     [not found]         ` <20161215143430.GB29477@bfoster.bfoster>
2016-12-16  8:21           ` Christoph Hellwig
2016-12-19 11:38           ` Christoph Hellwig
2016-12-20 14:17             ` Brian Foster
2016-12-20 21:45               ` Dave Chinner
2016-12-15 22:09   ` Dave Chinner
2016-12-16  8:20     ` Christoph Hellwig
2017-01-04  6:32       ` Darrick J. Wong
2017-01-04  8:50         ` Christoph Hellwig
2016-12-13 15:59 ` [PATCH 2/4] xfs: calculate minleft correctly for bmap allocations Christoph Hellwig
2016-12-14 18:24   ` Brian Foster
2016-12-13 15:59 ` [PATCH 3/4] xfs: adjust allocation length in xfs_alloc_space_available Christoph Hellwig
2016-12-14 18:24   ` Brian Foster
2016-12-14 19:37     ` Christoph Hellwig
2016-12-15 20:41   ` Libor Klepáč
2016-12-16  8:20     ` Christoph Hellwig
2016-12-16  0:28   ` Dave Chinner
2016-12-16  8:25     ` Christoph Hellwig
2016-12-18 23:55       ` Dave Chinner
2016-12-13 15:59 ` [PATCH 4/4] xfs: don't rely on ->total " Christoph Hellwig
2016-12-14 18:30   ` Brian Foster
2016-12-14 19:38     ` Christoph Hellwig
2016-12-14 21:51       ` Brian Foster
2016-12-15  8:55         ` Christoph Hellwig
2016-12-15 12:00           ` Brian Foster
2016-12-14 10:51 ` minleft fixes Eryu Guan
2016-12-15 10:24   ` Eryu Guan

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.