All of lore.kernel.org
 help / color / mirror / Atom feed
* fix space reservations underneath xfs_bmapi_write
@ 2017-04-13  8:05 Christoph Hellwig
  2017-04-13  8:05 ` [PATCH 01/10] xfs: introduce xfs_trans_blk_res Christoph Hellwig
                   ` (9 more replies)
  0 siblings, 10 replies; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

This series fixes the way xfs_bmapi_write deals with space reservations,
and as result gets rid of the low space allocator mode which could lead
to file system shutdowns due to ignoring the AG space allocation
constraints.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* [PATCH 01/10] xfs: introduce xfs_trans_blk_res
  2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
@ 2017-04-13  8:05 ` Christoph Hellwig
  2017-04-13 18:28   ` Brian Foster
  2017-04-13  8:05 ` [PATCH 02/10] xfs: rewrite xfs_da_grow_inode_int Christoph Hellwig
                   ` (8 subsequent siblings)
  9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

This is a smaller helper to check the remaining block reservation in a
transaction.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_trans.c | 2 +-
 fs/xfs/xfs_trans.h | 5 +++++
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index a280e126491f..3a83927a5334 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -97,7 +97,7 @@ xfs_trans_dup(
 	/* We gave our writer reference to the new transaction */
 	tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
 	ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
-	ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
+	ntp->t_blk_res = xfs_trans_blk_res(tp);
 	tp->t_blk_res = tp->t_blk_res_used;
 	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
 	tp->t_rtx_res = tp->t_rtx_res_used;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 2a9292df6640..e4f041c89846 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -152,6 +152,11 @@ typedef struct xfs_trans {
 #define	xfs_trans_agbtree_delta(tp, d)
 #endif
 
+static inline unsigned int xfs_trans_blk_res(struct xfs_trans *tp)
+{
+	return tp->t_blk_res - tp->t_blk_res_used;
+}
+
 /*
  * XFS transaction mechanism exported interfaces.
  */
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 02/10] xfs: rewrite xfs_da_grow_inode_int
  2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
  2017-04-13  8:05 ` [PATCH 01/10] xfs: introduce xfs_trans_blk_res Christoph Hellwig
@ 2017-04-13  8:05 ` Christoph Hellwig
  2017-04-13 18:28   ` Brian Foster
  2017-04-13  8:05 ` [PATCH 03/10] xfs: remove the XFS_BMAPI_CONTIG flag Christoph Hellwig
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

Use one xfs_bmapi_write loop that just iterates if it didn't manage to
allocate enough blocks.  Get rid of the separate contig call that just
makes us call the allocator twice, and also get rid of the memory
allocation for the map array that we don't actually need.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_da_btree.c | 94 +++++++++++++-------------------------------
 1 file changed, 28 insertions(+), 66 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 1bdf2888295b..00853d332bcd 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -2007,84 +2007,46 @@ xfs_da_grow_inode_int(
 	xfs_fileoff_t		*bno,
 	int			count)
 {
-	struct xfs_trans	*tp = args->trans;
-	struct xfs_inode	*dp = args->dp;
 	int			w = args->whichfork;
-	xfs_rfsblock_t		nblks = dp->i_d.di_nblocks;
-	struct xfs_bmbt_irec	map, *mapp;
-	int			nmap, error, got, i, mapi;
+	xfs_fileoff_t		b;
+	int			error;
 
 	/*
 	 * Find a spot in the file space to put the new block.
 	 */
-	error = xfs_bmap_first_unused(tp, dp, count, bno, w);
+	error = xfs_bmap_first_unused(args->trans, args->dp, count, bno, w);
 	if (error)
 		return error;
 
-	/*
-	 * Try mapping it in one filesystem block.
-	 */
-	nmap = 1;
-	ASSERT(args->firstblock != NULL);
-	error = xfs_bmapi_write(tp, dp, *bno, count,
-			xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
-			args->firstblock, args->total, &map, &nmap,
-			args->dfops);
-	if (error)
-		return error;
+	b = *bno;
+	do {
+		xfs_rfsblock_t nblks = args->dp->i_d.di_nblocks;
+		struct xfs_bmbt_irec imap;
+		xfs_extlen_t len;
+		int nmap = 1;
+
+		error = xfs_bmapi_write(args->trans, args->dp, b, count,
+				xfs_bmapi_aflag(w) | XFS_BMAPI_METADATA,
+				args->firstblock, args->total, &imap, &nmap,
+				args->dfops);
+		if (error)
+			return error;
+		if (!nmap)
+			return -ENOSPC;
 
-	ASSERT(nmap <= 1);
-	if (nmap == 1) {
-		mapp = &map;
-		mapi = 1;
-	} else if (nmap == 0 && count > 1) {
-		xfs_fileoff_t		b;
-		int			c;
+		len = imap.br_startoff + imap.br_blockcount - b;
+		ASSERT(imap.br_startoff <= b);
+		ASSERT(len > 0);
+		ASSERT(len <= count);
 
-		/*
-		 * If we didn't get it and the block might work if fragmented,
-		 * try without the CONTIG flag.  Loop until we get it all.
-		 */
-		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
-		for (b = *bno, mapi = 0; b < *bno + count; ) {
-			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
-			c = (int)(*bno + count - b);
-			error = xfs_bmapi_write(tp, dp, b, c,
-					xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
-					args->firstblock, args->total,
-					&mapp[mapi], &nmap, args->dfops);
-			if (error)
-				goto out_free_map;
-			if (nmap < 1)
-				break;
-			mapi += nmap;
-			b = mapp[mapi - 1].br_startoff +
-			    mapp[mapi - 1].br_blockcount;
-		}
-	} else {
-		mapi = 0;
-		mapp = NULL;
-	}
-
-	/*
-	 * Count the blocks we got, make sure it matches the total.
-	 */
-	for (i = 0, got = 0; i < mapi; i++)
-		got += mapp[i].br_blockcount;
-	if (got != count || mapp[0].br_startoff != *bno ||
-	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
-	    *bno + count) {
-		error = -ENOSPC;
-		goto out_free_map;
-	}
+		/* account for newly allocated blocks in reserved blocks total */
+		args->total -= (args->dp->i_d.di_nblocks - nblks);
 
-	/* account for newly allocated blocks in reserved blocks total */
-	args->total -= dp->i_d.di_nblocks - nblks;
+		b += len;
+		count -= len;
+	} while (count > 0);
 
-out_free_map:
-	if (mapp != &map)
-		kmem_free(mapp);
-	return error;
+	return 0;
 }
 
 /*
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 03/10] xfs: remove the XFS_BMAPI_CONTIG flag
  2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
  2017-04-13  8:05 ` [PATCH 01/10] xfs: introduce xfs_trans_blk_res Christoph Hellwig
  2017-04-13  8:05 ` [PATCH 02/10] xfs: rewrite xfs_da_grow_inode_int Christoph Hellwig
@ 2017-04-13  8:05 ` Christoph Hellwig
  2017-04-13 18:28   ` Brian Foster
  2017-04-13  8:05 ` [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block Christoph Hellwig
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

The only user just went away, so we can remove the flag and the minlen
parameter in struct xfs_bmalloca now.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c | 17 +++++++----------
 fs/xfs/libxfs/xfs_bmap.h |  3 ---
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 7f42f6067eb5..aec4907056ba 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3473,12 +3473,12 @@ xfs_bmap_select_minlen(
 	xfs_extlen_t		*blen,
 	int			notinit)
 {
-	if (notinit || *blen < ap->minlen) {
+	if (notinit) {
 		/*
 		 * Since we did a BUF_TRYLOCK above, it is possible that
 		 * there is space for this request.
 		 */
-		args->minlen = ap->minlen;
+		args->minlen = 1;
 	} else if (*blen < args->maxlen) {
 		/*
 		 * If the best seen length is less than the request length,
@@ -3670,11 +3670,11 @@ xfs_bmap_btalloc(
 			args.type = XFS_ALLOCTYPE_FIRST_AG;
 		else
 			args.type = XFS_ALLOCTYPE_START_BNO;
-		args.total = args.minlen = ap->minlen;
+		args.total = args.minlen = 1;
 	} else {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 		args.total = ap->total;
-		args.minlen = ap->minlen;
+		args.minlen = 1;
 	}
 	/* apply extent size hints if obtained earlier */
 	if (align) {
@@ -3776,9 +3776,8 @@ xfs_bmap_btalloc(
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 	}
-	if (args.fsbno == NULLFSBLOCK && nullfb &&
-	    args.minlen > ap->minlen) {
-		args.minlen = ap->minlen;
+	if (args.fsbno == NULLFSBLOCK && nullfb && args.minlen > 1) {
+		args.minlen = 1;
 		args.type = XFS_ALLOCTYPE_START_BNO;
 		args.fsbno = ap->blkno;
 		if ((error = xfs_alloc_vextent(&args)))
@@ -3787,7 +3786,7 @@ xfs_bmap_btalloc(
 	if (args.fsbno == NULLFSBLOCK && nullfb) {
 		args.fsbno = 0;
 		args.type = XFS_ALLOCTYPE_FIRST_AG;
-		args.total = ap->minlen;
+		args.total = 1;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 		ap->dfops->dop_low = true;
@@ -4246,8 +4245,6 @@ xfs_bmapi_allocate(
 			bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
 	}
 
-	bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
-
 	/*
 	 * Only want to do the alignment at the eof if it is userdata and
 	 * allocation length is larger than a stripe unit.
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index a6e612cabe15..d9e0b1db4cdb 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -48,7 +48,6 @@ struct xfs_bmalloca {
 	int			logflags;/* flags for transaction logging */
 
 	xfs_extlen_t		total;	/* total blocks needed for xaction */
-	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
 	xfs_extlen_t		minleft; /* amount must be left after alloc */
 	bool			eof;	/* set if allocating past last extent */
 	bool			wasdel;	/* replacing a delayed allocation */
@@ -81,7 +80,6 @@ struct xfs_extent_free_item
 #define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
 #define XFS_BMAPI_IGSTATE	0x010	/* Ignore state - */
 					/* combine contig. space */
-#define XFS_BMAPI_CONTIG	0x020	/* must allocate only one extent */
 /*
  * unwritten extent conversion - this needs write cache flushing and no additional
  * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
@@ -119,7 +117,6 @@ struct xfs_extent_free_item
 	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
 	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
 	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
-	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
 	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
 	{ XFS_BMAPI_ZERO,	"ZERO" }, \
 	{ XFS_BMAPI_REMAP,	"REMAP" }, \
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block
  2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
                   ` (2 preceding siblings ...)
  2017-04-13  8:05 ` [PATCH 03/10] xfs: remove the XFS_BMAPI_CONTIG flag Christoph Hellwig
@ 2017-04-13  8:05 ` Christoph Hellwig
  2017-04-13 18:30   ` Brian Foster
  2017-04-13  8:05 ` [PATCH 05/10] xfs: remove the total argument to xfs_bmap_local_to_extents Christoph Hellwig
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

We've already reserved all possible required blocks and checked
they are avaible in the same AG.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap_btree.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 3e17ceda038c..ce41dd5fbb34 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -476,19 +476,6 @@ xfs_bmbt_alloc_block(
 	if (error)
 		goto error0;
 
-	if (args.fsbno == NULLFSBLOCK && args.minleft) {
-		/*
-		 * Could not find an AG with enough free space to satisfy
-		 * a full btree split.  Try again and if
-		 * successful activate the lowspace algorithm.
-		 */
-		args.fsbno = 0;
-		args.type = XFS_ALLOCTYPE_FIRST_AG;
-		error = xfs_alloc_vextent(&args);
-		if (error)
-			goto error0;
-		cur->bc_private.b.dfops->dop_low = true;
-	}
 	if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
 		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
 		*stat = 0;
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 05/10] xfs: remove the total argument to xfs_bmap_local_to_extents
  2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
                   ` (3 preceding siblings ...)
  2017-04-13  8:05 ` [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block Christoph Hellwig
@ 2017-04-13  8:05 ` Christoph Hellwig
  2017-04-17 14:19   ` Brian Foster
  2017-04-13  8:05 ` [PATCH 06/10] xfs: fix bmap minleft calculation Christoph Hellwig
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

It must inherently be the same as the minlen/maxlen arguments for
this allocation.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index aec4907056ba..53f1386b1868 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -867,7 +867,6 @@ xfs_bmap_local_to_extents(
 	xfs_trans_t	*tp,		/* transaction pointer */
 	xfs_inode_t	*ip,		/* incore inode pointer */
 	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
-	xfs_extlen_t	total,		/* total blocks needed by transaction */
 	int		*logflagsp,	/* inode logging flags */
 	int		whichfork,
 	void		(*init_fn)(struct xfs_trans *tp,
@@ -916,8 +915,7 @@ xfs_bmap_local_to_extents(
 		args.fsbno = *firstblock;
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 	}
-	args.total = total;
-	args.minlen = args.maxlen = args.prod = 1;
+	args.total = args.minlen = args.maxlen = args.prod = 1;
 	error = xfs_alloc_vextent(&args);
 	if (error)
 		goto done;
@@ -1067,7 +1065,7 @@ xfs_bmap_add_attrfork_local(
 	}
 
 	if (S_ISLNK(VFS_I(ip)->i_mode))
-		return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
+		return xfs_bmap_local_to_extents(tp, ip, firstblock,
 						 flags, XFS_DATA_FORK,
 						 xfs_symlink_local_to_remote);
 
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 06/10] xfs: fix bmap minleft calculation
  2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
                   ` (4 preceding siblings ...)
  2017-04-13  8:05 ` [PATCH 05/10] xfs: remove the total argument to xfs_bmap_local_to_extents Christoph Hellwig
@ 2017-04-13  8:05 ` Christoph Hellwig
  2017-04-17 14:19   ` Brian Foster
  2017-04-13  8:05 ` [PATCH 07/10] xfs: fix space reservation in xfs_bmbt_alloc_block Christoph Hellwig
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

We need to set a proper minleft value even if we didn't do a previous
allocation in the transaction, as we can't switch to a different AG
after allocating the data extent.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c | 27 +++++++++++++++++----------
 fs/xfs/libxfs/xfs_bmap.h |  1 -
 2 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 53f1386b1868..6faefa342748 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3570,6 +3570,22 @@ xfs_bmap_btalloc_filestreams(
 	return 0;
 }
 
+/*
+ * Return the minimum number of blocks required for the bmap btree manipulation
+ * after adding a single extent.
+ */
+static xfs_extlen_t
+xfs_bmap_minleft(
+	struct xfs_bmalloca	*ap)
+{
+	int			whichfork = xfs_bmapi_whichfork(ap->flags);
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ap->ip, whichfork);
+
+	if (XFS_IFORK_FORMAT(ap->ip, whichfork) == XFS_DINODE_FMT_BTREE)
+		return be16_to_cpu(ifp->if_broot->bb_level) + 1;
+	return 1;
+}
+
 STATIC int
 xfs_bmap_btalloc(
 	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
@@ -3738,7 +3754,7 @@ xfs_bmap_btalloc(
 		args.alignment = 1;
 		args.minalignslop = 0;
 	}
-	args.minleft = ap->minleft;
+	args.minleft = xfs_bmap_minleft(ap);
 	args.wasdel = ap->wasdel;
 	args.resv = XFS_AG_RESV_NONE;
 	args.datatype = ap->datatype;
@@ -4493,15 +4509,6 @@ xfs_bmapi_write(
 
 	XFS_STATS_INC(mp, xs_blk_mapw);
 
-	if (*firstblock == NULLFSBLOCK) {
-		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
-			bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
-		else
-			bma.minleft = 1;
-	} else {
-		bma.minleft = 0;
-	}
-
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(tp, ip, whichfork);
 		if (error)
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index d9e0b1db4cdb..36a7f36f5f38 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -48,7 +48,6 @@ struct xfs_bmalloca {
 	int			logflags;/* flags for transaction logging */
 
 	xfs_extlen_t		total;	/* total blocks needed for xaction */
-	xfs_extlen_t		minleft; /* amount must be left after alloc */
 	bool			eof;	/* set if allocating past last extent */
 	bool			wasdel;	/* replacing a delayed allocation */
 	bool			aeof;	/* allocated space at eof */
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 07/10] xfs: fix space reservation in xfs_bmbt_alloc_block
  2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
                   ` (5 preceding siblings ...)
  2017-04-13  8:05 ` [PATCH 06/10] xfs: fix bmap minleft calculation Christoph Hellwig
@ 2017-04-13  8:05 ` Christoph Hellwig
  2017-04-17 14:19   ` Brian Foster
  2017-04-13  8:05 ` [PATCH 08/10] xfs: introduce a XFS_BMAPI_BESTEFFORT flag Christoph Hellwig
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

minleft counts the number of blocks that need to be available after the
current allocation has been completed.  As the total allocation should not
be more than the transaction reservation we need to subtract the
allocation length.  In addition we need to subtract the already used
transaction reservation, for that use the new xfs_trans_blk_res helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap_btree.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index ce41dd5fbb34..153c969febd4 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -442,6 +442,7 @@ xfs_bmbt_alloc_block(
 	args.mp = cur->bc_mp;
 	args.fsbno = cur->bc_private.b.firstblock;
 	args.firstblock = args.fsbno;
+	args.minlen = args.maxlen = args.prod = 1;
 	xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino,
 			cur->bc_private.b.whichfork);
 
@@ -459,14 +460,14 @@ xfs_bmbt_alloc_block(
 		 * reservation amount is insufficient then we may fail a
 		 * block allocation here and corrupt the filesystem.
 		 */
-		args.minleft = args.tp->t_blk_res;
+		if (xfs_trans_blk_res(args.tp))
+			args.minleft = xfs_trans_blk_res(args.tp) - args.maxlen;
 	} else if (cur->bc_private.b.dfops->dop_low) {
 		args.type = XFS_ALLOCTYPE_START_BNO;
 	} else {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 	}
 
-	args.minlen = args.maxlen = args.prod = 1;
 	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
 	if (!args.wasdel && args.tp->t_blk_res == 0) {
 		error = -ENOSPC;
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 08/10] xfs: introduce a XFS_BMAPI_BESTEFFORT flag
  2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
                   ` (6 preceding siblings ...)
  2017-04-13  8:05 ` [PATCH 07/10] xfs: fix space reservation in xfs_bmbt_alloc_block Christoph Hellwig
@ 2017-04-13  8:05 ` Christoph Hellwig
  2017-04-17 18:08   ` Brian Foster
  2017-04-13  8:05 ` [PATCH 09/10] xfs: kill the dop_low flag Christoph Hellwig
  2017-04-13  8:05 ` [PATCH 10/10] xfs: remove xfs_bmap_alloc Christoph Hellwig
  9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

We currently have two classes of xfs_bmapi_write callers that have
conflicting space reservation needs.  Many callers expect to be able
to reserve the number of total blocks passed in a single AG for
the use with the current allocation as well as future allocations
in the same transactions, or transactions chained off from it.

Other callers want to allocate up to the number of blocks passed in,
although they are fine with a smaller number of blocks to make
forward progress.  For those we only need to leave a few blocks aside
for the bmap btree manipulations when doing the main space allocation.

This patch introduces a new XFS_BMAPI_BESTEFFORT flag for the second
kind of callers that ignores the total flag and just uses the minleft
parameter to leave space for bmap btree allocations and splits.

With this we can remove the potentially dangerous fallback that
ignores the total reservation in xfs_bmap_btalloc.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_attr_remote.c |  6 ++++--
 fs/xfs/libxfs/xfs_bmap.c        | 27 ++++++++++++++++++++-------
 fs/xfs/libxfs/xfs_bmap.h        |  3 +++
 fs/xfs/xfs_bmap_util.c          |  2 ++
 fs/xfs/xfs_iomap.c              |  4 ++--
 fs/xfs/xfs_reflink.c            |  3 ++-
 6 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index d52f525f5b2d..35a99d15521c 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -464,8 +464,10 @@ xfs_attr_rmtval_set(
 		xfs_defer_init(args->dfops, args->firstblock);
 		nmap = 1;
 		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
-				  blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
-				  args->total, &map, &nmap, args->dfops);
+				  blkcnt,
+				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_BESTEFFORT,
+				  args->firstblock, args->total, &map, &nmap,
+				  args->dfops);
 		if (!error)
 			error = xfs_defer_finish(&args->trans, args->dfops, dp);
 		if (error) {
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 6faefa342748..c06e7d500ed1 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -915,7 +915,7 @@ xfs_bmap_local_to_extents(
 		args.fsbno = *firstblock;
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 	}
-	args.total = args.minlen = args.maxlen = args.prod = 1;
+	args.minlen = args.maxlen = args.prod = 1;
 	error = xfs_alloc_vextent(&args);
 	if (error)
 		goto done;
@@ -3504,7 +3504,6 @@ xfs_bmap_btalloc_nullfb(
 	int			error;
 
 	args->type = XFS_ALLOCTYPE_START_BNO;
-	args->total = ap->total;
 
 	startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
 	if (startag == NULLAGNUMBER)
@@ -3538,7 +3537,6 @@ xfs_bmap_btalloc_filestreams(
 	int			error;
 
 	args->type = XFS_ALLOCTYPE_NEAR_BNO;
-	args->total = ap->total;
 
 	ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
 	if (ag == NULLAGNUMBER)
@@ -3684,10 +3682,9 @@ xfs_bmap_btalloc(
 			args.type = XFS_ALLOCTYPE_FIRST_AG;
 		else
 			args.type = XFS_ALLOCTYPE_START_BNO;
-		args.total = args.minlen = 1;
+		args.minlen = 1;
 	} else {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
-		args.total = ap->total;
 		args.minlen = 1;
 	}
 	/* apply extent size hints if obtained earlier */
@@ -3754,7 +3751,24 @@ xfs_bmap_btalloc(
 		args.alignment = 1;
 		args.minalignslop = 0;
 	}
-	args.minleft = xfs_bmap_minleft(ap);
+
+	/*
+	 * If the XFS_BMAPI_BESTEFFORT flag is set we try to allocate any
+	 * space that's available, even if it is less than requested.  We
+	 * still need to set a minleft value to guarantee that we can still
+	 * manipulate the bmap btree, though.  The total value is ignored in
+	 * this case.
+	 *
+	 * If the flag is not set the total value specifies the total space
+	 * that the transaction may use, and we must find an AG that has
+	 * enough space available for all of total, or this allocation request
+	 * will fail.
+	 */
+	if (ap->flags & XFS_BMAPI_BESTEFFORT)
+		args.minleft = xfs_bmap_minleft(ap);
+	else
+		args.total = ap->total;
+
 	args.wasdel = ap->wasdel;
 	args.resv = XFS_AG_RESV_NONE;
 	args.datatype = ap->datatype;
@@ -3800,7 +3814,6 @@ xfs_bmap_btalloc(
 	if (args.fsbno == NULLFSBLOCK && nullfb) {
 		args.fsbno = 0;
 		args.type = XFS_ALLOCTYPE_FIRST_AG;
-		args.total = 1;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
 		ap->dfops->dop_low = true;
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 36a7f36f5f38..f35a2b2c4f06 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -79,6 +79,9 @@ struct xfs_extent_free_item
 #define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
 #define XFS_BMAPI_IGSTATE	0x010	/* Ignore state - */
 					/* combine contig. space */
+
+#define XFS_BMAPI_BESTEFFORT	0x020	/* may allocate less than requested */
+
 /*
  * unwritten extent conversion - this needs write cache flushing and no additional
  * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 4d1920e594b0..f809ebc7a495 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1033,6 +1033,8 @@ xfs_alloc_file_space(
 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
 
+	alloc_type |= XFS_BMAPI_BESTEFFORT;
+
 	/*
 	 * Allocate file space until done or until there is an error
 	 */
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 009f8243dddc..1abed9b6d5c5 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -171,7 +171,7 @@ xfs_iomap_write_direct(
 	uint		qblocks, resblks, resrtextents;
 	int		error;
 	int		lockmode;
-	int		bmapi_flags = XFS_BMAPI_PREALLOC;
+	int		bmapi_flags = XFS_BMAPI_PREALLOC | XFS_BMAPI_BESTEFFORT;
 	uint		tflags = 0;
 
 	rt = XFS_IS_REALTIME_INODE(ip);
@@ -679,7 +679,7 @@ xfs_iomap_write_allocate(
 	xfs_trans_t	*tp;
 	int		nimaps;
 	int		error = 0;
-	int		flags = XFS_BMAPI_DELALLOC;
+	int		flags = XFS_BMAPI_DELALLOC | XFS_BMAPI_BESTEFFORT;
 	int		nres;
 
 	if (whichfork == XFS_COW_FORK)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index c0f3754caca2..aad321c16d2f 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -455,7 +455,8 @@ xfs_reflink_allocate_cow(
 
 	/* Allocate the entire reservation as unwritten blocks. */
 	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
-			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block,
+			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
+			XFS_BMAPI_BESTEFFORT, &first_block,
 			resblks, imap, &nimaps, &dfops);
 	if (error)
 		goto out_bmap_cancel;
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 09/10] xfs: kill the dop_low flag
  2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
                   ` (7 preceding siblings ...)
  2017-04-13  8:05 ` [PATCH 08/10] xfs: introduce a XFS_BMAPI_BESTEFFORT flag Christoph Hellwig
@ 2017-04-13  8:05 ` Christoph Hellwig
  2017-04-17 18:08   ` Brian Foster
  2017-04-13  8:05 ` [PATCH 10/10] xfs: remove xfs_bmap_alloc Christoph Hellwig
  9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

Now that we always get our space reservations right in xfs_bmapi_write
we can remove the last remains of the dangerous low space allocator mode.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_bmap.c       | 12 +-----------
 fs/xfs/libxfs/xfs_bmap_btree.c |  2 --
 fs/xfs/libxfs/xfs_defer.c      |  1 -
 fs/xfs/libxfs/xfs_defer.h      | 15 ---------------
 fs/xfs/xfs_filestream.c        |  2 --
 fs/xfs/xfs_trace.h             | 12 +++---------
 6 files changed, 4 insertions(+), 40 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c06e7d500ed1..9e3f0e6a9062 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -762,9 +762,6 @@ xfs_bmap_extents_to_btree(
 	if (*firstblock == NULLFSBLOCK) {
 		args.type = XFS_ALLOCTYPE_START_BNO;
 		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
-	} else if (dfops->dop_low) {
-		args.type = XFS_ALLOCTYPE_START_BNO;
-		args.fsbno = *firstblock;
 	} else {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 		args.fsbno = *firstblock;
@@ -3677,12 +3674,6 @@ xfs_bmap_btalloc(
 			error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
 		if (error)
 			return error;
-	} else if (ap->dfops->dop_low) {
-		if (xfs_inode_is_filestream(ap->ip))
-			args.type = XFS_ALLOCTYPE_FIRST_AG;
-		else
-			args.type = XFS_ALLOCTYPE_START_BNO;
-		args.minlen = 1;
 	} else {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 		args.minlen = 1;
@@ -3709,7 +3700,7 @@ xfs_bmap_btalloc(
 	 * is >= the stripe unit and the allocation offset is
 	 * at the end of file.
 	 */
-	if (!ap->dfops->dop_low && ap->aeof) {
+	if (ap->aeof) {
 		if (!ap->offset) {
 			args.alignment = stripe_align;
 			atype = args.type;
@@ -3816,7 +3807,6 @@ xfs_bmap_btalloc(
 		args.type = XFS_ALLOCTYPE_FIRST_AG;
 		if ((error = xfs_alloc_vextent(&args)))
 			return error;
-		ap->dfops->dop_low = true;
 	}
 	if (args.fsbno != NULLFSBLOCK) {
 		/*
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 153c969febd4..b9f0f61bd8ee 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -462,8 +462,6 @@ xfs_bmbt_alloc_block(
 		 */
 		if (xfs_trans_blk_res(args.tp))
 			args.minleft = xfs_trans_blk_res(args.tp) - args.maxlen;
-	} else if (cur->bc_private.b.dfops->dop_low) {
-		args.type = XFS_ALLOCTYPE_START_BNO;
 	} else {
 		args.type = XFS_ALLOCTYPE_NEAR_BNO;
 	}
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 5c2929f94bd3..172d583c6f1c 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -503,7 +503,6 @@ xfs_defer_init(
 	xfs_fsblock_t			*fbp)
 {
 	dop->dop_committed = false;
-	dop->dop_low = false;
 	memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes));
 	*fbp = NULLFSBLOCK;
 	INIT_LIST_HEAD(&dop->dop_intake);
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index f6e93ef0bffe..cc1acd1fd4e8 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -36,20 +36,6 @@ struct xfs_defer_pending {
 	unsigned int			dfp_count;	/* # extent items */
 };
 
-/*
- * Header for deferred operation list.
- *
- * dop_low is used by the allocator to activate the lowspace algorithm -
- * when free space is running low the extent allocator may choose to
- * allocate an extent from an AG without leaving sufficient space for
- * a btree split when inserting the new extent.  In this case the allocator
- * will enable the lowspace algorithm which is supposed to allow further
- * allocations (such as btree splits and newroots) to allocate from
- * sequential AGs.  In order to avoid locking AGs out of order the lowspace
- * algorithm will start searching for free space from AG 0.  If the correct
- * transaction reservations have been made then this algorithm will eventually
- * find all the space it needs.
- */
 enum xfs_defer_ops_type {
 	XFS_DEFER_OPS_TYPE_BMAP,
 	XFS_DEFER_OPS_TYPE_REFCOUNT,
@@ -62,7 +48,6 @@ enum xfs_defer_ops_type {
 
 struct xfs_defer_ops {
 	bool			dop_committed;	/* did any trans commit? */
-	bool			dop_low;	/* alloc in low mode */
 	struct list_head	dop_intake;	/* unlogged pending work */
 	struct list_head	dop_pending;	/* logged pending work */
 
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index 043ca3808ea2..6d61e61c0700 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -390,8 +390,6 @@ xfs_filestream_new_ag(
 
 	if (xfs_alloc_is_userdata(ap->datatype))
 		flags |= XFS_PICK_USERDATA;
-	if (ap->dfops->dop_low)
-		flags |= XFS_PICK_LOWSPACE;
 
 	err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index cba10daf8391..ab4ce20b00dd 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -2256,19 +2256,16 @@ DECLARE_EVENT_CLASS(xfs_defer_class,
 		__field(dev_t, dev)
 		__field(void *, dop)
 		__field(bool, committed)
-		__field(bool, low)
 	),
 	TP_fast_assign(
 		__entry->dev = mp ? mp->m_super->s_dev : 0;
 		__entry->dop = dop;
 		__entry->committed = dop->dop_committed;
-		__entry->low = dop->dop_low;
 	),
-	TP_printk("dev %d:%d ops %p committed %d low %d\n",
+	TP_printk("dev %d:%d ops %p committed %d\n",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->dop,
-		  __entry->committed,
-		  __entry->low)
+		  __entry->committed)
 )
 #define DEFINE_DEFER_EVENT(name) \
 DEFINE_EVENT(xfs_defer_class, name, \
@@ -2282,21 +2279,18 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class,
 		__field(dev_t, dev)
 		__field(void *, dop)
 		__field(bool, committed)
-		__field(bool, low)
 		__field(int, error)
 	),
 	TP_fast_assign(
 		__entry->dev = mp ? mp->m_super->s_dev : 0;
 		__entry->dop = dop;
 		__entry->committed = dop->dop_committed;
-		__entry->low = dop->dop_low;
 		__entry->error = error;
 	),
-	TP_printk("dev %d:%d ops %p committed %d low %d err %d\n",
+	TP_printk("dev %d:%d ops %p committed %d err %d\n",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->dop,
 		  __entry->committed,
-		  __entry->low,
 		  __entry->error)
 )
 #define DEFINE_DEFER_ERROR_EVENT(name) \
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* [PATCH 10/10] xfs: remove xfs_bmap_alloc
  2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
                   ` (8 preceding siblings ...)
  2017-04-13  8:05 ` [PATCH 09/10] xfs: kill the dop_low flag Christoph Hellwig
@ 2017-04-13  8:05 ` Christoph Hellwig
  2017-04-17 18:08   ` Brian Foster
  9 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-13  8:05 UTC (permalink / raw)
  To: linux-xfs

Just opencode it in the only caller to make the code flow easier to
follow.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/libxfs/xfs_alloc.c |  2 +-
 fs/xfs/libxfs/xfs_bmap.c  | 22 ++++++----------------
 fs/xfs/libxfs/xfs_bmap.h  |  2 +-
 3 files changed, 8 insertions(+), 18 deletions(-)

diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 7486401ccbd3..02326142ccb0 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2606,7 +2606,7 @@ xfs_alloc_vextent(
 	/*
 	 * Just fix this up, for the case where the last a.g. is shorter
 	 * (or there's only one a.g.) and the caller couldn't easily figure
-	 * that out (xfs_bmap_alloc).
+	 * that out (xfs_bmap_btalloc).
 	 */
 	agsize = mp->m_sb.sb_agblocks;
 	if (args->maxlen > agsize)
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9e3f0e6a9062..8e94030bcb8f 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -3842,20 +3842,6 @@ xfs_bmap_btalloc(
 	return 0;
 }
 
-/*
- * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
- * It figures out where to ask the underlying allocator to put the new extent.
- */
-STATIC int
-xfs_bmap_alloc(
-	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
-{
-	if (XFS_IS_REALTIME_INODE(ap->ip) &&
-	    xfs_alloc_is_userdata(ap->datatype))
-		return xfs_bmap_rtalloc(ap);
-	return xfs_bmap_btalloc(ap);
-}
-
 /* Trim extent to fit a logical block range. */
 void
 xfs_trim_extent(
@@ -4273,7 +4259,11 @@ xfs_bmapi_allocate(
 			return error;
 	}
 
-	error = xfs_bmap_alloc(bma);
+	if (XFS_IS_REALTIME_INODE(bma->ip) &&
+	    xfs_alloc_is_userdata(bma->datatype))
+		error = xfs_bmap_rtalloc(bma);
+	else
+		error = xfs_bmap_btalloc(bma);
 	if (error)
 		return error;
 
@@ -4451,7 +4441,7 @@ xfs_bmapi_write(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_ifork	*ifp;
-	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_alloc */
+	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_*alloc */
 	xfs_fileoff_t		end;		/* end of mapped file region */
 	bool			eof = false;	/* after the end of extents */
 	int			error;		/* error return */
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index f35a2b2c4f06..6de7d321f8a2 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -28,7 +28,7 @@ struct xfs_trans;
 extern kmem_zone_t	*xfs_bmap_free_item_zone;
 
 /*
- * Argument structure for xfs_bmap_alloc.
+ * Argument structure for xfs_bmap_*alloc.
  */
 struct xfs_bmalloca {
 	xfs_fsblock_t		*firstblock; /* i/o first block allocated */
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 30+ messages in thread

* Re: [PATCH 01/10] xfs: introduce xfs_trans_blk_res
  2017-04-13  8:05 ` [PATCH 01/10] xfs: introduce xfs_trans_blk_res Christoph Hellwig
@ 2017-04-13 18:28   ` Brian Foster
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2017-04-13 18:28 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Thu, Apr 13, 2017 at 10:05:08AM +0200, Christoph Hellwig wrote:
> This is a smaller helper to check the remaining block reservation in a
> transaction.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/xfs_trans.c | 2 +-
>  fs/xfs/xfs_trans.h | 5 +++++
>  2 files changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
> index a280e126491f..3a83927a5334 100644
> --- a/fs/xfs/xfs_trans.c
> +++ b/fs/xfs/xfs_trans.c
> @@ -97,7 +97,7 @@ xfs_trans_dup(
>  	/* We gave our writer reference to the new transaction */
>  	tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
>  	ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
> -	ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
> +	ntp->t_blk_res = xfs_trans_blk_res(tp);
>  	tp->t_blk_res = tp->t_blk_res_used;
>  	ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
>  	tp->t_rtx_res = tp->t_rtx_res_used;
> diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
> index 2a9292df6640..e4f041c89846 100644
> --- a/fs/xfs/xfs_trans.h
> +++ b/fs/xfs/xfs_trans.h
> @@ -152,6 +152,11 @@ typedef struct xfs_trans {
>  #define	xfs_trans_agbtree_delta(tp, d)
>  #endif
>  
> +static inline unsigned int xfs_trans_blk_res(struct xfs_trans *tp)
> +{
> +	return tp->t_blk_res - tp->t_blk_res_used;
> +}
> +
>  /*
>   * XFS transaction mechanism exported interfaces.
>   */
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 02/10] xfs: rewrite xfs_da_grow_inode_int
  2017-04-13  8:05 ` [PATCH 02/10] xfs: rewrite xfs_da_grow_inode_int Christoph Hellwig
@ 2017-04-13 18:28   ` Brian Foster
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2017-04-13 18:28 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Thu, Apr 13, 2017 at 10:05:09AM +0200, Christoph Hellwig wrote:
> Use one xfs_bmapi_write loop that just iterates if it didn't manage to
> allocate enough blocks.  Get rid of the separate contig call that just
> makes us call the allocator twice, and also get rid of the memory
> allocation for the map array that we don't actually need.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/libxfs/xfs_da_btree.c | 94 +++++++++++++-------------------------------
>  1 file changed, 28 insertions(+), 66 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
> index 1bdf2888295b..00853d332bcd 100644
> --- a/fs/xfs/libxfs/xfs_da_btree.c
> +++ b/fs/xfs/libxfs/xfs_da_btree.c
> @@ -2007,84 +2007,46 @@ xfs_da_grow_inode_int(
>  	xfs_fileoff_t		*bno,
>  	int			count)
>  {
> -	struct xfs_trans	*tp = args->trans;
> -	struct xfs_inode	*dp = args->dp;
>  	int			w = args->whichfork;
> -	xfs_rfsblock_t		nblks = dp->i_d.di_nblocks;
> -	struct xfs_bmbt_irec	map, *mapp;
> -	int			nmap, error, got, i, mapi;
> +	xfs_fileoff_t		b;
> +	int			error;
>  
>  	/*
>  	 * Find a spot in the file space to put the new block.
>  	 */
> -	error = xfs_bmap_first_unused(tp, dp, count, bno, w);
> +	error = xfs_bmap_first_unused(args->trans, args->dp, count, bno, w);
>  	if (error)
>  		return error;
>  
> -	/*
> -	 * Try mapping it in one filesystem block.
> -	 */
> -	nmap = 1;
> -	ASSERT(args->firstblock != NULL);
> -	error = xfs_bmapi_write(tp, dp, *bno, count,
> -			xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
> -			args->firstblock, args->total, &map, &nmap,
> -			args->dfops);
> -	if (error)
> -		return error;
> +	b = *bno;
> +	do {
> +		xfs_rfsblock_t nblks = args->dp->i_d.di_nblocks;
> +		struct xfs_bmbt_irec imap;
> +		xfs_extlen_t len;
> +		int nmap = 1;
> +
> +		error = xfs_bmapi_write(args->trans, args->dp, b, count,
> +				xfs_bmapi_aflag(w) | XFS_BMAPI_METADATA,
> +				args->firstblock, args->total, &imap, &nmap,
> +				args->dfops);
> +		if (error)
> +			return error;
> +		if (!nmap)
> +			return -ENOSPC;
>  
> -	ASSERT(nmap <= 1);
> -	if (nmap == 1) {
> -		mapp = &map;
> -		mapi = 1;
> -	} else if (nmap == 0 && count > 1) {
> -		xfs_fileoff_t		b;
> -		int			c;
> +		len = imap.br_startoff + imap.br_blockcount - b;
> +		ASSERT(imap.br_startoff <= b);
> +		ASSERT(len > 0);
> +		ASSERT(len <= count);
>  
> -		/*
> -		 * If we didn't get it and the block might work if fragmented,
> -		 * try without the CONTIG flag.  Loop until we get it all.
> -		 */
> -		mapp = kmem_alloc(sizeof(*mapp) * count, KM_SLEEP);
> -		for (b = *bno, mapi = 0; b < *bno + count; ) {
> -			nmap = MIN(XFS_BMAP_MAX_NMAP, count);
> -			c = (int)(*bno + count - b);
> -			error = xfs_bmapi_write(tp, dp, b, c,
> -					xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA,
> -					args->firstblock, args->total,
> -					&mapp[mapi], &nmap, args->dfops);
> -			if (error)
> -				goto out_free_map;
> -			if (nmap < 1)
> -				break;
> -			mapi += nmap;
> -			b = mapp[mapi - 1].br_startoff +
> -			    mapp[mapi - 1].br_blockcount;
> -		}
> -	} else {
> -		mapi = 0;
> -		mapp = NULL;
> -	}
> -
> -	/*
> -	 * Count the blocks we got, make sure it matches the total.
> -	 */
> -	for (i = 0, got = 0; i < mapi; i++)
> -		got += mapp[i].br_blockcount;
> -	if (got != count || mapp[0].br_startoff != *bno ||
> -	    mapp[mapi - 1].br_startoff + mapp[mapi - 1].br_blockcount !=
> -	    *bno + count) {
> -		error = -ENOSPC;
> -		goto out_free_map;
> -	}
> +		/* account for newly allocated blocks in reserved blocks total */
> +		args->total -= (args->dp->i_d.di_nblocks - nblks);
>  
> -	/* account for newly allocated blocks in reserved blocks total */
> -	args->total -= dp->i_d.di_nblocks - nblks;
> +		b += len;
> +		count -= len;
> +	} while (count > 0);
>  
> -out_free_map:
> -	if (mapp != &map)
> -		kmem_free(mapp);
> -	return error;
> +	return 0;
>  }
>  
>  /*
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 03/10] xfs: remove the XFS_BMAPI_CONTIG flag
  2017-04-13  8:05 ` [PATCH 03/10] xfs: remove the XFS_BMAPI_CONTIG flag Christoph Hellwig
@ 2017-04-13 18:28   ` Brian Foster
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2017-04-13 18:28 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Thu, Apr 13, 2017 at 10:05:10AM +0200, Christoph Hellwig wrote:
> The only user just went away, so we can remove the flag and the minlen
> parameter in struct xfs_bmalloca now.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_bmap.c | 17 +++++++----------
>  fs/xfs/libxfs/xfs_bmap.h |  3 ---
>  2 files changed, 7 insertions(+), 13 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 7f42f6067eb5..aec4907056ba 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -3473,12 +3473,12 @@ xfs_bmap_select_minlen(
>  	xfs_extlen_t		*blen,
>  	int			notinit)
>  {
> -	if (notinit || *blen < ap->minlen) {
> +	if (notinit) {
>  		/*
>  		 * Since we did a BUF_TRYLOCK above, it is possible that
>  		 * there is space for this request.
>  		 */
> -		args->minlen = ap->minlen;
> +		args->minlen = 1;

The ap param is no longer used in this function. Otherwise looks Ok to
me:

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  	} else if (*blen < args->maxlen) {
>  		/*
>  		 * If the best seen length is less than the request length,
> @@ -3670,11 +3670,11 @@ xfs_bmap_btalloc(
>  			args.type = XFS_ALLOCTYPE_FIRST_AG;
>  		else
>  			args.type = XFS_ALLOCTYPE_START_BNO;
> -		args.total = args.minlen = ap->minlen;
> +		args.total = args.minlen = 1;
>  	} else {
>  		args.type = XFS_ALLOCTYPE_NEAR_BNO;
>  		args.total = ap->total;
> -		args.minlen = ap->minlen;
> +		args.minlen = 1;
>  	}
>  	/* apply extent size hints if obtained earlier */
>  	if (align) {
> @@ -3776,9 +3776,8 @@ xfs_bmap_btalloc(
>  		if ((error = xfs_alloc_vextent(&args)))
>  			return error;
>  	}
> -	if (args.fsbno == NULLFSBLOCK && nullfb &&
> -	    args.minlen > ap->minlen) {
> -		args.minlen = ap->minlen;
> +	if (args.fsbno == NULLFSBLOCK && nullfb && args.minlen > 1) {
> +		args.minlen = 1;
>  		args.type = XFS_ALLOCTYPE_START_BNO;
>  		args.fsbno = ap->blkno;
>  		if ((error = xfs_alloc_vextent(&args)))
> @@ -3787,7 +3786,7 @@ xfs_bmap_btalloc(
>  	if (args.fsbno == NULLFSBLOCK && nullfb) {
>  		args.fsbno = 0;
>  		args.type = XFS_ALLOCTYPE_FIRST_AG;
> -		args.total = ap->minlen;
> +		args.total = 1;
>  		if ((error = xfs_alloc_vextent(&args)))
>  			return error;
>  		ap->dfops->dop_low = true;
> @@ -4246,8 +4245,6 @@ xfs_bmapi_allocate(
>  			bma->datatype |= XFS_ALLOC_USERDATA_ZERO;
>  	}
>  
> -	bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
> -
>  	/*
>  	 * Only want to do the alignment at the eof if it is userdata and
>  	 * allocation length is larger than a stripe unit.
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index a6e612cabe15..d9e0b1db4cdb 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -48,7 +48,6 @@ struct xfs_bmalloca {
>  	int			logflags;/* flags for transaction logging */
>  
>  	xfs_extlen_t		total;	/* total blocks needed for xaction */
> -	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
>  	xfs_extlen_t		minleft; /* amount must be left after alloc */
>  	bool			eof;	/* set if allocating past last extent */
>  	bool			wasdel;	/* replacing a delayed allocation */
> @@ -81,7 +80,6 @@ struct xfs_extent_free_item
>  #define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
>  #define XFS_BMAPI_IGSTATE	0x010	/* Ignore state - */
>  					/* combine contig. space */
> -#define XFS_BMAPI_CONTIG	0x020	/* must allocate only one extent */
>  /*
>   * unwritten extent conversion - this needs write cache flushing and no additional
>   * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
> @@ -119,7 +117,6 @@ struct xfs_extent_free_item
>  	{ XFS_BMAPI_ATTRFORK,	"ATTRFORK" }, \
>  	{ XFS_BMAPI_PREALLOC,	"PREALLOC" }, \
>  	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
> -	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
>  	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
>  	{ XFS_BMAPI_ZERO,	"ZERO" }, \
>  	{ XFS_BMAPI_REMAP,	"REMAP" }, \
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block
  2017-04-13  8:05 ` [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block Christoph Hellwig
@ 2017-04-13 18:30   ` Brian Foster
  2017-04-14  7:46     ` Christoph Hellwig
  0 siblings, 1 reply; 30+ messages in thread
From: Brian Foster @ 2017-04-13 18:30 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Thu, Apr 13, 2017 at 10:05:11AM +0200, Christoph Hellwig wrote:
> We've already reserved all possible required blocks and checked
> they are avaible in the same AG.
> 

I'm not quite following why this retry is unsafe as noted in the patch
title.. do you mean "unnecessary?" AFAICT, the firstblock == NULLFSBLOCK
case means we can issue this first allocation from any AG. If no AG can
allocate a block while satisfying minleft, then we can still safely
allocate from any AG provided any subsequent allocations occur in
increasing AG order (i.e., by setting dop_low), right?

Also, if this is unnecessary, what exactly verifies that all of the
reserved blocks are available within the same AG?

This patch may ultimately be fine, but at minimum I think a bit more
context/explanation is needed in the commit log. A couple things that
give me pause are 1.) this is a context highly sensitive to allocation
failure and 2.) the minleft used in the initial allocation is based on
the transaction block reservation, which isn't exactly deterministic (so
can some future transaction now increase the likelihood of bmbt block
allocation failure because it decided to reserve too many extra
blocks?).

Brian

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_bmap_btree.c | 13 -------------
>  1 file changed, 13 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
> index 3e17ceda038c..ce41dd5fbb34 100644
> --- a/fs/xfs/libxfs/xfs_bmap_btree.c
> +++ b/fs/xfs/libxfs/xfs_bmap_btree.c
> @@ -476,19 +476,6 @@ xfs_bmbt_alloc_block(
>  	if (error)
>  		goto error0;
>  
> -	if (args.fsbno == NULLFSBLOCK && args.minleft) {
> -		/*
> -		 * Could not find an AG with enough free space to satisfy
> -		 * a full btree split.  Try again and if
> -		 * successful activate the lowspace algorithm.
> -		 */
> -		args.fsbno = 0;
> -		args.type = XFS_ALLOCTYPE_FIRST_AG;
> -		error = xfs_alloc_vextent(&args);
> -		if (error)
> -			goto error0;
> -		cur->bc_private.b.dfops->dop_low = true;
> -	}
>  	if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
>  		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
>  		*stat = 0;
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block
  2017-04-13 18:30   ` Brian Foster
@ 2017-04-14  7:46     ` Christoph Hellwig
  2017-04-17 14:19       ` Brian Foster
  0 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-14  7:46 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs

On Thu, Apr 13, 2017 at 02:30:06PM -0400, Brian Foster wrote:
> I'm not quite following why this retry is unsafe as noted in the patch
> title.. do you mean "unnecessary?" AFAICT, the firstblock == NULLFSBLOCK
> case means we can issue this first allocation from any AG.

Yes.

> If no AG can
> allocate a block while satisfying minleft, then we can still safely
> allocate from any AG provided any subsequent allocations occur in
> increasing AG order (i.e., by setting dop_low), right?

Yes.  But minleft is set exactly because we require this number of
blocks to be left after the current allocation.  If we could only
allocate the current allocation, but not satisfy minleft we risk
shutting the file system during subsequent allocations instead of
just returning ENOSPC now.

> Also, if this is unnecessary, what exactly verifies that all of the
> reserved blocks are available within the same AG?

xfs_alloc_space_available verifies that ->total blocks are available
in the current AG.  Callers of the allocator need to set it to the
correct value currently, although I have more xfs_bmapi changes in
the pipe to get this right automatically - but those aren't 4.12
material.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block
  2017-04-14  7:46     ` Christoph Hellwig
@ 2017-04-17 14:19       ` Brian Foster
  2017-04-18  7:54         ` Christoph Hellwig
  0 siblings, 1 reply; 30+ messages in thread
From: Brian Foster @ 2017-04-17 14:19 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Fri, Apr 14, 2017 at 09:46:58AM +0200, Christoph Hellwig wrote:
> On Thu, Apr 13, 2017 at 02:30:06PM -0400, Brian Foster wrote:
> > I'm not quite following why this retry is unsafe as noted in the patch
> > title.. do you mean "unnecessary?" AFAICT, the firstblock == NULLFSBLOCK
> > case means we can issue this first allocation from any AG.
> 
> Yes.
> 
> > If no AG can
> > allocate a block while satisfying minleft, then we can still safely
> > allocate from any AG provided any subsequent allocations occur in
> > increasing AG order (i.e., by setting dop_low), right?
> 
> Yes.  But minleft is set exactly because we require this number of
> blocks to be left after the current allocation.  If we could only
> allocate the current allocation, but not satisfy minleft we risk
> shutting the file system during subsequent allocations instead of
> just returning ENOSPC now.
> 

I don't see anything about setting minleft here that says the allocation
is required to come from one AG as opposed to that simply being
preferred.

Also, I think we risk shutdown if this allocation fails at all,
regardless of the firstblock state, because the transaction is likely
already dirty. I have by no means audited all of the possible contexts
that lead here, but a quick tracepoint check shows the transcation as
dirty when punching holes. I'm also guessing this is why we currently
try so hard to allocate here.

> > Also, if this is unnecessary, what exactly verifies that all of the
> > reserved blocks are available within the same AG?
> 
> xfs_alloc_space_available verifies that ->total blocks are available
> in the current AG.  Callers of the allocator need to set it to the
> correct value currently, although I have more xfs_bmapi changes in
> the pipe to get this right automatically - but those aren't 4.12
> material.

Not all bmbt block allocations are tied to extent allocations. This is
the firstblock == NULLFSBLOCK case after all, which I take it means an
allocation hasn't yet occurred. IOW, what about other potentially
record-inserting operations like hole punch, extent conversion, etc.?

Brian

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 05/10] xfs: remove the total argument to xfs_bmap_local_to_extents
  2017-04-13  8:05 ` [PATCH 05/10] xfs: remove the total argument to xfs_bmap_local_to_extents Christoph Hellwig
@ 2017-04-17 14:19   ` Brian Foster
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2017-04-17 14:19 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Thu, Apr 13, 2017 at 10:05:12AM +0200, Christoph Hellwig wrote:
> It must inherently be the same as the minlen/maxlen arguments for
> this allocation.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/libxfs/xfs_bmap.c | 6 ++----
>  1 file changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index aec4907056ba..53f1386b1868 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -867,7 +867,6 @@ xfs_bmap_local_to_extents(
>  	xfs_trans_t	*tp,		/* transaction pointer */
>  	xfs_inode_t	*ip,		/* incore inode pointer */
>  	xfs_fsblock_t	*firstblock,	/* first block allocated in xaction */
> -	xfs_extlen_t	total,		/* total blocks needed by transaction */
>  	int		*logflagsp,	/* inode logging flags */
>  	int		whichfork,
>  	void		(*init_fn)(struct xfs_trans *tp,
> @@ -916,8 +915,7 @@ xfs_bmap_local_to_extents(
>  		args.fsbno = *firstblock;
>  		args.type = XFS_ALLOCTYPE_NEAR_BNO;
>  	}
> -	args.total = total;
> -	args.minlen = args.maxlen = args.prod = 1;
> +	args.total = args.minlen = args.maxlen = args.prod = 1;
>  	error = xfs_alloc_vextent(&args);
>  	if (error)
>  		goto done;
> @@ -1067,7 +1065,7 @@ xfs_bmap_add_attrfork_local(
>  	}
>  
>  	if (S_ISLNK(VFS_I(ip)->i_mode))
> -		return xfs_bmap_local_to_extents(tp, ip, firstblock, 1,
> +		return xfs_bmap_local_to_extents(tp, ip, firstblock,
>  						 flags, XFS_DATA_FORK,
>  						 xfs_symlink_local_to_remote);
>  
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 06/10] xfs: fix bmap minleft calculation
  2017-04-13  8:05 ` [PATCH 06/10] xfs: fix bmap minleft calculation Christoph Hellwig
@ 2017-04-17 14:19   ` Brian Foster
  2017-04-18  7:59     ` Christoph Hellwig
  0 siblings, 1 reply; 30+ messages in thread
From: Brian Foster @ 2017-04-17 14:19 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Thu, Apr 13, 2017 at 10:05:13AM +0200, Christoph Hellwig wrote:
> We need to set a proper minleft value even if we didn't do a previous
> allocation in the transaction, as we can't switch to a different AG
> after allocating the data extent.
> 

Hmm, the code currently does set minleft if we haven't done a previous
allocation (firstblock == NULLFSBLOCK). Do you mean "even if we have
done a previous allocation?"

The code seems Ok, but we also currently reserve enough blocks for a
full btree split in a write transaction and afaict only allocate a
single extent per-transaction. The current code expects to select an AG
with those additional blocks available and then not if a subsequent
allocation occurs (by setting minleft = 0), presumably because the check
was already made. So I'm wondering if this fixes a problem that has been
observed or is just cleaning up the code..?

Brian

> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_bmap.c | 27 +++++++++++++++++----------
>  fs/xfs/libxfs/xfs_bmap.h |  1 -
>  2 files changed, 17 insertions(+), 11 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 53f1386b1868..6faefa342748 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -3570,6 +3570,22 @@ xfs_bmap_btalloc_filestreams(
>  	return 0;
>  }
>  
> +/*
> + * Return the minimum number of blocks required for the bmap btree manipulation
> + * after adding a single extent.
> + */
> +static xfs_extlen_t
> +xfs_bmap_minleft(
> +	struct xfs_bmalloca	*ap)
> +{
> +	int			whichfork = xfs_bmapi_whichfork(ap->flags);
> +	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ap->ip, whichfork);
> +
> +	if (XFS_IFORK_FORMAT(ap->ip, whichfork) == XFS_DINODE_FMT_BTREE)
> +		return be16_to_cpu(ifp->if_broot->bb_level) + 1;
> +	return 1;
> +}
> +
>  STATIC int
>  xfs_bmap_btalloc(
>  	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
> @@ -3738,7 +3754,7 @@ xfs_bmap_btalloc(
>  		args.alignment = 1;
>  		args.minalignslop = 0;
>  	}
> -	args.minleft = ap->minleft;
> +	args.minleft = xfs_bmap_minleft(ap);
>  	args.wasdel = ap->wasdel;
>  	args.resv = XFS_AG_RESV_NONE;
>  	args.datatype = ap->datatype;
> @@ -4493,15 +4509,6 @@ xfs_bmapi_write(
>  
>  	XFS_STATS_INC(mp, xs_blk_mapw);
>  
> -	if (*firstblock == NULLFSBLOCK) {
> -		if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE)
> -			bma.minleft = be16_to_cpu(ifp->if_broot->bb_level) + 1;
> -		else
> -			bma.minleft = 1;
> -	} else {
> -		bma.minleft = 0;
> -	}
> -
>  	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
>  		error = xfs_iread_extents(tp, ip, whichfork);
>  		if (error)
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index d9e0b1db4cdb..36a7f36f5f38 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -48,7 +48,6 @@ struct xfs_bmalloca {
>  	int			logflags;/* flags for transaction logging */
>  
>  	xfs_extlen_t		total;	/* total blocks needed for xaction */
> -	xfs_extlen_t		minleft; /* amount must be left after alloc */
>  	bool			eof;	/* set if allocating past last extent */
>  	bool			wasdel;	/* replacing a delayed allocation */
>  	bool			aeof;	/* allocated space at eof */
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 07/10] xfs: fix space reservation in xfs_bmbt_alloc_block
  2017-04-13  8:05 ` [PATCH 07/10] xfs: fix space reservation in xfs_bmbt_alloc_block Christoph Hellwig
@ 2017-04-17 14:19   ` Brian Foster
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2017-04-17 14:19 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Thu, Apr 13, 2017 at 10:05:14AM +0200, Christoph Hellwig wrote:
> minleft counts the number of blocks that need to be available after the
> current allocation has been completed.  As the total allocation should not
> be more than the transaction reservation we need to subtract the
> allocation length.  In addition we need to subtract the already used
> transaction reservation, for that use the new xfs_trans_blk_res helper.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/libxfs/xfs_bmap_btree.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
> index ce41dd5fbb34..153c969febd4 100644
> --- a/fs/xfs/libxfs/xfs_bmap_btree.c
> +++ b/fs/xfs/libxfs/xfs_bmap_btree.c
> @@ -442,6 +442,7 @@ xfs_bmbt_alloc_block(
>  	args.mp = cur->bc_mp;
>  	args.fsbno = cur->bc_private.b.firstblock;
>  	args.firstblock = args.fsbno;
> +	args.minlen = args.maxlen = args.prod = 1;
>  	xfs_rmap_ino_bmbt_owner(&args.oinfo, cur->bc_private.b.ip->i_ino,
>  			cur->bc_private.b.whichfork);
>  
> @@ -459,14 +460,14 @@ xfs_bmbt_alloc_block(
>  		 * reservation amount is insufficient then we may fail a
>  		 * block allocation here and corrupt the filesystem.
>  		 */
> -		args.minleft = args.tp->t_blk_res;
> +		if (xfs_trans_blk_res(args.tp))
> +			args.minleft = xfs_trans_blk_res(args.tp) - args.maxlen;
>  	} else if (cur->bc_private.b.dfops->dop_low) {
>  		args.type = XFS_ALLOCTYPE_START_BNO;
>  	} else {
>  		args.type = XFS_ALLOCTYPE_NEAR_BNO;
>  	}
>  
> -	args.minlen = args.maxlen = args.prod = 1;
>  	args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
>  	if (!args.wasdel && args.tp->t_blk_res == 0) {
>  		error = -ENOSPC;
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 08/10] xfs: introduce a XFS_BMAPI_BESTEFFORT flag
  2017-04-13  8:05 ` [PATCH 08/10] xfs: introduce a XFS_BMAPI_BESTEFFORT flag Christoph Hellwig
@ 2017-04-17 18:08   ` Brian Foster
  2017-04-18  7:58     ` Christoph Hellwig
  0 siblings, 1 reply; 30+ messages in thread
From: Brian Foster @ 2017-04-17 18:08 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Thu, Apr 13, 2017 at 10:05:15AM +0200, Christoph Hellwig wrote:
> We currently have two classes of xfs_bmapi_write callers that have
> conflicting space reservation needs.  Many callers expect to be able
> to reserve the number of total blocks passed in a single AG for
> the use with the current allocation as well as future allocations
> in the same transactions, or transactions chained off from it.
> 
> Other callers want to allocate up to the number of blocks passed in,
> although they are fine with a smaller number of blocks to make
> forward progress.  For those we only need to leave a few blocks aside
> for the bmap btree manipulations when doing the main space allocation.
> 
> This patch introduces a new XFS_BMAPI_BESTEFFORT flag for the second
> kind of callers that ignores the total flag and just uses the minleft
> parameter to leave space for bmap btree allocations and splits.
> 
> With this we can remove the potentially dangerous fallback that
> ignores the total reservation in xfs_bmap_btalloc.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_attr_remote.c |  6 ++++--
>  fs/xfs/libxfs/xfs_bmap.c        | 27 ++++++++++++++++++++-------
>  fs/xfs/libxfs/xfs_bmap.h        |  3 +++
>  fs/xfs/xfs_bmap_util.c          |  2 ++
>  fs/xfs/xfs_iomap.c              |  4 ++--
>  fs/xfs/xfs_reflink.c            |  3 ++-
>  6 files changed, 33 insertions(+), 12 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
> index d52f525f5b2d..35a99d15521c 100644
> --- a/fs/xfs/libxfs/xfs_attr_remote.c
> +++ b/fs/xfs/libxfs/xfs_attr_remote.c
> @@ -464,8 +464,10 @@ xfs_attr_rmtval_set(
>  		xfs_defer_init(args->dfops, args->firstblock);
>  		nmap = 1;
>  		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
> -				  blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
> -				  args->total, &map, &nmap, args->dfops);
> +				  blkcnt,
> +				  XFS_BMAPI_ATTRFORK | XFS_BMAPI_BESTEFFORT,
> +				  args->firstblock, args->total, &map, &nmap,
> +				  args->dfops);
>  		if (!error)
>  			error = xfs_defer_finish(&args->trans, args->dfops, dp);
>  		if (error) {
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 6faefa342748..c06e7d500ed1 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -915,7 +915,7 @@ xfs_bmap_local_to_extents(
>  		args.fsbno = *firstblock;
>  		args.type = XFS_ALLOCTYPE_NEAR_BNO;
>  	}
> -	args.total = args.minlen = args.maxlen = args.prod = 1;
> +	args.minlen = args.maxlen = args.prod = 1;
>  	error = xfs_alloc_vextent(&args);
>  	if (error)
>  		goto done;
> @@ -3504,7 +3504,6 @@ xfs_bmap_btalloc_nullfb(
>  	int			error;
>  
>  	args->type = XFS_ALLOCTYPE_START_BNO;
> -	args->total = ap->total;
>  
>  	startag = ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
>  	if (startag == NULLAGNUMBER)
> @@ -3538,7 +3537,6 @@ xfs_bmap_btalloc_filestreams(
>  	int			error;
>  
>  	args->type = XFS_ALLOCTYPE_NEAR_BNO;
> -	args->total = ap->total;
>  
>  	ag = XFS_FSB_TO_AGNO(mp, args->fsbno);
>  	if (ag == NULLAGNUMBER)
> @@ -3684,10 +3682,9 @@ xfs_bmap_btalloc(
>  			args.type = XFS_ALLOCTYPE_FIRST_AG;
>  		else
>  			args.type = XFS_ALLOCTYPE_START_BNO;
> -		args.total = args.minlen = 1;
> +		args.minlen = 1;
>  	} else {
>  		args.type = XFS_ALLOCTYPE_NEAR_BNO;
> -		args.total = ap->total;
>  		args.minlen = 1;
>  	}
>  	/* apply extent size hints if obtained earlier */
> @@ -3754,7 +3751,24 @@ xfs_bmap_btalloc(
>  		args.alignment = 1;
>  		args.minalignslop = 0;
>  	}
> -	args.minleft = xfs_bmap_minleft(ap);
> +
> +	/*
> +	 * If the XFS_BMAPI_BESTEFFORT flag is set we try to allocate any
> +	 * space that's available, even if it is less than requested.  We
> +	 * still need to set a minleft value to guarantee that we can still
> +	 * manipulate the bmap btree, though.  The total value is ignored in
> +	 * this case.
> +	 *
> +	 * If the flag is not set the total value specifies the total space
> +	 * that the transaction may use, and we must find an AG that has
> +	 * enough space available for all of total, or this allocation request
> +	 * will fail.
> +	 */
> +	if (ap->flags & XFS_BMAPI_BESTEFFORT)
> +		args.minleft = xfs_bmap_minleft(ap);
> +	else
> +		args.total = ap->total;
> +

This seems Ok, but I don't think it's very elegant to have callers pass
a total parameter along with a flag to ignore it. Couldn't we just set
minleft when total is not used and pass zero from those particular
callers, or do we actually need to support the !BESTEFFORT && total == 0
case? (At the very least, we could assert that total == 0 when
BESTEFFORT is set.)

Brian

>  	args.wasdel = ap->wasdel;
>  	args.resv = XFS_AG_RESV_NONE;
>  	args.datatype = ap->datatype;
> @@ -3800,7 +3814,6 @@ xfs_bmap_btalloc(
>  	if (args.fsbno == NULLFSBLOCK && nullfb) {
>  		args.fsbno = 0;
>  		args.type = XFS_ALLOCTYPE_FIRST_AG;
> -		args.total = 1;
>  		if ((error = xfs_alloc_vextent(&args)))
>  			return error;
>  		ap->dfops->dop_low = true;
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index 36a7f36f5f38..f35a2b2c4f06 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -79,6 +79,9 @@ struct xfs_extent_free_item
>  #define XFS_BMAPI_PREALLOC	0x008	/* preallocation op: unwritten space */
>  #define XFS_BMAPI_IGSTATE	0x010	/* Ignore state - */
>  					/* combine contig. space */
> +
> +#define XFS_BMAPI_BESTEFFORT	0x020	/* may allocate less than requested */
> +
>  /*
>   * unwritten extent conversion - this needs write cache flushing and no additional
>   * allocation alignments. When specified with XFS_BMAPI_PREALLOC it converts
> diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
> index 4d1920e594b0..f809ebc7a495 100644
> --- a/fs/xfs/xfs_bmap_util.c
> +++ b/fs/xfs/xfs_bmap_util.c
> @@ -1033,6 +1033,8 @@ xfs_alloc_file_space(
>  	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
>  	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
>  
> +	alloc_type |= XFS_BMAPI_BESTEFFORT;
> +
>  	/*
>  	 * Allocate file space until done or until there is an error
>  	 */
> diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
> index 009f8243dddc..1abed9b6d5c5 100644
> --- a/fs/xfs/xfs_iomap.c
> +++ b/fs/xfs/xfs_iomap.c
> @@ -171,7 +171,7 @@ xfs_iomap_write_direct(
>  	uint		qblocks, resblks, resrtextents;
>  	int		error;
>  	int		lockmode;
> -	int		bmapi_flags = XFS_BMAPI_PREALLOC;
> +	int		bmapi_flags = XFS_BMAPI_PREALLOC | XFS_BMAPI_BESTEFFORT;
>  	uint		tflags = 0;
>  
>  	rt = XFS_IS_REALTIME_INODE(ip);
> @@ -679,7 +679,7 @@ xfs_iomap_write_allocate(
>  	xfs_trans_t	*tp;
>  	int		nimaps;
>  	int		error = 0;
> -	int		flags = XFS_BMAPI_DELALLOC;
> +	int		flags = XFS_BMAPI_DELALLOC | XFS_BMAPI_BESTEFFORT;
>  	int		nres;
>  
>  	if (whichfork == XFS_COW_FORK)
> diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
> index c0f3754caca2..aad321c16d2f 100644
> --- a/fs/xfs/xfs_reflink.c
> +++ b/fs/xfs/xfs_reflink.c
> @@ -455,7 +455,8 @@ xfs_reflink_allocate_cow(
>  
>  	/* Allocate the entire reservation as unwritten blocks. */
>  	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
> -			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, &first_block,
> +			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC |
> +			XFS_BMAPI_BESTEFFORT, &first_block,
>  			resblks, imap, &nimaps, &dfops);
>  	if (error)
>  		goto out_bmap_cancel;
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 09/10] xfs: kill the dop_low flag
  2017-04-13  8:05 ` [PATCH 09/10] xfs: kill the dop_low flag Christoph Hellwig
@ 2017-04-17 18:08   ` Brian Foster
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2017-04-17 18:08 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Thu, Apr 13, 2017 at 10:05:16AM +0200, Christoph Hellwig wrote:
> Now that we always get our space reservations right in xfs_bmapi_write
> we can remove the last remains of the dangerous low space allocator mode.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_bmap.c       | 12 +-----------
>  fs/xfs/libxfs/xfs_bmap_btree.c |  2 --
>  fs/xfs/libxfs/xfs_defer.c      |  1 -
>  fs/xfs/libxfs/xfs_defer.h      | 15 ---------------
>  fs/xfs/xfs_filestream.c        |  2 --
>  fs/xfs/xfs_trace.h             | 12 +++---------
>  6 files changed, 4 insertions(+), 40 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index c06e7d500ed1..9e3f0e6a9062 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -762,9 +762,6 @@ xfs_bmap_extents_to_btree(
>  	if (*firstblock == NULLFSBLOCK) {
>  		args.type = XFS_ALLOCTYPE_START_BNO;
>  		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
> -	} else if (dfops->dop_low) {
> -		args.type = XFS_ALLOCTYPE_START_BNO;
> -		args.fsbno = *firstblock;
>  	} else {
>  		args.type = XFS_ALLOCTYPE_NEAR_BNO;
>  		args.fsbno = *firstblock;
> @@ -3677,12 +3674,6 @@ xfs_bmap_btalloc(
>  			error = xfs_bmap_btalloc_nullfb(ap, &args, &blen);
>  		if (error)
>  			return error;
> -	} else if (ap->dfops->dop_low) {
> -		if (xfs_inode_is_filestream(ap->ip))
> -			args.type = XFS_ALLOCTYPE_FIRST_AG;
> -		else
> -			args.type = XFS_ALLOCTYPE_START_BNO;
> -		args.minlen = 1;
>  	} else {
>  		args.type = XFS_ALLOCTYPE_NEAR_BNO;
>  		args.minlen = 1;
> @@ -3709,7 +3700,7 @@ xfs_bmap_btalloc(
>  	 * is >= the stripe unit and the allocation offset is
>  	 * at the end of file.
>  	 */
> -	if (!ap->dfops->dop_low && ap->aeof) {
> +	if (ap->aeof) {

The comment above looks like it needs updating.

>  		if (!ap->offset) {
>  			args.alignment = stripe_align;
>  			atype = args.type;
> @@ -3816,7 +3807,6 @@ xfs_bmap_btalloc(
>  		args.type = XFS_ALLOCTYPE_FIRST_AG;
>  		if ((error = xfs_alloc_vextent(&args)))
>  			return error;
> -		ap->dfops->dop_low = true;
>  	}
>  	if (args.fsbno != NULLFSBLOCK) {
>  		/*
> diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
> index 153c969febd4..b9f0f61bd8ee 100644
> --- a/fs/xfs/libxfs/xfs_bmap_btree.c
> +++ b/fs/xfs/libxfs/xfs_bmap_btree.c
> @@ -462,8 +462,6 @@ xfs_bmbt_alloc_block(
>  		 */
>  		if (xfs_trans_blk_res(args.tp))
>  			args.minleft = xfs_trans_blk_res(args.tp) - args.maxlen;
> -	} else if (cur->bc_private.b.dfops->dop_low) {
> -		args.type = XFS_ALLOCTYPE_START_BNO;

I'm not convinced this bit is safe, based on the comments on patch 4
that removes the dop_low activation logic from this context...

Brian

>  	} else {
>  		args.type = XFS_ALLOCTYPE_NEAR_BNO;
>  	}
> diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
> index 5c2929f94bd3..172d583c6f1c 100644
> --- a/fs/xfs/libxfs/xfs_defer.c
> +++ b/fs/xfs/libxfs/xfs_defer.c
> @@ -503,7 +503,6 @@ xfs_defer_init(
>  	xfs_fsblock_t			*fbp)
>  {
>  	dop->dop_committed = false;
> -	dop->dop_low = false;
>  	memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes));
>  	*fbp = NULLFSBLOCK;
>  	INIT_LIST_HEAD(&dop->dop_intake);
> diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
> index f6e93ef0bffe..cc1acd1fd4e8 100644
> --- a/fs/xfs/libxfs/xfs_defer.h
> +++ b/fs/xfs/libxfs/xfs_defer.h
> @@ -36,20 +36,6 @@ struct xfs_defer_pending {
>  	unsigned int			dfp_count;	/* # extent items */
>  };
>  
> -/*
> - * Header for deferred operation list.
> - *
> - * dop_low is used by the allocator to activate the lowspace algorithm -
> - * when free space is running low the extent allocator may choose to
> - * allocate an extent from an AG without leaving sufficient space for
> - * a btree split when inserting the new extent.  In this case the allocator
> - * will enable the lowspace algorithm which is supposed to allow further
> - * allocations (such as btree splits and newroots) to allocate from
> - * sequential AGs.  In order to avoid locking AGs out of order the lowspace
> - * algorithm will start searching for free space from AG 0.  If the correct
> - * transaction reservations have been made then this algorithm will eventually
> - * find all the space it needs.
> - */
>  enum xfs_defer_ops_type {
>  	XFS_DEFER_OPS_TYPE_BMAP,
>  	XFS_DEFER_OPS_TYPE_REFCOUNT,
> @@ -62,7 +48,6 @@ enum xfs_defer_ops_type {
>  
>  struct xfs_defer_ops {
>  	bool			dop_committed;	/* did any trans commit? */
> -	bool			dop_low;	/* alloc in low mode */
>  	struct list_head	dop_intake;	/* unlogged pending work */
>  	struct list_head	dop_pending;	/* logged pending work */
>  
> diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
> index 043ca3808ea2..6d61e61c0700 100644
> --- a/fs/xfs/xfs_filestream.c
> +++ b/fs/xfs/xfs_filestream.c
> @@ -390,8 +390,6 @@ xfs_filestream_new_ag(
>  
>  	if (xfs_alloc_is_userdata(ap->datatype))
>  		flags |= XFS_PICK_USERDATA;
> -	if (ap->dfops->dop_low)
> -		flags |= XFS_PICK_LOWSPACE;
>  
>  	err = xfs_filestream_pick_ag(pip, startag, agp, flags, minlen);
>  
> diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
> index cba10daf8391..ab4ce20b00dd 100644
> --- a/fs/xfs/xfs_trace.h
> +++ b/fs/xfs/xfs_trace.h
> @@ -2256,19 +2256,16 @@ DECLARE_EVENT_CLASS(xfs_defer_class,
>  		__field(dev_t, dev)
>  		__field(void *, dop)
>  		__field(bool, committed)
> -		__field(bool, low)
>  	),
>  	TP_fast_assign(
>  		__entry->dev = mp ? mp->m_super->s_dev : 0;
>  		__entry->dop = dop;
>  		__entry->committed = dop->dop_committed;
> -		__entry->low = dop->dop_low;
>  	),
> -	TP_printk("dev %d:%d ops %p committed %d low %d\n",
> +	TP_printk("dev %d:%d ops %p committed %d\n",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  __entry->dop,
> -		  __entry->committed,
> -		  __entry->low)
> +		  __entry->committed)
>  )
>  #define DEFINE_DEFER_EVENT(name) \
>  DEFINE_EVENT(xfs_defer_class, name, \
> @@ -2282,21 +2279,18 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class,
>  		__field(dev_t, dev)
>  		__field(void *, dop)
>  		__field(bool, committed)
> -		__field(bool, low)
>  		__field(int, error)
>  	),
>  	TP_fast_assign(
>  		__entry->dev = mp ? mp->m_super->s_dev : 0;
>  		__entry->dop = dop;
>  		__entry->committed = dop->dop_committed;
> -		__entry->low = dop->dop_low;
>  		__entry->error = error;
>  	),
> -	TP_printk("dev %d:%d ops %p committed %d low %d err %d\n",
> +	TP_printk("dev %d:%d ops %p committed %d err %d\n",
>  		  MAJOR(__entry->dev), MINOR(__entry->dev),
>  		  __entry->dop,
>  		  __entry->committed,
> -		  __entry->low,
>  		  __entry->error)
>  )
>  #define DEFINE_DEFER_ERROR_EVENT(name) \
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 10/10] xfs: remove xfs_bmap_alloc
  2017-04-13  8:05 ` [PATCH 10/10] xfs: remove xfs_bmap_alloc Christoph Hellwig
@ 2017-04-17 18:08   ` Brian Foster
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2017-04-17 18:08 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Thu, Apr 13, 2017 at 10:05:17AM +0200, Christoph Hellwig wrote:
> Just opencode it in the only caller to make the code flow easier to
> follow.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---

Reviewed-by: Brian Foster <bfoster@redhat.com>

>  fs/xfs/libxfs/xfs_alloc.c |  2 +-
>  fs/xfs/libxfs/xfs_bmap.c  | 22 ++++++----------------
>  fs/xfs/libxfs/xfs_bmap.h  |  2 +-
>  3 files changed, 8 insertions(+), 18 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 7486401ccbd3..02326142ccb0 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
> @@ -2606,7 +2606,7 @@ xfs_alloc_vextent(
>  	/*
>  	 * Just fix this up, for the case where the last a.g. is shorter
>  	 * (or there's only one a.g.) and the caller couldn't easily figure
> -	 * that out (xfs_bmap_alloc).
> +	 * that out (xfs_bmap_btalloc).
>  	 */
>  	agsize = mp->m_sb.sb_agblocks;
>  	if (args->maxlen > agsize)
> diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
> index 9e3f0e6a9062..8e94030bcb8f 100644
> --- a/fs/xfs/libxfs/xfs_bmap.c
> +++ b/fs/xfs/libxfs/xfs_bmap.c
> @@ -3842,20 +3842,6 @@ xfs_bmap_btalloc(
>  	return 0;
>  }
>  
> -/*
> - * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
> - * It figures out where to ask the underlying allocator to put the new extent.
> - */
> -STATIC int
> -xfs_bmap_alloc(
> -	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
> -{
> -	if (XFS_IS_REALTIME_INODE(ap->ip) &&
> -	    xfs_alloc_is_userdata(ap->datatype))
> -		return xfs_bmap_rtalloc(ap);
> -	return xfs_bmap_btalloc(ap);
> -}
> -
>  /* Trim extent to fit a logical block range. */
>  void
>  xfs_trim_extent(
> @@ -4273,7 +4259,11 @@ xfs_bmapi_allocate(
>  			return error;
>  	}
>  
> -	error = xfs_bmap_alloc(bma);
> +	if (XFS_IS_REALTIME_INODE(bma->ip) &&
> +	    xfs_alloc_is_userdata(bma->datatype))
> +		error = xfs_bmap_rtalloc(bma);
> +	else
> +		error = xfs_bmap_btalloc(bma);
>  	if (error)
>  		return error;
>  
> @@ -4451,7 +4441,7 @@ xfs_bmapi_write(
>  {
>  	struct xfs_mount	*mp = ip->i_mount;
>  	struct xfs_ifork	*ifp;
> -	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_alloc */
> +	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_*alloc */
>  	xfs_fileoff_t		end;		/* end of mapped file region */
>  	bool			eof = false;	/* after the end of extents */
>  	int			error;		/* error return */
> diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
> index f35a2b2c4f06..6de7d321f8a2 100644
> --- a/fs/xfs/libxfs/xfs_bmap.h
> +++ b/fs/xfs/libxfs/xfs_bmap.h
> @@ -28,7 +28,7 @@ struct xfs_trans;
>  extern kmem_zone_t	*xfs_bmap_free_item_zone;
>  
>  /*
> - * Argument structure for xfs_bmap_alloc.
> + * Argument structure for xfs_bmap_*alloc.
>   */
>  struct xfs_bmalloca {
>  	xfs_fsblock_t		*firstblock; /* i/o first block allocated */
> -- 
> 2.11.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block
  2017-04-17 14:19       ` Brian Foster
@ 2017-04-18  7:54         ` Christoph Hellwig
  2017-04-18 14:18           ` Brian Foster
  0 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-18  7:54 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs

On Mon, Apr 17, 2017 at 10:19:23AM -0400, Brian Foster wrote:
> I don't see anything about setting minleft here that says the allocation
> is required to come from one AG as opposed to that simply being
> preferred.

minleft must be in the same AG because we can't allocate from another
AG in the same transaction.  If we didn't respect this our whole allocator
would break apart..

> Not all bmbt block allocations are tied to extent allocations. This is
> the firstblock == NULLFSBLOCK case after all, which I take it means an
> allocation hasn't yet occurred. IOW, what about other potentially
> record-inserting operations like hole punch, extent conversion, etc.?

Yes, for other ops we might not have allocated anything yet, but we
might have to do more operations later and thus respect the minleft
later.  This is especially bad for directory operations that do
multiple calls to xfs_bmapi_write in the same transaction.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 08/10] xfs: introduce a XFS_BMAPI_BESTEFFORT flag
  2017-04-17 18:08   ` Brian Foster
@ 2017-04-18  7:58     ` Christoph Hellwig
  2017-04-18 14:18       ` Brian Foster
  0 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-18  7:58 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs

On Mon, Apr 17, 2017 at 02:08:08PM -0400, Brian Foster wrote:
> This seems Ok, but I don't think it's very elegant to have callers pass
> a total parameter along with a flag to ignore it. Couldn't we just set
> minleft when total is not used and pass zero from those particular
> callers, or do we actually need to support the !BESTEFFORT && total == 0
> case?

We could do that, in in context of just this patch it would even seem
cleaner.  But for the next merge window I have changes queued up that
remove the total parameter and it's passing along the dir/da_btree
code entirely by looking at the transaction reservation for the
!BESTEFFORT case, which I thikn is even better as a grand scheme.

> (At the very least, we could assert that total == 0 when
> BESTEFFORT is set.)

Sure.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 06/10] xfs: fix bmap minleft calculation
  2017-04-17 14:19   ` Brian Foster
@ 2017-04-18  7:59     ` Christoph Hellwig
  0 siblings, 0 replies; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-18  7:59 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs

On Mon, Apr 17, 2017 at 10:19:39AM -0400, Brian Foster wrote:
> On Thu, Apr 13, 2017 at 10:05:13AM +0200, Christoph Hellwig wrote:
> > We need to set a proper minleft value even if we didn't do a previous
> > allocation in the transaction, as we can't switch to a different AG
> > after allocating the data extent.
> > 
> 
> Hmm, the code currently does set minleft if we haven't done a previous
> allocation (firstblock == NULLFSBLOCK). Do you mean "even if we have
> done a previous allocation?"

Yeah.

> The code seems Ok, but we also currently reserve enough blocks for a
> full btree split in a write transaction and afaict only allocate a
> single extent per-transaction. The current code expects to select an AG
> with those additional blocks available and then not if a subsequent
> allocation occurs (by setting minleft = 0), presumably because the check
> was already made. So I'm wondering if this fixes a problem that has been
> observed or is just cleaning up the code..?

It's fixing a problem that is currently masked by the low space allocator.
Once we remove that just try again without reservations for good luck
mode we need to get all the reservations right.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block
  2017-04-18  7:54         ` Christoph Hellwig
@ 2017-04-18 14:18           ` Brian Foster
  2017-04-25  7:30             ` Christoph Hellwig
  0 siblings, 1 reply; 30+ messages in thread
From: Brian Foster @ 2017-04-18 14:18 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Tue, Apr 18, 2017 at 09:54:55AM +0200, Christoph Hellwig wrote:
> On Mon, Apr 17, 2017 at 10:19:23AM -0400, Brian Foster wrote:
> > I don't see anything about setting minleft here that says the allocation
> > is required to come from one AG as opposed to that simply being
> > preferred.
> 
> minleft must be in the same AG because we can't allocate from another
> AG in the same transaction.  If we didn't respect this our whole allocator
> would break apart..
> 

I'm confused. Didn't we just confirm in the previous email (the part you
trimmed) that multiple AG locking/allocation is safe, so long as locking
occurs in ascending AG order..?

> > Not all bmbt block allocations are tied to extent allocations. This is
> > the firstblock == NULLFSBLOCK case after all, which I take it means an
> > allocation hasn't yet occurred. IOW, what about other potentially
> > record-inserting operations like hole punch, extent conversion, etc.?
> 
> Yes, for other ops we might not have allocated anything yet, but we
> might have to do more operations later and thus respect the minleft
> later.  This is especially bad for directory operations that do
> multiple calls to xfs_bmapi_write in the same transaction.

Fair point. I don't discount that dropping minleft here might be
inappropriate or even harmful for some contexts (that's what I meant by
not having audited all possible codepaths). Rather, my point is that we
apparently do also have some contexts where the minleft retry is
important. E.g., the hole punch example may have successfully allocated
a transaction, reserved a number of blocks that could be across any
number of AGs, dirtied the transaction, and then got here attempting to
allocate blocks only to now fail due to the more restrictive allocation
logic and ultimately shutdown the fs.

IOWs, it sounds like we're potentially playing whack a mole with
allocation failure here, improving likelihood of success in one context
while reducing it in another. Is there something we can do to
conditionally use the retry (perhaps check if the tp is dirty, since at
that point shutdown is inevitable?) rather than remove it, or am I
missing something else as to why this shouldn't be a problem for
contexts that might not have called into the allocator before bmbt block
allocation?

Brian

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 08/10] xfs: introduce a XFS_BMAPI_BESTEFFORT flag
  2017-04-18  7:58     ` Christoph Hellwig
@ 2017-04-18 14:18       ` Brian Foster
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2017-04-18 14:18 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Tue, Apr 18, 2017 at 09:58:01AM +0200, Christoph Hellwig wrote:
> On Mon, Apr 17, 2017 at 02:08:08PM -0400, Brian Foster wrote:
> > This seems Ok, but I don't think it's very elegant to have callers pass
> > a total parameter along with a flag to ignore it. Couldn't we just set
> > minleft when total is not used and pass zero from those particular
> > callers, or do we actually need to support the !BESTEFFORT && total == 0
> > case?
> 
> We could do that, in in context of just this patch it would even seem
> cleaner.  But for the next merge window I have changes queued up that
> remove the total parameter and it's passing along the dir/da_btree
> code entirely by looking at the transaction reservation for the
> !BESTEFFORT case, which I thikn is even better as a grand scheme.
> 

Ok, then there is probably justification for the flag. In which case, I
think the tweak and assertion below at least helps clarify the semantics
of the call.

> > (At the very least, we could assert that total == 0 when
> > BESTEFFORT is set.)
> 
> Sure.

Thanks.

Brian

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block
  2017-04-18 14:18           ` Brian Foster
@ 2017-04-25  7:30             ` Christoph Hellwig
  2017-04-25 12:11               ` Brian Foster
  0 siblings, 1 reply; 30+ messages in thread
From: Christoph Hellwig @ 2017-04-25  7:30 UTC (permalink / raw)
  To: Brian Foster; +Cc: Christoph Hellwig, linux-xfs

On Tue, Apr 18, 2017 at 10:18:19AM -0400, Brian Foster wrote:
> > minleft must be in the same AG because we can't allocate from another
> > AG in the same transaction.  If we didn't respect this our whole allocator
> > would break apart..
> > 
> 
> I'm confused. Didn't we just confirm in the previous email (the part you
> trimmed) that multiple AG locking/allocation is safe, so long as locking
> occurs in ascending AG order..?

Its is.  But we have no way to account for space available in AG N or
higher, so we have to lock us into the same AG.

> > > Not all bmbt block allocations are tied to extent allocations. This is
> > > the firstblock == NULLFSBLOCK case after all, which I take it means an
> > > allocation hasn't yet occurred. IOW, what about other potentially
> > > record-inserting operations like hole punch, extent conversion, etc.?
> > 
> > Yes, for other ops we might not have allocated anything yet, but we
> > might have to do more operations later and thus respect the minleft
> > later.  This is especially bad for directory operations that do
> > multiple calls to xfs_bmapi_write in the same transaction.
> 
> Fair point. I don't discount that dropping minleft here might be
> inappropriate or even harmful for some contexts (that's what I meant by
> not having audited all possible codepaths). Rather, my point is that we
> apparently do also have some contexts where the minleft retry is
> important. E.g., the hole punch example may have successfully allocated
> a transaction, reserved a number of blocks that could be across any
> number of AGs, dirtied the transaction, and then got here attempting to
> allocate blocks only to now fail due to the more restrictive allocation
> logic and ultimately shutdown the fs.

I don't think it's important there, it's just as harmful as everywhere
else.  Say we have a xfs_unmap_extent that requires allocating more than
one new btree block.  If our allocation for the first one goes through due
to the minleft retry only we'll successfully do the first split, and then
fail the second one at which point the transaction is dirty.
If we do however properly respect minleft we'll fail the first allocation
in this case and are better off in the end.  The only downside is that
we might get ENOSPC a little earlier when we might not use up the full
reservation, but at least we never get it with a dirty transaction.

> IOWs, it sounds like we're potentially playing whack a mole with
> allocation failure here, improving likelihood of success in one context
> while reducing it in another. Is there something we can do to
> conditionally use the retry (perhaps check if the tp is dirty, since at
> that point shutdown is inevitable?) rather than remove it, or am I
> missing something else as to why this shouldn't be a problem for
> contexts that might not have called into the allocator before bmbt block
> allocation?

It's not a problem because now all our allocator calls set the right
minleft / total value to make sure subsequent allocations go through.
For BESTEFFORT allocations we calculate minleft on demand for the max
btree allocations, and for all others the caller passes a total value
that is respected by every allocation.

^ permalink raw reply	[flat|nested] 30+ messages in thread

* Re: [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block
  2017-04-25  7:30             ` Christoph Hellwig
@ 2017-04-25 12:11               ` Brian Foster
  0 siblings, 0 replies; 30+ messages in thread
From: Brian Foster @ 2017-04-25 12:11 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs

On Tue, Apr 25, 2017 at 09:30:07AM +0200, Christoph Hellwig wrote:
> On Tue, Apr 18, 2017 at 10:18:19AM -0400, Brian Foster wrote:
> > > minleft must be in the same AG because we can't allocate from another
> > > AG in the same transaction.  If we didn't respect this our whole allocator
> > > would break apart..
> > > 
> > 
> > I'm confused. Didn't we just confirm in the previous email (the part you
> > trimmed) that multiple AG locking/allocation is safe, so long as locking
> > occurs in ascending AG order..?
> 
> Its is.  But we have no way to account for space available in AG N or
> higher, so we have to lock us into the same AG.
> 

It may be true that we don't know whether space is available in higher
AGs. My point is that it seems like we can get here with a dirty
transaction without any assurance that any one AG can satisfy minleft.
Therefore, it appears that the purpose of the low free space allocator
is to try _really hard_ to allocate a block in a context where not doing
so is a catastrophic error for the filesystem.

The logic used above sounds like you are saying that the low free space
allocator can't guarantee an allocation, so we can't use it. I agree
that it may not guarantee an allocation. I'm contending that the low
free space allocator serves a purpose by facilitating allocations in
cases where nothing else ensures a single AG can satisfy minleft from
this context and thus that any allocation failure (even when firstblock
== NULLFSBLOCK) may result in shutdown.

> > > > Not all bmbt block allocations are tied to extent allocations. This is
> > > > the firstblock == NULLFSBLOCK case after all, which I take it means an
> > > > allocation hasn't yet occurred. IOW, what about other potentially
> > > > record-inserting operations like hole punch, extent conversion, etc.?
> > > 
> > > Yes, for other ops we might not have allocated anything yet, but we
> > > might have to do more operations later and thus respect the minleft
> > > later.  This is especially bad for directory operations that do
> > > multiple calls to xfs_bmapi_write in the same transaction.
> > 
> > Fair point. I don't discount that dropping minleft here might be
> > inappropriate or even harmful for some contexts (that's what I meant by
> > not having audited all possible codepaths). Rather, my point is that we
> > apparently do also have some contexts where the minleft retry is
> > important. E.g., the hole punch example may have successfully allocated
> > a transaction, reserved a number of blocks that could be across any
> > number of AGs, dirtied the transaction, and then got here attempting to
> > allocate blocks only to now fail due to the more restrictive allocation
> > logic and ultimately shutdown the fs.
> 
> I don't think it's important there, it's just as harmful as everywhere
> else.  Say we have a xfs_unmap_extent that requires allocating more than
> one new btree block.  If our allocation for the first one goes through due
> to the minleft retry only we'll successfully do the first split, and then
> fail the second one at which point the transaction is dirty.

I understand this case and I don't disagree at all with the principle to
fail early and cleanly rather than risk fs shutdown.

> If we do however properly respect minleft we'll fail the first allocation
> in this case and are better off in the end.  The only downside is that
> we might get ENOSPC a little earlier when we might not use up the full
> reservation, but at least we never get it with a dirty transaction.
> 

As noted in my last email, I don't think it's true that you always fail
here without a dirty transaction. I used the hole punch example because
I observed initial allocations from this context with a dirty
transaction. This is also why I suggested the possibility of restricting
the retry based on the transaction dirty state rather than ripping it
out entirely. Despite the ugliness, do you see a problem with doing
something like that?

> > IOWs, it sounds like we're potentially playing whack a mole with
> > allocation failure here, improving likelihood of success in one context
> > while reducing it in another. Is there something we can do to
> > conditionally use the retry (perhaps check if the tp is dirty, since at
> > that point shutdown is inevitable?) rather than remove it, or am I
> > missing something else as to why this shouldn't be a problem for
> > contexts that might not have called into the allocator before bmbt block
> > allocation?
> 
> It's not a problem because now all our allocator calls set the right
> minleft / total value to make sure subsequent allocations go through.
> For BESTEFFORT allocations we calculate minleft on demand for the max
> btree allocations, and for all others the caller passes a total value
> that is respected by every allocation.

I think I'm not being quite clear and we're arguing past eachother a bit
here. I do not disagree at all that the fail early behavior you describe
above is an ideal tradeoff for not allowing the use of every last block
in the fs before ENOSPC, and in general that is a fine direction to move
in.

Rather, what I'm arguing here is that it is not safe to remove the low
free space allocator until we've dealt with all possible cases where it
could prevent shutdown. AFAICT, it is still possible to get to bmbt
allocation context with something like the following general sequence of
events:

	- filesystem is near full
	- allocate a transaction and reserve N blocks
		- block reservation succeeds, but no single AG has N
		  free blocks (the N reserved blocks are spread out
		  across the fs)
	- start executing a hole punch operation
	- dirty the transaction
	- bmbt block allocation occurs - set minleft = N
		 - allocation fails because no single AG satisfies the N
		   blocks from the tx
	- retry bmbt block allocation - reset minleft and use _FIRST_AG
		- i.e., allocate from any AG starting from AG 0 and
		  activate the low space allocator for any subsequent
		  bmbt allocs
	- subsequent bmbt alklocs should (in theory)[1] succeed because
	  we've reserved N blocks out the fs, and we can allocate one at
	  a time starting from where the previous alloc left off

... and therefore it is not quite safe to rip out the low free space
allocator from this particular context. Note that the transaction is
dirty before we ever attempt the first allocation, so failing the
allocation with minleft = N is not safe.

Also note that I'm not claiming we always get here with a dirty
transaction, only that we can in some cases and in those particular
cases, we may still need the low space allocator to prevent shutdown. It
may be fine to skip the retry and return ENOSPC, as you suggest, if the
transaction is clean.

Brian

[1] I'm not totally confident that a reservation of N blocks means we
can actually allocate N blocks across the entire set of AGs, but that's
not really the point here. The transaction probably overreserves and in
practice we should be able to complete an associated operation when the
reservation succeeds.

> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 30+ messages in thread

end of thread, other threads:[~2017-04-25 12:11 UTC | newest]

Thread overview: 30+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-04-13  8:05 fix space reservations underneath xfs_bmapi_write Christoph Hellwig
2017-04-13  8:05 ` [PATCH 01/10] xfs: introduce xfs_trans_blk_res Christoph Hellwig
2017-04-13 18:28   ` Brian Foster
2017-04-13  8:05 ` [PATCH 02/10] xfs: rewrite xfs_da_grow_inode_int Christoph Hellwig
2017-04-13 18:28   ` Brian Foster
2017-04-13  8:05 ` [PATCH 03/10] xfs: remove the XFS_BMAPI_CONTIG flag Christoph Hellwig
2017-04-13 18:28   ` Brian Foster
2017-04-13  8:05 ` [PATCH 04/10] xfs: remove an unsafe retry in xfs_bmbt_alloc_block Christoph Hellwig
2017-04-13 18:30   ` Brian Foster
2017-04-14  7:46     ` Christoph Hellwig
2017-04-17 14:19       ` Brian Foster
2017-04-18  7:54         ` Christoph Hellwig
2017-04-18 14:18           ` Brian Foster
2017-04-25  7:30             ` Christoph Hellwig
2017-04-25 12:11               ` Brian Foster
2017-04-13  8:05 ` [PATCH 05/10] xfs: remove the total argument to xfs_bmap_local_to_extents Christoph Hellwig
2017-04-17 14:19   ` Brian Foster
2017-04-13  8:05 ` [PATCH 06/10] xfs: fix bmap minleft calculation Christoph Hellwig
2017-04-17 14:19   ` Brian Foster
2017-04-18  7:59     ` Christoph Hellwig
2017-04-13  8:05 ` [PATCH 07/10] xfs: fix space reservation in xfs_bmbt_alloc_block Christoph Hellwig
2017-04-17 14:19   ` Brian Foster
2017-04-13  8:05 ` [PATCH 08/10] xfs: introduce a XFS_BMAPI_BESTEFFORT flag Christoph Hellwig
2017-04-17 18:08   ` Brian Foster
2017-04-18  7:58     ` Christoph Hellwig
2017-04-18 14:18       ` Brian Foster
2017-04-13  8:05 ` [PATCH 09/10] xfs: kill the dop_low flag Christoph Hellwig
2017-04-17 18:08   ` Brian Foster
2017-04-13  8:05 ` [PATCH 10/10] xfs: remove xfs_bmap_alloc Christoph Hellwig
2017-04-17 18:08   ` Brian Foster

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.