* [PATCHSET 0/2] sync discard
@ 2018-04-30 15:32 Jens Axboe
  2018-04-30 15:32 ` [PATCH 1/2] block: add BLKDEV_DISCARD_SYNC flag Jens Axboe
                   ` (2 more replies)
  0 siblings, 3 replies; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 15:32 UTC (permalink / raw)
  To: linux-xfs, linux-block; +Cc: hch

We recently had a pretty severe perf regression with the XFS async
discard. This small series adds a BLKDEV_DISCARD_SYNC issue flag, and also
limits the chain size for sync discards. Patch 2 adds support for
reverting XFS back to doing sync discards.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* [PATCH 1/2] block: add BLKDEV_DISCARD_SYNC flag
  2018-04-30 15:32 [PATCHSET 0/2] sync discard Jens Axboe
@ 2018-04-30 15:32 ` Jens Axboe
  2018-04-30 15:32 ` [PATCH 2/2] xfs: add 'discard_sync' mount flag Jens Axboe
  2018-05-02 12:45 ` [PATCHSET 0/2] sync discard Christoph Hellwig
  2 siblings, 0 replies; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 15:32 UTC (permalink / raw)
  To: linux-xfs, linux-block; +Cc: hch, Jens Axboe

By default, blkdev_issue_discard() will build potentially big
bio chains, even when the discard size has been limited by the
user. Add a BLKDEV_DISCARD_SYNC flag, telling blkdev_issue_discard()
to wait for completion of each discard.
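
As an illustration only (the helper below is hypothetical and not part
of this patch), a caller that wants each discard chunk waited on before
the next one is issued would do something like:

#include <linux/blkdev.h>

/* hypothetical caller: discard a range, waiting on each chunk */
static int example_trim_range(struct block_device *bdev, sector_t sector,
			      sector_t nr_sects)
{
	/*
	 * With BLKDEV_DISCARD_SYNC, each piece (capped at the queue's
	 * max_discard_sectors) completes before the next is sent.
	 */
	return blkdev_issue_discard(bdev, sector, nr_sects, GFP_NOFS,
				    BLKDEV_DISCARD_SYNC);
}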

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-lib.c        | 33 ++++++++++++++++++++++++---------
 include/linux/blkdev.h |  1 +
 2 files changed, 25 insertions(+), 9 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index a676084d4740..1d0263c13c9c 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -11,13 +11,18 @@
 #include "blk.h"
 
 static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
-		gfp_t gfp)
+			    gfp_t gfp, bool do_sync)
 {
 	struct bio *new = bio_alloc(gfp, nr_pages);
 
 	if (bio) {
-		bio_chain(bio, new);
-		submit_bio(bio);
+		if (do_sync) {
+			submit_bio_wait(bio);
+			bio_put(bio);
+		} else {
+			bio_chain(bio, new);
+			submit_bio(bio);
+		}
 	}
 
 	return new;
@@ -30,7 +35,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	struct request_queue *q = bdev_get_queue(bdev);
 	struct bio *bio = *biop;
 	unsigned int granularity;
-	unsigned int op;
+	unsigned int op, max_sectors;
 	int alignment;
 	sector_t bs_mask;
 
@@ -58,12 +63,17 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 	granularity = max(q->limits.discard_granularity >> 9, 1U);
 	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
 
+	if (flags & BLKDEV_DISCARD_SYNC)
+		max_sectors = q->limits.max_discard_sectors;
+	else
+		max_sectors = UINT_MAX >> 9;
+
 	while (nr_sects) {
 		unsigned int req_sects;
 		sector_t end_sect, tmp;
 
 		/* Make sure bi_size doesn't overflow */
-		req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9);
+		req_sects = min_t(sector_t, nr_sects, max_sectors);
 
 		/**
 		 * If splitting a request, and the next starting sector would be
@@ -79,7 +89,7 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 			req_sects = end_sect - sector;
 		}
 
-		bio = next_bio(bio, 0, gfp_mask);
+		bio = next_bio(bio, 0, gfp_mask, flags & BLKDEV_DISCARD_SYNC);
 		bio->bi_iter.bi_sector = sector;
 		bio_set_dev(bio, bdev);
 		bio_set_op_attrs(bio, op, 0);
@@ -97,6 +107,11 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		cond_resched();
 	}
 
+	if (bio && (flags & BLKDEV_DISCARD_SYNC)) {
+		submit_bio_wait(bio);
+		bio_put(bio);
+		bio = NULL;
+	}
 	*biop = bio;
 	return 0;
 }
@@ -173,7 +188,7 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 	max_write_same_sectors = UINT_MAX >> 9;
 
 	while (nr_sects) {
-		bio = next_bio(bio, 1, gfp_mask);
+		bio = next_bio(bio, 1, gfp_mask, false);
 		bio->bi_iter.bi_sector = sector;
 		bio_set_dev(bio, bdev);
 		bio->bi_vcnt = 1;
@@ -249,7 +264,7 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 		return -EOPNOTSUPP;
 
 	while (nr_sects) {
-		bio = next_bio(bio, 0, gfp_mask);
+		bio = next_bio(bio, 0, gfp_mask, false);
 		bio->bi_iter.bi_sector = sector;
 		bio_set_dev(bio, bdev);
 		bio->bi_opf = REQ_OP_WRITE_ZEROES;
@@ -301,7 +316,7 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
 
 	while (nr_sects != 0) {
 		bio = next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
-			       gfp_mask);
+			       gfp_mask, false);
 		bio->bi_iter.bi_sector = sector;
 		bio_set_dev(bio, bdev);
 		bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5c4eee043191..e90388004029 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1396,6 +1396,7 @@ extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 
 #define BLKDEV_DISCARD_SECURE	(1 << 0)	/* issue a secure erase */
+#define BLKDEV_DISCARD_SYNC	(1 << 1)
 
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 15:32 [PATCHSET 0/2] sync discard Jens Axboe
  2018-04-30 15:32 ` [PATCH 1/2] block: add BLKDEV_DISCARD_SYNC flag Jens Axboe
@ 2018-04-30 15:32 ` Jens Axboe
  2018-04-30 17:19   ` Brian Foster
  2018-04-30 21:31   ` Dave Chinner
  2018-05-02 12:45 ` [PATCHSET 0/2] sync discard Christoph Hellwig
  2 siblings, 2 replies; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 15:32 UTC (permalink / raw)
  To: linux-xfs, linux-block; +Cc: hch, Jens Axboe

XFS recently added support for async discards. While this can be
a win for some workloads and devices, there are also cases where
async bursty discard will severely harm the latencies of reads
and writes.

Add a 'discard_sync' mount flag to revert to using sync discard,
issuing them one at a time and waiting for each one. This fixes
a big performance regression we had moving to kernels that include
the XFS async discard support.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
---
 fs/xfs/xfs_log_cil.c | 18 ++++++++++++++----
 fs/xfs/xfs_mount.h   |  1 +
 fs/xfs/xfs_super.c   |  7 ++++++-
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index 4668403b1741..4eced3eceea5 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -552,10 +552,16 @@ xlog_discard_busy_extents(
 	struct bio		*bio = NULL;
 	struct blk_plug		plug;
 	int			error = 0;
+	int			discard_flags = 0;
+	bool			sync_discard = mp->m_flags & XFS_MOUNT_DISCARD_SYNC;
 
 	ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
 
-	blk_start_plug(&plug);
+	if (!sync_discard)
+		blk_start_plug(&plug);
+	else
+		discard_flags = BLKDEV_DISCARD_SYNC;
+
 	list_for_each_entry(busyp, list, list) {
 		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
 					 busyp->length);
@@ -563,7 +569,7 @@ xlog_discard_busy_extents(
 		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
 				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
 				XFS_FSB_TO_BB(mp, busyp->length),
-				GFP_NOFS, 0, &bio);
+				GFP_NOFS, discard_flags, &bio);
 		if (error && error != -EOPNOTSUPP) {
 			xfs_info(mp,
 	 "discard failed for extent [0x%llx,%u], error %d",
@@ -574,14 +580,18 @@ xlog_discard_busy_extents(
 		}
 	}
 
-	if (bio) {
+	if (sync_discard) {
+		xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
+		kmem_free(ctx);
+	} else if (bio) {
 		bio->bi_private = ctx;
 		bio->bi_end_io = xlog_discard_endio;
 		submit_bio(bio);
+		blk_finish_plug(&plug);
 	} else {
 		xlog_discard_endio_work(&ctx->discard_endio_work);
+		blk_finish_plug(&plug);
 	}
-	blk_finish_plug(&plug);
 }
 
 /*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 10b90bbc5162..3f0b7b9106c7 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -219,6 +219,7 @@ typedef struct xfs_mount {
 						   operations, typically for
 						   disk errors in metadata */
 #define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
+#define XFS_MOUNT_DISCARD_SYNC	(1ULL << 6)	/* discard unused blocks sync */
 #define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
 						   allocations */
 #define XFS_MOUNT_ATTR2		(1ULL << 8)	/* allow use of attr2 format */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index d71424052917..6d960bb4725f 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -83,7 +83,7 @@ enum {
 	Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota,
 	Opt_uquota, Opt_gquota, Opt_pquota,
 	Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
-	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
+	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, Opt_discard_sync,
 };
 
 static const match_table_t tokens = {
@@ -129,6 +129,7 @@ static const match_table_t tokens = {
 	{Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
 	{Opt_qnoenforce, "qnoenforce"},	/* same as uqnoenforce */
 	{Opt_discard,	"discard"},	/* Discard unused blocks */
+	{Opt_discard_sync,"discard_sync"},/* Discard unused blocks sync */
 	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
 
 	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
@@ -363,11 +364,15 @@ xfs_parseargs(
 			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
 			mp->m_qflags &= ~XFS_GQUOTA_ENFD;
 			break;
+		case Opt_discard_sync:
+			mp->m_flags |= XFS_MOUNT_DISCARD_SYNC;
+			/* fall through and set XFS_MOUNT_DISCARD too */
 		case Opt_discard:
 			mp->m_flags |= XFS_MOUNT_DISCARD;
 			break;
 		case Opt_nodiscard:
 			mp->m_flags &= ~XFS_MOUNT_DISCARD;
+			mp->m_flags &= ~XFS_MOUNT_DISCARD_SYNC;
 			break;
 #ifdef CONFIG_FS_DAX
 		case Opt_dax:
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 15:32 ` [PATCH 2/2] xfs: add 'discard_sync' mount flag Jens Axboe
@ 2018-04-30 17:19   ` Brian Foster
  2018-04-30 18:07     ` Jens Axboe
  2018-04-30 21:31   ` Dave Chinner
  1 sibling, 1 reply; 27+ messages in thread
From: Brian Foster @ 2018-04-30 17:19 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-xfs, linux-block, hch

On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> XFS recently added support for async discards. While this can be
> a win for some workloads and devices, there are also cases where
> async bursty discard will severely harm the latencies of reads
> and writes.
> 
> Add a 'discard_sync' mount flag to revert to using sync discard,
> issuing them one at a time and waiting for each one. This fixes
> a big performance regression we had moving to kernels that include
> the XFS async discard support.
> 
> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> ---

Hm, I figured the async discard stuff would have been a pretty clear win
all around, but then again I'm not terribly familiar with what happens
with discards beneath the fs. I do know that the previous behavior would
cause fs level latencies due to holding up log I/O completion while
discards completed one at a time. My understanding is that this led to
online discard being pretty much universally "not recommended" in favor
of fstrim.

Do you have any more data around the workload where the old sync discard
behavior actually provides an overall win over the newer behavior? Is it
purely a matter of workload, or is it a workload+device thing with how
discards may be handled by certain devices?

I'm ultimately not against doing this if it's useful for somebody and is
otherwise buried under a mount option, but it would be nice to see if
there's opportunity to improve the async mechanism before resorting to
that. Is the issue here too large bio chains, too many chains at once,
or just simply too many discards (regardless of chaining) at the same
time?

I'm wondering if xfs could separate discard submission from log I/O
completion entirely and then perhaps limit/throttle submission somehow
or another (Christoph, thoughts?) via a background task. Perhaps doing
something like that may even eliminate the need for some discards on
busy filesystems with a lot of block free -> reallocation activity, but
I'm just handwaving atm.

Brian

>  fs/xfs/xfs_log_cil.c | 18 ++++++++++++++----
>  fs/xfs/xfs_mount.h   |  1 +
>  fs/xfs/xfs_super.c   |  7 ++++++-
>  3 files changed, 21 insertions(+), 5 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index 4668403b1741..4eced3eceea5 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -552,10 +552,16 @@ xlog_discard_busy_extents(
>  	struct bio		*bio = NULL;
>  	struct blk_plug		plug;
>  	int			error = 0;
> +	int			discard_flags = 0;
> +	bool			sync_discard = mp->m_flags & XFS_MOUNT_DISCARD_SYNC;
>  
>  	ASSERT(mp->m_flags & XFS_MOUNT_DISCARD);
>  
> -	blk_start_plug(&plug);
> +	if (!sync_discard)
> +		blk_start_plug(&plug);
> +	else
> +		discard_flags = BLKDEV_DISCARD_SYNC;
> +
>  	list_for_each_entry(busyp, list, list) {
>  		trace_xfs_discard_extent(mp, busyp->agno, busyp->bno,
>  					 busyp->length);
> @@ -563,7 +569,7 @@ xlog_discard_busy_extents(
>  		error = __blkdev_issue_discard(mp->m_ddev_targp->bt_bdev,
>  				XFS_AGB_TO_DADDR(mp, busyp->agno, busyp->bno),
>  				XFS_FSB_TO_BB(mp, busyp->length),
> -				GFP_NOFS, 0, &bio);
> +				GFP_NOFS, discard_flags, &bio);
>  		if (error && error != -EOPNOTSUPP) {
>  			xfs_info(mp,
>  	 "discard failed for extent [0x%llx,%u], error %d",
> @@ -574,14 +580,18 @@ xlog_discard_busy_extents(
>  		}
>  	}
>  
> -	if (bio) {
> +	if (sync_discard) {
> +		xfs_extent_busy_clear(mp, &ctx->busy_extents, false);
> +		kmem_free(ctx);
> +	} else if (bio) {
>  		bio->bi_private = ctx;
>  		bio->bi_end_io = xlog_discard_endio;
>  		submit_bio(bio);
> +		blk_finish_plug(&plug);
>  	} else {
>  		xlog_discard_endio_work(&ctx->discard_endio_work);
> +		blk_finish_plug(&plug);
>  	}
> -	blk_finish_plug(&plug);
>  }
>  
>  /*
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 10b90bbc5162..3f0b7b9106c7 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -219,6 +219,7 @@ typedef struct xfs_mount {
>  						   operations, typically for
>  						   disk errors in metadata */
>  #define XFS_MOUNT_DISCARD	(1ULL << 5)	/* discard unused blocks */
> +#define XFS_MOUNT_DISCARD_SYNC	(1ULL << 6)	/* discard unused blocks sync */
>  #define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
>  						   allocations */
>  #define XFS_MOUNT_ATTR2		(1ULL << 8)	/* allow use of attr2 format */
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index d71424052917..6d960bb4725f 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -83,7 +83,7 @@ enum {
>  	Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota,
>  	Opt_uquota, Opt_gquota, Opt_pquota,
>  	Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce,
> -	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err,
> +	Opt_discard, Opt_nodiscard, Opt_dax, Opt_err, Opt_discard_sync,
>  };
>  
>  static const match_table_t tokens = {
> @@ -129,6 +129,7 @@ static const match_table_t tokens = {
>  	{Opt_pqnoenforce,"pqnoenforce"},/* project quota limit enforcement */
>  	{Opt_qnoenforce, "qnoenforce"},	/* same as uqnoenforce */
>  	{Opt_discard,	"discard"},	/* Discard unused blocks */
> +	{Opt_discard_sync,"discard_sync"},/* Discard unused blocks sync */
>  	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
>  
>  	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
> @@ -363,11 +364,15 @@ xfs_parseargs(
>  			mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
>  			mp->m_qflags &= ~XFS_GQUOTA_ENFD;
>  			break;
> +		case Opt_discard_sync:
> +			mp->m_flags |= XFS_MOUNT_DISCARD_SYNC;
> +			/* fall through and set XFS_MOUNT_DISCARD too */
>  		case Opt_discard:
>  			mp->m_flags |= XFS_MOUNT_DISCARD;
>  			break;
>  		case Opt_nodiscard:
>  			mp->m_flags &= ~XFS_MOUNT_DISCARD;
> +			mp->m_flags &= ~XFS_MOUNT_DISCARD_SYNC;
>  			break;
>  #ifdef CONFIG_FS_DAX
>  		case Opt_dax:
> -- 
> 2.7.4
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 17:19   ` Brian Foster
@ 2018-04-30 18:07     ` Jens Axboe
  2018-04-30 18:25       ` Luis R. Rodriguez
  2018-04-30 19:18       ` Brian Foster
  0 siblings, 2 replies; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 18:07 UTC (permalink / raw)
  To: Brian Foster; +Cc: linux-xfs, linux-block, hch

On 4/30/18 11:19 AM, Brian Foster wrote:
> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>> XFS recently added support for async discards. While this can be
>> a win for some workloads and devices, there are also cases where
>> async bursty discard will severely harm the latencies of reads
>> and writes.
>>
>> Add a 'discard_sync' mount flag to revert to using sync discard,
>> issuing them one at a time and waiting for each one. This fixes
>> a big performance regression we had moving to kernels that include
>> the XFS async discard support.
>>
>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>> ---
> 
> Hm, I figured the async discard stuff would have been a pretty clear win
> all around, but then again I'm not terribly familiar with what happens
> with discards beneath the fs. I do know that the previous behavior would
> cause fs level latencies due to holding up log I/O completion while
> discards completed one at a time. My understanding is that this led to
> online discard being pretty much universally "not recommended" in favor
> of fstrim.

It's not a secret that most devices suck at discard. While the async
discard is nifty and I bet works well for some cases, it can also cause
a flood of discards on the device side which does not work well for
other cases.

> Do you have any more data around the workload where the old sync discard
> behavior actually provides an overall win over the newer behavior? Is it
> purely a matter of workload, or is it a workload+device thing with how
> discards may be handled by certain devices?

The worse read latencies were observed on more than one device type,
making it sync again was universally a win. We've had many issues
with discard, one trick that is often used is to chop up file deletion
into smaller chunks. Say you want to kill 10GB of data, you do it
incrementally, since 10G of discard usually doesn't behave very nicely.
If you make that async, then you're back to square one.
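
Roughly, the incremental trick looks like the sketch below (the helper
name and the 64MB chunk size are made up for illustration, they're not
from this series):

#include <linux/blkdev.h>
#include <linux/kernel.h>

/* sketch: chop one large discard into fixed-size synchronous pieces */
static int discard_in_chunks(struct block_device *bdev, sector_t sector,
			     sector_t nr_sects)
{
	const sector_t chunk = 64 * 2048;	/* 64MB in 512-byte sectors */
	int ret = 0;

	while (nr_sects && !ret) {
		sector_t len = min_t(sector_t, nr_sects, chunk);

		/* waits for this piece before issuing the next one */
		ret = blkdev_issue_discard(bdev, sector, len, GFP_NOFS, 0);
		sector += len;
		nr_sects -= len;
	}
	return ret;
}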

> I'm ultimately not against doing this if it's useful for somebody and is
> otherwise buried under a mount option, but it would be nice to see if
> there's opportunity to improve the async mechanism before resorting to
> that. Is the issue here too large bio chains, too many chains at once,
> or just simply too many discards (regardless of chaining) at the same
> time?

Well, ultimately you'd need better scheduling of the discards, but for
most devices what really works the best is a simple "just do one at
the time". The size constraint was added to further limit the impact.

Honestly, I think the only real improvement would be on the device
side. Most folks want discard as an advisory hint, and it should not
impact current workloads at all. In reality, many implementations
are much more strict and even include substantial flash writes. For
the cases where we can't just turn it off (I'd love to), we at least
need to make it less intrusive.

> I'm wondering if xfs could separate discard submission from log I/O
> completion entirely and then perhaps limit/throttle submission somehow
> or another (Christoph, thoughts?) via a background task. Perhaps doing
> something like that may even eliminate the need for some discards on
> busy filesystems with a lot of block free -> reallocation activity, but
> I'm just handwaving atm.

Just having the sync vs async option is the best split imho. The async
could potentially be scheduled. I don't think more involved logic
belongs in the fs.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 18:07     ` Jens Axboe
@ 2018-04-30 18:25       ` Luis R. Rodriguez
  2018-04-30 18:31         ` Jens Axboe
  2018-04-30 19:19         ` Eric Sandeen
  2018-04-30 19:18       ` Brian Foster
  1 sibling, 2 replies; 27+ messages in thread
From: Luis R. Rodriguez @ 2018-04-30 18:25 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Brian Foster, linux-xfs, linux-block, hch

On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
> On 4/30/18 11:19 AM, Brian Foster wrote:
> > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> >> XFS recently added support for async discards. While this can be
> >> a win for some workloads and devices, there are also cases where
> >> async bursty discard will severely harm the latencies of reads
> >> and writes.
> >>
> >> Add a 'discard_sync' mount flag to revert to using sync discard,
> >> issuing them one at a time and waiting for each one. This fixes
> >> a big performance regression we had moving to kernels that include
> >> the XFS async discard support.
> >>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >> ---
> > 
> > Hm, I figured the async discard stuff would have been a pretty clear win
> > all around, but then again I'm not terribly familiar with what happens
> > with discards beneath the fs. I do know that the previous behavior would
> > cause fs level latencies due to holding up log I/O completion while
> > discards completed one at a time. My understanding is that this led to
> > online discard being pretty much universally "not recommended" in favor
> > of fstrim.
> 
> It's not a secret that most devices suck at discard.

How can we know if a device sucks at discard?

> While the async
> discard is nifty and I bet works well for some cases, it can also cause
> a flood of discards on the device side which does not work well for
> other cases.

Shouldn't async then be only enabled if the device used supports it well?
Or should a blacklist per device be more suitable? Which is more popular?

  Luis

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 18:25       ` Luis R. Rodriguez
@ 2018-04-30 18:31         ` Jens Axboe
  2018-04-30 19:19         ` Eric Sandeen
  1 sibling, 0 replies; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 18:31 UTC (permalink / raw)
  To: Luis R. Rodriguez; +Cc: Brian Foster, linux-xfs, linux-block, hch

On 4/30/18 12:25 PM, Luis R. Rodriguez wrote:
> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
>> On 4/30/18 11:19 AM, Brian Foster wrote:
>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>> XFS recently added support for async discards. While this can be
>>>> a win for some workloads and devices, there are also cases where
>>>> async bursty discard will severely harm the latencies of reads
>>>> and writes.
>>>>
>>>> Add a 'discard_sync' mount flag to revert to using sync discard,
>>>> issuing them one at a time and waiting for each one. This fixes
>>>> a big performance regression we had moving to kernels that include
>>>> the XFS async discard support.
>>>>
>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>> ---
>>>
>>> Hm, I figured the async discard stuff would have been a pretty clear win
>>> all around, but then again I'm not terribly familiar with what happens
>>> with discards beneath the fs. I do know that the previous behavior would
>>> cause fs level latencies due to holding up log I/O completion while
>>> discards completed one at a time. My understanding is that this led to
>>> online discard being pretty much universally "not recommended" in favor
>>> of fstrim.
>>
>> It's not a secret that most devices suck at discard.
> 
> How can we know if a device sucks at discard?

This test usually works well - does it support discard? Then it probably
sucks :-)

But seriously, synthetic test case with reads/writes (the actual workload)
and then mix in trims. If the performance suffers, then discards suck.
Just consider this series - it was a 25% latency win.
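
As a rough illustration, a tiny userspace probe like the one below run
next to the real workload is usually enough to show it. This is just a
sketch, nothing official, and it destroys data in the range it trims,
so point it at a scratch device:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <time.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* WARNING: discards (and so destroys) the first 64MB of the device */
	uint64_t range[2] = { 0, 64ULL << 20 };		/* offset, length */
	struct timespec t0, t1;
	int fd;

	if (argc < 2) {
		fprintf(stderr, "usage: %s <scratch blockdev>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	clock_gettime(CLOCK_MONOTONIC, &t0);
	if (ioctl(fd, BLKDISCARD, &range) < 0)
		perror("BLKDISCARD");
	clock_gettime(CLOCK_MONOTONIC, &t1);
	printf("discard of %llu bytes took %ld usec\n",
	       (unsigned long long) range[1],
	       (t1.tv_sec - t0.tv_sec) * 1000000L +
	       (t1.tv_nsec - t0.tv_nsec) / 1000L);
	close(fd);
	return 0;
}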

>> While the async
>> discard is nifty and I bet works well for some cases, it can also cause
>> a flood of discards on the device side which does not work well for
>> other cases.
> 
> Shouldn't async then be only enabled if the device used supports it well?
> Or should a blacklist per device be more suitable? Which is more popular?

You'd be left with nothing... My general recommendation is to not use
discards at all, unless there's a proven case that it makes a write
amplification difference for you - from at all, to "big enough that I'd
suffer the latency consequences".

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 18:07     ` Jens Axboe
  2018-04-30 18:25       ` Luis R. Rodriguez
@ 2018-04-30 19:18       ` Brian Foster
  1 sibling, 0 replies; 27+ messages in thread
From: Brian Foster @ 2018-04-30 19:18 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-xfs, linux-block, hch

On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
> On 4/30/18 11:19 AM, Brian Foster wrote:
> > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> >> XFS recently added support for async discards. While this can be
> >> a win for some workloads and devices, there are also cases where
> >> async bursty discard will severely harm the latencies of reads
> >> and writes.
> >>
> >> Add a 'discard_sync' mount flag to revert to using sync discard,
> >> issuing them one at a time and waiting for each one. This fixes
> >> a big performance regression we had moving to kernels that include
> >> the XFS async discard support.
> >>
> >> Signed-off-by: Jens Axboe <axboe@kernel.dk>
> >> ---
> > 
> > Hm, I figured the async discard stuff would have been a pretty clear win
> > all around, but then again I'm not terribly familiar with what happens
> > with discards beneath the fs. I do know that the previous behavior would
> > cause fs level latencies due to holding up log I/O completion while
> > discards completed one at a time. My understanding is that this led to
> > online discard being pretty much universally "not recommended" in favor
> > of fstrim.
> 
> It's not a secret that most devices suck at discard. While the async
> discard is nifty and I bet works well for some cases, it can also cause
> a flood of discards on the device side which does not work well for
> other cases.
> 

Heh, Ok.

> > Do you have any more data around the workload where the old sync discard
> > behavior actually provides an overall win over the newer behavior? Is it
> > purely a matter of workload, or is it a workload+device thing with how
> > discards may be handled by certain devices?
> 
> The worse read latencies were observed on more than one device type,
> making it sync again was universally a win. We've had many issues
> with discard, one trick that is often used is to chop up file deletion
> into smaller chunks. Say you want to kill 10GB of data, you do it
> incrementally, since 10G of discard usually doesn't behave very nicely.
> If you make that async, then you're back to square one.
> 

Makes sense, so there's not much win in chopping up huge discard ranges
into smaller, async requests that cover the same overall size/range..

> > I'm ultimately not against doing this if it's useful for somebody and is
> > otherwise buried under a mount option, but it would be nice to see if
> > there's opportunity to improve the async mechanism before resorting to
> > that. Is the issue here too large bio chains, too many chains at once,
> > or just simply too many discards (regardless of chaining) at the same
> > time?
> 
> Well, ultimately you'd need better scheduling of the discards, but for
> most devices what really works the best is a simple "just do one at
> the time". The size constraint was added to further limit the impact.
> 

... but presumably there is some value in submitting some number of
requests together provided they adhere to some size constraint..? Is
there a typical size constraint for the average ssd, or is this value
all over the place? (Is there a field somewhere in the bdev that the fs
can query?)

(I guess I'll defer to Christoph's input on this, I assume he measured
some kind of improvement in the previous async work..)

> Honestly, I think the only real improvement would be on the device
> side. Most folks want discard as an advisory hint, and it should not
> impact current workloads at all. In reality, many implementations
> are much more strict and even include substantial flash writes. For
> the cases where we can't just turn it off (I'd love to), we at least
> need to make it less intrusive.
> 
> > I'm wondering if xfs could separate discard submission from log I/O
> > completion entirely and then perhaps limit/throttle submission somehow
> > or another (Christoph, thoughts?) via a background task. Perhaps doing
> > something like that may even eliminate the need for some discards on
> > busy filesystems with a lot of block free -> reallocation activity, but
> > I'm just handwaving atm.
> 
> Just having the sync vs async option is the best split imho. The async
> could potentially be scheduled. I don't think more involved logic
> belongs in the fs.
> 

The more interesting part to me is whether we can safely separate
discard from log I/O completion in XFS. Then we can release the log
buffer locks and whatnot and let the fs proceed without waiting on any
number of discards to complete. In theory, I think the background task
could issue discards one at a time (or N at a time, or N blocks at a
time, whatever..) without putting us back in a place where discards hold
up the log and subsequently lock up the rest of the fs.

If that's possible, then the whole sync/async thing is more of an
implementation detail and we have no need for separate mount options for
users to try and grok.
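
As a rough sketch of what I'm handwaving about (all of the names below
are hypothetical, this isn't proposed XFS code):

#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct bg_extent {
	struct list_head	list;
	struct block_device	*bdev;
	sector_t		start;
	sector_t		len;
};

/*
 * bg->lock, bg->pending and bg->work would be set up once with
 * spin_lock_init(), INIT_LIST_HEAD() and INIT_WORK(..., bg_discard_work).
 */
struct bg_discard {
	spinlock_t		lock;
	struct list_head	pending;
	struct work_struct	work;
};

/* worker: drain the queue, one synchronous discard at a time */
static void bg_discard_work(struct work_struct *work)
{
	struct bg_discard *bg = container_of(work, struct bg_discard, work);
	struct bg_extent *ext, *n;
	LIST_HEAD(batch);

	spin_lock(&bg->lock);
	list_splice_init(&bg->pending, &batch);
	spin_unlock(&bg->lock);

	list_for_each_entry_safe(ext, n, &batch, list) {
		/* discards are advisory, so errors are simply dropped */
		blkdev_issue_discard(ext->bdev, ext->start, ext->len,
				     GFP_NOFS, 0);
		list_del(&ext->list);
		kfree(ext);
	}
}

/* called from log I/O completion: queue the extent and return at once */
static void bg_discard_queue(struct bg_discard *bg, struct bg_extent *ext)
{
	spin_lock(&bg->lock);
	list_add_tail(&ext->list, &bg->pending);
	spin_unlock(&bg->lock);
	schedule_work(&bg->work);
}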

Brian

> -- 
> Jens Axboe
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 18:25       ` Luis R. Rodriguez
  2018-04-30 18:31         ` Jens Axboe
@ 2018-04-30 19:19         ` Eric Sandeen
  2018-04-30 19:21           ` Jens Axboe
  1 sibling, 1 reply; 27+ messages in thread
From: Eric Sandeen @ 2018-04-30 19:19 UTC (permalink / raw)
  To: Luis R. Rodriguez, Jens Axboe; +Cc: Brian Foster, linux-xfs, linux-block, hch



On 4/30/18 1:25 PM, Luis R. Rodriguez wrote:
> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
>> On 4/30/18 11:19 AM, Brian Foster wrote:
>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>> XFS recently added support for async discards. While this can be
>>>> a win for some workloads and devices, there are also cases where
>>>> async bursty discard will severely harm the latencies of reads
>>>> and writes.
>>>>
>>>> Add a 'discard_sync' mount flag to revert to using sync discard,
>>>> issuing them one at a time and waiting for each one. This fixes
>>>> a big performance regression we had moving to kernels that include
>>>> the XFS async discard support.
>>>>
>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>> ---
>>>
>>> Hm, I figured the async discard stuff would have been a pretty clear win
>>> all around, but then again I'm not terribly familiar with what happens
>>> with discards beneath the fs. I do know that the previous behavior would
>>> cause fs level latencies due to holding up log I/O completion while
>>> discards completed one at a time. My understanding is that this led to
>>> online discard being pretty much universally "not recommended" in favor
>>> of fstrim.
>>
>> It's not a secret that most devices suck at discard.
> 
> How can we know if a device sucks at discard?

I was going to ask the same thing.  ;)  "Meh, punt to the admin!"

I'm having deja vu but can't remember why.  Seems like this has come up
before and we thought it should be a block device tunable, not pushed down
from the filesystem.  Is that possible?

-Eric

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 19:19         ` Eric Sandeen
@ 2018-04-30 19:21           ` Jens Axboe
  2018-04-30 19:57             ` Eric Sandeen
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 19:21 UTC (permalink / raw)
  To: Eric Sandeen, Luis R. Rodriguez; +Cc: Brian Foster, linux-xfs, linux-block, hch

On 4/30/18 1:19 PM, Eric Sandeen wrote:
> 
> 
> On 4/30/18 1:25 PM, Luis R. Rodriguez wrote:
>> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
>>> On 4/30/18 11:19 AM, Brian Foster wrote:
>>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>>> XFS recently added support for async discards. While this can be
>>>>> a win for some workloads and devices, there are also cases where
>>>>> async bursty discard will severely harm the latencies of reads
>>>>> and writes.
>>>>>
>>>>> Add a 'discard_sync' mount flag to revert to using sync discard,
>>>>> issuing them one at a time and waiting for each one. This fixes
>>>>> a big performance regression we had moving to kernels that include
>>>>> the XFS async discard support.
>>>>>
>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>> ---
>>>>
>>>> Hm, I figured the async discard stuff would have been a pretty clear win
>>>> all around, but then again I'm not terribly familiar with what happens
>>>> with discards beneath the fs. I do know that the previous behavior would
>>>> cause fs level latencies due to holding up log I/O completion while
>>>> discards completed one at a time. My understanding is that this led to
>>>> online discard being pretty much universally "not recommended" in favor
>>>> of fstrim.
>>>
>>> It's not a secret that most devices suck at discard.
>>
>> How can we know if a device sucks at discard?
> 
> I was going to ask the same thing.  ;)  "Meh, punt to the admin!"
> 
> I'm having deja vu but can't remember why.  Seems like this has come up
> before and we thought it should be a block device tunable, not pushed down
> from the filesystem.  Is that possible?

The problem is that it'll depend on the workload as well. The device in
my laptop is fine with discard for my workload, which is very light.
But if you are running RocksDB on it, and doing heavy compactions and
deletes, it probably would not be.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 19:21           ` Jens Axboe
@ 2018-04-30 19:57             ` Eric Sandeen
  2018-04-30 19:58               ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Eric Sandeen @ 2018-04-30 19:57 UTC (permalink / raw)
  To: Jens Axboe, Luis R. Rodriguez; +Cc: Brian Foster, linux-xfs, linux-block, hch



On 4/30/18 2:21 PM, Jens Axboe wrote:
> On 4/30/18 1:19 PM, Eric Sandeen wrote:
>>
>>
>> On 4/30/18 1:25 PM, Luis R. Rodriguez wrote:
>>> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
>>>> On 4/30/18 11:19 AM, Brian Foster wrote:
>>>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>>>> XFS recently added support for async discards. While this can be
>>>>>> a win for some workloads and devices, there are also cases where
>>>>>> async bursty discard will severely harm the latencies of reads
>>>>>> and writes.
>>>>>>
>>>>>> Add a 'discard_sync' mount flag to revert to using sync discard,
>>>>>> issuing them one at a time and waiting for each one. This fixes
>>>>>> a big performance regression we had moving to kernels that include
>>>>>> the XFS async discard support.
>>>>>>
>>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>>> ---
>>>>>
>>>>> Hm, I figured the async discard stuff would have been a pretty clear win
>>>>> all around, but then again I'm not terribly familiar with what happens
>>>>> with discards beneath the fs. I do know that the previous behavior would
>>>>> cause fs level latencies due to holding up log I/O completion while
>>>>> discards completed one at a time. My understanding is that this led to
>>>>> online discard being pretty much universally "not recommended" in favor
>>>>> of fstrim.
>>>>
>>>> It's not a secret that most devices suck at discard.
>>>
>>> How can we know if a device sucks at discard?
>>
>> I was going to ask the same thing.  ;)  "Meh, punt to the admin!"
>>
>> I'm having deja vu but can't remember why.  Seems like this has come up
>> before and we thought it should be a block device tunable, not pushed down
>> from the filesystem.  Is that possible?
> 
> The problem is that it'll depend on the workload as well. The device in
> my laptop is fine with discard for my workload, which is very light.
> But if you are running RocksDB on it, and doing heavy compactions and
> deletes, it probably would not be.

Ok, but I'm not sure how that precludes a block device tunable?  You'd tune
it for your workload, right?

Or is the concern that it could only be for the entire block device, and
perhaps different partitions have different workloads?

Sorry, caveman filesystem guy doesn't completely understand block devices.

-Eric

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 19:57             ` Eric Sandeen
@ 2018-04-30 19:58               ` Jens Axboe
  2018-04-30 22:59                 ` Eric Sandeen
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 19:58 UTC (permalink / raw)
  To: Eric Sandeen, Luis R. Rodriguez; +Cc: Brian Foster, linux-xfs, linux-block, hch

On 4/30/18 1:57 PM, Eric Sandeen wrote:
> 
> 
> On 4/30/18 2:21 PM, Jens Axboe wrote:
>> On 4/30/18 1:19 PM, Eric Sandeen wrote:
>>>
>>>
>>> On 4/30/18 1:25 PM, Luis R. Rodriguez wrote:
>>>> On Mon, Apr 30, 2018 at 12:07:31PM -0600, Jens Axboe wrote:
>>>>> On 4/30/18 11:19 AM, Brian Foster wrote:
>>>>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>>>>> XFS recently added support for async discards. While this can be
>>>>>>> a win for some workloads and devices, there are also cases where
>>>>>>> async bursty discard will severely harm the latencies of reads
>>>>>>> and writes.
>>>>>>>
>>>>>>> Add a 'discard_sync' mount flag to revert to using sync discard,
>>>>>>> issuing them one at a time and waiting for each one. This fixes
>>>>>>> a big performance regression we had moving to kernels that include
>>>>>>> the XFS async discard support.
>>>>>>>
>>>>>>> Signed-off-by: Jens Axboe <axboe@kernel.dk>
>>>>>>> ---
>>>>>>
>>>>>> Hm, I figured the async discard stuff would have been a pretty clear win
>>>>>> all around, but then again I'm not terribly familiar with what happens
>>>>>> with discards beneath the fs. I do know that the previous behavior would
>>>>>> cause fs level latencies due to holding up log I/O completion while
>>>>>> discards completed one at a time. My understanding is that this led to
>>>>>> online discard being pretty much universally "not recommended" in favor
>>>>>> of fstrim.
>>>>>
>>>>> It's not a secret that most devices suck at discard.
>>>>
>>>> How can we know if a device sucks at discard?
>>>
>>> I was going to ask the same thing.  ;)  "Meh, punt to the admin!"
>>>
>>> I'm having deja vu but can't remember why.  Seems like this has come up
>>> before and we thought it should be a block device tunable, not pushed down
>>> from the filesystem.  Is that possible?
>>
>> The problem is that it'll depend on the workload as well. The device in
>> my laptop is fine with discard for my workload, which is very light.
>> But if you are running RocksDB on it, and doing heavy compactions and
>> deletes, it probably would not be.
> 
> Ok, but I'm not sure how that precludes a block device tunable?  You'd tune
> it for your workload, right?

What kind of tunable are you thinking of? Right now we have one tunable,
which is the max discard size. Patch #1 actually helps make this do what
it should for sync discards, instead of just building a bio chain and
submitting that all at once.

> Or is the concern that it could only be for the entire block device, and
> perhaps different partitions have different workloads?
> 
> Sorry, caveman filesystem guy doesn't completely understand block devices.

I just don't know what you are trying to tune :-)

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 15:32 ` [PATCH 2/2] xfs: add 'discard_sync' mount flag Jens Axboe
  2018-04-30 17:19   ` Brian Foster
@ 2018-04-30 21:31   ` Dave Chinner
  2018-04-30 21:42     ` Jens Axboe
  1 sibling, 1 reply; 27+ messages in thread
From: Dave Chinner @ 2018-04-30 21:31 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-xfs, linux-block, hch

On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> XFS recently added support for async discards. While this can be
> a win for some workloads and devices, there are also cases where
> async bursty discard will severely harm the latencies of reads
> and writes.

FWIW, convention is to document the performance regression in the
commit message, not leave the reader to guess at what it was....

Did anyone analyse the pattern of discards being issued to work out
what pattern was worse for async vs sync discard? is it lots of
little discards, large extents being discarded, perhaps a problem
with the request queue starving other IOs because we queue
so many async discards in such a short time (which is the difference
in behaviour vs the old code), or something else?

> Add a 'discard_sync' mount flag to revert to using sync discard,
> issuing them one at a time and waiting for each one. This fixes
> a big performance regression we had moving to kernels that include
> the XFS async discard support.

I'm not a fan of adding a mount option to work around bad,
unpredictable performance due to a mount option we recommend you
don't use because it results in bad, unpredictable performance.

Without any details of the discard pattern that results in problems
I don't think we should be changing anything - adding an opaque,
user-unfriendly mount option does nothing to address the underlying
problem - it's just a hack to work around the symptoms being seen...

More details of the regression and the root cause analysis is
needed, please.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 21:31   ` Dave Chinner
@ 2018-04-30 21:42     ` Jens Axboe
  2018-04-30 22:28       ` Dave Chinner
  0 siblings, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 21:42 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, linux-block, hch

On 4/30/18 3:31 PM, Dave Chinner wrote:
> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>> XFS recently added support for async discards. While this can be
>> a win for some workloads and devices, there are also cases where
>> async bursty discard will severely harm the latencies of reads
>> and writes.
> 
> FWIW, convention is to document the performance regression in the
> commit message, not leave the reader to guess at what it was....

Yeah I'll give you that, I can improve the commit message for sure.

> Did anyone analyse the pattern of discards being issued to work out
> what pattern was worse for async vs sync discard? is it lots of
> little discards, large extents being discarded, perhaps a problem
> with the request queue starving other IOs because we queue
> so many async discards in such a short time (which is the difference
> in behaviour vs the old code), or something else?

What was observed was a big discard which would previously have
gone down as smaller discards now going down as either one or many
discards. Looking at the blktrace data, it's the difference between

discard 1 queue
discard 1 complete
discard 2 queue
discard 2 complete
[...]
discard n queue
discard n complete

which is now

discard 1 queue
discard 2 queue
[...]
discard n queue
[...]
discard 1 complete
discard 2 complete
[...]
discard n complete

Note that we set a max discard size of 64MB for most devices,
since it's been shown to have less impact on latencies for
the IO that jobs actually care about.

>> Add a 'discard_sync' mount flag to revert to using sync discard,
>> issuing them one at a time and waiting for each one. This fixes
>> a big performance regression we had moving to kernels that include
>> the XFS async discard support.
> 
> I'm not a fan of adding a mount option to work around bad,
> unpredictable performance due to a mount option we recommend you
> don't use because it results in bad, unpredictable performance.

Oh I hear you, as I wrote in other replies, I don't generally
recommend discard except for cases where it's been proven to be
useful in terms of write amplification improvements. If we can
avoid using it, we do. It's a tradeoff, and for some situations,
the right decision is to use discards.

> Without any details of the discard pattern that results in problems
> I don't think we should be changing anything - adding an opaque,
> user-unfriendly mount option does nothing to address the underlying
> problem - it's just a hack to work around the symptoms being seen...
> 
> More details of the regression and the root cause analysis is
> needed, please.

It brings back the same behavior as we had before, which performs
better for us. It's preventing users of XFS+discard from upgrading,
which is sad.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 21:42     ` Jens Axboe
@ 2018-04-30 22:28       ` Dave Chinner
  2018-04-30 22:40         ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Dave Chinner @ 2018-04-30 22:28 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-xfs, linux-block, hch

On Mon, Apr 30, 2018 at 03:42:11PM -0600, Jens Axboe wrote:
> On 4/30/18 3:31 PM, Dave Chinner wrote:
> > On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> >> XFS recently added support for async discards. While this can be
> >> a win for some workloads and devices, there are also cases where
> >> async bursty discard will severely harm the latencies of reads
> >> and writes.
> > 
> > FWIW, convention is to document the performance regression in the
> > commit message, not leave the reader to guess at what it was....
> 
> Yeah I'll give you that, I can improve the commit message for sure.
> 
> > Did anyone analyse the pattern of discards being issued to work out
> > what pattern was worse for async vs sync discard? is it lots of
> > little discards, large extents being discarded, perhaps a problem
> > with the request queue starving other IOs because we queue
> > so many async discards in such a short time (which is the difference
> > in behaviour vs the old code), or something else?
> 
> What was observed was a big discard which would previously have
> gone down as smaller discards now going down as either one or many
> discards. Looking at the blktrace data, it's the difference between
> 
> discard 1 queue
> discard 1 complete
> discard 2 queue
> discard 2 complete
> [...]
> discard n queue
> discard n complete
> 
> which is now
> 
> discard 1 queue
> discard 2 queue
> [...]
> discard n queue
> [...]
> discard 1 complete
> discard 2 complete
> [...]
> discard n complete
> 
> Note that we set a max discard size of 64MB for most devices,
> since it's been shown to have less impact on latencies for
> the IO that jobs actually care about.

IOWs, this has nothing to do with XFS behaviour, and everything to
do with suboptimal scheduling control for concurrent queued discards
in the block layer....

XFS can issue discard requests of up to 8GB (on 4k block size
filesystems), and how they are optimised is completely up to the
block layer. blkdev_issue_discard() happens to be synchronous (for
historical reasons) and that does not match our asynchronous log IO
model. It's always been a cause of total filesystem stall problems
for XFS because we can't free log space until all the pending
discards have been issued. Hence we can see global filesystem
stalls of *minutes* with synchronous discards on bad devices and can
cause OOM and all sorts of other nasty cascading failures.

Async discard dispatch solves this problem for XFS - discards no
longer block forward journal progress, and we really don't want to
go back to having that ticking timebomb in XFS. Handling concurrent
discard requests in an optimal manner is not a filesystem problem -
it's an IO scheduling problem.


Essentially, we've exposed yet another limitation of the block/IO
layer handling of discard requests in the linux storage stack - it
does not have a configurable discard queue depth.

I'd much prefer these problems get handled at the IO scheduling
layer where there is visibility of device capabilities and request
queue state. i.e. we should be throttling async discards just like
we can throttle read or write IO, especially given there are many
devices that serialise discards at the device level (i.e. not queued
device commands). This solves the problem for everyone and makes
things much better for the future where hardware implementations are
likely to get better and support more and more concurrency in
discard operations.
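
In rough pseudo-C, the sort of throttle I mean looks something like the
sketch below - the names are hypothetical and this is not existing block
layer code, it just illustrates capping the discards in flight:

#include <linux/bio.h>
#include <linux/semaphore.h>

/*
 * One of these per request queue; sem initialised to the allowed depth,
 * e.g. sema_init(&dt->sem, 1) for strict one-at-a-time behaviour.
 */
struct discard_throttle {
	struct semaphore	sem;
};

static void throttled_discard_endio(struct bio *bio)
{
	struct discard_throttle *dt = bio->bi_private;

	up(&dt->sem);		/* let the next discard through */
	bio_put(bio);		/* sketch assumes we own the bio */
}

/* process context only: may sleep waiting for an in-flight discard */
static void submit_discard_throttled(struct discard_throttle *dt,
				     struct bio *bio)
{
	down(&dt->sem);
	bio->bi_private = dt;
	bio->bi_end_io = throttled_discard_endio;
	submit_bio(bio);
}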

IMO, the way the high level code dispatches discard requests is
irrelevant here - this is a problem to do with queue depth and IO
scheduling/throttling. That's why I don't think adding permanent ABI
changes to filesystems to work around this problem is the right
solution....

> > I don't think we should be changing anything - adding an opaque,
> > user-unfriendly mount option does nothing to address the underlying
> > problem - it's just a hack to work around the symptoms being seen...
> > 
> > More details of the regression and the root cause analysis is
> > needed, please.
> 
> It brings back the same behavior as we had before, which performs
> better for us. It's preventing users of XFS+discard from upgrading,
> which is sad.

Yes, it does, but so would having the block layer throttle device
discard requests in flight to a queue depth of 1. And then we don't
have to change XFS at all.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 22:28       ` Dave Chinner
@ 2018-04-30 22:40         ` Jens Axboe
  2018-04-30 23:00           ` Jens Axboe
  2018-04-30 23:01           ` Darrick J. Wong
  0 siblings, 2 replies; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 22:40 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, linux-block, hch

On 4/30/18 4:28 PM, Dave Chinner wrote:
> On Mon, Apr 30, 2018 at 03:42:11PM -0600, Jens Axboe wrote:
>> On 4/30/18 3:31 PM, Dave Chinner wrote:
>>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
>>>> XFS recently added support for async discards. While this can be
>>>> a win for some workloads and devices, there are also cases where
>>>> async bursty discard will severely harm the latencies of reads
>>>> and writes.
>>>
>>> FWIW, convention is to document the performance regression in the
>>> commit message, not leave the reader to guess at what it was....
>>
>> Yeah I'll give you that, I can improve the commit message for sure.
>>
>>> Did anyone analyse the pattern of discards being issued to work out
>>> what pattern was worse for async vs sync discard? is it lots of
>>> little discards, large extents being discarded, perhaps a problem
>>> with the request queue starving other IOs because we queue
>>> so many async discards in such a short time (which is the difference
>>> in behaviour vs the old code), or something else?
>>
>> What was observed was a big discard which would previously have
>> gone down as smaller discards now going down as either one or many
>> discards. Looking at the blktrace data, it's the difference between
>>
>> discard 1 queue
>> discard 1 complete
>> discard 2 queue
>> discard 2 complete
>> [...]
>> discard n queue
>> discard n complete
>>
>> which is now
>>
>> discard 1 queue
>> discard 2 queue
>> [...]
>> discard n queue
>> [...]
>> discard 1 complete
>> discard 2 complete
>> [...]
>> discard n complete
>>
>> Note that we set a max discard size of 64MB for most devices,
>> since it's been shown to have less impact on latencies for
>> the IO that jobs actually care about.
> 
> IOWs, this has nothing to do with XFS behaviour, and everything to
> do with suboptimal scheduling control for concurrent queued discards
> in the block layer....

You could argue that, and it's fallout from XFS being the first
user of async discard. Prior to that, we've never had that use
case. I'm quite fine making all discards throttle to depth
1.

> XFS can issue discard requests of up to 8GB (on 4k block size
> filesystems), and how they are optimised is completely up to the
> block layer. blkdev_issue_discard() happens to be synchronous (for
> historical reasons) and that does not match our asynchronous log IO
>> model. It's always been a cause of total filesystem stall problems
>> for XFS because we can't free log space until all the pending
>> discards have been issued. Hence we can see global filesystem
> stalls of *minutes* with synchronous discards on bad devices and can
> cause OOM and all sorts of other nasty cascading failures.
> 
> Async discard dispatch solves this problem for XFS - discards no
> longer block forward journal progress, and we really don't want to
> go back to having that ticking timebomb in XFS. Handling concurrent
> discard requests in an optimal manner is not a filesystem problem -
> it's an IO scheduling problem.
> 
> 
> Essentially, we've exposed yet another limitation of the block/IO
> layer handling of discard requests in the linux storage stack - it
> does not have a configurable discard queue depth.
> 
> I'd much prefer these problems get handled at the IO scheduling
> layer where there is visibility of device capabilities and request
> queue state. i.e. we should be throttling async discards just like
> we can throttle read or write IO, especially given there are many
> devices that serialise discards at the device level (i.e. not queued
> device commands). This solves the problem for everyone and makes
> things much better for the future where hardware implementations are
> likely to get better and support more and more concurrency in
> discard operations.
> 
> IMO, the way the high level code dispatches discard requests is
> irrelevant here - this is a problem to do with queue depth and IO
> scheduling/throttling. That's why I don't think adding permanent ABI
> changes to filesystems to work around this problem is the right
> solution....

If you get off your high horse for a bit, this is essentially
a performance regression. I can either migrate folks off of XFS, or
I can come up with something that works for them. It's pretty
easy to claim this is "yet another limitation of the IO stack",
but it's really not fair to make crazy claims like that when it's
an entirely new use case. Let's try to state things objectively
and fairly.

This work should probably have been done before making XFS
discard async. This isn't the first fallout we've had from that
code.

>>> I don't think we should be changing anything - adding an opaque,
>>> user-unfriendly mount option does nothing to address the underlying
>>> problem - it's just a hack to work around the symptoms being seen...
>>>
>>> More details of the regression and the root cause analysis is
>>> needed, please.
>>
>> It brings back the same behavior as we had before, which performs
>> better for us. It's preventing users of XFS+discard from upgrading,
>> which is sad.
> 
> Yes, it does, but so would having the block layer throttle device
> discard requests in flight to a queue depth of 1. And then we don't
> have to change XFS at all.

I'm perfectly fine with making that change by default, and much easier
for me since I don't have to patch file systems.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 19:58               ` Jens Axboe
@ 2018-04-30 22:59                 ` Eric Sandeen
  2018-04-30 23:02                   ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Eric Sandeen @ 2018-04-30 22:59 UTC (permalink / raw)
  To: Jens Axboe, Luis R. Rodriguez; +Cc: Brian Foster, linux-xfs, linux-block, hch

On 4/30/18 2:58 PM, Jens Axboe wrote:
> On 4/30/18 1:57 PM, Eric Sandeen wrote:
>> On 4/30/18 2:21 PM, Jens Axboe wrote:

...

>> Ok, but I'm not sure how that precludes a block device tunable?  You'd tune
>> it for your workload, right?
> 
> What kind of tunable are you thinking of? Right now we have one tunable,
> which is the max discard size. Patch #1 actually helps make this do what
> it should for sync discards, instead of just building a bio chain and
> submitting that all at once.
> 
>> Or is the concern that it could only be for the entire block device, and
>> perhaps different partitions have different workloads?
>>
>> Sorry, caveman filesystem guy doesn't completely understand block devices.
> 
> I just don't know what you are trying to tune :-)

I'm wondering if "make discards synchronous" couldn't just be a block device
tunable, rather than a filesystem mount option tunable.

-Eric

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 22:40         ` Jens Axboe
@ 2018-04-30 23:00           ` Jens Axboe
  2018-04-30 23:23             ` Dave Chinner
  2018-04-30 23:01           ` Darrick J. Wong
  1 sibling, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 23:00 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, linux-block, hch

On 4/30/18 4:40 PM, Jens Axboe wrote:
> On 4/30/18 4:28 PM, Dave Chinner wrote:
>> Yes, it does, but so would having the block layer throttle device
>> discard requests in flight to a queue depth of 1. And then we don't
>> have to change XFS at all.
> 
> I'm perfectly fine with making that change by default, and much easier
> for me since I don't have to patch file systems.

Totally untested, but this should do the trick. It ensures we have
a QD of 1 (per caller), which should be sufficient.

If people tune down the discard size, then you'll be blocking waiting
for discards on issue.

diff --git a/block/blk-lib.c b/block/blk-lib.c
index a676084d4740..0bf9befcc863 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -11,16 +11,19 @@
 #include "blk.h"
 
 static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
-		gfp_t gfp)
+			    gfp_t gfp)
 {
-	struct bio *new = bio_alloc(gfp, nr_pages);
-
+	/*
+	 * Devices suck at discard, so if we have to break up the bio
+	 * size due to the max discard size setting, wait for the
+	 * previous one to finish first.
+	 */
 	if (bio) {
-		bio_chain(bio, new);
-		submit_bio(bio);
+		submit_bio_wait(bio);
+		bio_put(bio);
 	}
 
-	return new;
+	return bio_alloc(gfp, nr_pages);
 }
 
 int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
@@ -63,7 +66,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t end_sect, tmp;
 
 		/* Make sure bi_size doesn't overflow */
-		req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9);
+		req_sects = min_t(sector_t, nr_sects,
+					q->limits.max_discard_sectors);
 
 		/**
 		 * If splitting a request, and the next starting sector would be

-- 
Jens Axboe

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 22:40         ` Jens Axboe
  2018-04-30 23:00           ` Jens Axboe
@ 2018-04-30 23:01           ` Darrick J. Wong
  1 sibling, 0 replies; 27+ messages in thread
From: Darrick J. Wong @ 2018-04-30 23:01 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Dave Chinner, linux-xfs, linux-block, hch

On Mon, Apr 30, 2018 at 04:40:04PM -0600, Jens Axboe wrote:
> On 4/30/18 4:28 PM, Dave Chinner wrote:
> > On Mon, Apr 30, 2018 at 03:42:11PM -0600, Jens Axboe wrote:
> >> On 4/30/18 3:31 PM, Dave Chinner wrote:
> >>> On Mon, Apr 30, 2018 at 09:32:52AM -0600, Jens Axboe wrote:
> >>>> XFS recently added support for async discards. While this can be
> >>>> a win for some workloads and devices, there are also cases where
> >>>> async bursty discard will severely harm the latencies of reads
> >>>> and writes.
> >>>
> >>> FWIW, convention is to document the performance regression in the
> >>> commit message, not leave the reader to guess at what it was....
> >>
> >> Yeah I'll give you that, I can improve the commit message for sure.
> >>
> >>> Did anyone analyse the pattern of discards being issued to work out
> >>> what pattern was worse for async vs sync discard? Is it lots of
> >>> little discards, large extents being discarded, perhaps a problem
> >>> with the request queue starving other IOs because we queue
> >>> so many async discards in such a short time (which is the difference
> >>> in behaviour vs the old code), or something else?
> >>
> >> What was observed was a big discard which would previously have
> >> gone down as smaller discards, now going down as either one or many
> >> discards. Looking at the blktrace data, it's the difference between
> >>
> >> discard 1 queue
> >> discard 1 complete
> >> discard 2 queue
> >> discard 2 complete
> >> [...]
> >> discard n queue
> >> discard n complete
> >>
> >> which is now
> >>
> >> discard 1 queue
> >> discard 2 queue
> >> [...]
> >> discard n queue
> >> [...]
> >> discard 1 complete
> >> discard 2 complete
> >> [...]
> >> discard n complete
> >>
> >> Note that we set a max discard size of 64MB for most devices,
> >> since it's been shown to have less impact on latencies for
> >> the IO that jobs actually care about.

Ok, so as I see it the problem here is that discards are not some
instantaneous command, but instead have some cost that is (probably)
less than (say) a full write of the same quantity of zeroes.  Previously
we'd feed the block layer one discard io at a time, but now we batch
them up and send multiple large discard requests at the same time.  The
discards then tie up the device for long periods of time.  We'd get the
same result if we sent gigabytes of write commands simultaneously too,
right?

Wouldn't the same problem happen if, say, 200 threads were each
issuing discards one by one, because there are just too many bytes in
flight at a time?

So what I'm asking here is, can we throttle discard IOs so that no
single issuer can monopolize a device?  As a stupid example, if program
A is sending 200MB of reads, program B is sending 200MB of writes, and
xfs is sending 200MB of discards at the same time, can we throttle all
three so that they each get roughly a third of the queue at a time?

This way XFS can still send huge batches of discard IOs asynchronously,
and it's fine enough if some of those have to wait longer in order to
avoid starving other things.

(Yes, I imagine you could serialize discards and run them one at a time,
but that seems a little suboptimal...)
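
As a very rough sketch of what per-issuer queue depth limiting could
look like (every name below is invented purely for illustration, not an
existing interface), a counting semaphore sized to the allowed number of
in-flight discards is one way to keep a single issuer from flooding the
device:

/*
 * Hypothetical sketch only: cap the number of discard bios one issuer
 * keeps in flight at a time.
 */
#include <linux/semaphore.h>

struct discard_throttle {
	struct semaphore sem;	/* permits == allowed in-flight discards */
};

static void discard_throttle_init(struct discard_throttle *dt, int depth)
{
	sema_init(&dt->sem, depth);
}

/* Call before submitting a discard bio; sleeps if we're at the limit. */
static void discard_throttle_get(struct discard_throttle *dt)
{
	down(&dt->sem);
}

/* Call from the discard bio's completion handler. */
static void discard_throttle_put(struct discard_throttle *dt)
{
	up(&dt->sem);
}

Fair sharing between issuers would then just be a matter of giving each
issuer (or each class of I/O) its own depth rather than one global one.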

> > 
> > IOWs, this has nothing to do with XFS behaviour, and everything to
> > do with suboptimal scheduling control for concurrent queued discards
> > in the block layer....
> 
> You could argue that, and it's fallout from XFS being the first
> user of async discard. Prior to that, we've never had that use
> case. I'm quite fine making all discards throttle to depth
> 1.
> 
> > XFS can issue discard requests of up to 8GB (on 4k block size
> > filesystems), and how they are optimised is completely up to the
> > block layer. blkdev_issue_discard() happens to be synchronous (for
> > historical reasons) and that does not match our asynchronous log IO
> > model. It's always been a cause of total filesystem stall problems
> > for XFS because we can't free log space until all the pending
> > discards have been issued. Hence we can see global filesystem
> > stalls of *minutes* with synchronous discards on bad devices and can
> > cause OOM and all sorts of other nasty cascading failures.
> > 
> > Async discard dispatch solves this problem for XFS - discards no
> > longer block forward journal progress, and we really don't want to
> > go back to having that ticking timebomb in XFS. Handling concurrent
> > discard requests in an optimal manner is not a filesystem problem -
> > it's an IO scheduling problem.
> > 
> > 
> > Essentially, we've exposed yet another limitation of the block/IO
> > layer handling of discard requests in the linux storage stack - it
> > does not have a configurable discard queue depth.
> > 
> > I'd much prefer these problems get handled at the IO scheduling
> > layer where there is visibility of device capabilities and request
> > queue state. i.e. we should be throttling async discards just like
> > we can throttle read or write IO, especially given there are many
> > devices that serialise discards at the device level (i.e. not queued
> > device commands). This solves the problem for everyone and makes
> > things much better for the future where hardware implementations are
> > likely to get better and support more and more concurrency in
> > discard operations.
> > 
> > IMO, the way the high level code dispatches discard requests is
> > irrelevant here - this is a problem to do with queue depth and IO
> > scheduling/throttling. That's why I don't think adding permanent ABI
> > changes to filesystems to work around this problem is the right
> > solution....
> 
> If you get off your high horse for a bit, this is essentially
> a performance regression. I can either migrate folks off of XFS, or
> I can come up with something that works for them. It's pretty
> easy to claim this is "yet another limitation of the IO stack",
> but it's really not fair to make crazy claims like that when it's
> an entirely new use case. Let's try to state things objectively
> and fairly.
> 
> This work should probably have been done before making XFS
> discard async. This isn't the first fallout we've had from that
> code.
> 
> >>> I don't think we should be changing anything - adding an opaque,
> >>> user-unfriendly mount option does nothing to address the underlying
> >>> problem - it's just a hack to work around the symptoms being seen...
> >>>
> >>> More details of the regression and the root cause analysis is
> >>> needed, please.
> >>
> >> It brings back the same behavior as we had before, which performs
> >> better for us. It's preventing users of XFS+discard from upgrading,
> >> which is sad.
> > 
> > Yes, it does, but so would having the block layer throttle device
> > discard requests in flight to a queue depth of 1. And then we don't
> > have to change XFS at all.
> 
> I'm perfectly fine with making that change by default, and much easier
> for me since I don't have to patch file systems.

...and I prefer not to add mount options, fwiw.

--D

> -- 
> Jens Axboe
> 

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 22:59                 ` Eric Sandeen
@ 2018-04-30 23:02                   ` Jens Axboe
  0 siblings, 0 replies; 27+ messages in thread
From: Jens Axboe @ 2018-04-30 23:02 UTC (permalink / raw)
  To: Eric Sandeen, Luis R. Rodriguez; +Cc: Brian Foster, linux-xfs, linux-block, hch

On 4/30/18 4:59 PM, Eric Sandeen wrote:
> On 4/30/18 2:58 PM, Jens Axboe wrote:
>> On 4/30/18 1:57 PM, Eric Sandeen wrote:
>>> On 4/30/18 2:21 PM, Jens Axboe wrote:
> 
> ...
> 
>>> Ok, but I'm not sure how that precludes a block device tunable?  You'd tune
>>> it for your workload, right?
>>
>> What kind of tunable are you thinking of? Right now we have one tunable,
>> which is the max discard size. Patch #1 actually helps make this do what
>> it should for sync discards, instead of just building a bio chain and
>> submitting that all at once.
>>
>>> Or is the concern that it could only be for the entire block device, and
>>> perhaps different partitions have different workloads?
>>>
>>> Sorry, caveman filesystem guy doesn't completely understand block devices.
>>
>> I just don't know what you are trying to tune :-)
> 
> I'm wondering if "make discards synchronous" couldn't just be a block device
> tunable, rather than a filesystem mount option tunable.

See the e-mail I just sent out. With that, you get that tunable bundled with
the discard_max_bytes tunable. If you leave it at the default, which is
generally very high, then you get huge discards. If you tune it down,
they become basically sync. I don't think we need a separate tunable
for this.

It's still per-caller, which is good enough for us. I don't want to
make it more complicated until people show up with a valid use case
that shows we need it strictly throttled per-device.
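
For reference, that max discard size knob is the per-device sysfs file
queue/discard_max_bytes. Purely as an illustrative userspace sketch
(the device name and the 64MB value are assumptions for the example,
and error handling is trimmed), lowering it looks like this:

/*
 * Illustration only: read and lower a device's discard_max_bytes.
 * /dev/sda and the 64MB cap are assumed values for the example.
 */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/block/sda/queue/discard_max_bytes";
	unsigned long long cur;
	FILE *f;

	f = fopen(path, "r");
	if (!f || fscanf(f, "%llu", &cur) != 1)
		return 1;
	fclose(f);
	printf("current max discard: %llu bytes\n", cur);

	/* cap discards at 64MB so large trims get split up */
	f = fopen(path, "w");
	if (!f)
		return 1;
	fprintf(f, "%llu\n", 64ULL << 20);
	fclose(f);
	return 0;
}

With the patches in this thread, a lower value here is also what turns a
huge discard into a series of smaller, effectively synchronous ones.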

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 23:00           ` Jens Axboe
@ 2018-04-30 23:23             ` Dave Chinner
  2018-05-01 11:11               ` Brian Foster
  2018-05-01 15:23               ` Jens Axboe
  0 siblings, 2 replies; 27+ messages in thread
From: Dave Chinner @ 2018-04-30 23:23 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-xfs, linux-block, hch

On Mon, Apr 30, 2018 at 05:00:14PM -0600, Jens Axboe wrote:
> On 4/30/18 4:40 PM, Jens Axboe wrote:
> > On 4/30/18 4:28 PM, Dave Chinner wrote:
> >> Yes, it does, but so would having the block layer throttle device
> >> discard requests in flight to a queue depth of 1. And then we don't
> >> have to change XFS at all.
> > 
> > I'm perfectly fine with making that change by default, and much easier
> > for me since I don't have to patch file systems.
> 
> Totally untested, but this should do the trick. It ensures we have
> a QD of 1 (per caller), which should be sufficient.
> 
> If people tune down the discard size, then you'll be blocking waiting
> for discards on issue.
> 
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index a676084d4740..0bf9befcc863 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -11,16 +11,19 @@
>  #include "blk.h"
>  
>  static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
> -		gfp_t gfp)
> +			    gfp_t gfp)
>  {
> -	struct bio *new = bio_alloc(gfp, nr_pages);
> -
> +	/*
> +	 * Devices suck at discard, so if we have to break up the bio
> +	 * size due to the max discard size setting, wait for the
> +	 * previous one to finish first.
> +	 */
>  	if (bio) {
> -		bio_chain(bio, new);
> -		submit_bio(bio);
> +		submit_bio_wait(bio);
> +		bio_put(bio);
>  	}

This only addresses the case where __blkdev_issue_discard() breaks
up a single large discard, right? It seems like a brute force
solution, too, because it will do so even when the underlying device
is idle and there's no need to throttle.

Shouldn't the throttling logic at least look at device congestion?
i.e. if the device is not backlogged, then we should be able to
issue the discard without problems. 

I ask this because this only addresses throttling the "discard large
extent" case when the discard limit is set low. i.e. your exact
problem case. We know that XFS can issue large numbers of
discontiguous async discards in a single batch - this patch does not
address that case and so it will still cause starvation problems.

If we look at device congestion in determining how to throttle/back
off during discard issuing, then it doesn't matter what
max_discard_sectors is set to - it will throttle in all situations
that cause device overloads and starvations....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 23:23             ` Dave Chinner
@ 2018-05-01 11:11               ` Brian Foster
  2018-05-01 15:23               ` Jens Axboe
  1 sibling, 0 replies; 27+ messages in thread
From: Brian Foster @ 2018-05-01 11:11 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Jens Axboe, linux-xfs, linux-block, hch

On Tue, May 01, 2018 at 09:23:20AM +1000, Dave Chinner wrote:
> On Mon, Apr 30, 2018 at 05:00:14PM -0600, Jens Axboe wrote:
> > On 4/30/18 4:40 PM, Jens Axboe wrote:
> > > On 4/30/18 4:28 PM, Dave Chinner wrote:
> > >> Yes, it does, but so would having the block layer throttle device
> > >> discard requests in flight to a queue depth of 1. And then we don't
> > >> have to change XFS at all.
> > > 
> > > I'm perfectly fine with making that change by default, and much easier
> > > for me since I don't have to patch file systems.
> > 
> > Totally untested, but this should do the trick. It ensures we have
> > a QD of 1 (per caller), which should be sufficient.
> > 
> > If people tune down the discard size, then you'll be blocking waiting
> > for discards on issue.
> > 
> > diff --git a/block/blk-lib.c b/block/blk-lib.c
> > index a676084d4740..0bf9befcc863 100644
> > --- a/block/blk-lib.c
> > +++ b/block/blk-lib.c
> > @@ -11,16 +11,19 @@
> >  #include "blk.h"
> >  
> >  static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
> > -		gfp_t gfp)
> > +			    gfp_t gfp)
> >  {
> > -	struct bio *new = bio_alloc(gfp, nr_pages);
> > -
> > +	/*
> > +	 * Devices suck at discard, so if we have to break up the bio
> > +	 * size due to the max discard size setting, wait for the
> > +	 * previous one to finish first.
> > +	 */
> >  	if (bio) {
> > -		bio_chain(bio, new);
> > -		submit_bio(bio);
> > +		submit_bio_wait(bio);
> > +		bio_put(bio);
> >  	}
> 
> This only addresses the case where __blkdev_issue_discard() breaks
> up a single large discard, right? It seems like a brute force
> solution, too, because it will do so even when the underlying device
> is idle and there's no need to throttle.
> 

I think the above serializes the discards regardless of whether
__blkdev_issue_discard() splits up a single range itself or the caller
passes discontiguous ranges as part of a single chain.

That said, I agree that it seems brute force. ISTM that this wouldn't
address separate callers causing the same problem (i.e., if the discard
burst doesn't happen to be part of a single chain)...? Alternatively,
what if the discard chain is large but covers a very small range (e.g.,
consider a chain of single block discards, for example)? Given your
previous comments around the size of the range being the determining
factor, would we really want to serialize in that case?

BTW, what happens with submit_bio_wait() if the block device is plugged?
Note that xlog_discard_busy_extents() currently plugs around the discard
bio chain construction.

> Shouldn't the throttling logic at least look at device congestion?
> i.e. if the device is not backlogged, then we should be able to
> issue the discard without problems. 
> 
> I ask this because this only addresses throttling the "discard large
> extent" case when the discard limit is set low. i.e. your exact
> problem case. We know that XFS can issue large numbers of
> discontiguous async discards in a single batch - this patch does not
> address that case and so it will still cause starvation problems.
> 
> If we look at device congestion in determining how to throttle/back
> off during discard issuing, then it doesn't matter what
> max_discard_sectors is set to - it will throttle in all situations
> that cause device overloads and starvations....
> 

Indeed, this all seems more robust to me. TBH, even something simpler,
like allowing one discard chain at a time where the chain itself is size
limited (and so puts the onus on the caller a bit to construct larger
chains, trading off for simpler block layer throttling logic), seems
reasonable, but I'm not sure if/how feasible that is (or if it really is
any simpler). Just a thought.

Note that whatever the implementation, I think that channeling the
backpressure in the form of sync submission has the potential to
reintroduce the filesystem latencies that have been brought up a couple
times in this thread. Unless I'm missing something, I still think there's
value in trying to separate discard submission from log I/O completion
in XFS.

Dave, do you have any fundamental thoughts on that? For example, I'm
wondering if there's any danger in creating a sort of intermediary busy
state between log I/O completion and some time later when a background
wq job or something discards the pending busy extents and removes them
from the pag tree. That actually might open the door for more
interesting optimizations like eliminating the need for certain discards
at all if the associated blocks are reallocated (after the associated log
I/O completed but before the background discard happened to occur).
Actually, I'm told that explicit reuse of such blocks might be a
significant win for things like VDO (dm dedup), where discards are
apparently extra double ungood. Thoughts?
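
Purely as a sketch of the shape such a deferred discard path could take
(all names below are invented, and this is not tied to XFS's actual busy
extent tracking), the background piece is just a list plus a workqueue
item:

/*
 * Hypothetical sketch: defer discard issue to a background worker so
 * that log I/O completion never waits on the device.
 */
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct bg_discard {
	spinlock_t		lock;
	struct list_head	pending;	/* extents awaiting discard */
	struct work_struct	work;
};

static void bg_discard_worker(struct work_struct *work)
{
	struct bg_discard *bd = container_of(work, struct bg_discard, work);
	LIST_HEAD(batch);

	spin_lock(&bd->lock);
	list_splice_init(&bd->pending, &batch);
	spin_unlock(&bd->lock);

	/* issue (possibly throttled) discards for 'batch', then clear the
	 * busy state and free the tracking structures */
}

static void bg_discard_init(struct bg_discard *bd)
{
	spin_lock_init(&bd->lock);
	INIT_LIST_HEAD(&bd->pending);
	INIT_WORK(&bd->work, bg_discard_worker);
}

/* Called at log I/O completion instead of issuing discards inline. */
static void bg_discard_queue(struct bg_discard *bd, struct list_head *extents)
{
	spin_lock(&bd->lock);
	list_splice_tail_init(extents, &bd->pending);
	spin_unlock(&bd->lock);
	schedule_work(&bd->work);
}

The reuse optimization above would then amount to letting an allocation
pull an extent back off 'pending' before the worker gets to it, skipping
the discard entirely.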

Brian

> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> david@fromorbit.com

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-04-30 23:23             ` Dave Chinner
  2018-05-01 11:11               ` Brian Foster
@ 2018-05-01 15:23               ` Jens Axboe
  2018-05-02  2:54                 ` Martin K. Petersen
  1 sibling, 1 reply; 27+ messages in thread
From: Jens Axboe @ 2018-05-01 15:23 UTC (permalink / raw)
  To: Dave Chinner; +Cc: linux-xfs, linux-block, hch

On 4/30/18 5:23 PM, Dave Chinner wrote:
> On Mon, Apr 30, 2018 at 05:00:14PM -0600, Jens Axboe wrote:
>> On 4/30/18 4:40 PM, Jens Axboe wrote:
>>> On 4/30/18 4:28 PM, Dave Chinner wrote:
>>>> Yes, it does, but so would having the block layer throttle device
>>>> discard requests in flight to a queue depth of 1. And then we don't
>>>> have to change XFS at all.
>>>
>>> I'm perfectly fine with making that change by default, and much easier
>>> for me since I don't have to patch file systems.
>>
>> Totally untested, but this should do the trick. It ensures we have
>> a QD of 1 (per caller), which should be sufficient.
>>
>> If people tune down the discard size, then you'll be blocking waiting
>> for discards on issue.
>>
>> diff --git a/block/blk-lib.c b/block/blk-lib.c
>> index a676084d4740..0bf9befcc863 100644
>> --- a/block/blk-lib.c
>> +++ b/block/blk-lib.c
>> @@ -11,16 +11,19 @@
>>  #include "blk.h"
>>  
>>  static struct bio *next_bio(struct bio *bio, unsigned int nr_pages,
>> -		gfp_t gfp)
>> +			    gfp_t gfp)
>>  {
>> -	struct bio *new = bio_alloc(gfp, nr_pages);
>> -
>> +	/*
>> +	 * Devices suck at discard, so if we have to break up the bio
>> +	 * size due to the max discard size setting, wait for the
>> +	 * previous one to finish first.
>> +	 */
>>  	if (bio) {
>> -		bio_chain(bio, new);
>> -		submit_bio(bio);
>> +		submit_bio_wait(bio);
>> +		bio_put(bio);
>>  	}
> 
> This only addresses the case where __blkdev_issue_discard() breaks
> up a single large discard, right? It seems like a brute force
> solution, too, because it will do so even when the underlying device
> is idle and there's no need to throttle.

Right, the above would only break up a single discard; that's the
per-caller part.

> Shouldn't the throttling logic at least look at device congestion?
> i.e. if the device is not backlogged, then we should be able to
> issue the discard without problems. 
> 
> I ask this because this only addresses throttling the "discard large
> extent" case when the discard limit is set low. i.e. your exact
> problem case. We know that XFS can issue large numbers of
> discontiguous async discards in a single batch - this patch does not
> address that case and so it will still cause starvation problems.
> 
> If we look at device congestion in determining how to throttle/back
> off during discard issuing, then it doesn't matter what
> max_discard_sectors is set to - it will throttle in all situations
> that cause device overloads and starvations....

How about the below? It integrates discard throttling with the
writeback throttling, treating discards like background writes. Totally
untested. The benefit of this is that it ties into that whole framework,
and it's managed per-device.

The blk-lib change is a separate patch, ensuring we break up discards
according to the user-defined size; the combined diff below will get
broken up accordingly.


diff --git a/block/blk-lib.c b/block/blk-lib.c
index a676084d4740..7417d617091b 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -62,10 +62,11 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		unsigned int req_sects;
 		sector_t end_sect, tmp;
 
-		/* Make sure bi_size doesn't overflow */
-		req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9);
+		/* Issue in chunks of the user defined max discard setting */
+		req_sects = min_t(sector_t, nr_sects,
+					q->limits.max_discard_sectors);
 
-		/**
+		/*
 		 * If splitting a request, and the next starting sector would be
 		 * misaligned, stop the discard at the previous aligned sector.
 		 */
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 2dd36347252a..c22049a8125e 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -10,11 +10,11 @@
 
 /*
  * from upper:
- * 3 bits: reserved for other usage
+ * 4 bits: reserved for other usage
  * 12 bits: size
- * 49 bits: time
+ * 48 bits: time
  */
-#define BLK_STAT_RES_BITS	3
+#define BLK_STAT_RES_BITS	4
 #define BLK_STAT_SIZE_BITS	12
 #define BLK_STAT_RES_SHIFT	(64 - BLK_STAT_RES_BITS)
 #define BLK_STAT_SIZE_SHIFT	(BLK_STAT_RES_SHIFT - BLK_STAT_SIZE_BITS)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index f92fc84b5e2c..ba0c2825d382 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -101,9 +101,15 @@ static bool wb_recent_wait(struct rq_wb *rwb)
 	return time_before(jiffies, wb->dirty_sleep + HZ);
 }
 
-static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
+static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_trim,
+					  bool is_kswapd)
 {
-	return &rwb->rq_wait[is_kswapd];
+	if (is_trim)
+		return &rwb->rq_wait[WBT_REQ_TRIM];
+	else if (is_kswapd)
+		return &rwb->rq_wait[WBT_REQ_KSWAPD];
+	else
+		return &rwb->rq_wait[WBT_REQ_BG];
 }
 
 static void rwb_wake_all(struct rq_wb *rwb)
@@ -120,13 +126,14 @@ static void rwb_wake_all(struct rq_wb *rwb)
 
 void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
 {
+	const bool is_trim = wb_acct & WBT_TRIM;
 	struct rq_wait *rqw;
 	int inflight, limit;
 
 	if (!(wb_acct & WBT_TRACKED))
 		return;
 
-	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
+	rqw = get_rq_wait(rwb, is_trim, wb_acct & WBT_KSWAPD);
 	inflight = atomic_dec_return(&rqw->inflight);
 
 	/*
@@ -139,10 +146,13 @@ void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
 	}
 
 	/*
-	 * If the device does write back caching, drop further down
-	 * before we wake people up.
+	 * For discards, our limit is always the background. For writes, if
+	 * the device does write back caching, drop further down before we
+	 * wake people up.
 	 */
-	if (rwb->wc && !wb_recent_wait(rwb))
+	if (is_trim)
+		limit = rwb->wb_background;
+	else if (rwb->wc && !wb_recent_wait(rwb))
 		limit = 0;
 	else
 		limit = rwb->wb_normal;
@@ -479,6 +489,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
 {
 	unsigned int limit;
 
+	if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
+		return rwb->wb_background;
+
 	/*
 	 * At this point we know it's a buffered write. If this is
 	 * kswapd trying to free memory, or REQ_SYNC is set, then
@@ -533,7 +546,8 @@ static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
 	__releases(lock)
 	__acquires(lock)
 {
-	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
+	const bool is_trim = (rw & REQ_OP_MASK) == REQ_OP_DISCARD;
+	struct rq_wait *rqw = get_rq_wait(rwb, is_trim, current_is_kswapd());
 	DEFINE_WAIT(wait);
 
 	if (may_queue(rwb, rqw, &wait, rw))
@@ -561,19 +575,19 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
 {
 	const int op = bio_op(bio);
 
-	/*
-	 * If not a WRITE, do nothing
-	 */
-	if (op != REQ_OP_WRITE)
-		return false;
+	if (op == REQ_OP_WRITE) {
+		/*
+		 * Don't throttle WRITE_ODIRECT
+		 */
+		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
+		    (REQ_SYNC | REQ_IDLE))
+			return false;
 
-	/*
-	 * Don't throttle WRITE_ODIRECT
-	 */
-	if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE))
-		return false;
+		return true;
+	} else if (op == REQ_OP_DISCARD)
+		return true;
 
-	return true;
+	return false;
 }
 
 /*
@@ -605,6 +619,8 @@ enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
 
 	if (current_is_kswapd())
 		ret |= WBT_KSWAPD;
+	if (bio_op(bio) == REQ_OP_DISCARD)
+		ret |= WBT_TRIM;
 
 	return ret | WBT_TRACKED;
 }
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index a232c98fbf4d..aec5bc82d580 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -14,12 +14,17 @@ enum wbt_flags {
 	WBT_TRACKED		= 1,	/* write, tracked for throttling */
 	WBT_READ		= 2,	/* read */
 	WBT_KSWAPD		= 4,	/* write, from kswapd */
+	WBT_TRIM		= 8,
 
-	WBT_NR_BITS		= 3,	/* number of bits */
+	WBT_NR_BITS		= 4,	/* number of bits */
 };
 
 enum {
-	WBT_NUM_RWQ		= 2,
+	WBT_REQ_BG = 0,
+	WBT_REQ_KSWAPD,
+	WBT_REQ_TRIM,
+
+	WBT_NUM_RWQ,
 };
 
 /*

-- 
Jens Axboe

^ permalink raw reply related	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-05-01 15:23               ` Jens Axboe
@ 2018-05-02  2:54                 ` Martin K. Petersen
  2018-05-02 14:20                   ` Jens Axboe
  0 siblings, 1 reply; 27+ messages in thread
From: Martin K. Petersen @ 2018-05-02  2:54 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Dave Chinner, linux-xfs, linux-block, hch


Jens,

> diff --git a/block/blk-wbt.h b/block/blk-wbt.h
> index a232c98fbf4d..aec5bc82d580 100644
> --- a/block/blk-wbt.h
> +++ b/block/blk-wbt.h
> @@ -14,12 +14,17 @@ enum wbt_flags {
>  	WBT_TRACKED		= 1,	/* write, tracked for throttling */
>  	WBT_READ		= 2,	/* read */
>  	WBT_KSWAPD		= 4,	/* write, from kswapd */
> +	WBT_TRIM		= 8,

The term TRIM applies to neither NVMe nor SCSI. Please call it
WBT_DISCARD.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHSET 0/2] sync discard
  2018-04-30 15:32 [PATCHSET 0/2] sync discard Jens Axboe
  2018-04-30 15:32 ` [PATCH 1/2] block: add BLKDEV_DISCARD_SYNC flag Jens Axboe
  2018-04-30 15:32 ` [PATCH 2/2] xfs: add 'discard_sync' mount flag Jens Axboe
@ 2018-05-02 12:45 ` Christoph Hellwig
  2018-05-02 14:19   ` Jens Axboe
  2 siblings, 1 reply; 27+ messages in thread
From: Christoph Hellwig @ 2018-05-02 12:45 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-xfs, linux-block, hch

On Mon, Apr 30, 2018 at 09:32:50AM -0600, Jens Axboe wrote:
> We recently had a pretty severe perf regression with the XFS async
> discard. This small series adds a SYNC issue discard flag, and also
> limits the chain size for sync discards. Patch 2 adds support for
> reverting XFS back to doing sync discards.

Please explain the hardware and workload that you see this with.
Also, given that it is a hardware limitation, a quirk limiting the
size of discard requests and/or the number of outstanding discards
would be more useful than falling back to a broken mode of issuing
block commands.

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCHSET 0/2] sync discard
  2018-05-02 12:45 ` [PATCHSET 0/2] sync discard Christoph Hellwig
@ 2018-05-02 14:19   ` Jens Axboe
  0 siblings, 0 replies; 27+ messages in thread
From: Jens Axboe @ 2018-05-02 14:19 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-xfs, linux-block

On 5/2/18 6:45 AM, Christoph Hellwig wrote:
> On Mon, Apr 30, 2018 at 09:32:50AM -0600, Jens Axboe wrote:
>> We recently had a pretty severe perf regression with the XFS async
>> discard. This small series adds a SYNC issue discard flag, and also
>> limits the chain size for sync discards. Patch 2 adds support for
>> reverting XFS back to doing sync discards.
> 
> Please explain the hardware and workload that you see this with.
> Also, given that it is a hardware limitation, a quirk limiting the
> size of discard requests and/or the number of outstanding discards
> would be more useful than falling back to a broken mode of issuing
> block commands.

Honestly, from what I've seen in production, I can whitelist one
device and that one is no longer being produced. The rest is crap
in terms of latency-impacting trim. A quirk or whitelist/blacklist
isn't going to be of any help.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

* Re: [PATCH 2/2] xfs: add 'discard_sync' mount flag
  2018-05-02  2:54                 ` Martin K. Petersen
@ 2018-05-02 14:20                   ` Jens Axboe
  0 siblings, 0 replies; 27+ messages in thread
From: Jens Axboe @ 2018-05-02 14:20 UTC (permalink / raw)
  To: Martin K. Petersen; +Cc: Dave Chinner, linux-xfs, linux-block, hch

On 5/1/18 8:54 PM, Martin K. Petersen wrote:
> 
> Jens,
> 
>> diff --git a/block/blk-wbt.h b/block/blk-wbt.h
>> index a232c98fbf4d..aec5bc82d580 100644
>> --- a/block/blk-wbt.h
>> +++ b/block/blk-wbt.h
>> @@ -14,12 +14,17 @@ enum wbt_flags {
>>  	WBT_TRACKED		= 1,	/* write, tracked for throttling */
>>  	WBT_READ		= 2,	/* read */
>>  	WBT_KSWAPD		= 4,	/* write, from kswapd */
>> +	WBT_TRIM		= 8,
> 
> The term TRIM does not apply to NVMe, nor SCSI. Please call it
> WBT_DISCARD.

Sure, I can do that.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 27+ messages in thread

end of thread

Thread overview: 27+ messages
2018-04-30 15:32 [PATCHSET 0/2] sync discard Jens Axboe
2018-04-30 15:32 ` [PATCH 1/2] block: add BLKDEV_DISCARD_SYNC flag Jens Axboe
2018-04-30 15:32 ` [PATCH 2/2] xfs: add 'discard_sync' mount flag Jens Axboe
2018-04-30 17:19   ` Brian Foster
2018-04-30 18:07     ` Jens Axboe
2018-04-30 18:25       ` Luis R. Rodriguez
2018-04-30 18:31         ` Jens Axboe
2018-04-30 19:19         ` Eric Sandeen
2018-04-30 19:21           ` Jens Axboe
2018-04-30 19:57             ` Eric Sandeen
2018-04-30 19:58               ` Jens Axboe
2018-04-30 22:59                 ` Eric Sandeen
2018-04-30 23:02                   ` Jens Axboe
2018-04-30 19:18       ` Brian Foster
2018-04-30 21:31   ` Dave Chinner
2018-04-30 21:42     ` Jens Axboe
2018-04-30 22:28       ` Dave Chinner
2018-04-30 22:40         ` Jens Axboe
2018-04-30 23:00           ` Jens Axboe
2018-04-30 23:23             ` Dave Chinner
2018-05-01 11:11               ` Brian Foster
2018-05-01 15:23               ` Jens Axboe
2018-05-02  2:54                 ` Martin K. Petersen
2018-05-02 14:20                   ` Jens Axboe
2018-04-30 23:01           ` Darrick J. Wong
2018-05-02 12:45 ` [PATCHSET 0/2] sync discard Christoph Hellwig
2018-05-02 14:19   ` Jens Axboe
