All of lore.kernel.org
 help / color / mirror / Atom feed
From: Brian Foster <bfoster@redhat.com>
To: Christoph Hellwig <hch@lst.de>
Cc: linux-xfs@vger.kernel.org
Subject: Re: [PATCH 2/4] xfs: improve handling of busy extents in the low-level allocator
Date: Fri, 3 Feb 2017 10:22:33 -0500	[thread overview]
Message-ID: <20170203152233.GC45388@bfoster.bfoster> (raw)
In-Reply-To: <1485715421-17182-3-git-send-email-hch@lst.de>

On Sun, Jan 29, 2017 at 07:43:39PM +0100, Christoph Hellwig wrote:
> Currently we force the log and simply try again if we hit a busy extent,
> but especially with online discard enabled it might take a while after
> the log force for the busy extents to disappear, and we might have
> already completed our second pass.
> 
> So instead we add a new waitqueue and a generation counter to the pag
> structure so that we can do wakeups once we've removed busy extents,
> and we replace the single retry with an unconditional one - after
> all we hold the AGF buffer lock, so no other allocations or frees
> can be racing with us in this AG.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  fs/xfs/libxfs/xfs_alloc.c | 93 +++++++++++++++++++++++---------------------
>  fs/xfs/xfs_extent_busy.c  | 98 ++++++++++++++++++++++++++++++++++-------------
>  fs/xfs/xfs_extent_busy.h  |  8 +++-
>  fs/xfs/xfs_mount.c        |  1 +
>  fs/xfs/xfs_mount.h        |  2 +
>  5 files changed, 129 insertions(+), 73 deletions(-)
> 
> diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
> index 9f06a21..fe98fbc 100644
> --- a/fs/xfs/libxfs/xfs_alloc.c
> +++ b/fs/xfs/libxfs/xfs_alloc.c
...
> @@ -1183,8 +1194,8 @@ xfs_alloc_ag_vextent_near(
>  			if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
>  				goto error0;
>  			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
> -			xfs_alloc_compute_aligned(args, ltbno, ltlen,
> -						  &ltbnoa, &ltlena);
> +			busy |= xfs_alloc_compute_aligned(args, ltbno, ltlen,
> +					&ltbnoa, &ltlena, &busy_gen);
>  			if (ltlena >= args->minlen && ltbnoa >= args->min_agbno)
>  				break;
>  			if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
> @@ -1199,8 +1210,8 @@ xfs_alloc_ag_vextent_near(
>  			if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
>  				goto error0;
>  			XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0);
> -			xfs_alloc_compute_aligned(args, gtbno, gtlen,
> -						  &gtbnoa, &gtlena);
> +			busy |= xfs_alloc_compute_aligned(args, gtbno, gtlen,
> +					&gtbnoa, &gtlena, &busy_gen);

Not a big deal, but perhaps in the above two cases where we're
traversing the bnobt, just track the max busy gen and use that being set
non-zero to trigger (hopefully) fewer flushes rather than being subject
to whatever the last value was? Then we don't have to do the 'busy |=
..' thing either. That doesn't cover the overflow case, but that should
be rare and we still have the retry.

>  			if (gtlena >= args->minlen && gtbnoa <= args->max_agbno)
>  				break;
>  			if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
...
> diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
> index 29c2f99..8251359 100644
> --- a/fs/xfs/xfs_extent_busy.c
> +++ b/fs/xfs/xfs_extent_busy.c
> @@ -334,14 +334,18 @@ xfs_extent_busy_reuse(
>   * subset of the extent that is not busy.  If *rlen is smaller than
>   * args->minlen no suitable extent could be found, and the higher level
>   * code needs to force out the log and retry the allocation.
> + *
> + * Return the current discard generation for the AG if the file system
> + * has online discard enabled.  This value can be used to wait for
> + * the trimmed extent to become fully available if the AG is running out
> + * of space.
>   */
> -void
> +bool
>  xfs_extent_busy_trim(
>  	struct xfs_alloc_arg	*args,
> -	xfs_agblock_t		bno,
> -	xfs_extlen_t		len,
> -	xfs_agblock_t		*rbno,
> -	xfs_extlen_t		*rlen)
> +	xfs_agblock_t		*bno,
> +	xfs_extlen_t		*len,
> +	unsigned		*busy_gen)
>  {
>  	xfs_agblock_t		fbno;
>  	xfs_extlen_t		flen;
> @@ -351,8 +355,8 @@ xfs_extent_busy_trim(
>  
>  	spin_lock(&args->pag->pagb_lock);
>  restart:
> -	fbno = bno;
> -	flen = len;
> +	fbno = *bno;
> +	flen = *len;
>  	rbp = args->pag->pagb_tree.rb_node;
>  	while (rbp && flen >= args->minlen) {
>  		struct xfs_extent_busy *busyp =
> @@ -504,24 +508,25 @@ xfs_extent_busy_trim(
>  
>  		flen = fend - fbno;
>  	}
> +out:
>  	spin_unlock(&args->pag->pagb_lock);
>  
> -	if (fbno != bno || flen != len) {
> -		trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len,
> +	if (fbno != *bno || flen != *len) {
> +		trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
>  					  fbno, flen);
> +		*bno = fbno;
> +		*len = flen;
> +		*busy_gen = args->pag->pagb_gen;
> +		return true;

We've already dropped pagb_lock by the time we grab pagb_gen. What
prevents this from racing with a flush and pagb_gen bump and returning a
gen value that might not have any associated busy extents?

>  	}
> -	*rbno = fbno;
> -	*rlen = flen;
> -	return;
> +	return false;
>  fail:
>  	/*
>  	 * Return a zero extent length as failure indications.  All callers
>  	 * re-check if the trimmed extent satisfies the minlen requirement.
>  	 */
> -	spin_unlock(&args->pag->pagb_lock);
> -	trace_xfs_extent_busy_trim(args->mp, args->agno, bno, len, fbno, 0);
> -	*rbno = fbno;
> -	*rlen = 0;
> +	flen = 0;
> +	goto out;
>  }
>  
>  STATIC void
...
> @@ -554,29 +574,53 @@ xfs_extent_busy_clear(
...
> +/*
> + * Flush out all busy extents for this AG.
> + */
> +void
> +xfs_extent_busy_flush(
> +	struct xfs_mount	*mp,
> +	struct xfs_perag	*pag,
> +	unsigned		busy_gen)
> +{
> +	DEFINE_WAIT		(wait);
> +	int			log_flushed = 0, error;
> +
> +	trace_xfs_log_force(mp, 0, _THIS_IP_);
> +	error = _xfs_log_force(mp, XFS_LOG_SYNC, &log_flushed);
> +	if (error)
> +		return;
> +
> +	while (busy_gen == READ_ONCE(pag->pagb_gen)) {
> +		prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
> +		schedule();
>  	}
> +	finish_wait(&pag->pagb_wait, &wait);

This seems racy. Shouldn't this do something like:

	do {
		prepare_to_wait();
		if (busy_gen != pagb_gen)
			break;
		schedule();
		finish_wait();
	} while (1);
	finish_wait();

... to make sure we don't lose a wakeup between setting the task state
and actually scheduling out?

>  }
>  
>  /*
...
> diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
> index 7f351f7..7363499 100644
> --- a/fs/xfs/xfs_mount.h
> +++ b/fs/xfs/xfs_mount.h
> @@ -384,6 +384,8 @@ typedef struct xfs_perag {
>  	xfs_agino_t	pagl_rightrec;
>  	spinlock_t	pagb_lock;	/* lock for pagb_tree */
>  	struct rb_root	pagb_tree;	/* ordered tree of busy extents */
> +	unsigned int	pagb_gen;
> +	wait_queue_head_t pagb_wait;

Can we add some comments here similar to the other fields? Also, how
about slightly more informative names... pagb_discard_[gen|wait], or
pagb_busy_*?

Brian

>  
>  	atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
>  
> -- 
> 2.1.4
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-xfs" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

  reply	other threads:[~2017-02-03 15:22 UTC|newest]

Thread overview: 24+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-01-29 18:43 improve busy extent handling and add async discard support Christoph Hellwig
2017-01-29 18:43 ` [PATCH 1/4] xfs: don't fail xfs_extent_busy allocation Christoph Hellwig
2017-02-03 15:20   ` Brian Foster
2017-02-04  9:50     ` Christoph Hellwig
2017-02-06 16:43       ` Brian Foster
2017-02-07  9:42         ` Christoph Hellwig
2017-01-29 18:43 ` [PATCH 2/4] xfs: improve handling of busy extents in the low-level allocator Christoph Hellwig
2017-02-03 15:22   ` Brian Foster [this message]
2017-02-04  9:54     ` Christoph Hellwig
2017-02-03 16:22   ` Brian Foster
2017-02-04  9:56     ` Christoph Hellwig
2017-02-06 16:47       ` Brian Foster
2017-02-07  9:43         ` Christoph Hellwig
2017-02-07 13:13           ` Brian Foster
2017-02-07 15:45             ` Christoph Hellwig
2017-01-29 18:43 ` [PATCH 3/4] xfs: improve busy extent sorting Christoph Hellwig
2017-02-03 15:22   ` Brian Foster
2017-02-04  9:58     ` Christoph Hellwig
2017-01-29 18:43 ` [PATCH 4/4] xfs: don't block the log commit handler for discards Christoph Hellwig
2017-02-03 16:22   ` Brian Foster
2017-02-04  9:59     ` Christoph Hellwig
2017-02-06 16:49       ` Brian Foster
2017-02-07  9:50         ` Christoph Hellwig
2017-02-05 17:11 improve busy extent handling and add async discard support V2 Christoph Hellwig
2017-02-05 17:11 ` [PATCH 2/4] xfs: improve handling of busy extents in the low-level allocator Christoph Hellwig

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170203152233.GC45388@bfoster.bfoster \
    --to=bfoster@redhat.com \
    --cc=hch@lst.de \
    --cc=linux-xfs@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.