All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] xfs: Do background CIL flushes via a workqueue
@ 2012-03-27  9:46 Dave Chinner
  2012-03-27 14:31 ` Christoph Hellwig
  0 siblings, 1 reply; 10+ messages in thread
From: Dave Chinner @ 2012-03-27  9:46 UTC (permalink / raw)
  To: xfs

From: Dave Chinner <dchinner@redhat.com>

Doing background CIL flushes adds significant latency to whatever
async transaction that triggers it. To avoid blocking async
transactions on things like waiting for log buffer IO to complete,
move the CIL push off into a workqueue.  By moving the push work
into a workqueue, we remove all the latency that the commit adds
from the foreground transaction commit path. This also means that
single threaded workloads won't do the CIL push procssing, leaving
them more CPU to do more async transactions.

To do this, we need to keep track of the sequence number we have
pushed work for. This avoids having many transaction commits
attempting to schedule work for the same sequence, and ensures that
we only ever have one push (background or forced) in progress at a
time. It also means that we don't need to take the CIL lock in write
mode to check for potential background push races, which reduces
lock contention.

To avoid potential issues with "smart" IO schedulers, don't use the
workqueue for log force triggered flushes. Instead, do them directly
so that the log IO is done directly by the process issuing the log
force and so doesn't get stuck on IO elevator queue idling
incorrectly delaying the log IO from the workqueue.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log_cil.c  |  241 ++++++++++++++++++++++++++++++-------------------
 fs/xfs/xfs_log_priv.h |    4 +
 fs/xfs/xfs_super.c    |    6 ++
 3 files changed, 158 insertions(+), 93 deletions(-)

diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index d4fadbe..6a5a7ba 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -31,57 +31,7 @@
 #include "xfs_alloc.h"
 #include "xfs_discard.h"
 
-/*
- * Perform initial CIL structure initialisation.
- */
-int
-xlog_cil_init(
-	struct log	*log)
-{
-	struct xfs_cil	*cil;
-	struct xfs_cil_ctx *ctx;
-
-	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
-	if (!cil)
-		return ENOMEM;
-
-	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
-	if (!ctx) {
-		kmem_free(cil);
-		return ENOMEM;
-	}
-
-	INIT_LIST_HEAD(&cil->xc_cil);
-	INIT_LIST_HEAD(&cil->xc_committing);
-	spin_lock_init(&cil->xc_cil_lock);
-	init_rwsem(&cil->xc_ctx_lock);
-	init_waitqueue_head(&cil->xc_commit_wait);
-
-	INIT_LIST_HEAD(&ctx->committing);
-	INIT_LIST_HEAD(&ctx->busy_extents);
-	ctx->sequence = 1;
-	ctx->cil = cil;
-	cil->xc_ctx = ctx;
-	cil->xc_current_sequence = ctx->sequence;
-
-	cil->xc_log = log;
-	log->l_cilp = cil;
-	return 0;
-}
-
-void
-xlog_cil_destroy(
-	struct log	*log)
-{
-	if (log->l_cilp->xc_ctx) {
-		if (log->l_cilp->xc_ctx->ticket)
-			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
-		kmem_free(log->l_cilp->xc_ctx);
-	}
-
-	ASSERT(list_empty(&log->l_cilp->xc_cil));
-	kmem_free(log->l_cilp);
-}
+struct workqueue_struct *xfs_cil_wq;
 
 /*
  * Allocate a new ticket. Failing to get a new ticket makes it really hard to
@@ -426,8 +376,7 @@ xlog_cil_committed(
  */
 STATIC int
 xlog_cil_push(
-	struct log		*log,
-	xfs_lsn_t		push_seq)
+	struct log		*log)
 {
 	struct xfs_cil		*cil = log->l_cilp;
 	struct xfs_log_vec	*lv;
@@ -443,39 +392,35 @@ xlog_cil_push(
 	struct xfs_log_iovec	lhdr;
 	struct xfs_log_vec	lvhdr = { NULL };
 	xfs_lsn_t		commit_lsn;
+	xfs_lsn_t		push_seq;
 
 	if (!cil)
 		return 0;
 
-	ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
-
 	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
 	new_ctx->ticket = xlog_cil_ticket_alloc(log);
 
-	/*
-	 * Lock out transaction commit, but don't block for background pushes
-	 * unless we are well over the CIL space limit. See the definition of
-	 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
-	 * used here.
-	 */
-	if (!down_write_trylock(&cil->xc_ctx_lock)) {
-		if (!push_seq &&
-		    cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
-			goto out_free_ticket;
-		down_write(&cil->xc_ctx_lock);
-	}
+	down_write(&cil->xc_ctx_lock);
 	ctx = cil->xc_ctx;
 
-	/* check if we've anything to push */
-	if (list_empty(&cil->xc_cil))
-		goto out_skip;
+	spin_lock(&cil->xc_cil_lock);
+	push_seq = cil->xc_push_seq;
+	ASSERT(push_seq > 0 && push_seq <= ctx->sequence);
 
-	/* check for spurious background flush */
-	if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+	/*
+	 * Check if we've anything to push. If there is nothing, then we don't
+	 * move on to a new sequence number and so we have to be able to push
+	 * this sequence again later.
+	 */
+	if (list_empty(&cil->xc_cil)) {
+		cil->xc_push_seq = 0;
+		spin_unlock(&cil->xc_cil_lock);
 		goto out_skip;
+	}
+	spin_unlock(&cil->xc_cil_lock);
 
 	/* check for a previously pushed seqeunce */
-	if (push_seq && push_seq < cil->xc_ctx->sequence)
+	if (push_seq < cil->xc_ctx->sequence)
 		goto out_skip;
 
 	/*
@@ -629,7 +574,6 @@ restart:
 
 out_skip:
 	up_write(&cil->xc_ctx_lock);
-out_free_ticket:
 	xfs_log_ticket_put(new_ctx->ticket);
 	kmem_free(new_ctx);
 	return 0;
@@ -641,6 +585,80 @@ out_abort:
 	return XFS_ERROR(EIO);
 }
 
+static void
+xlog_cil_push_work(
+	struct work_struct	*work)
+{
+	struct xfs_cil		*cil = container_of(work, struct xfs_cil,
+							xc_push_work);
+	xlog_cil_push(cil->xc_log);
+}
+
+/*
+ * We need to push CIL every so often so we don't cache more than we can fit in
+ * the log. The limit really is that a checkpoint can't be more than half the
+ * log (the current checkpoint is not allowed to overwrite the previous
+ * checkpoint), but commit latency and memory usage limit this to a smaller
+ * size.
+ */
+static void
+xlog_cil_push_background(
+	struct log	*log)
+{
+	struct xfs_cil	*cil = log->l_cilp;
+
+	/*
+	 * The cil won't be empty because we are called while holding the
+	 * context lock so whatever we added to the CIL will still be there
+	 */
+	ASSERT(!list_empty(&cil->xc_cil));
+
+	/*
+	 * don't do a background push if we haven't used up all the
+	 * space available yet.
+	 */
+	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+		return;
+
+	spin_lock(&cil->xc_cil_lock);
+	cil->xc_push_seq = cil->xc_current_sequence;
+	queue_work(xfs_cil_wq, &cil->xc_push_work);
+	spin_unlock(&cil->xc_cil_lock);
+
+}
+
+static void
+xlog_cil_push_foreground(
+	struct log	*log,
+	xfs_lsn_t	push_seq)
+{
+	struct xfs_cil	*cil = log->l_cilp;
+
+	if (!cil)
+		return;
+
+	ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
+
+	/* start on any pending background push to minimise wait time on it */
+	flush_work(&cil->xc_push_work);
+
+	/*
+	 * If the CIL is empty or we've already pushed the sequence then
+	 * there's no work we need to do.
+	 */
+	spin_lock(&cil->xc_cil_lock);
+	if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
+		spin_unlock(&cil->xc_cil_lock);
+		return;
+	}
+
+	cil->xc_push_seq = push_seq;
+	spin_unlock(&cil->xc_cil_lock);
+
+	/* do the push now */
+	xlog_cil_push(log);
+}
+
 /*
  * Commit a transaction with the given vector to the Committed Item List.
  *
@@ -667,7 +685,6 @@ xfs_log_commit_cil(
 {
 	struct log		*log = mp->m_log;
 	int			log_flags = 0;
-	int			push = 0;
 	struct xfs_log_vec	*log_vector;
 
 	if (flags & XFS_TRANS_RELEASE_LOG_RES)
@@ -719,21 +736,9 @@ xfs_log_commit_cil(
 	 */
 	xfs_trans_free_items(tp, *commit_lsn, 0);
 
-	/* check for background commit before unlock */
-	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
-		push = 1;
+	xlog_cil_push_background(log);
 
 	up_read(&log->l_cilp->xc_ctx_lock);
-
-	/*
-	 * We need to push CIL every so often so we don't cache more than we
-	 * can fit in the log. The limit really is that a checkpoint can't be
-	 * more than half the log (the current checkpoint is not allowed to
-	 * overwrite the previous checkpoint), but commit latency and memory
-	 * usage limit this to a smaller size in most cases.
-	 */
-	if (push)
-		xlog_cil_push(log, 0);
 	return 0;
 }
 
@@ -746,9 +751,6 @@ xfs_log_commit_cil(
  *
  * We return the current commit lsn to allow the callers to determine if a
  * iclog flush is necessary following this call.
- *
- * XXX: Initially, just push the CIL unconditionally and return whatever
- * commit lsn is there. It'll be empty, so this is broken for now.
  */
 xfs_lsn_t
 xlog_cil_force_lsn(
@@ -766,8 +768,7 @@ xlog_cil_force_lsn(
 	 * xlog_cil_push() handles racing pushes for the same sequence,
 	 * so no need to deal with it here.
 	 */
-	if (sequence == cil->xc_current_sequence)
-		xlog_cil_push(log, sequence);
+	xlog_cil_push_foreground(log, sequence);
 
 	/*
 	 * See if we can find a previous sequence still committing.
@@ -826,3 +827,57 @@ xfs_log_item_in_current_chkpt(
 		return false;
 	return true;
 }
+
+/*
+ * Perform initial CIL structure initialisation.
+ */
+int
+xlog_cil_init(
+	struct log	*log)
+{
+	struct xfs_cil	*cil;
+	struct xfs_cil_ctx *ctx;
+
+	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
+	if (!cil)
+		return ENOMEM;
+
+	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
+	if (!ctx) {
+		kmem_free(cil);
+		return ENOMEM;
+	}
+
+	INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
+	INIT_LIST_HEAD(&cil->xc_cil);
+	INIT_LIST_HEAD(&cil->xc_committing);
+	spin_lock_init(&cil->xc_cil_lock);
+	init_rwsem(&cil->xc_ctx_lock);
+	init_waitqueue_head(&cil->xc_commit_wait);
+
+	INIT_LIST_HEAD(&ctx->committing);
+	INIT_LIST_HEAD(&ctx->busy_extents);
+	ctx->sequence = 1;
+	ctx->cil = cil;
+	cil->xc_ctx = ctx;
+	cil->xc_current_sequence = ctx->sequence;
+
+	cil->xc_log = log;
+	log->l_cilp = cil;
+	return 0;
+}
+
+void
+xlog_cil_destroy(
+	struct log	*log)
+{
+	if (log->l_cilp->xc_ctx) {
+		if (log->l_cilp->xc_ctx->ticket)
+			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
+		kmem_free(log->l_cilp->xc_ctx);
+	}
+
+	ASSERT(list_empty(&log->l_cilp->xc_cil));
+	kmem_free(log->l_cilp);
+}
+
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 2152900..ea8c076 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -417,8 +417,12 @@ struct xfs_cil {
 	struct list_head	xc_committing;
 	wait_queue_head_t	xc_commit_wait;
 	xfs_lsn_t		xc_current_sequence;
+	struct work_struct	xc_push_work;
+	xfs_lsn_t		xc_push_seq;
 };
 
+extern struct workqueue_struct *xfs_cil_wq;
+
 /*
  * The amount of log space we allow the CIL to aggregate is difficult to size.
  * Whatever we choose, we have to make sure we can get a reservation for the
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index aef50ab..c5059f5 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1634,8 +1634,13 @@ xfs_init_workqueues(void)
 	if (!xfs_alloc_wq)
 		goto out_destroy_syncd;
 
+	xfs_cil_wq = alloc_workqueue("xfscwcilalloc", WQ_MEM_RECLAIM, 0);
+	if (!xfs_cil_wq)
+		goto out_destroy_alloc;
 	return 0;
 
+out_destroy_alloc:
+	destroy_workqueue(xfs_alloc_wq);
 out_destroy_syncd:
 	destroy_workqueue(xfs_syncd_wq);
 	return -ENOMEM;
@@ -1644,6 +1649,7 @@ out_destroy_syncd:
 STATIC void
 xfs_destroy_workqueues(void)
 {
+	destroy_workqueue(xfs_cil_wq);
 	destroy_workqueue(xfs_alloc_wq);
 	destroy_workqueue(xfs_syncd_wq);
 }
-- 
1.7.9

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfs: Do background CIL flushes via a workqueue
  2012-03-27  9:46 [PATCH] xfs: Do background CIL flushes via a workqueue Dave Chinner
@ 2012-03-27 14:31 ` Christoph Hellwig
  2012-03-27 15:57   ` Vivek Goyal
  0 siblings, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2012-03-27 14:31 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Vivek Goyal, xfs

Vivek, does CFQ still need any hints for this sort of handoff?

On Tue, Mar 27, 2012 at 08:46:45PM +1100, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> Doing background CIL flushes adds significant latency to whatever
> async transaction that triggers it. To avoid blocking async
> transactions on things like waiting for log buffer IO to complete,
> move the CIL push off into a workqueue.  By moving the push work
> into a workqueue, we remove all the latency that the commit adds
> from the foreground transaction commit path. This also means that
> single threaded workloads won't do the CIL push procssing, leaving
> them more CPU to do more async transactions.
> 
> To do this, we need to keep track of the sequence number we have
> pushed work for. This avoids having many transaction commits
> attempting to schedule work for the same sequence, and ensures that
> we only ever have one push (background or forced) in progress at a
> time. It also means that we don't need to take the CIL lock in write
> mode to check for potential background push races, which reduces
> lock contention.
> 
> To avoid potential issues with "smart" IO schedulers, don't use the
> workqueue for log force triggered flushes. Instead, do them directly
> so that the log IO is done directly by the process issuing the log
> force and so doesn't get stuck on IO elevator queue idling
> incorrectly delaying the log IO from the workqueue.
> 
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_log_cil.c  |  241 ++++++++++++++++++++++++++++++-------------------
>  fs/xfs/xfs_log_priv.h |    4 +
>  fs/xfs/xfs_super.c    |    6 ++
>  3 files changed, 158 insertions(+), 93 deletions(-)
> 
> diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
> index d4fadbe..6a5a7ba 100644
> --- a/fs/xfs/xfs_log_cil.c
> +++ b/fs/xfs/xfs_log_cil.c
> @@ -31,57 +31,7 @@
>  #include "xfs_alloc.h"
>  #include "xfs_discard.h"
>  
> -/*
> - * Perform initial CIL structure initialisation.
> - */
> -int
> -xlog_cil_init(
> -	struct log	*log)
> -{
> -	struct xfs_cil	*cil;
> -	struct xfs_cil_ctx *ctx;
> -
> -	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
> -	if (!cil)
> -		return ENOMEM;
> -
> -	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
> -	if (!ctx) {
> -		kmem_free(cil);
> -		return ENOMEM;
> -	}
> -
> -	INIT_LIST_HEAD(&cil->xc_cil);
> -	INIT_LIST_HEAD(&cil->xc_committing);
> -	spin_lock_init(&cil->xc_cil_lock);
> -	init_rwsem(&cil->xc_ctx_lock);
> -	init_waitqueue_head(&cil->xc_commit_wait);
> -
> -	INIT_LIST_HEAD(&ctx->committing);
> -	INIT_LIST_HEAD(&ctx->busy_extents);
> -	ctx->sequence = 1;
> -	ctx->cil = cil;
> -	cil->xc_ctx = ctx;
> -	cil->xc_current_sequence = ctx->sequence;
> -
> -	cil->xc_log = log;
> -	log->l_cilp = cil;
> -	return 0;
> -}
> -
> -void
> -xlog_cil_destroy(
> -	struct log	*log)
> -{
> -	if (log->l_cilp->xc_ctx) {
> -		if (log->l_cilp->xc_ctx->ticket)
> -			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
> -		kmem_free(log->l_cilp->xc_ctx);
> -	}
> -
> -	ASSERT(list_empty(&log->l_cilp->xc_cil));
> -	kmem_free(log->l_cilp);
> -}
> +struct workqueue_struct *xfs_cil_wq;
>  
>  /*
>   * Allocate a new ticket. Failing to get a new ticket makes it really hard to
> @@ -426,8 +376,7 @@ xlog_cil_committed(
>   */
>  STATIC int
>  xlog_cil_push(
> -	struct log		*log,
> -	xfs_lsn_t		push_seq)
> +	struct log		*log)
>  {
>  	struct xfs_cil		*cil = log->l_cilp;
>  	struct xfs_log_vec	*lv;
> @@ -443,39 +392,35 @@ xlog_cil_push(
>  	struct xfs_log_iovec	lhdr;
>  	struct xfs_log_vec	lvhdr = { NULL };
>  	xfs_lsn_t		commit_lsn;
> +	xfs_lsn_t		push_seq;
>  
>  	if (!cil)
>  		return 0;
>  
> -	ASSERT(!push_seq || push_seq <= cil->xc_ctx->sequence);
> -
>  	new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
>  	new_ctx->ticket = xlog_cil_ticket_alloc(log);
>  
> -	/*
> -	 * Lock out transaction commit, but don't block for background pushes
> -	 * unless we are well over the CIL space limit. See the definition of
> -	 * XLOG_CIL_HARD_SPACE_LIMIT() for the full explanation of the logic
> -	 * used here.
> -	 */
> -	if (!down_write_trylock(&cil->xc_ctx_lock)) {
> -		if (!push_seq &&
> -		    cil->xc_ctx->space_used < XLOG_CIL_HARD_SPACE_LIMIT(log))
> -			goto out_free_ticket;
> -		down_write(&cil->xc_ctx_lock);
> -	}
> +	down_write(&cil->xc_ctx_lock);
>  	ctx = cil->xc_ctx;
>  
> -	/* check if we've anything to push */
> -	if (list_empty(&cil->xc_cil))
> -		goto out_skip;
> +	spin_lock(&cil->xc_cil_lock);
> +	push_seq = cil->xc_push_seq;
> +	ASSERT(push_seq > 0 && push_seq <= ctx->sequence);
>  
> -	/* check for spurious background flush */
> -	if (!push_seq && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
> +	/*
> +	 * Check if we've anything to push. If there is nothing, then we don't
> +	 * move on to a new sequence number and so we have to be able to push
> +	 * this sequence again later.
> +	 */
> +	if (list_empty(&cil->xc_cil)) {
> +		cil->xc_push_seq = 0;
> +		spin_unlock(&cil->xc_cil_lock);
>  		goto out_skip;
> +	}
> +	spin_unlock(&cil->xc_cil_lock);
>  
>  	/* check for a previously pushed seqeunce */
> -	if (push_seq && push_seq < cil->xc_ctx->sequence)
> +	if (push_seq < cil->xc_ctx->sequence)
>  		goto out_skip;
>  
>  	/*
> @@ -629,7 +574,6 @@ restart:
>  
>  out_skip:
>  	up_write(&cil->xc_ctx_lock);
> -out_free_ticket:
>  	xfs_log_ticket_put(new_ctx->ticket);
>  	kmem_free(new_ctx);
>  	return 0;
> @@ -641,6 +585,80 @@ out_abort:
>  	return XFS_ERROR(EIO);
>  }
>  
> +static void
> +xlog_cil_push_work(
> +	struct work_struct	*work)
> +{
> +	struct xfs_cil		*cil = container_of(work, struct xfs_cil,
> +							xc_push_work);
> +	xlog_cil_push(cil->xc_log);
> +}
> +
> +/*
> + * We need to push CIL every so often so we don't cache more than we can fit in
> + * the log. The limit really is that a checkpoint can't be more than half the
> + * log (the current checkpoint is not allowed to overwrite the previous
> + * checkpoint), but commit latency and memory usage limit this to a smaller
> + * size.
> + */
> +static void
> +xlog_cil_push_background(
> +	struct log	*log)
> +{
> +	struct xfs_cil	*cil = log->l_cilp;
> +
> +	/*
> +	 * The cil won't be empty because we are called while holding the
> +	 * context lock so whatever we added to the CIL will still be there
> +	 */
> +	ASSERT(!list_empty(&cil->xc_cil));
> +
> +	/*
> +	 * don't do a background push if we haven't used up all the
> +	 * space available yet.
> +	 */
> +	if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
> +		return;
> +
> +	spin_lock(&cil->xc_cil_lock);
> +	cil->xc_push_seq = cil->xc_current_sequence;
> +	queue_work(xfs_cil_wq, &cil->xc_push_work);
> +	spin_unlock(&cil->xc_cil_lock);
> +
> +}
> +
> +static void
> +xlog_cil_push_foreground(
> +	struct log	*log,
> +	xfs_lsn_t	push_seq)
> +{
> +	struct xfs_cil	*cil = log->l_cilp;
> +
> +	if (!cil)
> +		return;
> +
> +	ASSERT(push_seq && push_seq <= cil->xc_current_sequence);
> +
> +	/* start on any pending background push to minimise wait time on it */
> +	flush_work(&cil->xc_push_work);
> +
> +	/*
> +	 * If the CIL is empty or we've already pushed the sequence then
> +	 * there's no work we need to do.
> +	 */
> +	spin_lock(&cil->xc_cil_lock);
> +	if (list_empty(&cil->xc_cil) || push_seq <= cil->xc_push_seq) {
> +		spin_unlock(&cil->xc_cil_lock);
> +		return;
> +	}
> +
> +	cil->xc_push_seq = push_seq;
> +	spin_unlock(&cil->xc_cil_lock);
> +
> +	/* do the push now */
> +	xlog_cil_push(log);
> +}
> +
>  /*
>   * Commit a transaction with the given vector to the Committed Item List.
>   *
> @@ -667,7 +685,6 @@ xfs_log_commit_cil(
>  {
>  	struct log		*log = mp->m_log;
>  	int			log_flags = 0;
> -	int			push = 0;
>  	struct xfs_log_vec	*log_vector;
>  
>  	if (flags & XFS_TRANS_RELEASE_LOG_RES)
> @@ -719,21 +736,9 @@ xfs_log_commit_cil(
>  	 */
>  	xfs_trans_free_items(tp, *commit_lsn, 0);
>  
> -	/* check for background commit before unlock */
> -	if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
> -		push = 1;
> +	xlog_cil_push_background(log);
>  
>  	up_read(&log->l_cilp->xc_ctx_lock);
> -
> -	/*
> -	 * We need to push CIL every so often so we don't cache more than we
> -	 * can fit in the log. The limit really is that a checkpoint can't be
> -	 * more than half the log (the current checkpoint is not allowed to
> -	 * overwrite the previous checkpoint), but commit latency and memory
> -	 * usage limit this to a smaller size in most cases.
> -	 */
> -	if (push)
> -		xlog_cil_push(log, 0);
>  	return 0;
>  }
>  
> @@ -746,9 +751,6 @@ xfs_log_commit_cil(
>   *
>   * We return the current commit lsn to allow the callers to determine if a
>   * iclog flush is necessary following this call.
> - *
> - * XXX: Initially, just push the CIL unconditionally and return whatever
> - * commit lsn is there. It'll be empty, so this is broken for now.
>   */
>  xfs_lsn_t
>  xlog_cil_force_lsn(
> @@ -766,8 +768,7 @@ xlog_cil_force_lsn(
>  	 * xlog_cil_push() handles racing pushes for the same sequence,
>  	 * so no need to deal with it here.
>  	 */
> -	if (sequence == cil->xc_current_sequence)
> -		xlog_cil_push(log, sequence);
> +	xlog_cil_push_foreground(log, sequence);
>  
>  	/*
>  	 * See if we can find a previous sequence still committing.
> @@ -826,3 +827,57 @@ xfs_log_item_in_current_chkpt(
>  		return false;
>  	return true;
>  }
> +
> +/*
> + * Perform initial CIL structure initialisation.
> + */
> +int
> +xlog_cil_init(
> +	struct log	*log)
> +{
> +	struct xfs_cil	*cil;
> +	struct xfs_cil_ctx *ctx;
> +
> +	cil = kmem_zalloc(sizeof(*cil), KM_SLEEP|KM_MAYFAIL);
> +	if (!cil)
> +		return ENOMEM;
> +
> +	ctx = kmem_zalloc(sizeof(*ctx), KM_SLEEP|KM_MAYFAIL);
> +	if (!ctx) {
> +		kmem_free(cil);
> +		return ENOMEM;
> +	}
> +
> +	INIT_WORK(&cil->xc_push_work, xlog_cil_push_work);
> +	INIT_LIST_HEAD(&cil->xc_cil);
> +	INIT_LIST_HEAD(&cil->xc_committing);
> +	spin_lock_init(&cil->xc_cil_lock);
> +	init_rwsem(&cil->xc_ctx_lock);
> +	init_waitqueue_head(&cil->xc_commit_wait);
> +
> +	INIT_LIST_HEAD(&ctx->committing);
> +	INIT_LIST_HEAD(&ctx->busy_extents);
> +	ctx->sequence = 1;
> +	ctx->cil = cil;
> +	cil->xc_ctx = ctx;
> +	cil->xc_current_sequence = ctx->sequence;
> +
> +	cil->xc_log = log;
> +	log->l_cilp = cil;
> +	return 0;
> +}
> +
> +void
> +xlog_cil_destroy(
> +	struct log	*log)
> +{
> +	if (log->l_cilp->xc_ctx) {
> +		if (log->l_cilp->xc_ctx->ticket)
> +			xfs_log_ticket_put(log->l_cilp->xc_ctx->ticket);
> +		kmem_free(log->l_cilp->xc_ctx);
> +	}
> +
> +	ASSERT(list_empty(&log->l_cilp->xc_cil));
> +	kmem_free(log->l_cilp);
> +}
> +
> diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
> index 2152900..ea8c076 100644
> --- a/fs/xfs/xfs_log_priv.h
> +++ b/fs/xfs/xfs_log_priv.h
> @@ -417,8 +417,12 @@ struct xfs_cil {
>  	struct list_head	xc_committing;
>  	wait_queue_head_t	xc_commit_wait;
>  	xfs_lsn_t		xc_current_sequence;
> +	struct work_struct	xc_push_work;
> +	xfs_lsn_t		xc_push_seq;
>  };
>  
> +extern struct workqueue_struct *xfs_cil_wq;
> +
>  /*
>   * The amount of log space we allow the CIL to aggregate is difficult to size.
>   * Whatever we choose, we have to make sure we can get a reservation for the
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index aef50ab..c5059f5 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -1634,8 +1634,13 @@ xfs_init_workqueues(void)
>  	if (!xfs_alloc_wq)
>  		goto out_destroy_syncd;
>  
> +	xfs_cil_wq = alloc_workqueue("xfscwcilalloc", WQ_MEM_RECLAIM, 0);
> +	if (!xfs_cil_wq)
> +		goto out_destroy_alloc;
>  	return 0;
>  
> +out_destroy_alloc:
> +	destroy_workqueue(xfs_alloc_wq);
>  out_destroy_syncd:
>  	destroy_workqueue(xfs_syncd_wq);
>  	return -ENOMEM;
> @@ -1644,6 +1649,7 @@ out_destroy_syncd:
>  STATIC void
>  xfs_destroy_workqueues(void)
>  {
> +	destroy_workqueue(xfs_cil_wq);
>  	destroy_workqueue(xfs_alloc_wq);
>  	destroy_workqueue(xfs_syncd_wq);
>  }
> -- 
> 1.7.9
> 
> _______________________________________________
> xfs mailing list
> xfs@oss.sgi.com
> http://oss.sgi.com/mailman/listinfo/xfs
---end quoted text---

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfs: Do background CIL flushes via a workqueue
  2012-03-27 14:31 ` Christoph Hellwig
@ 2012-03-27 15:57   ` Vivek Goyal
  2012-03-27 16:03     ` Christoph Hellwig
  0 siblings, 1 reply; 10+ messages in thread
From: Vivek Goyal @ 2012-03-27 15:57 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Tue, Mar 27, 2012 at 10:31:27AM -0400, Christoph Hellwig wrote:
> Vivek, does CFQ still need any hints for this sort of handoff?
> 

Christoph, I don't understand the issue enough to comment on it.

Had a quick look at the patch. Looks like some action (writing log), has
been moved to a worker thread.  And in some cases (log force triggered
flush, whatever it is), we seem to prefer to do it from the submitter's
context.

> > To avoid potential issues with "smart" IO schedulers, don't use the
> > workqueue for log force triggered flushes. Instead, do them directly
> > so that the log IO is done directly by the process issuing the log
> > force and so doesn't get stuck on IO elevator queue idling
> > incorrectly delaying the log IO from the workqueue.

Dave, can you explain a bit more that how CFQ idling in the above case.
You seem to be saying that if we do more idling if this operation is
done from worker thread context. I don't udnerstand why.

In general, one can run into issues with CFQ if there are two processes
doing IO and they are dependent on each other for completion of IO. CFQ
will not know about this dependency and might end up waiting for IO from
one process while next IO will come from next process now. 

So I am not sure why "smart" IO schedulers will run into issues if we
use a worker thread to do log IO. Also not sure, why same thing is right
thing to do when it is not forced flush.

Thanks
Vivek

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfs: Do background CIL flushes via a workqueue
  2012-03-27 15:57   ` Vivek Goyal
@ 2012-03-27 16:03     ` Christoph Hellwig
  2012-03-27 16:19       ` Vivek Goyal
  0 siblings, 1 reply; 10+ messages in thread
From: Christoph Hellwig @ 2012-03-27 16:03 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Christoph Hellwig, xfs

On Tue, Mar 27, 2012 at 11:57:59AM -0400, Vivek Goyal wrote:
> On Tue, Mar 27, 2012 at 10:31:27AM -0400, Christoph Hellwig wrote:
> > Vivek, does CFQ still need any hints for this sort of handoff?
> > 
> 
> Christoph, I don't understand the issue enough to comment on it.
> 
> Had a quick look at the patch. Looks like some action (writing log), has
> been moved to a worker thread.  And in some cases (log force triggered
> flush, whatever it is), we seem to prefer to do it from the submitter's
> context.

Yes.  This is to workaround the old problem of cfq getting utterly
confused if cooperating I/O beeing submitted from different threads.

The case in the previous version of this patch was:

 - thread doing the fsync will write out data, and wait for it
 - then we'd force the log by kicking a workqueue and waiting for it

quite similar to the ext3/4 fsync issues that we had long discussions
about.

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfs: Do background CIL flushes via a workqueue
  2012-03-27 16:03     ` Christoph Hellwig
@ 2012-03-27 16:19       ` Vivek Goyal
  2012-03-27 17:03         ` Christoph Hellwig
  2012-03-29  8:51         ` Dave Chinner
  0 siblings, 2 replies; 10+ messages in thread
From: Vivek Goyal @ 2012-03-27 16:19 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Tue, Mar 27, 2012 at 12:03:00PM -0400, Christoph Hellwig wrote:
> On Tue, Mar 27, 2012 at 11:57:59AM -0400, Vivek Goyal wrote:
> > On Tue, Mar 27, 2012 at 10:31:27AM -0400, Christoph Hellwig wrote:
> > > Vivek, does CFQ still need any hints for this sort of handoff?
> > > 
> > 
> > Christoph, I don't understand the issue enough to comment on it.
> > 
> > Had a quick look at the patch. Looks like some action (writing log), has
> > been moved to a worker thread.  And in some cases (log force triggered
> > flush, whatever it is), we seem to prefer to do it from the submitter's
> > context.
> 
> Yes.  This is to workaround the old problem of cfq getting utterly
> confused if cooperating I/O beeing submitted from different threads.
> 
> The case in the previous version of this patch was:
> 
>  - thread doing the fsync will write out data, and wait for it
>  - then we'd force the log by kicking a workqueue and waiting for it
> 
> quite similar to the ext3/4 fsync issues that we had long discussions
> about.

Ok, then I think that fundamental issue still remains with CFQ. And there
is no general solution to recognizing dependency between processes.

But a specific workaround  for ext3/ext4 fsync problem was put by corrado
long back.

commit 749ef9f8423054e326f3a246327ed2db4b6d395f
Author: Corrado Zoccolo <czoccolo@gmail.com>
Date:   Mon Sep 20 15:24:50 2010 +0200

    cfq: improve fsync performance for small files

Basically, I think previously journal commits were "WRITE" and were
showing most likely on async IO tree. And "fsync" IO was synchronous
and probably showing up on "sync-noidle" tree. CFQ does idling before
it switches between trees hence transition from one process to other
was slow.

Now corrado, changed the IO type from journaling thread to "WRITE_SYNC"
which makes writes synchronous and sets the REQ_NOIDLE flag. Hence forcing
"journal" thread to show up on "sync-noidle" tree. I think "fsync" was
already there so effectively both the processes are on same service tree
and we don't idle between processes when they are on "sync-noidle" tree.

So xfs either need to resort to similar optimizaiton where IO type from
both the process context is of same type or try to do all the IO from
one process context. 

AFAIK, CFQ does not have any generic mechanism to detect process IO
dependecny.

Thanks
Vivek

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfs: Do background CIL flushes via a workqueue
  2012-03-27 16:19       ` Vivek Goyal
@ 2012-03-27 17:03         ` Christoph Hellwig
  2012-03-27 17:18           ` Vivek Goyal
                             ` (2 more replies)
  2012-03-29  8:51         ` Dave Chinner
  1 sibling, 3 replies; 10+ messages in thread
From: Christoph Hellwig @ 2012-03-27 17:03 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Christoph Hellwig, xfs

On Tue, Mar 27, 2012 at 12:19:41PM -0400, Vivek Goyal wrote:
> So xfs either need to resort to similar optimizaiton where IO type from
> both the process context is of same type or try to do all the IO from
> one process context. 

All XFS log I/O is marked SYNC and (FUA and/or FLUSH).

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfs: Do background CIL flushes via a workqueue
  2012-03-27 17:03         ` Christoph Hellwig
@ 2012-03-27 17:18           ` Vivek Goyal
  2012-03-27 17:23           ` Vivek Goyal
  2012-03-29  8:52           ` Dave Chinner
  2 siblings, 0 replies; 10+ messages in thread
From: Vivek Goyal @ 2012-03-27 17:18 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Tue, Mar 27, 2012 at 01:03:09PM -0400, Christoph Hellwig wrote:
> On Tue, Mar 27, 2012 at 12:19:41PM -0400, Vivek Goyal wrote:
> > So xfs either need to resort to similar optimizaiton where IO type from
> > both the process context is of same type or try to do all the IO from
> > one process context. 
> 
> All XFS log I/O is marked SYNC and (FUA and/or FLUSH).

In that case I am not sure if XFS should still face the idling issue. If
it does, then we need to debug it further and see why the fix is not
working.

Dave, any chance of 10 seconds of blktrace data when this problem happens?

Thanks
Vivek

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfs: Do background CIL flushes via a workqueue
  2012-03-27 17:03         ` Christoph Hellwig
  2012-03-27 17:18           ` Vivek Goyal
@ 2012-03-27 17:23           ` Vivek Goyal
  2012-03-29  8:52           ` Dave Chinner
  2 siblings, 0 replies; 10+ messages in thread
From: Vivek Goyal @ 2012-03-27 17:23 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: xfs

On Tue, Mar 27, 2012 at 01:03:09PM -0400, Christoph Hellwig wrote:
> On Tue, Mar 27, 2012 at 12:19:41PM -0400, Vivek Goyal wrote:
> > So xfs either need to resort to similar optimizaiton where IO type from
> > both the process context is of same type or try to do all the IO from
> > one process context. 
> 
> All XFS log I/O is marked SYNC and (FUA and/or FLUSH).

Well, if all the requests are marked with FUA/FLUSH, then I think these
requests will not even be given to IO scheduler. And we should not have
the issue of idling.

        if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
                spin_lock_irq(q->queue_lock);
                where = ELEVATOR_INSERT_FLUSH;
                goto get_rq;
        }

Thanks
Vivek

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfs: Do background CIL flushes via a workqueue
  2012-03-27 16:19       ` Vivek Goyal
  2012-03-27 17:03         ` Christoph Hellwig
@ 2012-03-29  8:51         ` Dave Chinner
  1 sibling, 0 replies; 10+ messages in thread
From: Dave Chinner @ 2012-03-29  8:51 UTC (permalink / raw)
  To: Vivek Goyal; +Cc: Christoph Hellwig, xfs

On Tue, Mar 27, 2012 at 12:19:41PM -0400, Vivek Goyal wrote:
> On Tue, Mar 27, 2012 at 12:03:00PM -0400, Christoph Hellwig wrote:
> > On Tue, Mar 27, 2012 at 11:57:59AM -0400, Vivek Goyal wrote:
> > > On Tue, Mar 27, 2012 at 10:31:27AM -0400, Christoph Hellwig wrote:
> > > > Vivek, does CFQ still need any hints for this sort of handoff?
> > > > 
> > > 
> > > Christoph, I don't understand the issue enough to comment on it.
> > > 
> > > Had a quick look at the patch. Looks like some action (writing log), has
> > > been moved to a worker thread.  And in some cases (log force triggered
> > > flush, whatever it is), we seem to prefer to do it from the submitter's
> > > context.
> > 
> > Yes.  This is to workaround the old problem of cfq getting utterly
> > confused if cooperating I/O beeing submitted from different threads.
> > 
> > The case in the previous version of this patch was:
> > 
> >  - thread doing the fsync will write out data, and wait for it
> >  - then we'd force the log by kicking a workqueue and waiting for it
> > 
> > quite similar to the ext3/4 fsync issues that we had long discussions
> > about.
> 
> Ok, then I think that fundamental issue still remains with CFQ. And there
> is no general solution to recognizing dependency between processes.
> 
> But a specific workaround  for ext3/ext4 fsync problem was put by corrado
> long back.
> 
> commit 749ef9f8423054e326f3a246327ed2db4b6d395f
> Author: Corrado Zoccolo <czoccolo@gmail.com>
> Date:   Mon Sep 20 15:24:50 2010 +0200
> 
>     cfq: improve fsync performance for small files
> 
> Basically, I think previously journal commits were "WRITE" and were
> showing most likely on async IO tree. And "fsync" IO was synchronous
> and probably showing up on "sync-noidle" tree. CFQ does idling before
> it switches between trees hence transition from one process to other
> was slow.
> 
> Now corrado, changed the IO type from journaling thread to "WRITE_SYNC"
> which makes writes synchronous and sets the REQ_NOIDLE flag. Hence forcing
> "journal" thread to show up on "sync-noidle" tree. I think "fsync" was
> already there so effectively both the processes are on same service tree
> and we don't idle between processes when they are on "sync-noidle" tree.

Oh, that hack.  Have a look at the patch Jan Kara just posted to the
ext4 list (jbd: Refine commit writeout logic,
http://comments.gmane.org/gmane.comp.file-systems.ext4/31704) where
he removes that WRITE_SYNC from the ext4 journal commit to reduce
read latencies in the presence of writes by an order of magnitude.
IOWs, we need to revert the hack that ext4 uses to work around the
CFQ-caused fsync latency problem because it causes much worse read
latency problems on CFQ.

That is, we simply can't fix CFQ idling heuristic deficiencies at the
filesystem level simply by changing the IO classification. All we
can chosen from is bad performance on workload A or bad performance
on workload B, neither of which are particularly appealing.

> So xfs either need to resort to similar optimizaiton where IO type from
> both the process context is of same type or try to do all the IO from
> one process context. 

Submiting all the IO from the same context is exactly what this
patch does - it avoids the whole steaming pile of CFQ problems that
way.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] xfs: Do background CIL flushes via a workqueue
  2012-03-27 17:03         ` Christoph Hellwig
  2012-03-27 17:18           ` Vivek Goyal
  2012-03-27 17:23           ` Vivek Goyal
@ 2012-03-29  8:52           ` Dave Chinner
  2 siblings, 0 replies; 10+ messages in thread
From: Dave Chinner @ 2012-03-29  8:52 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Vivek Goyal, xfs

On Tue, Mar 27, 2012 at 01:03:09PM -0400, Christoph Hellwig wrote:
> On Tue, Mar 27, 2012 at 12:19:41PM -0400, Vivek Goyal wrote:
> > So xfs either need to resort to similar optimizaiton where IO type from
> > both the process context is of same type or try to do all the IO from
> > one process context. 
> 
> All XFS log I/O is marked SYNC and (FUA and/or FLUSH).

Not if the filesystem is mounted nobarrier, as is recommended
for filesystems on hardware RAID with BBWCs....

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

_______________________________________________
xfs mailing list
xfs@oss.sgi.com
http://oss.sgi.com/mailman/listinfo/xfs

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2012-03-29  8:52 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-03-27  9:46 [PATCH] xfs: Do background CIL flushes via a workqueue Dave Chinner
2012-03-27 14:31 ` Christoph Hellwig
2012-03-27 15:57   ` Vivek Goyal
2012-03-27 16:03     ` Christoph Hellwig
2012-03-27 16:19       ` Vivek Goyal
2012-03-27 17:03         ` Christoph Hellwig
2012-03-27 17:18           ` Vivek Goyal
2012-03-27 17:23           ` Vivek Goyal
2012-03-29  8:52           ` Dave Chinner
2012-03-29  8:51         ` Dave Chinner

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.