* [PATCH] xfs: Remove the entries from the queue while waking them up.
@ 2011-11-18 19:20 Chandra Seetharaman
  2011-11-19  1:14 ` Dave Chinner
  2011-11-19 18:19 ` Christoph Hellwig
  0 siblings, 2 replies; 9+ messages in thread
From: Chandra Seetharaman @ 2011-11-18 19:20 UTC (permalink / raw)
  To: XFS Mailing List

l_reserveq and l_writeq maintain lists of processes waiting to get log
space. A process is supposed to get onto the list when the amount of
free space available in the log is less than what it needs.

When space becomes available, the current code wakes up the processes
but expects them to remove themselves from the queue.

Since the lock protecting the list is only acquired later by the woken
up process, there is a window of time where a new process looking for
space can wrongly add itself to the queue even though enough space is
available.

Since there is enough space available, this process will never be woken
up, so it hangs.

This was originally reported by Alex Elder <aelder@sgi.com> as a hang
seen in xfstests #234.

With lots of log activity this problem may not be seen, as some other
process will push the waiters along. But test 234 does only quota
operations, hence the problem was noticed in that test.

This patch fixes the problem by removing the element from the queue
(safely) when the process is woken up.

Reported-by: Alex Elder <aelder@sgi.com>
Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>

---
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index a14cd89..9941fcb 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -674,7 +674,7 @@ void
 xfs_log_move_tail(xfs_mount_t	*mp,
 		  xfs_lsn_t	tail_lsn)
 {
-	xlog_ticket_t	*tic;
+	xlog_ticket_t	*tic, *tmp;
 	xlog_t		*log = mp->m_log;
 	int		need_bytes, free_bytes;
 
@@ -695,7 +695,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 #endif
 		spin_lock(&log->l_grant_write_lock);
 		free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-		list_for_each_entry(tic, &log->l_writeq, t_queue) {
+		list_for_each_entry_safe(tic, tmp, &log->l_writeq, t_queue) {
 			ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
 
 			if (free_bytes < tic->t_unit_res && tail_lsn != 1)
@@ -703,6 +703,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 			tail_lsn = 0;
 			free_bytes -= tic->t_unit_res;
 			trace_xfs_log_regrant_write_wake_up(log, tic);
+			list_del_init(&tic->t_queue);
 			wake_up(&tic->t_wait);
 		}
 		spin_unlock(&log->l_grant_write_lock);
@@ -715,7 +716,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 #endif
 		spin_lock(&log->l_grant_reserve_lock);
 		free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-		list_for_each_entry(tic, &log->l_reserveq, t_queue) {
+		list_for_each_entry_safe(tic, tmp, &log->l_reserveq, t_queue) {
 			if (tic->t_flags & XLOG_TIC_PERM_RESERV)
 				need_bytes = tic->t_unit_res*tic->t_cnt;
 			else
@@ -725,6 +726,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
 			tail_lsn = 0;
 			free_bytes -= need_bytes;
 			trace_xfs_log_grant_wake_up(log, tic);
+			list_del_init(&tic->t_queue);
 			wake_up(&tic->t_wait);
 		}
 		spin_unlock(&log->l_grant_reserve_lock);
@@ -2550,8 +2552,7 @@ redo:
 	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
 	if (free_bytes < need_bytes) {
 		spin_lock(&log->l_grant_reserve_lock);
-		if (list_empty(&tic->t_queue))
-			list_add_tail(&tic->t_queue, &log->l_reserveq);
+		list_add_tail(&tic->t_queue, &log->l_reserveq);
 
 		trace_xfs_log_grant_sleep2(log, tic);
 
@@ -2567,12 +2568,6 @@ redo:
 		goto redo;
 	}
 
-	if (!list_empty(&tic->t_queue)) {
-		spin_lock(&log->l_grant_reserve_lock);
-		list_del_init(&tic->t_queue);
-		spin_unlock(&log->l_grant_reserve_lock);
-	}
-
 	/* we've got enough space */
 	xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
 	xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
@@ -2626,30 +2621,28 @@ xlog_regrant_write_log_space(xlog_t	   *log,
 		goto error_return_unlocked;
 
 	/* If there are other waiters on the queue then give them a
-	 * chance at logspace before us. Wake up the first waiters,
-	 * if we do not wake up all the waiters then go to sleep waiting
-	 * for more free space, otherwise try to get some space for
-	 * this transaction.
+	 * chance at logspace before us. If we do not wake up all
+	 * the waiters then go to sleep waiting for more free space,
+	 * otherwise try to get some space for this transaction.
 	 */
 	need_bytes = tic->t_unit_res;
 	if (!list_empty_careful(&log->l_writeq)) {
-		struct xlog_ticket *ntic;
+		struct xlog_ticket *ntic, *tmp;
 
 		spin_lock(&log->l_grant_write_lock);
 		free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-		list_for_each_entry(ntic, &log->l_writeq, t_queue) {
+		list_for_each_entry_safe(ntic, tmp, &log->l_writeq, t_queue) {
 			ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
 
 			if (free_bytes < ntic->t_unit_res)
 				break;
 			free_bytes -= ntic->t_unit_res;
+			list_del_init(&ntic->t_queue);
 			wake_up(&ntic->t_wait);
 		}
 
-		if (ntic != list_first_entry(&log->l_writeq,
-						struct xlog_ticket, t_queue)) {
-			if (list_empty(&tic->t_queue))
-				list_add_tail(&tic->t_queue, &log->l_writeq);
+		if (!list_empty(&log->l_writeq)) {
+			list_add_tail(&tic->t_queue, &log->l_writeq);
 			trace_xfs_log_regrant_write_sleep1(log, tic);
 
 			xlog_grant_push_ail(log, need_bytes);
@@ -2668,8 +2661,7 @@ redo:
 	free_bytes = xlog_space_left(log, &log->l_grant_write_head);
 	if (free_bytes < need_bytes) {
 		spin_lock(&log->l_grant_write_lock);
-		if (list_empty(&tic->t_queue))
-			list_add_tail(&tic->t_queue, &log->l_writeq);
+		list_add_tail(&tic->t_queue, &log->l_writeq);
 
 		if (XLOG_FORCED_SHUTDOWN(log))
 			goto error_return;
@@ -2684,12 +2676,6 @@ redo:
 		goto redo;
 	}
 
-	if (!list_empty(&tic->t_queue)) {
-		spin_lock(&log->l_grant_write_lock);
-		list_del_init(&tic->t_queue);
-		spin_unlock(&log->l_grant_write_lock);
-	}
-
 	/* we've got enough space */
 	xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
 	trace_xfs_log_regrant_write_exit(log, tic);
@@ -3621,7 +3607,7 @@ xfs_log_force_umount(
 	struct xfs_mount	*mp,
 	int			logerror)
 {
-	xlog_ticket_t	*tic;
+	xlog_ticket_t	*tic, *tmp;
 	xlog_t		*log;
 	int		retval;
 
@@ -3690,13 +3676,17 @@ xfs_log_force_umount(
 	 * action is protected by the grant locks.
 	 */
 	spin_lock(&log->l_grant_reserve_lock);
-	list_for_each_entry(tic, &log->l_reserveq, t_queue)
+	list_for_each_entry_safe(tic, tmp, &log->l_reserveq, t_queue) {
+		list_del_init(&tic->t_queue);
 		wake_up(&tic->t_wait);
+	}
 	spin_unlock(&log->l_grant_reserve_lock);
 
 	spin_lock(&log->l_grant_write_lock);
-	list_for_each_entry(tic, &log->l_writeq, t_queue)
+	list_for_each_entry_safe(tic, tmp, &log->l_writeq, t_queue) {
+		list_del_init(&tic->t_queue);
 		wake_up(&tic->t_wait);
+	}
 	spin_unlock(&log->l_grant_write_lock);
 
 	if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) {



* Re: [PATCH] xfs: Remove the entries from the queue while waking them up.
  2011-11-18 19:20 [PATCH] xfs: Remove the entries from the queue while waking them up Chandra Seetharaman
@ 2011-11-19  1:14 ` Dave Chinner
  2011-11-21 23:01   ` Chandra Seetharaman
  2011-11-19 18:19 ` Christoph Hellwig
  1 sibling, 1 reply; 9+ messages in thread
From: Dave Chinner @ 2011-11-19  1:14 UTC (permalink / raw)
  To: Chandra Seetharaman; +Cc: XFS Mailing List

On Fri, Nov 18, 2011 at 01:20:54PM -0600, Chandra Seetharaman wrote:
> l_reserveq and l_writeq maintain lists of processes waiting to get log
> space. A process is supposed to get onto the list when the amount of
> free space available in the log is less than what it needs.
> 
> When space becomes available, the current code wakes up the processes
> but expects them to remove themselves from the queue.
> 
> Since the lock protecting the list is only acquired later by the woken
> up process, there is a window of time where a new process looking for
> space can wrongly add itself to the queue even though enough space is
> available.
> 
> Since there is enough space available, this process will never be woken
> up, so it hangs.

Excellent work, Chandra.

> This was originally reported by Alex Elder <aelder@sgi.com> as a hang
> seen in xfstests #234.
> 
> With lots of log activity this problem may not be seen, as some other
> process will push the waiters along. But test 234 does only quota
> operations, hence the problem was noticed in that test.

Right, and it's only since we made the log reserve path lockless
that we've been tripping over this race condition. I'd say those
changes made it easier to hit, because this race condition has been
there for a long time.

Indeed, the wakeup race was present in the initial commit
for the grant write/reserve head accounting code way back in June of
1994:

http://oss.sgi.com/cgi-bin/gitweb.cgi?p=archive/xfs-import.git;a=commitdiff;h=84181026c6c915d7727a4cf316415dd1e8d5805a

This is effectively a zero-day bug you've just got to the bottom of
- it's one of the oldest bugs ever found in the XFS codebase - and
would explain the occasional random log space hangs that have been
seen over the years and we've never been able to reproduce.

> This patch fixes the problem by removing the element from the queue
> (safely) when the process is woken up.
> 
> Reported-by: Alex Elder <aelder@sgi.com>
> Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>

Couple of comments about the patch below

> ---
> diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
> index a14cd89..9941fcb 100644
> --- a/fs/xfs/xfs_log.c
> +++ b/fs/xfs/xfs_log.c
> @@ -674,7 +674,7 @@ void
>  xfs_log_move_tail(xfs_mount_t	*mp,
>  		  xfs_lsn_t	tail_lsn)
>  {
> -	xlog_ticket_t	*tic;
> +	xlog_ticket_t	*tic, *tmp;

When we modify such declarations, we normally remove the typedef.
Typically we also only declare one variable per line.  Also, it's
best to avoid variables named "tmp". What it really is used for is
the loop next pointer, so "next" is a much better name for it.

i.e:

-	xlog_ticket_t *tic;
+	struct xlog_ticket *tic;
+	struct xlog_ticket *next;

Can you change all the "tmp" variables appropriately?

>  	xlog_t		*log = mp->m_log;
>  	int		need_bytes, free_bytes;
>  
> @@ -695,7 +695,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
>  #endif
>  		spin_lock(&log->l_grant_write_lock);
>  		free_bytes = xlog_space_left(log, &log->l_grant_write_head);
> -		list_for_each_entry(tic, &log->l_writeq, t_queue) {
> +		list_for_each_entry_safe(tic, tmp, &log->l_writeq, t_queue) {
>  			ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
>  
>  			if (free_bytes < tic->t_unit_res && tail_lsn != 1)
> @@ -703,6 +703,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
>  			tail_lsn = 0;
>  			free_bytes -= tic->t_unit_res;
>  			trace_xfs_log_regrant_write_wake_up(log, tic);
> +			list_del_init(&tic->t_queue);
>  			wake_up(&tic->t_wait);
>  		}
>  		spin_unlock(&log->l_grant_write_lock);

A comment here describing the reason why we need to remove the
ticket from the queue before issuing the wakeup would be good. We
normally add comments explaining how we've avoided race conditions
in the code so that in a couple of years time we know why the code
was written that way without having to go looking in the commit
history.
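
Something along these lines would do, for example (wording is just a
suggestion):

	/*
	 * Remove the ticket from the queue before waking the waiter.
	 * Otherwise a new reservation arriving before the woken waiter
	 * retakes the grant lock can see the stale queue entry, add
	 * itself to the queue even though enough space is available,
	 * and then never be woken.
	 */
	list_del_init(&tic->t_queue);
	wake_up(&tic->t_wait);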

> @@ -715,7 +716,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
>  #endif
>  		spin_lock(&log->l_grant_reserve_lock);
>  		free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
> -		list_for_each_entry(tic, &log->l_reserveq, t_queue) {
> +		list_for_each_entry_safe(tic, tmp, &log->l_reserveq, t_queue) {
>  			if (tic->t_flags & XLOG_TIC_PERM_RESERV)
>  				need_bytes = tic->t_unit_res*tic->t_cnt;
>  			else
> @@ -725,6 +726,7 @@ xfs_log_move_tail(xfs_mount_t	*mp,
>  			tail_lsn = 0;
>  			free_bytes -= need_bytes;
>  			trace_xfs_log_grant_wake_up(log, tic);
> +			list_del_init(&tic->t_queue);
>  			wake_up(&tic->t_wait);
>  		}
>  		spin_unlock(&log->l_grant_reserve_lock);
> @@ -2550,8 +2552,7 @@ redo:
>  	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
>  	if (free_bytes < need_bytes) {
>  		spin_lock(&log->l_grant_reserve_lock);
> -		if (list_empty(&tic->t_queue))
> -			list_add_tail(&tic->t_queue, &log->l_reserveq);
> +		list_add_tail(&tic->t_queue, &log->l_reserveq);
>  
>  		trace_xfs_log_grant_sleep2(log, tic);
>  

Ok, we now have the assumption that when we enter this code the
ticket is not on any queue at all. Can you add an
"ASSERT(list_empty(&tic->t_queue));" to the code before the above
xlog_space_left() call? That way we'll know if we violate that
assumption and potentially corrupt memory....
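
i.e. something like this just before the space check (placement sketch
only):

redo:
	if (XLOG_FORCED_SHUTDOWN(log))
		goto error_return_unlocked;

	ASSERT(list_empty(&tic->t_queue));
	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);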

> @@ -2567,12 +2568,6 @@ redo:
>  		goto redo;
>  	}
>  
> -	if (!list_empty(&tic->t_queue)) {
> -		spin_lock(&log->l_grant_reserve_lock);
> -		list_del_init(&tic->t_queue);
> -		spin_unlock(&log->l_grant_reserve_lock);
> -	}
> -
>  	/* we've got enough space */
>  	xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
>  	xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
> @@ -2626,30 +2621,28 @@ xlog_regrant_write_log_space(xlog_t	   *log,
>  		goto error_return_unlocked;
>  
>  	/* If there are other waiters on the queue then give them a
> -	 * chance at logspace before us. Wake up the first waiters,
> -	 * if we do not wake up all the waiters then go to sleep waiting
> -	 * for more free space, otherwise try to get some space for
> -	 * this transaction.
> +	 * chance at logspace before us. If we do not wake up all
> +	 * the waiters then go to sleep waiting for more free space,
> +	 * otherwise try to get some space for this transaction.
>  	 */
>  	need_bytes = tic->t_unit_res;
>  	if (!list_empty_careful(&log->l_writeq)) {
> -		struct xlog_ticket *ntic;
> +		struct xlog_ticket *ntic, *tmp;

tmp -> next

And probably need an assert for tic not being in a queue.

>  
>  		spin_lock(&log->l_grant_write_lock);
>  		free_bytes = xlog_space_left(log, &log->l_grant_write_head);
> -		list_for_each_entry(ntic, &log->l_writeq, t_queue) {
> +		list_for_each_entry_safe(ntic, tmp, &log->l_writeq, t_queue) {
>  			ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
>  
>  			if (free_bytes < ntic->t_unit_res)
>  				break;
>  			free_bytes -= ntic->t_unit_res;
> +			list_del_init(&ntic->t_queue);
>  			wake_up(&ntic->t_wait);
>  		}
>  
> -		if (ntic != list_first_entry(&log->l_writeq,
> -						struct xlog_ticket, t_queue)) {
> -			if (list_empty(&tic->t_queue))
> -				list_add_tail(&tic->t_queue, &log->l_writeq);
> +		if (!list_empty(&log->l_writeq)) {
> +			list_add_tail(&tic->t_queue, &log->l_writeq);

>  			trace_xfs_log_regrant_write_sleep1(log, tic);
>  
>  			xlog_grant_push_ail(log, need_bytes);
> @@ -2668,8 +2661,7 @@ redo:
>  	free_bytes = xlog_space_left(log, &log->l_grant_write_head);
>  	if (free_bytes < need_bytes) {
>  		spin_lock(&log->l_grant_write_lock);
> -		if (list_empty(&tic->t_queue))
> -			list_add_tail(&tic->t_queue, &log->l_writeq);
> +		list_add_tail(&tic->t_queue, &log->l_writeq);
>  
>  		if (XLOG_FORCED_SHUTDOWN(log))
>  			goto error_return;

Same again - assert above xlog_space_left.

> @@ -2684,12 +2676,6 @@ redo:
>  		goto redo;
>  	}
>  
> -	if (!list_empty(&tic->t_queue)) {
> -		spin_lock(&log->l_grant_write_lock);
> -		list_del_init(&tic->t_queue);
> -		spin_unlock(&log->l_grant_write_lock);
> -	}
> -
>  	/* we've got enough space */
>  	xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
>  	trace_xfs_log_regrant_write_exit(log, tic);
> @@ -3621,7 +3607,7 @@ xfs_log_force_umount(
>  	struct xfs_mount	*mp,
>  	int			logerror)
>  {
> -	xlog_ticket_t	*tic;
> +	xlog_ticket_t	*tic, *tmp;
>  	xlog_t		*log;
>  	int		retval;

tmp -> next

>  
> @@ -3690,13 +3676,17 @@ xfs_log_force_umount(
>  	 * action is protected by the grant locks.
>  	 */
>  	spin_lock(&log->l_grant_reserve_lock);
> -	list_for_each_entry(tic, &log->l_reserveq, t_queue)
> +	list_for_each_entry_safe(tic, tmp, &log->l_reserveq, t_queue) {
> +		list_del_init(&tic->t_queue);
>  		wake_up(&tic->t_wait);
> +	}
>  	spin_unlock(&log->l_grant_reserve_lock);
>  
>  	spin_lock(&log->l_grant_write_lock);
> -	list_for_each_entry(tic, &log->l_writeq, t_queue)
> +	list_for_each_entry_safe(tic, tmp, &log->l_writeq, t_queue) {
> +		list_del_init(&tic->t_queue);
>  		wake_up(&tic->t_wait);
> +	}
>  	spin_unlock(&log->l_grant_write_lock);

And comments explaining why we are removing the ticket from the
queue before issuing the wakeup.

I'll run some testing on the patch - it looks like it's correct, but
seeing as this logic hasn't changed for so long a bit more testing
won't hurt. ;)

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH] xfs: Remove the entries from the queue while waking them up.
  2011-11-18 19:20 [PATCH] xfs: Remove the entries from the queue while waking them up Chandra Seetharaman
  2011-11-19  1:14 ` Dave Chinner
@ 2011-11-19 18:19 ` Christoph Hellwig
  2011-11-20  8:22   ` Dave Chinner
  2011-11-21 19:11   ` [PATCH] xfs: Remove the entries from the queue while waking them up Chandra Seetharaman
  1 sibling, 2 replies; 9+ messages in thread
From: Christoph Hellwig @ 2011-11-19 18:19 UTC (permalink / raw)
  To: Chandra Seetharaman; +Cc: XFS Mailing List

Thanks a lot for tracking this issue down!

Unfortunately I don't think the fix is entirely safe.  If we remove
the ticket from the list before the wakeup we have to assume no one
else ever wakes up a process waiting for log space.  In Linux that
generally isn't a safe assumption - e.g. higher level code could have
added itself to another waitqueue before calling into this code (
possibly even outside the XFS code) and now getting a wake up, or
other bits of the kernel could have all kinds of reasons to wake
this process up.
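
The robust pattern is to treat the wakeup purely as a hint and have the
sleeper re-check the space condition under the grant lock in a loop,
which is what the wait helpers below do.  In rough outline (sketch only,
without the shutdown checks, AIL push and tracing of the real helpers):

	spin_lock(&log->l_grant_reserve_lock);
	list_add_tail(&tic->t_queue, &log->l_reserveq);
	do {
		/* xlog_wait() drops the grant lock while sleeping */
		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
		spin_lock(&log->l_grant_reserve_lock);
	} while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
	list_del_init(&tic->t_queue);
	spin_unlock(&log->l_grant_reserve_lock);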

Below is a patch I hacked up on the airplane today - it makes sure
we always wake other waiters on the log space queues first before
adding a new process and should have the same effect.  Can you
test if this also fixes the 234 hang for you?

---
From: Christoph Hellwig <hch@lst.de>
Subject: xfs: fix and cleanup logspace waiter lists

Apply the scheme used in xlog_regrant_write_log_space to wake up any
other threads waiting for log space before the newly added one to
xlog_grant_log_space as well, and factor the code into
readable helpers.  For each of the queues we have:

 - one helper to wake up all waiting threads, and return whether we
   succeeded in doing that.  These helpers will also be usable
   by xfs_log_move_tail once we remove the current opportunistic
   wakeups in it.
 - one helper to sleep on t_wait until enough log space is available,
   which is modelled after the Linux waitqueue model.

and rewrite xlog_grant_log_space and xlog_regrant_write_log_space
around these helpers, including comments documenting what is going on.
These two functions now use one and the same algorithm for waiting
on log space instead of subtly different ones before.

Signed-off-by: Christoph Hellwig <hch@lst.de>

---
 fs/xfs/linux-2.6/xfs_trace.h |   12 -
 fs/xfs/xfs_log.c             |  329 +++++++++++++++++++++----------------------
 2 files changed, 170 insertions(+), 171 deletions(-)

Index: linux-2.6/fs/xfs/xfs_log.c
===================================================================
--- linux-2.6.orig/fs/xfs/xfs_log.c	2011-11-19 15:51:55.689999172 +0100
+++ linux-2.6/fs/xfs/xfs_log.c	2011-11-19 16:57:07.226659537 +0100
@@ -670,6 +670,52 @@ xfs_log_write(
 	return error;
 }
 
+STATIC bool
+xlog_wake_writeq(
+	struct log		*log,
+	int			free_bytes)
+{
+	struct xlog_ticket	*tic;
+
+	list_for_each_entry(tic, &log->l_writeq, t_queue) {
+		ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+
+		if (free_bytes < tic->t_unit_res)
+			return false;
+		free_bytes -= tic->t_unit_res;
+
+		trace_xfs_log_regrant_write_wake_up(log, tic);
+		wake_up(&tic->t_wait);
+	}
+
+	return true;
+}
+
+STATIC bool
+xlog_wake_reserveq(
+	struct log		*log,
+	int			free_bytes)
+{
+	struct xlog_ticket	*tic;
+	int			need_bytes;
+
+	list_for_each_entry(tic, &log->l_reserveq, t_queue) {
+		if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+			need_bytes = tic->t_unit_res*tic->t_cnt;
+		else
+			need_bytes = tic->t_unit_res;
+
+		if (free_bytes < need_bytes)
+			return false;
+		free_bytes -= tic->t_unit_res;
+
+		trace_xfs_log_grant_wake_up(log, tic);
+		wake_up(&tic->t_wait);
+	}
+
+	return true;
+}
+
 void
 xfs_log_move_tail(xfs_mount_t	*mp,
 		  xfs_lsn_t	tail_lsn)
@@ -2492,11 +2538,42 @@ restart:
 	return 0;
 }	/* xlog_state_get_iclog_space */
 
+STATIC int
+xlog_reserveq_wait(
+	struct log		*log,
+	struct xlog_ticket	*tic,
+	int			need_bytes)
+{
+	list_add_tail(&tic->t_queue, &log->l_reserveq);
+
+	do {
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto shutdown;
+		xlog_grant_push_ail(log, need_bytes);
+
+		XFS_STATS_INC(xs_sleep_logspace);
+		trace_xfs_log_grant_sleep(log, tic);
+
+		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+		trace_xfs_log_grant_wake(log, tic);
+
+		spin_lock(&log->l_grant_reserve_lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto shutdown;
+	} while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
+
+	list_del_init(&tic->t_queue);
+	return 0;
+shutdown:
+	list_del_init(&tic->t_queue);
+	return XFS_ERROR(EIO);
+}
+
 /*
  * Atomically get the log space required for a log ticket.
  *
- * Once a ticket gets put onto the reserveq, it will only return after
- * the needed reservation is satisfied.
+ * Once a ticket gets put onto the reserveq, it will only return after the
+ * needed reservation is satisfied.
  *
  * This function is structured so that it has a lock free fast path. This is
  * necessary because every new transaction reservation will come through this
@@ -2504,113 +2581,94 @@ restart:
  * every pass.
  *
  * As tickets are only ever moved on and off the reserveq under the
- * l_grant_reserve_lock, we only need to take that lock if we are going
- * to add the ticket to the queue and sleep. We can avoid taking the lock if the
- * ticket was never added to the reserveq because the t_queue list head will be
- * empty and we hold the only reference to it so it can safely be checked
- * unlocked.
+ * l_grant_reserve_lock, we only need to take that lock if we are going to add
+ * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
+ * was never added to the reserveq because the t_queue list head will be empty
+ * and we hold the only reference to it so it can safely be checked unlocked.
  */
 STATIC int
-xlog_grant_log_space(xlog_t	   *log,
-		     xlog_ticket_t *tic)
+xlog_grant_log_space(
+	struct log		*log,
+	struct xlog_ticket	*tic)
 {
-	int		 free_bytes;
-	int		 need_bytes;
+	int			free_bytes, need_bytes;
+	int			error = 0;
 
-#ifdef DEBUG
-	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
-		panic("grant Recovery problem");
-#endif
+	ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
 
 	trace_xfs_log_grant_enter(log, tic);
 
+	/*
+	 * If there are other waiters on the queue then give them a chance at
+	 * logspace before us.  Wake up the first waiters, if we do not wake
+	 * up all the waiters then go to sleep waiting for more free space,
+	 * otherwise try to get some space for this transaction.
+	 */
 	need_bytes = tic->t_unit_res;
 	if (tic->t_flags & XFS_LOG_PERM_RESERV)
 		need_bytes *= tic->t_ocnt;
-
-	/* something is already sleeping; insert new transaction at end */
-	if (!list_empty_careful(&log->l_reserveq)) {
-		spin_lock(&log->l_grant_reserve_lock);
-		/* recheck the queue now we are locked */
-		if (list_empty(&log->l_reserveq)) {
-			spin_unlock(&log->l_grant_reserve_lock);
-			goto redo;
-		}
-		list_add_tail(&tic->t_queue, &log->l_reserveq);
-
-		trace_xfs_log_grant_sleep1(log, tic);
-
-		/*
-		 * Gotta check this before going to sleep, while we're
-		 * holding the grant lock.
-		 */
-		if (XLOG_FORCED_SHUTDOWN(log))
-			goto error_return;
-
-		XFS_STATS_INC(xs_sleep_logspace);
-		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
-
-		/*
-		 * If we got an error, and the filesystem is shutting down,
-		 * we'll catch it down below. So just continue...
-		 */
-		trace_xfs_log_grant_wake1(log, tic);
-	}
-
-redo:
-	if (XLOG_FORCED_SHUTDOWN(log))
-		goto error_return_unlocked;
-
 	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-	if (free_bytes < need_bytes) {
+	if (!list_empty_careful(&log->l_reserveq)) {
 		spin_lock(&log->l_grant_reserve_lock);
-		if (list_empty(&tic->t_queue))
-			list_add_tail(&tic->t_queue, &log->l_reserveq);
-
-		trace_xfs_log_grant_sleep2(log, tic);
-
-		if (XLOG_FORCED_SHUTDOWN(log))
-			goto error_return;
-
-		xlog_grant_push_ail(log, need_bytes);
-
-		XFS_STATS_INC(xs_sleep_logspace);
-		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
-
-		trace_xfs_log_grant_wake2(log, tic);
-		goto redo;
-	}
-
-	if (!list_empty(&tic->t_queue)) {
+		if (!xlog_wake_reserveq(log, free_bytes))
+			error = xlog_reserveq_wait(log, tic, need_bytes);
+		spin_unlock(&log->l_grant_reserve_lock);
+	} else if (free_bytes < need_bytes) {
 		spin_lock(&log->l_grant_reserve_lock);
-		list_del_init(&tic->t_queue);
+		error = xlog_reserveq_wait(log, tic, need_bytes);
 		spin_unlock(&log->l_grant_reserve_lock);
 	}
 
-	/* we've got enough space */
+	if (error)
+		goto error0;
+
 	xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
 	xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
 	trace_xfs_log_grant_exit(log, tic);
 	xlog_verify_grant_tail(log);
 	return 0;
 
-error_return_unlocked:
-	spin_lock(&log->l_grant_reserve_lock);
-error_return:
-	list_del_init(&tic->t_queue);
-	spin_unlock(&log->l_grant_reserve_lock);
-	trace_xfs_log_grant_error(log, tic);
-
+error0:
 	/*
-	 * If we are failing, make sure the ticket doesn't have any
-	 * current reservations. We don't want to add this back when
-	 * the ticket/transaction gets cancelled.
+	 * If we are failing, make sure the ticket doesn't have any current
+	 * reservations.  We don't want to add this back when the ticket/
+	 * transaction gets cancelled.
 	 */
 	tic->t_curr_res = 0;
 	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
-	return XFS_ERROR(EIO);
-}	/* xlog_grant_log_space */
+	return error;
+}
+
+STATIC int
+xlog_writeq_wait(
+	struct log		*log,
+	struct xlog_ticket	*tic,
+	int			need_bytes)
+{
+	list_add_tail(&tic->t_queue, &log->l_writeq);
+
+	do {
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto shutdown;
+		xlog_grant_push_ail(log, need_bytes);
+
+		XFS_STATS_INC(xs_sleep_logspace);
+		trace_xfs_log_regrant_write_sleep(log, tic);
+
+		xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
+		trace_xfs_log_regrant_write_wake(log, tic);
+
+		spin_lock(&log->l_grant_write_lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto shutdown;
+	} while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
 
+	list_del_init(&tic->t_queue);
+	return 0;
+shutdown:
+	list_del_init(&tic->t_queue);
+	return XFS_ERROR(EIO);
+}
 
 /*
  * Replenish the byte reservation required by moving the grant write head.
@@ -2619,10 +2677,12 @@ error_return:
  * free fast path.
  */
 STATIC int
-xlog_regrant_write_log_space(xlog_t	   *log,
-			     xlog_ticket_t *tic)
+xlog_regrant_write_log_space(
+	struct log		*log,
+	struct xlog_ticket	*tic)
 {
-	int		free_bytes, need_bytes;
+	int			free_bytes, need_bytes;
+	int			error = 0;
 
 	tic->t_curr_res = tic->t_unit_res;
 	xlog_tic_reset_res(tic);
@@ -2630,104 +2690,47 @@ xlog_regrant_write_log_space(xlog_t	   *
 	if (tic->t_cnt > 0)
 		return 0;
 
-#ifdef DEBUG
-	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
-		panic("regrant Recovery problem");
-#endif
+	ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
 
 	trace_xfs_log_regrant_write_enter(log, tic);
-	if (XLOG_FORCED_SHUTDOWN(log))
-		goto error_return_unlocked;
 
-	/* If there are other waiters on the queue then give them a
-	 * chance at logspace before us. Wake up the first waiters,
-	 * if we do not wake up all the waiters then go to sleep waiting
-	 * for more free space, otherwise try to get some space for
-	 * this transaction.
+	/*
+	 * If there are other waiters on the queue then give them a chance at
+	 * logspace before us.  Wake up the first waiters, if we do not wake
+	 * up all the waiters then go to sleep waiting for more free space,
+	 * otherwise try to get some space for this transaction.
 	 */
 	need_bytes = tic->t_unit_res;
-	if (!list_empty_careful(&log->l_writeq)) {
-		struct xlog_ticket *ntic;
-
-		spin_lock(&log->l_grant_write_lock);
-		free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-		list_for_each_entry(ntic, &log->l_writeq, t_queue) {
-			ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
-
-			if (free_bytes < ntic->t_unit_res)
-				break;
-			free_bytes -= ntic->t_unit_res;
-			wake_up(&ntic->t_wait);
-		}
-
-		if (ntic != list_first_entry(&log->l_writeq,
-						struct xlog_ticket, t_queue)) {
-			if (list_empty(&tic->t_queue))
-				list_add_tail(&tic->t_queue, &log->l_writeq);
-			trace_xfs_log_regrant_write_sleep1(log, tic);
-
-			xlog_grant_push_ail(log, need_bytes);
-
-			XFS_STATS_INC(xs_sleep_logspace);
-			xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
-			trace_xfs_log_regrant_write_wake1(log, tic);
-		} else
-			spin_unlock(&log->l_grant_write_lock);
-	}
-
-redo:
-	if (XLOG_FORCED_SHUTDOWN(log))
-		goto error_return_unlocked;
-
 	free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-	if (free_bytes < need_bytes) {
+	if (!list_empty_careful(&log->l_writeq)) {
 		spin_lock(&log->l_grant_write_lock);
-		if (list_empty(&tic->t_queue))
-			list_add_tail(&tic->t_queue, &log->l_writeq);
-
-		if (XLOG_FORCED_SHUTDOWN(log))
-			goto error_return;
-
-		xlog_grant_push_ail(log, need_bytes);
-
-		XFS_STATS_INC(xs_sleep_logspace);
-		trace_xfs_log_regrant_write_sleep2(log, tic);
-		xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
-
-		trace_xfs_log_regrant_write_wake2(log, tic);
-		goto redo;
-	}
-
-	if (!list_empty(&tic->t_queue)) {
+		if (!xlog_wake_writeq(log, free_bytes))
+			error = xlog_writeq_wait(log, tic, need_bytes);
+		spin_unlock(&log->l_grant_write_lock);
+	} else if (free_bytes < need_bytes) {
 		spin_lock(&log->l_grant_write_lock);
-		list_del_init(&tic->t_queue);
+		error = xlog_writeq_wait(log, tic, need_bytes);
 		spin_unlock(&log->l_grant_write_lock);
 	}
 
-	/* we've got enough space */
+	if (error)
+		goto error0;
+
 	xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
 	trace_xfs_log_regrant_write_exit(log, tic);
 	xlog_verify_grant_tail(log);
 	return 0;
 
-
- error_return_unlocked:
-	spin_lock(&log->l_grant_write_lock);
- error_return:
-	list_del_init(&tic->t_queue);
-	spin_unlock(&log->l_grant_write_lock);
-	trace_xfs_log_regrant_write_error(log, tic);
-
+error0:
 	/*
-	 * If we are failing, make sure the ticket doesn't have any
-	 * current reservations. We don't want to add this back when
-	 * the ticket/transaction gets cancelled.
+	 * If we are failing, make sure the ticket doesn't have any current
+	 * reservations. We don't want to add this back when the ticket/
+	 * transaction gets cancelled.
 	 */
 	tic->t_curr_res = 0;
 	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
-	return XFS_ERROR(EIO);
-}	/* xlog_regrant_write_log_space */
-
+	return error;
+}
 
 /* The first cnt-1 times through here we don't need to
  * move the grant write head because the permanent
Index: linux-2.6/fs/xfs/linux-2.6/xfs_trace.h
===================================================================
--- linux-2.6.orig/fs/xfs/linux-2.6/xfs_trace.h	2011-11-19 16:36:00.493329401 +0100
+++ linux-2.6/fs/xfs/linux-2.6/xfs_trace.h	2011-11-19 16:39:53.303332382 +0100
@@ -833,18 +833,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_wri
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);


* Re: [PATCH] xfs: Remove the entries from the queue while waking them up.
  2011-11-19 18:19 ` Christoph Hellwig
@ 2011-11-20  8:22   ` Dave Chinner
  2011-11-20 11:43     ` Christoph Hellwig
  2011-11-21 19:11   ` [PATCH] xfs: Remove the entries from the queue while waking them up Chandra Seetharaman
  1 sibling, 1 reply; 9+ messages in thread
From: Dave Chinner @ 2011-11-20  8:22 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: Chandra Seetharaman, XFS Mailing List

On Sat, Nov 19, 2011 at 01:19:29PM -0500, Christoph Hellwig wrote:
> Thanks a lot for tracking this issue down!
> 
> Unfortunately I don't think the fix is entirely safe.  If we remove
> the ticket from the list before the wakeup we have to assume no one
> else ever wakes up a process waiting for log space.  In Linux that
> generally isn't a safe assumption - e.g. higher level code could have
> added itself to another waitqueue before calling into this code (

But doing that would be a bug in the higher level code, yes? You are
only supposed to add yourself to a wait queue just before changing
the task state and sleeping, not just before executing something that
is likely to block and sleep and use waitqueues itself (like a
filesystem transaction) ....

> possibly even outside the XFS code) and now getting a wake up, or
> other bits of the kernel could have all kinds of reasons to wake
> this process up.

Signals are the only source of unexpected wakeups I can think of that
is likely to occur while waiting on a private wait queue. However, that
is likely enough that I agree the waiting context needs to do the queue
removal, not the waker.

> Below is a patch I hacked up on the airplane today - it makes sure
> we always wake other waiters on the log space queues first before
> adding a new process and should have the same effect.  Can you
> test if this also fixes the 234 hang for you?

A few comments....

> 
> ---
> From: Christoph Hellwig <hch@lst.de>
> Subject: xfs: fix and cleanup logspace waiter lists
> 
> Apply the scheme used in xlog_regrant_write_log_space to wake up any
> other threads waiting for log space before the newly added one to
> xlog_grant_log_space as well, and factor the code into
> readable helpers.  For each of the queues we have:
> 
>  - one helper to wake up all waiting threads, and return whether we
>    succeeded in doing that.  These helpers will also be usable
>    by xfs_log_move_tail once we remove the current opportunistic
>    wakeups in it.
>  - one helper to sleep on t_wait until enough log space is available,
>    which is modelled after the Linux waitqueue model.
> 
> and rewrite xlog_grant_log_space and xlog_regrant_write_log_space
> around these helpers, including comments documenting what is going on.
> These two functions now use one and the same algorithm for waiting
> on log space instead of subtly different ones before.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> ---
>  fs/xfs/linux-2.6/xfs_trace.h |   12 -
>  fs/xfs/xfs_log.c             |  329 +++++++++++++++++++++----------------------
>  2 files changed, 170 insertions(+), 171 deletions(-)
> 
> Index: linux-2.6/fs/xfs/xfs_log.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/xfs_log.c	2011-11-19 15:51:55.689999172 +0100
> +++ linux-2.6/fs/xfs/xfs_log.c	2011-11-19 16:57:07.226659537 +0100
> @@ -670,6 +670,52 @@ xfs_log_write(
>  	return error;
>  }
>  
> +STATIC bool
> +xlog_wake_writeq(
> +	struct log		*log,
> +	int			free_bytes)
> +{
> +	struct xlog_ticket	*tic;
> +
> +	list_for_each_entry(tic, &log->l_writeq, t_queue) {
> +		ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
> +
> +		if (free_bytes < tic->t_unit_res)
> +			return false;
> +		free_bytes -= tic->t_unit_res;
> +
> +		trace_xfs_log_regrant_write_wake_up(log, tic);
> +		wake_up(&tic->t_wait);
> +	}
> +
> +	return true;
> +}
> +
> +STATIC bool
> +xlog_wake_reserveq(
> +	struct log		*log,
> +	int			free_bytes)
> +{
> +	struct xlog_ticket	*tic;
> +	int			need_bytes;
> +
> +	list_for_each_entry(tic, &log->l_reserveq, t_queue) {
> +		if (tic->t_flags & XLOG_TIC_PERM_RESERV)
> +			need_bytes = tic->t_unit_res*tic->t_cnt;
> +		else
> +			need_bytes = tic->t_unit_res;
> +
> +		if (free_bytes < need_bytes)
> +			return false;
> +		free_bytes -= tic->t_unit_res;
> +
> +		trace_xfs_log_grant_wake_up(log, tic);
> +		wake_up(&tic->t_wait);
> +	}
> +
> +	return true;
> +}

Be nice to factor these into a single function - they do exactly the
same thing except for the queue they walk and the need_bytes
calculation.
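
Something like this, perhaps (name made up, ASSERT and the per-queue
trace points omitted, untested):

STATIC bool
xlog_grant_queue_wake(
	struct list_head	*queue,
	bool			writeq,
	int			free_bytes)
{
	struct xlog_ticket	*tic;
	int			need_bytes;

	list_for_each_entry(tic, queue, t_queue) {
		/* writeq tickets always need just the unit reservation */
		if (writeq || !(tic->t_flags & XLOG_TIC_PERM_RESERV))
			need_bytes = tic->t_unit_res;
		else
			need_bytes = tic->t_unit_res * tic->t_cnt;

		if (free_bytes < need_bytes)
			return false;
		free_bytes -= need_bytes;
		wake_up(&tic->t_wait);
	}
	return true;
}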

> +
>  void
>  xfs_log_move_tail(xfs_mount_t	*mp,
>  		  xfs_lsn_t	tail_lsn)

Also, wouldn't it be a good idea to use these helpers in
xfs_log_move_tail() and remove the code duplication from there as
well? i.e. the four places we do this open coded wakeup walk could
be replaced with a single implementation....

> @@ -2492,11 +2538,42 @@ restart:
>  	return 0;
>  }	/* xlog_state_get_iclog_space */
>  
> +STATIC int
> +xlog_reserveq_wait(
> +	struct log		*log,
> +	struct xlog_ticket	*tic,
> +	int			need_bytes)
> +{
> +	list_add_tail(&tic->t_queue, &log->l_reserveq);
> +
> +	do {
> +		if (XLOG_FORCED_SHUTDOWN(log))
> +			goto shutdown;
> +		xlog_grant_push_ail(log, need_bytes);
> +
> +		XFS_STATS_INC(xs_sleep_logspace);
> +		trace_xfs_log_grant_sleep(log, tic);
> +
> +		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
> +		trace_xfs_log_grant_wake(log, tic);
> +
> +		spin_lock(&log->l_grant_reserve_lock);
> +		if (XLOG_FORCED_SHUTDOWN(log))
> +			goto shutdown;
> +	} while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
> +
> +	list_del_init(&tic->t_queue);
> +	return 0;
> +shutdown:
> +	list_del_init(&tic->t_queue);
> +	return XFS_ERROR(EIO);
> +}

Same with this function and xlog_writeq_wait - they are identical
code just operating on a different grant head. If we had

struct xlog_grant_head {
	spinlock_t	lock;
	struct list_head queue;
	atomic64_t	head;
};

Rather than the current open-coded definitions, then this could all
share the same code rather than continuing to duplicate it. If we've
got to restructure the code to fix the bug, it makes sense to me to
remove the code duplication as well...
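
Then the wait side collapses to a single function along these lines
(sketch only, it is the same body as the xlog_reserveq_wait above with
the head passed in):

STATIC int
xlog_grant_head_wait(
	struct log		*log,
	struct xlog_grant_head	*head,
	struct xlog_ticket	*tic,
	int			need_bytes)
{
	list_add_tail(&tic->t_queue, &head->queue);

	do {
		if (XLOG_FORCED_SHUTDOWN(log))
			goto shutdown;
		xlog_grant_push_ail(log, need_bytes);

		XFS_STATS_INC(xs_sleep_logspace);
		xlog_wait(&tic->t_wait, &head->lock);

		spin_lock(&head->lock);
		if (XLOG_FORCED_SHUTDOWN(log))
			goto shutdown;
	} while (xlog_space_left(log, &head->head) < need_bytes);

	list_del_init(&tic->t_queue);
	return 0;
shutdown:
	list_del_init(&tic->t_queue);
	return XFS_ERROR(EIO);
}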

> +
>  /*
>   * Atomically get the log space required for a log ticket.
>   *
> - * Once a ticket gets put onto the reserveq, it will only return after
> - * the needed reservation is satisfied.
> + * Once a ticket gets put onto the reserveq, it will only return after the
> + * needed reservation is satisfied.
>   *
>   * This function is structured so that it has a lock free fast path. This is
>   * necessary because every new transaction reservation will come through this
> @@ -2504,113 +2581,94 @@ restart:
>   * every pass.
>   *
>   * As tickets are only ever moved on and off the reserveq under the
> - * l_grant_reserve_lock, we only need to take that lock if we are going
> - * to add the ticket to the queue and sleep. We can avoid taking the lock if the
> - * ticket was never added to the reserveq because the t_queue list head will be
> - * empty and we hold the only reference to it so it can safely be checked
> - * unlocked.
> + * l_grant_reserve_lock, we only need to take that lock if we are going to add
> + * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
> + * was never added to the reserveq because the t_queue list head will be empty
> + * and we hold the only reference to it so it can safely be checked unlocked.
>   */
>  STATIC int
> -xlog_grant_log_space(xlog_t	   *log,
> -		     xlog_ticket_t *tic)
> +xlog_grant_log_space(
> +	struct log		*log,
> +	struct xlog_ticket	*tic)
>  {
> -	int		 free_bytes;
> -	int		 need_bytes;
> +	int			free_bytes, need_bytes;
> +	int			error = 0;
>  
> -#ifdef DEBUG
> -	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
> -		panic("grant Recovery problem");
> -#endif
> +	ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
>  
>  	trace_xfs_log_grant_enter(log, tic);
>  
> +	/*
> +	 * If there are other waiters on the queue then give them a chance at
> +	 * logspace before us.  Wake up the first waiters, if we do not wake
> +	 * up all the waiters then go to sleep waiting for more free space,
> +	 * otherwise try to get some space for this transaction.
> +	 */
>  	need_bytes = tic->t_unit_res;
>  	if (tic->t_flags & XFS_LOG_PERM_RESERV)
>  		need_bytes *= tic->t_ocnt;
.....
>  	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
> -	if (free_bytes < need_bytes) {
> +	if (!list_empty_careful(&log->l_reserveq)) {
>  		spin_lock(&log->l_grant_reserve_lock);
.....
> +		if (!xlog_wake_reserveq(log, free_bytes))
> +			error = xlog_reserveq_wait(log, tic, need_bytes);
> +		spin_unlock(&log->l_grant_reserve_lock);
> +	} else if (free_bytes < need_bytes) {
>  		spin_lock(&log->l_grant_reserve_lock);
> -		list_del_init(&tic->t_queue);
> +		error = xlog_reserveq_wait(log, tic, need_bytes);
>  		spin_unlock(&log->l_grant_reserve_lock);
>  	}

I don't think this logic is quite correct. When we have a waiter on
the queue, we try to wake all the waiters before sleeping if that
did not succeed. If we woke everyone, we allow this process to
continue. The problem is that we don't check to see if there is
space remaining for this process to continue after doing all the
wakeups.

i.e. if need_bytes = 10000 and free_bytes = 50000, and the waiters
on the queue total 45000 bytes, then we fall through with free_bytes
= 5000 and need_bytes = 10000. In that case, we should have gone to
sleep because we are 5000 bytes short of our required space but we
don't and potentially deadlock due to oversubscribing the log space.

IOWs, xlog_wake_reserveq() needs to return the amount of remaining
free space after the wakeups so we can then run the if (free_bytes <
need_bytes) check unconditionally....

.....

>   */
>  STATIC int
> -xlog_regrant_write_log_space(xlog_t	   *log,
> -			     xlog_ticket_t *tic)
> +xlog_regrant_write_log_space(
> +	struct log		*log,
> +	struct xlog_ticket	*tic)

Ditto for the write grant head manipulations.

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com


* Re: [PATCH] xfs: Remove the entries from the queue while waking them up.
  2011-11-20  8:22   ` Dave Chinner
@ 2011-11-20 11:43     ` Christoph Hellwig
  2011-11-20 19:49       ` [PATCH v2] xfs: fix and cleanup logspace waiter lists Christoph Hellwig
  0 siblings, 1 reply; 9+ messages in thread
From: Christoph Hellwig @ 2011-11-20 11:43 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, Chandra Seetharaman, XFS Mailing List

On Sun, Nov 20, 2011 at 07:22:35PM +1100, Dave Chinner wrote:
> Be nice to factor these into a single function - they do exactly the
> same thing except for the queue they walk and the need_bytes
> calculation.

I thought about it, and I think we can do it as a broader refactoring
of the two list/lock/head tuples for the reserve/grant queues.  But
I'd rather do a simple minimal patch that can easily be backported
first.

> 
> > +
> >  void
> >  xfs_log_move_tail(xfs_mount_t	*mp,
> >  		  xfs_lsn_t	tail_lsn)
> 
> Also, wouldn't it be a good idea to use these helpers in
> xfs_log_move_tail() and remove the code duplication from there as
> well? i.e. the four places we do this open coded wakeup walk could
> be replaced with a single implementation....

Yes, I even hinted at that in the changelog.  Making it not ugly
requires removing the opportunistic wakeups.  Fortunately I already
have patches for that, which I plan to submit for 3.3.  We can easily
do that factoring as part of that patchset.

> Same with this function and xlog_writeq_wait - they are identical
> code just operating on a different grant head. If we had
> 
> struct xlog_grant_head {
> 	spinlock_t	lock;
> 	struct list_head queue;
> 	atomic64_t	head;
> };
> 
> Rather than the current open-coded definitions, then this could all
> share the same code rather than continuing to duplicate it. If we've
> got to restructure the code to fix the bug, it makes sense to me to
> remove the code duplication as well...

I agree.  But let's fix the issue first and do the cleanups later.

> .....
> >  	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
> > +	if (!list_empty_careful(&log->l_reserveq)) {
> >  		spin_lock(&log->l_grant_reserve_lock);
> .....
> > +		if (!xlog_wake_reserveq(log, free_bytes))
> > +			error = xlog_reserveq_wait(log, tic, need_bytes);
> > +		spin_unlock(&log->l_grant_reserve_lock);
> > +	} else if (free_bytes < need_bytes) {
> >  		spin_lock(&log->l_grant_reserve_lock);
> > -		list_del_init(&tic->t_queue);
> > +		error = xlog_reserveq_wait(log, tic, need_bytes);
> >  		spin_unlock(&log->l_grant_reserve_lock);
> >  	}
> 
> I don't think this logic is quite correct. When we have a waiter on
> the queue, we try to wake all the waiters before sleeping if that
> did not succeed. If we woke everyone, we allow this process to
> continue. The problem is that we don't check to see if there is
> space remaining for this process to continue after doing all the
> wakeups.
> 
> i.e. if need_bytes = 10000 and free_bytes = 50000, and the waiters
> on the queue total 45000 bytes, then we fall through with free_bytes
> = 5000 and need_bytes = 10000. In that case, we should have gone to
> sleep because we are 5000 bytes short of our required space but we
> don't and potentially deadlock due to oversubscribing the log space.
> 
> IOWs, xlog_wake_reserveq() needs to return the amount of remaining
> free space after the wakeups so we can then run the if (free_bytes <
> need_bytes) check unconditionally....

Indeed.  I think the better way to do it is to make the free_bytes in
xlog_wake_reserveq a pointer, so that the caller sees the update and
then do:


 	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
	if (!list_empty_careful(&log->l_reserveq)) {
 		spin_lock(&log->l_grant_reserve_lock);
		if (!xlog_wake_reserveq(log, &free_bytes) ||
		    free_bytes < need_bytes)
			error = xlog_reserveq_wait(log, tic, need_bytes);
		spin_unlock(&log->l_grant_reserve_lock);
	} else if (free_bytes < need_bytes) {
 		spin_lock(&log->l_grant_reserve_lock);
		list_del_init(&tic->t_queue);
		error = xlog_reserveq_wait(log, tic, need_bytes);
  		spin_unlock(&log->l_grant_reserve_lock);
  	}


* [PATCH v2] xfs: fix and cleanup logspace waiter lists
  2011-11-20 11:43     ` Christoph Hellwig
@ 2011-11-20 19:49       ` Christoph Hellwig
  0 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2011-11-20 19:49 UTC (permalink / raw)
  To: Dave Chinner; +Cc: Christoph Hellwig, Chandra Seetharaman, XFS Mailing List

Apply the scheme used in xlog_regrant_write_log_space to wake up any other
threads waiting for log space before the newly added one to
xlog_grant_log_space as well, and factor the code into readable
helpers.  For each of the queues we add two helpers:

 - one to try to wake up all waiting threads.  This helper will also be
   usable by xfs_log_move_tail once we remove the current opportunistic
   wakeups in it.
 - one to sleep on t_wait until enough log space is available, loosely
   modelled after Linux waitqueues.

And use them to reimplement the guts of xlog_grant_log_space and
xlog_regrant_write_log_space.  These two functions now use one and the
same algorithm for waiting on log space instead of subtly different ones
before, with an option to completely unify them in the near future.

Also move the filesystem shutdown handling to the common caller given
that we had to touch it anyway.

Based on hard debugging and an earlier patch from
Chandra Seetharaman <sekharan@us.ibm.com>.

Signed-off-by: Christoph Hellwig <hch@lst.de>

---
 fs/xfs/xfs_log.c   |  348 ++++++++++++++++++++++++++---------------------------
 fs/xfs/xfs_trace.h |   12 -
 2 files changed, 177 insertions(+), 183 deletions(-)

Index: xfs/fs/xfs/xfs_log.c
===================================================================
--- xfs.orig/fs/xfs/xfs_log.c	2011-11-20 16:29:49.356194023 +0100
+++ xfs/fs/xfs/xfs_log.c	2011-11-20 16:46:33.440754431 +0100
@@ -150,6 +150,117 @@ xlog_grant_add_space(
 	} while (head_val != old);
 }
 
+STATIC bool
+xlog_reserveq_wake(
+	struct log		*log,
+	int			*free_bytes)
+{
+	struct xlog_ticket	*tic;
+	int			need_bytes;
+
+	list_for_each_entry(tic, &log->l_reserveq, t_queue) {
+		if (tic->t_flags & XLOG_TIC_PERM_RESERV)
+			need_bytes = tic->t_unit_res * tic->t_cnt;
+		else
+			need_bytes = tic->t_unit_res;
+
+		if (*free_bytes < need_bytes)
+			return false;
+		*free_bytes -= need_bytes;
+
+		trace_xfs_log_grant_wake_up(log, tic);
+		wake_up(&tic->t_wait);
+	}
+
+	return true;
+}
+
+STATIC bool
+xlog_writeq_wake(
+	struct log		*log,
+	int			*free_bytes)
+{
+	struct xlog_ticket	*tic;
+	int			need_bytes;
+
+	list_for_each_entry(tic, &log->l_writeq, t_queue) {
+		ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
+
+		need_bytes = tic->t_unit_res;
+
+		if (*free_bytes < need_bytes)
+			return false;
+		*free_bytes -= need_bytes;
+
+		trace_xfs_log_regrant_write_wake_up(log, tic);
+		wake_up(&tic->t_wait);
+	}
+
+	return true;
+}
+
+STATIC int
+xlog_reserveq_wait(
+	struct log		*log,
+	struct xlog_ticket	*tic,
+	int			need_bytes)
+{
+	list_add_tail(&tic->t_queue, &log->l_reserveq);
+
+	do {
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto shutdown;
+		xlog_grant_push_ail(log, need_bytes);
+
+		XFS_STATS_INC(xs_sleep_logspace);
+		trace_xfs_log_grant_sleep(log, tic);
+
+		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
+		trace_xfs_log_grant_wake(log, tic);
+
+		spin_lock(&log->l_grant_reserve_lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto shutdown;
+	} while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
+
+	list_del_init(&tic->t_queue);
+	return 0;
+shutdown:
+	list_del_init(&tic->t_queue);
+	return XFS_ERROR(EIO);
+}
+
+STATIC int
+xlog_writeq_wait(
+	struct log		*log,
+	struct xlog_ticket	*tic,
+	int			need_bytes)
+{
+	list_add_tail(&tic->t_queue, &log->l_writeq);
+
+	do {
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto shutdown;
+		xlog_grant_push_ail(log, need_bytes);
+
+		XFS_STATS_INC(xs_sleep_logspace);
+		trace_xfs_log_regrant_write_sleep(log, tic);
+
+		xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
+		trace_xfs_log_regrant_write_wake(log, tic);
+
+		spin_lock(&log->l_grant_write_lock);
+		if (XLOG_FORCED_SHUTDOWN(log))
+			goto shutdown;
+	} while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
+
+	list_del_init(&tic->t_queue);
+	return 0;
+shutdown:
+	list_del_init(&tic->t_queue);
+	return XFS_ERROR(EIO);
+}
+
 static void
 xlog_tic_reset_res(xlog_ticket_t *tic)
 {
@@ -350,8 +461,19 @@ xfs_log_reserve(
 		retval = xlog_grant_log_space(log, internal_ticket);
 	}
 
+	if (unlikely(retval)) {
+		/*
+		 * If we are failing, make sure the ticket doesn't have any
+		 * current reservations.  We don't want to add this back
+		 * when the ticket/ transaction gets cancelled.
+		 */
+		internal_ticket->t_curr_res = 0;
+ 		/* ungrant will give back unit_res * t_cnt. */
+		internal_ticket->t_cnt = 0;
+	}
+
 	return retval;
-}	/* xfs_log_reserve */
+}
 
 
 /*
@@ -2481,8 +2603,8 @@ restart:
 /*
  * Atomically get the log space required for a log ticket.
  *
- * Once a ticket gets put onto the reserveq, it will only return after
- * the needed reservation is satisfied.
+ * Once a ticket gets put onto the reserveq, it will only return after the
+ * needed reservation is satisfied.
  *
  * This function is structured so that it has a lock free fast path. This is
  * necessary because every new transaction reservation will come through this
@@ -2490,113 +2612,53 @@ restart:
  * every pass.
  *
  * As tickets are only ever moved on and off the reserveq under the
- * l_grant_reserve_lock, we only need to take that lock if we are going
- * to add the ticket to the queue and sleep. We can avoid taking the lock if the
- * ticket was never added to the reserveq because the t_queue list head will be
- * empty and we hold the only reference to it so it can safely be checked
- * unlocked.
+ * l_grant_reserve_lock, we only need to take that lock if we are going to add
+ * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
+ * was never added to the reserveq because the t_queue list head will be empty
+ * and we hold the only reference to it so it can safely be checked unlocked.
  */
 STATIC int
-xlog_grant_log_space(xlog_t	   *log,
-		     xlog_ticket_t *tic)
+xlog_grant_log_space(
+	struct log		*log,
+	struct xlog_ticket	*tic)
 {
-	int		 free_bytes;
-	int		 need_bytes;
+	int			free_bytes, need_bytes;
+	int			error = 0;
 
-#ifdef DEBUG
-	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
-		panic("grant Recovery problem");
-#endif
+	ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
 
 	trace_xfs_log_grant_enter(log, tic);
 
+	/*
+	 * If there are other waiters on the queue then give them a chance at
+	 * logspace before us.  Wake up the first waiters, if we do not wake
+	 * up all the waiters then go to sleep waiting for more free space,
+	 * otherwise try to get some space for this transaction.
+	 */
 	need_bytes = tic->t_unit_res;
 	if (tic->t_flags & XFS_LOG_PERM_RESERV)
 		need_bytes *= tic->t_ocnt;
-
-	/* something is already sleeping; insert new transaction at end */
-	if (!list_empty_careful(&log->l_reserveq)) {
-		spin_lock(&log->l_grant_reserve_lock);
-		/* recheck the queue now we are locked */
-		if (list_empty(&log->l_reserveq)) {
-			spin_unlock(&log->l_grant_reserve_lock);
-			goto redo;
-		}
-		list_add_tail(&tic->t_queue, &log->l_reserveq);
-
-		trace_xfs_log_grant_sleep1(log, tic);
-
-		/*
-		 * Gotta check this before going to sleep, while we're
-		 * holding the grant lock.
-		 */
-		if (XLOG_FORCED_SHUTDOWN(log))
-			goto error_return;
-
-		XFS_STATS_INC(xs_sleep_logspace);
-		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
-
-		/*
-		 * If we got an error, and the filesystem is shutting down,
-		 * we'll catch it down below. So just continue...
-		 */
-		trace_xfs_log_grant_wake1(log, tic);
-	}
-
-redo:
-	if (XLOG_FORCED_SHUTDOWN(log))
-		goto error_return_unlocked;
-
 	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
-	if (free_bytes < need_bytes) {
+	if (!list_empty_careful(&log->l_reserveq)) {
 		spin_lock(&log->l_grant_reserve_lock);
-		if (list_empty(&tic->t_queue))
-			list_add_tail(&tic->t_queue, &log->l_reserveq);
-
-		trace_xfs_log_grant_sleep2(log, tic);
-
-		if (XLOG_FORCED_SHUTDOWN(log))
-			goto error_return;
-
-		xlog_grant_push_ail(log, need_bytes);
-
-		XFS_STATS_INC(xs_sleep_logspace);
-		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
-
-		trace_xfs_log_grant_wake2(log, tic);
-		goto redo;
-	}
-
-	if (!list_empty(&tic->t_queue)) {
+		if (!xlog_reserveq_wake(log, &free_bytes) ||
+		    free_bytes < need_bytes)
+			error = xlog_reserveq_wait(log, tic, need_bytes);
+		spin_unlock(&log->l_grant_reserve_lock);
+	} else if (free_bytes < need_bytes) {
 		spin_lock(&log->l_grant_reserve_lock);
-		list_del_init(&tic->t_queue);
+		error = xlog_reserveq_wait(log, tic, need_bytes);
 		spin_unlock(&log->l_grant_reserve_lock);
 	}
+	if (error)
+		return error;
 
-	/* we've got enough space */
 	xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
 	xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
 	trace_xfs_log_grant_exit(log, tic);
 	xlog_verify_grant_tail(log);
 	return 0;
-
-error_return_unlocked:
-	spin_lock(&log->l_grant_reserve_lock);
-error_return:
-	list_del_init(&tic->t_queue);
-	spin_unlock(&log->l_grant_reserve_lock);
-	trace_xfs_log_grant_error(log, tic);
-
-	/*
-	 * If we are failing, make sure the ticket doesn't have any
-	 * current reservations. We don't want to add this back when
-	 * the ticket/transaction gets cancelled.
-	 */
-	tic->t_curr_res = 0;
-	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
-	return XFS_ERROR(EIO);
-}	/* xlog_grant_log_space */
-
+}
 
 /*
  * Replenish the byte reservation required by moving the grant write head.
@@ -2605,10 +2667,12 @@ error_return:
  * free fast path.
  */
 STATIC int
-xlog_regrant_write_log_space(xlog_t	   *log,
-			     xlog_ticket_t *tic)
+xlog_regrant_write_log_space(
+	struct log		*log,
+	struct xlog_ticket	*tic)
 {
-	int		free_bytes, need_bytes;
+	int			free_bytes, need_bytes;
+	int			error = 0;
 
 	tic->t_curr_res = tic->t_unit_res;
 	xlog_tic_reset_res(tic);
@@ -2616,104 +2680,38 @@ xlog_regrant_write_log_space(xlog_t	   *
 	if (tic->t_cnt > 0)
 		return 0;
 
-#ifdef DEBUG
-	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
-		panic("regrant Recovery problem");
-#endif
+	ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
 
 	trace_xfs_log_regrant_write_enter(log, tic);
-	if (XLOG_FORCED_SHUTDOWN(log))
-		goto error_return_unlocked;
 
-	/* If there are other waiters on the queue then give them a
-	 * chance at logspace before us. Wake up the first waiters,
-	 * if we do not wake up all the waiters then go to sleep waiting
-	 * for more free space, otherwise try to get some space for
-	 * this transaction.
+	/*
+	 * If there are other waiters on the queue then give them a chance at
+	 * logspace before us.  Wake up the first waiters, if we do not wake
+	 * up all the waiters then go to sleep waiting for more free space,
+	 * otherwise try to get some space for this transaction.
 	 */
 	need_bytes = tic->t_unit_res;
-	if (!list_empty_careful(&log->l_writeq)) {
-		struct xlog_ticket *ntic;
-
-		spin_lock(&log->l_grant_write_lock);
-		free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-		list_for_each_entry(ntic, &log->l_writeq, t_queue) {
-			ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
-
-			if (free_bytes < ntic->t_unit_res)
-				break;
-			free_bytes -= ntic->t_unit_res;
-			wake_up(&ntic->t_wait);
-		}
-
-		if (ntic != list_first_entry(&log->l_writeq,
-						struct xlog_ticket, t_queue)) {
-			if (list_empty(&tic->t_queue))
-				list_add_tail(&tic->t_queue, &log->l_writeq);
-			trace_xfs_log_regrant_write_sleep1(log, tic);
-
-			xlog_grant_push_ail(log, need_bytes);
-
-			XFS_STATS_INC(xs_sleep_logspace);
-			xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
-			trace_xfs_log_regrant_write_wake1(log, tic);
-		} else
-			spin_unlock(&log->l_grant_write_lock);
-	}
-
-redo:
-	if (XLOG_FORCED_SHUTDOWN(log))
-		goto error_return_unlocked;
-
 	free_bytes = xlog_space_left(log, &log->l_grant_write_head);
-	if (free_bytes < need_bytes) {
+	if (!list_empty_careful(&log->l_writeq)) {
 		spin_lock(&log->l_grant_write_lock);
-		if (list_empty(&tic->t_queue))
-			list_add_tail(&tic->t_queue, &log->l_writeq);
-
-		if (XLOG_FORCED_SHUTDOWN(log))
-			goto error_return;
-
-		xlog_grant_push_ail(log, need_bytes);
-
-		XFS_STATS_INC(xs_sleep_logspace);
-		trace_xfs_log_regrant_write_sleep2(log, tic);
-		xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
-
-		trace_xfs_log_regrant_write_wake2(log, tic);
-		goto redo;
-	}
-
-	if (!list_empty(&tic->t_queue)) {
+		if (!xlog_writeq_wake(log, &free_bytes) ||
+		    free_bytes < need_bytes)
+			error = xlog_writeq_wait(log, tic, need_bytes);
+		spin_unlock(&log->l_grant_write_lock);
+	} else if (free_bytes < need_bytes) {
 		spin_lock(&log->l_grant_write_lock);
-		list_del_init(&tic->t_queue);
+		error = xlog_writeq_wait(log, tic, need_bytes);
 		spin_unlock(&log->l_grant_write_lock);
 	}
 
-	/* we've got enough space */
+	if (error)
+		return error;
+
 	xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
 	trace_xfs_log_regrant_write_exit(log, tic);
 	xlog_verify_grant_tail(log);
 	return 0;
-
-
- error_return_unlocked:
-	spin_lock(&log->l_grant_write_lock);
- error_return:
-	list_del_init(&tic->t_queue);
-	spin_unlock(&log->l_grant_write_lock);
-	trace_xfs_log_regrant_write_error(log, tic);
-
-	/*
-	 * If we are failing, make sure the ticket doesn't have any
-	 * current reservations. We don't want to add this back when
-	 * the ticket/transaction gets cancelled.
-	 */
-	tic->t_curr_res = 0;
-	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
-	return XFS_ERROR(EIO);
-}	/* xlog_regrant_write_log_space */
-
+}
 
 /* The first cnt-1 times through here we don't need to
  * move the grant write head because the permanent
Index: xfs/fs/xfs/xfs_trace.h
===================================================================
--- xfs.orig/fs/xfs/xfs_trace.h	2011-11-20 16:29:49.362860654 +0100
+++ xfs/fs/xfs/xfs_trace.h	2011-11-20 16:34:23.954706395 +0100
@@ -834,18 +834,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_wri
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
 DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
-DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
+DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
 DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);


* Re: [PATCH] xfs: Remove the entries from the queue while waking them up.
  2011-11-19 18:19 ` Christoph Hellwig
  2011-11-20  8:22   ` Dave Chinner
@ 2011-11-21 19:11   ` Chandra Seetharaman
  2011-11-22 10:23     ` Christoph Hellwig
  1 sibling, 1 reply; 9+ messages in thread
From: Chandra Seetharaman @ 2011-11-21 19:11 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: XFS Mailing List

On Sat, 2011-11-19 at 13:19 -0500, Christoph Hellwig wrote:

Hi Christoph,

Thanks for your review and response; see my comments below.

> Thanks a lot for tracking this issue down!
> 
> Unfortunately I don't think the fix is entirely safe.  If we remove
> the ticket from the list before the wakeup we have to assume no one
> else ever wakes up a process waiting for log space.  In Linux that

The code does not assume that it got the space when it wakes up. After
every wakeup it checks the free bytes available and compares them to
the required bytes before granting the space to itself. (IOW, after
waking up it behaves the same way as the lock-less case.)
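
Condensed, the recheck after wakeup in xlog_grant_log_space looks
roughly like this (a sketch of the current redo loop with locking and
tracing trimmed, not the literal code):

redo:
	if (XLOG_FORCED_SHUTDOWN(log))
		goto error_return_unlocked;

	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
	if (free_bytes < need_bytes) {
		spin_lock(&log->l_grant_reserve_lock);
		/* ticket is (re)queued and the AIL pushed here, then we sleep */
		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
		goto redo;	/* woken: re-check, never assume space was granted */
	}
	/* enough space: take the reservation */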

As Dave pointed out, I can see only the signal case affecting this
scenario. With that case in mind, I can see one change required to my
patch: add the ticket to the list the second time (in a function) only
if its t_queue is empty, i.e. only if it is not already queued.
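
That is, restore the guarded add at the second sleep site, along these
lines (sketch only):

	if (free_bytes < need_bytes) {
		spin_lock(&log->l_grant_reserve_lock);
		/*
		 * After a signal or other spurious wakeup the ticket may
		 * still be on the queue, so only re-add it if it is not
		 * queued already.
		 */
		if (list_empty(&tic->t_queue))
			list_add_tail(&tic->t_queue, &log->l_reserveq);
		xlog_grant_push_ail(log, need_bytes);
		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
		goto redo;
	}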

> generally isn't a safe assumption - e.g. higher level code could have
> added itself to another waitqueue before calling into this code (
> possibly even outside the XFS code) and now getting a wake up, or
> other bits of the kernel could have all kinds of reasons to wake
> this process up.
> 
> Below is a patch I hacked up on the airplane today - it makes sure
> we always wake other waiters on the log space queues first before
> adding a new process and should have the same effect.  Can you
> test if this also fixes the 234 hang for you?

> 
> ---
> From: Christoph Hellwig <hch@lst.de>
> Subject: xfs: fix and cleanup logspace waiter lists
> 
> Apply the scheme used in xlog_regrant_write_log_space to wake up any
> other threads waiting for log space before the newly added one to
> xlog_grant_log_space as well, and factor the code into
> readable helpers.  For each of the queues we have:
> 
>  - one helper to wake up all waiting threads, and return whether we
>    succeeded in doing that.  These helpers will also be usable
>    by xfs_log_move_tail once we remove the current opportunistic
>    wakeups in it.
>  - one helper to sleep on t_wait until enough log space is available,
>    which is modelled after the Linux waitqueue model.
> 
> and rewrite xlog_grant_log_space and xlog_regrant_write_log_space
> around these helpers, including comments documenting what is going on.
> These two functions now use one and the same algorithm for waiting
> on log space instead of subtly different ones before.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> 
> ---
>  fs/xfs/linux-2.6/xfs_trace.h |   12 -
>  fs/xfs/xfs_log.c             |  329 +++++++++++++++++++++----------------------
>  2 files changed, 170 insertions(+), 171 deletions(-)
> 
> Index: linux-2.6/fs/xfs/xfs_log.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/xfs_log.c	2011-11-19 15:51:55.689999172 +0100
> +++ linux-2.6/fs/xfs/xfs_log.c	2011-11-19 16:57:07.226659537 +0100
> @@ -670,6 +670,52 @@ xfs_log_write(
>  	return error;
>  }
> 
> +STATIC bool
> +xlog_wake_writeq(
> +	struct log		*log,
> +	int			free_bytes)
> +{
> +	struct xlog_ticket	*tic;
> +
> +	list_for_each_entry(tic, &log->l_writeq, t_queue) {
> +		ASSERT(tic->t_flags & XLOG_TIC_PERM_RESERV);
> +
> +		if (free_bytes < tic->t_unit_res)
> +			return false;
> +		free_bytes -= tic->t_unit_res;
> +
> +		trace_xfs_log_regrant_write_wake_up(log, tic);
> +		wake_up(&tic->t_wait);
> +	}
> +
> +	return true;
> +}
> +
> +STATIC bool
> +xlog_wake_reserveq(
> +	struct log		*log,
> +	int			free_bytes)
> +{
> +	struct xlog_ticket	*tic;
> +	int			need_bytes;
> +
> +	list_for_each_entry(tic, &log->l_reserveq, t_queue) {
> +		if (tic->t_flags & XLOG_TIC_PERM_RESERV)
> +			need_bytes = tic->t_unit_res*tic->t_cnt;
> +		else
> +			need_bytes = tic->t_unit_res;
> +
> +		if (free_bytes < need_bytes)
> +			return false;
> +		free_bytes -= tic->t_unit_res;
> +
> +		trace_xfs_log_grant_wake_up(log, tic);
> +		wake_up(&tic->t_wait);
> +	}
> +
> +	return true;
> +}
> +
>  void
>  xfs_log_move_tail(xfs_mount_t	*mp,
>  		  xfs_lsn_t	tail_lsn)
> @@ -2492,11 +2538,42 @@ restart:
>  	return 0;
>  }	/* xlog_state_get_iclog_space */
> 
> +STATIC int
> +xlog_reserveq_wait(
> +	struct log		*log,
> +	struct xlog_ticket	*tic,
> +	int			need_bytes)
> +{
> +	list_add_tail(&tic->t_queue, &log->l_reserveq);
> +
> +	do {
> +		if (XLOG_FORCED_SHUTDOWN(log))
> +			goto shutdown;
> +		xlog_grant_push_ail(log, need_bytes);
> +
> +		XFS_STATS_INC(xs_sleep_logspace);
> +		trace_xfs_log_grant_sleep(log, tic);
> +
> +		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
> +		trace_xfs_log_grant_wake(log, tic);
> +
> +		spin_lock(&log->l_grant_reserve_lock);
> +		if (XLOG_FORCED_SHUTDOWN(log))
> +			goto shutdown;
> +	} while (xlog_space_left(log, &log->l_grant_reserve_head) < need_bytes);
> +
> +	list_del_init(&tic->t_queue);
> +	return 0;
> +shutdown:
> +	list_del_init(&tic->t_queue);
> +	return XFS_ERROR(EIO);
> +}
> +
>  /*
>   * Atomically get the log space required for a log ticket.
>   *
> - * Once a ticket gets put onto the reserveq, it will only return after
> - * the needed reservation is satisfied.
> + * Once a ticket gets put onto the reserveq, it will only return after the
> + * needed reservation is satisfied.
>   *
>   * This function is structured so that it has a lock free fast path. This is
>   * necessary because every new transaction reservation will come through this
> @@ -2504,113 +2581,94 @@ restart:
>   * every pass.
>   *
>   * As tickets are only ever moved on and off the reserveq under the
> - * l_grant_reserve_lock, we only need to take that lock if we are going
> - * to add the ticket to the queue and sleep. We can avoid taking the lock if the
> - * ticket was never added to the reserveq because the t_queue list head will be
> - * empty and we hold the only reference to it so it can safely be checked
> - * unlocked.
> + * l_grant_reserve_lock, we only need to take that lock if we are going to add
> + * the ticket to the queue and sleep. We can avoid taking the lock if the ticket
> + * was never added to the reserveq because the t_queue list head will be empty
> + * and we hold the only reference to it so it can safely be checked unlocked.
>   */
>  STATIC int
> -xlog_grant_log_space(xlog_t	   *log,
> -		     xlog_ticket_t *tic)
> +xlog_grant_log_space(
> +	struct log		*log,
> +	struct xlog_ticket	*tic)
>  {
> -	int		 free_bytes;
> -	int		 need_bytes;
> +	int			free_bytes, need_bytes;
> +	int			error = 0;
> 
> -#ifdef DEBUG
> -	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
> -		panic("grant Recovery problem");
> -#endif
> +	ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
> 
>  	trace_xfs_log_grant_enter(log, tic);
> 
> +	/*
> +	 * If there are other waiters on the queue then give them a chance at
> +	 * logspace before us.  Wake up the first waiters, if we do not wake
> +	 * up all the waiters then go to sleep waiting for more free space,
> +	 * otherwise try to get some space for this transaction.
> +	 */
>  	need_bytes = tic->t_unit_res;
>  	if (tic->t_flags & XFS_LOG_PERM_RESERV)
>  		need_bytes *= tic->t_ocnt;
> -
> -	/* something is already sleeping; insert new transaction at end */
> -	if (!list_empty_careful(&log->l_reserveq)) {
> -		spin_lock(&log->l_grant_reserve_lock);
> -		/* recheck the queue now we are locked */
> -		if (list_empty(&log->l_reserveq)) {
> -			spin_unlock(&log->l_grant_reserve_lock);
> -			goto redo;
> -		}
> -		list_add_tail(&tic->t_queue, &log->l_reserveq);
> -
> -		trace_xfs_log_grant_sleep1(log, tic);
> -
> -		/*
> -		 * Gotta check this before going to sleep, while we're
> -		 * holding the grant lock.
> -		 */
> -		if (XLOG_FORCED_SHUTDOWN(log))
> -			goto error_return;
> -
> -		XFS_STATS_INC(xs_sleep_logspace);
> -		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
> -
> -		/*
> -		 * If we got an error, and the filesystem is shutting down,
> -		 * we'll catch it down below. So just continue...
> -		 */
> -		trace_xfs_log_grant_wake1(log, tic);
> -	}
> -
> -redo:
> -	if (XLOG_FORCED_SHUTDOWN(log))
> -		goto error_return_unlocked;
> -
>  	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
> -	if (free_bytes < need_bytes) {
> +	if (!list_empty_careful(&log->l_reserveq)) {
>  		spin_lock(&log->l_grant_reserve_lock);
> -		if (list_empty(&tic->t_queue))
> -			list_add_tail(&tic->t_queue, &log->l_reserveq);
> -
> -		trace_xfs_log_grant_sleep2(log, tic);
> -
> -		if (XLOG_FORCED_SHUTDOWN(log))
> -			goto error_return;
> -
> -		xlog_grant_push_ail(log, need_bytes);
> -
> -		XFS_STATS_INC(xs_sleep_logspace);
> -		xlog_wait(&tic->t_wait, &log->l_grant_reserve_lock);
> -
> -		trace_xfs_log_grant_wake2(log, tic);
> -		goto redo;
> -	}
> -
> -	if (!list_empty(&tic->t_queue)) {
> +		if (!xlog_wake_reserveq(log, free_bytes))
> +			error = xlog_reserveq_wait(log, tic, need_bytes);
> +		spin_unlock(&log->l_grant_reserve_lock);
> +	} else if (free_bytes < need_bytes) {
>  		spin_lock(&log->l_grant_reserve_lock);
> -		list_del_init(&tic->t_queue);
> +		error = xlog_reserveq_wait(log, tic, need_bytes);
>  		spin_unlock(&log->l_grant_reserve_lock);
>  	}
> 
> -	/* we've got enough space */
> +	if (error)
> +		goto error0;
> +
>  	xlog_grant_add_space(log, &log->l_grant_reserve_head, need_bytes);
>  	xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
>  	trace_xfs_log_grant_exit(log, tic);
>  	xlog_verify_grant_tail(log);
>  	return 0;
> 
> -error_return_unlocked:
> -	spin_lock(&log->l_grant_reserve_lock);
> -error_return:
> -	list_del_init(&tic->t_queue);
> -	spin_unlock(&log->l_grant_reserve_lock);
> -	trace_xfs_log_grant_error(log, tic);
> -
> +error0:
>  	/*
> -	 * If we are failing, make sure the ticket doesn't have any
> -	 * current reservations. We don't want to add this back when
> -	 * the ticket/transaction gets cancelled.
> +	 * If we are failing, make sure the ticket doesn't have any current
> +	 * reservations.  We don't want to add this back when the ticket/
> +	 * transaction gets cancelled.
>  	 */
>  	tic->t_curr_res = 0;
>  	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
> -	return XFS_ERROR(EIO);
> -}	/* xlog_grant_log_space */
> +	return error;
> +}
> +
> +STATIC int
> +xlog_writeq_wait(
> +	struct log		*log,
> +	struct xlog_ticket	*tic,
> +	int			need_bytes)
> +{
> +	list_add_tail(&tic->t_queue, &log->l_writeq);
> +
> +	do {
> +		if (XLOG_FORCED_SHUTDOWN(log))
> +			goto shutdown;
> +		xlog_grant_push_ail(log, need_bytes);
> +
> +		XFS_STATS_INC(xs_sleep_logspace);
> +		trace_xfs_log_regrant_write_sleep(log, tic);
> +
> +		xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
> +		trace_xfs_log_regrant_write_wake(log, tic);
> +
> +		spin_lock(&log->l_grant_write_lock);
> +		if (XLOG_FORCED_SHUTDOWN(log))
> +			goto shutdown;
> +	} while (xlog_space_left(log, &log->l_grant_write_head) < need_bytes);
> 
> +	list_del_init(&tic->t_queue);
> +	return 0;
> +shutdown:
> +	list_del_init(&tic->t_queue);
> +	return XFS_ERROR(EIO);
> +}
> 
>  /*
>   * Replenish the byte reservation required by moving the grant write head.
> @@ -2619,10 +2677,12 @@ error_return:
>   * free fast path.
>   */
>  STATIC int
> -xlog_regrant_write_log_space(xlog_t	   *log,
> -			     xlog_ticket_t *tic)
> +xlog_regrant_write_log_space(
> +	struct log		*log,
> +	struct xlog_ticket	*tic)
>  {
> -	int		free_bytes, need_bytes;
> +	int			free_bytes, need_bytes;
> +	int			error = 0;
> 
>  	tic->t_curr_res = tic->t_unit_res;
>  	xlog_tic_reset_res(tic);
> @@ -2630,104 +2690,47 @@ xlog_regrant_write_log_space(xlog_t	   *
>  	if (tic->t_cnt > 0)
>  		return 0;
> 
> -#ifdef DEBUG
> -	if (log->l_flags & XLOG_ACTIVE_RECOVERY)
> -		panic("regrant Recovery problem");
> -#endif
> +	ASSERT(!(log->l_flags & XLOG_ACTIVE_RECOVERY));
> 
>  	trace_xfs_log_regrant_write_enter(log, tic);
> -	if (XLOG_FORCED_SHUTDOWN(log))
> -		goto error_return_unlocked;
> 
> -	/* If there are other waiters on the queue then give them a
> -	 * chance at logspace before us. Wake up the first waiters,
> -	 * if we do not wake up all the waiters then go to sleep waiting
> -	 * for more free space, otherwise try to get some space for
> -	 * this transaction.
> +	/*
> +	 * If there are other waiters on the queue then give them a chance at
> +	 * logspace before us.  Wake up the first waiters, if we do not wake
> +	 * up all the waiters then go to sleep waiting for more free space,
> +	 * otherwise try to get some space for this transaction.
>  	 */
>  	need_bytes = tic->t_unit_res;
> -	if (!list_empty_careful(&log->l_writeq)) {
> -		struct xlog_ticket *ntic;
> -
> -		spin_lock(&log->l_grant_write_lock);
> -		free_bytes = xlog_space_left(log, &log->l_grant_write_head);
> -		list_for_each_entry(ntic, &log->l_writeq, t_queue) {
> -			ASSERT(ntic->t_flags & XLOG_TIC_PERM_RESERV);
> -
> -			if (free_bytes < ntic->t_unit_res)
> -				break;
> -			free_bytes -= ntic->t_unit_res;
> -			wake_up(&ntic->t_wait);
> -		}
> -
> -		if (ntic != list_first_entry(&log->l_writeq,
> -						struct xlog_ticket, t_queue)) {
> -			if (list_empty(&tic->t_queue))
> -				list_add_tail(&tic->t_queue, &log->l_writeq);
> -			trace_xfs_log_regrant_write_sleep1(log, tic);
> -
> -			xlog_grant_push_ail(log, need_bytes);
> -
> -			XFS_STATS_INC(xs_sleep_logspace);
> -			xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
> -			trace_xfs_log_regrant_write_wake1(log, tic);
> -		} else
> -			spin_unlock(&log->l_grant_write_lock);
> -	}
> -
> -redo:
> -	if (XLOG_FORCED_SHUTDOWN(log))
> -		goto error_return_unlocked;
> -
>  	free_bytes = xlog_space_left(log, &log->l_grant_write_head);
> -	if (free_bytes < need_bytes) {
> +	if (!list_empty_careful(&log->l_writeq)) {
>  		spin_lock(&log->l_grant_write_lock);
> -		if (list_empty(&tic->t_queue))
> -			list_add_tail(&tic->t_queue, &log->l_writeq);
> -
> -		if (XLOG_FORCED_SHUTDOWN(log))
> -			goto error_return;
> -
> -		xlog_grant_push_ail(log, need_bytes);
> -
> -		XFS_STATS_INC(xs_sleep_logspace);
> -		trace_xfs_log_regrant_write_sleep2(log, tic);
> -		xlog_wait(&tic->t_wait, &log->l_grant_write_lock);
> -
> -		trace_xfs_log_regrant_write_wake2(log, tic);
> -		goto redo;
> -	}
> -
> -	if (!list_empty(&tic->t_queue)) {
> +		if (!xlog_wake_writeq(log, free_bytes))
> +			error = xlog_writeq_wait(log, tic, need_bytes);
> +		spin_unlock(&log->l_grant_write_lock);
> +	} else if (free_bytes < need_bytes) {
>  		spin_lock(&log->l_grant_write_lock);
> -		list_del_init(&tic->t_queue);
> +		error = xlog_writeq_wait(log, tic, need_bytes);
>  		spin_unlock(&log->l_grant_write_lock);
>  	}
> 
> -	/* we've got enough space */
> +	if (error)
> +		goto error0;
> +
>  	xlog_grant_add_space(log, &log->l_grant_write_head, need_bytes);
>  	trace_xfs_log_regrant_write_exit(log, tic);
>  	xlog_verify_grant_tail(log);
>  	return 0;
> 
> -
> - error_return_unlocked:
> -	spin_lock(&log->l_grant_write_lock);
> - error_return:
> -	list_del_init(&tic->t_queue);
> -	spin_unlock(&log->l_grant_write_lock);
> -	trace_xfs_log_regrant_write_error(log, tic);
> -
> +error0:
>  	/*
> -	 * If we are failing, make sure the ticket doesn't have any
> -	 * current reservations. We don't want to add this back when
> -	 * the ticket/transaction gets cancelled.
> +	 * If we are failing, make sure the ticket doesn't have any current
> +	 * reservations. We don't want to add this back when the ticket/
> +	 * transaction gets cancelled.
>  	 */
>  	tic->t_curr_res = 0;
>  	tic->t_cnt = 0; /* ungrant will give back unit_res * t_cnt. */
> -	return XFS_ERROR(EIO);
> -}	/* xlog_regrant_write_log_space */
> -
> +	return error;
> +}
> 
>  /* The first cnt-1 times through here we don't need to
>   * move the grant write head because the permanent
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_trace.h
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_trace.h	2011-11-19 16:36:00.493329401 +0100
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_trace.h	2011-11-19 16:39:53.303332382 +0100
> @@ -833,18 +833,14 @@ DEFINE_LOGGRANT_EVENT(xfs_log_umount_wri
>  DEFINE_LOGGRANT_EVENT(xfs_log_grant_enter);
>  DEFINE_LOGGRANT_EVENT(xfs_log_grant_exit);
>  DEFINE_LOGGRANT_EVENT(xfs_log_grant_error);
> -DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep1);
> -DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake1);
> -DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep2);
> -DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake2);
> +DEFINE_LOGGRANT_EVENT(xfs_log_grant_sleep);
> +DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake);
>  DEFINE_LOGGRANT_EVENT(xfs_log_grant_wake_up);
>  DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_enter);
>  DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_exit);
>  DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_error);
> -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep1);
> -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake1);
> -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep2);
> -DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake2);
> +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_sleep);
> +DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake);
>  DEFINE_LOGGRANT_EVENT(xfs_log_regrant_write_wake_up);
>  DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_enter);
>  DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_exit);
> 



* Re: [PATCH] xfs: Remove the entries from the queue while waking them up.
  2011-11-19  1:14 ` Dave Chinner
@ 2011-11-21 23:01   ` Chandra Seetharaman
  0 siblings, 0 replies; 9+ messages in thread
From: Chandra Seetharaman @ 2011-11-21 23:01 UTC (permalink / raw)
  To: Dave Chinner; +Cc: XFS Mailing List

Hi Dave,

Thanks for the review and comments.

On Sat, 2011-11-19 at 12:14 +1100, Dave Chinner wrote:
> On Fri, Nov 18, 2011 at 01:20:54PM -0600, Chandra Seetharaman wrote:
> > l_reserveq and l_writeq maintains a list of processes waiting to get log
> > space. Processes are supposed to get in the list when the amount of free
> > space available in the log is less than what they need.
> > 
> > When space becomes available current code, wakes up the processes, but
> > expect the processes to remove themselves from the queue.
> > 
> > Since the lock protecting the list is only acquired later by the woken
> > up process, there is a window of time were a new process that is looking
> > for space can wrongly get into the queue while there is enough space
> > available. 
> > 
> > Since there is enough space available, this process can never be woken
> > up, which leads to the hang of the process.
> 
> Excellent work, Chandra.

Thanks Dave.

Made all the changes (except one, as noted below). Testing the new code
and will send it soon.

:
:
:
<snip>

> > @@ -2550,8 +2552,7 @@ redo:
> >  	free_bytes = xlog_space_left(log, &log->l_grant_reserve_head);
> >  	if (free_bytes < need_bytes) {
> >  		spin_lock(&log->l_grant_reserve_lock);
> > -		if (list_empty(&tic->t_queue))
> > -			list_add_tail(&tic->t_queue, &log->l_reserveq);
> > +		list_add_tail(&tic->t_queue, &log->l_reserveq);
> >  
> >  		trace_xfs_log_grant_sleep2(log, tic);
> >  
> 
> Ok, we now have the assumption that when we enter this code the
> ticket is not on any queue at all. Can you add an
> "ASSERT(list_empty(&tic->t_queue));" to the code before the above
> xlog_space_left() call? That way we'll know if we violate that
> assumption as potentially corrupt memory....

As Christoph pointed out, we might have been woken up by some system
event, so I changed the code a bit around here. Please comment on the
new code.


<snip>


* Re: [PATCH] xfs: Remove the entries from the queue while waking them up.
  2011-11-21 19:11   ` [PATCH] xfs: Remove the entries from the queue while waking them up Chandra Seetharaman
@ 2011-11-22 10:23     ` Christoph Hellwig
  0 siblings, 0 replies; 9+ messages in thread
From: Christoph Hellwig @ 2011-11-22 10:23 UTC (permalink / raw)
  To: Chandra Seetharaman; +Cc: Christoph Hellwig, XFS Mailing List

On Mon, Nov 21, 2011 at 01:11:41PM -0600, Chandra Seetharaman wrote:
> The code does not assume that it got the space when it wakes up. After
> every wakeup it checks the free bytes available and compares them to
> the required bytes before granting the space to itself. (IOW, after
> waking up it behaves the same way as the lock-less case.)
> 
> As Dave pointed out, I can see only the signal case affecting this
> scenario. With that case in mind, I can see one change required to my
> patch: add the ticket to the list the second time (in a function) only
> if its t_queue is empty, i.e. only if it is not already queued.

You can still leak with the process added to the queue if you get a
wakeup while there is space available.  And your second patch now has
to re-add the conditional add-to-queue band-aids that the first one so
nicely removed.  My version of that patch has the big advantage of
actually making the whole scheme mirror that of a wait queue.  In fact
we could use waitqueues and their helpers for it later (except for the
waker side, which is special).
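
For comparison, the generic pattern the new helpers mirror, using the
standard include/linux/wait.h helpers (a sketch only; waitq and
space_available() here are placeholders, not code from either patch):

	DEFINE_WAIT(wait);

	for (;;) {
		/* queues us on waitq, much like list_add_tail() on t_queue */
		prepare_to_wait(&waitq, &wait, TASK_UNINTERRUPTIBLE);
		if (space_available())	/* xlog_space_left() >= need_bytes */
			break;
		schedule();		/* the sleep inside xlog_wait() */
	}
	/* dequeues us again, much like list_del_init() on t_queue */
	finish_wait(&waitq, &wait);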


Thread overview: 9+ messages
2011-11-18 19:20 [PATCH] xfs: Remove the entries from the queue while waking them up Chandra Seetharaman
2011-11-19  1:14 ` Dave Chinner
2011-11-21 23:01   ` Chandra Seetharaman
2011-11-19 18:19 ` Christoph Hellwig
2011-11-20  8:22   ` Dave Chinner
2011-11-20 11:43     ` Christoph Hellwig
2011-11-20 19:49       ` [PATCH v2] xfs: fix and cleanup logspace waiter lists Christoph Hellwig
2011-11-21 19:11   ` [PATCH] xfs: Remove the entries from the queue while waking them up Chandra Seetharaman
2011-11-22 10:23     ` Christoph Hellwig
