All of lore.kernel.org
 help / color / mirror / Atom feed
* [Ocfs2-devel] [PATCH 1/1] ocfs2 fix o2dlm dlm run purgelist
@ 2010-06-19 19:56 Srinivas Eeda
  2010-06-20 19:34 ` Sunil Mushran
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Srinivas Eeda @ 2010-06-19 19:56 UTC (permalink / raw)
  To: ocfs2-devel

There are two problems in dlm_run_purgelist

1. If a lockres is found to be in use, dlm_run_purgelist keeps trying to purge
the same lockres instead of trying the next lockres.

2. When a lockres is found unused, dlm_run_purgelist releases lockres spinlock
before setting DLM_LOCK_RES_DROPPING_REF and calls dlm_purge_lockres.
spinlock is reacquired but in this window lockres can get reused. This leads
to BUG.

This patch modifies dlm_run_purgelist to skip lockres if it's in use and purge
 next lockres. It also sets DLM_LOCK_RES_DROPPING_REF before releasing the
lockres spinlock protecting it from getting reused.

Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
---
 fs/ocfs2/dlm/dlmthread.c |   55 +++++++++++++++++++++------------------------
 1 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 11a6d1f..79d1ef6 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -158,15 +158,6 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 	int master;
 	int ret = 0;
 
-	spin_lock(&res->spinlock);
-	if (!__dlm_lockres_unused(res)) {
-		mlog(0, "%s:%.*s: tried to purge but not unused\n",
-		     dlm->name, res->lockname.len, res->lockname.name);
-		__dlm_print_one_lock_resource(res);
-		spin_unlock(&res->spinlock);
-		BUG();
-	}
-
 	if (res->state & DLM_LOCK_RES_MIGRATING) {
 		mlog(0, "%s:%.*s: Delay dropref as this lockres is "
 		     "being remastered\n", dlm->name, res->lockname.len,
@@ -184,13 +175,13 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 
 	if (!master)
 		res->state |= DLM_LOCK_RES_DROPPING_REF;
-	spin_unlock(&res->spinlock);
 
 	mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
 	     res->lockname.name, master);
 
 	if (!master) {
 		/* drop spinlock...  retake below */
+		spin_unlock(&res->spinlock);
 		spin_unlock(&dlm->spinlock);
 
 		spin_lock(&res->spinlock);
@@ -208,30 +199,34 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 		mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
 		     dlm->name, res->lockname.len, res->lockname.name, ret);
 		spin_lock(&dlm->spinlock);
+		spin_lock(&res->spinlock);
 	}
 
-	spin_lock(&res->spinlock);
 	if (!list_empty(&res->purge)) {
 		mlog(0, "removing lockres %.*s:%p from purgelist, "
 		     "master = %d\n", res->lockname.len, res->lockname.name,
 		     res, master);
 		list_del_init(&res->purge);
-		spin_unlock(&res->spinlock);
 		dlm_lockres_put(res);
 		dlm->purge_count--;
-	} else
-		spin_unlock(&res->spinlock);
+	}
 
-	__dlm_unhash_lockres(res);
+	if (__dlm_lockres_unused(res))
+		__dlm_unhash_lockres(res);
+	else {
+		mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
+		     dlm->name, res->lockname.len, res->lockname.name);
+		__dlm_print_one_lock_resource(res);
+	}
 
 	/* lockres is not in the hash now.  drop the flag and wake up
 	 * any processes waiting in dlm_get_lock_resource. */
 	if (!master) {
-		spin_lock(&res->spinlock);
 		res->state &= ~DLM_LOCK_RES_DROPPING_REF;
 		spin_unlock(&res->spinlock);
 		wake_up(&res->wq);
-	}
+	} else
+		spin_unlock(&res->spinlock);
 	return 0;
 }
 
@@ -251,17 +246,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
 		lockres = list_entry(dlm->purge_list.next,
 				     struct dlm_lock_resource, purge);
 
-		/* Status of the lockres *might* change so double
-		 * check. If the lockres is unused, holding the dlm
-		 * spinlock will prevent people from getting and more
-		 * refs on it -- there's no need to keep the lockres
-		 * spinlock. */
 		spin_lock(&lockres->spinlock);
-		unused = __dlm_lockres_unused(lockres);
-		spin_unlock(&lockres->spinlock);
-
-		if (!unused)
-			continue;
 
 		purge_jiffies = lockres->last_used +
 			msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
@@ -273,15 +258,27 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
 			 * in tail order, we can stop at the first
 			 * unpurgable resource -- anyone added after
 			 * him will have a greater last_used value */
+			spin_unlock(&lockres->spinlock);
 			break;
 		}
 
+		/* Status of the lockres *might* change so double
+		 * check. If the lockres is unused, holding the dlm
+		 * spinlock will prevent people from getting and more
+		 * refs on it -- there's no need to keep the lockres
+		 * spinlock. */
+		unused = __dlm_lockres_unused(lockres);
+		if (!unused) {
+			list_move_tail(&dlm->purge_list, &lockres->purge);
+			spin_unlock(&lockres->spinlock);
+			continue;
+		}
+
 		dlm_lockres_get(lockres);
 
 		/* This may drop and reacquire the dlm spinlock if it
 		 * has to do migration. */
-		if (dlm_purge_lockres(dlm, lockres))
-			BUG();
+		dlm_purge_lockres(dlm, lockres);
 
 		dlm_lockres_put(lockres);
 
-- 
1.5.6.5

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [Ocfs2-devel] [PATCH 1/1] ocfs2 fix o2dlm dlm run purgelist
  2010-06-19 19:56 [Ocfs2-devel] [PATCH 1/1] ocfs2 fix o2dlm dlm run purgelist Srinivas Eeda
@ 2010-06-20 19:34 ` Sunil Mushran
  2010-06-21  5:19 ` Wengang Wang
  2010-06-22  3:34 ` Joel Becker
  2 siblings, 0 replies; 4+ messages in thread
From: Sunil Mushran @ 2010-06-20 19:34 UTC (permalink / raw)
  To: ocfs2-devel

On 06/19/2010 12:56 PM, Srinivas Eeda wrote:
> There are two problems in dlm_run_purgelist
>
> 1. If a lockres is found to be in use, dlm_run_purgelist keeps trying to purge
> the same lockres instead of trying the next lockres.
>
> 2. When a lockres is found unused, dlm_run_purgelist releases lockres spinlock
> before setting DLM_LOCK_RES_DROPPING_REF and calls dlm_purge_lockres.
> spinlock is reacquired but in this window lockres can get reused. This leads
> to BUG.
>
> This patch modifies dlm_run_purgelist to skip lockres if it's in use and purge
>   next lockres. It also sets DLM_LOCK_RES_DROPPING_REF before releasing the
> lockres spinlock protecting it from getting reused.
>
> Signed-off-by: Srinivas Eeda<srinivas.eeda@oracle.com>
> ---
>   fs/ocfs2/dlm/dlmthread.c |   55 +++++++++++++++++++++------------------------
>   1 files changed, 26 insertions(+), 29 deletions(-)
>
> diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
> index 11a6d1f..79d1ef6 100644
> --- a/fs/ocfs2/dlm/dlmthread.c
> +++ b/fs/ocfs2/dlm/dlmthread.c
> @@ -158,15 +158,6 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
>   	int master;
>   	int ret = 0;
>
> -	spin_lock(&res->spinlock);
> -	if (!__dlm_lockres_unused(res)) {
> -		mlog(0, "%s:%.*s: tried to purge but not unused\n",
> -		     dlm->name, res->lockname.len, res->lockname.name);
> -		__dlm_print_one_lock_resource(res);
> -		spin_unlock(&res->spinlock);
> -		BUG();
> -	}
> -

Always have assert_spin_locked() if the function expects the
spin locks to have been taken by the caller.

>   	if (res->state&  DLM_LOCK_RES_MIGRATING) {
>   		mlog(0, "%s:%.*s: Delay dropref as this lockres is "
>   		     "being remastered\n", dlm->name, res->lockname.len,
> @@ -184,13 +175,13 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
>
>   	if (!master)
>   		res->state |= DLM_LOCK_RES_DROPPING_REF;
> -	spin_unlock(&res->spinlock);
>
>   	mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
>   	     res->lockname.name, master);
>
>   	if (!master) {
>   		/* drop spinlock...  retake below */
> +		spin_unlock(&res->spinlock);
>   		spin_unlock(&dlm->spinlock);
>
>   		spin_lock(&res->spinlock);
> @@ -208,30 +199,34 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
>   		mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
>   		     dlm->name, res->lockname.len, res->lockname.name, ret);
>   		spin_lock(&dlm->spinlock);
> +		spin_lock(&res->spinlock);
>   	}
>
> -	spin_lock(&res->spinlock);
>   	if (!list_empty(&res->purge)) {
>   		mlog(0, "removing lockres %.*s:%p from purgelist, "
>   		     "master = %d\n", res->lockname.len, res->lockname.name,
>   		     res, master);
>   		list_del_init(&res->purge);
> -		spin_unlock(&res->spinlock);
>   		dlm_lockres_put(res);
>   		dlm->purge_count--;
> -	} else
> -		spin_unlock(&res->spinlock);
> +	}
>
> -	__dlm_unhash_lockres(res);
> +	if (__dlm_lockres_unused(res))
> +		__dlm_unhash_lockres(res);
> +	else {
> +		mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
> +		     dlm->name, res->lockname.len, res->lockname.name);
> +		__dlm_print_one_lock_resource(res);
> +	}

Why do we need this? If we run into this condition, the only
safe response is a BUG_ON.

>
>   	/* lockres is not in the hash now.  drop the flag and wake up
>   	 * any processes waiting in dlm_get_lock_resource. */
>   	if (!master) {
> -		spin_lock(&res->spinlock);
>   		res->state&= ~DLM_LOCK_RES_DROPPING_REF;
>   		spin_unlock(&res->spinlock);
>   		wake_up(&res->wq);
> -	}
> +	} else
> +		spin_unlock(&res->spinlock);
>   	return 0;
>   }
>
> @@ -251,17 +246,7 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
>   		lockres = list_entry(dlm->purge_list.next,
>   				     struct dlm_lock_resource, purge);
>
> -		/* Status of the lockres *might* change so double
> -		 * check. If the lockres is unused, holding the dlm
> -		 * spinlock will prevent people from getting and more
> -		 * refs on it -- there's no need to keep the lockres
> -		 * spinlock. */
>   		spin_lock(&lockres->spinlock);
> -		unused = __dlm_lockres_unused(lockres);
> -		spin_unlock(&lockres->spinlock);
> -
> -		if (!unused)
> -			continue;
>
>   		purge_jiffies = lockres->last_used +
>   			msecs_to_jiffies(DLM_PURGE_INTERVAL_MS);
> @@ -273,15 +258,27 @@ static void dlm_run_purge_list(struct dlm_ctxt *dlm,
>   			 * in tail order, we can stop at the first
>   			 * unpurgable resource -- anyone added after
>   			 * him will have a greater last_used value */
> +			spin_unlock(&lockres->spinlock);
>   			break;
>   		}
>
> +		/* Status of the lockres *might* change so double
> +		 * check. If the lockres is unused, holding the dlm
> +		 * spinlock will prevent people from getting and more
> +		 * refs on it -- there's no need to keep the lockres
> +		 * spinlock. */
> +		unused = __dlm_lockres_unused(lockres);
> +		if (!unused) {
> +			list_move_tail(&dlm->purge_list,&lockres->purge);
> +			spin_unlock(&lockres->spinlock);
> +			continue;
> +		}
> +

+		if (!unused ||
+		    lockres->state&  DLM_LOCK_RES_MIGRATING) {
+			list_move_tail(&dlm->purge_list,&lockres->purge);
+			spin_unlock(&lockres->spinlock);
+			continue;
+		}


You can move the MIGRATING check from atop dlm_purge_lockres() to here.


>   		dlm_lockres_get(lockres);
>
>   		/* This may drop and reacquire the dlm spinlock if it
>   		 * has to do migration. */
> -		if (dlm_purge_lockres(dlm, lockres))
> -			BUG();
> +		dlm_purge_lockres(dlm, lockres);
>

Make the function a void.

>   		dlm_lockres_put(lockres);
>

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Ocfs2-devel] [PATCH 1/1] ocfs2 fix o2dlm dlm run purgelist
  2010-06-19 19:56 [Ocfs2-devel] [PATCH 1/1] ocfs2 fix o2dlm dlm run purgelist Srinivas Eeda
  2010-06-20 19:34 ` Sunil Mushran
@ 2010-06-21  5:19 ` Wengang Wang
  2010-06-22  3:34 ` Joel Becker
  2 siblings, 0 replies; 4+ messages in thread
From: Wengang Wang @ 2010-06-21  5:19 UTC (permalink / raw)
  To: ocfs2-devel

On 10-06-19 12:56, Srinivas Eeda wrote:
> There are two problems in dlm_run_purgelist
> 
> 1. If a lockres is found to be in use, dlm_run_purgelist keeps trying to purge
> the same lockres instead of trying the next lockres.
> 
> 2. When a lockres is found unused, dlm_run_purgelist releases lockres spinlock
> before setting DLM_LOCK_RES_DROPPING_REF and calls dlm_purge_lockres.
> spinlock is reacquired but in this window lockres can get reused. This leads
> to BUG.
> 
> This patch modifies dlm_run_purgelist to skip lockres if it's in use and purge
>  next lockres. It also sets DLM_LOCK_RES_DROPPING_REF before releasing the
> lockres spinlock protecting it from getting reused.
> 
> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>
> ---
>  fs/ocfs2/dlm/dlmthread.c |   55 +++++++++++++++++++++------------------------
>  1 files changed, 26 insertions(+), 29 deletions(-)
> 
> diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
> index 11a6d1f..79d1ef6 100644
> --- a/fs/ocfs2/dlm/dlmthread.c
> +++ b/fs/ocfs2/dlm/dlmthread.c
> @@ -158,15 +158,6 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
>  	int master;
>  	int ret = 0;
>  
> -	spin_lock(&res->spinlock);
> -	if (!__dlm_lockres_unused(res)) {
> -		mlog(0, "%s:%.*s: tried to purge but not unused\n",
> -		     dlm->name, res->lockname.len, res->lockname.name);
> -		__dlm_print_one_lock_resource(res);
> -		spin_unlock(&res->spinlock);
> -		BUG();
> -	}
> -
>  	if (res->state & DLM_LOCK_RES_MIGRATING) {
>  		mlog(0, "%s:%.*s: Delay dropref as this lockres is "
>  		     "being remastered\n", dlm->name, res->lockname.len,
> @@ -184,13 +175,13 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
>  
>  	if (!master)
>  		res->state |= DLM_LOCK_RES_DROPPING_REF;
> -	spin_unlock(&res->spinlock);
>  
>  	mlog(0, "purging lockres %.*s, master = %d\n", res->lockname.len,
>  	     res->lockname.name, master);
>  
>  	if (!master) {
>  		/* drop spinlock...  retake below */
> +		spin_unlock(&res->spinlock);
>  		spin_unlock(&dlm->spinlock);
>  
>  		spin_lock(&res->spinlock);
> @@ -208,30 +199,34 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
>  		mlog(0, "%s:%.*s: dlm_deref_lockres returned %d\n",
>  		     dlm->name, res->lockname.len, res->lockname.name, ret);
>  		spin_lock(&dlm->spinlock);
> +		spin_lock(&res->spinlock);
>  	}
>  
> -	spin_lock(&res->spinlock);
>  	if (!list_empty(&res->purge)) {
>  		mlog(0, "removing lockres %.*s:%p from purgelist, "
>  		     "master = %d\n", res->lockname.len, res->lockname.name,
>  		     res, master);
>  		list_del_init(&res->purge);
> -		spin_unlock(&res->spinlock);
>  		dlm_lockres_put(res);
>  		dlm->purge_count--;
> -	} else
> -		spin_unlock(&res->spinlock);
> +	}
>  
> -	__dlm_unhash_lockres(res);
> +	if (__dlm_lockres_unused(res))
> +		__dlm_unhash_lockres(res);
> +	else {
> +		mlog(ML_ERROR, "found lockres %s:%.*s: in use after deref\n",
> +		     dlm->name, res->lockname.len, res->lockname.name);
> +		__dlm_print_one_lock_resource(res);
> +	}

This is not an error. ML_NOTICE instead?

regards,
wengang.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Ocfs2-devel] [PATCH 1/1] ocfs2 fix o2dlm dlm run purgelist
  2010-06-19 19:56 [Ocfs2-devel] [PATCH 1/1] ocfs2 fix o2dlm dlm run purgelist Srinivas Eeda
  2010-06-20 19:34 ` Sunil Mushran
  2010-06-21  5:19 ` Wengang Wang
@ 2010-06-22  3:34 ` Joel Becker
  2 siblings, 0 replies; 4+ messages in thread
From: Joel Becker @ 2010-06-22  3:34 UTC (permalink / raw)
  To: ocfs2-devel

On Sat, Jun 19, 2010 at 12:56:11PM -0700, Srinivas Eeda wrote:
> There are two problems in dlm_run_purgelist
> 
> 1. If a lockres is found to be in use, dlm_run_purgelist keeps trying to purge
> the same lockres instead of trying the next lockres.
> 
> 2. When a lockres is found unused, dlm_run_purgelist releases lockres spinlock
> before setting DLM_LOCK_RES_DROPPING_REF and calls dlm_purge_lockres.
> spinlock is reacquired but in this window lockres can get reused. This leads
> to BUG.
> 
> This patch modifies dlm_run_purgelist to skip lockres if it's in use and purge
>  next lockres. It also sets DLM_LOCK_RES_DROPPING_REF before releasing the
> lockres spinlock protecting it from getting reused.
> 
> Signed-off-by: Srinivas Eeda <srinivas.eeda@oracle.com>

	I agree with everything Sunil said.  More below.

> +		/* Status of the lockres *might* change so double
> +		 * check. If the lockres is unused, holding the dlm
> +		 * spinlock will prevent people from getting and more
> +		 * refs on it -- there's no need to keep the lockres
> +		 * spinlock. */
> +		unused = __dlm_lockres_unused(lockres);

	There's no point in commenting about "no need to keep the
lockres spinlock" when you are now holding it through the operation.
You can drop everything after the '--'.

Joel

-- 

"Here's a nickle -- get yourself a better X server."
	- Keith Packard

Joel Becker
Principal Software Developer
Oracle
E-mail: joel.becker at oracle.com
Phone: (650) 506-8127

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2010-06-22  3:34 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-06-19 19:56 [Ocfs2-devel] [PATCH 1/1] ocfs2 fix o2dlm dlm run purgelist Srinivas Eeda
2010-06-20 19:34 ` Sunil Mushran
2010-06-21  5:19 ` Wengang Wang
2010-06-22  3:34 ` Joel Becker

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.