* [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery
@ 2015-09-18 11:25 Joseph Qi
2015-09-20 7:20 ` Junxiao Bi
0 siblings, 1 reply; 10+ messages in thread
From: Joseph Qi @ 2015-09-18 11:25 UTC (permalink / raw)
To: ocfs2-devel
There is a race window between dlmconvert_remote and
dlm_move_lockres_to_recovery_list, which will cause a lock with
OCFS2_LOCK_BUSY in grant list, thus system hangs.
dlmconvert_remote
{
spin_lock(&res->spinlock);
list_move_tail(&lock->list, &res->converting);
lock->convert_pending = 1;
spin_unlock(&res->spinlock);
status = dlm_send_remote_convert_request();
>>>>>> race window, master has queued ast and return DLM_NORMAL,
and then down before sending ast.
this node detects master down and calls
dlm_move_lockres_to_recovery_list, which will revert the
lock to grant list.
Then OCFS2_LOCK_BUSY won't be cleared as new master won't
send ast any more because it thinks already be authorized.
spin_lock(&res->spinlock);
lock->convert_pending = 0;
if (status != DLM_NORMAL)
dlm_revert_pending_convert(res, lock);
spin_unlock(&res->spinlock);
}
In this case, just leave it in convert list and new master will take care
of it after recovery. And if convert request returns other than
DLM_NORMAL, convert thread will do the revert itself. So remove the
revert logic in dlm_move_lockres_to_recovery_list.
changelog since v1:
Clean up convert_pending since it is now useless.
Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
Cc: <stable@vger.kernel.org>
---
fs/ocfs2/dlm/dlmcommon.h | 1 -
fs/ocfs2/dlm/dlmconvert.c | 2 --
fs/ocfs2/dlm/dlmdebug.c | 7 +++----
fs/ocfs2/dlm/dlmlock.c | 1 -
fs/ocfs2/dlm/dlmrecovery.c | 10 +---------
5 files changed, 4 insertions(+), 17 deletions(-)
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e88ccf8..25853d7 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -369,7 +369,6 @@ struct dlm_lock
struct dlm_lockstatus *lksb;
unsigned ast_pending:1,
bast_pending:1,
- convert_pending:1,
lock_pending:1,
cancel_pending:1,
unlock_pending:1,
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index e36d63f..82ba8d5 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -291,7 +291,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->converting);
- lock->convert_pending = 1;
lock->ml.convert_type = type;
if (flags & LKM_VALBLK) {
@@ -315,7 +314,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
- lock->convert_pending = 0;
/* if it failed, move it back to granted queue */
if (status != DLM_NORMAL) {
if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 8251360..710e94c 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -77,7 +77,7 @@ static void __dlm_print_lock(struct dlm_lock *lock)
printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, "
"ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
- "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
+ "pending=(lock=%c,cancel=%c,unlock=%c)\n",
lock->ml.type, lock->ml.convert_type, lock->ml.node,
dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
@@ -86,7 +86,6 @@ static void __dlm_print_lock(struct dlm_lock *lock)
(lock->ast_pending ? 'y' : 'n'),
(list_empty(&lock->bast_list) ? 'y' : 'n'),
(lock->bast_pending ? 'y' : 'n'),
- (lock->convert_pending ? 'y' : 'n'),
(lock->lock_pending ? 'y' : 'n'),
(lock->cancel_pending ? 'y' : 'n'),
(lock->unlock_pending ? 'y' : 'n'));
@@ -502,7 +501,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
#define DEBUG_LOCK_VERSION 1
spin_lock(&lock->spinlock);
- out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
+ out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,"
"%d,%d,%d,%d\n",
DEBUG_LOCK_VERSION,
list_type, lock->ml.type, lock->ml.convert_type,
@@ -512,7 +511,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
!list_empty(&lock->ast_list),
!list_empty(&lock->bast_list),
lock->ast_pending, lock->bast_pending,
- lock->convert_pending, lock->lock_pending,
+ lock->lock_pending,
lock->cancel_pending, lock->unlock_pending,
atomic_read(&lock->lock_refs.refcount));
spin_unlock(&lock->spinlock);
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 66c2a49..857dd15 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -411,7 +411,6 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
newlock->ml.cookie = cpu_to_be64(cookie);
newlock->ast_pending = 0;
newlock->bast_pending = 0;
- newlock->convert_pending = 0;
newlock->lock_pending = 0;
newlock->unlock_pending = 0;
newlock->cancel_pending = 0;
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 3d90ad7..2402ef7 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2062,15 +2062,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
queue = dlm_list_idx_to_ptr(res, i);
list_for_each_entry_safe(lock, next, queue, list) {
dlm_lock_get(lock);
- if (lock->convert_pending) {
- /* move converting lock back to granted */
- BUG_ON(i != DLM_CONVERTING_LIST);
- mlog(0, "node died with convert pending "
- "on %.*s. move back to granted list.\n",
- res->lockname.len, res->lockname.name);
- dlm_revert_pending_convert(res, lock);
- lock->convert_pending = 0;
- } else if (lock->lock_pending) {
+ if (lock->lock_pending) {
/* remove pending lock requests completely */
BUG_ON(i != DLM_BLOCKED_LIST);
mlog(0, "node died with lock pending "
--
1.8.4.3
^ permalink raw reply related [flat|nested] 10+ messages in thread
* [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery
2015-09-18 11:25 [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery Joseph Qi
@ 2015-09-20 7:20 ` Junxiao Bi
2015-09-21 9:23 ` Joseph Qi
0 siblings, 1 reply; 10+ messages in thread
From: Junxiao Bi @ 2015-09-20 7:20 UTC (permalink / raw)
To: ocfs2-devel
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>
> There is a race window between dlmconvert_remote and
> dlm_move_lockres_to_recovery_list, which will cause a lock with
> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>
> dlmconvert_remote
> {
> spin_lock(&res->spinlock);
> list_move_tail(&lock->list, &res->converting);
> lock->convert_pending = 1;
> spin_unlock(&res->spinlock);
>
> status = dlm_send_remote_convert_request();
>>>>>>> race window, master has queued ast and return DLM_NORMAL,
> and then down before sending ast.
> this node detects master down and calls
> dlm_move_lockres_to_recovery_list, which will revert the
> lock to grant list.
> Then OCFS2_LOCK_BUSY won't be cleared as new master won't
> send ast any more because it thinks already be authorized.
>
> spin_lock(&res->spinlock);
> lock->convert_pending = 0;
> if (status != DLM_NORMAL)
> dlm_revert_pending_convert(res, lock);
> spin_unlock(&res->spinlock);
> }
>
> In this case, just leave it in convert list and new master will take care
> of it after recovery. And if convert request returns other than
> DLM_NORMAL, convert thread will do the revert itself. So remove the
> revert logic in dlm_move_lockres_to_recovery_list.
>
> changelog since v1:
> Clean up convert_pending since it is now useless.
>
> Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
> Reported-by: Yiwen Jiang <jiangyiwen@huawei.com>
> Cc: <stable@vger.kernel.org>
> ---
> fs/ocfs2/dlm/dlmcommon.h | 1 -
> fs/ocfs2/dlm/dlmconvert.c | 2 --
> fs/ocfs2/dlm/dlmdebug.c | 7 +++----
> fs/ocfs2/dlm/dlmlock.c | 1 -
> fs/ocfs2/dlm/dlmrecovery.c | 10 +---------
> 5 files changed, 4 insertions(+), 17 deletions(-)
>
> diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
> index e88ccf8..25853d7 100644
> --- a/fs/ocfs2/dlm/dlmcommon.h
> +++ b/fs/ocfs2/dlm/dlmcommon.h
> @@ -369,7 +369,6 @@ struct dlm_lock
> struct dlm_lockstatus *lksb;
> unsigned ast_pending:1,
> bast_pending:1,
> - convert_pending:1,
> lock_pending:1,
> cancel_pending:1,
> unlock_pending:1,
> diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
> index e36d63f..82ba8d5 100644
> --- a/fs/ocfs2/dlm/dlmconvert.c
> +++ b/fs/ocfs2/dlm/dlmconvert.c
> @@ -291,7 +291,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
> /* move lock to local convert queue */
> /* do not alter lock refcount. switching lists. */
> list_move_tail(&lock->list, &res->converting);
> - lock->convert_pending = 1;
> lock->ml.convert_type = type;
>
> if (flags & LKM_VALBLK) {
> @@ -315,7 +314,6 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
>
> spin_lock(&res->spinlock);
> res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
> - lock->convert_pending = 0;
> /* if it failed, move it back to granted queue */
> if (status != DLM_NORMAL) {
> if (status != DLM_NOTQUEUED)
> diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
> index 8251360..710e94c 100644
> --- a/fs/ocfs2/dlm/dlmdebug.c
> +++ b/fs/ocfs2/dlm/dlmdebug.c
> @@ -77,7 +77,7 @@ static void __dlm_print_lock(struct dlm_lock *lock)
>
> printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, "
> "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
> - "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
> + "pending=(lock=%c,cancel=%c,unlock=%c)\n",
> lock->ml.type, lock->ml.convert_type, lock->ml.node,
> dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
> dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
> @@ -86,7 +86,6 @@ static void __dlm_print_lock(struct dlm_lock *lock)
> (lock->ast_pending ? 'y' : 'n'),
> (list_empty(&lock->bast_list) ? 'y' : 'n'),
> (lock->bast_pending ? 'y' : 'n'),
> - (lock->convert_pending ? 'y' : 'n'),
> (lock->lock_pending ? 'y' : 'n'),
> (lock->cancel_pending ? 'y' : 'n'),
> (lock->unlock_pending ? 'y' : 'n'));
> @@ -502,7 +501,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
>
> #define DEBUG_LOCK_VERSION 1
> spin_lock(&lock->spinlock);
> - out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
> + out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,"
> "%d,%d,%d,%d\n",
> DEBUG_LOCK_VERSION,
> list_type, lock->ml.type, lock->ml.convert_type,
> @@ -512,7 +511,7 @@ static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
> !list_empty(&lock->ast_list),
> !list_empty(&lock->bast_list),
> lock->ast_pending, lock->bast_pending,
> - lock->convert_pending, lock->lock_pending,
> + lock->lock_pending,
> lock->cancel_pending, lock->unlock_pending,
> atomic_read(&lock->lock_refs.refcount));
> spin_unlock(&lock->spinlock);
> diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
> index 66c2a49..857dd15 100644
> --- a/fs/ocfs2/dlm/dlmlock.c
> +++ b/fs/ocfs2/dlm/dlmlock.c
> @@ -411,7 +411,6 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
> newlock->ml.cookie = cpu_to_be64(cookie);
> newlock->ast_pending = 0;
> newlock->bast_pending = 0;
> - newlock->convert_pending = 0;
> newlock->lock_pending = 0;
> newlock->unlock_pending = 0;
> newlock->cancel_pending = 0;
> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
> index 3d90ad7..2402ef7 100644
> --- a/fs/ocfs2/dlm/dlmrecovery.c
> +++ b/fs/ocfs2/dlm/dlmrecovery.c
> @@ -2062,15 +2062,7 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
> queue = dlm_list_idx_to_ptr(res, i);
> list_for_each_entry_safe(lock, next, queue, list) {
> dlm_lock_get(lock);
> - if (lock->convert_pending) {
> - /* move converting lock back to granted */
> - BUG_ON(i != DLM_CONVERTING_LIST);
> - mlog(0, "node died with convert pending "
> - "on %.*s. move back to granted list.\n",
> - res->lockname.len, res->lockname.name);
> - dlm_revert_pending_convert(res, lock);
> - lock->convert_pending = 0;
> - } else if (lock->lock_pending) {
> + if (lock->lock_pending) {
> /* remove pending lock requests completely */
> BUG_ON(i != DLM_BLOCKED_LIST);
> mlog(0, "node died with lock pending "
> --
> 1.8.4.3
>
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery
2015-09-20 7:20 ` Junxiao Bi
@ 2015-09-21 9:23 ` Joseph Qi
2015-09-22 0:55 ` Junxiao Bi
0 siblings, 1 reply; 10+ messages in thread
From: Joseph Qi @ 2015-09-21 9:23 UTC (permalink / raw)
To: ocfs2-devel
Hi Junxiao,
This solution may have problem in the following scenario:
Consider dlm_send_remote_convert_request has taken too much time, and
dlm_move_lockres_to_recovery_list runs first and new master will see
this node is currently in convert list after recovery.
Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
it will revert it to grant list, then retry convert. This will makes
this node and master inconsistent.
I will try to find another solution to resolve the race issue.
On 2015/9/20 15:20, Junxiao Bi wrote:
> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>
>> > ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>> >
>> > There is a race window between dlmconvert_remote and
>> > dlm_move_lockres_to_recovery_list, which will cause a lock with
>> > OCFS2_LOCK_BUSY in grant list, thus system hangs.
>> >
>> > dlmconvert_remote
>> > {
>> > spin_lock(&res->spinlock);
>> > list_move_tail(&lock->list, &res->converting);
>> > lock->convert_pending = 1;
>> > spin_unlock(&res->spinlock);
>> >
>> > status = dlm_send_remote_convert_request();
>>>>>>>> >>>>>>> race window, master has queued ast and return DLM_NORMAL,
>> > and then down before sending ast.
>> > this node detects master down and calls
>> > dlm_move_lockres_to_recovery_list, which will revert the
>> > lock to grant list.
>> > Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>> > send ast any more because it thinks already be authorized.
>> >
>> > spin_lock(&res->spinlock);
>> > lock->convert_pending = 0;
>> > if (status != DLM_NORMAL)
>> > dlm_revert_pending_convert(res, lock);
>> > spin_unlock(&res->spinlock);
>> > }
>> >
>> > In this case, just leave it in convert list and new master will take care
>> > of it after recovery. And if convert request returns other than
>> > DLM_NORMAL, convert thread will do the revert itself. So remove the
>> > revert logic in dlm_move_lockres_to_recovery_list.
>> >
>> > changelog since v1:
>> > Clean up convert_pending since it is now useless.
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery
2015-09-21 9:23 ` Joseph Qi
@ 2015-09-22 0:55 ` Junxiao Bi
2015-09-22 12:23 ` Joseph Qi
0 siblings, 1 reply; 10+ messages in thread
From: Junxiao Bi @ 2015-09-22 0:55 UTC (permalink / raw)
To: ocfs2-devel
On 09/21/2015 05:23 PM, Joseph Qi wrote:
> Hi Junxiao,
> This solution may have problem in the following scenario:
> Consider dlm_send_remote_convert_request has taken too much time, and
> dlm_move_lockres_to_recovery_list runs first and new master will see
> this node is currently in convert list after recovery.
> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
> it will revert it to grant list, then retry convert. This will makes
> this node and master inconsistent.
> I will try to find another solution to resolve the race issue.
If master is down, no need retry convert. May check the return value of
dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
otherwise retry?
Thanks,
Junxiao.
>
> On 2015/9/20 15:20, Junxiao Bi wrote:
>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>
>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>
>>>> There is a race window between dlmconvert_remote and
>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>
>>>> dlmconvert_remote
>>>> {
>>>> spin_lock(&res->spinlock);
>>>> list_move_tail(&lock->list, &res->converting);
>>>> lock->convert_pending = 1;
>>>> spin_unlock(&res->spinlock);
>>>>
>>>> status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>> and then down before sending ast.
>>>> this node detects master down and calls
>>>> dlm_move_lockres_to_recovery_list, which will revert the
>>>> lock to grant list.
>>>> Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>> send ast any more because it thinks already be authorized.
>>>>
>>>> spin_lock(&res->spinlock);
>>>> lock->convert_pending = 0;
>>>> if (status != DLM_NORMAL)
>>>> dlm_revert_pending_convert(res, lock);
>>>> spin_unlock(&res->spinlock);
>>>> }
>>>>
>>>> In this case, just leave it in convert list and new master will take care
>>>> of it after recovery. And if convert request returns other than
>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>
>>>> changelog since v1:
>>>> Clean up convert_pending since it is now useless.
>
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery
2015-09-22 0:55 ` Junxiao Bi
@ 2015-09-22 12:23 ` Joseph Qi
2015-09-23 1:25 ` Junxiao Bi
0 siblings, 1 reply; 10+ messages in thread
From: Joseph Qi @ 2015-09-22 12:23 UTC (permalink / raw)
To: ocfs2-devel
Hi Junxiao,
On 2015/9/22 8:55, Junxiao Bi wrote:
> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>> Hi Junxiao,
>> This solution may have problem in the following scenario:
>> Consider dlm_send_remote_convert_request has taken too much time, and
>> dlm_move_lockres_to_recovery_list runs first and new master will see
>> this node is currently in convert list after recovery.
>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>> it will revert it to grant list, then retry convert. This will makes
>> this node and master inconsistent.
>> I will try to find another solution to resolve the race issue.
>
> If master is down, no need retry convert. May check the return value of
> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
> otherwise retry?
I want to keep the original logic. And for fixing the race case I
described, how about the following idea?
Check the status DLM_NORMAL. If res->state is currently
DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
still in recovery) or res master changed (means new master has finished
the recovery), reset the status to DLM_RECOVERING, just like the check
at the beginning of dlmconvert_remote. Then it is now in grant list and
outer will retry.
>
> Thanks,
> Junxiao.
>
>>
>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>
>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>
>>>>> There is a race window between dlmconvert_remote and
>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>
>>>>> dlmconvert_remote
>>>>> {
>>>>> spin_lock(&res->spinlock);
>>>>> list_move_tail(&lock->list, &res->converting);
>>>>> lock->convert_pending = 1;
>>>>> spin_unlock(&res->spinlock);
>>>>>
>>>>> status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>> and then down before sending ast.
>>>>> this node detects master down and calls
>>>>> dlm_move_lockres_to_recovery_list, which will revert the
>>>>> lock to grant list.
>>>>> Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>> send ast any more because it thinks already be authorized.
>>>>>
>>>>> spin_lock(&res->spinlock);
>>>>> lock->convert_pending = 0;
>>>>> if (status != DLM_NORMAL)
>>>>> dlm_revert_pending_convert(res, lock);
>>>>> spin_unlock(&res->spinlock);
>>>>> }
>>>>>
>>>>> In this case, just leave it in convert list and new master will take care
>>>>> of it after recovery. And if convert request returns other than
>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>
>>>>> changelog since v1:
>>>>> Clean up convert_pending since it is now useless.
>>
>>
>
>
> .
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery
2015-09-22 12:23 ` Joseph Qi
@ 2015-09-23 1:25 ` Junxiao Bi
2015-09-23 1:47 ` Joseph Qi
0 siblings, 1 reply; 10+ messages in thread
From: Junxiao Bi @ 2015-09-23 1:25 UTC (permalink / raw)
To: ocfs2-devel
On 09/22/2015 08:23 PM, Joseph Qi wrote:
> Hi Junxiao,
>
> On 2015/9/22 8:55, Junxiao Bi wrote:
>> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>>> Hi Junxiao,
>>> This solution may have problem in the following scenario:
>>> Consider dlm_send_remote_convert_request has taken too much time, and
>>> dlm_move_lockres_to_recovery_list runs first and new master will see
>>> this node is currently in convert list after recovery.
>>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>>> it will revert it to grant list, then retry convert. This will makes
>>> this node and master inconsistent.
>>> I will try to find another solution to resolve the race issue.
>>
>> If master is down, no need retry convert. May check the return value of
>> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
>> otherwise retry?
>
> I want to keep the original logic. And for fixing the race case I
> described, how about the following idea?
>
> Check the status DLM_NORMAL. If res->state is currently
> DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
> still in recovery) or res master changed (means new master has finished
> the recovery), reset the status to DLM_RECOVERING, just like the check
> at the beginning of dlmconvert_remote. Then it is now in grant list and
> outer will retry.
How this idea fix the race windows you described in patch log? Lock is
reverted to granted list but dlm_send_remote_convert_request() return
DLM_NORMAL.
Thanks,
Junxiao.
>
>>
>> Thanks,
>> Junxiao.
>>
>>>
>>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>
>>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>>
>>>>>> There is a race window between dlmconvert_remote and
>>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>>
>>>>>> dlmconvert_remote
>>>>>> {
>>>>>> spin_lock(&res->spinlock);
>>>>>> list_move_tail(&lock->list, &res->converting);
>>>>>> lock->convert_pending = 1;
>>>>>> spin_unlock(&res->spinlock);
>>>>>>
>>>>>> status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>> and then down before sending ast.
>>>>>> this node detects master down and calls
>>>>>> dlm_move_lockres_to_recovery_list, which will revert the
>>>>>> lock to grant list.
>>>>>> Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>> send ast any more because it thinks already be authorized.
>>>>>>
>>>>>> spin_lock(&res->spinlock);
>>>>>> lock->convert_pending = 0;
>>>>>> if (status != DLM_NORMAL)
>>>>>> dlm_revert_pending_convert(res, lock);
>>>>>> spin_unlock(&res->spinlock);
>>>>>> }
>>>>>>
>>>>>> In this case, just leave it in convert list and new master will take care
>>>>>> of it after recovery. And if convert request returns other than
>>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>>
>>>>>> changelog since v1:
>>>>>> Clean up convert_pending since it is now useless.
>>>
>>>
>>
>>
>> .
>>
>
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery
2015-09-23 1:25 ` Junxiao Bi
@ 2015-09-23 1:47 ` Joseph Qi
2015-09-23 1:59 ` Junxiao Bi
0 siblings, 1 reply; 10+ messages in thread
From: Joseph Qi @ 2015-09-23 1:47 UTC (permalink / raw)
To: ocfs2-devel
On 2015/9/23 9:25, Junxiao Bi wrote:
> On 09/22/2015 08:23 PM, Joseph Qi wrote:
>> Hi Junxiao,
>>
>> On 2015/9/22 8:55, Junxiao Bi wrote:
>>> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>>>> Hi Junxiao,
>>>> This solution may have problem in the following scenario:
>>>> Consider dlm_send_remote_convert_request has taken too much time, and
>>>> dlm_move_lockres_to_recovery_list runs first and new master will see
>>>> this node is currently in convert list after recovery.
>>>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>>>> it will revert it to grant list, then retry convert. This will makes
>>>> this node and master inconsistent.
>>>> I will try to find another solution to resolve the race issue.
>>>
>>> If master is down, no need retry convert. May check the return value of
>>> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
>>> otherwise retry?
>>
>> I want to keep the original logic. And for fixing the race case I
>> described, how about the following idea?
>>
>> Check the status DLM_NORMAL. If res->state is currently
>> DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
>> still in recovery) or res master changed (means new master has finished
>> the recovery), reset the status to DLM_RECOVERING, just like the check
>> at the beginning of dlmconvert_remote. Then it is now in grant list and
>> outer will retry.
> How this idea fix the race windows you described in patch log? Lock is
> reverted to granted list but dlm_send_remote_convert_request() return
> DLM_NORMAL.
>
As I described in the former mail, status is now reset to DLM_RECOVERING,
caller will retry. And the original way will just wait forever.
Thanks
Joseph
> Thanks,
> Junxiao.
>>
>>>
>>> Thanks,
>>> Junxiao.
>>>
>>>>
>>>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>>
>>>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>>>
>>>>>>> There is a race window between dlmconvert_remote and
>>>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>>>
>>>>>>> dlmconvert_remote
>>>>>>> {
>>>>>>> spin_lock(&res->spinlock);
>>>>>>> list_move_tail(&lock->list, &res->converting);
>>>>>>> lock->convert_pending = 1;
>>>>>>> spin_unlock(&res->spinlock);
>>>>>>>
>>>>>>> status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>>> and then down before sending ast.
>>>>>>> this node detects master down and calls
>>>>>>> dlm_move_lockres_to_recovery_list, which will revert the
>>>>>>> lock to grant list.
>>>>>>> Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>>> send ast any more because it thinks already be authorized.
>>>>>>>
>>>>>>> spin_lock(&res->spinlock);
>>>>>>> lock->convert_pending = 0;
>>>>>>> if (status != DLM_NORMAL)
>>>>>>> dlm_revert_pending_convert(res, lock);
>>>>>>> spin_unlock(&res->spinlock);
>>>>>>> }
>>>>>>>
>>>>>>> In this case, just leave it in convert list and new master will take care
>>>>>>> of it after recovery. And if convert request returns other than
>>>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>>>
>>>>>>> changelog since v1:
>>>>>>> Clean up convert_pending since it is now useless.
>>>>
>>>>
>>>
>>>
>>> .
>>>
>>
>>
>
>
> .
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery
2015-09-23 1:47 ` Joseph Qi
@ 2015-09-23 1:59 ` Junxiao Bi
2015-09-23 7:48 ` Joseph Qi
0 siblings, 1 reply; 10+ messages in thread
From: Junxiao Bi @ 2015-09-23 1:59 UTC (permalink / raw)
To: ocfs2-devel
On 09/23/2015 09:47 AM, Joseph Qi wrote:
> On 2015/9/23 9:25, Junxiao Bi wrote:
>> On 09/22/2015 08:23 PM, Joseph Qi wrote:
>>> Hi Junxiao,
>>>
>>> On 2015/9/22 8:55, Junxiao Bi wrote:
>>>> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>>>>> Hi Junxiao,
>>>>> This solution may have problem in the following scenario:
>>>>> Consider dlm_send_remote_convert_request has taken too much time, and
>>>>> dlm_move_lockres_to_recovery_list runs first and new master will see
>>>>> this node is currently in convert list after recovery.
>>>>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>>>>> it will revert it to grant list, then retry convert. This will makes
>>>>> this node and master inconsistent.
>>>>> I will try to find another solution to resolve the race issue.
>>>>
>>>> If master is down, no need retry convert. May check the return value of
>>>> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
>>>> otherwise retry?
>>>
>>> I want to keep the original logic. And for fixing the race case I
>>> described, how about the following idea?
>>>
>>> Check the status DLM_NORMAL. If res->state is currently
>>> DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
>>> still in recovery) or res master changed (means new master has finished
>>> the recovery), reset the status to DLM_RECOVERING, just like the check
>>> at the beginning of dlmconvert_remote. Then it is now in grant list and
>>> outer will retry.
>> How this idea fix the race windows you described in patch log? Lock is
>> reverted to granted list but dlm_send_remote_convert_request() return
>> DLM_NORMAL.
>>
> As I described in the former mail, status is now reset to DLM_RECOVERING,
> caller will retry. And the original way will just wait forever.
I mean convert request was sent to res master successfully,
dlm_send_remote_convert_request() have return DLM_NORMAL, but before res
master send ast it panic. Looks convert will not retry in this case.
Thanks,
Junxiao.
>
> Thanks
> Joseph
>
>> Thanks,
>> Junxiao.
>>>
>>>>
>>>> Thanks,
>>>> Junxiao.
>>>>
>>>>>
>>>>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>>>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>>>
>>>>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>>>>
>>>>>>>> There is a race window between dlmconvert_remote and
>>>>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>>>>
>>>>>>>> dlmconvert_remote
>>>>>>>> {
>>>>>>>> spin_lock(&res->spinlock);
>>>>>>>> list_move_tail(&lock->list, &res->converting);
>>>>>>>> lock->convert_pending = 1;
>>>>>>>> spin_unlock(&res->spinlock);
>>>>>>>>
>>>>>>>> status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>>>> and then down before sending ast.
>>>>>>>> this node detects master down and calls
>>>>>>>> dlm_move_lockres_to_recovery_list, which will revert the
>>>>>>>> lock to grant list.
>>>>>>>> Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>>>> send ast any more because it thinks already be authorized.
>>>>>>>>
>>>>>>>> spin_lock(&res->spinlock);
>>>>>>>> lock->convert_pending = 0;
>>>>>>>> if (status != DLM_NORMAL)
>>>>>>>> dlm_revert_pending_convert(res, lock);
>>>>>>>> spin_unlock(&res->spinlock);
>>>>>>>> }
>>>>>>>>
>>>>>>>> In this case, just leave it in convert list and new master will take care
>>>>>>>> of it after recovery. And if convert request returns other than
>>>>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>>>>
>>>>>>>> changelog since v1:
>>>>>>>> Clean up convert_pending since it is now useless.
>>>>>
>>>>>
>>>>
>>>>
>>>> .
>>>>
>>>
>>>
>>
>>
>> .
>>
>
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery
2015-09-23 1:59 ` Junxiao Bi
@ 2015-09-23 7:48 ` Joseph Qi
2015-09-23 8:42 ` Junxiao Bi
0 siblings, 1 reply; 10+ messages in thread
From: Joseph Qi @ 2015-09-23 7:48 UTC (permalink / raw)
To: ocfs2-devel
On 2015/9/23 9:59, Junxiao Bi wrote:
> On 09/23/2015 09:47 AM, Joseph Qi wrote:
>> On 2015/9/23 9:25, Junxiao Bi wrote:
>>> On 09/22/2015 08:23 PM, Joseph Qi wrote:
>>>> Hi Junxiao,
>>>>
>>>> On 2015/9/22 8:55, Junxiao Bi wrote:
>>>>> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>>>>>> Hi Junxiao,
>>>>>> This solution may have problem in the following scenario:
>>>>>> Consider dlm_send_remote_convert_request has taken too much time, and
>>>>>> dlm_move_lockres_to_recovery_list runs first and new master will see
>>>>>> this node is currently in convert list after recovery.
>>>>>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>>>>>> it will revert it to grant list, then retry convert. This will makes
>>>>>> this node and master inconsistent.
>>>>>> I will try to find another solution to resolve the race issue.
>>>>>
>>>>> If master is down, no need retry convert. May check the return value of
>>>>> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
>>>>> otherwise retry?
>>>>
>>>> I want to keep the original logic. And for fixing the race case I
>>>> described, how about the following idea?
>>>>
>>>> Check the status DLM_NORMAL. If res->state is currently
>>>> DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
>>>> still in recovery) or res master changed (means new master has finished
>>>> the recovery), reset the status to DLM_RECOVERING, just like the check
>>>> at the beginning of dlmconvert_remote. Then it is now in grant list and
>>>> outer will retry.
>>> How this idea fix the race windows you described in patch log? Lock is
>>> reverted to granted list but dlm_send_remote_convert_request() return
>>> DLM_NORMAL.
>>>
>> As I described in the former mail, status is now reset to DLM_RECOVERING,
>> caller will retry. And the original way will just wait forever.
> I mean convert request was sent to res master successfully,
> dlm_send_remote_convert_request() have return DLM_NORMAL, but before res
> master send ast it panic. Looks convert will not retry in this case.
>
The original way is just what you said.
And my idea is letting dlmlock retry by resetting status to DLM_RECOVERING in
such a case.
> Thanks,
> Junxiao.
>>
>> Thanks
>> Joseph
>>
>>> Thanks,
>>> Junxiao.
>>>>
>>>>>
>>>>> Thanks,
>>>>> Junxiao.
>>>>>
>>>>>>
>>>>>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>>>>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>>>>
>>>>>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>>>>>
>>>>>>>>> There is a race window between dlmconvert_remote and
>>>>>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>>>>>
>>>>>>>>> dlmconvert_remote
>>>>>>>>> {
>>>>>>>>> spin_lock(&res->spinlock);
>>>>>>>>> list_move_tail(&lock->list, &res->converting);
>>>>>>>>> lock->convert_pending = 1;
>>>>>>>>> spin_unlock(&res->spinlock);
>>>>>>>>>
>>>>>>>>> status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>>>>> and then down before sending ast.
>>>>>>>>> this node detects master down and calls
>>>>>>>>> dlm_move_lockres_to_recovery_list, which will revert the
>>>>>>>>> lock to grant list.
>>>>>>>>> Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>>>>> send ast any more because it thinks already be authorized.
>>>>>>>>>
>>>>>>>>> spin_lock(&res->spinlock);
>>>>>>>>> lock->convert_pending = 0;
>>>>>>>>> if (status != DLM_NORMAL)
>>>>>>>>> dlm_revert_pending_convert(res, lock);
>>>>>>>>> spin_unlock(&res->spinlock);
>>>>>>>>> }
>>>>>>>>>
>>>>>>>>> In this case, just leave it in convert list and new master will take care
>>>>>>>>> of it after recovery. And if convert request returns other than
>>>>>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>>>>>
>>>>>>>>> changelog since v1:
>>>>>>>>> Clean up convert_pending since it is now useless.
>>>>>>
>>>>>>
>>>>>
>>>>>
>>>>> .
>>>>>
>>>>
>>>>
>>>
>>>
>>> .
>>>
>>
>>
>
>
> .
>
^ permalink raw reply [flat|nested] 10+ messages in thread
* [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery
2015-09-23 7:48 ` Joseph Qi
@ 2015-09-23 8:42 ` Junxiao Bi
0 siblings, 0 replies; 10+ messages in thread
From: Junxiao Bi @ 2015-09-23 8:42 UTC (permalink / raw)
To: ocfs2-devel
On 09/23/2015 03:48 PM, Joseph Qi wrote:
> On 2015/9/23 9:59, Junxiao Bi wrote:
>> On 09/23/2015 09:47 AM, Joseph Qi wrote:
>>> On 2015/9/23 9:25, Junxiao Bi wrote:
>>>> On 09/22/2015 08:23 PM, Joseph Qi wrote:
>>>>> Hi Junxiao,
>>>>>
>>>>> On 2015/9/22 8:55, Junxiao Bi wrote:
>>>>>> On 09/21/2015 05:23 PM, Joseph Qi wrote:
>>>>>>> Hi Junxiao,
>>>>>>> This solution may have problem in the following scenario:
>>>>>>> Consider dlm_send_remote_convert_request has taken too much time, and
>>>>>>> dlm_move_lockres_to_recovery_list runs first and new master will see
>>>>>>> this node is currently in convert list after recovery.
>>>>>>> Then dlm_send_remote_convert_request returns other than DLM_NORMAL and
>>>>>>> it will revert it to grant list, then retry convert. This will makes
>>>>>>> this node and master inconsistent.
>>>>>>> I will try to find another solution to resolve the race issue.
>>>>>>
>>>>>> If master is down, no need retry convert. May check the return value of
>>>>>> dlm_send_remote_convert_request(), if DLM_RECOVERING, don't retry,
>>>>>> otherwise retry?
>>>>>
>>>>> I want to keep the original logic. And for fixing the race case I
>>>>> described, how about the following idea?
>>>>>
>>>>> Check the status DLM_NORMAL. If res->state is currently
>>>>> DLM_LOCK_RES_RECOVERING (set in dlm_move_lockres_to_recovery_list, means
>>>>> still in recovery) or res master changed (means new master has finished
>>>>> the recovery), reset the status to DLM_RECOVERING, just like the check
>>>>> at the beginning of dlmconvert_remote. Then it is now in grant list and
>>>>> outer will retry.
>>>> How this idea fix the race windows you described in patch log? Lock is
>>>> reverted to granted list but dlm_send_remote_convert_request() return
>>>> DLM_NORMAL.
>>>>
>>> As I described in the former mail, status is now reset to DLM_RECOVERING,
>>> caller will retry. And the original way will just wait forever.
>> I mean convert request was sent to res master successfully,
>> dlm_send_remote_convert_request() have return DLM_NORMAL, but before res
>> master send ast it panic. Looks convert will not retry in this case.
>>
> The original way is just what you said.
> And my idea is letting dlmlock retry by resetting status to DLM_RECOVERING in
> such a case.
So convert_pending is still dropped? If so when
dlm_send_remote_convert_request() return DLM_RECOVERING, should reset it
to DLM_NORMAL and didn't revert lock to granted list. Retry is not needed.
Thanks,
Junxiao.
>
>> Thanks,
>> Junxiao.
>>>
>>> Thanks
>>> Joseph
>>>
>>>> Thanks,
>>>> Junxiao.
>>>>>
>>>>>>
>>>>>> Thanks,
>>>>>> Junxiao.
>>>>>>
>>>>>>>
>>>>>>> On 2015/9/20 15:20, Junxiao Bi wrote:
>>>>>>>> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
>>>>>>>>
>>>>>>>>>> ? 2015?9?18????7:25?Joseph Qi <joseph.qi@huawei.com> ???
>>>>>>>>>>
>>>>>>>>>> There is a race window between dlmconvert_remote and
>>>>>>>>>> dlm_move_lockres_to_recovery_list, which will cause a lock with
>>>>>>>>>> OCFS2_LOCK_BUSY in grant list, thus system hangs.
>>>>>>>>>>
>>>>>>>>>> dlmconvert_remote
>>>>>>>>>> {
>>>>>>>>>> spin_lock(&res->spinlock);
>>>>>>>>>> list_move_tail(&lock->list, &res->converting);
>>>>>>>>>> lock->convert_pending = 1;
>>>>>>>>>> spin_unlock(&res->spinlock);
>>>>>>>>>>
>>>>>>>>>> status = dlm_send_remote_convert_request();
>>>>>>>>>>>>>>>>>>>>>> race window, master has queued ast and return DLM_NORMAL,
>>>>>>>>>> and then down before sending ast.
>>>>>>>>>> this node detects master down and calls
>>>>>>>>>> dlm_move_lockres_to_recovery_list, which will revert the
>>>>>>>>>> lock to grant list.
>>>>>>>>>> Then OCFS2_LOCK_BUSY won't be cleared as new master won't
>>>>>>>>>> send ast any more because it thinks already be authorized.
>>>>>>>>>>
>>>>>>>>>> spin_lock(&res->spinlock);
>>>>>>>>>> lock->convert_pending = 0;
>>>>>>>>>> if (status != DLM_NORMAL)
>>>>>>>>>> dlm_revert_pending_convert(res, lock);
>>>>>>>>>> spin_unlock(&res->spinlock);
>>>>>>>>>> }
>>>>>>>>>>
>>>>>>>>>> In this case, just leave it in convert list and new master will take care
>>>>>>>>>> of it after recovery. And if convert request returns other than
>>>>>>>>>> DLM_NORMAL, convert thread will do the revert itself. So remove the
>>>>>>>>>> revert logic in dlm_move_lockres_to_recovery_list.
>>>>>>>>>>
>>>>>>>>>> changelog since v1:
>>>>>>>>>> Clean up convert_pending since it is now useless.
>>>>>>>
>>>>>>>
>>>>>>
>>>>>>
>>>>>> .
>>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>> .
>>>>
>>>
>>>
>>
>>
>> .
>>
>
>
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2015-09-23 8:42 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-09-18 11:25 [Ocfs2-devel] [PATCH v2] ocfs2/dlm: fix race between convert and recovery Joseph Qi
2015-09-20 7:20 ` Junxiao Bi
2015-09-21 9:23 ` Joseph Qi
2015-09-22 0:55 ` Junxiao Bi
2015-09-22 12:23 ` Joseph Qi
2015-09-23 1:25 ` Junxiao Bi
2015-09-23 1:47 ` Joseph Qi
2015-09-23 1:59 ` Junxiao Bi
2015-09-23 7:48 ` Joseph Qi
2015-09-23 8:42 ` Junxiao Bi
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.