All of lore.kernel.org
 help / color / mirror / Atom feed
* [Ocfs2-devel] [PATCH] ocfs2: resend master request when lost connection with someone
@ 2012-12-22  7:00 Xue jiufei
  2013-05-28  6:12 ` xiaowei.hu
  0 siblings, 1 reply; 4+ messages in thread
From: Xue jiufei @ 2012-12-22  7:00 UTC (permalink / raw)
  To: ocfs2-devel

  Function dlm_get_lock_resource() sends master request to all nodes in
domain_map and waits for their responses when the node (say nodeA) doesn't
know who the master is. 
  When nodeA sends the master request, it happened that network of
nodeB down for a while, and then restore. The master request
from nodeA does not reach nodeB. NodeA may wait again and again in
dlm_wait_for_lock_mastery() and never returns.
  This patch resends the master request when a node loses connection with
some other nodes.

Signed-off-by: xuejiufei <xuejiufei@huawei.com>
---
 fs/ocfs2/dlm/dlmmaster.c |   41 +++++++++++++++++++++++++++++++++++------
 1 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index c491f97..2a99a95 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -106,7 +106,7 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
 				     struct dlm_lock_resource *res,
 				     struct dlm_master_list_entry *mle,
-				     int *blocked);
+				     int *blocked, int *retry, int host_down);
 static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 				    struct dlm_lock_resource *res,
 				    struct dlm_master_list_entry *mle,
@@ -712,6 +712,8 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 	unsigned int hash;
 	int tries = 0;
 	int bit, wait_on_recovery = 0;
+	int retry = 0;
+	unsigned long down_nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 
 	BUG_ON(!lockid);
 
@@ -910,11 +912,25 @@ redo_request:
 		goto wait;
 
 	ret = -EINVAL;
-	dlm_node_iter_init(mle->vote_map, &iter);
+	if (!retry)
+		dlm_node_iter_init(mle->vote_map, &iter);
+	else {
+		mlog(0, "%s:%.*s: retrying, send master request to maybe down node\n",
+				dlm->name, res->lockname.len, res->lockname.name);
+		dlm_node_iter_init(down_nodemap, &iter);
+	}
+	memset(down_nodemap, 0, sizeof(down_nodemap));
+
 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
 		ret = dlm_do_master_request(res, mle, nodenum);
-		if (ret < 0)
+		if (ret < 0) {
 			mlog_errno(ret);
+			if (dlm_is_host_down(ret)) {
+				mlog(0, "%s:%.*s: node %u maybe dead, set down_nodemap\n",
+						dlm->name, res->lockname.len, res->lockname.name, nodenum);
+				set_bit(nodenum, down_nodemap);
+			}
+		}
 		if (mle->master != O2NM_MAX_NODES) {
 			/* found a master ! */
 			if (mle->master <= nodenum)
@@ -931,9 +947,11 @@ redo_request:
 
 wait:
 	/* keep going until the response map includes all nodes */
-	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
+	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked, &retry,
+			find_next_bit(down_nodemap, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES);
 	if (ret < 0) {
-		wait_on_recovery = 1;
+		if (!retry)
+			wait_on_recovery = 1;
 		mlog(0, "%s: res %.*s, Node map changed, redo the master "
 		     "request now, blocked=%d\n", dlm->name, res->lockname.len,
 		     res->lockname.name, blocked);
@@ -980,7 +998,7 @@ leave:
 static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
 				     struct dlm_lock_resource *res,
 				     struct dlm_master_list_entry *mle,
-				     int *blocked)
+				     int *blocked, int *retry, int host_down)
 {
 	u8 m;
 	int ret, bit;
@@ -990,6 +1008,7 @@ static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
 recheck:
 	ret = 0;
 	assert = 0;
+	*retry = 0;
 
 	/* check if another node has already become the owner */
 	spin_lock(&res->spinlock);
@@ -1043,6 +1062,16 @@ recheck:
 		     res->lockname.name);
 		goto recheck;
 	} else {
+		if (host_down && (m == O2NM_MAX_NODES)) {
+			mlog(0, "map not changed but some one may lost connection, "
+					"rechecking\n");
+			*retry = 1;
+			spin_unlock(&mle->spinlock);
+			msleep(DLM_NODE_DEATH_WAIT_MAX);
+			ret = -EAGAIN;
+			goto leave;
+		}
+
 		if (!voting_done) {
 			mlog(0, "map not changed and voting not done "
 			     "for %s:%.*s\n", dlm->name, res->lockname.len,
-- 
1.7.8.6

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [Ocfs2-devel] [PATCH] ocfs2: resend master request when lost connection with someone
  2012-12-22  7:00 [Ocfs2-devel] [PATCH] ocfs2: resend master request when lost connection with someone Xue jiufei
@ 2013-05-28  6:12 ` xiaowei.hu
  2013-05-31 10:38   ` Xue jiufei
  0 siblings, 1 reply; 4+ messages in thread
From: xiaowei.hu @ 2013-05-28  6:12 UTC (permalink / raw)
  To: ocfs2-devel

Hi,

I reviewed this patch; it indeed could fix a temporary lost-connection
problem, but I have a few questions:

1. Since we don't need to know the node numbers of the down nodes, what if
we simply replace the down_nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)] with an
int named, for example, mreq_msg_send_fail?

2. Since the final work is to return -EAGAIN and then resend all master
requests, how about we simply do this?:

  	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
  		ret = dlm_do_master_request(res, mle, nodenum);
-		if (ret < 0)
+		if (ret < 0) {
  			mlog_errno(ret);
+			wait_on_recovery = 1;
+			msleep(DLM_NODE_DEATH_WAIT_MAX);
+			goto redo_request;
+		}

Am I missing something?

Thanks,
Xiaowei

On 12/22/2012 03:00 PM, Xue jiufei wrote:
>    Function dlm_get_lock_resource() sends master request to all nodes in
> domain_map and waits for their responses when the node(say nodeA) doesn't
> known who the master is.
>    When nodeA sends the master request, it happened that network of
> nodeB down for a while, and then restore. The master request
> from nodeA does not reach nodeB. NodeA may wait again and again in
> dlm_wait_for_lock_mastery() and never returns.
>    This patch resend the mater request when a node lost connection with
> some other nodes.
>
> Signed-off-by: xuejiufei <xuejiufei@huawei.com>
> ---
>   fs/ocfs2/dlm/dlmmaster.c |   41 +++++++++++++++++++++++++++++++++++------
>   1 files changed, 35 insertions(+), 6 deletions(-)
>
> diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
> index c491f97..2a99a95 100644
> --- a/fs/ocfs2/dlm/dlmmaster.c
> +++ b/fs/ocfs2/dlm/dlmmaster.c
> @@ -106,7 +106,7 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
>   static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
>   				     struct dlm_lock_resource *res,
>   				     struct dlm_master_list_entry *mle,
> -				     int *blocked);
> +				     int *blocked, int *retry, int host_down);
>   static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
>   				    struct dlm_lock_resource *res,
>   				    struct dlm_master_list_entry *mle,
> @@ -712,6 +712,8 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
>   	unsigned int hash;
>   	int tries = 0;
>   	int bit, wait_on_recovery = 0;
> +	int retry = 0;
> +	unsigned long down_nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
>   
>   	BUG_ON(!lockid);
>   
> @@ -910,11 +912,25 @@ redo_request:
>   		goto wait;
>   
>   	ret = -EINVAL;
> -	dlm_node_iter_init(mle->vote_map, &iter);
> +	if (!retry)
> +		dlm_node_iter_init(mle->vote_map, &iter);
> +	else {
> +		mlog(0, "%s:%.*s: retrying, send master request to maybe down node\n",
> +				dlm->name, res->lockname.len, res->lockname.name);
> +		dlm_node_iter_init(down_nodemap, &iter);
> +	}
> +	memset(down_nodemap, 0, sizeof(down_nodemap));
> +
>   	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
>   		ret = dlm_do_master_request(res, mle, nodenum);
> -		if (ret < 0)
> +		if (ret < 0) {
>   			mlog_errno(ret);
> +			if (dlm_is_host_down(ret)) {
> +				mlog(0, "%s:%.*s: node %u maybe dead, set down_nodemap\n",
> +						dlm->name, res->lockname.len, res->lockname.name, nodenum);
> +				set_bit(nodenum, down_nodemap);
> +			}
> +		}
>   		if (mle->master != O2NM_MAX_NODES) {
>   			/* found a master ! */
>   			if (mle->master <= nodenum)
> @@ -931,9 +947,11 @@ redo_request:
>   
>   wait:
>   	/* keep going until the response map includes all nodes */
> -	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
> +	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked, &retry,
> +			find_next_bit(down_nodemap, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES);
>   	if (ret < 0) {
> -		wait_on_recovery = 1;
> +		if (!retry)
> +			wait_on_recovery = 1;
>   		mlog(0, "%s: res %.*s, Node map changed, redo the master "
>   		     "request now, blocked=%d\n", dlm->name, res->lockname.len,
>   		     res->lockname.name, blocked);
> @@ -980,7 +998,7 @@ leave:
>   static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
>   				     struct dlm_lock_resource *res,
>   				     struct dlm_master_list_entry *mle,
> -				     int *blocked)
> +				     int *blocked, int *retry, int host_down)
>   {
>   	u8 m;
>   	int ret, bit;
> @@ -990,6 +1008,7 @@ static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
>   recheck:
>   	ret = 0;
>   	assert = 0;
> +	*retry = 0;
>   
>   	/* check if another node has already become the owner */
>   	spin_lock(&res->spinlock);
> @@ -1043,6 +1062,16 @@ recheck:
>   		     res->lockname.name);
>   		goto recheck;
>   	} else {
> +		if (host_down && (m == O2NM_MAX_NODES)) {
> +			mlog(0, "map not changed but some one may lost connection, "
> +					"rechecking\n");
> +			*retry = 1;
> +			spin_unlock(&mle->spinlock);
> +			msleep(DLM_NODE_DEATH_WAIT_MAX);
> +			ret = -EAGAIN;
> +			goto leave;
> +		}
> +
>   		if (!voting_done) {
>   			mlog(0, "map not changed and voting not done "
>   			     "for %s:%.*s\n", dlm->name, res->lockname.len,

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Ocfs2-devel] [PATCH] ocfs2: resend master request when lost connection with someone
  2013-05-28  6:12 ` xiaowei.hu
@ 2013-05-31 10:38   ` Xue jiufei
  2013-05-31 18:55     ` Srinivas Eeda
  0 siblings, 1 reply; 4+ messages in thread
From: Xue jiufei @ 2013-05-31 10:38 UTC (permalink / raw)
  To: ocfs2-devel

Hi, Xiaowei
It's OK to simplify the patch just as you did. But we don't want to resend
the master request to all other nodes in consideration of network traffic.
So we record those maybe-down nodes in down_nodemap.

On 2013/5/28 14:12, xiaowei.hu wrote:
> Hi,
> 
> I reviewed this patch , it did could fix a temp lost connection problem, but a few questions:
> 
> 1. since we don't need to know the node numbers of down nodes, if simply replace the down_nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)], with a int named for example mreq_msg_send_fail ?
> 
> 2.since the final work is to return -EAGAIN, the resend all master requests. How about we simply do this?:
> 
>      while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
>          ret = dlm_do_master_request(res, mle, nodenum);
> -        if (ret < 0)
> +        if (ret < 0) {
>              mlog_errno(ret);
> +            wait_on_recovery = 1;
> +            msleep(DLM_NODE_DEATH_WAIT_MAX);
> +            goto redo_request;
> +        }
> 
> Am I missing something?
> 
> Thanks,
> Xiaowei
> 
> On 12/22/2012 03:00 PM, Xue jiufei wrote:
>>    Function dlm_get_lock_resource() sends master request to all nodes in
>> domain_map and waits for their responses when the node(say nodeA) doesn't
>> known who the master is.
>>    When nodeA sends the master request, it happened that network of
>> nodeB down for a while, and then restore. The master request
>> from nodeA does not reach nodeB. NodeA may wait again and again in
>> dlm_wait_for_lock_mastery() and never returns.
>>    This patch resend the mater request when a node lost connection with
>> some other nodes.
>>
>> Signed-off-by: xuejiufei <xuejiufei@huawei.com>
>> ---
>>   fs/ocfs2/dlm/dlmmaster.c |   41 +++++++++++++++++++++++++++++++++++------
>>   1 files changed, 35 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
>> index c491f97..2a99a95 100644
>> --- a/fs/ocfs2/dlm/dlmmaster.c
>> +++ b/fs/ocfs2/dlm/dlmmaster.c
>> @@ -106,7 +106,7 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
>>   static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
>>                        struct dlm_lock_resource *res,
>>                        struct dlm_master_list_entry *mle,
>> -                     int *blocked);
>> +                     int *blocked, int *retry, int host_down);
>>   static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
>>                       struct dlm_lock_resource *res,
>>                       struct dlm_master_list_entry *mle,
>> @@ -712,6 +712,8 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
>>       unsigned int hash;
>>       int tries = 0;
>>       int bit, wait_on_recovery = 0;
>> +    int retry = 0;
>> +    unsigned long down_nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
>>         BUG_ON(!lockid);
>>   @@ -910,11 +912,25 @@ redo_request:
>>           goto wait;
>>         ret = -EINVAL;
>> -    dlm_node_iter_init(mle->vote_map, &iter);
>> +    if (!retry)
>> +        dlm_node_iter_init(mle->vote_map, &iter);
>> +    else {
>> +        mlog(0, "%s:%.*s: retrying, send master request to maybe down node\n",
>> +                dlm->name, res->lockname.len, res->lockname.name);
>> +        dlm_node_iter_init(down_nodemap, &iter);
>> +    }
>> +    memset(down_nodemap, 0, sizeof(down_nodemap));
>> +
>>       while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
>>           ret = dlm_do_master_request(res, mle, nodenum);
>> -        if (ret < 0)
>> +        if (ret < 0) {
>>               mlog_errno(ret);
>> +            if (dlm_is_host_down(ret)) {
>> +                mlog(0, "%s:%.*s: node %u maybe dead, set down_nodemap\n",
>> +                        dlm->name, res->lockname.len, res->lockname.name, nodenum);
>> +                set_bit(nodenum, down_nodemap);
>> +            }
>> +        }
>>           if (mle->master != O2NM_MAX_NODES) {
>>               /* found a master ! */
>>               if (mle->master <= nodenum)
>> @@ -931,9 +947,11 @@ redo_request:
>>     wait:
>>       /* keep going until the response map includes all nodes */
>> -    ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
>> +    ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked, &retry,
>> +            find_next_bit(down_nodemap, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES);
>>       if (ret < 0) {
>> -        wait_on_recovery = 1;
>> +        if (!retry)
>> +            wait_on_recovery = 1;
>>           mlog(0, "%s: res %.*s, Node map changed, redo the master "
>>                "request now, blocked=%d\n", dlm->name, res->lockname.len,
>>                res->lockname.name, blocked);
>> @@ -980,7 +998,7 @@ leave:
>>   static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
>>                        struct dlm_lock_resource *res,
>>                        struct dlm_master_list_entry *mle,
>> -                     int *blocked)
>> +                     int *blocked, int *retry, int host_down)
>>   {
>>       u8 m;
>>       int ret, bit;
>> @@ -990,6 +1008,7 @@ static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
>>   recheck:
>>       ret = 0;
>>       assert = 0;
>> +    *retry = 0;
>>         /* check if another node has already become the owner */
>>       spin_lock(&res->spinlock);
>> @@ -1043,6 +1062,16 @@ recheck:
>>                res->lockname.name);
>>           goto recheck;
>>       } else {
>> +        if (host_down && (m == O2NM_MAX_NODES)) {
>> +            mlog(0, "map not changed but some one may lost connection, "
>> +                    "rechecking\n");
>> +            *retry = 1;
>> +            spin_unlock(&mle->spinlock);
>> +            msleep(DLM_NODE_DEATH_WAIT_MAX);
>> +            ret = -EAGAIN;
>> +            goto leave;
>> +        }
>> +
>>           if (!voting_done) {
>>               mlog(0, "map not changed and voting not done "
>>                    "for %s:%.*s\n", dlm->name, res->lockname.len,
> 
> .
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* [Ocfs2-devel] [PATCH] ocfs2: resend master request when lost connection with someone
  2013-05-31 10:38   ` Xue jiufei
@ 2013-05-31 18:55     ` Srinivas Eeda
  0 siblings, 0 replies; 4+ messages in thread
From: Srinivas Eeda @ 2013-05-31 18:55 UTC (permalink / raw)
  To: ocfs2-devel

On 05/31/2013 03:38 AM, Xue jiufei wrote:
> Hi, Xiaowei
> It's OK to simlify the patch just as you did. But we don't want to resend
> master request to all others nodes in consideration of network traffic.
> So we record those maybe down nodes in down_nodemap.
>
> On 2013/5/28 14:12, xiaowei.hu wrote:
>> Hi,
>>
>> I reviewed this patch , it did could fix a temp lost connection problem, but a few questions:
>>
>> 1. since we don't need to know the node numbers of down nodes, if simply replace the down_nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)], with a int named for example mreq_msg_send_fail ?
>>
>> 2.since the final work is to return -EAGAIN, the resend all master requests. How about we simply do this?:
>>
>>       while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
>>           ret = dlm_do_master_request(res, mle, nodenum);
>> -        if (ret < 0)
>> +        if (ret < 0) {
>>               mlog_errno(ret);
>> +            wait_on_recovery = 1;
>> +            msleep(DLM_NODE_DEATH_WAIT_MAX);
>> +            goto redo_request;
>> +        }
>>
>> Am I missing something?
>>
>> Thanks,
>> Xiaowei
>>
>> On 12/22/2012 03:00 PM, Xue jiufei wrote:
>>>     Function dlm_get_lock_resource() sends master request to all nodes in
>>> domain_map and waits for their responses when the node(say nodeA) doesn't
>>> known who the master is.
>>>     When nodeA sends the master request, it happened that network of
>>> nodeB down for a while, and then restore. The master request
>>> from nodeA does not reach nodeB. NodeA may wait again and again in
>>> dlm_wait_for_lock_mastery() and never returns.
>>>     This patch resend the mater request when a node lost connection with
>>> some other nodes.
Yes, with the current reconnect code there is a possibility of message 
loss, and it can happen to all kinds of messages and responses. You are 
assuming the message was never received by node B. The message might have 
been received by node B but the response might have been lost too. 
Currently there is no way of telling the difference. The DLM layer 
shouldn't worry about nodes losing connection temporarily. The right way 
to fix this is to fix the o2net reconnect code.
>>>
>>> Signed-off-by: xuejiufei <xuejiufei@huawei.com>
>>> ---
>>>    fs/ocfs2/dlm/dlmmaster.c |   41 +++++++++++++++++++++++++++++++++++------
>>>    1 files changed, 35 insertions(+), 6 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
>>> index c491f97..2a99a95 100644
>>> --- a/fs/ocfs2/dlm/dlmmaster.c
>>> +++ b/fs/ocfs2/dlm/dlmmaster.c
>>> @@ -106,7 +106,7 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
>>>    static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
>>>                         struct dlm_lock_resource *res,
>>>                         struct dlm_master_list_entry *mle,
>>> -                     int *blocked);
>>> +                     int *blocked, int *retry, int host_down);
>>>    static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
>>>                        struct dlm_lock_resource *res,
>>>                        struct dlm_master_list_entry *mle,
>>> @@ -712,6 +712,8 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
>>>        unsigned int hash;
>>>        int tries = 0;
>>>        int bit, wait_on_recovery = 0;
>>> +    int retry = 0;
>>> +    unsigned long down_nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
>>>          BUG_ON(!lockid);
>>>    @@ -910,11 +912,25 @@ redo_request:
>>>            goto wait;
>>>          ret = -EINVAL;
>>> -    dlm_node_iter_init(mle->vote_map, &iter);
>>> +    if (!retry)
>>> +        dlm_node_iter_init(mle->vote_map, &iter);
>>> +    else {
>>> +        mlog(0, "%s:%.*s: retrying, send master request to maybe down node\n",
>>> +                dlm->name, res->lockname.len, res->lockname.name);
>>> +        dlm_node_iter_init(down_nodemap, &iter);
>>> +    }
>>> +    memset(down_nodemap, 0, sizeof(down_nodemap));
>>> +
>>>        while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
>>>            ret = dlm_do_master_request(res, mle, nodenum);
>>> -        if (ret < 0)
>>> +        if (ret < 0) {
>>>                mlog_errno(ret);
>>> +            if (dlm_is_host_down(ret)) {
>>> +                mlog(0, "%s:%.*s: node %u maybe dead, set down_nodemap\n",
>>> +                        dlm->name, res->lockname.len, res->lockname.name, nodenum);
>>> +                set_bit(nodenum, down_nodemap);
>>> +            }
>>> +        }
>>>            if (mle->master != O2NM_MAX_NODES) {
>>>                /* found a master ! */
>>>                if (mle->master <= nodenum)
>>> @@ -931,9 +947,11 @@ redo_request:
>>>      wait:
>>>        /* keep going until the response map includes all nodes */
>>> -    ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
>>> +    ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked, &retry,
>>> +            find_next_bit(down_nodemap, O2NM_MAX_NODES, 0) < O2NM_MAX_NODES);
>>>        if (ret < 0) {
>>> -        wait_on_recovery = 1;
>>> +        if (!retry)
>>> +            wait_on_recovery = 1;
>>>            mlog(0, "%s: res %.*s, Node map changed, redo the master "
>>>                 "request now, blocked=%d\n", dlm->name, res->lockname.len,
>>>                 res->lockname.name, blocked);
>>> @@ -980,7 +998,7 @@ leave:
>>>    static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
>>>                         struct dlm_lock_resource *res,
>>>                         struct dlm_master_list_entry *mle,
>>> -                     int *blocked)
>>> +                     int *blocked, int *retry, int host_down)
>>>    {
>>>        u8 m;
>>>        int ret, bit;
>>> @@ -990,6 +1008,7 @@ static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
>>>    recheck:
>>>        ret = 0;
>>>        assert = 0;
>>> +    *retry = 0;
>>>          /* check if another node has already become the owner */
>>>        spin_lock(&res->spinlock);
>>> @@ -1043,6 +1062,16 @@ recheck:
>>>                 res->lockname.name);
>>>            goto recheck;
>>>        } else {
>>> +        if (host_down && (m == O2NM_MAX_NODES)) {
>>> +            mlog(0, "map not changed but some one may lost connection, "
>>> +                    "rechecking\n");
>>> +            *retry = 1;
>>> +            spin_unlock(&mle->spinlock);
>>> +            msleep(DLM_NODE_DEATH_WAIT_MAX);
>>> +            ret = -EAGAIN;
>>> +            goto leave;
>>> +        }
>>> +
>>>            if (!voting_done) {
>>>                mlog(0, "map not changed and voting not done "
>>>                     "for %s:%.*s\n", dlm->name, res->lockname.len,
>> .
>>
>
>
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2013-05-31 18:55 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-12-22  7:00 [Ocfs2-devel] [PATCH] ocfs2: resend master request when lost connection with someone Xue jiufei
2013-05-28  6:12 ` xiaowei.hu
2013-05-31 10:38   ` Xue jiufei
2013-05-31 18:55     ` Srinivas Eeda

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.