* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
@ 2017-11-01  1:14 piaojun
  2017-11-01  2:47 ` Changwei Ge
  0 siblings, 1 reply; 11+ messages in thread
From: piaojun @ 2017-11-01  1:14 UTC (permalink / raw)
  To: ocfs2-devel

Wait for dlm recovery to finish when migrating all lockres; otherwise a
lockres collected during recovery can be left behind after the node leaves
the dlm domain.

      NodeA                       NodeB                NodeC

umount and migrate
all lockres

                                 node down

do recovery for NodeB
and collect a new lockres
from other live nodes

leave domain but the
new lockres remains

                                                  mount and join domain

                                                  request for the owner
                                                  of the new lockres, but
                                                  all the other nodes said
                                                  'NO', so NodeC decides to
                                                  be the owner, and sends
                                                  assert master msg to
                                                  other nodes.

                                                  other nodes receive the msg
                                                  and find two masters exist,
                                                  which at last causes BUG in
                                                  dlm_assert_master_handler()
                                                  -->BUG();

Fixes: bc9838c4d44a ("dlm: allow dlm do recovery during shutdown")

Signed-off-by: Jun Piao <piaojun@huawei.com>
Reviewed-by: Alex Chen <alex.chen@huawei.com>
Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
---
 fs/ocfs2/dlm/dlmcommon.h   |  1 +
 fs/ocfs2/dlm/dlmdomain.c   | 14 ++++++++++++++
 fs/ocfs2/dlm/dlmrecovery.c | 12 +++++++++---
 3 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e9f3705..999ab7d 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
 	u8 node_num;
 	u32 key;
 	u8  joining_node;
+	u8 migrate_done; /* set to 1 means node has migrated all lockres */
 	wait_queue_head_t dlm_join_events;
 	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
 	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index e1fea14..98a8f56 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -461,6 +461,18 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
 		cond_resched_lock(&dlm->spinlock);
 		num += n;
 	}
+
+	if (!num) {
+		if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+			mlog(0, "%s: perhaps there are more lock resources need to "
+					"be migrated after dlm recovery\n", dlm->name);
+			ret = -EAGAIN;
+		} else {
+			mlog(0, "%s: we won't do dlm recovery after migrating all lockres",
+					dlm->name);
+			dlm->migrate_done = 1;
+		}
+	}
 	spin_unlock(&dlm->spinlock);
 	wake_up(&dlm->dlm_thread_wq);

@@ -2052,6 +2064,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
 	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
 	init_waitqueue_head(&dlm->dlm_join_events);

+	dlm->migrate_done = 0;
+
 	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
 	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 74407c6..3106332 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)

 static void dlm_begin_recovery(struct dlm_ctxt *dlm)
 {
-	spin_lock(&dlm->spinlock);
+	assert_spin_locked(&dlm->spinlock);
 	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
 	printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
 	       dlm->name, dlm->reco.dead_node);
 	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
-	spin_unlock(&dlm->spinlock);
 }

 static void dlm_end_recovery(struct dlm_ctxt *dlm)
@@ -456,6 +455,12 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)

 	spin_lock(&dlm->spinlock);

+	if (dlm->migrate_done) {
+		mlog(0, "%s: no need do recovery after migrating all lockres\n",
+				dlm->name);
+		return 0;
+	}
+
 	/* check to see if the new master has died */
 	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
 	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
@@ -490,12 +495,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
 	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
 	     dlm->reco.dead_node);
-	spin_unlock(&dlm->spinlock);

 	/* take write barrier */
 	/* (stops the list reshuffling thread, proxy ast handling) */
 	dlm_begin_recovery(dlm);

+	spin_unlock(&dlm->spinlock);
+
 	if (dlm->reco.new_master == dlm->node_num)
 		goto master_here;

-- 

* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
  2017-11-01  1:14 [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres piaojun
@ 2017-11-01  2:47 ` Changwei Ge
  2017-11-01  5:52   ` piaojun
  0 siblings, 1 reply; 11+ messages in thread
From: Changwei Ge @ 2017-11-01  2:47 UTC (permalink / raw)
  To: ocfs2-devel

Hi Jun,

Thanks for reporting.
I am very interested in this issue. But, first of all, I want to make 
this issue clear, so that I might be able to provide some comments.


On 2017/11/1 9:16, piaojun wrote:
> wait for dlm recovery done when migrating all lockres in case of new
> lockres to be left after leaving dlm domain.

What do you mean by 'a new lock resource to be left after leaving 
domain'? Do you mean we leak a dlm lock resource if the situation below happens?

> 
>        NodeA                       NodeB                NodeC
> 
> umount and migrate
> all lockres
> 
>                                   node down
> 
> do recovery for NodeB
> and collect a new lockres
> form other live nodes

You mean a lock resource whose owner was NodeB is just migrated from 
other cluster member nodes?

> 
> leave domain but the
> new lockres remains
> 
>                                                    mount and join domain
> 
>                                                    request for the owner
>                                                    of the new lockres, but
>                                                    all the other nodes said
>                                                    'NO', so NodeC decide to
>                                                    the owner, and send do
>                                                    assert msg to other nodes.
> 
>                                                    other nodes receive the msg
>                                                    and found two masters exist.
>                                                    at last cause BUG in
>                                                    dlm_assert_master_handler()
>                                                    -->BUG();

If this issue truly exists, can we make some effort in 
dlm_exit_domain_handler()? Or perhaps we should kick dlm's work queue 
before migrating all lock resources.

> 
> Fixes: bc9838c4d44a ("dlm: allow dlm do recovery during shutdown")
> 
> Signed-off-by: Jun Piao <piaojun@huawei.com>
> Reviewed-by: Alex Chen <alex.chen@huawei.com>
> Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
> ---
>   fs/ocfs2/dlm/dlmcommon.h   |  1 +
>   fs/ocfs2/dlm/dlmdomain.c   | 14 ++++++++++++++
>   fs/ocfs2/dlm/dlmrecovery.c | 12 +++++++++---
>   3 files changed, 24 insertions(+), 3 deletions(-)
> 
> diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
> index e9f3705..999ab7d 100644
> --- a/fs/ocfs2/dlm/dlmcommon.h
> +++ b/fs/ocfs2/dlm/dlmcommon.h
> @@ -140,6 +140,7 @@ struct dlm_ctxt
>   	u8 node_num;
>   	u32 key;
>   	u8  joining_node;
> +	u8 migrate_done; /* set to 1 means node has migrated all lockres */
>   	wait_queue_head_t dlm_join_events;
>   	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
>   	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
> index e1fea14..98a8f56 100644
> --- a/fs/ocfs2/dlm/dlmdomain.c
> +++ b/fs/ocfs2/dlm/dlmdomain.c
> @@ -461,6 +461,18 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
>   		cond_resched_lock(&dlm->spinlock);
>   		num += n;
>   	}
> +
> +	if (!num) {
> +		if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
> +			mlog(0, "%s: perhaps there are more lock resources need to "
> +					"be migrated after dlm recovery\n", dlm->name);

If dlm is marked with DLM_RECO_STATE_ACTIVE, then a lock resource must 
already be marked with DLM_LOCK_RES_RECOVERING, which can't be migrated. 
So the code will goto redo_bucket in dlm_migrate_all_locks().
So I don't understand why this check is added here?



> +			ret = -EAGAIN;
> +		} else {
> +			mlog(0, "%s: we won't do dlm recovery after migrating all lockres",
> +					dlm->name);
> +			dlm->migrate_done = 1;
> +		}
> +	}
>   	spin_unlock(&dlm->spinlock);
>   	wake_up(&dlm->dlm_thread_wq);
> 
> @@ -2052,6 +2064,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
>   	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
>   	init_waitqueue_head(&dlm->dlm_join_events);
> 
> +	dlm->migrate_done = 0;
> +
>   	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
>   	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
> 
> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
> index 74407c6..3106332 100644
> --- a/fs/ocfs2/dlm/dlmrecovery.c
> +++ b/fs/ocfs2/dlm/dlmrecovery.c
> @@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
> 
>   static void dlm_begin_recovery(struct dlm_ctxt *dlm)
>   {
> -	spin_lock(&dlm->spinlock);
> +	assert_spin_locked(&dlm->spinlock);
>   	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
>   	printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
>   	       dlm->name, dlm->reco.dead_node);
>   	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
> -	spin_unlock(&dlm->spinlock);
>   }
> 
>   static void dlm_end_recovery(struct dlm_ctxt *dlm)
> @@ -456,6 +455,12 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
> 
>   	spin_lock(&dlm->spinlock);
> 
> +	if (dlm->migrate_done) {
> +		mlog(0, "%s: no need do recovery after migrating all lockres\n",
> +				dlm->name);

Don't we need to unlock the spin_lock taken above before returning?

And if we just return here, how can a dlm lock resource clear its 
RECOVERING flag? I suppose this may cause a cluster hang.
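
Something like the following (just an untested sketch of that hunk with 
the unlock added) is what I would expect:

	if (dlm->migrate_done) {
		mlog(0, "%s: no need do recovery after migrating all lockres\n",
		     dlm->name);
		/* drop the lock taken at the top of dlm_do_recovery() */
		spin_unlock(&dlm->spinlock);
		return 0;
	}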

And I cc this to ocfs2 maintainers.

Thanks,
Changwei

> +		return 0;
> +	}
> +
>   	/* check to see if the new master has died */
>   	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
>   	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
> @@ -490,12 +495,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
>   	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
>   	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
>   	     dlm->reco.dead_node);
> -	spin_unlock(&dlm->spinlock);
> 
>   	/* take write barrier */
>   	/* (stops the list reshuffling thread, proxy ast handling) */
>   	dlm_begin_recovery(dlm);
> 
> +	spin_unlock(&dlm->spinlock);
> +
>   	if (dlm->reco.new_master == dlm->node_num)
>   		goto master_here;
> 

* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
  2017-11-01  2:47 ` Changwei Ge
@ 2017-11-01  5:52   ` piaojun
  2017-11-01  7:13     ` Changwei Ge
  0 siblings, 1 reply; 11+ messages in thread
From: piaojun @ 2017-11-01  5:52 UTC (permalink / raw)
  To: ocfs2-devel

Hi Changwei,

On 2017/11/1 10:47, Changwei Ge wrote:
> Hi Jun,
> 
> Thanks for reporting.
> I am very interesting in this issue. But, first of all, I want to make 
> this issue clear, so that I might be able to provide some comments.
> 
> 
> On 2017/11/1 9:16, piaojun wrote:
>> wait for dlm recovery done when migrating all lockres in case of new
>> lockres to be left after leaving dlm domain.
> 
> What do you mean by 'a new lock resource to be left after leaving 
> domain'? It means we leak a dlm lock resource if below situation happens.
> 
A new lockres here means one collected by NodeA while recovering for
NodeB. Yes, it leaks a lockres indeed.
>>
>>        NodeA                       NodeB                NodeC
>>
>> umount and migrate
>> all lockres
>>
>>                                   node down
>>
>> do recovery for NodeB
>> and collect a new lockres
>> form other live nodes
> 
> You mean a lock resource whose owner was NodeB is just migrated from 
> other cluster member nodes?
> 
that is it.
>>
>> leave domain but the
>> new lockres remains
>>
>>                                                    mount and join domain
>>
>>                                                    request for the owner
>>                                                    of the new lockres, but
>>                                                    all the other nodes said
>>                                                    'NO', so NodeC decide to
>>                                                    the owner, and send do
>>                                                    assert msg to other nodes.
>>
>>                                                    other nodes receive the msg
>>                                                    and found two masters exist.
>>                                                    at last cause BUG in
>>                                                    dlm_assert_master_handler()
>>                                                    -->BUG();
> 
> If this issue truly exists, can we take some efforts in 
> dlm_exit_domain_handler? Or perhaps we should kick dlm's work queue 
> before migrating all lock resources.
> 
If NodeA has already entered dlm_leave_domain(), we can hardly go back
and migrate the lockres. Perhaps more work would be needed that way.
>>
>> Fixes: bc9838c4d44a ("dlm: allow dlm do recovery during shutdown")
>>
>> Signed-off-by: Jun Piao <piaojun@huawei.com>
>> Reviewed-by: Alex Chen <alex.chen@huawei.com>
>> Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
>> ---
>>   fs/ocfs2/dlm/dlmcommon.h   |  1 +
>>   fs/ocfs2/dlm/dlmdomain.c   | 14 ++++++++++++++
>>   fs/ocfs2/dlm/dlmrecovery.c | 12 +++++++++---
>>   3 files changed, 24 insertions(+), 3 deletions(-)
>>
>> diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
>> index e9f3705..999ab7d 100644
>> --- a/fs/ocfs2/dlm/dlmcommon.h
>> +++ b/fs/ocfs2/dlm/dlmcommon.h
>> @@ -140,6 +140,7 @@ struct dlm_ctxt
>>   	u8 node_num;
>>   	u32 key;
>>   	u8  joining_node;
>> +	u8 migrate_done; /* set to 1 means node has migrated all lockres */
>>   	wait_queue_head_t dlm_join_events;
>>   	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
>>   	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
>> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
>> index e1fea14..98a8f56 100644
>> --- a/fs/ocfs2/dlm/dlmdomain.c
>> +++ b/fs/ocfs2/dlm/dlmdomain.c
>> @@ -461,6 +461,18 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
>>   		cond_resched_lock(&dlm->spinlock);
>>   		num += n;
>>   	}
>> +
>> +	if (!num) {
>> +		if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
>> +			mlog(0, "%s: perhaps there are more lock resources need to "
>> +					"be migrated after dlm recovery\n", dlm->name);
> 
> If dlm is mark with DLM_RECO_STATE_ACTIVE, then a lock resource must 
> already be marked with DLM_LOCK_RES_RECOVERING which can't be migrated. 
> So code will goto redo_bucket in function dlm_migrate_all_locks.
> So I don't understand why a judgement is added here?
> 
> 
> 
Because we have already finished migrating before recovery starts, the
check here is to avoid the subsequent potential recovery.
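
If I read dlm_unregister_domain() right, a non-zero return from
dlm_migrate_all_locks() just makes the caller retry the migration once
recovery has made progress, roughly (paraphrased sketch, not the exact
upstream code):

	/* caller side in dlm_unregister_domain(): a non-zero return
	 * (e.g. the -EAGAIN above) means "not done yet", so sleep a
	 * bit and redo the migration pass */
	while (dlm_migrate_all_locks(dlm))
		msleep(500);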
>> +			ret = -EAGAIN;
>> +		} else {
>> +			mlog(0, "%s: we won't do dlm recovery after migrating all lockres",
>> +					dlm->name);
>> +			dlm->migrate_done = 1;
>> +		}
>> +	}
>>   	spin_unlock(&dlm->spinlock);
>>   	wake_up(&dlm->dlm_thread_wq);
>>
>> @@ -2052,6 +2064,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
>>   	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
>>   	init_waitqueue_head(&dlm->dlm_join_events);
>>
>> +	dlm->migrate_done = 0;
>> +
>>   	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
>>   	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
>>
>> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>> index 74407c6..3106332 100644
>> --- a/fs/ocfs2/dlm/dlmrecovery.c
>> +++ b/fs/ocfs2/dlm/dlmrecovery.c
>> @@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
>>
>>   static void dlm_begin_recovery(struct dlm_ctxt *dlm)
>>   {
>> -	spin_lock(&dlm->spinlock);
>> +	assert_spin_locked(&dlm->spinlock);
>>   	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
>>   	printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
>>   	       dlm->name, dlm->reco.dead_node);
>>   	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
>> -	spin_unlock(&dlm->spinlock);
>>   }
>>
>>   static void dlm_end_recovery(struct dlm_ctxt *dlm)
>> @@ -456,6 +455,12 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
>>
>>   	spin_lock(&dlm->spinlock);
>>
>> +	if (dlm->migrate_done) {
>> +		mlog(0, "%s: no need do recovery after migrating all lockres\n",
>> +				dlm->name);
> 
> Don't we need unlock above spin_lock before return?
> 
> And if we just return here, how dlm lock resource can clear its 
> REDISCOVERING flag. I suppose this may cause cluster hang.
> 
> And I cc this to ocfs2 maintainers.
> 
> Thanks,
> Changwei
> 
Oh, good catch, I missed the spin_unlock(&dlm->spinlock);
>> +		return 0;
>> +	}
>> +
>>   	/* check to see if the new master has died */
>>   	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
>>   	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
>> @@ -490,12 +495,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
>>   	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
>>   	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
>>   	     dlm->reco.dead_node);
>> -	spin_unlock(&dlm->spinlock);
>>
>>   	/* take write barrier */
>>   	/* (stops the list reshuffling thread, proxy ast handling) */
>>   	dlm_begin_recovery(dlm);
>>
>> +	spin_unlock(&dlm->spinlock);
>> +
>>   	if (dlm->reco.new_master == dlm->node_num)
>>   		goto master_here;
>>
> 
> .
> 

* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
  2017-11-01  5:52   ` piaojun
@ 2017-11-01  7:13     ` Changwei Ge
  2017-11-01  7:56       ` piaojun
  0 siblings, 1 reply; 11+ messages in thread
From: Changwei Ge @ 2017-11-01  7:13 UTC (permalink / raw)
  To: ocfs2-devel

Hi Jun,

I think I get your point now.

You mean that dlm finds no lock resource to migrate and no more lock 
resources are managed by its hash table. After that, a node dies all of a 
sudden and the dead node is put into dlm's recovery map, right? 
Furthermore, a lock resource is then migrated from other nodes and the 
local node has already asserted master to them.

If so, I want to suggest an easier way to solve it.
We don't have to add a new flag to the dlm structure; we can just leverage 
the existing dlm status and bitmap.
It also brings a bonus - no lock resource will be marked RECOVERING, which 
is a safer way, I suppose.

Please take a review.

Thanks,
Changwei


Subject: [PATCH] ocfs2/dlm: a node can't be involved in recovery if it 
is being shutdown

Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
---
  fs/ocfs2/dlm/dlmdomain.c   | 4 ++++
  fs/ocfs2/dlm/dlmrecovery.c | 3 +++
  2 files changed, 7 insertions(+)

diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index a2b19fbdcf46..5e9283e509a4 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -707,11 +707,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
  		 * want new domain joins to communicate with us at
  		 * least until we've completed migration of our
  		 * resources. */
+		spin_lock(&dlm->spinlock);
  		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
+		spin_unlock(&dlm->spinlock);
  		leave = 1;
  	}
  	spin_unlock(&dlm_domain_lock);

+	dlm_wait_for_recovery(dlm);
+
  	if (leave) {
  		mlog(0, "shutting down domain %s\n", dlm->name);
  		dlm_begin_exit_domain(dlm);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 74407c6dd592..764c95b2b35c 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2441,6 +2441,9 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
  {
  	assert_spin_locked(&dlm->spinlock);

+	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
+		return;
+
  	if (dlm->reco.new_master == idx) {
  		mlog(0, "%s: recovery master %d just died\n",
  		     dlm->name, idx);
-- 
2.11.0


On 2017/11/1 13:53, piaojun wrote:
> Hi Changwei,
> 
> On 2017/11/1 10:47, Changwei Ge wrote:
>> Hi Jun,
>>
>> Thanks for reporting.
>> I am very interesting in this issue. But, first of all, I want to make
>> this issue clear, so that I might be able to provide some comments.
>>
>>
>> On 2017/11/1 9:16, piaojun wrote:
>>> wait for dlm recovery done when migrating all lockres in case of new
>>> lockres to be left after leaving dlm domain.
>>
>> What do you mean by 'a new lock resource to be left after leaving
>> domain'? It means we leak a dlm lock resource if below situation happens.
>>
> a new lockres is the one collected by NodeA during recoverying for
> NodeB. It leaks a lockres indeed.
>>>
>>>         NodeA                       NodeB                NodeC
>>>
>>> umount and migrate
>>> all lockres
>>>
>>>                                    node down
>>>
>>> do recovery for NodeB
>>> and collect a new lockres
>>> form other live nodes
>>
>> You mean a lock resource whose owner was NodeB is just migrated from
>> other cluster member nodes?
>>
> that is it.
>>>
>>> leave domain but the
>>> new lockres remains
>>>
>>>                                                     mount and join domain
>>>
>>>                                                     request for the owner
>>>                                                     of the new lockres, but
>>>                                                     all the other nodes said
>>>                                                     'NO', so NodeC decide to
>>>                                                     the owner, and send do
>>>                                                     assert msg to other nodes.
>>>
>>>                                                     other nodes receive the msg
>>>                                                     and found two masters exist.
>>>                                                     at last cause BUG in
>>>                                                     dlm_assert_master_handler()
>>>                                                     -->BUG();
>>
>> If this issue truly exists, can we take some efforts in
>> dlm_exit_domain_handler? Or perhaps we should kick dlm's work queue
>> before migrating all lock resources.
>>
> If NodeA has entered dlm_leave_domain(), we can hardly go back
> migrating res. Perhaps more work will be needed in that way.
>>>
>>> Fixes: bc9838c4d44a ("dlm: allow dlm do recovery during shutdown")
>>>
>>> Signed-off-by: Jun Piao <piaojun@huawei.com>
>>> Reviewed-by: Alex Chen <alex.chen@huawei.com>
>>> Reviewed-by: Yiwen Jiang <jiangyiwen@huawei.com>
>>> ---
>>>    fs/ocfs2/dlm/dlmcommon.h   |  1 +
>>>    fs/ocfs2/dlm/dlmdomain.c   | 14 ++++++++++++++
>>>    fs/ocfs2/dlm/dlmrecovery.c | 12 +++++++++---
>>>    3 files changed, 24 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
>>> index e9f3705..999ab7d 100644
>>> --- a/fs/ocfs2/dlm/dlmcommon.h
>>> +++ b/fs/ocfs2/dlm/dlmcommon.h
>>> @@ -140,6 +140,7 @@ struct dlm_ctxt
>>>    	u8 node_num;
>>>    	u32 key;
>>>    	u8  joining_node;
>>> +	u8 migrate_done; /* set to 1 means node has migrated all lockres */
>>>    	wait_queue_head_t dlm_join_events;
>>>    	unsigned long live_nodes_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
>>>    	unsigned long domain_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
>>> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
>>> index e1fea14..98a8f56 100644
>>> --- a/fs/ocfs2/dlm/dlmdomain.c
>>> +++ b/fs/ocfs2/dlm/dlmdomain.c
>>> @@ -461,6 +461,18 @@ static int dlm_migrate_all_locks(struct dlm_ctxt *dlm)
>>>    		cond_resched_lock(&dlm->spinlock);
>>>    		num += n;
>>>    	}
>>> +
>>> +	if (!num) {
>>> +		if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
>>> +			mlog(0, "%s: perhaps there are more lock resources need to "
>>> +					"be migrated after dlm recovery\n", dlm->name);
>>
>> If dlm is mark with DLM_RECO_STATE_ACTIVE, then a lock resource must
>> already be marked with DLM_LOCK_RES_RECOVERING which can't be migrated.
>> So code will goto redo_bucket in function dlm_migrate_all_locks.
>> So I don't understand why a judgement is added here?
>>
>>
>>
> because we have done migrating before recoverying. the judgement here
> is to avoid the following potential recoverying.
>>> +			ret = -EAGAIN;
>>> +		} else {
>>> +			mlog(0, "%s: we won't do dlm recovery after migrating all lockres",
>>> +					dlm->name);
>>> +			dlm->migrate_done = 1;
>>> +		}
>>> +	}
>>>    	spin_unlock(&dlm->spinlock);
>>>    	wake_up(&dlm->dlm_thread_wq);
>>>
>>> @@ -2052,6 +2064,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
>>>    	dlm->joining_node = DLM_LOCK_RES_OWNER_UNKNOWN;
>>>    	init_waitqueue_head(&dlm->dlm_join_events);
>>>
>>> +	dlm->migrate_done = 0;
>>> +
>>>    	dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
>>>    	dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
>>>
>>> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>>> index 74407c6..3106332 100644
>>> --- a/fs/ocfs2/dlm/dlmrecovery.c
>>> +++ b/fs/ocfs2/dlm/dlmrecovery.c
>>> @@ -423,12 +423,11 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
>>>
>>>    static void dlm_begin_recovery(struct dlm_ctxt *dlm)
>>>    {
>>> -	spin_lock(&dlm->spinlock);
>>> +	assert_spin_locked(&dlm->spinlock);
>>>    	BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
>>>    	printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
>>>    	       dlm->name, dlm->reco.dead_node);
>>>    	dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
>>> -	spin_unlock(&dlm->spinlock);
>>>    }
>>>
>>>    static void dlm_end_recovery(struct dlm_ctxt *dlm)
>>> @@ -456,6 +455,12 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
>>>
>>>    	spin_lock(&dlm->spinlock);
>>>
>>> +	if (dlm->migrate_done) {
>>> +		mlog(0, "%s: no need do recovery after migrating all lockres\n",
>>> +				dlm->name);
>>
>> Don't we need unlock above spin_lock before return?
>>
>> And if we just return here, how dlm lock resource can clear its
>> REDISCOVERING flag. I suppose this may cause cluster hang.
>>
>> And I cc this to ocfs2 maintainers.
>>
>> Thanks,
>> Changwei
>>
> oh, good catch, I missed spin_unlock(&dlm->spinlock);
>>> +		return 0;
>>> +	}
>>> +
>>>    	/* check to see if the new master has died */
>>>    	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM &&
>>>    	    test_bit(dlm->reco.new_master, dlm->recovery_map)) {
>>> @@ -490,12 +495,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
>>>    	mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
>>>    	     dlm->name, task_pid_nr(dlm->dlm_reco_thread_task),
>>>    	     dlm->reco.dead_node);
>>> -	spin_unlock(&dlm->spinlock);
>>>
>>>    	/* take write barrier */
>>>    	/* (stops the list reshuffling thread, proxy ast handling) */
>>>    	dlm_begin_recovery(dlm);
>>>
>>> +	spin_unlock(&dlm->spinlock);
>>> +
>>>    	if (dlm->reco.new_master == dlm->node_num)
>>>    		goto master_here;
>>>
>>
>> .
>>
> 

* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
  2017-11-01  7:13     ` Changwei Ge
@ 2017-11-01  7:56       ` piaojun
  2017-11-01  8:11         ` Changwei Ge
  0 siblings, 1 reply; 11+ messages in thread
From: piaojun @ 2017-11-01  7:56 UTC (permalink / raw)
  To: ocfs2-devel

Hi Changwei,

Thanks for reviewing. I think waiting for recovery to finish before
migrating seems like another solution, but I wonder whether it would
introduce new problems, as the following comments explain.

thanks,
Jun

On 2017/11/1 15:13, Changwei Ge wrote:
> Hi Jun,
> 
> I probably get your point.
> 
> You mean that dlm finds no lock resource to be migrated and no more lock 
> resource is managed by its hash table. After that a node dies all of a 
> sudden and the dead node is put into dlm's recovery map, right? 
that is it.
> Furthermore, a lock resource is migrated from other nodes and local node 
> has already asserted master to them.
> 
> If so, I want to suggest a easier way to solve it.
> We don't have to add a new flag to dlm structure, just leverage existed 
> dlm status and bitmap.
> It will bring a bonus - no lock resource will be marked with RECOVERING, 
> it's a safer way, I suppose.
> 
> Please take a review.
> 
> Thanks,
> Changwei
> 
> 
> Subject: [PATCH] ocfs2/dlm: a node can't be involved in recovery if it 
> is being shutdown
> 
> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
> ---
>   fs/ocfs2/dlm/dlmdomain.c   | 4 ++++
>   fs/ocfs2/dlm/dlmrecovery.c | 3 +++
>   2 files changed, 7 insertions(+)
> 
> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
> index a2b19fbdcf46..5e9283e509a4 100644
> --- a/fs/ocfs2/dlm/dlmdomain.c
> +++ b/fs/ocfs2/dlm/dlmdomain.c
> @@ -707,11 +707,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
>   		 * want new domain joins to communicate with us at
>   		 * least until we've completed migration of our
>   		 * resources. */
> +		spin_lock(&dlm->spinlock);
>   		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
> +		spin_unlock(&dlm->spinlock);
I guess this mixes up the use of 'dlm->spinlock' and dlm_domain_lock.
>   		leave = 1;
>   	}
>   	spin_unlock(&dlm_domain_lock);
> 
> +	dlm_wait_for_recovery(dlm);
> +
>   	if (leave) {
>   		mlog(0, "shutting down domain %s\n", dlm->name);
>   		dlm_begin_exit_domain(dlm);
> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
> index 74407c6dd592..764c95b2b35c 100644
> --- a/fs/ocfs2/dlm/dlmrecovery.c
> +++ b/fs/ocfs2/dlm/dlmrecovery.c
> @@ -2441,6 +2441,9 @@ static void __dlm_hb_node_down(struct dlm_ctxt 
> *dlm, int idx)
>   {
>   	assert_spin_locked(&dlm->spinlock);
> 
> +	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
> +		return;
> +
'dlm->dlm_state' probably needs to be protected by 'dlm_domain_lock'.
And I wonder if there is more work to be done when the domain is in
'DLM_CTXT_IN_SHUTDOWN'?
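
Just to illustrate the locking rule that NOTE describes (a hypothetical
helper, not a concrete suggestion for this spot, and the lock ordering
against dlm->spinlock would still need to be sorted out):

	/* hypothetical reader of dlm->dlm_state under dlm_domain_lock */
	static int dlm_in_shutdown(struct dlm_ctxt *dlm)
	{
		int ret;

		spin_lock(&dlm_domain_lock);
		ret = (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN);
		spin_unlock(&dlm_domain_lock);

		return ret;
	}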
>   	if (dlm->reco.new_master == idx) {
>   		mlog(0, "%s: recovery master %d just died\n",
>   		     dlm->name, idx);
> 

* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
  2017-11-01  7:56       ` piaojun
@ 2017-11-01  8:11         ` Changwei Ge
  2017-11-01  8:45           ` piaojun
  0 siblings, 1 reply; 11+ messages in thread
From: Changwei Ge @ 2017-11-01  8:11 UTC (permalink / raw)
  To: ocfs2-devel

Hi Jun,

Thanks for reviewing.
I don't think we have to worry about misusing *dlm_domain_lock* and 
*dlm::spinlock*. I admit my change may look a little different from most 
other code snippets that use these two spinlocks, but our purpose is to 
close the race between __dlm_hb_node_down() and dlm_unregister_domain(), 
right?  And my change meets that. :-)

I suppose we can do it in a flexible way.

Thanks,
Changwei


On 2017/11/1 15:57, piaojun wrote:
> Hi Changwei,
> 
> thanks for reviewing, and I think waiting for recoverying done before
> migrating seems another solution, but I wonder if new problems will be
> invoked as following comments.
> 
> thanks,
> Jun
> 
> On 2017/11/1 15:13, Changwei Ge wrote:
>> Hi Jun,
>>
>> I probably get your point.
>>
>> You mean that dlm finds no lock resource to be migrated and no more lock
>> resource is managed by its hash table. After that a node dies all of a
>> sudden and the dead node is put into dlm's recovery map, right?
> that is it.
>> Furthermore, a lock resource is migrated from other nodes and local node
>> has already asserted master to them.
>>
>> If so, I want to suggest a easier way to solve it.
>> We don't have to add a new flag to dlm structure, just leverage existed
>> dlm status and bitmap.
>> It will bring a bonus - no lock resource will be marked with RECOVERING,
>> it's a safer way, I suppose.
>>
>> Please take a review.
>>
>> Thanks,
>> Changwei
>>
>>
>> Subject: [PATCH] ocfs2/dlm: a node can't be involved in recovery if it
>> is being shutdown
>>
>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>> ---
>>    fs/ocfs2/dlm/dlmdomain.c   | 4 ++++
>>    fs/ocfs2/dlm/dlmrecovery.c | 3 +++
>>    2 files changed, 7 insertions(+)
>>
>> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
>> index a2b19fbdcf46..5e9283e509a4 100644
>> --- a/fs/ocfs2/dlm/dlmdomain.c
>> +++ b/fs/ocfs2/dlm/dlmdomain.c
>> @@ -707,11 +707,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
>>    		 * want new domain joins to communicate with us at
>>    		 * least until we've completed migration of our
>>    		 * resources. */
>> +		spin_lock(&dlm->spinlock);
>>    		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
>> +		spin_unlock(&dlm->spinlock);
> I guess there will be misuse of 'dlm->spinlock' and dlm_domain_lock.
>>    		leave = 1;
>>    	}
>>    	spin_unlock(&dlm_domain_lock);
>>
>> +	dlm_wait_for_recovery(dlm);
>> +
>>    	if (leave) {
>>    		mlog(0, "shutting down domain %s\n", dlm->name);
>>    		dlm_begin_exit_domain(dlm);
>> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>> index 74407c6dd592..764c95b2b35c 100644
>> --- a/fs/ocfs2/dlm/dlmrecovery.c
>> +++ b/fs/ocfs2/dlm/dlmrecovery.c
>> @@ -2441,6 +2441,9 @@ static void __dlm_hb_node_down(struct dlm_ctxt
>> *dlm, int idx)
>>    {
>>    	assert_spin_locked(&dlm->spinlock);
>>
>> +	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
>> +		return;
>> +
> 'dlm->dlm_state' probably need be to protected by 'dlm_domain_lock'.
> and I wander if there is more work to be done when in
> 'DLM_CTXT_IN_SHUTDOWN'?
>>    	if (dlm->reco.new_master == idx) {
>>    		mlog(0, "%s: recovery master %d just died\n",
>>    		     dlm->name, idx);
>>
> 

* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
  2017-11-01  8:11         ` Changwei Ge
@ 2017-11-01  8:45           ` piaojun
  2017-11-01  9:00             ` Changwei Ge
  0 siblings, 1 reply; 11+ messages in thread
From: piaojun @ 2017-11-01  8:45 UTC (permalink / raw)
  To: ocfs2-devel

Hi Changwei,

I do think we need to follow the principle of using 'dlm_domain_lock' to
protect 'dlm_state', as the NOTE in 'dlm_ctxt' says:
/* NOTE: Next three are protected by dlm_domain_lock */

Also, the dead node won't be cleared from 'dlm->domain_map' if we return
early from __dlm_hb_node_down(), and NodeA will retry migrating to NodeB
forever if only NodeA and NodeB are in the domain. I suggest more testing
of your solution.

thanks,
Jun

On 2017/11/1 16:11, Changwei Ge wrote:
> Hi Jun,
> 
> Thanks for reviewing.
> I don't think we have to worry about misusing *dlm_domain_lock* and 
> *dlm::spinlock*. I admit my change may look a little different from most 
> of other code snippets where using these two spin locks. But our purpose 
> is to close the race between __dlm_hb_node_down and 
> dlm_unregister_domain, right?  And my change meets that. :-)
> 
> I suppose we can do it in a flexible way.
> 
> Thanks,
> Changwei
> 
> 
> On 2017/11/1 15:57, piaojun wrote:
>> Hi Changwei,
>>
>> thanks for reviewing, and I think waiting for recoverying done before
>> migrating seems another solution, but I wonder if new problems will be
>> invoked as following comments.
>>
>> thanks,
>> Jun
>>
>> On 2017/11/1 15:13, Changwei Ge wrote:
>>> Hi Jun,
>>>
>>> I probably get your point.
>>>
>>> You mean that dlm finds no lock resource to be migrated and no more lock
>>> resource is managed by its hash table. After that a node dies all of a
>>> sudden and the dead node is put into dlm's recovery map, right?
>> that is it.
>>> Furthermore, a lock resource is migrated from other nodes and local node
>>> has already asserted master to them.
>>>
>>> If so, I want to suggest a easier way to solve it.
>>> We don't have to add a new flag to dlm structure, just leverage existed
>>> dlm status and bitmap.
>>> It will bring a bonus - no lock resource will be marked with RECOVERING,
>>> it's a safer way, I suppose.
>>>
>>> Please take a review.
>>>
>>> Thanks,
>>> Changwei
>>>
>>>
>>> Subject: [PATCH] ocfs2/dlm: a node can't be involved in recovery if it
>>> is being shutdown
>>>
>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>> ---
>>>    fs/ocfs2/dlm/dlmdomain.c   | 4 ++++
>>>    fs/ocfs2/dlm/dlmrecovery.c | 3 +++
>>>    2 files changed, 7 insertions(+)
>>>
>>> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
>>> index a2b19fbdcf46..5e9283e509a4 100644
>>> --- a/fs/ocfs2/dlm/dlmdomain.c
>>> +++ b/fs/ocfs2/dlm/dlmdomain.c
>>> @@ -707,11 +707,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
>>>    		 * want new domain joins to communicate with us at
>>>    		 * least until we've completed migration of our
>>>    		 * resources. */
>>> +		spin_lock(&dlm->spinlock);
>>>    		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
>>> +		spin_unlock(&dlm->spinlock);
>> I guess there will be misuse of 'dlm->spinlock' and dlm_domain_lock.
>>>    		leave = 1;
>>>    	}
>>>    	spin_unlock(&dlm_domain_lock);
>>>
>>> +	dlm_wait_for_recovery(dlm);
>>> +
>>>    	if (leave) {
>>>    		mlog(0, "shutting down domain %s\n", dlm->name);
>>>    		dlm_begin_exit_domain(dlm);
>>> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>>> index 74407c6dd592..764c95b2b35c 100644
>>> --- a/fs/ocfs2/dlm/dlmrecovery.c
>>> +++ b/fs/ocfs2/dlm/dlmrecovery.c
>>> @@ -2441,6 +2441,9 @@ static void __dlm_hb_node_down(struct dlm_ctxt
>>> *dlm, int idx)
>>>    {
>>>    	assert_spin_locked(&dlm->spinlock);
>>>
>>> +	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
>>> +		return;
>>> +
>> 'dlm->dlm_state' probably need be to protected by 'dlm_domain_lock'.
>> and I wander if there is more work to be done when in
>> 'DLM_CTXT_IN_SHUTDOWN'?
>>>    	if (dlm->reco.new_master == idx) {
>>>    		mlog(0, "%s: recovery master %d just died\n",
>>>    		     dlm->name, idx);
>>>
>>
> 
> .
> 

* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
  2017-11-01  8:45           ` piaojun
@ 2017-11-01  9:00             ` Changwei Ge
  2017-11-02  1:42               ` piaojun
  0 siblings, 1 reply; 11+ messages in thread
From: Changwei Ge @ 2017-11-01  9:00 UTC (permalink / raw)
  To: ocfs2-devel

Hi Jun,

On 2017/11/1 16:46, piaojun wrote:
> Hi Changwei,
> 
> I do think we need follow the principle that use 'dlm_domain_lock' to
> protect 'dlm_state' as the NOTE says in 'dlm_ctxt':
> /* NOTE: Next three are protected by dlm_domain_lock */
> 
> deadnode won't be cleared from 'dlm->domain_map' if return from
> __dlm_hb_node_down(), and NodeA will retry migrating to NodeB forever
> if only NodeA and NodeB in domain. I suggest more testing needed in
> your solution.

I agree; however, my patch is just a draft to illustrate my comments.

Perhaps we can figure out a better way to solve this, as your patch 
can't clear the DLM_LOCK_RES_RECOVERING flag on lock resources. I am not 
sure whether that is reasonable; I suppose it may violate the ocfs2/dlm 
design philosophy.

Thanks,
Changwei

> 
> thanks,
> Jun
> 
> On 2017/11/1 16:11, Changwei Ge wrote:
>> Hi Jun,
>>
>> Thanks for reviewing.
>> I don't think we have to worry about misusing *dlm_domain_lock* and
>> *dlm::spinlock*. I admit my change may look a little different from most
>> of other code snippets where using these two spin locks. But our purpose
>> is to close the race between __dlm_hb_node_down and
>> dlm_unregister_domain, right?  And my change meets that. :-)
>>
>> I suppose we can do it in a flexible way.
>>
>> Thanks,
>> Changwei
>>
>>
>> On 2017/11/1 15:57, piaojun wrote:
>>> Hi Changwei,
>>>
>>> thanks for reviewing, and I think waiting for recoverying done before
>>> migrating seems another solution, but I wonder if new problems will be
>>> invoked as following comments.
>>>
>>> thanks,
>>> Jun
>>>
>>> On 2017/11/1 15:13, Changwei Ge wrote:
>>>> Hi Jun,
>>>>
>>>> I probably get your point.
>>>>
>>>> You mean that dlm finds no lock resource to be migrated and no more lock
>>>> resource is managed by its hash table. After that a node dies all of a
>>>> sudden and the dead node is put into dlm's recovery map, right?
>>> that is it.
>>>> Furthermore, a lock resource is migrated from other nodes and local node
>>>> has already asserted master to them.
>>>>
>>>> If so, I want to suggest a easier way to solve it.
>>>> We don't have to add a new flag to dlm structure, just leverage existed
>>>> dlm status and bitmap.
>>>> It will bring a bonus - no lock resource will be marked with RECOVERING,
>>>> it's a safer way, I suppose.
>>>>
>>>> Please take a review.
>>>>
>>>> Thanks,
>>>> Changwei
>>>>
>>>>
>>>> Subject: [PATCH] ocfs2/dlm: a node can't be involved in recovery if it
>>>> is being shutdown
>>>>
>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>> ---
>>>>     fs/ocfs2/dlm/dlmdomain.c   | 4 ++++
>>>>     fs/ocfs2/dlm/dlmrecovery.c | 3 +++
>>>>     2 files changed, 7 insertions(+)
>>>>
>>>> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
>>>> index a2b19fbdcf46..5e9283e509a4 100644
>>>> --- a/fs/ocfs2/dlm/dlmdomain.c
>>>> +++ b/fs/ocfs2/dlm/dlmdomain.c
>>>> @@ -707,11 +707,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
>>>>     		 * want new domain joins to communicate with us at
>>>>     		 * least until we've completed migration of our
>>>>     		 * resources. */
>>>> +		spin_lock(&dlm->spinlock);
>>>>     		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
>>>> +		spin_unlock(&dlm->spinlock);
>>> I guess there will be misuse of 'dlm->spinlock' and dlm_domain_lock.
>>>>     		leave = 1;
>>>>     	}
>>>>     	spin_unlock(&dlm_domain_lock);
>>>>
>>>> +	dlm_wait_for_recovery(dlm);
>>>> +
>>>>     	if (leave) {
>>>>     		mlog(0, "shutting down domain %s\n", dlm->name);
>>>>     		dlm_begin_exit_domain(dlm);
>>>> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>>>> index 74407c6dd592..764c95b2b35c 100644
>>>> --- a/fs/ocfs2/dlm/dlmrecovery.c
>>>> +++ b/fs/ocfs2/dlm/dlmrecovery.c
>>>> @@ -2441,6 +2441,9 @@ static void __dlm_hb_node_down(struct dlm_ctxt
>>>> *dlm, int idx)
>>>>     {
>>>>     	assert_spin_locked(&dlm->spinlock);
>>>>
>>>> +	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
>>>> +		return;
>>>> +
>>> 'dlm->dlm_state' probably need be to protected by 'dlm_domain_lock'.
>>> and I wander if there is more work to be done when in
>>> 'DLM_CTXT_IN_SHUTDOWN'?
>>>>     	if (dlm->reco.new_master == idx) {
>>>>     		mlog(0, "%s: recovery master %d just died\n",
>>>>     		     dlm->name, idx);
>>>>
>>>
>>
>> .
>>
> 

* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
  2017-11-01  9:00             ` Changwei Ge
@ 2017-11-02  1:42               ` piaojun
  2017-11-02  1:56                 ` Changwei Ge
  0 siblings, 1 reply; 11+ messages in thread
From: piaojun @ 2017-11-02  1:42 UTC (permalink / raw)
  To: ocfs2-devel

Hi Changwei,

I had tried a solution like yours before, but failed to prevent the
race with just 'dlm_state' and the existing variables, as
'DLM_CTXT_IN_SHUTDOWN' covers many states. So I think we need to
introduce 'migrate_done' to solve that problem.

thanks,
Jun

On 2017/11/1 17:00, Changwei Ge wrote:
> Hi Jun,
> 
> On 2017/11/1 16:46, piaojun wrote:
>> Hi Changwei,
>>
>> I do think we need follow the principle that use 'dlm_domain_lock' to
>> protect 'dlm_state' as the NOTE says in 'dlm_ctxt':
>> /* NOTE: Next three are protected by dlm_domain_lock */
>>
>> deadnode won't be cleared from 'dlm->domain_map' if return from
>> __dlm_hb_node_down(), and NodeA will retry migrating to NodeB forever
>> if only NodeA and NodeB in domain. I suggest more testing needed in
>> your solution.
> 
> I agree, however, my patch is just a draft to indicate my comments.
> 
> Perhaps we can figure out a better way to solve this, as your patch 
> can't clear DLM RECOVERING flag on lock resource. I am not sure if it is 
> reasonable, I suppose this may violate ocfs2/dlm design philosophy.
> 
> Thanks,
> Changwei
> 

If a res is marked DLM_LOCK_RES_RECOVERING, the migrating process will
wait for recovery to finish, and the flag will be cleared after recovery.

>>
>> thanks,
>> Jun
>>
>> On 2017/11/1 16:11, Changwei Ge wrote:
>>> Hi Jun,
>>>
>>> Thanks for reviewing.
>>> I don't think we have to worry about misusing *dlm_domain_lock* and
>>> *dlm::spinlock*. I admit my change may look a little different from most
>>> of other code snippets where using these two spin locks. But our purpose
>>> is to close the race between __dlm_hb_node_down and
>>> dlm_unregister_domain, right?  And my change meets that. :-)
>>>
>>> I suppose we can do it in a flexible way.
>>>
>>> Thanks,
>>> Changwei
>>>
>>>
>>> On 2017/11/1 15:57, piaojun wrote:
>>>> Hi Changwei,
>>>>
>>>> thanks for reviewing, and I think waiting for recoverying done before
>>>> migrating seems another solution, but I wonder if new problems will be
>>>> invoked as following comments.
>>>>
>>>> thanks,
>>>> Jun
>>>>
>>>> On 2017/11/1 15:13, Changwei Ge wrote:
>>>>> Hi Jun,
>>>>>
>>>>> I probably get your point.
>>>>>
>>>>> You mean that dlm finds no lock resource to be migrated and no more lock
>>>>> resource is managed by its hash table. After that a node dies all of a
>>>>> sudden and the dead node is put into dlm's recovery map, right?
>>>> that is it.
>>>>> Furthermore, a lock resource is migrated from other nodes and local node
>>>>> has already asserted master to them.
>>>>>
>>>>> If so, I want to suggest a easier way to solve it.
>>>>> We don't have to add a new flag to dlm structure, just leverage existed
>>>>> dlm status and bitmap.
>>>>> It will bring a bonus - no lock resource will be marked with RECOVERING,
>>>>> it's a safer way, I suppose.
>>>>>
>>>>> Please take a review.
>>>>>
>>>>> Thanks,
>>>>> Changwei
>>>>>
>>>>>
>>>>> Subject: [PATCH] ocfs2/dlm: a node can't be involved in recovery if it
>>>>> is being shutdown
>>>>>
>>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>>> ---
>>>>>     fs/ocfs2/dlm/dlmdomain.c   | 4 ++++
>>>>>     fs/ocfs2/dlm/dlmrecovery.c | 3 +++
>>>>>     2 files changed, 7 insertions(+)
>>>>>
>>>>> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
>>>>> index a2b19fbdcf46..5e9283e509a4 100644
>>>>> --- a/fs/ocfs2/dlm/dlmdomain.c
>>>>> +++ b/fs/ocfs2/dlm/dlmdomain.c
>>>>> @@ -707,11 +707,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
>>>>>     		 * want new domain joins to communicate with us at
>>>>>     		 * least until we've completed migration of our
>>>>>     		 * resources. */
>>>>> +		spin_lock(&dlm->spinlock);
>>>>>     		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
>>>>> +		spin_unlock(&dlm->spinlock);
>>>> I guess there will be misuse of 'dlm->spinlock' and dlm_domain_lock.
>>>>>     		leave = 1;
>>>>>     	}
>>>>>     	spin_unlock(&dlm_domain_lock);
>>>>>
>>>>> +	dlm_wait_for_recovery(dlm);
>>>>> +
>>>>>     	if (leave) {
>>>>>     		mlog(0, "shutting down domain %s\n", dlm->name);
>>>>>     		dlm_begin_exit_domain(dlm);
>>>>> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>>>>> index 74407c6dd592..764c95b2b35c 100644
>>>>> --- a/fs/ocfs2/dlm/dlmrecovery.c
>>>>> +++ b/fs/ocfs2/dlm/dlmrecovery.c
>>>>> @@ -2441,6 +2441,9 @@ static void __dlm_hb_node_down(struct dlm_ctxt
>>>>> *dlm, int idx)
>>>>>     {
>>>>>     	assert_spin_locked(&dlm->spinlock);
>>>>>
>>>>> +	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
>>>>> +		return;
>>>>> +
>>>> 'dlm->dlm_state' probably need be to protected by 'dlm_domain_lock'.
>>>> and I wander if there is more work to be done when in
>>>> 'DLM_CTXT_IN_SHUTDOWN'?
>>>>>     	if (dlm->reco.new_master == idx) {
>>>>>     		mlog(0, "%s: recovery master %d just died\n",
>>>>>     		     dlm->name, idx);
>>>>>
>>>>
>>>
>>> .
>>>
>>
> 
> .
> 

* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
  2017-11-02  1:42               ` piaojun
@ 2017-11-02  1:56                 ` Changwei Ge
  2017-11-03  1:01                   ` piaojun
  0 siblings, 1 reply; 11+ messages in thread
From: Changwei Ge @ 2017-11-02  1:56 UTC (permalink / raw)
  To: ocfs2-devel

On 2017/11/2 9:44, piaojun wrote:
> Hi Changwei,
> 
> I had tried a solution like yours before, but failed to prevent the
> race just by 'dlm_state' and the existed variable as
> 'DLM_CTXT_IN_SHUTDOWN' contains many status. so I think we need
> introduce a 'migrate_done' to solve that problem.
Hi Jun,

Yes, adding a new flag might be a direction, but I still think we need 
to clear the DLM_LOCK_RES_RECOVERING flag on other nodes' lock resources, 
which depends on NodeA's dlm recovery progress. Unfortunately, that 
progress is cut short by the newly added ::migrate_done flag in your 
patch. :-(

So no DLM_FINALIZE_RECO_MSG message will be sent out to the other nodes, 
and thus DLM_LOCK_RES_RECOVERING can't be cleared.

As I know, if DLM_LOCK_RES_RECOVERING is set, all lock and unlock 
requests will *hang*.

Thanks,
Changwei

> 
> thanks,
> Jun
> 
> On 2017/11/1 17:00, Changwei Ge wrote:
>> Hi Jun,
>>
>> On 2017/11/1 16:46, piaojun wrote:
>>> Hi Changwei,
>>>
>>> I do think we need follow the principle that use 'dlm_domain_lock' to
>>> protect 'dlm_state' as the NOTE says in 'dlm_ctxt':
>>> /* NOTE: Next three are protected by dlm_domain_lock */
>>>
>>> deadnode won't be cleared from 'dlm->domain_map' if return from
>>> __dlm_hb_node_down(), and NodeA will retry migrating to NodeB forever
>>> if only NodeA and NodeB in domain. I suggest more testing needed in
>>> your solution.
>>
>> I agree, however, my patch is just a draft to indicate my comments.
>>
>> Perhaps we can figure out a better way to solve this, as your patch
>> can't clear DLM RECOVERING flag on lock resource. I am not sure if it is
>> reasonable, I suppose this may violate ocfs2/dlm design philosophy.
>>
>> Thanks,
>> Changwei
>>
> 
> if res is marked as DLM RECOVERING, migrating process will wait for
> recoverying done. and DLM RECOVERING will be cleared after recoverying.
> 
>>>
>>> thanks,
>>> Jun
>>>
>>> On 2017/11/1 16:11, Changwei Ge wrote:
>>>> Hi Jun,
>>>>
>>>> Thanks for reviewing.
>>>> I don't think we have to worry about misusing *dlm_domain_lock* and
>>>> *dlm::spinlock*. I admit my change may look a little different from most
>>>> of other code snippets where using these two spin locks. But our purpose
>>>> is to close the race between __dlm_hb_node_down and
>>>> dlm_unregister_domain, right?  And my change meets that. :-)
>>>>
>>>> I suppose we can do it in a flexible way.
>>>>
>>>> Thanks,
>>>> Changwei
>>>>
>>>>
>>>> On 2017/11/1 15:57, piaojun wrote:
>>>>> Hi Changwei,
>>>>>
>>>>> thanks for reviewing, and I think waiting for recoverying done before
>>>>> migrating seems another solution, but I wonder if new problems will be
>>>>> invoked as following comments.
>>>>>
>>>>> thanks,
>>>>> Jun
>>>>>
>>>>> On 2017/11/1 15:13, Changwei Ge wrote:
>>>>>> Hi Jun,
>>>>>>
>>>>>> I probably get your point.
>>>>>>
>>>>>> You mean that dlm finds no lock resource to be migrated and no more lock
>>>>>> resource is managed by its hash table. After that a node dies all of a
>>>>>> sudden and the dead node is put into dlm's recovery map, right?
>>>>> that is it.
>>>>>> Furthermore, a lock resource is migrated from other nodes and local node
>>>>>> has already asserted master to them.
>>>>>>
>>>>>> If so, I want to suggest a easier way to solve it.
>>>>>> We don't have to add a new flag to dlm structure, just leverage existed
>>>>>> dlm status and bitmap.
>>>>>> It will bring a bonus - no lock resource will be marked with RECOVERING,
>>>>>> it's a safer way, I suppose.
>>>>>>
>>>>>> Please take a review.
>>>>>>
>>>>>> Thanks,
>>>>>> Changwei
>>>>>>
>>>>>>
>>>>>> Subject: [PATCH] ocfs2/dlm: a node can't be involved in recovery if it
>>>>>> is being shutdown
>>>>>>
>>>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>>>> ---
>>>>>>      fs/ocfs2/dlm/dlmdomain.c   | 4 ++++
>>>>>>      fs/ocfs2/dlm/dlmrecovery.c | 3 +++
>>>>>>      2 files changed, 7 insertions(+)
>>>>>>
>>>>>> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
>>>>>> index a2b19fbdcf46..5e9283e509a4 100644
>>>>>> --- a/fs/ocfs2/dlm/dlmdomain.c
>>>>>> +++ b/fs/ocfs2/dlm/dlmdomain.c
>>>>>> @@ -707,11 +707,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
>>>>>>      		 * want new domain joins to communicate with us at
>>>>>>      		 * least until we've completed migration of our
>>>>>>      		 * resources. */
>>>>>> +		spin_lock(&dlm->spinlock);
>>>>>>      		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
>>>>>> +		spin_unlock(&dlm->spinlock);
>>>>> I guess there will be misuse of 'dlm->spinlock' and dlm_domain_lock.
>>>>>>      		leave = 1;
>>>>>>      	}
>>>>>>      	spin_unlock(&dlm_domain_lock);
>>>>>>
>>>>>> +	dlm_wait_for_recovery(dlm);
>>>>>> +
>>>>>>      	if (leave) {
>>>>>>      		mlog(0, "shutting down domain %s\n", dlm->name);
>>>>>>      		dlm_begin_exit_domain(dlm);
>>>>>> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>>>>>> index 74407c6dd592..764c95b2b35c 100644
>>>>>> --- a/fs/ocfs2/dlm/dlmrecovery.c
>>>>>> +++ b/fs/ocfs2/dlm/dlmrecovery.c
>>>>>> @@ -2441,6 +2441,9 @@ static void __dlm_hb_node_down(struct dlm_ctxt
>>>>>> *dlm, int idx)
>>>>>>      {
>>>>>>      	assert_spin_locked(&dlm->spinlock);
>>>>>>
>>>>>> +	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
>>>>>> +		return;
>>>>>> +
>>>>> 'dlm->dlm_state' probably need be to protected by 'dlm_domain_lock'.
>>>>> and I wander if there is more work to be done when in
>>>>> 'DLM_CTXT_IN_SHUTDOWN'?
>>>>>>      	if (dlm->reco.new_master == idx) {
>>>>>>      		mlog(0, "%s: recovery master %d just died\n",
>>>>>>      		     dlm->name, idx);
>>>>>>
>>>>>
>>>>
>>>> .
>>>>
>>>
>>
>> .
>>
> 

* [Ocfs2-devel] [PATCH] ocfs2/dlm: wait for dlm recovery done when migrating all lockres
  2017-11-02  1:56                 ` Changwei Ge
@ 2017-11-03  1:01                   ` piaojun
  0 siblings, 0 replies; 11+ messages in thread
From: piaojun @ 2017-11-03  1:01 UTC (permalink / raw)
  To: ocfs2-devel

Hi Changwei,

Please see what my last mail said:

"
If a res is marked DLM_LOCK_RES_RECOVERING, the migrating process will
wait for recovery to finish, and the flag will be cleared after recovery.
"

And if there are still lockres, 'migrate_done' won't be set. Moreover, if
another node dies after migration, I just let another node do the
recovery.

thanks,
Jun

On 2017/11/2 9:56, Changwei Ge wrote:
> On 2017/11/2 9:44, piaojun wrote:
>> Hi Changwei,
>>
>> I had tried a solution like yours before, but failed to prevent the
>> race just by 'dlm_state' and the existed variable as
>> 'DLM_CTXT_IN_SHUTDOWN' contains many status. so I think we need
>> introduce a 'migrate_done' to solve that problem.
> Hi Jun,
> 
> Yes, adding a new flag might be a direction, but I still think we need 
> to clear other nodes' lock resources' flag - DLM_LOCK_RES_RECOVERING, 
> which depends on NodeA's dlm recovering progress. Unfortunately, it is 
> interrupted by the newly added flag ::migrate_done in your patch. :-(
> 
> So no DLM_FINALIZE_RECO_MSG message will be sent out to other nodes, 
> thus DLM_LOCK_RES_RECOVERING can't be cleared.
> 
> As I know, if DLM_LOCK_RES_RECOVERING is set, all lock and unlock 
> requests will be *hang*.
> 
> Thanks,
> Changwei
> 
>>
>> thanks,
>> Jun
>>
>> On 2017/11/1 17:00, Changwei Ge wrote:
>>> Hi Jun,
>>>
>>> On 2017/11/1 16:46, piaojun wrote:
>>>> Hi Changwei,
>>>>
>>>> I do think we need follow the principle that use 'dlm_domain_lock' to
>>>> protect 'dlm_state' as the NOTE says in 'dlm_ctxt':
>>>> /* NOTE: Next three are protected by dlm_domain_lock */
>>>>
>>>> deadnode won't be cleared from 'dlm->domain_map' if return from
>>>> __dlm_hb_node_down(), and NodeA will retry migrating to NodeB forever
>>>> if only NodeA and NodeB in domain. I suggest more testing needed in
>>>> your solution.
>>>
>>> I agree, however, my patch is just a draft to indicate my comments.
>>>
>>> Perhaps we can figure out a better way to solve this, as your patch
>>> can't clear DLM RECOVERING flag on lock resource. I am not sure if it is
>>> reasonable, I suppose this may violate ocfs2/dlm design philosophy.
>>>
>>> Thanks,
>>> Changwei
>>>
>>
>> if res is marked as DLM RECOVERING, migrating process will wait for
>> recoverying done. and DLM RECOVERING will be cleared after recoverying.
>>
>>>>
>>>> thanks,
>>>> Jun
>>>>
>>>> On 2017/11/1 16:11, Changwei Ge wrote:
>>>>> Hi Jun,
>>>>>
>>>>> Thanks for reviewing.
>>>>> I don't think we have to worry about misusing *dlm_domain_lock* and
>>>>> *dlm::spinlock*. I admit my change may look a little different from most
>>>>> of other code snippets where using these two spin locks. But our purpose
>>>>> is to close the race between __dlm_hb_node_down and
>>>>> dlm_unregister_domain, right?  And my change meets that. :-)
>>>>>
>>>>> I suppose we can do it in a flexible way.
>>>>>
>>>>> Thanks,
>>>>> Changwei
>>>>>
>>>>>
>>>>> On 2017/11/1 15:57, piaojun wrote:
>>>>>> Hi Changwei,
>>>>>>
>>>>>> thanks for reviewing, and I think waiting for recoverying done before
>>>>>> migrating seems another solution, but I wonder if new problems will be
>>>>>> invoked as following comments.
>>>>>>
>>>>>> thanks,
>>>>>> Jun
>>>>>>
>>>>>> On 2017/11/1 15:13, Changwei Ge wrote:
>>>>>>> Hi Jun,
>>>>>>>
>>>>>>> I probably get your point.
>>>>>>>
>>>>>>> You mean that dlm finds no lock resource to be migrated and no more lock
>>>>>>> resource is managed by its hash table. After that a node dies all of a
>>>>>>> sudden and the dead node is put into dlm's recovery map, right?
>>>>>> that is it.
>>>>>>> Furthermore, a lock resource is migrated from other nodes and local node
>>>>>>> has already asserted master to them.
>>>>>>>
>>>>>>> If so, I want to suggest a easier way to solve it.
>>>>>>> We don't have to add a new flag to dlm structure, just leverage existed
>>>>>>> dlm status and bitmap.
>>>>>>> It will bring a bonus - no lock resource will be marked with RECOVERING,
>>>>>>> it's a safer way, I suppose.
>>>>>>>
>>>>>>> Please take a review.
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Changwei
>>>>>>>
>>>>>>>
>>>>>>> Subject: [PATCH] ocfs2/dlm: a node can't be involved in recovery if it
>>>>>>> is being shutdown
>>>>>>>
>>>>>>> Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
>>>>>>> ---
>>>>>>>      fs/ocfs2/dlm/dlmdomain.c   | 4 ++++
>>>>>>>      fs/ocfs2/dlm/dlmrecovery.c | 3 +++
>>>>>>>      2 files changed, 7 insertions(+)
>>>>>>>
>>>>>>> diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
>>>>>>> index a2b19fbdcf46..5e9283e509a4 100644
>>>>>>> --- a/fs/ocfs2/dlm/dlmdomain.c
>>>>>>> +++ b/fs/ocfs2/dlm/dlmdomain.c
>>>>>>> @@ -707,11 +707,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
>>>>>>>      		 * want new domain joins to communicate with us at
>>>>>>>      		 * least until we've completed migration of our
>>>>>>>      		 * resources. */
>>>>>>> +		spin_lock(&dlm->spinlock);
>>>>>>>      		dlm->dlm_state = DLM_CTXT_IN_SHUTDOWN;
>>>>>>> +		spin_unlock(&dlm->spinlock);
>>>>>> I guess there will be misuse of 'dlm->spinlock' and dlm_domain_lock.
>>>>>>>      		leave = 1;
>>>>>>>      	}
>>>>>>>      	spin_unlock(&dlm_domain_lock);
>>>>>>>
>>>>>>> +	dlm_wait_for_recovery(dlm);
>>>>>>> +
>>>>>>>      	if (leave) {
>>>>>>>      		mlog(0, "shutting down domain %s\n", dlm->name);
>>>>>>>      		dlm_begin_exit_domain(dlm);
>>>>>>> diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
>>>>>>> index 74407c6dd592..764c95b2b35c 100644
>>>>>>> --- a/fs/ocfs2/dlm/dlmrecovery.c
>>>>>>> +++ b/fs/ocfs2/dlm/dlmrecovery.c
>>>>>>> @@ -2441,6 +2441,9 @@ static void __dlm_hb_node_down(struct dlm_ctxt
>>>>>>> *dlm, int idx)
>>>>>>>      {
>>>>>>>      	assert_spin_locked(&dlm->spinlock);
>>>>>>>
>>>>>>> +	if (dlm->dlm_state == DLM_CTXT_IN_SHUTDOWN)
>>>>>>> +		return;
>>>>>>> +
>>>>>> 'dlm->dlm_state' probably need be to protected by 'dlm_domain_lock'.
>>>>>> and I wander if there is more work to be done when in
>>>>>> 'DLM_CTXT_IN_SHUTDOWN'?
>>>>>>>      	if (dlm->reco.new_master == idx) {
>>>>>>>      		mlog(0, "%s: recovery master %d just died\n",
>>>>>>>      		     dlm->name, idx);
>>>>>>>
>>>>>>
>>>>>
>>>>> .
>>>>>
>>>>
>>>
>>> .
>>>
>>
> 
> .
> 
