* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
@ 2017-08-07  7:13 Changwei Ge
  2017-08-07  7:43 ` Gang He
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Changwei Ge @ 2017-08-07  7:13 UTC (permalink / raw)
  To: ocfs2-devel

Hi,

In the current code, while flushing ASTs, we don't handle the exception that
sending an AST or BAST fails.
But it is indeed possible for an AST or BAST to be lost due to some kind of
network fault.

If the above exception happens, the requesting node will never receive an AST
back; hence, it will never acquire the lock or abort the current locking attempt.

With this patch, I'd like to fix this issue by re-queuing the AST or
BAST if sending fails due to a network fault.

The re-queued AST or BAST will be dropped if the requesting node is
dead!

This improves reliability a lot.


Thanks.

Changwei.

Signed-off-by: Changwei Ge <ge.changwei@h3c.com>
---
 fs/ocfs2/dlm/dlmrecovery.c |   51 ++++++++++++++++++++++++++++++++++++++++++--
 fs/ocfs2/dlm/dlmthread.c   |   39 +++++++++++++++++++++++++++------
 2 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 74407c6..ddfaf74 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2263,11 +2263,45 @@ static void dlm_revalidate_lvb(struct dlm_ctxt *dlm,
     }
 }
 
+static int dlm_drop_pending_ast_bast(struct dlm_ctxt *dlm,
+                     struct dlm_lock *lock)
+{
+    int reserved = 0;
+
+    spin_lock(&dlm->ast_lock);
+    if (!list_empty(&lock->ast_list)) {
+        mlog(0, "%s: drop pending AST for lock(cookie=%u:%llu).\n",
+             dlm->name,
+             dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+             dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
+        list_del_init(&lock->ast_list);
+        lock->ast_pending = 0;
+        dlm_lock_put(lock);
+        reserved++;
+    }
+
+    if (!list_empty(&lock->bast_list)) {
+        mlog(0, "%s: drop pending BAST for lock(cookie=%u:%llu).\n",
+             dlm->name,
+             dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
+             dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)));
+        list_del_init(&lock->bast_list);
+        lock->bast_pending = 0;
+        dlm_lock_put(lock);
+        reserved++;
+    }
+    spin_unlock(&dlm->ast_lock);
+
+    return reserved;
+}
+
 static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
-                struct dlm_lock_resource *res, u8 dead_node)
+                struct dlm_lock_resource *res, u8 dead_node,
+                int *reserved)
 {
     struct dlm_lock *lock, *next;
     unsigned int freed = 0;
+    int reserved_tmp = 0;
 
     /* this node is the lockres master:
      * 1) remove any stale locks for the dead node
@@ -2284,6 +2318,9 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
         if (lock->ml.node == dead_node) {
             list_del_init(&lock->list);
             dlm_lock_put(lock);
+
+            reserved_tmp += dlm_drop_pending_ast_bast(dlm, lock);
+
             /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
             dlm_lock_put(lock);
             freed++;
@@ -2293,6 +2330,9 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
         if (lock->ml.node == dead_node) {
             list_del_init(&lock->list);
             dlm_lock_put(lock);
+
+            reserved_tmp += dlm_drop_pending_ast_bast(dlm, lock);
+
             /* Can't schedule DLM_UNLOCK_FREE_LOCK - do manually */
             dlm_lock_put(lock);
             freed++;
@@ -2308,6 +2348,8 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
         }
     }
 
+    *reserved = reserved_tmp;
+
     if (freed) {
         mlog(0, "%s:%.*s: freed %u locks for dead node %u, "
              "dropping ref from lockres\n", dlm->name,
@@ -2367,6 +2409,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
     for (i = 0; i < DLM_HASH_BUCKETS; i++) {
         bucket = dlm_lockres_hash(dlm, i);
         hlist_for_each_entry_safe(res, tmp, bucket, hash_node) {
+            int reserved = 0;
              /* always prune any $RECOVERY entries for dead nodes,
               * otherwise hangs can occur during later recovery */
             if (dlm_is_recovery_lock(res->lockname.name,
@@ -2420,7 +2463,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
                     continue;
                 }
             } else if (res->owner == dlm->node_num) {
-                dlm_free_dead_locks(dlm, res, dead_node);
+                dlm_free_dead_locks(dlm, res, dead_node, &reserved);
                 __dlm_lockres_calc_usage(dlm, res);
             } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
                 if (test_bit(dead_node, res->refmap)) {
@@ -2432,6 +2475,10 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
                 }
             }
             spin_unlock(&res->spinlock);
+            while (reserved) {
+                dlm_lockres_release_ast(dlm, res);
+                reserved--;
+            }
         }
     }
 
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 838a06d..c34a619 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -587,13 +587,13 @@ static int dlm_dirty_list_empty(struct dlm_ctxt *dlm)
 
 static void dlm_flush_asts(struct dlm_ctxt *dlm)
 {
-    int ret;
+    int ret = 0;
     struct dlm_lock *lock;
     struct dlm_lock_resource *res;
     u8 hi;
 
     spin_lock(&dlm->ast_lock);
-    while (!list_empty(&dlm->pending_asts)) {
+    while (!list_empty(&dlm->pending_asts) && !ret) {
         lock = list_entry(dlm->pending_asts.next,
                   struct dlm_lock, ast_list);
         /* get an extra ref on lock */
@@ -628,8 +628,20 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
             mlog(0, "%s: res %.*s, AST queued while flushing last "
                  "one\n", dlm->name, res->lockname.len,
                  res->lockname.name);
-        } else
-            lock->ast_pending = 0;
+        } else {
+            if (unlikely(ret < 0)) {
+                /* If this AST is not sent back successfully,
+                 * there is no chance that the second lock
+                 * request comes.
+                 */
+                spin_lock(&res->spinlock);
+                __dlm_lockres_reserve_ast(res);
+                spin_unlock(&res->spinlock);
+                __dlm_queue_ast(dlm, lock);
+            } else {
+                lock->ast_pending = 0;
+            }
+        }
 
         /* drop the extra ref.
          * this may drop it completely. */
@@ -637,7 +649,9 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
         dlm_lockres_release_ast(dlm, res);
     }
 
-    while (!list_empty(&dlm->pending_basts)) {
+    ret = 0;
+
+    while (!list_empty(&dlm->pending_basts) && !ret) {
         lock = list_entry(dlm->pending_basts.next,
                   struct dlm_lock, bast_list);
         /* get an extra ref on lock */
@@ -650,7 +664,6 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
         spin_lock(&lock->spinlock);
         BUG_ON(lock->ml.highest_blocked <= LKM_IVMODE);
         hi = lock->ml.highest_blocked;
-        lock->ml.highest_blocked = LKM_IVMODE;
         spin_unlock(&lock->spinlock);
 
         /* remove from list (including ref) */
@@ -681,7 +694,19 @@ static void dlm_flush_asts(struct dlm_ctxt *dlm)
                  "one\n", dlm->name, res->lockname.len,
                  res->lockname.name);
         } else
-            lock->bast_pending = 0;
+            if (unlikely(ret)) {
+                spin_lock(&res->spinlock);
+                __dlm_lockres_reserve_ast(res);
+                spin_unlock(&res->spinlock);
+                __dlm_queue_bast(dlm, lock);
+            } else {
+                lock->bast_pending = 0;
+                /* Set ::highest_blocked to invalid after
+                 * sending BAST successfully so that
+                 * no more BAST would be queued.
+                 */
+                lock->ml.highest_blocked = LKM_IVMODE;
+            }
 
         /* drop the extra ref.
          * this may drop it completely. */
-- 
1.7.9.5


* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-07  7:13 [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability Changwei Ge
@ 2017-08-07  7:43 ` Gang He
  2017-08-07  7:55   ` Changwei Ge
  2017-08-07 20:19 ` Mark Fasheh
  2017-08-09 11:32 ` Joseph Qi
  2 siblings, 1 reply; 16+ messages in thread
From: Gang He @ 2017-08-07  7:43 UTC (permalink / raw)
  To: ocfs2-devel

Based on your description, this should be a corner case, NOT a fatal error.
Should we use mlog(ML_NOTICE, ...) to print these logs?

Thanks
Gang
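
For illustration, a minimal sketch of the notice-level logging being
suggested; the helper name and format string below are invented for this
example, and only mlog() and ML_NOTICE come from ocfs2's masklog:

/* Hypothetical helper, not part of the patch: report a failed AST send
 * at notice level instead of error level. */
static void dlm_note_ast_send_failure(struct dlm_ctxt *dlm,
				      struct dlm_lock *lock, int ret)
{
	mlog(ML_NOTICE, "%s: sending AST to node %u failed (%d), "
	     "will re-queue\n", dlm->name, lock->ml.node, ret);
}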




* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-07  7:43 ` Gang He
@ 2017-08-07  7:55   ` Changwei Ge
  0 siblings, 0 replies; 16+ messages in thread
From: Changwei Ge @ 2017-08-07  7:55 UTC (permalink / raw)
  To: ocfs2-devel

Hi Gang,

At present, when sending an AST or BAST fails, the code already prints
ERROR logs.

I admit that it's truly a corner case, but it is a fatal error when the
network is not reliable.

If the AST is not sent back to the locking node, the related procedure
will be left pending; the whole cluster may even hang.

I think that is not acceptable in an enterprise-level usage scenario.

Thanks

Changwei



On 2017/8/7 15:43, Gang He wrote:
> Based on your description, this should be a corner case, NOT a fatal error.
> Should we use mlog(ML_NOTICE, ...) to print these logs?
>
> Thanks
> Gang


* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-07  7:13 [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability Changwei Ge
  2017-08-07  7:43 ` Gang He
@ 2017-08-07 20:19 ` Mark Fasheh
  2017-08-08 10:56   ` Changwei Ge
  2017-08-09 11:32 ` Joseph Qi
  2 siblings, 1 reply; 16+ messages in thread
From: Mark Fasheh @ 2017-08-07 20:19 UTC (permalink / raw)
  To: ocfs2-devel

On Mon, Aug 7, 2017 at 2:13 AM, Changwei Ge <ge.changwei@h3c.com> wrote:
> Hi,
>
> In the current code, while flushing ASTs, we don't handle the exception that
> sending an AST or BAST fails.
> But it is indeed possible for an AST or BAST to be lost due to some kind of
> network fault.
>
> If the above exception happens, the requesting node will never receive an AST
> back; hence, it will never acquire the lock or abort the current locking attempt.
>
> With this patch, I'd like to fix this issue by re-queuing the AST or
> BAST if sending fails due to a network fault.
>
> The re-queued AST or BAST will be dropped if the requesting node is
> dead!
>
> This improves reliability a lot.

Can you detail your testing? Code-wise this looks fine to me, but as
you note, this is a pretty hard-to-hit corner case, so it'd be nice to
hear that you were able to exercise it.

Thanks,
   --Mark


* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-07 20:19 ` Mark Fasheh
@ 2017-08-08 10:56   ` Changwei Ge
  2017-08-22 20:49     ` Mark Fasheh
  0 siblings, 1 reply; 16+ messages in thread
From: Changwei Ge @ 2017-08-08 10:56 UTC (permalink / raw)
  To: ocfs2-devel



On 2017/8/8 4:20, Mark Fasheh wrote:
> Can you detail your testing? Code-wise this looks fine to me, but as
> you note, this is a pretty hard-to-hit corner case, so it'd be nice to
> hear that you were able to exercise it.
>
> Thanks,
>    --Mark
Hi Mark,

My test is quite simple to perform.
The test environment includes 7 hosts. The Ethernet devices in 6 of them
are brought down and then up repeatedly.
After several rounds of up and down, some file operations hang.

Running the debugfs.ocfs2 tool on NODE 2, which was the owner of
lock resource 'O000000000000000011150300000000', showed:

debugfs: dlm_locks O000000000000000011150300000000
Lockres: O000000000000000011150300000000   Owner: 2    State: 0x0
Last Used: 0      ASTs Reserved: 0    Inflight: 0    Migration Pending: No
Refs: 4    Locks: 2    On Lists: None
Reference Map: 3
 Lock-Queue  Node  Level  Conv  Cookie           Refs  AST  BAST  Pending-Action
 Granted     2     PR     -1    2:53             2     No   No    None
 Granted     3     PR     -1    3:48             2     No   No    None

That means NODE 2 had granted NODE 3's lock and the AST had been
transmitted to NODE 3.

Meanwhile, running debugfs.ocfs2 on NODE 3 showed:
debugfs: dlm_locks O000000000000000011150300000000
Lockres: O000000000000000011150300000000   Owner: 2    State: 0x0
Last Used: 0      ASTs Reserved: 0    Inflight: 0    Migration Pending: No
Refs: 3    Locks: 1    On Lists: None
Reference Map:
 Lock-Queue  Node  Level  Conv  Cookie           Refs  AST  BAST  Pending-Action
 Blocked     3     PR     -1    3:48             2     No   No    None

That means NODE 3 never received an AST to move its local lock from the
blocked list to the granted list.

This outcome makes sense, since the AST send failed, which can be seen in
the kernel log.

As for BAST, it is more or less the same.

Thanks
Changwei




* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-07  7:13 [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability Changwei Ge
  2017-08-07  7:43 ` Gang He
  2017-08-07 20:19 ` Mark Fasheh
@ 2017-08-09 11:32 ` Joseph Qi
  2017-08-09 15:24   ` ge changwei
  2 siblings, 1 reply; 16+ messages in thread
From: Joseph Qi @ 2017-08-09 11:32 UTC (permalink / raw)
  To: ocfs2-devel

Hi,

On 17/8/7 15:13, Changwei Ge wrote:
> Hi,
> 
> In the current code, while flushing ASTs, we don't handle the exception that
> sending an AST or BAST fails.
> But it is indeed possible for an AST or BAST to be lost due to some kind of
> network fault.
> 
Could you please describe this issue more clearly? It would be better to
analyze the issue along with the error messages and the status of the
related nodes.
IMO, if the network is down, one of the two nodes will be fenced. So what's
your case here?

Thanks,
Joseph

> If the above exception happens, the requesting node will never receive an AST
> back; hence, it will never acquire the lock or abort the current locking attempt.
> 
> With this patch, I'd like to fix this issue by re-queuing the AST or
> BAST if sending fails due to a network fault.
> 
> The re-queued AST or BAST will be dropped if the requesting node is
> dead!
> 
> This improves reliability a lot.
> 
> 
> Thanks.
> 
> Changwei.


* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-09 11:32 ` Joseph Qi
@ 2017-08-09 15:24   ` ge changwei
  2017-08-10  9:34     ` Joseph Qi
  0 siblings, 1 reply; 16+ messages in thread
From: ge changwei @ 2017-08-09 15:24 UTC (permalink / raw)
  To: ocfs2-devel

Hi


On 2017/8/9 7:32 PM, Joseph Qi wrote:
> Hi,
>
> Could you please describe this issue more clearly? It would be better to
> analyze the issue along with the error messages and the status of the
> related nodes.
> IMO, if the network is down, one of the two nodes will be fenced. So what's
> your case here?
>
> Thanks,
> Joseph

I have posted the status of the related lock resource in my preceding email.
Please check it out.

Moreover, the network is not down forever, nor even down longer than the
threshold for being fenced.
So no node will be fenced.

This issue happens in a terrible network environment. Some messages may be
dropped by the switch under various conditions.
Frequent and fast link up/down events will also cause this issue.

In a nutshell, re-queuing the AST and BAST is crucial when the link between
nodes recovers quickly. It prevents the cluster from hanging.

Thanks,
Changwei


* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-09 15:24   ` ge changwei
@ 2017-08-10  9:34     ` Joseph Qi
  2017-08-10 10:49       ` Changwei Ge
  0 siblings, 1 reply; 16+ messages in thread
From: Joseph Qi @ 2017-08-10  9:34 UTC (permalink / raw)
  To: ocfs2-devel

Hi Changwei,

On 17/8/9 23:24, ge changwei wrote:
> I have posted the status of the related lock resource in my preceding email.
> Please check it out.
> 
> Moreover, the network is not down forever, nor even down longer than the
> threshold for being fenced.
> So no node will be fenced.
> 
> This issue happens in a terrible network environment. Some messages may be
> dropped by the switch under various conditions.
> Frequent and fast link up/down events will also cause this issue.
> 
> In a nutshell, re-queuing the AST and BAST is crucial when the link between
> nodes recovers quickly. It prevents the cluster from hanging.

So you mean the TCP packet is lost due to a connection reset? IIRC,
Junxiao has posted a patchset to fix this issue.
If you are using the re-queuing approach, how do you make sure the original
message is *truly* lost and the same AST/BAST won't be sent twice?

Thanks,
Joseph
 


* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-10  9:34     ` Joseph Qi
@ 2017-08-10 10:49       ` Changwei Ge
  2017-08-23  2:23         ` Junxiao Bi
  0 siblings, 1 reply; 16+ messages in thread
From: Changwei Ge @ 2017-08-10 10:49 UTC (permalink / raw)
  To: ocfs2-devel

Hi Joseph,


On 2017/8/10 17:53, Joseph Qi wrote:
> So you mean the TCP packet is lost due to a connection reset? IIRC,
Yes, it's an exception like that, which I think deserves to be
fixed within OCFS2.
> Junxiao has posted a patchset to fix this issue.
> If you are using the re-queuing approach, how do you make sure the original
> message is *truly* lost and the same AST/BAST won't be sent twice?
With regard to the TCP layer, if it returns an error to OCFS2, the packet
must not have been sent successfully. So no node will receive such an AST
or BAST.
With regard to OCFS2, my patch guarantees that one AST/BAST can't be queued
on the pending list twice with both instances sent successfully.

Thanks,
Changwei
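
A minimal sketch of the guarantee being described, using an invented
helper name (the real code does its checking in __dlm_queue_ast(),
under dlm->ast_lock):

/* Illustrative only: a lock still sitting on the pending list is never
 * queued a second time, so at most one copy of its AST is in flight. */
static void dlm_queue_ast_once(struct dlm_ctxt *dlm, struct dlm_lock *lock)
{
	assert_spin_locked(&dlm->ast_lock);
	if (!list_empty(&lock->ast_list))
		return;			/* already queued for flushing */
	dlm_lock_get(lock);		/* the pending list holds a reference */
	list_add_tail(&lock->ast_list, &dlm->pending_asts);
}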


* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-08 10:56   ` Changwei Ge
@ 2017-08-22 20:49     ` Mark Fasheh
  2017-08-23  1:06       ` Joseph Qi
  0 siblings, 1 reply; 16+ messages in thread
From: Mark Fasheh @ 2017-08-22 20:49 UTC (permalink / raw)
  To: ocfs2-devel

On Tue, Aug 8, 2017 at 5:56 AM, Changwei Ge <ge.changwei@h3c.com> wrote:
>>> This improves reliability a lot.
>> Can you detail your testing? Code-wise this looks fine to me, but as
>> you note, this is a pretty hard-to-hit corner case, so it'd be nice to
>> hear that you were able to exercise it.
>>
>> Thanks,
>>    --Mark
> Hi Mark,
>
> My test is quite simple to perform.
> The test environment includes 7 hosts. The Ethernet devices in 6 of them
> are brought down and then up repeatedly.
> After several rounds of up and down, some file operations hang.
>
> Running the debugfs.ocfs2 tool on NODE 2, which was the owner of
> lock resource 'O000000000000000011150300000000', showed:
>
> debugfs: dlm_locks O000000000000000011150300000000
> Lockres: O000000000000000011150300000000   Owner: 2    State: 0x0
> Last Used: 0      ASTs Reserved: 0    Inflight: 0    Migration Pending: No
> Refs: 4    Locks: 2    On Lists: None
> Reference Map: 3
>  Lock-Queue  Node  Level  Conv  Cookie           Refs  AST  BAST  Pending-Action
>  Granted     2     PR     -1    2:53             2     No   No    None
>  Granted     3     PR     -1    3:48             2     No   No    None
>
> That means NODE 2 had granted NODE 3's lock and the AST had been
> transmitted to NODE 3.
>
> Meanwhile, running debugfs.ocfs2 on NODE 3 showed:
> debugfs: dlm_locks O000000000000000011150300000000
> Lockres: O000000000000000011150300000000   Owner: 2    State: 0x0
> Last Used: 0      ASTs Reserved: 0    Inflight: 0    Migration Pending: No
> Refs: 3    Locks: 1    On Lists: None
> Reference Map:
>  Lock-Queue  Node  Level  Conv  Cookie           Refs  AST  BAST  Pending-Action
>  Blocked     3     PR     -1    3:48             2     No   No    None
>
> That means NODE 3 never received an AST to move its local lock from the
> blocked list to the granted list.
>
> This outcome makes sense, since the AST send failed, which can be seen in
> the kernel log.
>
> As for BAST, it is more or less the same.
>
> Thanks
> Changwei


Thanks for the testing details. I think you got Andrew's e-mail wrong,
so I'm CC'ing him now. It might be a good idea to re-send the patch
with the right CCs and add some of your testing details to the log.
You're free to use my

Reviewed-by: Mark Fasheh <mfasheh@versity.com>

as well.

Thanks again,
   --Mark


* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-22 20:49     ` Mark Fasheh
@ 2017-08-23  1:06       ` Joseph Qi
  0 siblings, 0 replies; 16+ messages in thread
From: Joseph Qi @ 2017-08-23  1:06 UTC (permalink / raw)
  To: ocfs2-devel

Hi Mark,

On 17/8/23 04:49, Mark Fasheh wrote:
> 
> 
> Thanks for the testing details. I think you got Andrew's e-mail wrong,
> so I'm CC'ing him now. It might be a good idea to re-send the patch
> with the right CCs and add some of your testing details to the log.

IMO, a network error does not guarantee that the target node hasn't
received the message. A complete message round includes:
1. sending the message to the target node;
2. getting a response from the target node.

So if the network error happens in phase 2, re-queuing the message will
cause the AST/BAST to be sent twice. I'm afraid this cannot be handled
currently.

If I understand wrong, please point out.

Thanks,
Joseph
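
A minimal sketch of the ambiguity being described, with invented helper
names (send_packet(), wait_for_status()); only the two-phase shape
matters here:

/* Sketch only: a message round as described above.  An error from
 * phase 2 is ambiguous -- the payload may already have been processed
 * by the target node, so blindly resending risks a duplicate. */
static int message_round(void *msg, size_t len, u8 target)
{
	int status;
	int ret;

	ret = send_packet(msg, len, target);	/* phase 1 */
	if (ret < 0)
		return ret;	/* definitely not delivered */

	ret = wait_for_status(target, &status);	/* phase 2 */
	if (ret < 0)
		return ret;	/* ambiguous: target may have handled msg */

	return status;	/* the receiver's processing status */
}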



* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-10 10:49       ` Changwei Ge
@ 2017-08-23  2:23         ` Junxiao Bi
  2017-08-23  3:34           ` Joseph Qi
  0 siblings, 1 reply; 16+ messages in thread
From: Junxiao Bi @ 2017-08-23  2:23 UTC (permalink / raw)
  To: ocfs2-devel

On 08/10/2017 06:49 PM, Changwei Ge wrote:
>> So you mean the TCP packet is lost due to a connection reset? IIRC,
> Yes, it's an exception like that, which I think deserves to be
> fixed within OCFS2.
>> Junxiao has posted a patchset to fix this issue.
>> If you are using the re-queuing approach, how do you make sure the original
>> message is *truly* lost and the same AST/BAST won't be sent twice?
> With regard to the TCP layer, if it returns an error to OCFS2, the packet
> must not have been sent successfully. So no node will receive such an AST
> or BAST.
Right, but not only the AST/BAST: other messages pending in the TCP queue
will also be lost if TCP returns an error to ocfs2, and this can also cause
a hang.
Besides, your fix may introduce the duplicated AST/BAST messages Joseph
mentioned.
Ocfs2 depends on TCP a lot; it can't work well if TCP returns errors to it.
To fix this, maybe ocfs2 should maintain its own message queue and ack
messages rather than depend on TCP.

Thanks,
Junxiao.
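
A rough sketch of that idea, with invented names (struct msg_entry,
resend_list); each sent message would stay on a local list until the
receiver acknowledges it by sequence number, and unacked entries would
be resent:

/* Sketch only: per-connection reliable-delivery state that ocfs2 would
 * maintain itself instead of trusting the TCP return code. */
struct msg_entry {
	struct list_head list;	/* linked on the connection's resend_list */
	u64 seq;		/* monotonically increasing per connection */
	void *payload;
	size_t len;
};

/* On an ack from the peer, drop every entry up to the acked sequence;
 * anything left behind is a candidate for resending. */
static void msg_queue_ack(struct list_head *resend_list, u64 acked_seq)
{
	struct msg_entry *me, *next;

	list_for_each_entry_safe(me, next, resend_list, list) {
		if (me->seq > acked_seq)
			break;
		list_del(&me->list);
		kfree(me->payload);
		kfree(me);
	}
}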




* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-23  2:23         ` Junxiao Bi
@ 2017-08-23  3:34           ` Joseph Qi
  2017-08-23  4:47             ` Gang He
  0 siblings, 1 reply; 16+ messages in thread
From: Joseph Qi @ 2017-08-23  3:34 UTC (permalink / raw)
  To: ocfs2-devel



On 17/8/23 10:23, Junxiao Bi wrote:
> Right, but not only the AST/BAST: other messages pending in the TCP queue
> will also be lost if TCP returns an error to ocfs2, and this can also cause
> a hang.
> Besides, your fix may introduce the duplicated AST/BAST messages Joseph
> mentioned.
> Ocfs2 depends on TCP a lot; it can't work well if TCP returns errors to it.
> To fix this, maybe ocfs2 should maintain its own message queue and ack
> messages rather than depend on TCP.
Agree. Or we can add a sequence number to distinguish duplicate messages.
With this, we can simply resend a message if it fails.

Thanks,
Joseph
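
A minimal sketch of the sequence idea, assuming an invented last_seq
field on the receiver side; a resent message that already arrived once
would then be recognized and dropped:

/* Sketch only: receiver-side dedup.  'last_seq' is a hypothetical
 * per-connection (or per-lock) field remembering the newest message
 * already applied. */
static bool message_seen_before(u64 *last_seq, u64 seq)
{
	if (seq <= *last_seq)
		return true;	/* duplicate of an applied message: drop it */
	*last_seq = seq;
	return false;
}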
 


* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-23  3:34           ` Joseph Qi
@ 2017-08-23  4:47             ` Gang He
  2017-08-23  5:56               ` Changwei Ge
  0 siblings, 1 reply; 16+ messages in thread
From: Gang He @ 2017-08-23  4:47 UTC (permalink / raw)
  To: ocfs2-devel




>> Right, but not only the AST/BAST: other messages pending in the TCP queue
>> will also be lost if TCP returns an error to ocfs2, and this can also cause
>> a hang.
>> Besides, your fix may introduce the duplicated AST/BAST messages Joseph
>> mentioned.
>> Ocfs2 depends on TCP a lot; it can't work well if TCP returns errors to it.
>> To fix this, maybe ocfs2 should maintain its own message queue and ack
>> messages rather than depend on TCP.
> Agree. Or we can add a sequence number to distinguish duplicate messages.
> With this, we can simply resend a message if it fails.
Looks like we need to make the messages stateless.
Maybe we can refer to GFS2 to see whether it has considered this issue.

Thanks
Gang



* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
  2017-08-23  4:47             ` Gang He
@ 2017-08-23  5:56               ` Changwei Ge
       [not found]                 ` <63ADC13FD55D6546B7DECE290D39E373CED4F4ED@H3CMLB14-EX.srv.huawei-3com.com>
  0 siblings, 1 reply; 16+ messages in thread
From: Changwei Ge @ 2017-08-23  5:56 UTC (permalink / raw)
  To: ocfs2-devel



On 2017/8/23 12:48, Gang He wrote:
>
>
>> On 17/8/23 10:23, Junxiao Bi wrote:
>>> On 08/10/2017 06:49 PM, Changwei Ge wrote:
>>>> Hi Joseph,
>>>>
>>>>
>>>> On 2017/8/10 17:53, Joseph Qi wrote:
>>>>> Hi Changwei,
>>>>>
>>>>> On 17/8/9 23:24, ge changwei wrote:
>>>>>> Hi
>>>>>>
>>>>>>
>>>>>>> On 2017/8/9 7:32 PM, Joseph Qi wrote:
>>>>>>> Hi,
>>>>>>>
>>>>>>> On 17/8/7 15:13, Changwei Ge wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> In current code, while flushing AST, we don't handle an exception that
>>>>>>>> sending AST or BAST is failed.
>>>>>>>> But it is indeed possible that AST or BAST is lost due to some kind of
>>>>>>>> networks fault.
>>>>>>>>
>>>>>>> Could you please describe this issue more clearly? It is better analyze
>>>>>>> issue along with the error message and the status of related nodes.
>>>>>>> IMO, if network is down, one of the two nodes will be fenced. So what's
>>>>>>> your case here?
>>>>>>>
>>>>>>> Thanks,
>>>>>>> Joseph
>>>>>> I have posted the status of related lock resource in my preceding email. 
>>>>>> Please check them out.
>>>>>>
>>>>>> Moreover, network is not down forever even not longer than threshold  to 
>>>>>> be fenced.
>>>>>> So no node will be fenced.
>>>>>>
>>>>>> This issue happens in terrible network environment. Some messages may be 
>>>>>> abandoned by switch due to various conditions.
>>>>>> And even frequent and fast link up and down will also cause this issue.
>>>>>>
>>>>>> In a nutshell,  re-queuing AST and BAST is crucial when link between 
>>>>>> nodes recover quickly. It prevents cluster from hanging.
>>>>>> So you mean the tcp packet is lost due to connection reset? IIRC,
>>>> Yes, it's something like that exception which I think is deserved to be
>>>> fixed within OCFS2.
>>>>> Junxiao has posted a patchset to fix this issue.
>>>>> If you are using the way of re-queuing, how to make sure the original
>>>>> message is *truly* lost and the same ast/bast won't be sent twice?
>>>> With regards to TCP layer, if it returns error to OCFS2, packets must
>>>> not be sent successfully. So no node will obtain such an AST or BAST.
>>> Right, but not only AST/BAST, other messages pending in tcp queue will
>>> also lost if tcp return error to ocfs2, this can also caused hung.
>>> Besides, your fix may introduce duplicated ast/bast message Joseph
>>> mentioned.
>>> Ocfs2 depends tcp a lot, it can't work well if tcp return error to it.
>>> To fix it, maybe ocfs2 should maintain its own message queue and ack
>>> messages while not depend on TCP.
>> Agree. Or we can add a sequence to distinguish duplicate messages. Under
>> this, we can simply resend the message if sending fails.
> Looks like we need to make the message stateless.
> Maybe we can refer to GFS2 to see whether it has already considered this issue.
>
> Thanks
> Gang
Um.
Since Joseph, Junxiao and Gang all have differing or opposing opinions
on this hang-issue fix, I will perform more tests to check whether the
previously mentioned duplicated AST issue truly exists. If it does,
I will try to figure out a new way to fix it and send an improved
version of this patch.

I will report the test results in a few days. Anyway, thanks for your
comments.

Thanks,
Changwei.
>> Thanks,
>> Joseph
>>  
>>> Thanks,
>>> Junxiao.
>> _______________________________________________
>> Ocfs2-devel mailing list
>> Ocfs2-devel at oss.oracle.com 
>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability
       [not found]                 ` <63ADC13FD55D6546B7DECE290D39E373CED4F4ED@H3CMLB14-EX.srv.huawei-3com.com>
@ 2017-09-13  7:03                   ` Changwei Ge
  0 siblings, 0 replies; 16+ messages in thread
From: Changwei Ge @ 2017-09-13  7:03 UTC (permalink / raw)
  To: ocfs2-devel

Hi,

I think the mentioned duplicated AST issue doesn't even exist,
because the re-sent AST won't find any lock on the converting or
blocked list. How could the AST callback be called twice?

Thanks,
Changwei
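
To illustrate the argument, here is a simplified handler in the style
of the ocfs2 dlm (the real proxy AST handler also searches the blocked
list and does much more; this sketch only shows why a duplicated AST
would find nothing left to grant):

static int handle_ast_sketch(struct dlm_lock_resource *res, u64 cookie)
{
    struct dlm_lock *lock;

    spin_lock(&res->spinlock);
    list_for_each_entry(lock, &res->converting, list) {
        if (be64_to_cpu(lock->ml.cookie) == cookie) {
            /* First delivery: move the lock to the granted list. */
            list_move_tail(&lock->list, &res->granted);
            spin_unlock(&res->spinlock);
            return 0;
        }
    }
    spin_unlock(&res->spinlock);
    /* Not found on the converting (or blocked) list: the first AST
     * was already applied, so the resent one is simply dropped. */
    return 0;
}
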

> 
> On 2017/8/23 12:48, Gang He wrote:
>>
>>
>>> On 17/8/23 10:23, Junxiao Bi wrote:
>>>> On 08/10/2017 06:49 PM, Changwei Ge wrote:
>>>>> Hi Joseph,
>>>>>
>>>>>
>>>>> On 2017/8/10 17:53, Joseph Qi wrote:
>>>>>> Hi Changwei,
>>>>>>
>>>>>> On 17/8/9 23:24, ge changwei wrote:
>>>>>>> Hi
>>>>>>>
>>>>>>>
>>>>>>>> On 2017/8/9 7:32 PM, Joseph Qi wrote:
>>>>>>>> Hi,
>>>>>>>>
>>>>>>>> On 17/8/7 15:13, Changwei Ge wrote:
>>>>>>>>> Hi,
>>>>>>>>>
>>>>>>>>> In current code, while flushing AST, we don't handle an
>>>>>>>>> exception that sending AST or BAST is failed.
>>>>>>>>> But it is indeed possible that AST or BAST is lost due to some
>>>>>>>>> kind of networks fault.
>>>>>>>>>
>>>>>>>> Could you please describe this issue more clearly? It is better
>>>>>>>> analyze issue along with the error message and the status of related nodes.
>>>>>>>> IMO, if network is down, one of the two nodes will be fenced. So
>>>>>>>> what's your case here?
>>>>>>>>
>>>>>>>> Thanks,
>>>>>>>> Joseph
>>>>>>> I have posted the status of related lock resource in my preceding email.
>>>>>>> Please check them out.
>>>>>>>
>>>>>>> Moreover, network is not down forever even not longer than
>>>>>>> threshold  to be fenced.
>>>>>>> So no node will be fenced.
>>>>>>>
>>>>>>> This issue happens in terrible network environment. Some messages
>>>>>>> may be abandoned by switch due to various conditions.
>>>>>>> And even frequent and fast link up and down will also cause this issue.
>>>>>>>
>>>>>>> In a nutshell,  re-queuing AST and BAST is crucial when link
>>>>>>> between nodes recover quickly. It prevents cluster from hanging.
>>>>>>> So you mean the tcp packet is lost due to connection reset? IIRC,
>>>>> Yes, it's something like that exception which I think is deserved
>>>>> to be fixed within OCFS2.
>>>>>> Junxiao has posted a patchset to fix this issue.
>>>>>> If you are using the way of re-queuing, how to make sure the
>>>>>> original message is *truly* lost and the same ast/bast won't be sent twice?
>>>>> With regards to TCP layer, if it returns error to OCFS2, packets
>>>>> must not be sent successfully. So no node will obtain such an AST or BAST.
>>>> Right, but not only AST/BAST, other messages pending in tcp queue
>>>> will also lost if tcp return error to ocfs2, this can also caused hung.
>>>> Besides, your fix may introduce duplicated ast/bast message Joseph
>>>> mentioned.
>>>> Ocfs2 depends tcp a lot, it can't work well if tcp return error to it.
>>>> To fix it, maybe ocfs2 should maintain its own message queue and ack
>>>> messages while not depend on TCP.
>>> Agree. Or we can add a sequence to distinguish duplicate messages.
>>> Under this, we can simply resend the message if sending fails.
>> Looks like we need to make the message stateless.
>> Maybe we can refer to GFS2 to see whether it has already considered this issue.
>>
>> Thanks
>> Gang
> Um.
> Since Joseph, Junxiao and Gang all have differing or opposing opinions on this hang-issue fix, I will perform more tests to check whether the previously mentioned duplicated AST issue truly exists. If it does, I will try to figure out a new way to fix it and send an improved version of this patch.
> 
> I will report the test results in a few days. Anyway, thanks for your comments.
> 
> Thanks,
> Changwei.
>>> Thanks,
>>> Joseph
>>>   
>>>> Thanks,
>>>> Junxiao.
>>> _______________________________________________
>>> Ocfs2-devel mailing list
>>> Ocfs2-devel at oss.oracle.com
>>> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
> 
> 
> _______________________________________________
> Ocfs2-devel mailing list
> Ocfs2-devel at oss.oracle.com
> https://oss.oracle.com/mailman/listinfo/ocfs2-devel
> 

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2017-09-13  7:03 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
2017-08-07  7:13 [Ocfs2-devel] [PATCH] ocfs2: re-queue AST or BAST if sending is failed to improve the reliability Changwei Ge
2017-08-07  7:43 ` Gang He
2017-08-07  7:55   ` Changwei Ge
2017-08-07 20:19 ` Mark Fasheh
2017-08-08 10:56   ` Changwei Ge
2017-08-22 20:49     ` Mark Fasheh
2017-08-23  1:06       ` Joseph Qi
2017-08-09 11:32 ` Joseph Qi
2017-08-09 15:24   ` ge changwei
2017-08-10  9:34     ` Joseph Qi
2017-08-10 10:49       ` Changwei Ge
2017-08-23  2:23         ` Junxiao Bi
2017-08-23  3:34           ` Joseph Qi
2017-08-23  4:47             ` Gang He
2017-08-23  5:56               ` Changwei Ge
     [not found]                 ` <63ADC13FD55D6546B7DECE290D39E373CED4F4ED@H3CMLB14-EX.srv.huawei-3com.com>
2017-09-13  7:03                   ` Changwei Ge
