* Re: [PATCH] ext4: add a configurable parameter to prevent endless loop in ext4_mb_discard_group_p
@ 2021-04-08 18:50 Wen Yang
  2021-04-08 22:11 ` Andreas Dilger
  2021-04-09  5:47 ` riteshh
  0 siblings, 2 replies; 8+ messages in thread
From: Wen Yang @ 2021-04-08 18:50 UTC (permalink / raw)
  To: adilger, riteshh; +Cc: tytso, linux-ext4, jack, linux-kernel, baoyou.xie

 > On Apr 7, 2021, at 5:16 AM, riteshh <riteshh@linux.ibm.com> wrote:
 >>
 >> On 21/04/07 03:01PM, Wen Yang wrote:
 >>> From: Wen Yang <simon.wy@alibaba-inc.com>
 >>>
 >>> The kworker has occupied 100% of the CPU for several days:
 >>> PID USER  PR  NI VIRT RES SHR S  %CPU  %MEM TIME+  COMMAND
 >>> 68086 root 20 0  0    0   0   R  100.0 0.0  9718:18 kworker/u64:11
 >>>
 >>> And the stack obtained through sysrq is as follows:
 >>> [20613144.850426] task: ffff8800b5e08000 task.stack: ffffc9001342c000
 >>> [20613144.850438] Call Trace:
 >>> [20613144.850439]  [<ffffffffa0244209>] ext4_mb_new_blocks+0x429/0x550 [ext4]
 >>> [20613144.850439]  [<ffffffffa02389ae>] ext4_ext_map_blocks+0xb5e/0xf30 [ext4]
 >>> [20613144.850441]  [<ffffffffa0204b52>] ext4_map_blocks+0x172/0x620 [ext4]
 >>> [20613144.850442]  [<ffffffffa0208675>] ext4_writepages+0x7e5/0xf00 [ext4]
 >>> [20613144.850443]  [<ffffffff811c487e>] do_writepages+0x1e/0x30
 >>> [20613144.850444]  [<ffffffff81280265>] __writeback_single_inode+0x45/0x320
 >>> [20613144.850444]  [<ffffffff81280ab2>] writeback_sb_inodes+0x272/0x600
 >>> [20613144.850445]  [<ffffffff81280ed2>] __writeback_inodes_wb+0x92/0xc0
 >>> [20613144.850445]  [<ffffffff81281238>] wb_writeback+0x268/0x300
 >>> [20613144.850446]  [<ffffffff812819f4>] wb_workfn+0xb4/0x380
 >>> [20613144.850447]  [<ffffffff810a5dc9>] process_one_work+0x189/0x420
 >>> [20613144.850447]  [<ffffffff810a60ae>] worker_thread+0x4e/0x4b0
 >>>
 >>> The cpu resources of the cloud server are precious, and the server
 >>> cannot be restarted after running for a long time, so a configuration
 >>> parameter is added to prevent this endless loop.
 >>
 >> Strange, if there is a endless loop here. Then I would definitely see
 >> if there is any accounting problem in pa->pa_count. Otherwise busy=1
 >> should not be set everytime. ext4_mb_show_pa() function may help debug this.
 >>
 >> If yes, then that means there always exists either a file preallocation
 >> or a group preallocation. Maybe it is possible, in some use case.
 >> Others may know of such use case, if any.

 > If this code is broken, then it doesn't make sense to me that we would
 > leave it in the "run forever" state after the patch, and require a sysfs
 > tunable to be set to have a properly working system?

 > Is there anything particularly strange about the workload/system that
 > might cause this?  Filesystem is very full, memory is very low, etc?

Hi Ritesh and Andreas,

Thank you for your reply. Since there is still a faulty machine, we have 
analyzed it again and found it is indeed a very special case:


crash> struct ext4_group_info ffff8813bb5f72d0
struct ext4_group_info {
   bb_state = 0,
   bb_free_root = {
     rb_node = 0x0
   },
   bb_first_free = 1681,
   bb_free = 0,
   bb_fragments = 0,
   bb_largest_free_order = -1,
   bb_prealloc_list = {
     next = 0xffff880268291d78,
     prev = 0xffff880268291d78     ---> *** The list is empty
   },
   alloc_sem = {
     count = {
       counter = 0
     },
     wait_list = {
       next = 0xffff8813bb5f7308,
       prev = 0xffff8813bb5f7308
     },
     wait_lock = {
       raw_lock = {
         {
           val = {
             counter = 0
           },
           {
             locked = 0 '\000',
             pending = 0 '\000'
           },
           {
             locked_pending = 0,
             tail = 0
           }
         }
       }
     },
     osq = {
       tail = {
         counter = 0
       }
     },
     owner = 0x0
   },
   bb_counters = 0xffff8813bb5f7328
}
crash>


crash> list 0xffff880268291d78  -l ext4_prealloc_space.pa_group_list -s ext4_prealloc_space.pa_count
ffff880268291d78
   pa_count = {
     counter = 1      ---> ****pa->pa_count
   }
ffff8813bb5f72f0
   pa_count = {
     counter = -30701
   }


crash> struct -xo  ext4_prealloc_space
struct ext4_prealloc_space {
    [0x0] struct list_head pa_inode_list;
   [0x10] struct list_head pa_group_list;
          union {
              struct list_head pa_tmp_list;
              struct callback_head pa_rcu;
   [0x20] } u;
   [0x30] spinlock_t pa_lock;
   [0x34] atomic_t pa_count;
   [0x38] unsigned int pa_deleted;
   [0x40] ext4_fsblk_t pa_pstart;
   [0x48] ext4_lblk_t pa_lstart;
   [0x4c] ext4_grpblk_t pa_len;
   [0x50] ext4_grpblk_t pa_free;
   [0x54] unsigned short pa_type;
   [0x58] spinlock_t *pa_obj_lock;
   [0x60] struct inode *pa_inode;
}
SIZE: 0x68
	

crash> rd 0xffff880268291d68 20
ffff880268291d68:  ffff881822f8a4c8 ffff881822f8a4c8   ..."......."....
ffff880268291d78:  ffff8813bb5f72f0 ffff8813bb5f72f0   .r_......r_.....
ffff880268291d88:  0000000000001000 ffff880db2371000   ..........7.....
ffff880268291d98:  0000000100000001 0000000000000000   ................
ffff880268291da8:  0000000000029c39 0000001700000c41   9.......A.......
ffff880268291db8:  ffff000000000016 ffff881822f8a4d8   ..........."....
ffff880268291dc8:  ffff881822f8a268 ffff880268291af8   h.."......)h....
ffff880268291dd8:  ffff880268291dd0 ffffea00069a07c0   ..)h............
ffff880268291de8:  00000000004d5bb7 0000000000001000   .[M.............
ffff880268291df8:  ffff8801a681f000 0000000000000000   ................



Before "goto repeat", it is necessary to check whether 
grp->bb_prealloc_list is empty.
This patch may fix it.
Please kindly give us your comments and suggestions.
Thanks.
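
(For context, the loop in question is in ext4_mb_discard_group_preallocations();
a trimmed sketch of its logic, which may differ slightly between kernel versions:)

repeat:
	free = 0;
	ext4_lock_group(sb, group);
	list_for_each_entry_safe(pa, tmp,
				&grp->bb_prealloc_list, pa_group_list) {
		spin_lock(&pa->pa_lock);
		if (atomic_read(&pa->pa_count)) {
			/* pa is still referenced: skip it, but remember it was busy */
			spin_unlock(&pa->pa_lock);
			busy = 1;
			continue;
		}
		/* ... otherwise mark the pa deleted and queue it for freeing ... */
	}
	/* ... release the queued pas, adding what was freed to free_total ... */

	/* if we still need more blocks and some PAs were used, try again */
	if (free_total < needed && busy) {
		ext4_unlock_group(sb, group);
		cond_resched();
		busy = 0;
		goto repeat;
	}
	ext4_unlock_group(sb, group);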


diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 99bf091..8082e2d 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4290,7 +4290,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
         free_total += free;

         /* if we still need more blocks and some PAs were used, try again */
-       if (free_total < needed && busy) {
+       if (free_total < needed && busy && !list_empty(&grp->bb_prealloc_list)) {
                 ext4_unlock_group(sb, group);
                 cond_resched();
                 busy = 0;



--
Best wishes,
Wen






* Re: [PATCH] ext4: add a configurable parameter to prevent endless loop in ext4_mb_discard_group_p
  2021-04-08 18:50 [PATCH] ext4: add a configurable parameter to prevent endless loop in ext4_mb_discard_group_p Wen Yang
@ 2021-04-08 22:11 ` Andreas Dilger
  2021-04-09  5:47 ` riteshh
  1 sibling, 0 replies; 8+ messages in thread
From: Andreas Dilger @ 2021-04-08 22:11 UTC (permalink / raw)
  To: Wen Yang; +Cc: riteshh, tytso, linux-ext4, jack, linux-kernel, baoyou.xie

[-- Attachment #1: Type: text/plain, Size: 3693 bytes --]

On Apr 8, 2021, at 12:50 PM, Wen Yang <wenyang@linux.alibaba.com> wrote:
> 
> Hi Ritesh and Andreas,
> 
> Thank you for your reply. Since there is still a faulty machine, we have analyzed it again and found it is indeed a very special case:
> 
> 
> crash> struct ext4_group_info ffff8813bb5f72d0
> struct ext4_group_info {
>  bb_state = 0,
>  bb_free_root = {
>    rb_node = 0x0
>  },
>  bb_first_free = 1681,
>  bb_free = 0,
>  bb_fragments = 0,
>  bb_largest_free_order = -1,
>  bb_prealloc_list = {
>    next = 0xffff880268291d78,
>    prev = 0xffff880268291d78     ---> *** The list is empty
>  },
>  alloc_sem = {
>    count = {
>      counter = 0
>    },
>    wait_list = {
>      next = 0xffff8813bb5f7308,
>      prev = 0xffff8813bb5f7308
>    },
>    wait_lock = {
>      raw_lock = {
>        {
>          val = {
>            counter = 0
>          },
>          {
>            locked = 0 '\000',
>            pending = 0 '\000'
>          },
>          {
>            locked_pending = 0,
>            tail = 0
>          }
>        }
>      }
>    },
>    osq = {
>      tail = {
>        counter = 0
>      }
>    },
>    owner = 0x0
>  },
>  bb_counters = 0xffff8813bb5f7328
> }
> crash>
> 
> 
> crash> list 0xffff880268291d78  -l ext4_prealloc_space.pa_group_list -s ext4_prealloc_space.pa_count
> ffff880268291d78
>  pa_count = {
>    counter = 1      ---> ****pa->pa_count
>  }
> ffff8813bb5f72f0
>  pa_count = {
>    counter = -30701
>  }
> 
> 
> crash> struct -xo  ext4_prealloc_space
> struct ext4_prealloc_space {
>   [0x0] struct list_head pa_inode_list;
>  [0x10] struct list_head pa_group_list;
>         union {
>             struct list_head pa_tmp_list;
>             struct callback_head pa_rcu;
>  [0x20] } u;
>  [0x30] spinlock_t pa_lock;
>  [0x34] atomic_t pa_count;
>  [0x38] unsigned int pa_deleted;
>  [0x40] ext4_fsblk_t pa_pstart;
>  [0x48] ext4_lblk_t pa_lstart;
>  [0x4c] ext4_grpblk_t pa_len;
>  [0x50] ext4_grpblk_t pa_free;
>  [0x54] unsigned short pa_type;
>  [0x58] spinlock_t *pa_obj_lock;
>  [0x60] struct inode *pa_inode;
> }
> SIZE: 0x68
> 
> 
> Before "goto repeat", it is necessary to check whether grp->bb_prealloc_list is empty.  This patch may fix it.
> Please kindly give us your comments and suggestions.
> Thanks.

This patch definitely looks more reasonable than the previous one, but
I don't think it is correct?

It isn't clear how the system could get into this state, or stay there.

If bb_prealloc_list is empty, then "busy" should be 0, since busy = 1
is only set inside "list_for_each_entry_safe(bb_prealloc_list)", and
that implies the list is *not* empty?  The ext4_lock_group() is held
over this code, so the list should not be changing in this case, and
even if the list changed, it _should_ only affect the loop one time.

> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 99bf091..8082e2d 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4290,7 +4290,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
>        free_total += free;
> 
>        /* if we still need more blocks and some PAs were used, try again */
> -       if (free_total < needed && busy) {
> +       if (free_total < needed && busy && !list_empty(&grp->bb_prealloc_list)) {
>                ext4_unlock_group(sb, group);
>                cond_resched();
>                busy = 0;
>                goto repeat;

Minor suggestion - moving "busy = 0" right after "repeat:" and removing
the "int busy = 0" initializer would make this code a bit more clear.

Cheers, Andreas




[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 873 bytes --]


* Re: [PATCH] ext4: add a configurable parameter to prevent endless loop in ext4_mb_discard_group_p
  2021-04-08 18:50 [PATCH] ext4: add a configurable parameter to prevent endless loop in ext4_mb_discard_group_p Wen Yang
  2021-04-08 22:11 ` Andreas Dilger
@ 2021-04-09  5:47 ` riteshh
  2021-04-09 10:18   ` Jan Kara
  2021-04-10 19:45   ` Wen Yang
  1 sibling, 2 replies; 8+ messages in thread
From: riteshh @ 2021-04-09  5:47 UTC (permalink / raw)
  To: Wen Yang; +Cc: adilger, tytso, linux-ext4, jack, linux-kernel, baoyou.xie

On 21/04/09 02:50AM, Wen Yang wrote:
> > On Apr 7, 2021, at 5:16 AM, riteshh <riteshh@linux.ibm.com> wrote:
> >>
> >> On 21/04/07 03:01PM, Wen Yang wrote:
> >>> From: Wen Yang <simon.wy@alibaba-inc.com>
> >>>
> >>> The kworker has occupied 100% of the CPU for several days:
> >>> PID USER  PR  NI VIRT RES SHR S  %CPU  %MEM TIME+  COMMAND
> >>> 68086 root 20 0  0    0   0   R  100.0 0.0  9718:18 kworker/u64:11
> >>>
> >>> And the stack obtained through sysrq is as follows:
> >>> [20613144.850426] task: ffff8800b5e08000 task.stack: ffffc9001342c000
> >>> [20613144.850438] Call Trace:
> >>> [20613144.850439] [<ffffffffa0244209>]ext4_mb_new_blocks+0x429/0x550
> [ext4]
> >>> [20613144.850439]  [<ffffffffa02389ae>] ext4_ext_map_blocks+0xb5e/0xf30
> [ext4]
> >>> [20613144.850441]  [<ffffffffa0204b52>] ext4_map_blocks+0x172/0x620
> [ext4]
> >>> [20613144.850442]  [<ffffffffa0208675>] ext4_writepages+0x7e5/0xf00
> [ext4]
> >>> [20613144.850443]  [<ffffffff811c487e>] do_writepages+0x1e/0x30
> >>> [20613144.850444]  [<ffffffff81280265>]
> __writeback_single_inode+0x45/0x320
> >>> [20613144.850444]  [<ffffffff81280ab2>] writeback_sb_inodes+0x272/0x600
> >>> [20613144.850445]  [<ffffffff81280ed2>] __writeback_inodes_wb+0x92/0xc0
> >>> [20613144.850445]  [<ffffffff81281238>] wb_writeback+0x268/0x300
> >>> [20613144.850446]  [<ffffffff812819f4>] wb_workfn+0xb4/0x380
> >>> [20613144.850447]  [<ffffffff810a5dc9>] process_one_work+0x189/0x420
> >>> [20613144.850447]  [<ffffffff810a60ae>] worker_thread+0x4e/0x4b0
> >>>
> >>> The cpu resources of the cloud server are precious, and the server
> >>> cannot be restarted after running for a long time, so a configuration
> >>> parameter is added to prevent this endless loop.
> >>
> >> Strange, if there is a endless loop here. Then I would definitely see
> >> if there is any accounting problem in pa->pa_count. Otherwise busy=1
> >> should not be set everytime. ext4_mb_show_pa() function may help debug
> this.
> >>
> >> If yes, then that means there always exists either a file preallocation
> >> or a group preallocation. Maybe it is possible, in some use case.
> >> Others may know of such use case, if any.
>
> > If this code is broken, then it doesn't make sense to me that we would
> > leave it in the "run forever" state after the patch, and require a sysfs
> > tunable to be set to have a properly working system?
>
> > Is there anything particularly strange about the workload/system that
> > might cause this?  Filesystem is very full, memory is very low, etc?
>
> Hi Ritesh and Andreas,
>
> Thank you for your reply. Since there is still a faulty machine, we have
> analyzed it again and found it is indeed a very special case:
>
>
> crash> struct ext4_group_info ffff8813bb5f72d0
> struct ext4_group_info {
>   bb_state = 0,
>   bb_free_root = {
>     rb_node = 0x0
>   },
>   bb_first_free = 1681,
>   bb_free = 0,

Not related to this issue, but the above two variables' values don't look
consistent.

>   bb_fragments = 0,
>   bb_largest_free_order = -1,
>   bb_prealloc_list = {
>     next = 0xffff880268291d78,
>     prev = 0xffff880268291d78     ---> *** The list is empty
>   },

Ok. So when you collected the dump this list was empty.

>   alloc_sem = {
>     count = {
>       counter = 0
>     },
>     wait_list = {
>       next = 0xffff8813bb5f7308,
>       prev = 0xffff8813bb5f7308
>     },
>     wait_lock = {
>       raw_lock = {
>         {
>           val = {
>             counter = 0
>           },
>           {
>             locked = 0 '\000',
>             pending = 0 '\000'
>           },
>           {
>             locked_pending = 0,
>             tail = 0
>           }
>         }
>       }
>     },
>     osq = {
>       tail = {
>         counter = 0
>       }
>     },
>     owner = 0x0
>   },
>   bb_counters = 0xffff8813bb5f7328
> }
> crash>
>
>
> crash> list 0xffff880268291d78  -l ext4_prealloc_space.pa_group_list -s

No point in doing this, I guess, since the list is empty anyway.
What you may be seeing below is some garbage data.

> ext4_prealloc_space.pa_count
> ffff880268291d78
>   pa_count = {
>     counter = 1      ---> ****pa->pa_count
>   }
> ffff8813bb5f72f0
>   pa_count = {
>     counter = -30701
>   }

I guess, since the list is empty and you are seeing garbage, the counter value
of the above node looks weird.

>
>
> crash> struct -xo  ext4_prealloc_space
> struct ext4_prealloc_space {
>    [0x0] struct list_head pa_inode_list;
>   [0x10] struct list_head pa_group_list;
>          union {
>              struct list_head pa_tmp_list;
>              struct callback_head pa_rcu;
>   [0x20] } u;
>   [0x30] spinlock_t pa_lock;
>   [0x34] atomic_t pa_count;
>   [0x38] unsigned int pa_deleted;
>   [0x40] ext4_fsblk_t pa_pstart;
>   [0x48] ext4_lblk_t pa_lstart;
>   [0x4c] ext4_grpblk_t pa_len;
>   [0x50] ext4_grpblk_t pa_free;
>   [0x54] unsigned short pa_type;
>   [0x58] spinlock_t *pa_obj_lock;
>   [0x60] struct inode *pa_inode;
> }
> SIZE: 0x68
>
>
> crash> rd 0xffff880268291d68 20
> ffff880268291d68:  ffff881822f8a4c8 ffff881822f8a4c8   ..."......."....
> ffff880268291d78:  ffff8813bb5f72f0 ffff8813bb5f72f0   .r_......r_.....
> ffff880268291d88:  0000000000001000 ffff880db2371000   ..........7.....
> ffff880268291d98:  0000000100000001 0000000000000000   ................
> ffff880268291da8:  0000000000029c39 0000001700000c41   9.......A.......
> ffff880268291db8:  ffff000000000016 ffff881822f8a4d8   ..........."....
> ffff880268291dc8:  ffff881822f8a268 ffff880268291af8   h.."......)h....
> ffff880268291dd8:  ffff880268291dd0 ffffea00069a07c0   ..)h............
> ffff880268291de8:  00000000004d5bb7 0000000000001000   .[M.............
> ffff880268291df8:  ffff8801a681f000 0000000000000000   ................

I am not sure what the intention behind the above data was.

>
>
>
> Before "goto repeat", it is necessary to check whether grp->bb_prealloc_list
> is empty.
> This patch may fix it.
> Please kindly give us your comments and suggestions.
> Thanks.
>
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 99bf091..8082e2d 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4290,7 +4290,7 @@ static void ext4_mb_new_preallocation(struct
> ext4_allocation_context *ac)
>         free_total += free;
>
>         /* if we still need more blocks and some PAs were used, try again */
> -       if (free_total < needed && busy) {
> +       if (free_total < needed && busy &&
> !list_empty(&grp->bb_prealloc_list)) {
>                 ext4_unlock_group(sb, group);
>                 cond_resched();
>                 busy = 0;
>
>

Not really, since we anyway check the busy variable too, which is only set when
the bb_prealloc_list is actually not empty and has some busy PAs in it.

What may explain this scenario is:
1. This writeback thread, which is trying to write the dirty data, tries to
   free up some blocks, but since free_total < needed and busy == 1,
   -> it releases the group lock (ext4_unlock_group()) and sets busy = 0.
   -> at this point something running in parallel may take the group lock and
   allocate those PAs which were released by this process.

2. The writeback thread comes back and checks whether the bb_prealloc_list is
   empty, and it is not, since some other thread has again allocated something
   from what this thread freed for itself.


Tell me -
1. How low was the free space when you hit this issue?
2. How big was your FS? How many groups?
3. Is there some background worker constantly running which is doing some
   allocations? Do you have a single-CPU system?
   On this, as soon as the writeback thread releases the group lock, the other
   process gets scheduled in, takes the lock and does some group preallocations
   from the same group from which the wb thread freed some blocks.
   And so the wb thread keeps looping.
4. Is this a real workload or is it some sort of simulated test?

Maybe if you explain your setup/environment above in more detail, that will help
in debugging why this writeback thread was constantly running/endlessly looping
for days.

-ritesh



* Re: [PATCH] ext4: add a configurable parameter to prevent endless loop in ext4_mb_discard_group_p
  2021-04-09  5:47 ` riteshh
@ 2021-04-09 10:18   ` Jan Kara
  2021-04-09 11:28     ` riteshh
  2021-04-10 19:45   ` Wen Yang
  1 sibling, 1 reply; 8+ messages in thread
From: Jan Kara @ 2021-04-09 10:18 UTC (permalink / raw)
  To: riteshh
  Cc: Wen Yang, adilger, tytso, linux-ext4, jack, linux-kernel, baoyou.xie

On Fri 09-04-21 11:17:33, riteshh wrote:
> On 21/04/09 02:50AM, Wen Yang wrote:
> > > On Apr 7, 2021, at 5:16 AM, riteshh <riteshh@linux.ibm.com> wrote:
> > >>
> > >> On 21/04/07 03:01PM, Wen Yang wrote:
> > >>> From: Wen Yang <simon.wy@alibaba-inc.com>
> > >>>
> > >>> The kworker has occupied 100% of the CPU for several days:
> > >>> PID USER  PR  NI VIRT RES SHR S  %CPU  %MEM TIME+  COMMAND
> > >>> 68086 root 20 0  0    0   0   R  100.0 0.0  9718:18 kworker/u64:11
> > >>>
> > >>> And the stack obtained through sysrq is as follows:
> > >>> [20613144.850426] task: ffff8800b5e08000 task.stack: ffffc9001342c000
> > >>> [20613144.850438] Call Trace:
> > >>> [20613144.850439] [<ffffffffa0244209>]ext4_mb_new_blocks+0x429/0x550
> > [ext4]
> > >>> [20613144.850439]  [<ffffffffa02389ae>] ext4_ext_map_blocks+0xb5e/0xf30
> > [ext4]
> > >>> [20613144.850441]  [<ffffffffa0204b52>] ext4_map_blocks+0x172/0x620
> > [ext4]
> > >>> [20613144.850442]  [<ffffffffa0208675>] ext4_writepages+0x7e5/0xf00
> > [ext4]
> > >>> [20613144.850443]  [<ffffffff811c487e>] do_writepages+0x1e/0x30
> > >>> [20613144.850444]  [<ffffffff81280265>]
> > __writeback_single_inode+0x45/0x320
> > >>> [20613144.850444]  [<ffffffff81280ab2>] writeback_sb_inodes+0x272/0x600
> > >>> [20613144.850445]  [<ffffffff81280ed2>] __writeback_inodes_wb+0x92/0xc0
> > >>> [20613144.850445]  [<ffffffff81281238>] wb_writeback+0x268/0x300
> > >>> [20613144.850446]  [<ffffffff812819f4>] wb_workfn+0xb4/0x380
> > >>> [20613144.850447]  [<ffffffff810a5dc9>] process_one_work+0x189/0x420
> > >>> [20613144.850447]  [<ffffffff810a60ae>] worker_thread+0x4e/0x4b0
> > >>>
> > >>> The cpu resources of the cloud server are precious, and the server
> > >>> cannot be restarted after running for a long time, so a configuration
> > >>> parameter is added to prevent this endless loop.
> > >>
> > >> Strange, if there is a endless loop here. Then I would definitely see
> > >> if there is any accounting problem in pa->pa_count. Otherwise busy=1
> > >> should not be set everytime. ext4_mb_show_pa() function may help debug
> > this.
> > >>
> > >> If yes, then that means there always exists either a file preallocation
> > >> or a group preallocation. Maybe it is possible, in some use case.
> > >> Others may know of such use case, if any.
> >
> > > If this code is broken, then it doesn't make sense to me that we would
> > > leave it in the "run forever" state after the patch, and require a sysfs
> > > tunable to be set to have a properly working system?
> >
> > > Is there anything particularly strange about the workload/system that
> > > might cause this?  Filesystem is very full, memory is very low, etc?
> >
> > Hi Ritesh and Andreas,
> >
> > Thank you for your reply. Since there is still a faulty machine, we have
> > analyzed it again and found it is indeed a very special case:
> >
> >
> > crash> struct ext4_group_info ffff8813bb5f72d0
> > struct ext4_group_info {
> >   bb_state = 0,
> >   bb_free_root = {
> >     rb_node = 0x0
> >   },
> >   bb_first_free = 1681,
> >   bb_free = 0,
> 
> Not related to this issue, but above two variables values doesn't looks
> consistent.
> 
> >   bb_fragments = 0,
> >   bb_largest_free_order = -1,
> >   bb_prealloc_list = {
> >     next = 0xffff880268291d78,
> >     prev = 0xffff880268291d78     ---> *** The list is empty
> >   },
> 
> Ok. So when you collected the dump this list was empty.

No, it is not empty. It has a single element. Note that the structure is at
ffff8813bb5f72d0 so the pointers would have to be like ffff8813bb5f7xxx.
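
(For reference, list_empty() just compares the head's next pointer against the
head itself -- a simplified sketch of the check from include/linux/list.h:)

static inline int list_empty(const struct list_head *head)
{
	/* empty only when the head points back at itself */
	return head->next == head;
}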

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR


* Re: [PATCH] ext4: add a configurable parameter to prevent endless loop in ext4_mb_discard_group_p
  2021-04-09 10:18   ` Jan Kara
@ 2021-04-09 11:28     ` riteshh
  0 siblings, 0 replies; 8+ messages in thread
From: riteshh @ 2021-04-09 11:28 UTC (permalink / raw)
  To: Jan Kara; +Cc: Wen Yang, adilger, tytso, linux-ext4, linux-kernel, baoyou.xie

On 21/04/09 12:18PM, Jan Kara wrote:
> On Fri 09-04-21 11:17:33, riteshh wrote:
> > On 21/04/09 02:50AM, Wen Yang wrote:
> > > > On Apr 7, 2021, at 5:16 AM, riteshh <riteshh@linux.ibm.com> wrote:
> > > >>
> > > >> On 21/04/07 03:01PM, Wen Yang wrote:
> > > >>> From: Wen Yang <simon.wy@alibaba-inc.com>
> > > >>>
> > > >>> The kworker has occupied 100% of the CPU for several days:
> > > >>> PID USER  PR  NI VIRT RES SHR S  %CPU  %MEM TIME+  COMMAND
> > > >>> 68086 root 20 0  0    0   0   R  100.0 0.0  9718:18 kworker/u64:11
> > > >>>
> > > >>> And the stack obtained through sysrq is as follows:
> > > >>> [20613144.850426] task: ffff8800b5e08000 task.stack: ffffc9001342c000
> > > >>> [20613144.850438] Call Trace:
> > > >>> [20613144.850439] [<ffffffffa0244209>]ext4_mb_new_blocks+0x429/0x550
> > > [ext4]
> > > >>> [20613144.850439]  [<ffffffffa02389ae>] ext4_ext_map_blocks+0xb5e/0xf30
> > > [ext4]
> > > >>> [20613144.850441]  [<ffffffffa0204b52>] ext4_map_blocks+0x172/0x620
> > > [ext4]
> > > >>> [20613144.850442]  [<ffffffffa0208675>] ext4_writepages+0x7e5/0xf00
> > > [ext4]
> > > >>> [20613144.850443]  [<ffffffff811c487e>] do_writepages+0x1e/0x30
> > > >>> [20613144.850444]  [<ffffffff81280265>]
> > > __writeback_single_inode+0x45/0x320
> > > >>> [20613144.850444]  [<ffffffff81280ab2>] writeback_sb_inodes+0x272/0x600
> > > >>> [20613144.850445]  [<ffffffff81280ed2>] __writeback_inodes_wb+0x92/0xc0
> > > >>> [20613144.850445]  [<ffffffff81281238>] wb_writeback+0x268/0x300
> > > >>> [20613144.850446]  [<ffffffff812819f4>] wb_workfn+0xb4/0x380
> > > >>> [20613144.850447]  [<ffffffff810a5dc9>] process_one_work+0x189/0x420
> > > >>> [20613144.850447]  [<ffffffff810a60ae>] worker_thread+0x4e/0x4b0
> > > >>>
> > > >>> The cpu resources of the cloud server are precious, and the server
> > > >>> cannot be restarted after running for a long time, so a configuration
> > > >>> parameter is added to prevent this endless loop.
> > > >>
> > > >> Strange, if there is a endless loop here. Then I would definitely see
> > > >> if there is any accounting problem in pa->pa_count. Otherwise busy=1
> > > >> should not be set everytime. ext4_mb_show_pa() function may help debug
> > > this.
> > > >>
> > > >> If yes, then that means there always exists either a file preallocation
> > > >> or a group preallocation. Maybe it is possible, in some use case.
> > > >> Others may know of such use case, if any.
> > >
> > > > If this code is broken, then it doesn't make sense to me that we would
> > > > leave it in the "run forever" state after the patch, and require a sysfs
> > > > tunable to be set to have a properly working system?
> > >
> > > > Is there anything particularly strange about the workload/system that
> > > > might cause this?  Filesystem is very full, memory is very low, etc?
> > >
> > > Hi Ritesh and Andreas,
> > >
> > > Thank you for your reply. Since there is still a faulty machine, we have
> > > analyzed it again and found it is indeed a very special case:
> > >
> > >
> > > crash> struct ext4_group_info ffff8813bb5f72d0
> > > struct ext4_group_info {
> > >   bb_state = 0,
> > >   bb_free_root = {
> > >     rb_node = 0x0
> > >   },
> > >   bb_first_free = 1681,
> > >   bb_free = 0,
> >
> > Not related to this issue, but above two variables values doesn't looks
> > consistent.
> >
> > >   bb_fragments = 0,
> > >   bb_largest_free_order = -1,
> > >   bb_prealloc_list = {
> > >     next = 0xffff880268291d78,
> > >     prev = 0xffff880268291d78     ---> *** The list is empty
> > >   },
> >
> > Ok. So when you collected the dump this list was empty.
>
> No, it is not empty. It has a single element. Note that the structure is at
> ffff8813bb5f72d0 so the pointers would have to be like ffff8813bb5f7xxx.

Errr, yes right. So the list is not empty.
But I guess the other arguments discussed in that mail should still be valid.

-ritesh



* Re: [PATCH] ext4: add a configurable parameter to prevent endless loop in ext4_mb_discard_group_p
  2021-04-09  5:47 ` riteshh
  2021-04-09 10:18   ` Jan Kara
@ 2021-04-10 19:45   ` Wen Yang
  2021-04-11  4:25     ` Theodore Ts'o
  1 sibling, 1 reply; 8+ messages in thread
From: Wen Yang @ 2021-04-10 19:45 UTC (permalink / raw)
  To: riteshh; +Cc: adilger, tytso, linux-ext4, jack, linux-kernel, baoyou.xie



On 2021/4/9 1:47 PM, riteshh wrote:
> On 21/04/09 02:50AM, Wen Yang wrote:
>>> On Apr 7, 2021, at 5:16 AM, riteshh <riteshh@linux.ibm.com> wrote:
>>>>
>>>> On 21/04/07 03:01PM, Wen Yang wrote:
>>>>> From: Wen Yang <simon.wy@alibaba-inc.com>
>>>>>
>>>>> The kworker has occupied 100% of the CPU for several days:
>>>>> PID USER  PR  NI VIRT RES SHR S  %CPU  %MEM TIME+  COMMAND
>>>>> 68086 root 20 0  0    0   0   R  100.0 0.0  9718:18 kworker/u64:11
>>>>>
>>>>> And the stack obtained through sysrq is as follows:
>>>>> [20613144.850426] task: ffff8800b5e08000 task.stack: ffffc9001342c000
>>>>> [20613144.850438] Call Trace:
>>>>> [20613144.850439] [<ffffffffa0244209>]ext4_mb_new_blocks+0x429/0x550
>> [ext4]
>>>>> [20613144.850439]  [<ffffffffa02389ae>] ext4_ext_map_blocks+0xb5e/0xf30
>> [ext4]
>>>>> [20613144.850441]  [<ffffffffa0204b52>] ext4_map_blocks+0x172/0x620
>> [ext4]
>>>>> [20613144.850442]  [<ffffffffa0208675>] ext4_writepages+0x7e5/0xf00
>> [ext4]
>>>>> [20613144.850443]  [<ffffffff811c487e>] do_writepages+0x1e/0x30
>>>>> [20613144.850444]  [<ffffffff81280265>]
>> __writeback_single_inode+0x45/0x320
>>>>> [20613144.850444]  [<ffffffff81280ab2>] writeback_sb_inodes+0x272/0x600
>>>>> [20613144.850445]  [<ffffffff81280ed2>] __writeback_inodes_wb+0x92/0xc0
>>>>> [20613144.850445]  [<ffffffff81281238>] wb_writeback+0x268/0x300
>>>>> [20613144.850446]  [<ffffffff812819f4>] wb_workfn+0xb4/0x380
>>>>> [20613144.850447]  [<ffffffff810a5dc9>] process_one_work+0x189/0x420
>>>>> [20613144.850447]  [<ffffffff810a60ae>] worker_thread+0x4e/0x4b0
>>>>>
>>>>> The cpu resources of the cloud server are precious, and the server
>>>>> cannot be restarted after running for a long time, so a configuration
>>>>> parameter is added to prevent this endless loop.
>>>>
>>>> Strange, if there is a endless loop here. Then I would definitely see
>>>> if there is any accounting problem in pa->pa_count. Otherwise busy=1
>>>> should not be set everytime. ext4_mb_show_pa() function may help debug
>> this.
>>>>
>>>> If yes, then that means there always exists either a file preallocation
>>>> or a group preallocation. Maybe it is possible, in some use case.
>>>> Others may know of such use case, if any.
>>
>>> If this code is broken, then it doesn't make sense to me that we would
>>> leave it in the "run forever" state after the patch, and require a sysfs
>>> tunable to be set to have a properly working system?
>>
>>> Is there anything particularly strange about the workload/system that
>>> might cause this?  Filesystem is very full, memory is very low, etc?
>>
>> Hi Ritesh and Andreas,
>>
>> Thank you for your reply. Since there is still a faulty machine, we have
>> analyzed it again and found it is indeed a very special case:
>>
>>
>> crash> struct ext4_group_info ffff8813bb5f72d0
>> struct ext4_group_info {
>>    bb_state = 0,
>>    bb_free_root = {
>>      rb_node = 0x0
>>    },
>>    bb_first_free = 1681,
>>    bb_free = 0,
> 
> Not related to this issue, but above two variables values doesn't looks
> consistent.
> 
>>    bb_fragments = 0,
>>    bb_largest_free_order = -1,
>>    bb_prealloc_list = {
>>      next = 0xffff880268291d78,
>>      prev = 0xffff880268291d78     ---> *** The list is empty
>>    },
> 
> Ok. So when you collected the dump this list was empty.
> 
>>    alloc_sem = {
>>      count = {
>>        counter = 0
>>      },
>>      wait_list = {
>>        next = 0xffff8813bb5f7308,
>>        prev = 0xffff8813bb5f7308
>>      },
>>      wait_lock = {
>>        raw_lock = {
>>          {
>>            val = {
>>              counter = 0
>>            },
>>            {
>>              locked = 0 '\000',
>>              pending = 0 '\000'
>>            },
>>            {
>>              locked_pending = 0,
>>              tail = 0
>>            }
>>          }
>>        }
>>      },
>>      osq = {
>>        tail = {
>>          counter = 0
>>        }
>>      },
>>      owner = 0x0
>>    },
>>    bb_counters = 0xffff8813bb5f7328
>> }
>> crash>
>>
>>
>> crash> list 0xffff880268291d78  -l ext4_prealloc_space.pa_group_list -s
> 
> No point of doing this I guess, since the list anyway is empty.
> What you may be seeing below is some garbage data.
> 
>> ext4_prealloc_space.pa_count
>> ffff880268291d78
>>    pa_count = {
>>      counter = 1      ---> ****pa->pa_count
>>    }
>> ffff8813bb5f72f0
>>    pa_count = {
>>      counter = -30701
>>    }
> 
> I guess, since list is empty and you are seeing garbage hence counter value
> of above node looks weird.
> 
>>
>>
>> crash> struct -xo  ext4_prealloc_space
>> struct ext4_prealloc_space {
>>     [0x0] struct list_head pa_inode_list;
>>    [0x10] struct list_head pa_group_list;
>>           union {
>>               struct list_head pa_tmp_list;
>>               struct callback_head pa_rcu;
>>    [0x20] } u;
>>    [0x30] spinlock_t pa_lock;
>>    [0x34] atomic_t pa_count;
>>    [0x38] unsigned int pa_deleted;
>>    [0x40] ext4_fsblk_t pa_pstart;
>>    [0x48] ext4_lblk_t pa_lstart;
>>    [0x4c] ext4_grpblk_t pa_len;
>>    [0x50] ext4_grpblk_t pa_free;
>>    [0x54] unsigned short pa_type;
>>    [0x58] spinlock_t *pa_obj_lock;
>>    [0x60] struct inode *pa_inode;
>> }
>> SIZE: 0x68
>>
>>
>> crash> rd 0xffff880268291d68 20
>> ffff880268291d68:  ffff881822f8a4c8 ffff881822f8a4c8   ..."......."....
>> ffff880268291d78:  ffff8813bb5f72f0 ffff8813bb5f72f0   .r_......r_.....
>> ffff880268291d88:  0000000000001000 ffff880db2371000   ..........7.....
>> ffff880268291d98:  0000000100000001 0000000000000000   ................
>> ffff880268291da8:  0000000000029c39 0000001700000c41   9.......A.......
>> ffff880268291db8:  ffff000000000016 ffff881822f8a4d8   ..........."....
>> ffff880268291dc8:  ffff881822f8a268 ffff880268291af8   h.."......)h....
>> ffff880268291dd8:  ffff880268291dd0 ffffea00069a07c0   ..)h............
>> ffff880268291de8:  00000000004d5bb7 0000000000001000   .[M.............
>> ffff880268291df8:  ffff8801a681f000 0000000000000000   ................
> 
> I am not sure what was intention behind above data.
> 

The above is the original data of the ext4_prealloc_space (0xffff880268291d78 - 0x10). After formatting, it is as follows:

crash> struct ext4_prealloc_space  0xffff880268291d68
struct ext4_prealloc_space {
   pa_inode_list = {
     next = 0xffff881822f8a4c8,
     prev = 0xffff881822f8a4c8
   },
   pa_group_list = {
     next = 0xffff8813bb5f72f0,
     prev = 0xffff8813bb5f72f0
   },
   u = {
     pa_tmp_list = {
       next = 0x1000,
       prev = 0xffff880db2371000
     },
     pa_rcu = {
       next = 0x1000,
       func = 0xffff880db2371000
     }
   },
   pa_lock = {
     {
       rlock = {
         raw_lock = {
           {
             val = {
               counter = 0
             },
             {
               locked = 0 '\000',
               pending = 0 '\000'
             },
             {
               locked_pending = 0,
               tail = 0
             }
           }
         }
       }
     }
   },
   pa_count = {
     counter = 1
   },
   pa_deleted = 0,
   pa_pstart = 171065,
   pa_lstart = 3137,
   pa_len = 23,
   pa_free = 22,
   pa_type = 0,
   pa_obj_lock = 0xffff881822f8a4d8,
   pa_inode = 0xffff881822f8a268
}


It is valid data, which can be verified via pa_inode:
crash>  kmem 0xffff881822f8a268
CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
ffff881fa6fd4a00     1064      95450    196620   6554    32k  ext4_inode_cache
   SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
   ffffea00608be200  ffff881822f88000     0     30         27     3
   FREE / [ALLOCATED]
   [ffff881822f8a180]

We further analyzed the pa (0xffff880268291d68) and found that another kworker
is indeed using it:


ffff881822f8a4c8: ffff880268291d68

crash> kmem ffff881822f8a4c8
CACHE             OBJSIZE  ALLOCATED     TOTAL  SLABS  SSIZE  NAME
ffff881fa6fd4a00     1064      95450    196620   6554    32k  ext4_inode_cache
   SLAB              MEMORY            NODE  TOTAL  ALLOCATED  FREE
   ffffea00608be200  ffff881822f88000     0     30         27     3
   FREE / [ALLOCATED]
   [ffff881822f8a180]

crash> struct ext4_inode_info.i_prealloc_list ffff881822f8a180
   i_prealloc_list = {
     next = 0xffff880268291d68,
     prev = 0xffff880268291d68
   }


PID: 15140  TASK: ffff88004d6dc300  CPU: 16  COMMAND: "kworker/u64:1"
  #0 [ffffc900273e7518] __schedule at ffffffff8173ca3b
  #1 [ffffc900273e75a0] schedule at ffffffff8173cfb6
  #2 [ffffc900273e75b8] io_schedule at ffffffff810bb75a
  #3 [ffffc900273e75e0] bit_wait_io at ffffffff8173d8d1
  #4 [ffffc900273e75f8] __wait_on_bit_lock at ffffffff8173d4e9
  #5 [ffffc900273e7638] out_of_line_wait_on_bit_lock at ffffffff8173d742
  #6 [ffffc900273e76b0] __lock_buffer at ffffffff81288c32
  #7 [ffffc900273e76c8] do_get_write_access at ffffffffa00dd177 [jbd2]
  #8 [ffffc900273e7728] jbd2_journal_get_write_access at ffffffffa00dd3a3 [jbd2]
  #9 [ffffc900273e7750] __ext4_journal_get_write_access at ffffffffa023b37b [ext4]
 #10 [ffffc900273e7788] ext4_mb_mark_diskspace_used at ffffffffa0242a0b [ext4]
     ffffc900273e7790: ffff880f80618000 ffff881822f8a4c8 ----> here
     ffffc900273e77a0: ffffc900273e77e8 00000000a023e517
     ffffc900273e77b0: ffff8800436a6140 16abbde9a386f6c8
     ffffc900273e77c0: ffff880f8061c800 0000000000000000
     ffffc900273e77d0: ffffc900273e78bc ffff880f80618000
     ffffc900273e77e0: ffffc900273e78e0 ffffc900273e7858
     ffffc900273e77f0: ffffffffa0244100

#11 [ffffc900273e77f0] ext4_mb_new_blocks at ffffffffa0244100 [ext4]
#12 [ffffc900273e7860] ext4_ext_map_blocks at ffffffffa02389ae [ext4]
#13 [ffffc900273e7950] ext4_map_blocks at ffffffffa0204b52 [ext4]
#14 [ffffc900273e79d0] ext4_writepages at ffffffffa0208675 [ext4]
#15 [ffffc900273e7b30] do_writepages at ffffffff811c487e
#16 [ffffc900273e7b40] __writeback_single_inode at ffffffff81280265
#17 [ffffc900273e7b90] writeback_sb_inodes at ffffffff81280ab2
#18 [ffffc900273e7c90] __writeback_inodes_wb at ffffffff81280ed2
#19 [ffffc900273e7cd8] wb_writeback at ffffffff81281238
#20 [ffffc900273e7d80] wb_workfn at ffffffff812819f4
#21 [ffffc900273e7e18] process_one_work at ffffffff810a5dc9
#22 [ffffc900273e7e60] worker_thread at ffffffff810a60ae
#23 [ffffc900273e7ec0] kthread at ffffffff810ac696
#24 [ffffc900273e7f50] ret_from_fork at ffffffff81741dd9


At this time, some logs are lost. It is suspected that the hard disk itself is
faulty, so the I/O that this kworker is waiting on never completes; since that
thread appears to still hold a reference on the pa (pa_count = 1), the discard
loop in the writeback thread keeps seeing busy = 1 and spins forever.

There are many hard disks on our server. Perhaps we should not occupy 100% of
a CPU for a long time just because one hard disk fails.

Please check the patch below again, thanks

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 99bf091..de423234 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -351,6 +351,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
  						ext4_group_t group);
  static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);

+static inline void ext4_mb_show_pa(struct super_block *sb);
+
  /*
   * The algorithm using this percpu seq counter goes below:
   * 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -4211,7 +4213,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  	struct list_head list;
  	struct ext4_buddy e4b;
  	int err;
-	int busy = 0;
+	int busy;
  	int free, free_total = 0;

  	mb_debug(sb, "discard preallocation for group %u\n", group);
@@ -4240,6 +4242,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)

  	INIT_LIST_HEAD(&list);
  repeat:
+	busy = 0;
  	free = 0;
  	ext4_lock_group(sb, group);
  	list_for_each_entry_safe(pa, tmp,
@@ -4248,6 +4251,8 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  		if (atomic_read(&pa->pa_count)) {
  			spin_unlock(&pa->pa_lock);
  			busy = 1;
+			mb_debug(sb, "used pa while discarding for group %u\n", group);
+			ext4_mb_show_pa(sb);
  			continue;
  		}
  		if (pa->pa_deleted) {
@@ -4292,8 +4297,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  	/* if we still need more blocks and some PAs were used, try again */
  	if (free_total < needed && busy) {
  		ext4_unlock_group(sb, group);
-		cond_resched();
-		busy = 0;
+		schedule_timeout_uninterruptible(HZ/10);
  		goto repeat;
  	}
  	ext4_unlock_group(sb, group);


--

>>
>>
>>
>> Before "goto repeat", it is necessary to check whether grp->bb_prealloc_list
>> is empty.
>> This patch may fix it.
>> Please kindly give us your comments and suggestions.
>> Thanks.
>>
>>
>> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
>> index 99bf091..8082e2d 100644
>> --- a/fs/ext4/mballoc.c
>> +++ b/fs/ext4/mballoc.c
>> @@ -4290,7 +4290,7 @@ static void ext4_mb_new_preallocation(struct
>> ext4_allocation_context *ac)
>>          free_total += free;
>>
>>          /* if we still need more blocks and some PAs were used, try again */
>> -       if (free_total < needed && busy) {
>> +       if (free_total < needed && busy &&
>> !list_empty(&grp->bb_prealloc_list)) {
>>                  ext4_unlock_group(sb, group);
>>                  cond_resched();
>>                  busy = 0;
>>
>>
> 
> Not really. Since we anyway check busy variable too. Which is only set when the
> bb_prealloc_list is actually not empty and have some busy pa's in it.
> 
> What may explain this scenario is.
> 1. That this writeback thread which is trying to write the dirty data, tries to
>     free up some blocks but since the free_total < needed and busy == 1,
>     -> so it released the group lock (ext4_unlock_group()) and make busy = 0.
>     -> at this point there could be something running in parallel which may take
>     the group lock and allocate those PAs which were released by this process.
> 
> 2. writeback thread again comes and tries to check if the bb_prealloc_list is
>     empty and it is not since some other thread again allocated something which
>     this guys freed for itself.
> 
> 
> Tell me -
> 1. How low was free space when you hit this issue.
> 2. How big was your FS? How many groups?
> 3. Is there some backgroud worker constantly running who is doing some
>     allocations. Do you have a single cpu system?
>     On this as soon writeback thread release the group lock, the other process
>     gets schedule in, takes the lock and does some group preallocations from the
>     same group from which wb thread freed some blocks.
>     And so wb thread keeps looping.
> 4. Is this a real workload or is it some sort of simulated tests?
> 

This is a real workload.
More detailed information is still being collected. At present, we can only
analyze it based on the vmcore.


--
Best wishes,
Wen


> Maybe if you explain your above setup/environment better, that will help in
> debugging on why this writeback thread was constantly running/enlessly-looping
> for days
> 
> -ritesh
> 


* Re: [PATCH] ext4: add a configurable parameter to prevent endless loop in ext4_mb_discard_group_p
  2021-04-10 19:45   ` Wen Yang
@ 2021-04-11  4:25     ` Theodore Ts'o
  2021-04-15  7:53       ` Wen Yang
  0 siblings, 1 reply; 8+ messages in thread
From: Theodore Ts'o @ 2021-04-11  4:25 UTC (permalink / raw)
  To: Wen Yang; +Cc: riteshh, adilger, linux-ext4, jack, linux-kernel, baoyou.xie

On Sun, Apr 11, 2021 at 03:45:01AM +0800, Wen Yang wrote:
> At this time, some logs are lost. It is suspected that the hard disk itself
> is faulty.

If you have a kernel crash dump, that means you can extract out the
dmesg buffer, correct?  Is there any I/O error messages in the kernel
log?

What is the basis of the suspicion that the hard drive is faulty?
Kernel dmesg output?  Error reporting from smartctl?

> There are many hard disks on our server. Maybe we should not occupy 100% CPU
> for a long time just because one hard disk fails.

It depends on the nature of the hard drive failure.  How is it
failing?

One thing which we do need to be careful about is when focusing on how
to prevent a failure caused by some particular (potentially extreme)
scenarios, that we don't cause problems on more common scenarios (for
example a heavily loaded server, and/or a case where the file system
is almost full where we have multiple files "fighting" over a small
number of free blocks).

In general, my attitude is that the best way to protect against hard
drive failures is to have processes which are monitoring the health of
the system, and if there is evidence of a failed drive, that we
immediately kill all jobs which are relying on that drive (which we
call "draining" a particular drive), and/or if a sufficiently large
percentage of the drives have failed, or the machine can no longer do
its job, to automatically move all of those jobs to other servers
(e.g., "drain" the server), and then send the machine to some kind of
data center repair service, where the failed hard drives can be
replaced.

I'm skeptical of attempts to try to make the file system to somehow
continue to be able to "work" in the face of hard drive failures,
since failures can be highly atypical, and what might work well in one
failure scenario might be catastrophic in another.  It's especially
problematic if the HDD is not explicitly signalling an error condition,
but rather being slow (because it's doing a huge number of retries),
or the HDD is returning data which is simply different from what was
previously written.  The best we can do in that case is to detect that
something is wrong (this is where metadata checksums would be very
helpful), and then either remount the file system r/o, or panic the
machine, and/or signal to userspace that some particular file system
should be drained.

Cheers,

						- Ted


* Re: [PATCH] ext4: add a configurable parameter to prevent endless loop in ext4_mb_discard_group_p
  2021-04-11  4:25     ` Theodore Ts'o
@ 2021-04-15  7:53       ` Wen Yang
  0 siblings, 0 replies; 8+ messages in thread
From: Wen Yang @ 2021-04-15  7:53 UTC (permalink / raw)
  To: Theodore Ts'o
  Cc: riteshh, adilger, linux-ext4, jack, linux-kernel, baoyou.xie



On 2021/4/11 12:25 PM, Theodore Ts'o wrote:
> On Sun, Apr 11, 2021 at 03:45:01AM +0800, Wen Yang wrote:
>> At this time, some logs are lost. It is suspected that the hard disk itself
>> is faulty.
> 
> If you have a kernel crash dump, that means you can extract out the
> dmesg buffer, correct?  Is there any I/O error messages in the kernel
> log?
> 
> What is the basis of the suspicion that the hard drive is faulty?
> Kernel dmesg output?  Error reporting from smartctl?
> 

Hello, we are using a Bare-metal Cloud server
(https://www.semanticscholar.org/paper/High-density-Multi-tenant-Bare-metal-Cloud-Zhang-Zheng/ab1b5f0743816c8cb7188019d844ff3f7d565d9f/figure/3),
so there is no error log in dmesg or smartctl, and we have to check it in the
Bm-hypervisor. We finally found that the I/O processing process on the
Bm-hypervisor is indeed abnormal.


>> There are many hard disks on our server. Maybe we should not occupy 100% CPU
>> for a long time just because one hard disk fails.
> 
> It depends on the nature of the hard drive failure.  How is it
> failing?
> 
> One thing which we do need to be careful about is when focusing on how
> to prevent a failure caused by some particular (potentially extreme)
> scenarios, that we don't cause problems on more common scenarios (for
> example a heavily loaded server, and/or a case where the file system
> is almost full where we have multiple files "fighting" over a small
> number of free blocks).
> 
> In general, my attitude is that the best way to protect against hard
> drive failures is to have processes which are monitoring the health of
> the system, and if there is evidence of a failed drive, that we
> immediately kill all jobs which are relying on that drive (which we
> call "draining" a particular drive), and/or if a sufficiently large
> percentage of the drives have failed, or the machine can no longer do
> its job, to automatically move all of those jobs to other servers
> (e.g., "drain" the server), and then send the machine to some kind of
> data center repair service, where the failed hard drives can be
> replaced.
> 
> I'm skeptical of attempts to try to make the file system to somehow
> continue to be able to "work" in the face of hard drive failures,
> since failures can be highly atypical, and what might work well in one
> failure scenario might be catastrophic in another.  It's especially
> problematic if the HDD is not explcitly signalling an error condition,
> but rather being slow (because it's doing a huge number of retries),
> or the HDD is returning data which is simply different from what was
> previously written.  The best we can do in that case is to detect that
> something is wrong (this is where metadata checksums would be very
> helpful), and then either remount the file system r/o, or panic the
> machine, and/or signal to userspace that some particular file system
> should be drained.
> 

Thank you.
We generally separate the physical disks: one system disk and several business
disks. The Linux kernel runs on the system disk, and the various services run
on the business disks. In this way, even if a business disk has a problem, it
will not affect the entire system.


But the current implementation of mballoc may cause 100% of a CPU to be
occupied for a long time, so could we optimize it slightly, as follows:

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index a02fadf..c73f212 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -351,6 +351,8 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
  						ext4_group_t group);
  static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);

+static inline void ext4_mb_show_pa(struct super_block *sb);
+
  /*
   * The algorithm using this percpu seq counter goes below:
   * 1. We sample the percpu discard_pa_seq counter before trying for block
@@ -4217,9 +4219,9 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  	struct ext4_prealloc_space *pa, *tmp;
  	struct list_head list;
  	struct ext4_buddy e4b;
+	int free_total = 0;
+	int busy, free;
  	int err;
-	int busy = 0;
-	int free, free_total = 0;

  	mb_debug(sb, "discard preallocation for group %u\n", group);
  	if (list_empty(&grp->bb_prealloc_list))
@@ -4247,6 +4249,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)

  	INIT_LIST_HEAD(&list);
  repeat:
+	busy = 0;
  	free = 0;
  	ext4_lock_group(sb, group);
  	list_for_each_entry_safe(pa, tmp,
@@ -4255,6 +4258,8 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  		if (atomic_read(&pa->pa_count)) {
  			spin_unlock(&pa->pa_lock);
  			busy = 1;
+			mb_debug(sb, "used pa while discarding for group %u\n", group);
+			ext4_mb_show_pa(sb);
  			continue;
  		}
  		if (pa->pa_deleted) {
@@ -4299,8 +4304,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
  	/* if we still need more blocks and some PAs were used, try again */
  	if (free_total < needed && busy) {
  		ext4_unlock_group(sb, group);
-		cond_resched();
-		busy = 0;
+		schedule_timeout_uninterruptible(HZ/100);
  		goto repeat;
  	}
  	ext4_unlock_group(sb, group);



--
Best wishes,
Wen



