* Slow initial resync in RAID6 with 36 SAS drives
@ 2021-08-19  9:28 Marcin Wanat
  2021-08-25 10:06 ` Marcin Wanat
  0 siblings, 1 reply; 9+ messages in thread
From: Marcin Wanat @ 2021-08-19  9:28 UTC (permalink / raw)
  To: linux-raid

Sorry, this will be a long email with everything I find relevant.
I have an mdraid6 array with 36 SAS HDDs, each able to do
>200MB/s, but I am unable to get more than 38MB/s resync speed on a
fast system (48 cores/96GB RAM) with no other load.

Here are all details:

kernel 5.13.11-1.el8.elrepo.x86_64

# mdadm --detail /dev/md0
/dev/md0:
           Version : 1.2
     Creation Time : Tue Aug 17 09:37:39 2021
        Raid Level : raid6
        Array Size : 464838634496 (432.91 TiB 475.99 TB)
     Used Dev Size : 13671724544 (12.73 TiB 14.00 TB)
      Raid Devices : 36
     Total Devices : 36
       Persistence : Superblock is persistent

     Intent Bitmap : Internal

       Update Time : Thu Aug 19 09:26:33 2021
             State : clean, resyncing
    Active Devices : 36
   Working Devices : 36
    Failed Devices : 0
     Spare Devices : 0

            Layout : left-symmetric
        Chunk Size : 512K

Consistency Policy : bitmap

     Resync Status : 49% complete

              Name : large1:0  (local to host large1)
              UUID : b7cace22:832e570f:eba39768:bb1a1ed6
            Events : 40702

    Number   Major   Minor   RaidDevice State
       0       8       33        0      active sync   /dev/sdc1
       1       8       49        1      active sync   /dev/sdd1
       2       8       65        2      active sync   /dev/sde1
       3       8       81        3      active sync   /dev/sdf1
       4       8       97        4      active sync   /dev/sdg1
       5       8      113        5      active sync   /dev/sdh1
       6       8      129        6      active sync   /dev/sdi1
       7       8      145        7      active sync   /dev/sdj1
       8       8      161        8      active sync   /dev/sdk1
       9       8      209        9      active sync   /dev/sdn1
      10       8      177       10      active sync   /dev/sdl1
      11       8      225       11      active sync   /dev/sdo1
      12       8      241       12      active sync   /dev/sdp1
      13      65        1       13      active sync   /dev/sdq1
      14      65       17       14      active sync   /dev/sdr1
      15       8      193       15      active sync   /dev/sdm1
      16      65      145       16      active sync   /dev/sdz1
      17      65      161       17      active sync   /dev/sdaa1
      18      65       33       18      active sync   /dev/sds1
      19      65       49       19      active sync   /dev/sdt1
      20      65       65       20      active sync   /dev/sdu1
      21      65       81       21      active sync   /dev/sdv1
      22      65       97       22      active sync   /dev/sdw1
      23      65      113       23      active sync   /dev/sdx1
      24      65      129       24      active sync   /dev/sdy1
      25      65      177       25      active sync   /dev/sdab1
      26      65      193       26      active sync   /dev/sdac1
      27      65      209       27      active sync   /dev/sdad1
      28      65      225       28      active sync   /dev/sdae1
      29      65      241       29      active sync   /dev/sdaf1
      30      66        1       30      active sync   /dev/sdag1
      31      66       17       31      active sync   /dev/sdah1
      32      66       33       32      active sync   /dev/sdai1
      33      66       49       33      active sync   /dev/sdaj1
      34      66       65       34      active sync   /dev/sdak1
      35      66       81       35      active sync   /dev/sdal1


# cat /proc/mdstat

md0 : active raid6 sdal1[35] sdak1[34] sdaj1[33] sdah1[31] sdai1[32]
sdag1[30] sdaf1[29] sdac1[26] sdae1[28] sdab1[25] sdad1[27] sds1[18]
sdq1[13] sdz1[16] sdo1[11] sdp1[12] sdx1[23] sdr1[14] sdw1[22] sdn1[9]
sdaa1[17] sdv1[21] sdu1[20] sdy1[24] sdt1[19] sdk1[8] sdm1[15]
sdl1[10] sdh1[5] sdj1[7] sdf1[3] sdi1[6] sdc1[0] sdg1[4] sde1[2]
sdd1[1]
      464838634496 blocks super 1.2 level 6, 512k chunk, algorithm 2 [36/36] [UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU]
      [=========>...........]  resync = 49.5% (6773640400/13671724544) finish=3026.3min speed=37987K/sec
      bitmap: 53/102 pages [212KB], 65536KB chunk

# iostat -dx 5

Device            r/s     w/s     rkB/s     wkB/s   rrqm/s   wrqm/s %rrqm  %wrqm r_await w_await aqu-sz rareq-sz wareq-sz  svctm  %util
sdc           9738.60    1.40  38956.00      5.80     0.40     0.40 0.00  22.22    0.20    9.29   1.93     4.00     4.14   0.07  71.82
sdd           9738.20    1.00  38952.80      2.60     0.00     0.00 0.00   0.00    0.89    5.80   8.68     4.00     2.60   0.07  71.60
sde           9738.60    1.40  38956.00      5.80     0.40     0.40 0.00  22.22    0.31    3.71   3.02     4.00     4.14   0.07  70.60
sdf           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.17    3.20   1.69     4.00     2.60   0.07  70.56
sdg           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.85    4.20   8.31     4.00     2.60   0.07  70.72
sdh           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.20    4.00   1.93     4.00     2.60   0.07  70.64
sdi           9738.60    1.00  38954.40      2.60     0.00     0.00 0.00   0.00    0.17    8.20   1.70     4.00     2.60   0.07  70.98
sdj           9714.60    1.00  38954.40      2.60    24.00     0.00 0.25   0.00    0.58    4.00   5.61     4.01     2.60   0.07  70.66
sdk           9677.00    1.00  38953.60      2.60    61.40     0.00 0.63   0.00    1.23    4.40  11.94     4.03     2.60   0.07  70.76
sdl           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.15    5.80   1.44     4.00     2.60   0.07  70.76
sdm           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.38    2.80   3.73     4.00     2.60   0.07  70.96
sdo           9705.60    1.00  38953.60      2.60    32.80     0.00 0.34   0.00    0.83    5.80   8.07     4.01     2.60   0.07  70.80
sdp           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.30    4.20   2.91     4.00     2.60   0.07  70.60
sdn           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.34    5.60   3.30     4.00     2.60   0.07  70.76
sdt           9659.80    1.00  38954.40      2.60    78.80     0.00 0.81   0.00    1.00    4.00   9.71     4.03     2.60   0.07  70.44
sds           9640.40    1.00  38954.40      2.60    98.20     0.00 1.01   0.00    1.29    5.60  12.42     4.04     2.60   0.07  70.60
sdq           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.30    4.40   2.92     4.00     2.60   0.07  70.68
sdu           9738.60    1.00  38954.40      2.60     0.00     0.00 0.00   0.00    0.13    4.40   1.31     4.00     2.60   0.07  70.66
sdv           9696.20    1.00  38954.40      2.60    42.40     0.00 0.44   0.00    1.30    4.20  12.57     4.02     2.60   0.07  70.76
sdw           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.94    4.20   9.13     4.00     2.60   0.07  70.70
sdy           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.11    4.40   1.05     4.00     2.60   0.07  70.62
sdr           9730.80    1.00  38953.60      2.60     7.60     0.00 0.08   0.00    1.22    4.20  11.87     4.00     2.60   0.07  70.68
sdx           9718.00    1.00  38954.40      2.60    20.60     0.00 0.21   0.00    0.88    4.20   8.57     4.01     2.60   0.07  70.70
sdaa          9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.24    4.20   2.38     4.00     2.60   0.07  70.60
sdz           9738.40    1.00  38953.60      2.60     0.00     0.00 0.00   0.00    0.20    4.20   1.91     4.00     2.60   0.07  70.60
sdab          9633.60    1.00  38953.60      2.60   104.80     0.00 1.08   0.00    1.38    4.20  13.33     4.04     2.60   0.07  70.52
sdac          9639.20    1.00  38954.40      2.60    99.40     0.00 1.02   0.00    1.08    5.60  10.45     4.04     2.60   0.07  70.56
sdad          9536.20    1.00  38954.40      2.60   202.40     0.00 2.08   0.00    2.73    4.00  26.04     4.08     2.60   0.07  70.36
sdaf          9738.60    1.00  38954.40      2.60     0.00     0.00 0.00   0.00    0.37    4.00   3.63     4.00     2.60   0.07  70.64
sdae          9738.60    1.00  38954.40      2.60     0.00     0.00 0.00   0.00    0.16    5.40   1.61     4.00     2.60   0.07  70.72
sdag          9735.20    1.00  38940.80      2.60     0.00     0.00 0.00   0.00    0.46    5.80   4.48     4.00     2.60   0.07  70.76
sdai          9738.60    1.00  38954.40      2.60     0.00     0.00 0.00   0.00    0.31    4.00   3.01     4.00     2.60   0.07  70.60
sdah          9661.60    1.00  38955.20      2.60    77.00     0.00 0.79   0.00    1.51    4.20  14.57     4.03     2.60   0.07  70.70
sdal          9739.20    1.40  38958.40      5.80     0.40     0.40 0.00  22.22    0.27    4.86   2.65     4.00     4.14   0.07  70.80
sdaj          9738.60    1.00  38954.40      2.60     0.00     0.00 0.00   0.00    0.17    4.40   1.68     4.00     2.60   0.07  70.64
sdak          9738.80    1.00  38955.20      2.60     0.00     0.00 0.00   0.00    0.53    5.40   5.21     4.00     2.60   0.07  70.80

# blockdev --report
RO    RA   SSZ   BSZ   StartSec            Size   Device
rw  8192   512  4096          0    480103981056   /dev/sda
rw  8192   512  4096       2048       535822336   /dev/sda1
rw  8192   512  4096    1048576       536870912   /dev/sda2
rw  8192   512  4096    2097152    447569985536   /dev/sda3
rw  8192   512  4096  876257280     31457280000   /dev/sda4
rw  8192   512  4096          0    480103981056   /dev/sdb
rw  8192   512   512       2048       535822336   /dev/sdb1
rw  8192   512  4096    1048576       536870912   /dev/sdb2
rw  8192   512  4096    2097152    447569985536   /dev/sdb3
rw  8192   512  4096  876257280     31457280000   /dev/sdb4
rw  8192   512   512  937698992         2080256   /dev/sdb5
rw  8192   512  4096          0  14000519643136   /dev/sdc
rw  8192   512   512       2048  13999981706752   /dev/sdc1
rw  8192   512  4096          0  14000519643136   /dev/sdd
rw  8192   512   512       2048  13999981706752   /dev/sdd1
rw  8192   512  4096          0  14000519643136   /dev/sde
rw  8192   512   512       2048  13999981706752   /dev/sde1
rw  8192   512  4096          0  14000519643136   /dev/sdf
rw  8192   512   512       2048  13999981706752   /dev/sdf1
rw  8192   512  4096          0  14000519643136   /dev/sdg
rw  8192   512   512       2048  13999981706752   /dev/sdg1
rw  8192   512  4096          0  14000519643136   /dev/sdh
rw  8192   512   512       2048  13999981706752   /dev/sdh1
rw  8192   512  4096          0  14000519643136   /dev/sdi
rw  8192   512   512       2048  13999981706752   /dev/sdi1
rw  8192   512  4096          0  14000519643136   /dev/sdj
rw  8192   512   512       2048  13999981706752   /dev/sdj1
rw  8192   512  4096          0       536281088   /dev/md2
rw  8192   512  4096          0  14000519643136   /dev/sdk
rw  8192   512   512       2048  13999981706752   /dev/sdk1
rw  8192   512  4096          0  14000519643136   /dev/sdl
rw  8192   512   512       2048  13999981706752   /dev/sdl1
rw  8192   512  4096          0    447435767808   /dev/md3
rw  8192   512  4096          0  14000519643136   /dev/sdm
rw  8192   512   512       2048  13999981706752   /dev/sdm1
rw 69632   512  4096          0 475994761723904   /dev/md0
rw  8192   512  4096          0  14000519643136   /dev/sdo
rw  8192   512   512       2048  13999981706752   /dev/sdo1
rw  8192   512  4096          0  14000519643136   /dev/sdp
rw  8192   512   512       2048  13999981706752   /dev/sdp1
rw  8192   512  4096          0  14000519643136   /dev/sdn
rw  8192   512   512       2048  13999981706752   /dev/sdn1
rw  8192   512  4096          0  14000519643136   /dev/sdt
rw  8192   512   512       2048  13999981706752   /dev/sdt1
rw  8192   512  4096          0  14000519643136   /dev/sds
rw  8192   512   512       2048  13999981706752   /dev/sds1
rw  8192   512  4096          0  14000519643136   /dev/sdq
rw  8192   512   512       2048  13999981706752   /dev/sdq1
rw  8192   512  4096          0  14000519643136   /dev/sdu
rw  8192   512   512       2048  13999981706752   /dev/sdu1
rw  8192   512  4096          0  14000519643136   /dev/sdv
rw  8192   512   512       2048  13999981706752   /dev/sdv1
rw  8192   512  4096          0  14000519643136   /dev/sdw
rw  8192   512   512       2048  13999981706752   /dev/sdw1
rw  8192   512  4096          0  14000519643136   /dev/sdy
rw  8192   512   512       2048  13999981706752   /dev/sdy1
rw  8192   512  4096          0  14000519643136   /dev/sdr
rw  8192   512   512       2048  13999981706752   /dev/sdr1
rw  8192   512  4096          0  14000519643136   /dev/sdx
rw  8192   512   512       2048  13999981706752   /dev/sdx1
rw  8192   512  4096          0  14000519643136   /dev/sdaa
rw  8192   512   512       2048  13999981706752   /dev/sdaa1
rw  8192   512  4096          0  14000519643136   /dev/sdz
rw  8192   512   512       2048  13999981706752   /dev/sdz1
rw  8192   512  4096          0  14000519643136   /dev/sdab
rw  8192   512   512       2048  13999981706752   /dev/sdab1
rw  8192   512  4096          0  14000519643136   /dev/sdac
rw  8192   512   512       2048  13999981706752   /dev/sdac1
rw  8192   512  4096          0  14000519643136   /dev/sdad
rw  8192   512   512       2048  13999981706752   /dev/sdad1
rw  8192   512  4096          0  14000519643136   /dev/sdaf
rw  8192   512   512       2048  13999981706752   /dev/sdaf1
rw  8192   512  4096          0  14000519643136   /dev/sdae
rw  8192   512   512       2048  13999981706752   /dev/sdae1
rw  8192   512  4096          0  14000519643136   /dev/sdag
rw  8192   512   512       2048  13999981706752   /dev/sdag1
rw  8192   512  4096          0  14000519643136   /dev/sdai
rw  8192   512   512       2048  13999981706752   /dev/sdai1
rw  8192   512  4096          0  14000519643136   /dev/sdah
rw  8192   512   512       2048  13999981706752   /dev/sdah1
rw  8192   512  4096          0  14000519643136   /dev/sdal
rw  8192   512   512       2048  13999981706752   /dev/sdal1
rw  8192   512  4096          0  14000519643136   /dev/sdaj
rw  8192   512   512       2048  13999981706752   /dev/sdaj1
rw  8192   512  4096          0  14000519643136   /dev/sdak
rw  8192   512   512       2048  13999981706752   /dev/sdak1

# find /sys/block/md0/queue/ -type f -printf "%h/%f : " -exec cat '{}' ';'
/sys/block/md0/queue/io_poll_delay : 0
/sys/block/md0/queue/max_integrity_segments : 0
/sys/block/md0/queue/zoned : none
/sys/block/md0/queue/scheduler : none
/sys/block/md0/queue/io_poll : 0
/sys/block/md0/queue/discard_zeroes_data : 0
/sys/block/md0/queue/minimum_io_size : 524288
/sys/block/md0/queue/nr_zones : 0
/sys/block/md0/queue/write_same_max_bytes : 0
/sys/block/md0/queue/max_segments : 128
/sys/block/md0/queue/dax : 0
/sys/block/md0/queue/physical_block_size : 4096
/sys/block/md0/queue/logical_block_size : 512
/sys/block/md0/queue/virt_boundary_mask : 0
/sys/block/md0/queue/zone_append_max_bytes : 0
/sys/block/md0/queue/nr_requests : 128
/sys/block/md0/queue/write_cache : write back
/sys/block/md0/queue/stable_writes : 0
/sys/block/md0/queue/max_segment_size : 4294967295
/sys/block/md0/queue/rotational : 1
/sys/block/md0/queue/discard_max_bytes : 0
/sys/block/md0/queue/add_random : 0
/sys/block/md0/queue/discard_max_hw_bytes : 0
/sys/block/md0/queue/optimal_io_size : 17825792
/sys/block/md0/queue/chunk_sectors : 0
/sys/block/md0/queue/read_ahead_kb : 34816
/sys/block/md0/queue/max_discard_segments : 1
/sys/block/md0/queue/write_zeroes_max_bytes : 0
/sys/block/md0/queue/nomerges : 0
/sys/block/md0/queue/zone_write_granularity : 0
/sys/block/md0/queue/wbt_lat_usec : cat: /sys/block/md0/queue/wbt_lat_usec: Invalid argument
/sys/block/md0/queue/fua : 1
/sys/block/md0/queue/discard_granularity : 33554432
/sys/block/md0/queue/rq_affinity : 0
/sys/block/md0/queue/max_sectors_kb : 1280
/sys/block/md0/queue/hw_sector_size : 512
/sys/block/md0/queue/max_hw_sectors_kb : 2147483647
/sys/block/md0/queue/iostats : 0

# find /sys/block/md0/md/ -type f -not -path "*/dev-*" -printf "%h/%f : " -exec cat '{}' ';'
/sys/block/md0/md/sync_min : 0
/sys/block/md0/md/new_dev : cat: /sys/block/md0/md/new_dev: Permission denied
/sys/block/md0/md/consistency_policy : bitmap
/sys/block/md0/md/sync_max : max
/sys/block/md0/md/sync_speed_min : 1000 (local)
/sys/block/md0/md/stripe_cache_active : 37522
/sys/block/md0/md/stripe_cache_size : 32768
/sys/block/md0/md/array_size : default
/sys/block/md0/md/suspend_hi : 0
/sys/block/md0/md/chunk_size : 524288
/sys/block/md0/md/max_read_errors : 20
/sys/block/md0/md/component_size : 13671724544
/sys/block/md0/md/rmw_level : 1
/sys/block/md0/md/sync_speed_max : 200000 (system)
/sys/block/md0/md/sync_force_parallel : 0
/sys/block/md0/md/layout : 2
/sys/block/md0/md/safe_mode_delay : 0.201
/sys/block/md0/md/reshape_position : none
/sys/block/md0/md/sync_action : resync
/sys/block/md0/md/resync_start : 13501066048
/sys/block/md0/md/serialize_policy : n/a
/sys/block/md0/md/bitmap_set_bits : cat: /sys/block/md0/md/bitmap_set_bits: Permission denied
/sys/block/md0/md/degraded : 0
/sys/block/md0/md/bitmap/location : +8
/sys/block/md0/md/bitmap/space : 0
/sys/block/md0/md/bitmap/can_clear : false
/sys/block/md0/md/bitmap/metadata : internal
/sys/block/md0/md/bitmap/max_backlog_used : 0
/sys/block/md0/md/bitmap/time_base : 5
/sys/block/md0/md/bitmap/chunksize : 67108864
/sys/block/md0/md/bitmap/backlog : 0
/sys/block/md0/md/uuid : b7cace22-832e-570f-eba3-9768bb1a1ed6
/sys/block/md0/md/skip_copy : 0
/sys/block/md0/md/mismatch_cnt : 0
/sys/block/md0/md/last_sync_action : resync
/sys/block/md0/md/sync_speed : 38966
/sys/block/md0/md/raid_disks : 36
/sys/block/md0/md/sync_completed : 13518742200 / 27343449088
/sys/block/md0/md/array_state : active-idle
/sys/block/md0/md/reshape_direction : forwards
/sys/block/md0/md/journal_mode :
/sys/block/md0/md/level : raid6
/sys/block/md0/md/suspend_lo : 0
/sys/block/md0/md/fail_last_dev : 0
/sys/block/md0/md/preread_bypass_threshold : 1
/sys/block/md0/md/stripe_size : 4096
/sys/block/md0/md/group_thread_cnt : 0
/sys/block/md0/md/metadata_version : 1.2
/sys/block/md0/md/ppl_write_hint : 0


# find /sys/block/sdc/queue/ -type f -printf "%h/%f : " -exec cat '{}' ';'
/sys/block/sdc/queue/io_poll_delay : -1
/sys/block/sdc/queue/max_integrity_segments : 0
/sys/block/sdc/queue/zoned : none
/sys/block/sdc/queue/scheduler : [mq-deadline] kyber bfq none
/sys/block/sdc/queue/io_poll : 0
/sys/block/sdc/queue/discard_zeroes_data : 0
/sys/block/sdc/queue/minimum_io_size : 4096
/sys/block/sdc/queue/nr_zones : 0
/sys/block/sdc/queue/write_same_max_bytes : 33550336
/sys/block/sdc/queue/max_segments : 128
/sys/block/sdc/queue/dax : 0
/sys/block/sdc/queue/physical_block_size : 4096
/sys/block/sdc/queue/logical_block_size : 512
/sys/block/sdc/queue/virt_boundary_mask : 0
/sys/block/sdc/queue/zone_append_max_bytes : 0
/sys/block/sdc/queue/io_timeout : 30000
/sys/block/sdc/queue/nr_requests : 256
/sys/block/sdc/queue/write_cache : write back
/sys/block/sdc/queue/stable_writes : 0
/sys/block/sdc/queue/max_segment_size : 4294967295
/sys/block/sdc/queue/rotational : 1
/sys/block/sdc/queue/discard_max_bytes : 0
/sys/block/sdc/queue/add_random : 1
/sys/block/sdc/queue/discard_max_hw_bytes : 0
/sys/block/sdc/queue/optimal_io_size : 0
/sys/block/sdc/queue/chunk_sectors : 0
/sys/block/sdc/queue/iosched/front_merges : 1
/sys/block/sdc/queue/iosched/read_expire : 500
/sys/block/sdc/queue/iosched/fifo_batch : 16
/sys/block/sdc/queue/iosched/write_expire : 5000
/sys/block/sdc/queue/iosched/writes_starved : 2
/sys/block/sdc/queue/read_ahead_kb : 4096
/sys/block/sdc/queue/max_discard_segments : 1
/sys/block/sdc/queue/write_zeroes_max_bytes : 33550336
/sys/block/sdc/queue/nomerges : 0
/sys/block/sdc/queue/zone_write_granularity : 0
/sys/block/sdc/queue/wbt_lat_usec : 75000
/sys/block/sdc/queue/fua : 1
/sys/block/sdc/queue/discard_granularity : 0
/sys/block/sdc/queue/rq_affinity : 1
/sys/block/sdc/queue/max_sectors_kb : 1280
/sys/block/sdc/queue/hw_sector_size : 512
/sys/block/sdc/queue/max_hw_sectors_kb : 16383
/sys/block/sdc/queue/iostats : 1


The array is running on a system with 48 cores and 96GB RAM, with no other load.

stripe_cache_size = 32768

The md0_raid6 process is using 50-75% CPU.

The 14TB HDDs in the 36-drive array are WDC WUH721414AL5201.

Each drive achieves >200MB/s sequential read/write when tested with fio.
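
For reference, the per-drive numbers came from a simple sequential fio
run along these lines (device name and exact options are illustrative):

# fio --name=seqread --filename=/dev/sdc --direct=1 --ioengine=libaio \
      --rw=read --bs=1M --iodepth=32 --runtime=30 --time_based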

sync_speed_min/sync_speed_max are set to 200000.

The 36 drives are connected as JBOD via two LSI SAS3008
PCI-Express Fusion-MPT SAS-3 controllers.

Both controllers are in PCI-E 3.0 x8 slots: LnkSta: Speed 8GT/s (ok),
Width x8 (ok)
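
The link status above is taken from lspci; with the controller's bus
address as a placeholder, something like:

# lspci -vv -s <bus:dev.fn> | grep -E 'LnkCap|LnkSta'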

Iostat for this array shows:
9738.60 r/s
38956.00 rKB/s
0 rrqm/s
4.00 rareq-sz

Why are so many 4KB I/Os issued to the drives? With other RAID6
arrays I always see rareq-sz of at least 200KB during resync, and
IOPS at least 10x lower.

I tried setting group_thread_cnt = 4. This was helpful on other
arrays, but on this array it only increased CPU iowait and decreased
the resync speed to 33MB/s.
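
For reference, both knobs are runtime-tunable via sysfs, e.g.:

# echo 200000 > /sys/block/md0/md/sync_speed_max
# echo 4 > /sys/block/md0/md/group_thread_cnt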

Regards,
Marcin Wanat


* Re: Slow initial resync in RAID6 with 36 SAS drives
  2021-08-19  9:28 Slow initial resync in RAID6 with 36 SAS drives Marcin Wanat
@ 2021-08-25 10:06 ` Marcin Wanat
  2021-08-25 10:28   ` [Non-DoD Source] " Finlayson, James M CIV (USA)
  2021-09-01  5:19   ` Song Liu
  0 siblings, 2 replies; 9+ messages in thread
From: Marcin Wanat @ 2021-08-25 10:06 UTC (permalink / raw)
  To: linux-raid

On Thu, Aug 19, 2021 at 11:28 AM Marcin Wanat <marcin.wanat@gmail.com> wrote:
>
> Sorry, this will be a long email with everything I find relevant.
> I have an mdraid6 array with 36 SAS HDDs, each able to do
> >200MB/s, but I am unable to get more than 38MB/s resync speed on a
> fast system (48 cores/96GB RAM) with no other load.

I have done a bit more research on a server with 24 NVMe drives and
found that the resync speed bottleneck affects RAID6 with >16 drives:

# mdadm --create --verbose /dev/md0 --level=6 --raid-devices=16
/dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1 /dev/nvme5n1
/dev/nvme6n1 /dev/nvme7n1 /dev/nvme8n1 /dev/nvme9n1 /dev/nvme10n1
/dev/nvme11n1 /dev/nvme12n1 /dev/nvme13n1 /dev/nvme14n1 /dev/nvme15n1
/dev/nvme16n1
# iostat -dx 5
Device            r/s     w/s     rkB/s     wkB/s   rrqm/s   wrqm/s %rrqm  %wrqm r_await w_await aqu-sz rareq-sz wareq-sz  svctm  %util
nvme0n1          0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme1n1        342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.88    0.00   0.99   470.84     2.25   2.51  86.04
nvme4n1        342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.89    0.00   0.99   470.84     2.25   2.51  86.06
nvme5n1        342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.89    0.00   0.99   470.84     2.25   2.51  86.14
nvme10n1       342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.90    0.00   0.99   470.84     2.25   2.51  86.20
nvme9n1        342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.91    0.00   1.00   470.84     2.25   2.53  86.76
nvme13n1       342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.93    0.00   1.00   470.84     2.25   2.54  87.00
nvme12n1       342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.94    0.00   1.01   470.84     2.25   2.54  87.08
nvme8n1        342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.93    0.00   1.00   470.84     2.25   2.54  87.02
nvme14n1       342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.96    0.00   1.01   470.84     2.25   2.56  87.64
nvme22n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme17n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme16n1       342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    3.05    0.00   1.04   470.84     2.25   2.58  88.56
nvme19n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme2n1        342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.94    0.00   1.01   470.84     2.25   2.54  87.20
nvme6n1        342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.95    0.00   1.01   470.84     2.25   2.55  87.52
nvme7n1        342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.94    0.00   1.01   470.84     2.25   2.54  87.22
nvme21n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme11n1       342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.96    0.00   1.02   470.84     2.25   2.56  87.72
nvme15n1       342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.99    0.00   1.02   470.84     2.25   2.53  86.84
nvme23n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme18n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme3n1        342.60    0.40 161311.20      0.90 39996.60     0.00 99.15   0.00    2.97    0.00   1.02   470.84     2.25   2.53  86.66
nvme20n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00

As you can see, there are 342 IOPS with ~470KB rareq-sz, but when I
create a RAID6 with 17 drives or more:

# mdadm --create --verbose /dev/md0 --level=6 --raid-devices=17
/dev/nvme1n1 /dev/nvme2n1 /dev/nvme3n1 /dev/nvme4n1 /dev/nvme5n1
/dev/nvme6n1 /dev/nvme7n1 /dev/nvme8n1 /dev/nvme9n1 /dev/nvme10n1
/dev/nvme11n1 /dev/nvme12n1 /dev/nvme13n1 /dev/nvme14n1 /dev/nvme15n1
/dev/nvme16n1 /dev/nvme17n1
# iostat -dx 5
Device            r/s     w/s     rkB/s     wkB/s   rrqm/s   wrqm/s %rrqm  %wrqm r_await w_await aqu-sz rareq-sz wareq-sz  svctm  %util
nvme0n1          0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme1n1       21484.20    0.40  85936.80      0.90     0.00     0.00 0.00   0.00    0.04    0.00   0.82     4.00     2.25   0.05  99.16
nvme4n1       21484.00    0.40  85936.00      0.90     0.00     0.00 0.00   0.00    0.03    0.00   0.74     4.00     2.25   0.05  99.16
nvme5n1       21484.00    0.40  85936.00      0.90     0.00     0.00 0.00   0.00    0.04    0.00   0.84     4.00     2.25   0.05  99.16
nvme10n1      21483.80    0.40  85935.20      0.90     0.00     0.00 0.00   0.00    0.03    0.00   0.65     4.00     2.25   0.04  83.64
nvme9n1       21483.80    0.40  85935.20      0.90     0.00     0.00 0.00   0.00    0.03    0.00   0.67     4.00     2.25   0.04  85.86
nvme13n1      21483.60    0.40  85934.40      0.90     0.00     0.00 0.00   0.00    0.03    0.00   0.63     4.00     2.25   0.04  83.66
nvme12n1      21483.60    0.40  85934.40      0.90     0.00     0.00 0.00   0.00    0.03    0.00   0.65     4.00     2.25   0.04  83.66
nvme8n1       21483.60    0.40  85934.40      0.90     0.00     0.00 0.00   0.00    0.04    0.00   0.81     4.00     2.25   0.05  99.22
nvme14n1      21481.80    0.40  85927.20      0.90     0.00     0.00 0.00   0.00    0.03    0.00   0.67     4.00     2.25   0.04  83.66
nvme22n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme17n1      21482.00    0.40  85928.00      0.90     0.00     0.00 0.00   0.00    0.02    0.00   0.49     4.00     2.25   0.03  67.12
nvme16n1      21481.60    0.40  85926.40      0.90     0.00     0.00 0.00   0.00    0.03    0.00   0.75     4.00     2.25   0.04  83.66
nvme19n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme2n1       21481.60    0.40  85926.40      0.90     0.00     0.00 0.00   0.00    0.04    0.00   0.95     4.00     2.25   0.05  99.26
nvme6n1       21481.60    0.40  85926.40      0.90     0.00     0.00 0.00   0.00    0.04    0.00   0.91     4.00     2.25   0.05  99.26
nvme7n1       21481.60    0.40  85926.40      0.90     0.00     0.00 0.00   0.00    0.04    0.00   0.87     4.00     2.25   0.05  99.24
nvme21n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme11n1      21481.20    0.40  85924.80      0.90     0.00     0.00 0.00   0.00    0.03    0.00   0.75     4.00     2.25   0.04  83.66
nvme15n1      21480.20    0.40  85920.80      0.90     0.00     0.00 0.00   0.00    0.04    0.00   0.80     4.00     2.25   0.04  83.66
nvme23n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme18n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00
nvme3n1       21480.40    0.40  85921.60      0.90     0.00     0.00 0.00   0.00    0.05    0.00   1.02     4.00     2.25   0.05  99.26
nvme20n1         0.00    0.00      0.00      0.00     0.00     0.00 0.00   0.00    0.00    0.00   0.00     0.00     0.00   0.00   0.00

rareq-sz drops to 4KB, IOPS increase to ~21,480, and resync speed drops to 85MB/s.
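
(The numbers are internally consistent: with 16 drives, 342 r/s ×
~471KB rareq-sz ≈ 161MB/s per member, matching the rkB/s column; with
17 drives, ~21,480 r/s × 4KB ≈ 86MB/s, which is where the resync speed
ends up.)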

Why is it like that? Could someone point me to the part of the mdraid
kernel code responsible for this limitation? Would changing it and
recompiling the kernel be safe on a machine with 512GB+ RAM?

Regards,
Marcin Wanat


* RE: [Non-DoD Source] Re: Slow initial resync in RAID6 with 36 SAS drives
  2021-08-25 10:06 ` Marcin Wanat
@ 2021-08-25 10:28   ` Finlayson, James M CIV (USA)
  2021-09-01  1:22     ` antlists
  2021-09-01  5:19   ` Song Liu
  1 sibling, 1 reply; 9+ messages in thread
From: Finlayson, James M CIV (USA) @ 2021-08-25 10:28 UTC (permalink / raw)
  To: 'Marcin Wanat', linux-raid

I'm not necessarily a person in the "know", but I mess with these in udev for SSDs:
SUBSYSTEM=="block", ACTION=="add|change", KERNEL=="md*", ATTR{md/sync_speed_max}="2000000",ATTR{md/group_thread_cnt}="64", ATTR{md/stripe_cache_size}="8192"

Guidance on why I change the values this way: it's an educated guess, as I'm an experienced practitioner at best and not someone who messes with the code. I've seen resync rates as high as 900MB/s sustained on my SSD mdraids, even though the SSDs should be able to sustain higher.
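
To exercise the rule by hand after editing it (rather than waiting for
a device event), something like this should work:

# udevadm control --reload-rules
# udevadm trigger --action=change --subsystem-match=block

or the same attributes can be written directly under /sys/block/mdX/md/.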

-----Original Message-----
From: Marcin Wanat <marcin.wanat@gmail.com> 
Sent: Wednesday, August 25, 2021 6:06 AM
To: linux-raid@vger.kernel.org
Subject: [Non-DoD Source] Re: Slow initial resync in RAID6 with 36 SAS drives

On Thu, Aug 19, 2021 at 11:28 AM Marcin Wanat <marcin.wanat@gmail.com> wrote:
>
> Sorry, this will be a long email with everything I find relevant.
> I have an mdraid6 array with 36 SAS HDDs, each able to do
> >200MB/s, but I am unable to get more than 38MB/s resync speed on a
> fast system (48 cores/96GB RAM) with no other load.

I have done a bit more research on a server with 24 NVMe drives and found that the resync speed bottleneck affects RAID6 with >16 drives:

[...]


* Re: [Non-DoD Source] Re: Slow initial resync in RAID6 with 36 SAS drives
  2021-08-25 10:28   ` [Non-DoD Source] " Finlayson, James M CIV (USA)
@ 2021-09-01  1:22     ` antlists
  2021-09-01  1:50       ` Guoqing Jiang
  0 siblings, 1 reply; 9+ messages in thread
From: antlists @ 2021-09-01  1:22 UTC (permalink / raw)
  To: Finlayson, James M CIV (USA), 'Marcin Wanat', linux-raid

On 25/08/2021 11:28, Finlayson, James M CIV (USA) wrote:
> I'm not necessarily a person in the "know", but I mess with these in udev for SSDs:
> SUBSYSTEM=="block", ACTION=="add|change", KERNEL=="md*", ATTR{md/sync_speed_max}="2000000",ATTR{md/group_thread_cnt}="64", ATTR{md/stripe_cache_size}="8192"
> 
> Guidance on why I change the values this way: it's an educated guess, as I'm an experienced practitioner at best and not someone who messes with the code. I've seen resync rates as high as 900MB/s sustained on my SSD mdraids, even though the SSDs should be able to sustain higher.

Not raid, but I've picked up on comments (on LWN?) that certain 
file-systems are very much "single threaded" in their behaviour. I 
strongly suspect that md-raid predates common multi-cpu/multi-core 
systems, and I've heard somewhere that md-raid doesn't thread that well.

It'd be nice to fix it, but retro-fitting existing code is a lot more 
work than starting from scratch ...

Cheers,
Wol


* Re: [Non-DoD Source] Re: Slow initial resync in RAID6 with 36 SAS drives
  2021-09-01  1:22     ` antlists
@ 2021-09-01  1:50       ` Guoqing Jiang
  0 siblings, 0 replies; 9+ messages in thread
From: Guoqing Jiang @ 2021-09-01  1:50 UTC (permalink / raw)
  To: antlists, Finlayson, James M CIV (USA), 'Marcin Wanat',
	linux-raid



On 9/1/21 9:22 AM, antlists wrote:
> On 25/08/2021 11:28, Finlayson, James M CIV (USA) wrote:
>> I'm not necessarily a person in the "know", but I mess with these in
>> udev for SSDs:
>> SUBSYSTEM=="block", ACTION=="add|change", KERNEL=="md*", 
>> ATTR{md/sync_speed_max}="2000000",ATTR{md/group_thread_cnt}="64", 
>> ATTR{md/stripe_cache_size}="8192"
>>
>> Guidance on why I change the values this way: it's an educated guess,
>> as I'm an experienced practitioner at best and not someone who messes
>> with the code. I've seen resync rates as high as 900MB/s sustained on
>> my SSD mdraids, even though the SSDs should be able to sustain higher.
>
> Not raid, but I've picked up on comments (on LWN?) that certain 
> file-systems are very much "single threaded" in their behaviour. I 
> strongly suspect that md-raid predates common multi-cpu/multi-core 
> systems, and I've heard somewhere that md-raid doesn't thread that well.

Yes and no, raid5 does support multi-threading if group_thread_cnt > 0, 
but IIUC, only one thread (mdx_resync) deals with resync IO.
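
This is easy to see while a resync is running, e.g.:

# ps -eo pid,pcpu,comm | grep -E 'md0_(raid6|resync)'

Whatever group_thread_cnt is set to, there is still only the single
mdX_resync thread generating the resync IO.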

Thanks,
Guoqing


* Re: Slow initial resync in RAID6 with 36 SAS drives
  2021-08-25 10:06 ` Marcin Wanat
  2021-08-25 10:28   ` [Non-DoD Source] " Finlayson, James M CIV (USA)
@ 2021-09-01  5:19   ` Song Liu
  2021-09-03  0:58     ` Song Liu
  1 sibling, 1 reply; 9+ messages in thread
From: Song Liu @ 2021-09-01  5:19 UTC (permalink / raw)
  To: Marcin Wanat; +Cc: linux-raid

On Wed, Aug 25, 2021 at 3:06 AM Marcin Wanat <marcin.wanat@gmail.com> wrote:
>
> On Thu, Aug 19, 2021 at 11:28 AM Marcin Wanat <marcin.wanat@gmail.com> wrote:
> >
> > Sorry, this will be a long email with everything I find relevant.
> > I have an mdraid6 array with 36 SAS HDDs, each able to do
> > >200MB/s, but I am unable to get more than 38MB/s resync speed on a
> > fast system (48 cores/96GB RAM) with no other load.
>
> I have done a bit more research on a server with 24 NVMe drives and
> found that the resync speed bottleneck affects RAID6 with >16 drives:

Sorry for the late response.

This is interesting behavior. I don't really know why this is the case at the
moment. Let me try to reproduce this first.

Thanks,
Song

>
> [...]


* Re: Slow initial resync in RAID6 with 36 SAS drives
  2021-09-01  5:19   ` Song Liu
@ 2021-09-03  0:58     ` Song Liu
  2021-09-03  2:56       ` Jens Axboe
  2021-09-04 15:24       ` Marcin Wanat
  0 siblings, 2 replies; 9+ messages in thread
From: Song Liu @ 2021-09-03  0:58 UTC (permalink / raw)
  To: Marcin Wanat, Jens Axboe; +Cc: linux-raid

On Tue, Aug 31, 2021 at 10:19 PM Song Liu <song@kernel.org> wrote:
>
> On Wed, Aug 25, 2021 at 3:06 AM Marcin Wanat <marcin.wanat@gmail.com> wrote:
> >
> > On Thu, Aug 19, 2021 at 11:28 AM Marcin Wanat <marcin.wanat@gmail.com> wrote:
> > >
> > > Sorry, this will be a long email with everything I find relevant.
> > > I have an mdraid6 array with 36 SAS HDDs, each able to do
> > > >200MB/s, but I am unable to get more than 38MB/s resync speed on a
> > > fast system (48 cores/96GB RAM) with no other load.
> >
> > I have done a bit more research on a server with 24 NVMe drives and
> > found that the resync speed bottleneck affects RAID6 with >16 drives:
>
> Sorry for the late response.
>
> This is interesting behavior. I don't really know why this is the case at the
> moment. Let me try to reproduce this first.
>
> Thanks,
> Song

The issue is caused by blk_plug logic. Something like the following should
fix it.

Marcin, could you please give it a try?

Thanks,
Song

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2c4ac51e54eba..fdb945be85753 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2251,7 +2251,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
                else
                        last = list_entry_rq(plug->mq_list.prev);

-               if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
+               if (request_count >= blk_plug_max_rq_count(plug) || (last &&
                    blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
                        blk_flush_plug_list(plug, false);
                        trace_block_plug(q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b5c033cf5f26f..2e3c07e959c14 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1239,6 +1239,13 @@ extern void blk_start_plug(struct blk_plug *);
 extern void blk_finish_plug(struct blk_plug *);
 extern void blk_flush_plug_list(struct blk_plug *, bool);

+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
+{
+       if (plug->multiple_queues)
+               return BLK_MAX_REQUEST_COUNT * 4;
+       return BLK_MAX_REQUEST_COUNT;
+}
+
 static inline void blk_flush_plug(struct task_struct *tsk)
 {
        struct blk_plug *plug = tsk->plug;
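
For context: BLK_MAX_REQUEST_COUNT is 16, which lines up with the
16-vs-17 device threshold seen above. The resync thread submits one
bio per member device per stripe under a single blk_plug, and bios
for consecutive stripes merge into the requests already sitting in
the plug list. With 17 or more members, the extra request trips the
request_count check above and flushes the plug mid-stripe, so later
bios find nothing to merge with and the member devices end up seeing
individual 4KB reads. Allowing 4x the limit for multi-queue plugs
keeps a full stripe's worth of requests plugged, so merging works
again.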


* Re: Slow initial resync in RAID6 with 36 SAS drives
  2021-09-03  0:58     ` Song Liu
@ 2021-09-03  2:56       ` Jens Axboe
  2021-09-04 15:24       ` Marcin Wanat
  1 sibling, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2021-09-03  2:56 UTC (permalink / raw)
  To: Song Liu, Marcin Wanat; +Cc: linux-raid

On 9/2/21 6:58 PM, Song Liu wrote:
> On Tue, Aug 31, 2021 at 10:19 PM Song Liu <song@kernel.org> wrote:
>>
>> On Wed, Aug 25, 2021 at 3:06 AM Marcin Wanat <marcin.wanat@gmail.com> wrote:
>>>
>>> On Thu, Aug 19, 2021 at 11:28 AM Marcin Wanat <marcin.wanat@gmail.com> wrote:
>>>>
>>>> Sorry, this will be a long email with everything I find relevant.
>>>> I have an mdraid6 array with 36 SAS HDDs, each able to do >200MB/s,
>>>> but I am unable to get more than 38MB/s resync speed on a fast
>>>> system (48 cores/96GB RAM) with no other load.
>>>
>>> I have done a bit more research on a server with 24 NVMe drives and
>>> found that the resync speed bottleneck affects RAID6 with >16 drives:
>>
>> Sorry for the late response.
>>
>> This is interesting behavior. I don't really know why this is the case at the
>> moment. Let me try to reproduce this first.
>>
>> Thanks,
>> Song
> 
> The issue is caused by blk_plug logic. Something like the following should
> fix it.
> 
> Marcin, could you please give it a try?
> 
> Thanks,
> Song
> 
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 2c4ac51e54eba..fdb945be85753 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2251,7 +2251,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
>                 else
>                         last = list_entry_rq(plug->mq_list.prev);
> 
> -               if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
> +               if (request_count >= blk_plug_max_rq_count(plug) || (last &&
>                     blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
>                         blk_flush_plug_list(plug, false);
>                         trace_block_plug(q);
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index b5c033cf5f26f..2e3c07e959c14 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1239,6 +1239,13 @@ extern void blk_start_plug(struct blk_plug *);
>  extern void blk_finish_plug(struct blk_plug *);
>  extern void blk_flush_plug_list(struct blk_plug *, bool);
> 
> +static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
> +{
> +       if (plug->multiple_queues)
> +               return BLK_MAX_REQUEST_COUNT * 4;
> +       return BLK_MAX_REQUEST_COUNT;
> +}
> +
>  static inline void blk_flush_plug(struct task_struct *tsk)
>  {
>         struct blk_plug *plug = tsk->plug;

Just put this in blk-mq.c; there's no reason to put it in blkdev.h. It
could go in block/blk.h, but let's just put it where it's used for now.

It should also have a comment on why we allow more for multiple_queues.

That said, the principle is sound imho.
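
For illustration, with the helper local to block/blk-mq.c and a comment
along those lines, it might look like this (a sketch only; the version
that actually lands may differ):

/*
 * A plug that spans multiple queues -- e.g. md resync fanning out to
 * dozens of member disks -- accumulates only a few requests per queue,
 * so let it hold more requests in total before forcing a flush.
 */
static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
{
	if (plug->multiple_queues)
		return BLK_MAX_REQUEST_COUNT * 4;
	return BLK_MAX_REQUEST_COUNT;
}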

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: Slow initial resync in RAID6 with 36 SAS drives
  2021-09-03  0:58     ` Song Liu
  2021-09-03  2:56       ` Jens Axboe
@ 2021-09-04 15:24       ` Marcin Wanat
  1 sibling, 0 replies; 9+ messages in thread
From: Marcin Wanat @ 2021-09-04 15:24 UTC (permalink / raw)
  To: Song Liu; +Cc: Jens Axboe, linux-raid

On 03.09.2021 02:58, Song Liu wrote:
> On Tue, Aug 31, 2021 at 10:19 PM Song Liu <song@kernel.org> wrote:
>>
>> On Wed, Aug 25, 2021 at 3:06 AM Marcin Wanat <marcin.wanat@gmail.com> wrote:
>>>
>>> On Thu, Aug 19, 2021 at 11:28 AM Marcin Wanat <marcin.wanat@gmail.com> wrote:
>>>>
>>>> Sorry, this will be a long email with everything I find to be relevant.
>>>> I have a mdraid6 array with 36 hdd SAS drives each able to do
>>>>> 200MB/s, but I am unable to get more than 38MB/s resync speed on a
>>>> fast system (48cores/96GB ram) with no other load.
>>>
>>> I have done a bit more research on 24 NVMe drives server and found
>>> that resync speed bottleneck affect RAID6 with >16 drives:
>>
>> Sorry for the late response.
>>
>> This is interesting behavior. I don't really know why this is the case at the
>> moment. Let me try to reproduce this first.
>>
>> Thanks,
>> Song
>
> The issue is caused by blk_plug logic. Something like the following should
> fix it.
>
> Marcin, could you please give it a try?
>

I tested the patch against the latest longterm kernel (as this is a
semi-production server) with the 36-HDD RAID6 array, and it works fine.

Here are the numbers. Without the patch:

# cat /proc/mdstat
Personalities : [raid1] [raid6] [raid5] [raid4]
md0 : active raid6 sdak1[23] sdag1[34] sdaj1[22] sdai1[21] sdah1[35]
sdal1[24] sdae1[20] sdaa1[29] sdx1[19] sdaf1[33] sdad1[32] sdac1[31]
sdab1[30] sdu1[14] sdz1[28] sdv1[18] sdw1[26] sdy1[27] sdt1[13]
sdq1[17] sdr1[25] sds1[12] sdp1[11] sdo1[9] sdm1[15] sdn1[16] sdl1[10]
sdk1[8] sda1[2] sdh1[6] sdc1[1] sdj1[7] sde1[3] sdb1[0] sdi1[4]
sdg1[5]
464838634496 blocks super 1.2 level 6, 512k chunk, algorithm 2 [36/36] [UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU]
[>....................] check = 0.0% (651600/13671724544) finish=5944.4min speed=38329K/sec
bitmap: 1/102 pages [4KB], 65536KB chunk

# iostat -dx 5
Linux 5.10.62 (large1) 09/04/2021 _x86_64_ (48 CPU)

Device r/s w/s rkB/s wkB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdc 9998.40 1.40 41277.60 3.40 321.00 0.00 3.11 0.00 1.42 2.57 14.19 4.13 2.43 0.08 80.54
sdb 10154.80 1.40 41277.60 3.40 164.60 0.00 1.60 0.00 0.83 5.86 8.47 4.06 2.43 0.08 82.02
sdi 10182.80 1.40 41277.60 3.40 136.60 0.00 1.32 0.00 0.78 9.29 7.98 4.05 2.43 0.08 81.86
sdh 10002.00 1.40 41278.40 3.40 317.60 0.00 3.08 0.00 1.11 7.71 11.14 4.13 2.43 0.08 80.56
sde 9779.40 1.40 41278.40 3.40 540.20 0.00 5.23 0.00 2.10 2.43 20.50 4.22 2.43 0.08 80.10
sdj 9861.80 1.40 41277.60 3.40 457.60 0.00 4.43 0.00 1.72 15.14 16.98 4.19 2.43 0.08 81.36
sdg 10167.80 1.40 41277.60 3.40 151.60 0.00 1.47 0.00 0.95 3.43 9.68 4.06 2.43 0.08 80.06
[...]

With the patch:
# cat /proc/mdstat
Personalities : [raid1] [raid6] [raid5] [raid4]
md0 : active raid6 sdak1[34] sdal1[35] sdaj1[33] sdah1[31] sdai1[32]
sdae1[28] sdag1[30] sdaf1[29] sdad1[27] sdac1[26] sdab1[25] sdaa1[17]
sdz1[16] sdy1[15] sdx1[10] sdv1[23] sds1[20] sdw1[24] sdq1[18]
sdr1[19] sdt1[21] sdu1[22] sdm1[11] sdp1[14] sdl1[9] sdn1[12] sdo1[13]
sdk1[8] sdh1[5] sdj1[7] sdc1[0] sdg1[4] sdf1[3] sde1[2] sdd1[1]
sdi1[6]
464838634496 blocks super 1.2 level 6, 512k chunk, algorithm 2 [36/36] [UUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUUU]
[>....................] check = 0.0% (488284/13671724544) finish=2799.8min speed=81380K/sec
bitmap: 2/102 pages [8KB], 65536KB chunk

# iostat -dx 5
Linux 5.10.62 (large1) 09/04/2021 _x86_64_ (48 CPU)

Device r/s w/s rkB/s wkB/s rrqm/s wrqm/s %rrqm %wrqm r_await w_await aqu-sz rareq-sz wareq-sz svctm %util
sdd 168.00 1.80 80596.80 4.40 19971.00 0.00 99.17 0.00 6.67 7.44 1.14 479.74 2.44 3.22 54.72
sdc 168.20 2.00 80598.40 6.00 19971.00 0.20 99.16 9.09 6.55 4.60 1.11 479.18 3.00 3.18 54.08
sde 168.20 2.00 80598.40 6.00 19964.60 0.20 99.16 9.09 6.69 4.90 1.14 479.18 3.00 3.22 54.80
sdg 168.00 1.80 80596.80 4.40 19964.20 0.00 99.17 0.00 6.66 3.11 1.13 479.74 2.44 3.25 55.20
sdf 168.00 1.80 80596.80 4.40 19964.20 0.00 99.17 0.00 6.62 3.11 1.12 479.74 2.44 3.23 54.84
sdk 168.20 1.80 80596.80 4.40 19963.80 0.00 99.16 0.00 6.82 3.56 1.16 479.17 2.44 3.33 56.58
sdi 168.00 1.80 80596.80 4.40 19964.00 0.00 99.17 0.00 6.67 3.00 1.13 479.74 2.44 3.29 55.78
sdj 168.00 1.80 80596.80 4.40 19963.80 0.00 99.17 0.00 6.75 3.33 1.14 479.74 2.44 3.31 56.14
sdh 168.00 1.80 80596.80 4.40 19962.60 0.00 99.17 0.00 6.72 3.33 1.14 479.74 2.44 3.29 55.86
[...]


With this patch, per-drive IOPS dropped from ~10000 to ~168 while
rareq-sz grew from ~4 KB to ~479 KB (168 req/s x ~480 KB is the ~80 MB/s
per drive seen in rkB/s above). Resync speed increased from 38 MB/s to
81 MB/s. These numbers were obtained with default kernel settings, but I
am fairly confident I can now fine-tune group_thread_cnt and
stripe_cache_size to get even better results.
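
For example, those knobs live in sysfs/procfs; the values below are
hypothetical starting points, not ones validated in this thread:

# Illustrative values only -- tune and re-measure on your own array.
# Larger raid5/6 stripe cache (memory cost is roughly
# stripe_cache_size * 4 KiB * nr_disks):
echo 8192 > /sys/block/md0/md/stripe_cache_size
# Spread stripe handling across several worker threads:
echo 4 > /sys/block/md0/md/group_thread_cnt
# Raise the md resync speed ceiling (KiB/s) if it becomes the limiter:
echo 1000000 > /proc/sys/dev/raid/speed_limit_max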

Thank you!


-- 
Marcin Wanat


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2021-09-04 15:25 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-08-19  9:28 Slow initial resync in RAID6 with 36 SAS drives Marcin Wanat
2021-08-25 10:06 ` Marcin Wanat
2021-08-25 10:28   ` [Non-DoD Source] " Finlayson, James M CIV (USA)
2021-09-01  1:22     ` antlists
2021-09-01  1:50       ` Guoqing Jiang
2021-09-01  5:19   ` Song Liu
2021-09-03  0:58     ` Song Liu
2021-09-03  2:56       ` Jens Axboe
2021-09-04 15:24       ` Marcin Wanat
