* [PATCH 0/3] md/raid5: improve IO pattern
@ 2017-02-17 23:32 Shaohua Li
  2017-02-17 23:32 ` [PATCH 1/3] md/raid5: prioritize stripes for writeback Shaohua Li
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Shaohua Li @ 2017-02-17 23:32 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, songliubraving, kernel-team

Hi,

These are some patches I'm testing to improve raid5-cache performance. The
basic idea is to flush a lot of stripes to the raid disks and then do smart
scheduling of the IOs. In my test on a 12-disk raid6 array, this improves
throughput by around 20%, and request size and disk seeks improve a lot. The
fio script I'm using is:

[global]
ioengine=libaio
direct=1
loops=1000
runtime=120
time_based=1
file_service_type=random:36
overwrite=1
thread=0
group_reporting=1

[test]
filename=/dev/md0
bs=4k
readwrite=randwrite
numjobs=8
offset_increment=10G
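
(To reproduce: save the job file above as e.g. raid5-test.fio, the name
being arbitrary, and run "fio raid5-test.fio".)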

--------------------------------------------
Shaohua Li (3):
  md/raid5: prioritize stripes for writeback
  md/raid5-cache: bump flush stripe batch size
  md/raid5: sort bios

 drivers/md/raid5-cache.c |   2 +-
 drivers/md/raid5.c       | 189 ++++++++++++++++++++++++++++++++++++++---------
 drivers/md/raid5.h       |  13 +++-
 3 files changed, 169 insertions(+), 35 deletions(-)

-- 
2.9.3



* [PATCH 1/3] md/raid5: prioritize stripes for writeback
  2017-02-17 23:32 [PATCH 0/3] md/raid5: improve IO pattern Shaohua Li
@ 2017-02-17 23:32 ` Shaohua Li
  2017-02-17 23:32 ` [PATCH 2/3] md/raid5-cache: bump flush stripe batch size Shaohua Li
  2017-02-17 23:32 ` [PATCH 3/3] md/raid5: sort bios Shaohua Li
  2 siblings, 0 replies; 12+ messages in thread
From: Shaohua Li @ 2017-02-17 23:32 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, songliubraving, kernel-team

In raid5-cache writeback mode, we have two types of stripes to handle.
- stripes which aren't cached yet
- stripes which are cached and flushing out to raid disks

The upper layer is generally more sensitive to the latency of the first
type of stripes. But we have only one handle list for all these stripes,
where the two types of stripes are mixed together. When reclaim flushes
a lot of stripes, the first type of stripes could be noticeably delayed.
On the other hand, if log space is tight, we'd like to handle the second
type of stripes faster to free log space.

This patch distinguishes the two types of stripes. They are added into
different handle lists. When we try to get a stripe to handle, we prefer
the first type of stripes unless log space is tight.

This should have no impact on the !writeback case.
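
In outline, the list selection is (a simplified sketch;
pick_handle_list() is a hypothetical helper and ignores the worker
groups and the fall-back-to-the-other-list pass that the real
__get_priority_stripe() below performs):

	static struct list_head *pick_handle_list(struct r5conf *conf)
	{
		/* under log-space pressure, drain cached stripes first */
		if (test_bit(R5C_LOG_TIGHT, &conf->cache_state))
			return &conf->loprio_list;
		/* otherwise favor the stripes upper layers wait on */
		return &conf->handle_list;
	}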

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 48 +++++++++++++++++++++++++++++++++++++++---------
 drivers/md/raid5.h |  2 ++
 2 files changed, 41 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7b7722bb2..d7c90da 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -174,6 +174,13 @@ static int stripe_operations_active(struct stripe_head *sh)
 	       test_bit(STRIPE_COMPUTE_RUN, &sh->state);
 }
 
+static bool stripe_is_lowprio(struct stripe_head *sh)
+{
+	return (test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state) ||
+		test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) &&
+	       !test_bit(STRIPE_R5C_CACHING, &sh->state);
+}
+
 static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 {
 	struct r5conf *conf = sh->raid_conf;
@@ -189,7 +196,10 @@ static void raid5_wakeup_stripe_thread(struct stripe_head *sh)
 	if (list_empty(&sh->lru)) {
 		struct r5worker_group *group;
 		group = conf->worker_groups + cpu_to_group(cpu);
-		list_add_tail(&sh->lru, &group->handle_list);
+		if (stripe_is_lowprio(sh))
+			list_add_tail(&sh->lru, &group->loprio_list);
+		else
+			list_add_tail(&sh->lru, &group->handle_list);
 		group->stripes_cnt++;
 		sh->group = group;
 	}
@@ -252,7 +262,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh,
 			clear_bit(STRIPE_DELAYED, &sh->state);
 			clear_bit(STRIPE_BIT_DELAY, &sh->state);
 			if (conf->worker_cnt_per_group == 0) {
-				list_add_tail(&sh->lru, &conf->handle_list);
+				if (stripe_is_lowprio(sh))
+					list_add_tail(&sh->lru,
+							&conf->loprio_list);
+				else
+					list_add_tail(&sh->lru,
+							&conf->handle_list);
 			} else {
 				raid5_wakeup_stripe_thread(sh);
 				return;
@@ -5169,19 +5184,27 @@ static struct bio *chunk_aligned_read(struct mddev *mddev, struct bio *raid_bio)
  */
 static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 {
-	struct stripe_head *sh = NULL, *tmp;
+	struct stripe_head *sh, *tmp;
 	struct list_head *handle_list = NULL;
-	struct r5worker_group *wg = NULL;
+	struct r5worker_group *wg;
+	bool second_try = !r5c_is_writeback(conf->log);
+	bool try_loprio = test_bit(R5C_LOG_TIGHT, &conf->cache_state);
 
+again:
+	wg = NULL;
+	sh = NULL;
 	if (conf->worker_cnt_per_group == 0) {
-		handle_list = &conf->handle_list;
+		handle_list = try_loprio ? &conf->loprio_list :
+					&conf->handle_list;
 	} else if (group != ANY_GROUP) {
-		handle_list = &conf->worker_groups[group].handle_list;
+		handle_list = try_loprio ? &conf->worker_groups[group].loprio_list :
+				&conf->worker_groups[group].handle_list;
 		wg = &conf->worker_groups[group];
 	} else {
 		int i;
 		for (i = 0; i < conf->group_cnt; i++) {
-			handle_list = &conf->worker_groups[i].handle_list;
+			handle_list = try_loprio ? &conf->worker_groups[i].loprio_list :
+				&conf->worker_groups[i].handle_list;
 			wg = &conf->worker_groups[i];
 			if (!list_empty(handle_list))
 				break;
@@ -5232,8 +5255,13 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group)
 		wg = NULL;
 	}
 
-	if (!sh)
-		return NULL;
+	if (!sh) {
+		if (second_try)
+			return NULL;
+		second_try = true;
+		try_loprio = !try_loprio;
+		goto again;
+	}
 
 	if (wg) {
 		wg->stripes_cnt--;
@@ -6543,6 +6571,7 @@ static int alloc_thread_groups(struct r5conf *conf, int cnt,
 
 		group = &(*worker_groups)[i];
 		INIT_LIST_HEAD(&group->handle_list);
+		INIT_LIST_HEAD(&group->loprio_list);
 		group->conf = conf;
 		group->workers = workers + i * cnt;
 
@@ -6770,6 +6799,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	init_waitqueue_head(&conf->wait_for_stripe);
 	init_waitqueue_head(&conf->wait_for_overlap);
 	INIT_LIST_HEAD(&conf->handle_list);
+	INIT_LIST_HEAD(&conf->loprio_list);
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 4bb27b9..6b9d2e8 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -542,6 +542,7 @@ struct r5worker {
 
 struct r5worker_group {
 	struct list_head handle_list;
+	struct list_head loprio_list;
 	struct r5conf *conf;
 	struct r5worker *workers;
 	int stripes_cnt;
@@ -608,6 +609,7 @@ struct r5conf {
 						  */
 
 	struct list_head	handle_list; /* stripes needing handling */
+	struct list_head	loprio_list; /* low priority stripes */
 	struct list_head	hold_list; /* preread ready stripes */
 	struct list_head	delayed_list; /* stripes that have plugged requests */
 	struct list_head	bitmap_list; /* stripes delaying awaiting bitmap update */
-- 
2.9.3



* [PATCH 2/3] md/raid5-cache: bump flush stripe batch size
  2017-02-17 23:32 [PATCH 0/3] md/raid5: improve IO pattern Shaohua Li
  2017-02-17 23:32 ` [PATCH 1/3] md/raid5: prioritize stripes for writeback Shaohua Li
@ 2017-02-17 23:32 ` Shaohua Li
  2017-03-03  3:03   ` NeilBrown
  2017-02-17 23:32 ` [PATCH 3/3] md/raid5: sort bios Shaohua Li
  2 siblings, 1 reply; 12+ messages in thread
From: Shaohua Li @ 2017-02-17 23:32 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, songliubraving, kernel-team

Bump the flush stripe batch size to 2048. For my 12-disk raid
array, the stripes take:
12 * 4k * 2048 = 96MB

This is still quite small. A hardware raid card generally has a 1GB
cache, and we suggest the raid5-cache be given a similar cache size.

The advantage of a big batch size is that we can dispatch a lot of IO
at the same time, and then do some scheduling to create a better IO
pattern.

The previous patch prioritizes stripes, so we don't need to worry that
a big flush stripe batch will starve normal stripes.

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5-cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 3f307be..b25512c 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -43,7 +43,7 @@
 /* wake up reclaim thread periodically */
 #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
 /* start flush with these full stripes */
-#define R5C_FULL_STRIPE_FLUSH_BATCH 256
+#define R5C_FULL_STRIPE_FLUSH_BATCH 2048
 /* reclaim stripes in groups */
 #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
 
-- 
2.9.3



* [PATCH 3/3] md/raid5: sort bios
  2017-02-17 23:32 [PATCH 0/3] md/raid5: improve IO pattern Shaohua Li
  2017-02-17 23:32 ` [PATCH 1/3] md/raid5: prioritize stripes for writeback Shaohua Li
  2017-02-17 23:32 ` [PATCH 2/3] md/raid5-cache: bump flush stripe batch size Shaohua Li
@ 2017-02-17 23:32 ` Shaohua Li
  2017-03-03  3:43   ` NeilBrown
  2 siblings, 1 reply; 12+ messages in thread
From: Shaohua Li @ 2017-02-17 23:32 UTC (permalink / raw)
  To: linux-raid; +Cc: neilb, songliubraving, kernel-team

An earlier patch (raid5: only dispatch IO from raid5d for harddisk raid)
deferred IO dispatching. The goal is to create a better IO pattern. At
that time, we didn't sort the deferred IO, hoping the block layer could
merge and sort it. Now the raid5-cache writeback can create a large
amount of bios. And if we enable multi-thread stripe handling, we can't
control when to dispatch IO to the raid disks. Much of the time, we are
dispatching IO which the block layer can't merge effectively.

This patch takes the IO dispatching defer further. We accumulate bios,
but we don't dispatch all of the bios once a threshold is met. This
'dispatch a partial portion of the bios' strategy allows bios arriving
in a large time window to be sent to disks together. At dispatch time,
there is a large chance the block layer can merge the bios. To make
this more effective, we dispatch IO in ascending order. This increases
the request merge chance and reduces disk seeks.
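
In outline, the dispatch step does the following (a simplified
kernel-style sketch with hypothetical names; the real logic is
dispatch_defer_bios() below, which also merges the per-entry bio lists
and compacts the array):

	/* take up to 'target' entries from a sector-sorted array,
	 * resuming above the last dispatched sector and wrapping
	 * around, so the disks keep sweeping in one direction */
	static int dispatch_batch(const sector_t *sorted, int n,
				  int target, sector_t *last_pos)
	{
		int start = 0, done = 0, i;

		while (start < n && sorted[start] <= *last_pos)
			start++;
		for (i = 0; i < n && done < target; i++, done++)
			*last_pos = sorted[(start + i) % n];
		return done;
	}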

Signed-off-by: Shaohua Li <shli@fb.com>
---
 drivers/md/raid5.c | 141 ++++++++++++++++++++++++++++++++++++++++++++---------
 drivers/md/raid5.h |  11 ++++-
 2 files changed, 127 insertions(+), 25 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index d7c90da..fe6232f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -56,6 +56,7 @@
 #include <linux/nodemask.h>
 #include <linux/flex_array.h>
 #include <trace/events/block.h>
+#include <linux/sort.h>
 
 #include "md.h"
 #include "raid5.h"
@@ -876,41 +877,121 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
 	return 1;
 }
 
-static void flush_deferred_bios(struct r5conf *conf)
+static void dispatch_bio_list(struct bio_list *tmp)
 {
-	struct bio_list tmp;
 	struct bio *bio;
 
+	while ((bio = bio_list_pop(tmp)))
+		generic_make_request(bio);
+}
+
+static int cmp_sh(const void *a, const void *b)
+{
+	const struct r5pending_data *da = a;
+	const struct r5pending_data *db = b;
+
+	if (da->sector > db->sector)
+		return 1;
+	if (da->sector < db->sector)
+		return -1;
+	return 0;
+}
+
+static void dispatch_defer_bios(struct r5conf *conf, int target,
+				struct bio_list *list)
+{
+	int start = 0;
+	int end = conf->pending_data_cnt - 1;
+	int mid;
+	int cnt = 0;
+
+	if (conf->pending_data_cnt == 0)
+		return;
+	sort(conf->pending_data, conf->pending_data_cnt,
+		sizeof(struct r5pending_data), cmp_sh, NULL);
+
+	while (start <= end) {
+		mid = (start + end) / 2;
+		if (conf->pending_data[mid].sector > conf->last_bio_pos)
+			end = mid - 1;
+		else if (conf->pending_data[mid].sector < conf->last_bio_pos)
+			start = mid + 1;
+		else {
+			start = mid;
+			break;
+		}
+	}
+
+	end = conf->pending_data_cnt - 1;
+	for (mid = start; mid <= end; mid++) {
+		conf->last_bio_pos = conf->pending_data[mid].sector;
+		bio_list_merge(list, &conf->pending_data[mid].bios);
+
+		cnt++;
+		if (cnt >= target) {
+			mid++;
+			break;
+		}
+	}
+	conf->pending_data_cnt -= cnt;
+	BUG_ON(conf->pending_data_cnt < 0);
+	if (mid <= end) {
+		memmove(&conf->pending_data[start], &conf->pending_data[mid],
+			(end - mid + 1) * sizeof(struct r5pending_data));
+		return;
+	}
+
+	for (mid = 0; mid < start; mid++) {
+		conf->last_bio_pos = conf->pending_data[mid].sector;
+		bio_list_merge(list, &conf->pending_data[mid].bios);
+
+		cnt++;
+		if (cnt >= target) {
+			mid++;
+			break;
+		}
+	}
+
+	conf->pending_data_cnt -= mid;
+	BUG_ON(conf->pending_data_cnt < 0);
+	if (mid == start) {
+		BUG_ON(conf->pending_data_cnt);
+		return;
+	}
+	memmove(&conf->pending_data[0], &conf->pending_data[mid],
+			(start - mid) * sizeof(struct r5pending_data));
+}
+
+static void flush_deferred_bios(struct r5conf *conf)
+{
+	struct bio_list tmp = BIO_EMPTY_LIST;
+
 	if (!conf->batch_bio_dispatch || !conf->group_cnt)
 		return;
 
-	bio_list_init(&tmp);
 	spin_lock(&conf->pending_bios_lock);
-	bio_list_merge(&tmp, &conf->pending_bios);
-	bio_list_init(&conf->pending_bios);
+	dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
 	spin_unlock(&conf->pending_bios_lock);
 
-	while ((bio = bio_list_pop(&tmp)))
-		generic_make_request(bio);
+	dispatch_bio_list(&tmp);
 }
 
-static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
+static void defer_issue_bios(struct r5conf *conf, sector_t sector,
+				struct bio_list *bios)
 {
-	/*
-	 * change group_cnt will drain all bios, so this is safe
-	 *
-	 * A read generally means a read-modify-write, which usually means a
-	 * randwrite, so we don't delay it
-	 */
-	if (!conf->batch_bio_dispatch || !conf->group_cnt ||
-	    bio_op(bio) == REQ_OP_READ) {
-		generic_make_request(bio);
-		return;
-	}
+	struct bio_list tmp = BIO_EMPTY_LIST;
+
 	spin_lock(&conf->pending_bios_lock);
-	bio_list_add(&conf->pending_bios, bio);
+	conf->pending_data[conf->pending_data_cnt].sector = sector;
+	bio_list_init(&conf->pending_data[conf->pending_data_cnt].bios);
+	bio_list_merge(&conf->pending_data[conf->pending_data_cnt].bios,
+		bios);
+	conf->pending_data_cnt++;
+	if (conf->pending_data_cnt >= PENDING_IO_MAX)
+		dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
 	spin_unlock(&conf->pending_bios_lock);
-	md_wakeup_thread(conf->mddev->thread);
+
+	dispatch_bio_list(&tmp);
 }
 
 static void
@@ -923,6 +1004,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 	struct r5conf *conf = sh->raid_conf;
 	int i, disks = sh->disks;
 	struct stripe_head *head_sh = sh;
+	struct bio_list pending_bios = BIO_EMPTY_LIST;
+	bool should_defer;
 
 	might_sleep();
 
@@ -939,6 +1022,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		}
 	}
 
+	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
+
 	for (i = disks; i--; ) {
 		int op, op_flags = 0;
 		int replace_only = 0;
@@ -1093,7 +1178,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
 						      bi, disk_devt(conf->mddev->gendisk),
 						      sh->dev[i].sector);
-			defer_bio_issue(conf, bi);
+			if (should_defer && op_is_write(op))
+				bio_list_add(&pending_bios, bi);
+			else
+				generic_make_request(bi);
 		}
 		if (rrdev) {
 			if (s->syncing || s->expanding || s->expanded
@@ -1138,7 +1226,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
 						      rbi, disk_devt(conf->mddev->gendisk),
 						      sh->dev[i].sector);
-			defer_bio_issue(conf, rbi);
+			if (should_defer && op_is_write(op))
+				bio_list_add(&pending_bios, rbi);
+			else
+				generic_make_request(rbi);
 		}
 		if (!rdev && !rrdev) {
 			if (op_is_write(op))
@@ -1156,6 +1247,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 		if (sh != head_sh)
 			goto again;
 	}
+
+	if (should_defer && !bio_list_empty(&pending_bios))
+		defer_issue_bios(conf, head_sh->sector, &pending_bios);
 }
 
 static struct dma_async_tx_descriptor *
@@ -6808,7 +6902,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
 	atomic_set(&conf->active_aligned_reads, 0);
-	bio_list_init(&conf->pending_bios);
 	spin_lock_init(&conf->pending_bios_lock);
 	conf->batch_bio_dispatch = true;
 	rdev_for_each(rdev, mddev) {
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 6b9d2e8..eb6d5d5 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -572,6 +572,13 @@ enum r5_cache_state {
 				 */
 };
 
+#define PENDING_IO_MAX 512
+#define PENDING_IO_ONE_FLUSH 128
+struct r5pending_data {
+	sector_t sector; /* stripe sector */
+	struct bio_list bios;
+};
+
 struct r5conf {
 	struct hlist_head	*stripe_hashtbl;
 	/* only protect corresponding hash list and inactive_list */
@@ -689,9 +696,11 @@ struct r5conf {
 	int			worker_cnt_per_group;
 	struct r5l_log		*log;
 
-	struct bio_list		pending_bios;
 	spinlock_t		pending_bios_lock;
 	bool			batch_bio_dispatch;
+	struct r5pending_data	pending_data[PENDING_IO_MAX];
+	int			pending_data_cnt;
+	sector_t		last_bio_pos;
 };
 
 
-- 
2.9.3



* Re: [PATCH 2/3] md/raid5-cache: bump flush stripe batch size
  2017-02-17 23:32 ` [PATCH 2/3] md/raid5-cache: bump flush stripe batch size Shaohua Li
@ 2017-03-03  3:03   ` NeilBrown
  2017-03-03 17:41     ` Shaohua Li
  0 siblings, 1 reply; 12+ messages in thread
From: NeilBrown @ 2017-03-03  3:03 UTC (permalink / raw)
  To: Shaohua Li, linux-raid; +Cc: songliubraving, kernel-team

On Fri, Feb 17 2017, Shaohua Li wrote:

> Bump the flush stripe batch size to 2048. For my 12-disk raid
> array, the stripes take:
> 12 * 4k * 2048 = 96MB
>
> This is still quite small. A hardware raid card generally has a 1GB
> cache, and we suggest the raid5-cache be given a similar cache size.
>
> The advantage of a big batch size is that we can dispatch a lot of IO
> at the same time, and then do some scheduling to create a better IO
> pattern.
>
> The previous patch prioritizes stripes, so we don't need to worry that
> a big flush stripe batch will starve normal stripes.
>
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  drivers/md/raid5-cache.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> index 3f307be..b25512c 100644
> --- a/drivers/md/raid5-cache.c
> +++ b/drivers/md/raid5-cache.c
> @@ -43,7 +43,7 @@
>  /* wake up reclaim thread periodically */
>  #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
>  /* start flush with these full stripes */
> -#define R5C_FULL_STRIPE_FLUSH_BATCH 256
> +#define R5C_FULL_STRIPE_FLUSH_BATCH 2048

Fixed numbers are warning signs... I wonder if there is something better
we could do?   "conf->max_nr_stripes / 4" maybe?  We use that sort of
number elsewhere.
Would that make sense?

Thanks,
NeilBrown

>  /* reclaim stripes in groups */
>  #define R5C_RECLAIM_STRIPE_GROUP (NR_STRIPE_HASH_LOCKS * 2)
>  
> -- 
> 2.9.3



* Re: [PATCH 3/3] md/raid5: sort bios
  2017-02-17 23:32 ` [PATCH 3/3] md/raid5: sort bios Shaohua Li
@ 2017-03-03  3:43   ` NeilBrown
  2017-03-03 17:59     ` Shaohua Li
  0 siblings, 1 reply; 12+ messages in thread
From: NeilBrown @ 2017-03-03  3:43 UTC (permalink / raw)
  To: Shaohua Li, linux-raid; +Cc: songliubraving, kernel-team

On Fri, Feb 17 2017, Shaohua Li wrote:

> An earlier patch (raid5: only dispatch IO from raid5d for harddisk raid)
> deferred IO dispatching. The goal is to create a better IO pattern. At
> that time, we didn't sort the deferred IO, hoping the block layer could
> merge and sort it. Now the raid5-cache writeback can create a large
> amount of bios. And if we enable multi-thread stripe handling, we can't
> control when to dispatch IO to the raid disks. Much of the time, we are
> dispatching IO which the block layer can't merge effectively.
>
> This patch takes the IO dispatching defer further. We accumulate bios,
> but we don't dispatch all of the bios once a threshold is met. This
> 'dispatch a partial portion of the bios' strategy allows bios arriving
> in a large time window to be sent to disks together. At dispatch time,
> there is a large chance the block layer can merge the bios. To make
> this more effective, we dispatch IO in ascending order. This increases
> the request merge chance and reduces disk seeks.

I can see the benefit of batching and sorting requests.

I wonder if the extra complexity of grouping together 512 requests, then
submitting the "first" 128 is really worth it.  Have you measured the
value of that?
If you just submitted every time you got 512 requests, you could use
list_sort() on the bio list and wouldn't need an array.

If an array really is best, it would be really nice if "sort" could pass
a 'void*' down to the cmp function, and it could sort all bios that are
*after* last_bio_pos first, and then the others.  That would make the
code much simpler.  I guess sort() could be changed (list_sort() already
has a 'priv' argument like this).
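
For example, the comparison for list_sort() could look something like
this (a sketch only; it assumes r5pending_data grows a hypothetical
'sibling' list_head member):

	static int cmp_after_pos(void *priv, struct list_head *a,
				 struct list_head *b)
	{
		struct r5conf *conf = priv;
		struct r5pending_data *da =
			list_entry(a, struct r5pending_data, sibling);
		struct r5pending_data *db =
			list_entry(b, struct r5pending_data, sibling);
		/* entries at or above last_bio_pos sort first; both
		 * halves stay in ascending sector order */
		bool after_a = da->sector >= conf->last_bio_pos;
		bool after_b = db->sector >= conf->last_bio_pos;

		if (after_a != after_b)
			return after_a ? -1 : 1;
		if (da->sector != db->sector)
			return da->sector < db->sector ? -1 : 1;
		return 0;
	}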

If we cannot change sort(), then maybe use lib/bsearch.c for the binary
search.  Performing two comparisons in the loop of a binary search
should get a *fail* in any algorithms class!!
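
For reference, a lower-bound search needs only one sector comparison
per iteration. An illustrative sketch, not code from the patch:

	/* index of the first entry at or above pos, or n if none */
	static int lower_bound(const struct r5pending_data *data, int n,
			       sector_t pos)
	{
		int lo = 0, hi = n;

		while (lo < hi) {
			int mid = lo + (hi - lo) / 2;

			if (data[mid].sector < pos)
				lo = mid + 1;
			else
				hi = mid;
		}
		return lo;
	}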

The "pending_data" array that you have added to the r5conf structure
adds 4096 bytes.  This means it is larger than a page, which is best
avoided (though it is unlikely to cause problems).  I would allocate it
separately.

So there is a lot that I don't really like, but it seems like a good
idea in principle.

Thanks,
NeilBrown


>
> Signed-off-by: Shaohua Li <shli@fb.com>
> ---
>  drivers/md/raid5.c | 141 ++++++++++++++++++++++++++++++++++++++++++++---------
>  drivers/md/raid5.h |  11 ++++-
>  2 files changed, 127 insertions(+), 25 deletions(-)
>
> diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
> index d7c90da..fe6232f 100644
> --- a/drivers/md/raid5.c
> +++ b/drivers/md/raid5.c
> @@ -56,6 +56,7 @@
>  #include <linux/nodemask.h>
>  #include <linux/flex_array.h>
>  #include <trace/events/block.h>
> +#include <linux/sort.h>
>  
>  #include "md.h"
>  #include "raid5.h"
> @@ -876,41 +877,121 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh)
>  	return 1;
>  }
>  
> -static void flush_deferred_bios(struct r5conf *conf)
> +static void dispatch_bio_list(struct bio_list *tmp)
>  {
> -	struct bio_list tmp;
>  	struct bio *bio;
>  
> +	while ((bio = bio_list_pop(tmp)))
> +		generic_make_request(bio);
> +}
> +
> +static int cmp_sh(const void *a, const void *b)
> +{
> +	const struct r5pending_data *da = a;
> +	const struct r5pending_data *db = b;
> +
> +	if (da->sector > db->sector)
> +		return 1;
> +	if (da->sector < db->sector)
> +		return -1;
> +	return 0;
> +}
> +
> +static void dispatch_defer_bios(struct r5conf *conf, int target,
> +				struct bio_list *list)
> +{
> +	int start = 0;
> +	int end = conf->pending_data_cnt - 1;
> +	int mid;
> +	int cnt = 0;
> +
> +	if (conf->pending_data_cnt == 0)
> +		return;
> +	sort(conf->pending_data, conf->pending_data_cnt,
> +		sizeof(struct r5pending_data), cmp_sh, NULL);
> +
> +	while (start <= end) {
> +		mid = (start + end) / 2;
> +		if (conf->pending_data[mid].sector > conf->last_bio_pos)
> +			end = mid - 1;
> +		else if (conf->pending_data[mid].sector < conf->last_bio_pos)
> +			start = mid + 1;
> +		else {
> +			start = mid;
> +			break;
> +		}
> +	}
> +
> +	end = conf->pending_data_cnt - 1;
> +	for (mid = start; mid <= end; mid++) {
> +		conf->last_bio_pos = conf->pending_data[mid].sector;
> +		bio_list_merge(list, &conf->pending_data[mid].bios);
> +
> +		cnt++;
> +		if (cnt >= target) {
> +			mid++;
> +			break;
> +		}
> +	}
> +	conf->pending_data_cnt -= cnt;
> +	BUG_ON(conf->pending_data_cnt < 0);
> +	if (mid <= end) {
> +		memmove(&conf->pending_data[start], &conf->pending_data[mid],
> +			(end - mid + 1) * sizeof(struct r5pending_data));
> +		return;
> +	}
> +
> +	for (mid = 0; mid < start; mid++) {
> +		conf->last_bio_pos = conf->pending_data[mid].sector;
> +		bio_list_merge(list, &conf->pending_data[mid].bios);
> +
> +		cnt++;
> +		if (cnt >= target) {
> +			mid++;
> +			break;
> +		}
> +	}
> +
> +	conf->pending_data_cnt -= mid;
> +	BUG_ON(conf->pending_data_cnt < 0);
> +	if (mid == start) {
> +		BUG_ON(conf->pending_data_cnt);
> +		return;
> +	}
> +	memmove(&conf->pending_data[0], &conf->pending_data[mid],
> +			(start - mid) * sizeof(struct r5pending_data));
> +}
> +
> +static void flush_deferred_bios(struct r5conf *conf)
> +{
> +	struct bio_list tmp = BIO_EMPTY_LIST;
> +
>  	if (!conf->batch_bio_dispatch || !conf->group_cnt)
>  		return;
>  
> -	bio_list_init(&tmp);
>  	spin_lock(&conf->pending_bios_lock);
> -	bio_list_merge(&tmp, &conf->pending_bios);
> -	bio_list_init(&conf->pending_bios);
> +	dispatch_defer_bios(conf, conf->pending_data_cnt, &tmp);
>  	spin_unlock(&conf->pending_bios_lock);
>  
> -	while ((bio = bio_list_pop(&tmp)))
> -		generic_make_request(bio);
> +	dispatch_bio_list(&tmp);
>  }
>  
> -static void defer_bio_issue(struct r5conf *conf, struct bio *bio)
> +static void defer_issue_bios(struct r5conf *conf, sector_t sector,
> +				struct bio_list *bios)
>  {
> -	/*
> -	 * change group_cnt will drain all bios, so this is safe
> -	 *
> -	 * A read generally means a read-modify-write, which usually means a
> -	 * randwrite, so we don't delay it
> -	 */
> -	if (!conf->batch_bio_dispatch || !conf->group_cnt ||
> -	    bio_op(bio) == REQ_OP_READ) {
> -		generic_make_request(bio);
> -		return;
> -	}
> +	struct bio_list tmp = BIO_EMPTY_LIST;
> +
>  	spin_lock(&conf->pending_bios_lock);
> -	bio_list_add(&conf->pending_bios, bio);
> +	conf->pending_data[conf->pending_data_cnt].sector = sector;
> +	bio_list_init(&conf->pending_data[conf->pending_data_cnt].bios);
> +	bio_list_merge(&conf->pending_data[conf->pending_data_cnt].bios,
> +		bios);
> +	conf->pending_data_cnt++;
> +	if (conf->pending_data_cnt >= PENDING_IO_MAX)
> +		dispatch_defer_bios(conf, PENDING_IO_ONE_FLUSH, &tmp);
>  	spin_unlock(&conf->pending_bios_lock);
> -	md_wakeup_thread(conf->mddev->thread);
> +
> +	dispatch_bio_list(&tmp);
>  }
>  
>  static void
> @@ -923,6 +1004,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>  	struct r5conf *conf = sh->raid_conf;
>  	int i, disks = sh->disks;
>  	struct stripe_head *head_sh = sh;
> +	struct bio_list pending_bios = BIO_EMPTY_LIST;
> +	bool should_defer;
>  
>  	might_sleep();
>  
> @@ -939,6 +1022,8 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>  		}
>  	}
>  
> +	should_defer = conf->batch_bio_dispatch && conf->group_cnt;
> +
>  	for (i = disks; i--; ) {
>  		int op, op_flags = 0;
>  		int replace_only = 0;
> @@ -1093,7 +1178,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>  				trace_block_bio_remap(bdev_get_queue(bi->bi_bdev),
>  						      bi, disk_devt(conf->mddev->gendisk),
>  						      sh->dev[i].sector);
> -			defer_bio_issue(conf, bi);
> +			if (should_defer && op_is_write(op))
> +				bio_list_add(&pending_bios, bi);
> +			else
> +				generic_make_request(bi);
>  		}
>  		if (rrdev) {
>  			if (s->syncing || s->expanding || s->expanded
> @@ -1138,7 +1226,10 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>  				trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
>  						      rbi, disk_devt(conf->mddev->gendisk),
>  						      sh->dev[i].sector);
> -			defer_bio_issue(conf, rbi);
> +			if (should_defer && op_is_write(op))
> +				bio_list_add(&pending_bios, rbi);
> +			else
> +				generic_make_request(rbi);
>  		}
>  		if (!rdev && !rrdev) {
>  			if (op_is_write(op))
> @@ -1156,6 +1247,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
>  		if (sh != head_sh)
>  			goto again;
>  	}
> +
> +	if (should_defer && !bio_list_empty(&pending_bios))
> +		defer_issue_bios(conf, head_sh->sector, &pending_bios);
>  }
>  
>  static struct dma_async_tx_descriptor *
> @@ -6808,7 +6902,6 @@ static struct r5conf *setup_conf(struct mddev *mddev)
>  	atomic_set(&conf->active_stripes, 0);
>  	atomic_set(&conf->preread_active_stripes, 0);
>  	atomic_set(&conf->active_aligned_reads, 0);
> -	bio_list_init(&conf->pending_bios);
>  	spin_lock_init(&conf->pending_bios_lock);
>  	conf->batch_bio_dispatch = true;
>  	rdev_for_each(rdev, mddev) {
> diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
> index 6b9d2e8..eb6d5d5 100644
> --- a/drivers/md/raid5.h
> +++ b/drivers/md/raid5.h
> @@ -572,6 +572,13 @@ enum r5_cache_state {
>  				 */
>  };
>  
> +#define PENDING_IO_MAX 512
> +#define PENDING_IO_ONE_FLUSH 128
> +struct r5pending_data {
> +	sector_t sector; /* stripe sector */
> +	struct bio_list bios;
> +};
> +
>  struct r5conf {
>  	struct hlist_head	*stripe_hashtbl;
>  	/* only protect corresponding hash list and inactive_list */
> @@ -689,9 +696,11 @@ struct r5conf {
>  	int			worker_cnt_per_group;
>  	struct r5l_log		*log;
>  
> -	struct bio_list		pending_bios;
>  	spinlock_t		pending_bios_lock;
>  	bool			batch_bio_dispatch;
> +	struct r5pending_data	pending_data[PENDING_IO_MAX];
> +	int			pending_data_cnt;
> +	sector_t		last_bio_pos;
>  };
>  
>  
> -- 
> 2.9.3



* Re: [PATCH 2/3] md/raid5-cache: bump flush stripe batch size
  2017-03-03  3:03   ` NeilBrown
@ 2017-03-03 17:41     ` Shaohua Li
  2017-03-06  6:23       ` NeilBrown
  0 siblings, 1 reply; 12+ messages in thread
From: Shaohua Li @ 2017-03-03 17:41 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, linux-raid, songliubraving, kernel-team

On Fri, Mar 03, 2017 at 02:03:31PM +1100, Neil Brown wrote:
> On Fri, Feb 17 2017, Shaohua Li wrote:
> 
> > Bump the flush stripe batch size to 2048. For my 12-disk raid
> > array, the stripes take:
> > 12 * 4k * 2048 = 96MB
> >
> > This is still quite small. A hardware raid card generally has a 1GB
> > cache, and we suggest the raid5-cache be given a similar cache size.
> >
> > The advantage of a big batch size is that we can dispatch a lot of IO
> > at the same time, and then do some scheduling to create a better IO
> > pattern.
> >
> > The previous patch prioritizes stripes, so we don't need to worry that
> > a big flush stripe batch will starve normal stripes.
> >
> > Signed-off-by: Shaohua Li <shli@fb.com>
> > ---
> >  drivers/md/raid5-cache.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> > index 3f307be..b25512c 100644
> > --- a/drivers/md/raid5-cache.c
> > +++ b/drivers/md/raid5-cache.c
> > @@ -43,7 +43,7 @@
> >  /* wake up reclaim thread periodically */
> >  #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
> >  /* start flush with these full stripes */
> > -#define R5C_FULL_STRIPE_FLUSH_BATCH 256
> > +#define R5C_FULL_STRIPE_FLUSH_BATCH 2048
> 
> Fixed numbers are warning signs... I wonder if there is something better
> we could do?   "conf->max_nr_stripes / 4" maybe?  We use that sort of
> number elsewhere.
> Would that make sense?

The code where we check the batch size (in r5c_do_reclaim) already has
a check:
total_cached > conf->min_nr_stripes * 1 / 2
so I think that's ok, no?

Thanks,
Shaohua


* Re: [PATCH 3/3] md/raid5: sort bios
  2017-03-03  3:43   ` NeilBrown
@ 2017-03-03 17:59     ` Shaohua Li
  2017-03-06  6:40       ` NeilBrown
  0 siblings, 1 reply; 12+ messages in thread
From: Shaohua Li @ 2017-03-03 17:59 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, linux-raid, songliubraving, kernel-team

On Fri, Mar 03, 2017 at 02:43:49PM +1100, Neil Brown wrote:
> On Fri, Feb 17 2017, Shaohua Li wrote:
> 
> > An earlier patch (raid5: only dispatch IO from raid5d for harddisk raid)
> > deferred IO dispatching. The goal is to create a better IO pattern. At
> > that time, we didn't sort the deferred IO, hoping the block layer could
> > merge and sort it. Now the raid5-cache writeback can create a large
> > amount of bios. And if we enable multi-thread stripe handling, we can't
> > control when to dispatch IO to the raid disks. Much of the time, we are
> > dispatching IO which the block layer can't merge effectively.
> >
> > This patch takes the IO dispatching defer further. We accumulate bios,
> > but we don't dispatch all of the bios once a threshold is met. This
> > 'dispatch a partial portion of the bios' strategy allows bios arriving
> > in a large time window to be sent to disks together. At dispatch time,
> > there is a large chance the block layer can merge the bios. To make
> > this more effective, we dispatch IO in ascending order. This increases
> > the request merge chance and reduces disk seeks.
> 
> I can see the benefit of batching and sorting requests.
> 
> I wonder if the extra complexity of grouping together 512 requests, then
> submitting the "first" 128 is really worth it.  Have you measured the
> value of that?

I'm pretty sure I tried. The whole point of dispatching the first 128 is
to keep the pipeline busy. Grouping 512 and then dispatching them
together definitely improves the IO pattern, but the request
accumulation takes time, and we would have no IO running in that window.

> If you just submitted every time you got 512 requests, you could use
> list_sort() on the bio list and wouldn't need an array.
> 
> If an array really is best, it would be really nice if "sort" could pass
> a 'void*' down to the cmp function,
> and it could sort all bios that are
> *after* last_bio_pos first, and then the others.  That would make the
> code much simpler.  I guess sort() could be changed (list_sort() already
> has a 'priv' argument like this).

Ok, I'll change this to a list, and add an extra pointer to record the
last sorted entry. I didn't see the sort using much time in my profile,
but the merge sort looks better. Will do the change.

> If we cannot change sort(), then maybe use lib/bsearch.c for the binary
> search.  Performing two comparisons in the loop of a binary search
> should get a *fail* in any algorithms class!!
> The "pending_data" array that you have added to the r5conf structure
> adds 4096 bytes.  This means it is larger than a page, which is best
> avoided (though it is unlikely to cause problems).  I would allocate it
> separately.

Yep, already fixed internally.

> 
> So there is a lot that I don't really like, but it seems like a good
> idea in principle.

ok, thanks for your time!

Thanks,
Shaohua


* Re: [PATCH 2/3] md/raid5-cache: bump flush stripe batch size
  2017-03-03 17:41     ` Shaohua Li
@ 2017-03-06  6:23       ` NeilBrown
  2017-03-07 20:50         ` Shaohua Li
  0 siblings, 1 reply; 12+ messages in thread
From: NeilBrown @ 2017-03-06  6:23 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Shaohua Li, linux-raid, songliubraving, kernel-team

On Fri, Mar 03 2017, Shaohua Li wrote:

> On Fri, Mar 03, 2017 at 02:03:31PM +1100, Neil Brown wrote:
>> On Fri, Feb 17 2017, Shaohua Li wrote:
>> 
>> > Bump the flush stripe batch size to 2048. For my 12-disk raid
>> > array, the stripes take:
>> > 12 * 4k * 2048 = 96MB
>> >
>> > This is still quite small. A hardware raid card generally has a 1GB
>> > cache, and we suggest the raid5-cache be given a similar cache size.
>> >
>> > The advantage of a big batch size is that we can dispatch a lot of IO
>> > at the same time, and then do some scheduling to create a better IO
>> > pattern.
>> >
>> > The previous patch prioritizes stripes, so we don't need to worry that
>> > a big flush stripe batch will starve normal stripes.
>> >
>> > Signed-off-by: Shaohua Li <shli@fb.com>
>> > ---
>> >  drivers/md/raid5-cache.c | 2 +-
>> >  1 file changed, 1 insertion(+), 1 deletion(-)
>> >
>> > diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
>> > index 3f307be..b25512c 100644
>> > --- a/drivers/md/raid5-cache.c
>> > +++ b/drivers/md/raid5-cache.c
>> > @@ -43,7 +43,7 @@
>> >  /* wake up reclaim thread periodically */
>> >  #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
>> >  /* start flush with these full stripes */
>> > -#define R5C_FULL_STRIPE_FLUSH_BATCH 256
>> > +#define R5C_FULL_STRIPE_FLUSH_BATCH 2048
>> 
>> Fixed numbers are warning signs... I wonder if there is something better
>> we could do?   "conf->max_nr_stripes / 4" maybe?  We use that sort of
>> number elsewhere.
>> Would that make sense?
>
> The code where we check the batch size (in r5c_do_reclaim) already has
> a check:
> total_cached > conf->min_nr_stripes * 1 / 2
> so I think that's ok, no?

I'm not sure what you are saying.

I'm suggesting that we get rid of R5C_FULL_STRIPE_FLUSH_BATCH and use a
number like "conf->max_nr_stripes / 4".
Are you agreeing, or are you saying that you don't think we need to get
rid of R5C_FULL_STRIPE_FLUSH_BATCH??

Thanks,
NeilBrown



* Re: [PATCH 3/3] md/raid5: sort bios
  2017-03-03 17:59     ` Shaohua Li
@ 2017-03-06  6:40       ` NeilBrown
  2017-03-07 20:23         ` Shaohua Li
  0 siblings, 1 reply; 12+ messages in thread
From: NeilBrown @ 2017-03-06  6:40 UTC (permalink / raw)
  To: Shaohua Li; +Cc: Shaohua Li, linux-raid, songliubraving, kernel-team

On Fri, Mar 03 2017, Shaohua Li wrote:

> On Fri, Mar 03, 2017 at 02:43:49PM +1100, Neil Brown wrote:
>> On Fri, Feb 17 2017, Shaohua Li wrote:
>> 
>> > An earlier patch (raid5: only dispatch IO from raid5d for harddisk raid)
>> > deferred IO dispatching. The goal is to create a better IO pattern. At
>> > that time, we didn't sort the deferred IO, hoping the block layer could
>> > merge and sort it. Now the raid5-cache writeback can create a large
>> > amount of bios. And if we enable multi-thread stripe handling, we can't
>> > control when to dispatch IO to the raid disks. Much of the time, we are
>> > dispatching IO which the block layer can't merge effectively.
>> >
>> > This patch takes the IO dispatching defer further. We accumulate bios,
>> > but we don't dispatch all of the bios once a threshold is met. This
>> > 'dispatch a partial portion of the bios' strategy allows bios arriving
>> > in a large time window to be sent to disks together. At dispatch time,
>> > there is a large chance the block layer can merge the bios. To make
>> > this more effective, we dispatch IO in ascending order. This increases
>> > the request merge chance and reduces disk seeks.
>> 
>> I can see the benefit of batching and sorting requests.
>> 
>> I wonder if the extra complexity of grouping together 512 requests, then
>> submitting the "first" 128 is really worth it.  Have you measured the
>> value of that?
>
> I'm pretty sure I tried. The whole point of dispatching the first 128 is
> to keep the pipeline busy. Grouping 512 and then dispatching them
> together definitely improves the IO pattern, but the request
> accumulation takes time, and we would have no IO running in that window.

But we don't wait for the first batch before we start collecting the
next batch - do we?  Why would there be a window with no IO running?


>
>> If you just submitted every time you got 512 requests, you could use
>> list_sort() on the bio list and wouldn't need an array.
>> 
>> If an array really is best, it would be really nice if "sort" could pass
>> a 'void*' down to the cmp function,
>> and it could sort all bios that are
>> *after* last_bio_pos first, and then the others.  That would make the
>> code much simpler.  I guess sort() could be changed (list_sort() already
>> has a 'priv' argument like this).
>
> Ok, I'll change this to a list, and add an extra pointer to record the
> last sorted entry. I didn't see the sort using much time in my profile,
> but the merge sort looks better. Will do the change.

I think both sorts are O(N log N).
I had thought that list_sort() would work on a bio_list, but it requires
a list_head (even though it doesn't use the prev pointer).
If it worked on a bio_list and if you could just submit the whole batch,
then using list_sort would have meant that you don't need to allocate a
table of r5pending_data.
Now with the struct list_head in there, the data is twice the size.
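
For example (a sketch of the layout being discussed; the 'sibling'
member name is hypothetical):

	struct r5pending_data {
		struct list_head sibling;
		sector_t sector; /* stripe sector */
		struct bio_list bios;
	};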

I guess that doesn't matter too much.

It just feels like there should be a cleaner solution, but I cannot find
it without writing a new sort function (not that it would be so hard to
do that).

Thanks,
NeilBrown


>
>> If we cannot change sort(), then maybe use lib/bsearch.c for the binary
>> search.  Performing two comparisons in the loop of a binary search
>> should get a *fail* in any algorithms class!!
>> The "pending_data" array that you have added to the r5conf structure
>> adds 4096 bytes.  This means it is larger than a page, which is best
>> avoided (though it is unlikely to cause problems).  I would allocate it
>> separately.
>
> Yep, already fixed internally.
>
>> 
>> So there is a lot that I don't really like, but it seems like a good
>> idea in principle.
>
> ok, thanks for your time!
>
> Thanks,
> Shaohua



* Re: [PATCH 3/3] md/raid5: sort bios
  2017-03-06  6:40       ` NeilBrown
@ 2017-03-07 20:23         ` Shaohua Li
  0 siblings, 0 replies; 12+ messages in thread
From: Shaohua Li @ 2017-03-07 20:23 UTC (permalink / raw)
  To: NeilBrown; +Cc: Shaohua Li, linux-raid, songliubraving, kernel-team

On Mon, Mar 06, 2017 at 05:40:16PM +1100, Neil Brown wrote:
> On Fri, Mar 03 2017, Shaohua Li wrote:
> 
> > On Fri, Mar 03, 2017 at 02:43:49PM +1100, Neil Brown wrote:
> >> On Fri, Feb 17 2017, Shaohua Li wrote:
> >> 
> >> > An earlier patch (raid5: only dispatch IO from raid5d for harddisk raid)
> >> > deferred IO dispatching. The goal is to create a better IO pattern. At
> >> > that time, we didn't sort the deferred IO, hoping the block layer could
> >> > merge and sort it. Now the raid5-cache writeback can create a large
> >> > amount of bios. And if we enable multi-thread stripe handling, we can't
> >> > control when to dispatch IO to the raid disks. Much of the time, we are
> >> > dispatching IO which the block layer can't merge effectively.
> >> >
> >> > This patch takes the IO dispatching defer further. We accumulate bios,
> >> > but we don't dispatch all of the bios once a threshold is met. This
> >> > 'dispatch a partial portion of the bios' strategy allows bios arriving
> >> > in a large time window to be sent to disks together. At dispatch time,
> >> > there is a large chance the block layer can merge the bios. To make
> >> > this more effective, we dispatch IO in ascending order. This increases
> >> > the request merge chance and reduces disk seeks.
> >> 
> >> I can see the benefit of batching and sorting requests.
> >> 
> >> I wonder if the extra complexity of grouping together 512 requests, then
> >> submitting the "first" 128 is really worth it.  Have you measured the
> >> value of that?
> >
> > I'm pretty sure I tried. The whole point of dispatching the first 128 is
> > to keep the pipeline busy. Grouping 512 and then dispatching them
> > together definitely improves the IO pattern, but the request
> > accumulation takes time, and we would have no IO running in that window.
> 
> But we don't wait for the first batch before we start collecting the
> next batch - do we?  Why would there be a window with no IO running?

We don't. Dispatching 128 instead of 512 is to avoid the case where the
first batch is finished but the second batch hasn't accumulated enough
stripes yet. In that case, we would have no IO running, which is the
window I mentioned.

> >
> >> If you just submitted every time you got 512 requests, you could use
> >> list_sort() on the bio list and wouldn't need an array.
> >> 
> >> If an array really is best, it would be really nice if "sort" could pass
> >> a 'void*' down to the cmp function,
> >> and it could sort all bios that are
> >> *after* last_bio_pos first, and then the others.  That would make the
> >> code much simpler.  I guess sort() could be changed (list_sort() already
> >> has a 'priv' argument like this).
> >
> > Ok, I'll change this to a list, and add an extra pointer to record the
> > last sorted entry. I didn't see the sort using much time in my profile,
> > but the merge sort looks better. Will do the change.
> 
> I think both sorts are O(N log N).
> I had thought that list_sort() would work on a bio_list, but it requires
> a list_head (even though it doesn't use the prev pointer).
> If it worked on a bio_list and if you could just submit the whole batch,
> then using list_sort would have meant that you don't need to allocate a
> table of r5pending_data.
> Now with the struct list_head in there, the data is twice the size.
> 
> I guess that doesn't matter too much.
> 
> It just feels like there should be a cleaner solution, but I cannot find
> it without writing a new sort function (not that it would be so hard to
> do that).

How about the v2 I sent? Using a list avoids the memmove.

Thanks,
Shaohua


* Re: [PATCH 2/3] md/raid5-cache: bump flush stripe batch size
  2017-03-06  6:23       ` NeilBrown
@ 2017-03-07 20:50         ` Shaohua Li
  0 siblings, 0 replies; 12+ messages in thread
From: Shaohua Li @ 2017-03-07 20:50 UTC (permalink / raw)
  To: NeilBrown; +Cc: linux-raid, songliubraving

On Mon, Mar 06, 2017 at 05:23:00PM +1100, Neil Brown wrote:
> On Fri, Mar 03 2017, Shaohua Li wrote:
> 
> > On Fri, Mar 03, 2017 at 02:03:31PM +1100, Neil Brown wrote:
> >> On Fri, Feb 17 2017, Shaohua Li wrote:
> >> 
> >> > Bump the flush stripe batch size to 2048. For my 12-disk raid
> >> > array, the stripes take:
> >> > 12 * 4k * 2048 = 96MB
> >> >
> >> > This is still quite small. A hardware raid card generally has a 1GB
> >> > cache, and we suggest the raid5-cache be given a similar cache size.
> >> >
> >> > The advantage of a big batch size is that we can dispatch a lot of IO
> >> > at the same time, and then do some scheduling to create a better IO
> >> > pattern.
> >> >
> >> > The previous patch prioritizes stripes, so we don't need to worry that
> >> > a big flush stripe batch will starve normal stripes.
> >> >
> >> > Signed-off-by: Shaohua Li <shli@fb.com>
> >> > ---
> >> >  drivers/md/raid5-cache.c | 2 +-
> >> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >> >
> >> > diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
> >> > index 3f307be..b25512c 100644
> >> > --- a/drivers/md/raid5-cache.c
> >> > +++ b/drivers/md/raid5-cache.c
> >> > @@ -43,7 +43,7 @@
> >> >  /* wake up reclaim thread periodically */
> >> >  #define R5C_RECLAIM_WAKEUP_INTERVAL (30 * HZ)
> >> >  /* start flush with these full stripes */
> >> > -#define R5C_FULL_STRIPE_FLUSH_BATCH 256
> >> > +#define R5C_FULL_STRIPE_FLUSH_BATCH 2048
> >> 
> >> Fixed numbers are warning signs... I wonder if there is something better
> >> we could do?   "conf->max_nr_stripes / 4" maybe?  We use that sort of
> >> number elsewhere.
> >> Would that make sense?
> >
> > The code where we check the batch size (in r5c_do_reclaim) already has
> > a check:
> > total_cached > conf->min_nr_stripes * 1 / 2
> > so I think that's ok, no?
> 
> I'm not sure what you are saying.
> 
> I'm suggesting that we get rid of R5C_FULL_STRIPE_FLUSH_BATCH and use a
> number like "conf->max_nr_stripes / 4"
> Are you agreeing, or are you saying that you don't think we need to get
> rid of R5C_FULL_STRIPE_FLUSH_BATCH??

What I mean is that we already check min_nr_stripes, which is related to
max_nr_stripes, so we don't need to check max_nr_stripes again. Thinking
about this more, max_nr_stripes / 4 does make more sense if the cache is
very big. I'll change R5C_FULL_STRIPE_FLUSH_BATCH to
'conf->max_nr_stripes / 4'.
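
i.e. something like (a sketch; call sites would then pass conf):

	-#define R5C_FULL_STRIPE_FLUSH_BATCH 2048
	+#define R5C_FULL_STRIPE_FLUSH_BATCH(conf) ((conf)->max_nr_stripes / 4)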

Thanks,
Shaohua


Thread overview: 12 messages
2017-02-17 23:32 [PATCH 0/3] md/raid5: improve IO pattern Shaohua Li
2017-02-17 23:32 ` [PATCH 1/3] md/raid5: prioritize stripes for writeback Shaohua Li
2017-02-17 23:32 ` [PATCH 2/3] md/raid5-cache: bump flush stripe batch size Shaohua Li
2017-03-03  3:03   ` NeilBrown
2017-03-03 17:41     ` Shaohua Li
2017-03-06  6:23       ` NeilBrown
2017-03-07 20:50         ` Shaohua Li
2017-02-17 23:32 ` [PATCH 3/3] md/raid5: sort bios Shaohua Li
2017-03-03  3:43   ` NeilBrown
2017-03-03 17:59     ` Shaohua Li
2017-03-06  6:40       ` NeilBrown
2017-03-07 20:23         ` Shaohua Li
