* Load balancing reads on dmraid1 (and 01) arrays
@ 2012-06-05  0:54 Robert Collins
  2012-07-17 19:20 ` Alasdair G Kergon
  0 siblings, 1 reply; 2+ messages in thread
From: Robert Collins @ 2012-06-05  0:54 UTC (permalink / raw)
  To: dm-devel

[-- Attachment #1: Type: text/plain, Size: 1973 bytes --]

Hi, I've had this sitting in a local tree for a while (> 12 months -
blush!) and have been meaning to send it here...

It's my first dm submission, so please be gentle :) - I'll happily
fix up any issues in the patch (or in other metadata), though I have
limited bandwidth to spend on this, so I may not get to it
immediately.

The attached patches change dmraid1 to use all backing devices to
satisfy read requests rather than just one.

The first uses a very simple heuristic - whichever device most
recently had a read dispatched near a given location is preferred for
subsequent reads. There is an obvious theoretical issue with this:
writes move the disk heads as well, and so should effectively reset
all the devices to the same position. The problem, though, is that
this is really a scheduling problem, and one that the multipath stack
would be better at handling. (E.g. multipath should be aware of
corner cases like: what if there is a dispatched read on the device
we think is closest, *after* a write? Or what if load isn't being
balanced because all the reads are happening in one spot and one
device is totally idle?) So I used a very simple heuristic, which
works well for me - I get significantly better read performance with
the first patch than without.
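
To make the heuristic concrete, here is a minimal userspace sketch of
the same distance arithmetic (the struct and all the sector values
are made up for illustration; the real choose_mirror() in the patch
also skips mirrors with outstanding errors):

#include <stdio.h>

typedef unsigned long long sector_t;

struct mirror {
	sector_t last_read_position;
};

/* Same arithmetic as bio_distance() in the patch: absolute distance
 * from a mirror's last dispatched read to the new request. */
static sector_t bio_distance(struct mirror *m, sector_t sector)
{
	return m->last_read_position > sector ?
	       m->last_read_position - sector :
	       sector - m->last_read_position;
}

int main(void)
{
	/* A read of 8 sectors arrives at sector 5000. */
	struct mirror m0 = { 5000 }, m1 = { 90000 };
	sector_t sector = 5000, count = 8;

	/* m0 is at distance 0 (the sequential case), m1 at 85000, so
	 * m0 wins; the winner's recorded position then advances past
	 * the new read. */
	struct mirror *winner =
		bio_distance(&m0, sector) <= bio_distance(&m1, sector) ?
		&m0 : &m1;
	winner->last_read_position = sector + count;

	printf("winner now at sector %llu\n", winner->last_read_position);
	return 0;
}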

The second patch tweaks the heuristic to handle the case where one
disk is being monopolised, which I found happens quite frequently
with e.g. kernel builds, where there is high locality of reference to
the content. Again, it's not perfect, and in this case there is a
horrible magic constant :(.
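
(For scale: sequential_ios() counts 512-byte sectors, so that
constant works out to

    131072 sectors * 512 bytes/sector = 64 MiB

of sequential reads dispatched to one mirror before the heuristic
forces a switch to the other.)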

Again, *I* find this a net win, but I have no idea whether other folk
will find it so.

I had a brief look at making this use multipath instead, but that
looks to my naive eye to be both a significant undertaking (which I
don't have time for - sorry!) and also perhaps conceptually
problematic, as this needs to be an asymmetric multipath: reads can
multipath but writes must hit all devices.

Cheers,
Rob

[-- Attachment #2: 0001-Load-balance-dm-raid1-reads-by-most-recently-dispatc.patch --]
[-- Type: application/octet-stream, Size: 6871 bytes --]

From a7047dcb4f19335089de39bcc5faee27e68f1b74 Mon Sep 17 00:00:00 2001
From: Robert Collins <robertc@robertcollins.net>
Date: Fri, 8 Jul 2011 22:42:02 +1200
Subject: [PATCH 1/2] Load balance dm-raid1 reads by most recently dispatched
 sector.

---
 drivers/md/dm-raid1.c |   97 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 83 insertions(+), 14 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9bfd057..e16a030 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -26,6 +26,14 @@
 #define DM_RAID1_HANDLE_ERRORS 0x01
 #define errors_handled(p)	((p)->features & DM_RAID1_HANDLE_ERRORS)
 
+#ifndef PRINTK
+#  if DEBUG > 0
+#    define PRINTK(x...) printk(KERN_DEBUG x)
+#  else
+#    define PRINTK(x...)
+#  endif
+#endif
+
 static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
 
 /*-----------------------------------------------------------------
@@ -44,6 +52,7 @@ struct mirror {
 	unsigned long error_type;
 	struct dm_dev *dev;
 	sector_t offset;
+        sector_t last_read_position; /* lets us map sequential IO to one disk */
 };
 
 struct mirror_set {
@@ -403,19 +412,77 @@ static void do_recovery(struct mirror_set *ms)
 /*-----------------------------------------------------------------
  * Reads
  *---------------------------------------------------------------*/
-static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
+static sector_t bio_distance(struct mirror *m, sector_t sector)
 {
-	struct mirror *m = get_default_mirror(ms);
-
-	do {
-		if (likely(!atomic_read(&m->error_count)))
-			return m;
-
-		if (m-- == ms->mirror)
-			m += ms->nr_mirrors;
-	} while (m != get_default_mirror(ms));
+        return m->last_read_position > sector ? m->last_read_position - sector : sector - m->last_read_position;
+}
 
-	return NULL;
+static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector, sector_t count, bool update_read_pos)
+{
+        struct mirror *m, *closest = NULL;
+        sector_t distance;
+        sector_t temp_distance;
+        PRINTK("Choosing mirror: %llu %llu %d\n",
+                (unsigned long long) sector,
+                (unsigned long long) count,
+                (int) update_read_pos);
+        /* Find the first usable */
+        for (m = &ms->mirror[ms->nr_mirrors]; m != ms->mirror;) {
+		if (likely(!atomic_read(&(--m)->error_count))) {
+                        distance = bio_distance(m, sector);
+                        closest = m;
+                        PRINTK("Choosing mirror: %llu %llu %d: closest=%p mirror=%p distance=%llu last-read %llu\n",
+                                (unsigned long long) sector,
+                                (unsigned long long) count,
+                                (int) update_read_pos,
+                                closest, ms->mirror, (unsigned long long)distance,
+                                (unsigned long long)m->last_read_position);
+                        break;
+                } else {
+                        PRINTK("Choosing mirror: %llu %llu %d: has-errors %d\n",
+                                (unsigned long long) sector,
+                                (unsigned long long) count,
+                                (int) update_read_pos,
+                                atomic_read(&m->error_count));
+                }
+        }
+        /* Nothing usable */
+        if (unlikely(closest == NULL))
+                return NULL;
+        if (unlikely(closest == ms->mirror))
+                return closest;
+        /* Now see if there is a closer mirror */
+        for (m = closest; m != ms->mirror;) {
+                if (unlikely(atomic_read(&(--m)->error_count))) {
+                        PRINTK("Choosing mirror: %llu %llu %d: has-errors %d\n",
+                                (unsigned long long) sector,
+                                (unsigned long long) count,
+                                (int) update_read_pos,
+                                atomic_read(&m->error_count));
+                        continue;
+                }
+                temp_distance = bio_distance(m, sector);
+                PRINTK("Choosing mirror: %llu %llu %d: closest=%p mirror=%p distance=%llu, temp_distance=%llu last-read %llu\n",
+                        (unsigned long long) sector,
+                        (unsigned long long) count,
+                        (int) update_read_pos,
+                        closest, ms->mirror,
+                        (unsigned long long) distance,
+                        (unsigned long long) temp_distance,
+                        (unsigned long long)m->last_read_position);
+                if (temp_distance < distance) {
+                        distance = temp_distance;
+                        closest = m;
+                }
+        }
+        /* TODO: also track where IOs have completed: tracking submits
+         * lets us handle sequential requests, but the disks probably have NCQ
+         * tagging - if we know where the disk head is after either a write or
+         * a read we can dispatch nearer to it.
+         */
+        if (likely(update_read_pos))
+                closest->last_read_position = sector + count;
+        return closest;
 }
 
 static int default_ok(struct mirror *m)
@@ -431,7 +498,7 @@ static int mirror_available(struct mirror_set *ms, struct bio *bio)
 	region_t region = dm_rh_bio_to_region(ms->rh, bio);
 
 	if (log->type->in_sync(log, region, 0))
-		return choose_mirror(ms,  bio->bi_sector) ? 1 : 0;
+		return choose_mirror(ms,  bio->bi_sector, bio->bi_size >> 9, false) ? 1 : 0;
 
 	return 0;
 }
@@ -500,6 +567,7 @@ static void read_callback(unsigned long error, void *context)
 	bio_set_m(bio, NULL);
 
 	if (likely(!error)) {
+                /* Should update head position here */
 		bio_endio(bio, 0);
 		return;
 	}
@@ -558,7 +626,7 @@ static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 		 * We can only read balance if the region is in sync.
 		 */
 		if (likely(region_in_sync(ms, region, 1)))
-			m = choose_mirror(ms, bio->bi_sector);
+			m = choose_mirror(ms, bio->bi_sector, bio->bi_size >> 9, 1);
 		else if (m && atomic_read(&m->error_count))
 			m = NULL;
 
@@ -940,6 +1008,7 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 	atomic_set(&(ms->mirror[mirror].error_count), 0);
 	ms->mirror[mirror].error_type = 0;
 	ms->mirror[mirror].offset = offset;
+	ms->mirror[mirror].last_read_position = 0;
 
 	return 0;
 }
@@ -1181,7 +1250,7 @@ static int mirror_map(struct dm_target *ti, struct bio *bio,
 	 * The region is in-sync and we can perform reads directly.
 	 * Store enough information so we can retry if it fails.
 	 */
-	m = choose_mirror(ms, bio->bi_sector);
+	m = choose_mirror(ms, bio->bi_sector, bio->bi_size >> 9, 1);
 	if (unlikely(!m))
 		return -EIO;
 
-- 
1.7.9.5


[-- Attachment #3: 0002-Force-switching-drives-after-enough-linear-IO-allows.patch --]
[-- Type: application/octet-stream, Size: 7553 bytes --]

From 693806f872ff119aa526216ee45628e929c386f6 Mon Sep 17 00:00:00 2001
From: Robert Collins <robertc@robertcollins.net>
Date: Mon, 4 Jun 2012 15:01:14 +1200
Subject: [PATCH 2/2] Force switching drives after enough linear IO - allows
 readahead to exercise parallelism as well.

---
 drivers/md/dm-raid1.c |   59 ++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 8 deletions(-)

diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index e16a030..b9504d8 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -22,11 +22,19 @@
 #define DM_MSG_PREFIX "raid1"
 
 #define MAX_RECOVERY 1	/* Maximum number of regions recovered in parallel. */
+/* Maximum sector count of sequential IO sent to one mirror. This needs to be
+ * large enough not to saturate drive command queues but small enough that we
+ * can keep both drives active.
+ */
+#define DM_SEQUENTIAL_IO_MIRROR_LIMIT 131072
 
 #define DM_RAID1_HANDLE_ERRORS 0x01
 #define errors_handled(p)	((p)->features & DM_RAID1_HANDLE_ERRORS)
 
 #ifndef PRINTK
+#  ifndef DEBUG
+#    define DEBUG 0
+#  endif
 #  if DEBUG > 0
 #    define PRINTK(x...) printk(KERN_DEBUG x)
 #  else
@@ -52,6 +60,7 @@ struct mirror {
 	unsigned long error_type;
 	struct dm_dev *dev;
 	sector_t offset;
+        sector_t sequential_io_start_position; /* lets us detect long runs of sequential IO for load balancing */
         sector_t last_read_position; /* lets us map sequential IO to one disk */
 };
 
@@ -417,11 +426,18 @@ static sector_t bio_distance(struct mirror *m, sector_t sector)
         return m->last_read_position > sector ? m->last_read_position - sector : sector - m->last_read_position;
 }
 
+/* Return the number of sectors of sequential read IO dispatched so far to m */
+static sector_t sequential_ios(struct mirror *m)
+{
+        return m->last_read_position - m->sequential_io_start_position;
+}
+
 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector, sector_t count, bool update_read_pos)
 {
         struct mirror *m, *closest = NULL;
         sector_t distance;
         sector_t temp_distance;
+        bool prefer_other_mirror;
         PRINTK("Choosing mirror: %llu %llu %d\n",
                 (unsigned long long) sector,
                 (unsigned long long) count,
@@ -431,12 +447,15 @@ static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector, sect
 		if (likely(!atomic_read(&(--m)->error_count))) {
                         distance = bio_distance(m, sector);
                         closest = m;
-                        PRINTK("Choosing mirror: %llu %llu %d: closest=%p mirror=%p distance=%llu last-read %llu\n",
+                        PRINTK("Choosing mirror: %llu %llu %d: closest=%p mirror=%p distance=%llu last-read %llu sequential %llu\n",
                                 (unsigned long long) sector,
                                 (unsigned long long) count,
                                 (int) update_read_pos,
                                 closest, ms->mirror, (unsigned long long)distance,
-                                (unsigned long long)m->last_read_position);
+                                (unsigned long long)m->last_read_position,
+                                (unsigned long long)sequential_ios(m));
+                        /* Actively prefer a different mirror on sequential IO if we have exceeded the threshold */
+                        prefer_other_mirror = distance == 0 && (sequential_ios(m) >= DM_SEQUENTIAL_IO_MIRROR_LIMIT);
                         break;
                 } else {
                         PRINTK("Choosing mirror: %llu %llu %d: has-errors %d\n",
@@ -449,9 +468,15 @@ static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector, sect
         /* Nothing usable */
         if (unlikely(closest == NULL))
                 return NULL;
+        /* Only one drive available */
         if (unlikely(closest == ms->mirror))
                 return closest;
-        /* Now see if there is a closer mirror */
+        /* Now see if there is another usable mirror that is closer and has not
+         * had too much sequential read IO dispatched to it yet.
+         * TODO: when N >= nr_mirrors sets of sequential read IO are
+         * happening, allow each mirror to specialise, and avoid extraneous
+         * seeking.
+         */
         for (m = closest; m != ms->mirror;) {
                 if (unlikely(atomic_read(&(--m)->error_count))) {
                         PRINTK("Choosing mirror: %llu %llu %d: has-errors %d\n",
@@ -462,17 +487,31 @@ static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector, sect
                         continue;
                 }
                 temp_distance = bio_distance(m, sector);
-                PRINTK("Choosing mirror: %llu %llu %d: closest=%p mirror=%p distance=%llu, temp_distance=%llu last-read %llu\n",
+                PRINTK("Choosing mirror: %llu %llu %d: closest=%p m=%p distance=%llu, temp_distance=%llu last-read %llu sequential %llu prefer-other %d\n",
                         (unsigned long long) sector,
                         (unsigned long long) count,
                         (int) update_read_pos,
-                        closest, ms->mirror,
+                        closest, m,
                         (unsigned long long) distance,
                         (unsigned long long) temp_distance,
-                        (unsigned long long)m->last_read_position);
-                if (temp_distance < distance) {
+                        (unsigned long long)m->last_read_position,
+                        (unsigned long long)sequential_ios(m),
+                        (int)prefer_other_mirror);
+                /* Use this valid mirror if:
+                 *  - the first usable mirror has hit its sequential limit
+                 *  - or this mirror is closer and (on sequential IO) has not hit its limit
+                 */
+                if (prefer_other_mirror ||
+                        (temp_distance < distance &&
+                        (temp_distance != 0 || sequential_ios(m) < DM_SEQUENTIAL_IO_MIRROR_LIMIT))) {
                         distance = temp_distance;
                         closest = m;
+                        prefer_other_mirror = false;
+                        PRINTK("Choosing mirror: %llu %llu %d: selected %p\n",
+                                (unsigned long long) sector,
+                                (unsigned long long) count,
+                                (int) update_read_pos,
+                                m);
                 }
         }
         /* TODO: also track where IOs have completed: tracking submits
@@ -480,8 +519,11 @@ static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector, sect
          * tagging - if we know where the disk head is after either a write or
          * a read we can dispatch nearer to it.
          */
-        if (likely(update_read_pos))
+        if (likely(update_read_pos)) {
+                if (closest->last_read_position != sector)
+                    closest->sequential_io_start_position = sector;
                 closest->last_read_position = sector + count;
+        }
         return closest;
 }
 
@@ -1009,6 +1051,7 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
 	ms->mirror[mirror].error_type = 0;
 	ms->mirror[mirror].offset = offset;
 	ms->mirror[mirror].last_read_position = 0;
+	ms->mirror[mirror].sequential_io_start_position = 0;
 
 	return 0;
 }
-- 
1.7.9.5


* Re: Load balancing reads on dmraid1 (and 01) arrays
  2012-06-05  0:54 Load balancing reads on dmraid1 (and 01) arrays Robert Collins
@ 2012-07-17 19:20 ` Alasdair G Kergon
  0 siblings, 0 replies; 2+ messages in thread
From: Alasdair G Kergon @ 2012-07-17 19:20 UTC (permalink / raw)
  To: Robert Collins; +Cc: dm-devel

Is this guaranteed to improve performance for everyone?
If there are realistic configurations where it would make performance
worse, it needs to be configurable.

Hooking into extended versions of the multipath load balancing
modules (and adding a new proximity one) might indeed be another approach.

For upstream, the PRINTKs would need to come out, of course.
(We do have DMDEBUG, though.)
If there are any necessary statistics that can't be gleaned with
'iostat' they could be published via blktrace or 'dmsetup status'.
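
E.g. something like

    DMDEBUG("choose_mirror: sector=%llu count=%llu",
            (unsigned long long) sector,
            (unsigned long long) count);

would be a plausible replacement - DMDEBUG takes printk-style
arguments, adds the DM_MSG_PREFIX for you, and compiles away unless
CONFIG_DM_DEBUG is set.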

Alasdair
