* [PATCH 0/3] dm-mpath: dynamic load balancers (v2)
@ 2009-04-24 8:04 Kiyoshi Ueda
2009-04-24 8:06 ` [PATCH 1/3] dm-mpath: interface change for dynamic load balancers Kiyoshi Ueda
` (2 more replies)
0 siblings, 3 replies; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-04-24 8:04 UTC (permalink / raw)
To: Alasdair Kergon; +Cc: device-mapper development, stefan.bader
Hi,
The patch-set adds the following 2 dynamic load balancers:
o dm-queue-length: queue-length oriented dynamic load balancer
o dm-service-time: service-time oriented dynamic load balancer
This patch-set can be applied on top of 2.6.30-rc3.
No dependencies on Alasdair's linux-next patches.
NOTE:
While the patches compile and work with the current bio-based dm,
for them to *properly* work, the request-based dm patches should
be applied, too.
See <http://marc.info/?l=dm-devel&m=123736590931733&w=2> for
why request-based dm improves multipath load balancing.
Summary of the patch-set
========================
1/3: dm-mpath: interface change for dynamic load balancers
2/3: dm-mpath: add queue-length oriented dynamic load balancer
3/3: dm-mpath: add service-time oriented dynamic load balancer
drivers/md/Kconfig | 18 ++
drivers/md/Makefile | 2
drivers/md/dm-mpath.c | 28 ++-
drivers/md/dm-path-selector.h | 8 -
drivers/md/dm-queue-length.c | 257 +++++++++++++++++++++++++++++++++++
drivers/md/dm-round-robin.c | 2
drivers/md/dm-service-time.c | 301 ++++++++++++++++++++++++++++++++++++++++++
7 files changed, 603 insertions(+), 13 deletions(-)
CHANGES
=======
v2:
- Rebased to 2.6.30-rc3
v1: (http://marc.info/?l=dm-devel&m=123736539330980&w=2)
- Rebased to 2.6.29-rc8
Thanks,
Kiyoshi Ueda
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 1/3] dm-mpath: interface change for dynamic load balancers
2009-04-24 8:04 [PATCH 0/3] dm-mpath: dynamic load balancers (v2) Kiyoshi Ueda
@ 2009-04-24 8:06 ` Kiyoshi Ueda
2009-04-24 8:07 ` [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer Kiyoshi Ueda
2009-04-24 8:08 ` [PATCH 3/3] dm-mpath: add service-time " Kiyoshi Ueda
2 siblings, 0 replies; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-04-24 8:06 UTC (permalink / raw)
To: Alasdair Kergon; +Cc: device-mapper development, stefan.bader
This patch changes the dm path selector interface for dynamic
load balancers:
o adding a new hook, start_io()
o adding 'nr_bytes' parameter to select_path()/start_io()/end_io()
to pass the size of the I/O
start_io() is called when a target driver actually submits I/O
to the selected path.
Path selectors can use it to start accounting of the I/O.
(e.g. counting the number of in-flight I/Os.)
The start_io hook is based on the patch posted by Stefan Bader:
https://www.redhat.com/archives/dm-devel/2005-October/msg00050.html
nr_bytes, the size of the I/O, is used by path selectors for
size-based decision.
dm-service-time uses it to estimate service time, for example.
(Added the nr_bytes member to dm_mpath_io instead of using existing
details.bi_size, since request-based dm patch deletes it.)
Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
---
drivers/md/dm-mpath.c | 28 ++++++++++++++++++----------
drivers/md/dm-path-selector.h | 8 ++++++--
drivers/md/dm-round-robin.c | 2 +-
3 files changed, 25 insertions(+), 13 deletions(-)
Index: 2.6.30-rc3/drivers/md/dm-mpath.c
===================================================================
--- 2.6.30-rc3.orig/drivers/md/dm-mpath.c
+++ 2.6.30-rc3/drivers/md/dm-mpath.c
@@ -102,6 +102,7 @@ struct multipath {
struct dm_mpath_io {
struct pgpath *pgpath;
struct dm_bio_details details;
+ size_t nr_bytes;
};
typedef int (*action_fn) (struct pgpath *pgpath);
@@ -250,11 +251,12 @@ static void __switch_pg(struct multipath
m->pg_init_count = 0;
}
-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
+ size_t nr_bytes)
{
struct dm_path *path;
- path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
+ path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
if (!path)
return -ENXIO;
@@ -266,7 +268,7 @@ static int __choose_path_in_pg(struct mu
return 0;
}
-static void __choose_pgpath(struct multipath *m)
+static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
{
struct priority_group *pg;
unsigned bypassed = 1;
@@ -278,12 +280,12 @@ static void __choose_pgpath(struct multi
if (m->next_pg) {
pg = m->next_pg;
m->next_pg = NULL;
- if (!__choose_path_in_pg(m, pg))
+ if (!__choose_path_in_pg(m, pg, nr_bytes))
return;
}
/* Don't change PG until it has no remaining paths */
- if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
+ if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
return;
/*
@@ -295,7 +297,7 @@ static void __choose_pgpath(struct multi
list_for_each_entry(pg, &m->priority_groups, list) {
if (pg->bypassed == bypassed)
continue;
- if (!__choose_path_in_pg(m, pg))
+ if (!__choose_path_in_pg(m, pg, nr_bytes))
return;
}
} while (bypassed--);
@@ -326,6 +328,7 @@ static int map_io(struct multipath *m, s
struct dm_mpath_io *mpio, unsigned was_queued)
{
int r = DM_MAPIO_REMAPPED;
+ size_t nr_bytes = bio->bi_size;
unsigned long flags;
struct pgpath *pgpath;
@@ -334,7 +337,7 @@ static int map_io(struct multipath *m, s
/* Do we need to select a new pgpath? */
if (!m->current_pgpath ||
(!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
- __choose_pgpath(m);
+ __choose_pgpath(m, nr_bytes);
pgpath = m->current_pgpath;
@@ -359,6 +362,11 @@ static int map_io(struct multipath *m, s
r = -EIO; /* Failed */
mpio->pgpath = pgpath;
+ mpio->nr_bytes = nr_bytes;
+
+ if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
+ pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
+ nr_bytes);
spin_unlock_irqrestore(&m->lock, flags);
@@ -437,7 +445,7 @@ static void process_queued_ios(struct wo
goto out;
if (!m->current_pgpath)
- __choose_pgpath(m);
+ __choose_pgpath(m, 0);
pgpath = m->current_pgpath;
@@ -1195,7 +1203,7 @@ static int multipath_end_io(struct dm_ta
if (pgpath) {
ps = &pgpath->pg->ps;
if (ps->type->end_io)
- ps->type->end_io(ps, &pgpath->path);
+ ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
}
if (r != DM_ENDIO_INCOMPLETE)
mempool_free(mpio, m->mpio_pool);
@@ -1411,7 +1419,7 @@ static int multipath_ioctl(struct dm_tar
spin_lock_irqsave(&m->lock, flags);
if (!m->current_pgpath)
- __choose_pgpath(m);
+ __choose_pgpath(m, 0);
if (m->current_pgpath) {
bdev = m->current_pgpath->path.dev->bdev;
Index: 2.6.30-rc3/drivers/md/dm-path-selector.h
===================================================================
--- 2.6.30-rc3.orig/drivers/md/dm-path-selector.h
+++ 2.6.30-rc3/drivers/md/dm-path-selector.h
@@ -56,7 +56,8 @@ struct path_selector_type {
* the path fails.
*/
struct dm_path *(*select_path) (struct path_selector *ps,
- unsigned *repeat_count);
+ unsigned *repeat_count,
+ size_t nr_bytes);
/*
* Notify the selector that a path has failed.
@@ -75,7 +76,10 @@ struct path_selector_type {
int (*status) (struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen);
- int (*end_io) (struct path_selector *ps, struct dm_path *path);
+ int (*start_io) (struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes);
+ int (*end_io) (struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes);
};
/* Register a path selector */
Index: 2.6.30-rc3/drivers/md/dm-round-robin.c
===================================================================
--- 2.6.30-rc3.orig/drivers/md/dm-round-robin.c
+++ 2.6.30-rc3/drivers/md/dm-round-robin.c
@@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path
}
static struct dm_path *rr_select_path(struct path_selector *ps,
- unsigned *repeat_count)
+ unsigned *repeat_count, size_t nr_bytes)
{
struct selector *s = (struct selector *) ps->context;
struct path_info *pi = NULL;
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer
2009-04-24 8:04 [PATCH 0/3] dm-mpath: dynamic load balancers (v2) Kiyoshi Ueda
2009-04-24 8:06 ` [PATCH 1/3] dm-mpath: interface change for dynamic load balancers Kiyoshi Ueda
@ 2009-04-24 8:07 ` Kiyoshi Ueda
2009-05-09 0:31 ` Alasdair G Kergon
2009-04-24 8:08 ` [PATCH 3/3] dm-mpath: add service-time " Kiyoshi Ueda
2 siblings, 1 reply; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-04-24 8:07 UTC (permalink / raw)
To: Alasdair Kergon; +Cc: device-mapper development, stefan.bader
This patch adds a dynamic load balancer, dm-queue-length, which
balances the number of in-flight I/Os.
The code is based on the patch posted by Stefan Bader:
https://www.redhat.com/archives/dm-devel/2005-October/msg00050.html
Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
---
drivers/md/Kconfig | 9 +
drivers/md/Makefile | 1
drivers/md/dm-queue-length.c | 257 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 267 insertions(+)
Index: 2.6.30-rc3/drivers/md/dm-queue-length.c
===================================================================
--- /dev/null
+++ 2.6.30-rc3/drivers/md/dm-queue-length.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2004-2005 IBM Corp. All Rights Reserved.
+ * Copyright (C) 2006-2009 NEC Corporation.
+ *
+ * dm-queue-length.c
+ *
+ * Module Author: Stefan Bader, IBM
+ * Modified by: Kiyoshi Ueda, NEC
+ *
+ * This file is released under the GPL.
+ *
+ * queue-length path selector, which chooses a path with the least number of
+ * in-flight I/Os.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+
+#define DM_MSG_PREFIX "multipath queue-length"
+#define QL_MIN_IO 128
+#define QL_VERSION "0.1.0"
+
+struct selector {
+ struct list_head valid_paths;
+ struct list_head failed_paths;
+};
+
+struct path_info {
+ struct list_head list;
+ struct dm_path *path;
+ unsigned int repeat_count;
+ atomic_t qlen;
+};
+
+static struct selector *alloc_selector(void)
+{
+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s) {
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->failed_paths);
+ }
+
+ return s;
+}
+
+static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+ struct selector *s = alloc_selector();
+
+ if (!s)
+ return -ENOMEM;
+
+ ps->context = s;
+ return 0;
+}
+
+static void ql_free_paths(struct list_head *paths)
+{
+ struct path_info *pi, *next;
+
+ list_for_each_entry_safe(pi, next, paths, list) {
+ list_del(&pi->list);
+ kfree(pi);
+ }
+}
+
+static void ql_destroy(struct path_selector *ps)
+{
+ struct selector *s = (struct selector *) ps->context;
+
+ ql_free_paths(&s->valid_paths);
+ ql_free_paths(&s->failed_paths);
+ kfree(s);
+ ps->context = NULL;
+}
+
+static int ql_status(struct path_selector *ps, struct dm_path *path,
+ status_type_t type, char *result, unsigned int maxlen)
+{
+ int sz = 0;
+ struct path_info *pi;
+
+ /* When called with (path == NULL), return selector status/args. */
+ if (!path)
+ DMEMIT("0 ");
+ else {
+ pi = path->pscontext;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%u ", atomic_read(&pi->qlen));
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u ", pi->repeat_count);
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static int ql_add_path(struct path_selector *ps, struct dm_path *path,
+ int argc, char **argv, char **error)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi;
+ unsigned int repeat_count = QL_MIN_IO;
+
+ /* Parse the arguments */
+ if (argc > 1) {
+ *error = "queue-length ps: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ /* First path argument is number of I/Os before switching path. */
+ if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+ *error = "queue-length ps: invalid repeat count";
+ return -EINVAL;
+ }
+
+ /* Allocate the path information structure */
+ pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+ if (!pi) {
+ *error = "queue-length ps: Error allocating path information";
+ return -ENOMEM;
+ }
+
+ pi->path = path;
+ pi->repeat_count = repeat_count;
+ atomic_set(&pi->qlen, 0);
+
+ path->pscontext = pi;
+
+ list_add_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = path->pscontext;
+
+ list_move(&pi->list, &s->failed_paths);
+}
+
+static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = path->pscontext;
+
+ list_move_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+static struct dm_path *ql_select_path(struct path_selector *ps,
+ unsigned *repeat_count, size_t nr_bytes)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = NULL, *best = NULL;
+
+ if (list_empty(&s->valid_paths))
+ return NULL;
+
+ /* Change preferred (first in list) path to evenly balance. */
+ list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+ list_for_each_entry(pi, &s->valid_paths, list) {
+ if (!best ||
+ (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
+ best = pi;
+
+ if (!atomic_read(&best->qlen))
+ break;
+ }
+
+ if (!best)
+ return NULL;
+
+ *repeat_count = best->repeat_count;
+
+ return best->path;
+}
+
+static int ql_start_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+
+ atomic_inc(&pi->qlen);
+
+ return 0;
+}
+
+static int ql_end_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+
+ atomic_dec(&pi->qlen);
+
+ return 0;
+}
+
+static struct path_selector_type ql_ps = {
+ .name = "queue-length",
+ .module = THIS_MODULE,
+ .table_args = 1,
+ .info_args = 1,
+ .create = ql_create,
+ .destroy = ql_destroy,
+ .status = ql_status,
+ .add_path = ql_add_path,
+ .fail_path = ql_fail_path,
+ .reinstate_path = ql_reinstate_path,
+ .select_path = ql_select_path,
+ .start_io = ql_start_io,
+ .end_io = ql_end_io,
+};
+
+static int __init dm_ql_init(void)
+{
+ int r = dm_register_path_selector(&ql_ps);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ DMINFO("version " QL_VERSION " loaded");
+
+ return r;
+}
+
+static void __exit dm_ql_exit(void)
+{
+ int r = dm_unregister_path_selector(&ql_ps);
+
+ if (r < 0)
+ DMERR("unregister failed %d", r);
+}
+
+module_init(dm_ql_init);
+module_exit(dm_ql_exit);
+
+MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
+MODULE_DESCRIPTION(
+ "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n"
+ DM_NAME " path selector to balance the number of in-flight I/Os"
+);
+MODULE_LICENSE("GPL");
Index: 2.6.30-rc3/drivers/md/Makefile
===================================================================
--- 2.6.30-rc3.orig/drivers/md/Makefile
+++ 2.6.30-rc3/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
+obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
Index: 2.6.30-rc3/drivers/md/Kconfig
===================================================================
--- 2.6.30-rc3.orig/drivers/md/Kconfig
+++ 2.6.30-rc3/drivers/md/Kconfig
@@ -249,6 +249,15 @@ config DM_MULTIPATH
---help---
Allow volume managers to support multipath hardware.
+config DM_MULTIPATH_QL
+ tristate "I/O Path Selector based on the number of in-flight I/Os"
+ depends on DM_MULTIPATH
+ ---help---
+ This path selector is a dynamic load balancer which selects
+ a path having the least number of in-flight I/Os.
+
+ If unsure, say N.
+
config DM_DELAY
tristate "I/O delaying target (EXPERIMENTAL)"
depends on BLK_DEV_DM && EXPERIMENTAL
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
2009-04-24 8:04 [PATCH 0/3] dm-mpath: dynamic load balancers (v2) Kiyoshi Ueda
2009-04-24 8:06 ` [PATCH 1/3] dm-mpath: interface change for dynamic load balancers Kiyoshi Ueda
2009-04-24 8:07 ` [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer Kiyoshi Ueda
@ 2009-04-24 8:08 ` Kiyoshi Ueda
2009-05-09 0:49 ` Alasdair G Kergon
2 siblings, 1 reply; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-04-24 8:08 UTC (permalink / raw)
To: Alasdair Kergon; +Cc: device-mapper development
This patch adds a service time oriented dynamic load balancer,
dm-service-time, which selects a path with the shortest estimated
service time for the incoming I/O.
The service time is estimated by dividing the in-flight I/O size
with performance value of each path.
The performance value can be given as a table argument at the table
loading time. If no performance value is given, all paths are
recognized as equal performance.
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
---
drivers/md/Kconfig | 9 +
drivers/md/Makefile | 1
drivers/md/dm-service-time.c | 301 +++++++++++++++++++++++++++++++++++++++++++
3 files changed, 311 insertions(+)
Index: 2.6.30-rc3/drivers/md/dm-service-time.c
===================================================================
--- /dev/null
+++ 2.6.30-rc3/drivers/md/dm-service-time.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (C) 2007-2009 NEC Corporation. All Rights Reserved.
+ *
+ * Module Author: Kiyoshi Ueda
+ *
+ * This file is released under the GPL.
+ *
+ * Throughput oriented path selector.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#define DM_MSG_PREFIX "multipath service-time"
+#define ST_MIN_IO 1
+#define ST_VERSION "0.1.0"
+
+struct selector {
+ struct list_head valid_paths;
+ struct list_head failed_paths;
+};
+
+struct path_info {
+ struct list_head list;
+ struct dm_path *path;
+ unsigned int repeat_count;
+ size_t perf;
+ atomic_t in_flight_size; /* Total size of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s) {
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->failed_paths);
+ }
+
+ return s;
+}
+
+static int st_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+ struct selector *s = alloc_selector();
+
+ if (!s)
+ return -ENOMEM;
+
+ ps->context = s;
+ return 0;
+}
+
+static void free_paths(struct list_head *paths)
+{
+ struct path_info *pi, *next;
+
+ list_for_each_entry_safe(pi, next, paths, list) {
+ list_del(&pi->list);
+ kfree(pi);
+ }
+}
+
+static void st_destroy(struct path_selector *ps)
+{
+ struct selector *s = (struct selector *) ps->context;
+
+ free_paths(&s->valid_paths);
+ free_paths(&s->failed_paths);
+ kfree(s);
+ ps->context = NULL;
+}
+
+static int st_status(struct path_selector *ps, struct dm_path *path,
+ status_type_t type, char *result, unsigned int maxlen)
+{
+ int sz = 0;
+ struct path_info *pi;
+
+ if (!path)
+ DMEMIT("0 ");
+ else {
+ pi = path->pscontext;
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("%u %lu ", atomic_read(&pi->in_flight_size),
+ pi->perf);
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u %lu ", pi->repeat_count, pi->perf);
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static int st_add_path(struct path_selector *ps, struct dm_path *path,
+ int argc, char **argv, char **error)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi;
+ unsigned int repeat_count = ST_MIN_IO;
+ size_t perf = 1;
+
+ if (argc > 2) {
+ *error = "service-time ps: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ /* First path argument is number of I/Os before switching path. */
+ if ((argc > 0) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+ *error = "service-time ps: invalid repeat count";
+ return -EINVAL;
+ }
+
+ /*
+ * Second path argument is a relative performance value.
+ * If 0 is given, the path isn't used while other paths having
+ * a positive value are available.
+ */
+ if ((argc == 2) && (sscanf(argv[1], "%lu", &perf) != 1)) {
+ *error = "service-time ps: invalid performance value";
+ return -EINVAL;
+ }
+
+ /* allocate the path */
+ pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+ if (!pi) {
+ *error = "service-time ps: Error allocating path context";
+ return -ENOMEM;
+ }
+
+ pi->path = path;
+ pi->repeat_count = repeat_count;
+ pi->perf = perf;
+ atomic_set(&pi->in_flight_size, 0);
+
+ path->pscontext = pi;
+
+ list_add_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+static void st_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = path->pscontext;
+
+ list_move(&pi->list, &s->failed_paths);
+}
+
+static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = path->pscontext;
+
+ list_move_tail(&pi->list, &s->valid_paths);
+
+ return 0;
+}
+
+/*
+ * Returns:
+ * < 0 : pi1 is better
+ * 0 : no difference between pi1 and pi2
+ * > 0 : pi2 is better
+ */
+static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
+ size_t incoming)
+{
+ size_t sz1, sz2;
+
+ sz1 = atomic_read(&pi1->in_flight_size);
+ sz2 = atomic_read(&pi2->in_flight_size);
+
+ /*
+ * Case 1: Both have same performace value. Choose less loaded path.
+ */
+ if (pi1->perf == pi2->perf)
+ return sz1 - sz2;
+
+ /*
+ * Case 2a: Both have same load. Choose higher performance path.
+ * Case 2b: One path has no performance value. Choose the other one.
+ */
+ if (sz1 == sz2 || !pi1->perf || !pi2->perf)
+ return pi2->perf - pi1->perf;
+
+ /*
+ * Case 3: Calculate service time. Choose faster path.
+ * if ((sz1+incoming)/pi1->perf < (sz2+incoming)/pi2->perf) pi1
+ * if ((sz1+incoming)/pi1->perf > (sz2+incoming)/pi2->perf) pi2
+ */
+ sz1 += incoming;
+ sz2 += incoming;
+ while (sz1 && sz2 && (sz1 < pi1->perf) && (sz2 < pi2->perf)) {
+ /* Size is not big enough to compare by division. Shift up */
+ sz1 <<= 2;
+ sz2 <<= 2;
+ }
+ do_div(sz1, pi1->perf);
+ do_div(sz2, pi2->perf);
+
+ if (sz1 != sz2)
+ return sz1 - sz2;
+
+ /*
+ * Case 4: Service time is equal. Choose higher performance path.
+ */
+ return pi2->perf - pi1->perf;
+}
+
+static struct dm_path *st_select_path(struct path_selector *ps,
+ unsigned *repeat_count, size_t nr_bytes)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = NULL, *best = NULL;
+
+ if (list_empty(&s->valid_paths))
+ return NULL;
+
+ /* Change preferred (first in list) path to evenly balance. */
+ list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+ list_for_each_entry(pi, &s->valid_paths, list)
+ if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
+ best = pi;
+
+ if (!best)
+ return NULL;
+
+ *repeat_count = best->repeat_count;
+
+ return best->path;
+}
+
+static int st_start_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+
+ atomic_add(nr_bytes, &pi->in_flight_size);
+
+ return 0;
+}
+
+static int st_end_io(struct path_selector *ps, struct dm_path *path,
+ size_t nr_bytes)
+{
+ struct path_info *pi = path->pscontext;
+
+ atomic_sub(nr_bytes, &pi->in_flight_size);
+
+ return 0;
+}
+
+static struct path_selector_type st_ps = {
+ .name = "service-time",
+ .module = THIS_MODULE,
+ .table_args = 2,
+ .info_args = 2,
+ .create = st_create,
+ .destroy = st_destroy,
+ .status = st_status,
+ .add_path = st_add_path,
+ .fail_path = st_fail_path,
+ .reinstate_path = st_reinstate_path,
+ .select_path = st_select_path,
+ .start_io = st_start_io,
+ .end_io = st_end_io,
+};
+
+static int __init dm_st_init(void)
+{
+ int r = dm_register_path_selector(&st_ps);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ DMINFO("version " ST_VERSION " loaded");
+
+ return r;
+}
+
+static void __exit dm_st_exit(void)
+{
+ int r = dm_unregister_path_selector(&st_ps);
+
+ if (r < 0)
+ DMERR("unregister failed %d", r);
+}
+
+module_init(dm_st_init);
+module_exit(dm_st_exit);
+
+MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
+MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
+MODULE_LICENSE("GPL");
Index: 2.6.30-rc3/drivers/md/Makefile
===================================================================
--- 2.6.30-rc3.orig/drivers/md/Makefile
+++ 2.6.30-rc3/drivers/md/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
+obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
obj-$(CONFIG_DM_ZERO) += dm-zero.o
Index: 2.6.30-rc3/drivers/md/Kconfig
===================================================================
--- 2.6.30-rc3.orig/drivers/md/Kconfig
+++ 2.6.30-rc3/drivers/md/Kconfig
@@ -258,6 +258,15 @@ config DM_MULTIPATH_QL
If unsure, say N.
+config DM_MULTIPATH_ST
+ tristate "I/O Path Selector based on the service time"
+ depends on DM_MULTIPATH
+ ---help---
+ This path selector is a dynamic load balancer which selects
+ a path to complete the incoming I/O with the shortest time.
+
+ If unsure, say N.
+
config DM_DELAY
tristate "I/O delaying target (EXPERIMENTAL)"
depends on BLK_DEV_DM && EXPERIMENTAL
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer
2009-04-24 8:07 ` [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer Kiyoshi Ueda
@ 2009-05-09 0:31 ` Alasdair G Kergon
2009-05-14 6:14 ` Kiyoshi Ueda
0 siblings, 1 reply; 12+ messages in thread
From: Alasdair G Kergon @ 2009-05-09 0:31 UTC (permalink / raw)
To: Kiyoshi Ueda; +Cc: device-mapper development, stefan.bader
On Fri, Apr 24, 2009 at 05:07:26PM +0900, Kiyoshi Ueda wrote:
> + unsigned int repeat_count;
Just 'unsigned repeat_count'
> + struct selector *s = (struct selector *) ps->context;
Drop the cast when it's from void.
struct selector *s = ps->context;
> +static int ql_status(struct path_selector *ps, struct dm_path *path,
> + status_type_t type, char *result, unsigned int maxlen)
unsigned maxlen
(We're doing things like this in all new patches, but only changing existing
code when it's touched for some other reason.)
> + int sz = 0;
signed or unsigned sz?
(It's already used inconsistently, I know - but is unsigned better here?)
> + case STATUSTYPE_INFO:
> + DMEMIT("%u ", atomic_read(&pi->qlen));
Signed or unsigned?
> + /* Parse the arguments */
Please document parameters and selection algorithm in Documentation dir and
maybe in inline comments too - it's not immediately obvious exactly how it
behaves.
> +static struct dm_path *ql_select_path(struct path_selector *ps,
> + unsigned *repeat_count, size_t nr_bytes)
Alasdair
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
2009-04-24 8:08 ` [PATCH 3/3] dm-mpath: add service-time " Kiyoshi Ueda
@ 2009-05-09 0:49 ` Alasdair G Kergon
2009-05-15 3:09 ` Kiyoshi Ueda
0 siblings, 1 reply; 12+ messages in thread
From: Alasdair G Kergon @ 2009-05-09 0:49 UTC (permalink / raw)
To: Kiyoshi Ueda; +Cc: device-mapper development
On Fri, Apr 24, 2009 at 05:08:02PM +0900, Kiyoshi Ueda wrote:
> + struct selector *s = (struct selector *) ps->context;
> + status_type_t type, char *result, unsigned int maxlen)
> + int sz = 0;
> + DMEMIT("%u %lu ", atomic_read(&pi->in_flight_size),
(similar comments)
> + DMEMIT("%u %lu ", pi->repeat_count, pi->perf);
Need to handle both 32 and 64 bit archs: cast to llu.
> + if ((argc == 2) && (sscanf(argv[1], "%lu", &perf) != 1)) {
Please do sscanf for size_t the same way as dm-crypt does.
(Or scan for lu, if the intent is to impose a size limit ahead of the later
do_div() then cast explicitly.)
> +/*
> + * Returns:
> + * < 0 : pi1 is better
> + * 0 : no difference between pi1 and pi2
> + * > 0 : pi2 is better
> + */
Please document the parameters and algorithm.
Nothing explains what the performance value is for, nor gives examples of anticipated values.
> + do_div(sz1, pi1->perf);
> + do_div(sz2, pi2->perf);
Or sector_div ?
But what's the performance of those divisions like?
Is there a better way of getting the same answer?
Is there validation limiting the size of 'perf'?
(Also, did you check there's adequate locking & memory barriers around all the
atomics?)
Alasdair
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer
2009-05-09 0:31 ` Alasdair G Kergon
@ 2009-05-14 6:14 ` Kiyoshi Ueda
2009-05-14 12:34 ` Alasdair G Kergon
0 siblings, 1 reply; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-05-14 6:14 UTC (permalink / raw)
To: Alasdair Kergon; +Cc: device-mapper development, stefan.bader
[-- Attachment #1: Type: text/plain, Size: 2034 bytes --]
Hi Alasdair,
Thank you for the review and comments.
I totally agree with your comments.
I found that you have already updated the patch in your editing tree, thanks.
I attached patches for remaining parts of your comments.
Please review and apply.
On 05/09/2009 09:31 AM +0900, Alasdair G Kergon wrote:
> On Fri, Apr 24, 2009 at 05:07:26PM +0900, Kiyoshi Ueda wrote:
>> + unsigned int repeat_count;
>
> Just 'unsigned repeat_count'
OK. (You have already fixed this.)
>> + struct selector *s = (struct selector *) ps->context;
>
> Drop the cast when it's from void.
>
> struct selector *s = ps->context;
OK. (You have already fixed this.)
>> +static int ql_status(struct path_selector *ps, struct dm_path *path,
>> + status_type_t type, char *result, unsigned int maxlen)
>
> unsigned maxlen
OK, fixed in rqdm-dlb-02-queue-length-dlb-type-fixes.patch
> (We're doing things like this in all new patches, but only changing existing
> code when it's touched for some other reason.)
OK, I followed the style of dm-round-robin and I understand your
preference now.
I'll also check the request-based dm patch-set, too, when I update
and repost it.
>> + int sz = 0;
>
> signed or unsigned sz?
> (It's already used inconsistently, I know - but is unsigned better here?)
Yes, I think unsigned is better. (You have already fixed this.)
>> + case STATUSTYPE_INFO:
>> + DMEMIT("%u ", atomic_read(&pi->qlen));
>
> Signed or unsigned?
qlen should be always >= 0, but atomic_t is defined as signed.
So I use signed here.
Fixed in rqdm-dlb-02-queue-length-dlb-type-fixes.patch
>> + /* Parse the arguments */
>
> Please document parameters and selection algorithm in Documentation dir and
> maybe in inline comments too - it's not immediately obvious exactly how it
> behaves.
>
>> +static struct dm_path *ql_select_path(struct path_selector *ps,
>> + unsigned *repeat_count, size_t nr_bytes)
OK, added comments/documentation in
rqdm-dlb-02-queue-length-dlb-document.patch.
Thanks,
Kiyoshi Ueda
[-- Attachment #2: rqdm-dlb-02-queue-length-dlb-type-fixes.patch --]
[-- Type: text/plain, Size: 1174 bytes --]
o Use 'unsigned' instead of 'unsigned int' for maxlen in dm-queue-length.
o Fix the print format to use %d for qlen in dm-queue-length, since
atomic_t is defined as 'int', not 'unsigned int'.
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
drivers/md/dm-queue-length.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
Index: 2.6.30-rc5/drivers/md/dm-queue-length.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-queue-length.c
+++ 2.6.30-rc5/drivers/md/dm-queue-length.c
@@ -82,7 +82,7 @@ static void ql_destroy(struct path_selec
}
static int ql_status(struct path_selector *ps, struct dm_path *path,
- status_type_t type, char *result, unsigned int maxlen)
+ status_type_t type, char *result, unsigned maxlen)
{
unsigned sz = 0;
struct path_info *pi;
@@ -95,7 +95,7 @@ static int ql_status(struct path_selecto
switch (type) {
case STATUSTYPE_INFO:
- DMEMIT("%u ", atomic_read(&pi->qlen));
+ DMEMIT("%d ", atomic_read(&pi->qlen));
break;
case STATUSTYPE_TABLE:
DMEMIT("%u ", pi->repeat_count);
[-- Attachment #3: rqdm-dlb-02-queue-length-dlb-document.patch --]
[-- Type: text/plain, Size: 3178 bytes --]
Add documents/comments for dm-queue-length.
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
Documentation/device-mapper/dm-queue-length.txt | 39 ++++++++++++++++++++++++
drivers/md/dm-queue-length.c | 12 +++++--
2 files changed, 48 insertions(+), 3 deletions(-)
Index: 2.6.30-rc5/Documentation/device-mapper/dm-queue-length.txt
===================================================================
--- /dev/null
+++ 2.6.30-rc5/Documentation/device-mapper/dm-queue-length.txt
@@ -0,0 +1,39 @@
+dm-queue-length
+===============
+
+dm-queue-length is a path selector module for device-mapper targets,
+which selects a path having the least number of in-flight I/Os.
+The path selector name is 'queue-length'.
+
+Table parameters for each path: [<repeat_count>]
+ <repeat_count>: The number of I/Os to dispatch using the selected
+ path before switching to the next path.
+ If not given, internal default is used. To check
+ the default value, see the activated table.
+
+Status for each path: <status> <fail-count> <in-flight>
+ <status>: 'A' if the path is active, 'F' if the path is failed.
+ <fail-count>: The number of path failures.
+ <in-flight>: The number of in-flight I/Os on the path.
+
+
+Algorithm
+=========
+
+dm-queue-length increments/decrements 'in-flight' when an I/O is
+dispatched/completed respectively.
+dm-queue-length selects a path having minimum 'in-flight'.
+
+
+Examples
+========
+In case that 2 paths (sda and sdb) are used with repeat_count == 128.
+
+# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
+  | dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0
Index: 2.6.30-rc5/drivers/md/dm-queue-length.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-queue-length.c
+++ 2.6.30-rc5/drivers/md/dm-queue-length.c
@@ -35,7 +35,7 @@ struct path_info {
struct list_head list;
struct dm_path *path;
unsigned repeat_count;
- atomic_t qlen;
+ atomic_t qlen; /* the number of in-flight I/Os */
};
static struct selector *alloc_selector(void)
@@ -113,13 +113,16 @@ static int ql_add_path(struct path_selec
struct path_info *pi;
unsigned repeat_count = QL_MIN_IO;
- /* Parse the arguments */
+ /*
+ * Arguments: [<repeat_count>]
+ * <repeat_count>: The number of I/Os before switching path.
+ * If not given, default (QL_MIN_IO) is used.
+ */
if (argc > 1) {
*error = "queue-length ps: incorrect number of arguments";
return -EINVAL;
}
- /* First path argument is number of I/Os before switching path. */
if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
*error = "queue-length ps: invalid repeat count";
return -EINVAL;
@@ -161,6 +164,9 @@ static int ql_reinstate_path(struct path
return 0;
}
+/*
+ * Select a path having the minimum number of in-flight I/Os
+ */
static struct dm_path *ql_select_path(struct path_selector *ps,
unsigned *repeat_count, size_t nr_bytes)
{
[-- Attachment #4: Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: Re: [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer
2009-05-14 6:14 ` Kiyoshi Ueda
@ 2009-05-14 12:34 ` Alasdair G Kergon
0 siblings, 0 replies; 12+ messages in thread
From: Alasdair G Kergon @ 2009-05-14 12:34 UTC (permalink / raw)
To: device-mapper development
On Thu, May 14, 2009 at 03:14:15PM +0900, Kiyoshi Ueda wrote:
> Please review and apply.
Pushed to linux-next.
Alasdair
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
2009-05-09 0:49 ` Alasdair G Kergon
@ 2009-05-15 3:09 ` Kiyoshi Ueda
2009-05-18 11:46 ` Alasdair G Kergon
0 siblings, 1 reply; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-05-15 3:09 UTC (permalink / raw)
To: Alasdair Kergon; +Cc: device-mapper development
[-- Attachment #1: Type: text/plain, Size: 2323 bytes --]
Hi Alasdair,
Thank you for the review and comments.
I totally agree with your comments.
I found that you have already updated the patch in your editing tree, thanks.
But I'm considering to update the patch to be simpler.
Please see below.
On 05/09/2009 09:49 AM +0900, Alasdair G Kergon wrote:
> On Fri, Apr 24, 2009 at 05:08:02PM +0900, Kiyoshi Ueda wrote:
>> + struct selector *s = (struct selector *) ps->context;
>> + status_type_t type, char *result, unsigned int maxlen)
>> + int sz = 0;
>> + DMEMIT("%u %lu ", atomic_read(&pi->in_flight_size),
>
> (similar comments)
OK, you have already fixed the pointer cast, the sz type and
the print format for in_flight_size.
I'll fix the type of maxlen.
>> +/*
>> + * Returns:
>> + * < 0 : pi1 is better
>> + * 0 : no difference between pi1 and pi2
>> + * > 0 : pi2 is better
>> + */
>
> Please document the parameters and algorithm.
> Nothing explains what the performance value is for and examples of anticipated values.
OK, I'll add comments and documentation like the attached patch.
>> + DMEMIT("%u %lu ", pi->repeat_count, pi->perf);
>
> Need to handle both 32 and 64 bit archs: cast to llu.
>
>> + if ((argc == 2) && (sscanf(argv[1], "%lu", &perf) != 1)) {
>
> Please do sscanf for size_t the same way as dm-crypt does.
> (Or scan for lu, if the intent is to impose a size limit ahead of the later
> do_div() then cast explicitly.)
>
>> + do_div(sz1, pi1->perf);
>> + do_div(sz2, pi2->perf);
>
> Or sector_div ?
>
> But what's the performance of those divisions like?
> Is there a better way of getting the same answer?
> Is there validation limiting the size of 'perf'?
I tried to use multiplication before, but it didn't work well,
because overflow happened when 'in_flight_size' and 'perf' had
pretty big values, and then I got wrong comparison results.
So I decided to use division.
However, if I limit the 'perf' value in smaller range (e.g. 0 to 100),
such overflow shouldn't happen. So I should be able to use
multiplication. Also, I don't need to use size_t for 'perf',
because 'unsigned' should be enough for such small values.
> (Also, did you check there's adequate locking & memory barriers around all the
> atomics?)
Yes, I did and I found no problem in both dm-queue-length and
dm-service-time.
Thanks,
Kiyoshi Ueda
[-- Attachment #2: rqdm-dlb-04-service-time-dlb-document.patch --]
[-- Type: text/plain, Size: 5834 bytes --]
Add documents/comments for dm-service-time.
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
Documentation/device-mapper/dm-service-time.txt | 87 ++++++++++++++++++++++++
drivers/md/dm-service-time.c | 26 +++++--
2 files changed, 107 insertions(+), 6 deletions(-)
Index: 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
===================================================================
--- /dev/null
+++ 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
@@ -0,0 +1,87 @@
+dm-service-time
+===============
+
+dm-service-time is a path selector module for device-mapper targets,
+which selects a path with the shortest estimated service time for
+the incoming I/O.
+
+The service time for each path is estimated by dividing the total size
+of in-flight I/Os on a path with the performance value of the path.
+The performance value is a relative throughput value among all paths
+in a path-group, and it can be specified as a table argument.
+
+The path selector name is 'service-time'.
+
+Table parameters for each path: [<repeat_count> [<performance>]]
+ <repeat_count>: The number of I/Os to dispatch using the selected
+ path before switching to the next path.
+ If not given, internal default is used. To check
+ the default value, see the activated table.
+ <performance>: The relative throughput value of the path
+ among all paths in the path-group.
+ If not given, minimum value '1' is used.
+ If '0' is given, the path isn't selected while
+ other paths having a positive value are available.
+
+Status for each path: <status> <fail-count> <in-flight-size> <performance>
+ <status>: 'A' if the path is active, 'F' if the path is failed.
+ <fail-count>: The number of path failures.
+ <in-flight-size>: The size of in-flight I/Os on the path.
+ <performance>: The relative throughput value of the path
+ among all paths in the path-group.
+
+
+Algorithm
+=========
+
+dm-service-time adds the I/O size to 'in-flight-size' when the I/O is
+dispatched and subtracts when completed.
+Basically, dm-service-time selects a path having minimum service time
+which is calculated by:
+
+ ('in-flight-size' + 'size-of-incoming-io') / 'performance'
+
+However, some optimizations below are used to reduce the calculation
+as much as possible.
+
+ 1. If the paths have the same 'performance', skip the division
+ and just compare the 'in-flight-size'.
+
+ 2. If the paths have the same 'in-flight-size', skip the division
+ and just compare the 'performance'.
+
+ 3. If some paths have non-zero 'performance' and others have zero
+ 'performance', ignore those paths with zero 'performance'.
+
+If such optimizations can't be applied, calculate service time, and
+compare service time.
+If calculated service time is equal, the path having maximum 'performance'
+may be better. So compare 'performance' then.
+
+
+Examples
+========
+In case that 2 paths (sda and sdb) are used with repeat_count == 128
+and sda has an average throughput 1GB/s and sdb has 4GB/s, 'performance'
+value may be '1' for sda and '4' for sdb.
+
+# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
+  | dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4
+
+
+Or '2' for sda and '8' for sdb would also be valid.
+
+# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \
+  | dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8
Index: 2.6.30-rc5/drivers/md/dm-service-time.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-service-time.c
+++ 2.6.30-rc5/drivers/md/dm-service-time.c
@@ -105,22 +105,27 @@ static int st_add_path(struct path_selec
unsigned repeat_count = ST_MIN_IO;
unsigned long long tmpll = 1;
+ /*
+ * Arguments: [<repeat_count> [<performance>]]
+ * <repeat_count>: The number of I/Os before switching path.
+ * If not given, default (ST_MIN_IO) is used.
+ * <performance>: The relative throughput value of the path
+ * among all paths in the path-group.
+ * If not given, minimum value '1' is used.
+ * If '0' is given, the path isn't selected while
+ * other paths having a positive value are
+ * available.
+ */
if (argc > 2) {
*error = "service-time ps: incorrect number of arguments";
return -EINVAL;
}
- /* First path argument is number of I/Os before switching path. */
if ((argc > 0) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
*error = "service-time ps: invalid repeat count";
return -EINVAL;
}
- /*
- * Second path argument is a relative performance value.
- * If 0 is given, the path isn't used while other paths having
- * a positive value are available.
- */
if ((argc == 2) && (sscanf(argv[1], "%llu", &tmpll) != 1)) {
*error = "service-time ps: invalid performance value";
return -EINVAL;
@@ -164,10 +169,19 @@ static int st_reinstate_path(struct path
}
/*
+ * Compare the estimated service time of 2 paths, pi1 and pi2,
+ * for the incoming I/O.
+ *
* Returns:
* < 0 : pi1 is better
* 0 : no difference between pi1 and pi2
* > 0 : pi2 is better
+ *
+ * Description:
+ * Basically, the service time is estimated by:
+ * ('pi->in-flight-size' + 'incoming') / 'pi->perf'
+ * To reduce the calculation, some optimizations are made.
+ * (See comments inline)
*/
static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
size_t incoming)
[-- Attachment #3: Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: Re: [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
2009-05-15 3:09 ` Kiyoshi Ueda
@ 2009-05-18 11:46 ` Alasdair G Kergon
2009-05-19 2:59 ` Kiyoshi Ueda
0 siblings, 1 reply; 12+ messages in thread
From: Alasdair G Kergon @ 2009-05-18 11:46 UTC (permalink / raw)
To: device-mapper development
On Fri, May 15, 2009 at 12:09:21PM +0900, Kiyoshi Ueda wrote:
> However, if I limit the 'perf' value in smaller range (e.g. 0 to 100),
> such overflow shouldn't happen. So I should be able to use
> multiplication. Also, I don't need to use size_t for 'perf',
> because 'unsinged' should be enough for such small values.
I think a small range would be adequate - look at the number sizes and
go 0-100 or 0-1000 as appropriate.
Thinking of naming, is it 'relative_throughput' ?
However, is it also going to be possible to extend this target to measure the
the actual throughput? If we did that, would we use a special value to
indicate it or add a new table parameter and keep the 'perf' ones as a
manual adjustment factor applied on top of the calculated one?
Alasdair
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: Re: [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
2009-05-18 11:46 ` Alasdair G Kergon
@ 2009-05-19 2:59 ` Kiyoshi Ueda
2009-05-19 8:13 ` Kiyoshi Ueda
0 siblings, 1 reply; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-05-19 2:59 UTC (permalink / raw)
To: Alasdair Kergon; +Cc: device-mapper development
Hi Alasdair,
On 05/18/2009 08:46 PM +0900, Alasdair G Kergon wrote:
> On Fri, May 15, 2009 at 12:09:21PM +0900, Kiyoshi Ueda wrote:
>> However, if I limit the 'perf' value in smaller range (e.g. 0 to 100),
>> such overflow shouldn't happen. So I should be able to use
>> multiplication. Also, I don't need to use size_t for 'perf',
>> because 'unsigned' should be enough for such small values.
>
> I think a small range would be adequate - look at the number sizes and
> go 0-100 or 0-1000 as appropriate.
>
> Thinking of naming, is it 'relative_throughput' ?
Yes, 'relative_throughput' may be better naming. I'll take it.
However, is it also going to be possible to extend this target to measure
the actual throughput? If we did that, would we use a special value to
> indicate it or add a new table parameter and keep the 'perf' ones as a
> manual adjustment factor applied on top of the calculated one?
The older version of dm-service-time calculated the actual throughput
dynamically using part_stat:
http://marc.info/?l=dm-devel&m=123321314426149&w=2
But the calculated values often had some noise and it didn't work well.
(e.g. Once the throughput value of a path was calculated so small,
the path was never used, and the small throughput value was never
updated, and the path was never used, and ...)
So I think doing such calculations in user-space and simplifying
the kernel-code are better.
For dynamically changing path's bandwidth, user-space daemon can
measure the throughput periodically and notify the dm device.
I've been thinking of adding a new dm message handler to update
'relative_throughput' in the future.
What do you think?
Thanks,
Kiyoshi Ueda
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: Re: [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
2009-05-19 2:59 ` Kiyoshi Ueda
@ 2009-05-19 8:13 ` Kiyoshi Ueda
0 siblings, 0 replies; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-05-19 8:13 UTC (permalink / raw)
To: Alasdair Kergon; +Cc: device-mapper development
[-- Attachment #1: Type: text/plain, Size: 1351 bytes --]
Hi Alasdair,
I attached 3 patches below.
- rqdm-dlb-04-service-time-dlb-maxlen-type-fix.patch:
Use 'unsigned' instead of 'unsigned int' for maxlen.
- rqdm-dlb-04-service-time-dlb-add-perf-limit.patch:
Limit the 'perf' value within the range of 0-100.
- rqdm-dlb-04-service-time-dlb-naming-change.patch:
Change the 'perf' naming to 'relative_throughput'.
Please review and apply if you have no problem.
On 05/19/2009 11:59 AM +0900, Kiyoshi Ueda wrote:
> Hi Alasdair,
>
> On 05/18/2009 08:46 PM +0900, Alasdair G Kergon wrote:
>> On Fri, May 15, 2009 at 12:09:21PM +0900, Kiyoshi Ueda wrote:
>>> However, if I limit the 'perf' value in smaller range (e.g. 0 to 100),
>>> such overflow shouldn't happen. So I should be able to use
>>> multiplication. Also, I don't need to use size_t for 'perf',
>>> because 'unsigned' should be enough for such small values.
>>
>> I think a small range would be adequate - look at the number sizes and
>> go 0-100 or 0-1000 as appropriate.
See rqdm-dlb-04-service-time-dlb-add-perf-limit.patch.
>> Thinking of naming, is it 'relative_throughput' ?
>
> Yes, 'relative_throughput' may be better naming. I'll take it.
See rqdm-dlb-04-service-time-dlb-naming-change.patch.
Maybe the string is too long. If you feel so, too,
'throughput' may be another alternative?
Thanks,
Kiyoshi Ueda
[-- Attachment #2: rqdm-dlb-04-service-time-dlb-maxlen-type-fix.patch --]
[-- Type: text/plain, Size: 782 bytes --]
Use 'unsigned' instead of 'unsigned int' for maxlen in dm-service-time.
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
drivers/md/dm-service-time.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
Index: 2.6.30-rc5/drivers/md/dm-service-time.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-service-time.c
+++ 2.6.30-rc5/drivers/md/dm-service-time.c
@@ -72,7 +72,7 @@ static void st_destroy(struct path_selec
}
static int st_status(struct path_selector *ps, struct dm_path *path,
- status_type_t type, char *result, unsigned int maxlen)
+ status_type_t type, char *result, unsigned maxlen)
{
unsigned sz = 0;
struct path_info *pi;
[-- Attachment #3: rqdm-dlb-04-service-time-dlb-add-perf-limit.patch --]
[-- Type: text/plain, Size: 5554 bytes --]
o Limited the second table argument (relative throughput value)
in 0-100.
As a result, no need to use 'size_t' for ->perf. Use 'unsigned'.
Updated comments/documents.
o Converted the service time calculation method to multiplication
from division.
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
Documentation/device-mapper/dm-service-time.txt | 1
drivers/md/dm-service-time.c | 57 +++++++++++++++---------
2 files changed, 37 insertions(+), 21 deletions(-)
Index: 2.6.30-rc5/drivers/md/dm-service-time.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-service-time.c
+++ 2.6.30-rc5/drivers/md/dm-service-time.c
@@ -13,7 +13,10 @@
#define DM_MSG_PREFIX "multipath service-time"
#define ST_MIN_IO 1
-#define ST_VERSION "0.1.0"
+#define ST_MAX_PERF 100
+#define ST_MAX_PERF_SHIFT 7
+#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_PERF_SHIFT)
+#define ST_VERSION "0.2.0"
struct selector {
struct list_head valid_paths;
@@ -24,7 +27,7 @@ struct path_info {
struct list_head list;
struct dm_path *path;
unsigned repeat_count;
- size_t perf;
+ unsigned perf;
atomic_t in_flight_size; /* Total size of in-flight I/Os */
};
@@ -84,12 +87,11 @@ static int st_status(struct path_selecto
switch (type) {
case STATUSTYPE_INFO:
- DMEMIT("%d %llu ", atomic_read(&pi->in_flight_size),
- (unsigned long long)pi->perf);
+ DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
+ pi->perf);
break;
case STATUSTYPE_TABLE:
- DMEMIT("%u %llu ", pi->repeat_count,
- (unsigned long long)pi->perf);
+ DMEMIT("%u %u ", pi->repeat_count, pi->perf);
break;
}
}
@@ -103,7 +105,7 @@ static int st_add_path(struct path_selec
struct selector *s = ps->context;
struct path_info *pi;
unsigned repeat_count = ST_MIN_IO;
- unsigned long long tmpll = 1;
+ unsigned perf = 1;
/*
* Arguments: [<repeat_count> [<performance>]]
@@ -111,6 +113,7 @@ static int st_add_path(struct path_selec
* If not given, default (ST_MIN_IO) is used.
* <performance>: The relative throughput value of the path
* among all paths in the path-group.
+ * The valid range: 0-<ST_MAX_PERF>
* If not given, minimum value '1' is used.
* If '0' is given, the path isn't selected while
* other paths having a positive value are
@@ -126,7 +129,8 @@ static int st_add_path(struct path_selec
return -EINVAL;
}
- if ((argc == 2) && (sscanf(argv[1], "%llu", &tmpll) != 1)) {
+ if ((argc == 2) &&
+ (sscanf(argv[1], "%u", &perf) != 1 || perf > ST_MAX_PERF)) {
*error = "service-time ps: invalid performance value";
return -EINVAL;
}
@@ -140,7 +144,7 @@ static int st_add_path(struct path_selec
pi->path = path;
pi->repeat_count = repeat_count;
- pi->perf = tmpll;
+ pi->perf = perf;
atomic_set(&pi->in_flight_size, 0);
path->pscontext = pi;
@@ -186,7 +190,7 @@ static int st_reinstate_path(struct path
static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
size_t incoming)
{
- size_t sz1, sz2;
+ size_t sz1, sz2, st1, st2;
sz1 = atomic_read(&pi1->in_flight_size);
sz2 = atomic_read(&pi2->in_flight_size);
@@ -206,21 +210,32 @@ static int st_compare_load(struct path_i
/*
* Case 3: Calculate service time. Choose faster path.
- * if ((sz1+incoming)/pi1->perf < (sz2+incoming)/pi2->perf) pi1
- * if ((sz1+incoming)/pi1->perf > (sz2+incoming)/pi2->perf) pi2
+ * Service time using pi1: st1 = (sz1 + incoming) / pi1->perf
+ * Service time using pi2: st2 = (sz2 + incoming) / pi2->perf
+ *
+ * To avoid the division, transform the expression to use
+ * multiplication.
+ * Because ->perf > 0 here, if st1 < st2, the expressions
+ * below are the same meaning:
+ * (sz1 + incoming) / pi1->perf < (sz2 + incoming) / pi2->perf
+ * (sz1 + incoming) * pi2->perf < (sz2 + incoming) * pi1->perf
+ * So use the latter one.
*/
sz1 += incoming;
sz2 += incoming;
- while (sz1 && sz2 && (sz1 < pi1->perf) && (sz2 < pi2->perf)) {
- /* Size is not big enough to compare by division. Shift up */
- sz1 <<= 2;
- sz2 <<= 2;
+ if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
+ sz2 >= ST_MAX_INFLIGHT_SIZE)) {
+ /*
+ * Size may be too big for multiplying pi->perf and overflow.
+ * To avoid the overflow and mis-selection, shift down both.
+ */
+ sz1 >>= ST_MAX_PERF_SHIFT;
+ sz2 >>= ST_MAX_PERF_SHIFT;
}
- do_div(sz1, pi1->perf);
- do_div(sz2, pi2->perf);
-
- if (sz1 != sz2)
- return sz1 - sz2;
+ st1 = sz1 * pi2->perf;
+ st2 = sz2 * pi1->perf;
+ if (st1 != st2)
+ return st1 - st2;
/*
* Case 4: Service time is equal. Choose higher performance path.
Index: 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
===================================================================
--- 2.6.30-rc5.orig/Documentation/device-mapper/dm-service-time.txt
+++ 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
@@ -19,6 +19,7 @@ Table parameters for each path: [<repeat
the default value, see the activated table.
<performance>: The relative throughput value of the path
among all paths in the path-group.
+ The valid range is 0-100.
If not given, minimum value '1' is used.
If '0' is given, the path isn't selected while
other paths having a positive value are available.
[-- Attachment #4: rqdm-dlb-04-service-time-dlb-naming-change.patch --]
[-- Type: text/plain, Size: 10844 bytes --]
Changed the name of the member 'perf' in the path_info structure
to 'relative_throughput' for readability.
Also, updated the related parts of comments and documents.
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
Documentation/device-mapper/dm-service-time.txt | 43 ++++++------
drivers/md/dm-service-time.c | 84 +++++++++++++-----------
2 files changed, 69 insertions(+), 58 deletions(-)
Index: 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
===================================================================
--- 2.6.30-rc5.orig/Documentation/device-mapper/dm-service-time.txt
+++ 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
@@ -12,24 +12,25 @@ in a path-group, and it can be specified
The path selector name is 'service-time'.
-Table parameters for each path: [<repeat_count> [<performance>]]
+Table parameters for each path: [<repeat_count> [<relative_throughput>]]
<repeat_count>: The number of I/Os to dispatch using the selected
path before switching to the next path.
If not given, internal default is used. To check
the default value, see the activated table.
- <performance>: The relative throughput value of the path
- among all paths in the path-group.
- The valid range is 0-100.
- If not given, minimum value '1' is used.
- If '0' is given, the path isn't selected while
- other paths having a positive value are available.
+ <relative_throughput>: The relative throughput value of the path
+ among all paths in the path-group.
+ The valid range is 0-100.
+ If not given, minimum value '1' is used.
+ If '0' is given, the path isn't selected while
+ other paths having a positive value are available.
-Status for each path: <status> <fail-count> <in-flight-size> <performance>
+Status for each path: <status> <fail-count> <in-flight-size> \
+ <relative_throughput>
<status>: 'A' if the path is active, 'F' if the path is failed.
<fail-count>: The number of path failures.
<in-flight-size>: The size of in-flight I/Os on the path.
- <performance>: The relative throughput value of the path
- among all paths in the path-group.
+ <relative_throughput>: The relative throughput value of the path
+ among all paths in the path-group.
Algorithm
@@ -40,31 +41,33 @@ dispatched and substracts when completed
Basically, dm-service-time selects a path having minimum service time
which is calculated by:
- ('in-flight-size' + 'size-of-incoming-io') / 'performance'
+ ('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput'
However, some optimizations below are used to reduce the calculation
as much as possible.
- 1. If the paths have the same 'performance', skip the division
- and just compare the 'in-flight-size'.
+ 1. If the paths have the same 'relative_throughput', skip
+ the division and just compare the 'in-flight-size'.
2. If the paths have the same 'in-flight-size', skip the division
- and just compare the 'performance'.
+ and just compare the 'relative_throughput'.
- 3. If some paths have non-zero 'performance' and others have zero
- 'performance', ignore those paths with zero 'performance'.
+ 3. If some paths have non-zero 'relative_throughput' and others
+ have zero 'relative_throughput', ignore those paths with zero
+ 'relative_throughput'.
If such optimizations can't be applied, calculate service time, and
compare service time.
-If calculated service time is equal, the path having maximum 'performance'
-may be better. So compare 'performance' then.
+If calculated service time is equal, the path having maximum
+'relative_throughput' may be better. So compare 'relative_throughput'
+then.
Examples
========
In case that 2 paths (sda and sdb) are used with repeat_count == 128
-and sda has an average throughput 1GB/s and sdb has 4GB/s, 'performance'
-value may be '1' for sda and '4' for sdb.
+and sda has an average throughput 1GB/s and sdb has 4GB/s,
+'relative_throughput' value may be '1' for sda and '4' for sdb.
# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
dmsetup create test
Index: 2.6.30-rc5/drivers/md/dm-service-time.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-service-time.c
+++ 2.6.30-rc5/drivers/md/dm-service-time.c
@@ -13,9 +13,9 @@
#define DM_MSG_PREFIX "multipath service-time"
#define ST_MIN_IO 1
-#define ST_MAX_PERF 100
-#define ST_MAX_PERF_SHIFT 7
-#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_PERF_SHIFT)
+#define ST_MAX_RELATIVE_THROUGHPUT 100
+#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT 7
+#define ST_MAX_INFLIGHT_SIZE ((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
#define ST_VERSION "0.2.0"
struct selector {
@@ -27,7 +27,7 @@ struct path_info {
struct list_head list;
struct dm_path *path;
unsigned repeat_count;
- unsigned perf;
+ unsigned relative_throughput;
atomic_t in_flight_size; /* Total size of in-flight I/Os */
};
@@ -88,10 +88,11 @@ static int st_status(struct path_selecto
switch (type) {
case STATUSTYPE_INFO:
DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
- pi->perf);
+ pi->relative_throughput);
break;
case STATUSTYPE_TABLE:
- DMEMIT("%u %u ", pi->repeat_count, pi->perf);
+ DMEMIT("%u %u ", pi->repeat_count,
+ pi->relative_throughput);
break;
}
}
@@ -105,19 +106,19 @@ static int st_add_path(struct path_selec
struct selector *s = ps->context;
struct path_info *pi;
unsigned repeat_count = ST_MIN_IO;
- unsigned perf = 1;
+ unsigned relative_throughput = 1;
/*
- * Arguments: [<repeat_count> [<performance>]]
+ * Arguments: [<repeat_count> [<relative_throughput>]]
* <repeat_count>: The number of I/Os before switching path.
* If not given, default (ST_MIN_IO) is used.
- * <performance>: The relative throughput value of the path
- * among all paths in the path-group.
- * The valid range: 0-<ST_MAX_PERF>
- * If not given, minimum value '1' is used.
- * If '0' is given, the path isn't selected while
- * other paths having a positive value are
- * available.
+ * <relative_throughput>: The relative throughput value of
+ * the path among all paths in the path-group.
+ * The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
+ * If not given, minimum value '1' is used.
+ * If '0' is given, the path isn't selected while
+ * other paths having a positive value are
+ * available.
*/
if (argc > 2) {
*error = "service-time ps: incorrect number of arguments";
@@ -130,8 +131,9 @@ static int st_add_path(struct path_selec
}
if ((argc == 2) &&
- (sscanf(argv[1], "%u", &perf) != 1 || perf > ST_MAX_PERF)) {
- *error = "service-time ps: invalid performance value";
+ (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
+ relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
+ *error = "service-time ps: invalid relative_throughput value";
return -EINVAL;
}
@@ -144,7 +146,7 @@ static int st_add_path(struct path_selec
pi->path = path;
pi->repeat_count = repeat_count;
- pi->perf = perf;
+ pi->relative_throughput = relative_throughput;
atomic_set(&pi->in_flight_size, 0);
path->pscontext = pi;
@@ -183,7 +185,7 @@ static int st_reinstate_path(struct path
*
* Description:
* Basically, the service time is estimated by:
- * ('pi->in-flight-size' + 'incoming') / 'pi->perf'
+ * ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
* To reduce the calculation, some optimizations are made.
* (See comments inline)
*/
@@ -196,29 +198,34 @@ static int st_compare_load(struct path_i
sz2 = atomic_read(&pi2->in_flight_size);
/*
- * Case 1: Both have same performance value. Choose less loaded path.
+ * Case 1: Both have same throughput value. Choose less loaded path.
*/
- if (pi1->perf == pi2->perf)
+ if (pi1->relative_throughput == pi2->relative_throughput)
return sz1 - sz2;
/*
- * Case 2a: Both have same load. Choose higher performance path.
- * Case 2b: One path has no performance value. Choose the other one.
+ * Case 2a: Both have same load. Choose higher throughput path.
+ * Case 2b: One path has no throughput value. Choose the other one.
*/
- if (sz1 == sz2 || !pi1->perf || !pi2->perf)
- return pi2->perf - pi1->perf;
+ if (sz1 == sz2 ||
+ !pi1->relative_throughput || !pi2->relative_throughput)
+ return pi2->relative_throughput - pi1->relative_throughput;
/*
* Case 3: Calculate service time. Choose faster path.
- * Service time using pi1: st1 = (sz1 + incoming) / pi1->perf
- * Service time using pi2: st2 = (sz2 + incoming) / pi2->perf
+ * Service time using pi1:
+ * st1 = (sz1 + incoming) / pi1->relative_throughput
+ * Service time using pi2:
+ * st2 = (sz2 + incoming) / pi2->relative_throughput
*
* To avoid the division, transform the expression to use
* multiplication.
- * Because ->perf > 0 here, if st1 < st2, the expressions
- * below are the same meaning:
- * (sz1 + incoming) / pi1->perf < (sz2 + incoming) / pi2->perf
- * (sz1 + incoming) * pi2->perf < (sz2 + incoming) * pi1->perf
+ * Because ->relative_throughput > 0 here, if st1 < st2,
+ * the expressions below are the same meaning:
+ * (sz1 + incoming) / pi1->relative_throughput <
+ * (sz2 + incoming) / pi2->relative_throughput
+ * (sz1 + incoming) * pi2->relative_throughput <
+ * (sz2 + incoming) * pi1->relative_throughput
* So use the later one.
*/
sz1 += incoming;
@@ -226,21 +233,22 @@ static int st_compare_load(struct path_i
if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
sz2 >= ST_MAX_INFLIGHT_SIZE)) {
/*
- * Size may be too big for multiplying pi->perf and overflow.
+ * Size may be too big for multiplying pi->relative_throughput
+ * and overflow.
* To avoid the overflow and mis-selection, shift down both.
*/
- sz1 >>= ST_MAX_PERF_SHIFT;
- sz2 >>= ST_MAX_PERF_SHIFT;
+ sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+ sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
}
- st1 = sz1 * pi2->perf;
- st2 = sz2 * pi1->perf;
+ st1 = sz1 * pi2->relative_throughput;
+ st2 = sz2 * pi1->relative_throughput;
if (st1 != st2)
return st1 - st2;
/*
- * Case 4: Service time is equal. Choose higher performance path.
+ * Case 4: Service time is equal. Choose higher throughput path.
*/
- return pi2->perf - pi1->perf;
+ return pi2->relative_throughput - pi1->relative_throughput;
}
static struct dm_path *st_select_path(struct path_selector *ps,
[-- Attachment #5: Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2009-05-19 8:13 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-24 8:04 [PATCH 0/3] dm-mpath: dynamic load balancers (v2) Kiyoshi Ueda
2009-04-24 8:06 ` [PATCH 1/3] dm-mpath: interface change for dynamic load balancers Kiyoshi Ueda
2009-04-24 8:07 ` [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer Kiyoshi Ueda
2009-05-09 0:31 ` Alasdair G Kergon
2009-05-14 6:14 ` Kiyoshi Ueda
2009-05-14 12:34 ` Alasdair G Kergon
2009-04-24 8:08 ` [PATCH 3/3] dm-mpath: add service-time " Kiyoshi Ueda
2009-05-09 0:49 ` Alasdair G Kergon
2009-05-15 3:09 ` Kiyoshi Ueda
2009-05-18 11:46 ` Alasdair G Kergon
2009-05-19 2:59 ` Kiyoshi Ueda
2009-05-19 8:13 ` Kiyoshi Ueda
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.