* [PATCH 0/3] dm-mpath: dynamic load balancers (v2)
@ 2009-04-24  8:04 Kiyoshi Ueda
  2009-04-24  8:06 ` [PATCH 1/3] dm-mpath: interface change for dynamic load balancers Kiyoshi Ueda
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-04-24  8:04 UTC (permalink / raw)
  To: Alasdair Kergon; +Cc: device-mapper development, stefan.bader

Hi,

The patch-set adds the following 2 dynamic load balancers:
  o dm-queue-length: queue-length oriented dynamic load balancer
  o dm-service-time: service-time oriented dynamic load balancer

This patch-set can be applied on top of 2.6.30-rc3.
No dependencies on Alasdair's linux-next patches.

NOTE:
While the patches compile and work with the current bio-based dm,
the request-based dm patches should also be applied for them to
work *properly*.
See <http://marc.info/?l=dm-devel&m=123736590931733&w=2> for
why request-based dm improves multipath load balancing.


Summary of the patch-set
========================
  1/3: dm-mpath: interface change for dynamic load balancers
  2/3: dm-mpath: add queue-length oriented dynamic load balancer
  3/3: dm-mpath: add service-time oriented dynamic load balancer

 drivers/md/Kconfig            |   18 ++
 drivers/md/Makefile           |    2 
 drivers/md/dm-mpath.c         |   28 ++-
 drivers/md/dm-path-selector.h |    8 -
 drivers/md/dm-queue-length.c  |  257 +++++++++++++++++++++++++++++++++++
 drivers/md/dm-round-robin.c   |    2 
 drivers/md/dm-service-time.c  |  301 ++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 603 insertions(+), 13 deletions(-)


CHANGES
=======
v2:
  - Rebased to 2.6.30-rc3

v1: (http://marc.info/?l=dm-devel&m=123736539330980&w=2)
  - Rebased to 2.6.29-rc8

Thanks,
Kiyoshi Ueda


* [PATCH 1/3] dm-mpath: interface change for dynamic load balancers
  2009-04-24  8:04 [PATCH 0/3] dm-mpath: dynamic load balancers (v2) Kiyoshi Ueda
@ 2009-04-24  8:06 ` Kiyoshi Ueda
  2009-04-24  8:07 ` [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer Kiyoshi Ueda
  2009-04-24  8:08 ` [PATCH 3/3] dm-mpath: add service-time " Kiyoshi Ueda
  2 siblings, 0 replies; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-04-24  8:06 UTC (permalink / raw)
  To: Alasdair Kergon; +Cc: device-mapper development, stefan.bader

This patch changes the dm path selector interface for dynamic
load balancers:
  o adding a new hook, start_io()
  o adding 'nr_bytes' parameter to select_path()/start_io()/end_io()
    to pass the size of the I/O

start_io() is called when a target driver actually submits I/O
to the selected path.
Path selectors can use it to start accounting for the I/O,
e.g. to count the number of in-flight I/Os.
The start_io hook is based on the patch posted by Stefan Bader:
https://www.redhat.com/archives/dm-devel/2005-October/msg00050.html

nr_bytes, the size of the I/O, is used by path selectors for
size-based decisions.
dm-service-time uses it to estimate service time, for example.
(The nr_bytes member is added to dm_mpath_io instead of using the
 existing details.bi_size, since the request-based dm patches delete it.)
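
For illustration only (not part of this patch), a path selector that
tracks in-flight bytes would use the new hooks roughly as sketched
below.  The 'example_path_info' structure and its 'in_flight' member
are hypothetical names; the real implementations are in patches 2/3
and 3/3.

	struct example_path_info {
		struct dm_path *path;
		atomic_t in_flight;	/* bytes currently queued on this path */
	};

	static int example_start_io(struct path_selector *ps,
				    struct dm_path *path, size_t nr_bytes)
	{
		struct example_path_info *pi = path->pscontext;

		atomic_add(nr_bytes, &pi->in_flight);
		return 0;
	}

	static int example_end_io(struct path_selector *ps,
				  struct dm_path *path, size_t nr_bytes)
	{
		struct example_path_info *pi = path->pscontext;

		atomic_sub(nr_bytes, &pi->in_flight);
		return 0;
	}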

Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c         |   28 ++++++++++++++++++----------
 drivers/md/dm-path-selector.h |    8 ++++++--
 drivers/md/dm-round-robin.c   |    2 +-
 3 files changed, 25 insertions(+), 13 deletions(-)

Index: 2.6.30-rc3/drivers/md/dm-mpath.c
===================================================================
--- 2.6.30-rc3.orig/drivers/md/dm-mpath.c
+++ 2.6.30-rc3/drivers/md/dm-mpath.c
@@ -102,6 +102,7 @@ struct multipath {
 struct dm_mpath_io {
 	struct pgpath *pgpath;
 	struct dm_bio_details details;
+	size_t nr_bytes;
 };
 
 typedef int (*action_fn) (struct pgpath *pgpath);
@@ -250,11 +251,12 @@ static void __switch_pg(struct multipath
 	m->pg_init_count = 0;
 }
 
-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
+			       size_t nr_bytes)
 {
 	struct dm_path *path;
 
-	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
+	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
 	if (!path)
 		return -ENXIO;
 
@@ -266,7 +268,7 @@ static int __choose_path_in_pg(struct mu
 	return 0;
 }
 
-static void __choose_pgpath(struct multipath *m)
+static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 {
 	struct priority_group *pg;
 	unsigned bypassed = 1;
@@ -278,12 +280,12 @@ static void __choose_pgpath(struct multi
 	if (m->next_pg) {
 		pg = m->next_pg;
 		m->next_pg = NULL;
-		if (!__choose_path_in_pg(m, pg))
+		if (!__choose_path_in_pg(m, pg, nr_bytes))
 			return;
 	}
 
 	/* Don't change PG until it has no remaining paths */
-	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
+	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
 		return;
 
 	/*
@@ -295,7 +297,7 @@ static void __choose_pgpath(struct multi
 		list_for_each_entry(pg, &m->priority_groups, list) {
 			if (pg->bypassed == bypassed)
 				continue;
-			if (!__choose_path_in_pg(m, pg))
+			if (!__choose_path_in_pg(m, pg, nr_bytes))
 				return;
 		}
 	} while (bypassed--);
@@ -326,6 +328,7 @@ static int map_io(struct multipath *m, s
 		  struct dm_mpath_io *mpio, unsigned was_queued)
 {
 	int r = DM_MAPIO_REMAPPED;
+	size_t nr_bytes = bio->bi_size;
 	unsigned long flags;
 	struct pgpath *pgpath;
 
@@ -334,7 +337,7 @@ static int map_io(struct multipath *m, s
 	/* Do we need to select a new pgpath? */
 	if (!m->current_pgpath ||
 	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
-		__choose_pgpath(m);
+		__choose_pgpath(m, nr_bytes);
 
 	pgpath = m->current_pgpath;
 
@@ -359,6 +362,11 @@ static int map_io(struct multipath *m, s
 		r = -EIO;	/* Failed */
 
 	mpio->pgpath = pgpath;
+	mpio->nr_bytes = nr_bytes;
+
+	if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
+		pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
+					      nr_bytes);
 
 	spin_unlock_irqrestore(&m->lock, flags);
 
@@ -437,7 +445,7 @@ static void process_queued_ios(struct wo
 		goto out;
 
 	if (!m->current_pgpath)
-		__choose_pgpath(m);
+		__choose_pgpath(m, 0);
 
 	pgpath = m->current_pgpath;
 
@@ -1195,7 +1203,7 @@ static int multipath_end_io(struct dm_ta
 	if (pgpath) {
 		ps = &pgpath->pg->ps;
 		if (ps->type->end_io)
-			ps->type->end_io(ps, &pgpath->path);
+			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
 	}
 	if (r != DM_ENDIO_INCOMPLETE)
 		mempool_free(mpio, m->mpio_pool);
@@ -1411,7 +1419,7 @@ static int multipath_ioctl(struct dm_tar
 	spin_lock_irqsave(&m->lock, flags);
 
 	if (!m->current_pgpath)
-		__choose_pgpath(m);
+		__choose_pgpath(m, 0);
 
 	if (m->current_pgpath) {
 		bdev = m->current_pgpath->path.dev->bdev;
Index: 2.6.30-rc3/drivers/md/dm-path-selector.h
===================================================================
--- 2.6.30-rc3.orig/drivers/md/dm-path-selector.h
+++ 2.6.30-rc3/drivers/md/dm-path-selector.h
@@ -56,7 +56,8 @@ struct path_selector_type {
 	 * the path fails.
 	 */
 	struct dm_path *(*select_path) (struct path_selector *ps,
-				     unsigned *repeat_count);
+					unsigned *repeat_count,
+					size_t nr_bytes);
 
 	/*
 	 * Notify the selector that a path has failed.
@@ -75,7 +76,10 @@ struct path_selector_type {
 	int (*status) (struct path_selector *ps, struct dm_path *path,
 		       status_type_t type, char *result, unsigned int maxlen);
 
-	int (*end_io) (struct path_selector *ps, struct dm_path *path);
+	int (*start_io) (struct path_selector *ps, struct dm_path *path,
+			 size_t nr_bytes);
+	int (*end_io) (struct path_selector *ps, struct dm_path *path,
+		       size_t nr_bytes);
 };
 
 /* Register a path selector */
Index: 2.6.30-rc3/drivers/md/dm-round-robin.c
===================================================================
--- 2.6.30-rc3.orig/drivers/md/dm-round-robin.c
+++ 2.6.30-rc3/drivers/md/dm-round-robin.c
@@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path
 }
 
 static struct dm_path *rr_select_path(struct path_selector *ps,
-				   unsigned *repeat_count)
+				      unsigned *repeat_count, size_t nr_bytes)
 {
 	struct selector *s = (struct selector *) ps->context;
 	struct path_info *pi = NULL;


* [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer
  2009-04-24  8:04 [PATCH 0/3] dm-mpath: dynamic load balancers (v2) Kiyoshi Ueda
  2009-04-24  8:06 ` [PATCH 1/3] dm-mpath: interface change for dynamic load balancers Kiyoshi Ueda
@ 2009-04-24  8:07 ` Kiyoshi Ueda
  2009-05-09  0:31   ` Alasdair G Kergon
  2009-04-24  8:08 ` [PATCH 3/3] dm-mpath: add service-time " Kiyoshi Ueda
  2 siblings, 1 reply; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-04-24  8:07 UTC (permalink / raw)
  To: Alasdair Kergon; +Cc: device-mapper development, stefan.bader

This patch adds a dynamic load balancer, dm-queue-length, which
balances load by dispatching I/O to the path with the least number of
in-flight I/Os.

The code is based on the patch posted by Stefan Bader:
https://www.redhat.com/archives/dm-devel/2005-October/msg00050.html
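
For illustration, once the module is loaded, the selector can be used
from a multipath table like the one below (the 8:0/8:16 device numbers,
the 10-sector target length and the repeat count of 128 are example
values only):

	# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
	  | dmsetup create test
	# dmsetup table
	test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128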


Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/Kconfig           |    9 +
 drivers/md/Makefile          |    1 
 drivers/md/dm-queue-length.c |  257 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 267 insertions(+)

Index: 2.6.30-rc3/drivers/md/dm-queue-length.c
===================================================================
--- /dev/null
+++ 2.6.30-rc3/drivers/md/dm-queue-length.c
@@ -0,0 +1,257 @@
+/*
+ * Copyright (C) 2004-2005 IBM Corp.  All Rights Reserved.
+ * Copyright (C) 2006-2009 NEC Corporation.
+ *
+ * dm-queue-length.c
+ *
+ * Module Author: Stefan Bader, IBM
+ * Modified by: Kiyoshi Ueda, NEC
+ *
+ * This file is released under the GPL.
+ *
+ * queue-length path selector, which chooses a path with the least number of
+ * in-flight I/Os.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+
+#define DM_MSG_PREFIX	"multipath queue-length"
+#define QL_MIN_IO	128
+#define QL_VERSION	"0.1.0"
+
+struct selector {
+	struct list_head	valid_paths;
+	struct list_head	failed_paths;
+};
+
+struct path_info {
+	struct list_head	list;
+	struct dm_path		*path;
+	unsigned int		repeat_count;
+	atomic_t		qlen;
+};
+
+static struct selector *alloc_selector(void)
+{
+	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (s) {
+		INIT_LIST_HEAD(&s->valid_paths);
+		INIT_LIST_HEAD(&s->failed_paths);
+	}
+
+	return s;
+}
+
+static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+	struct selector *s = alloc_selector();
+
+	if (!s)
+		return -ENOMEM;
+
+	ps->context = s;
+	return 0;
+}
+
+static void ql_free_paths(struct list_head *paths)
+{
+	struct path_info *pi, *next;
+
+	list_for_each_entry_safe(pi, next, paths, list) {
+		list_del(&pi->list);
+		kfree(pi);
+	}
+}
+
+static void ql_destroy(struct path_selector *ps)
+{
+	struct selector *s = (struct selector *) ps->context;
+
+	ql_free_paths(&s->valid_paths);
+	ql_free_paths(&s->failed_paths);
+	kfree(s);
+	ps->context = NULL;
+}
+
+static int ql_status(struct path_selector *ps, struct dm_path *path,
+		     status_type_t type, char *result, unsigned int maxlen)
+{
+	int sz = 0;
+	struct path_info *pi;
+
+	/* When called with (path == NULL), return selector status/args. */
+	if (!path)
+		DMEMIT("0 ");
+	else {
+		pi = path->pscontext;
+
+		switch (type) {
+		case STATUSTYPE_INFO:
+			DMEMIT("%u ", atomic_read(&pi->qlen));
+			break;
+		case STATUSTYPE_TABLE:
+			DMEMIT("%u ", pi->repeat_count);
+			break;
+		}
+	}
+
+	return sz;
+}
+
+static int ql_add_path(struct path_selector *ps, struct dm_path *path,
+		       int argc, char **argv, char **error)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi;
+	unsigned int repeat_count = QL_MIN_IO;
+
+	/* Parse the arguments */
+	if (argc > 1) {
+		*error = "queue-length ps: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	/* First path argument is number of I/Os before switching path. */
+	if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+		*error = "queue-length ps: invalid repeat count";
+		return -EINVAL;
+	}
+
+	/* Allocate the path information structure */
+	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+	if (!pi) {
+		*error = "queue-length ps: Error allocating path information";
+		return -ENOMEM;
+	}
+
+	pi->path = path;
+	pi->repeat_count = repeat_count;
+	atomic_set(&pi->qlen, 0);
+
+	path->pscontext = pi;
+
+	list_add_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move(&pi->list, &s->failed_paths);
+}
+
+static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static struct dm_path *ql_select_path(struct path_selector *ps,
+				      unsigned *repeat_count, size_t nr_bytes)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = NULL, *best = NULL;
+
+	if (list_empty(&s->valid_paths))
+		return NULL;
+
+	/* Change preferred (first in list) path to evenly balance. */
+	list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+	list_for_each_entry(pi, &s->valid_paths, list) {
+		if (!best ||
+		    (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
+			best = pi;
+
+		if (!atomic_read(&best->qlen))
+			break;
+	}
+
+	if (!best)
+		return NULL;
+
+	*repeat_count = best->repeat_count;
+
+	return best->path;
+}
+
+static int ql_start_io(struct path_selector *ps, struct dm_path *path,
+		       size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_inc(&pi->qlen);
+
+	return 0;
+}
+
+static int ql_end_io(struct path_selector *ps, struct dm_path *path,
+		     size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_dec(&pi->qlen);
+
+	return 0;
+}
+
+static struct path_selector_type ql_ps = {
+	.name		= "queue-length",
+	.module		= THIS_MODULE,
+	.table_args	= 1,
+	.info_args	= 1,
+	.create		= ql_create,
+	.destroy	= ql_destroy,
+	.status		= ql_status,
+	.add_path	= ql_add_path,
+	.fail_path	= ql_fail_path,
+	.reinstate_path	= ql_reinstate_path,
+	.select_path	= ql_select_path,
+	.start_io	= ql_start_io,
+	.end_io		= ql_end_io,
+};
+
+static int __init dm_ql_init(void)
+{
+	int r = dm_register_path_selector(&ql_ps);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	DMINFO("version " QL_VERSION " loaded");
+
+	return r;
+}
+
+static void __exit dm_ql_exit(void)
+{
+	int r = dm_unregister_path_selector(&ql_ps);
+
+	if (r < 0)
+		DMERR("unregister failed %d", r);
+}
+
+module_init(dm_ql_init);
+module_exit(dm_ql_exit);
+
+MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
+MODULE_DESCRIPTION(
+	"(C) Copyright IBM Corp. 2004,2005   All Rights Reserved.\n"
+	DM_NAME " path selector to balance the number of in-flight I/Os"
+);
+MODULE_LICENSE("GPL");
Index: 2.6.30-rc3/drivers/md/Makefile
===================================================================
--- 2.6.30-rc3.orig/drivers/md/Makefile
+++ 2.6.30-rc3/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
+obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
Index: 2.6.30-rc3/drivers/md/Kconfig
===================================================================
--- 2.6.30-rc3.orig/drivers/md/Kconfig
+++ 2.6.30-rc3/drivers/md/Kconfig
@@ -249,6 +249,15 @@ config DM_MULTIPATH
 	---help---
 	  Allow volume managers to support multipath hardware.
 
+config DM_MULTIPATH_QL
+	tristate "I/O Path Selector based on the number of in-flight I/Os"
+	depends on DM_MULTIPATH
+	---help---
+	  This path selector is a dynamic load balancer which selects
+	  a path having the least number of in-flight I/Os.
+
+	  If unsure, say N.
+
 config DM_DELAY
 	tristate "I/O delaying target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM && EXPERIMENTAL


* [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
  2009-04-24  8:04 [PATCH 0/3] dm-mpath: dynamic load balancers (v2) Kiyoshi Ueda
  2009-04-24  8:06 ` [PATCH 1/3] dm-mpath: interface change for dynamic load balancers Kiyoshi Ueda
  2009-04-24  8:07 ` [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer Kiyoshi Ueda
@ 2009-04-24  8:08 ` Kiyoshi Ueda
  2009-05-09  0:49   ` Alasdair G Kergon
  2 siblings, 1 reply; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-04-24  8:08 UTC (permalink / raw)
  To: Alasdair Kergon; +Cc: device-mapper development

This patch adds a service time oriented dynamic load balancer,
dm-service-time, which selects a path with the shortest estimated
service time for the incoming I/O.
The service time is estimated by dividing the size of in-flight I/Os
on a path by the performance value of the path.

The performance value can be given as a table argument at table load
time.  If no performance value is given, all paths are treated as
having equal performance.
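
As a worked example (the numbers are purely illustrative): suppose
path A has performance value 1 with 16 KiB in flight, path B has
performance value 4 with 128 KiB in flight, and a 32 KiB I/O arrives.
The estimated service times are

	path A: (16 + 32) / 1 = 48
	path B: (128 + 32) / 4 = 40

so path B is selected even though it has more data in flight.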


Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/Kconfig           |    9 +
 drivers/md/Makefile          |    1 
 drivers/md/dm-service-time.c |  301 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 311 insertions(+)

Index: 2.6.30-rc3/drivers/md/dm-service-time.c
===================================================================
--- /dev/null
+++ 2.6.30-rc3/drivers/md/dm-service-time.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
+ *
+ * Module Author: Kiyoshi Ueda
+ *
+ * This file is released under the GPL.
+ *
+ * Throughput oriented path selector.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#define DM_MSG_PREFIX	"multipath service-time"
+#define ST_MIN_IO	1
+#define ST_VERSION	"0.1.0"
+
+struct selector {
+	struct list_head valid_paths;
+	struct list_head failed_paths;
+};
+
+struct path_info {
+	struct list_head list;
+	struct dm_path *path;
+	unsigned int repeat_count;
+	size_t perf;
+	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (s) {
+		INIT_LIST_HEAD(&s->valid_paths);
+		INIT_LIST_HEAD(&s->failed_paths);
+	}
+
+	return s;
+}
+
+static int st_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+	struct selector *s = alloc_selector();
+
+	if (!s)
+		return -ENOMEM;
+
+	ps->context = s;
+	return 0;
+}
+
+static void free_paths(struct list_head *paths)
+{
+	struct path_info *pi, *next;
+
+	list_for_each_entry_safe(pi, next, paths, list) {
+		list_del(&pi->list);
+		kfree(pi);
+	}
+}
+
+static void st_destroy(struct path_selector *ps)
+{
+	struct selector *s = (struct selector *) ps->context;
+
+	free_paths(&s->valid_paths);
+	free_paths(&s->failed_paths);
+	kfree(s);
+	ps->context = NULL;
+}
+
+static int st_status(struct path_selector *ps, struct dm_path *path,
+		     status_type_t type, char *result, unsigned int maxlen)
+{
+	int sz = 0;
+	struct path_info *pi;
+
+	if (!path)
+		DMEMIT("0 ");
+	else {
+		pi = path->pscontext;
+
+		switch (type) {
+		case STATUSTYPE_INFO:
+			DMEMIT("%u %lu ", atomic_read(&pi->in_flight_size),
+			       pi->perf);
+			break;
+		case STATUSTYPE_TABLE:
+			DMEMIT("%u %lu ", pi->repeat_count, pi->perf);
+			break;
+		}
+	}
+
+	return sz;
+}
+
+static int st_add_path(struct path_selector *ps, struct dm_path *path,
+		       int argc, char **argv, char **error)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi;
+	unsigned int repeat_count = ST_MIN_IO;
+	size_t perf = 1;
+
+	if (argc > 2) {
+		*error = "service-time ps: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	/* First path argument is number of I/Os before switching path. */
+	if ((argc > 0) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+		*error = "service-time ps: invalid repeat count";
+		return -EINVAL;
+	}
+
+	/*
+	 * Second path argument is a relative performance value.
+	 * If 0 is given, the path isn't used while other paths having
+	 * a positive value are available.
+	 */
+	if ((argc == 2) && (sscanf(argv[1], "%lu", &perf) != 1)) {
+		*error = "service-time ps: invalid performance value";
+		return -EINVAL;
+	}
+
+	/* allocate the path */
+	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+	if (!pi) {
+		*error = "service-time ps: Error allocating path context";
+		return -ENOMEM;
+	}
+
+	pi->path = path;
+	pi->repeat_count = repeat_count;
+	pi->perf = perf;
+	atomic_set(&pi->in_flight_size, 0);
+
+	path->pscontext = pi;
+
+	list_add_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static void st_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move(&pi->list, &s->failed_paths);
+}
+
+static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+/*
+ * Returns:
+ * < 0 : pi1 is better
+ * 0   : no difference between pi1 and pi2
+ * > 0 : pi2 is better
+ */
+static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
+			   size_t incoming)
+{
+	size_t sz1, sz2;
+
+	sz1 = atomic_read(&pi1->in_flight_size);
+	sz2 = atomic_read(&pi2->in_flight_size);
+
+	/*
+	 * Case 1: Both have same performance value. Choose less loaded path.
+	 */
+	if (pi1->perf == pi2->perf)
+		return sz1 - sz2;
+
+	/*
+	 * Case 2a: Both have same load. Choose higher performance path.
+	 * Case 2b: One path has no performance value. Choose the other one.
+	 */
+	if (sz1 == sz2 || !pi1->perf || !pi2->perf)
+		return pi2->perf - pi1->perf;
+
+	/*
+	 * Case 3: Calculate service time. Choose faster path.
+	 *         if ((sz1+incoming)/pi1->perf < (sz2+incoming)/pi2->perf) pi1
+	 *         if ((sz1+incoming)/pi1->perf > (sz2+incoming)/pi2->perf) pi2
+	 */
+	sz1 += incoming;
+	sz2 += incoming;
+	while (sz1 && sz2 && (sz1 < pi1->perf) && (sz2 < pi2->perf)) {
+		/* Size is not big enough to compare by division. Shift up */
+		sz1 <<= 2;
+		sz2 <<= 2;
+	}
+	do_div(sz1, pi1->perf);
+	do_div(sz2, pi2->perf);
+
+	if (sz1 != sz2)
+		return sz1 - sz2;
+
+	/*
+	 * Case 4: Service time is equal. Choose higher performance path.
+	 */
+	return pi2->perf - pi1->perf;
+}
+
+static struct dm_path *st_select_path(struct path_selector *ps,
+				      unsigned *repeat_count, size_t nr_bytes)
+{
+	struct selector *s = (struct selector *) ps->context;
+	struct path_info *pi = NULL, *best = NULL;
+
+	if (list_empty(&s->valid_paths))
+		return NULL;
+
+	/* Change preferred (first in list) path to evenly balance. */
+	list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+	list_for_each_entry(pi, &s->valid_paths, list)
+		if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
+			best = pi;
+
+	if (!best)
+		return NULL;
+
+	*repeat_count = best->repeat_count;
+
+	return best->path;
+}
+
+static int st_start_io(struct path_selector *ps, struct dm_path *path,
+		       size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_add(nr_bytes, &pi->in_flight_size);
+
+	return 0;
+}
+
+static int st_end_io(struct path_selector *ps, struct dm_path *path,
+		     size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_sub(nr_bytes, &pi->in_flight_size);
+
+	return 0;
+}
+
+static struct path_selector_type st_ps = {
+	.name		= "service-time",
+	.module		= THIS_MODULE,
+	.table_args	= 2,
+	.info_args	= 2,
+	.create		= st_create,
+	.destroy	= st_destroy,
+	.status		= st_status,
+	.add_path	= st_add_path,
+	.fail_path	= st_fail_path,
+	.reinstate_path	= st_reinstate_path,
+	.select_path	= st_select_path,
+	.start_io	= st_start_io,
+	.end_io		= st_end_io,
+};
+
+static int __init dm_st_init(void)
+{
+	int r = dm_register_path_selector(&st_ps);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	DMINFO("version " ST_VERSION " loaded");
+
+	return r;
+}
+
+static void __exit dm_st_exit(void)
+{
+	int r = dm_unregister_path_selector(&st_ps);
+
+	if (r < 0)
+		DMERR("unregister failed %d", r);
+}
+
+module_init(dm_st_init);
+module_exit(dm_st_exit);
+
+MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
+MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
+MODULE_LICENSE("GPL");
Index: 2.6.30-rc3/drivers/md/Makefile
===================================================================
--- 2.6.30-rc3.orig/drivers/md/Makefile
+++ 2.6.30-rc3/drivers/md/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
+obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
Index: 2.6.30-rc3/drivers/md/Kconfig
===================================================================
--- 2.6.30-rc3.orig/drivers/md/Kconfig
+++ 2.6.30-rc3/drivers/md/Kconfig
@@ -258,6 +258,15 @@ config DM_MULTIPATH_QL
 
 	  If unsure, say N.
 
+config DM_MULTIPATH_ST
+	tristate "I/O Path Selector based on the service time"
+	depends on DM_MULTIPATH
+	---help---
+	  This path selector is a dynamic load balancer which selects
+	  the path expected to complete the incoming I/O in the shortest time.
+
+	  If unsure, say N.
+
 config DM_DELAY
 	tristate "I/O delaying target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM && EXPERIMENTAL


* Re: [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer
  2009-04-24  8:07 ` [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer Kiyoshi Ueda
@ 2009-05-09  0:31   ` Alasdair G Kergon
  2009-05-14  6:14     ` Kiyoshi Ueda
  0 siblings, 1 reply; 12+ messages in thread
From: Alasdair G Kergon @ 2009-05-09  0:31 UTC (permalink / raw)
  To: Kiyoshi Ueda; +Cc: device-mapper development, stefan.bader

On Fri, Apr 24, 2009 at 05:07:26PM +0900, Kiyoshi Ueda wrote:
> +	unsigned int		repeat_count;

Just 'unsigned repeat_count'

> +	struct selector *s = (struct selector *) ps->context;

Drop the cast when it's from void.

	struct selector *s = ps->context;

> +static int ql_status(struct path_selector *ps, struct dm_path *path,
> +		     status_type_t type, char *result, unsigned int maxlen)

unsigned maxlen

(We're doing things like this in all new patches, but only changing existing
code when it's touched for some other reason.)

> +	int sz = 0;

signed or unsigned sz?
(It's already used inconsistently, I know - but is unsigned better here?)

> +		case STATUSTYPE_INFO:
> +			DMEMIT("%u ", atomic_read(&pi->qlen));

Signed or unsigned?

> +	/* Parse the arguments */

Please document parameters and selection algorithm in Documentation dir and
maybe in inline comments too - it's not immediately obvious exactly how it
behaves.

> +static struct dm_path *ql_select_path(struct path_selector *ps,
> +				      unsigned *repeat_count, size_t nr_bytes)

Alasdair


* Re: [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
  2009-04-24  8:08 ` [PATCH 3/3] dm-mpath: add service-time " Kiyoshi Ueda
@ 2009-05-09  0:49   ` Alasdair G Kergon
  2009-05-15  3:09     ` Kiyoshi Ueda
  0 siblings, 1 reply; 12+ messages in thread
From: Alasdair G Kergon @ 2009-05-09  0:49 UTC (permalink / raw)
  To: Kiyoshi Ueda; +Cc: device-mapper development

On Fri, Apr 24, 2009 at 05:08:02PM +0900, Kiyoshi Ueda wrote:
> +	struct selector *s = (struct selector *) ps->context;
> +		     status_type_t type, char *result, unsigned int maxlen)
> +	int sz = 0;
> +			DMEMIT("%u %lu ", atomic_read(&pi->in_flight_size),

(similar comments)

> +			DMEMIT("%u %lu ", pi->repeat_count, pi->perf);

Need to handle both 32 and 64 bit archs: cast to llu.

> +	if ((argc == 2) && (sscanf(argv[1], "%lu", &perf) != 1)) {

Please do sscanf for size_t the same way as dm-crypt does.
(Or scan for lu, if the intent is to impose a size limit ahead of the later
do_div() then cast explicitly.)

> +/*
> + * Returns:
> + * < 0 : pi1 is better
> + * 0   : no difference between pi1 and pi2
> + * > 0 : pi2 is better
> + */

Please document the parameters and algorithm.
Nothing explains what the performance value is for and examples of anticipated values.

> +	do_div(sz1, pi1->perf);
> +	do_div(sz2, pi2->perf);

Or sector_div ?
But what's the performance of those divisions like?
Is there a better way of getting the same answer?
Is there validation limiting the size of 'perf'?

(Also, did you check there's adequate locking & memory barriers around all the
atomics?)

Alasdair


* Re: [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer
  2009-05-09  0:31   ` Alasdair G Kergon
@ 2009-05-14  6:14     ` Kiyoshi Ueda
  2009-05-14 12:34       ` Alasdair G Kergon
  0 siblings, 1 reply; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-05-14  6:14 UTC (permalink / raw)
  To: Alasdair Kergon; +Cc: device-mapper development, stefan.bader

[-- Attachment #1: Type: text/plain, Size: 2034 bytes --]

Hi Alasdair,

Thank you for the review and comments.
I totally agree with your comments.

I found that you have already updated the patch in your editing tree, thanks.
I attached patches for the remaining parts of your comments.

Please review and apply.


On 05/09/2009 09:31 AM +0900, Alasdair G Kergon wrote:
> On Fri, Apr 24, 2009 at 05:07:26PM +0900, Kiyoshi Ueda wrote:
>> +	unsigned int		repeat_count;
> 
> Just 'unsigned repeat_count'

OK. (You have already fixed this.)
 

>> +	struct selector *s = (struct selector *) ps->context;
> 
> Drop the cast when it's from void.
> 
> 	struct selector *s = ps->context;

OK. (You have already fixed this.)

 
>> +static int ql_status(struct path_selector *ps, struct dm_path *path,
>> +		     status_type_t type, char *result, unsigned int maxlen)
> 
> unsigned maxlen

OK, fixed in rqdm-dlb-02-queue-length-dlb-type-fixes.patch


> (We're doing things like this in all new patches, but only changing existing
> code when it's touched for some other reason.)

OK, I followed the style of dm-round-robin and I understand your
preference now.
I'll also check the request-based dm patch-set when I update
and repost it.


>> +	int sz = 0;
> 
> signed or unsigned sz?
> (It's already used inconsistently, I know - but is unsigned better here?)

Yes, I think unsigned is better. (You have already fixed this.)

 
>> +		case STATUSTYPE_INFO:
>> +			DMEMIT("%u ", atomic_read(&pi->qlen));
> 
> Signed or unsigned?

qlen should always be >= 0, but atomic_t is defined as signed,
so I use signed here.
Fixed in rqdm-dlb-02-queue-length-dlb-type-fixes.patch


>> +	/* Parse the arguments */
> 
> Please document parameters and selection algorithm in Documentation dir and
> maybe in inline comments too - it's not immediately obvious exactly how it
> behaves.
> 
>> +static struct dm_path *ql_select_path(struct path_selector *ps,
>> +				      unsigned *repeat_count, size_t nr_bytes)

OK, added comments/documentation in
rqdm-dlb-02-queue-length-dlb-document.patch.

Thanks,
Kiyoshi Ueda

[-- Attachment #2: rqdm-dlb-02-queue-length-dlb-type-fixes.patch --]
[-- Type: text/plain, Size: 1174 bytes --]

o Use 'unsigned' instead of 'unsigned int' for maxlen in dm-queue-length.

o Fix the print format to use %d for qlen in dm-queue-length, since
  atomic_t is defined as 'int', not 'unsigned int'.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
 drivers/md/dm-queue-length.c |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: 2.6.30-rc5/drivers/md/dm-queue-length.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-queue-length.c
+++ 2.6.30-rc5/drivers/md/dm-queue-length.c
@@ -82,7 +82,7 @@ static void ql_destroy(struct path_selec
 }
 
 static int ql_status(struct path_selector *ps, struct dm_path *path,
-		     status_type_t type, char *result, unsigned int maxlen)
+		     status_type_t type, char *result, unsigned maxlen)
 {
 	unsigned sz = 0;
 	struct path_info *pi;
@@ -95,7 +95,7 @@ static int ql_status(struct path_selecto
 
 		switch (type) {
 		case STATUSTYPE_INFO:
-			DMEMIT("%u ", atomic_read(&pi->qlen));
+			DMEMIT("%d ", atomic_read(&pi->qlen));
 			break;
 		case STATUSTYPE_TABLE:
 			DMEMIT("%u ", pi->repeat_count);

[-- Attachment #3: rqdm-dlb-02-queue-length-dlb-document.patch --]
[-- Type: text/plain, Size: 3178 bytes --]

Add documents/comments for dm-queue-length.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
 Documentation/device-mapper/dm-queue-length.txt |   39 ++++++++++++++++++++++++
 drivers/md/dm-queue-length.c                    |   12 +++++--
 2 files changed, 48 insertions(+), 3 deletions(-)

Index: 2.6.30-rc5/Documentation/device-mapper/dm-queue-length.txt
===================================================================
--- /dev/null
+++ 2.6.30-rc5/Documentation/device-mapper/dm-queue-length.txt
@@ -0,0 +1,39 @@
+dm-queue-length
+===============
+
+dm-queue-length is a path selector module for device-mapper targets,
+which selects a path having the least number of in-flight I/Os.
+The path selector name is 'queue-length'.
+
+Table parameters for each path: [<repeat_count>]
+	<repeat_count>: The number of I/Os to dispatch using the selected
+			path before switching to the next path.
+			If not given, internal default is used. To check
+			the default value, see the activated table.
+
+Status for each path: <status> <fail-count> <in-flight>
+	<status>: 'A' if the path is active, 'F' if the path is failed.
+	<fail-count>: The number of path failures.
+	<in-flight>: The number of in-flight I/Os on the path.
+
+
+Algorithm
+=========
+
+dm-queue-length increments/decrements 'in-flight' when an I/O is
+dispatched/completed respectively.
+dm-queue-length selects a path having minimum 'in-flight'.
+
+
+Examples
+========
+An example where 2 paths (sda and sdb) are used with repeat_count == 128:
+
+# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
+  dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0
Index: 2.6.30-rc5/drivers/md/dm-queue-length.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-queue-length.c
+++ 2.6.30-rc5/drivers/md/dm-queue-length.c
@@ -35,7 +35,7 @@ struct path_info {
 	struct list_head	list;
 	struct dm_path		*path;
 	unsigned		repeat_count;
-	atomic_t		qlen;
+	atomic_t		qlen;	/* the number of in-flight I/Os */
 };
 
 static struct selector *alloc_selector(void)
@@ -113,13 +113,16 @@ static int ql_add_path(struct path_selec
 	struct path_info *pi;
 	unsigned repeat_count = QL_MIN_IO;
 
-	/* Parse the arguments */
+	/*
+	 * Arguments: [<repeat_count>]
+	 * 	<repeat_count>: The number of I/Os before switching path.
+	 * 			If not given, default (QL_MIN_IO) is used.
+	 */
 	if (argc > 1) {
 		*error = "queue-length ps: incorrect number of arguments";
 		return -EINVAL;
 	}
 
-	/* First path argument is number of I/Os before switching path. */
 	if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
 		*error = "queue-length ps: invalid repeat count";
 		return -EINVAL;
@@ -161,6 +164,9 @@ static int ql_reinstate_path(struct path
 	return 0;
 }
 
+/*
+ * Select a path having the minimum number of in-flight I/Os
+ */
 static struct dm_path *ql_select_path(struct path_selector *ps,
 				      unsigned *repeat_count, size_t nr_bytes)
 {


* Re: Re: [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer
  2009-05-14  6:14     ` Kiyoshi Ueda
@ 2009-05-14 12:34       ` Alasdair G Kergon
  0 siblings, 0 replies; 12+ messages in thread
From: Alasdair G Kergon @ 2009-05-14 12:34 UTC (permalink / raw)
  To: device-mapper development

On Thu, May 14, 2009 at 03:14:15PM +0900, Kiyoshi Ueda wrote:
> Please review and apply.

Pushed to linux-next.

Alasdair


* Re: [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
  2009-05-09  0:49   ` Alasdair G Kergon
@ 2009-05-15  3:09     ` Kiyoshi Ueda
  2009-05-18 11:46       ` Alasdair G Kergon
  0 siblings, 1 reply; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-05-15  3:09 UTC (permalink / raw)
  To: Alasdair Kergon; +Cc: device-mapper development

[-- Attachment #1: Type: text/plain, Size: 2323 bytes --]

Hi Alasdair,

Thank you for the review and comments.
I totally agree with your comments.

I found that you have already updated the patch in your editing tree, thanks.
But I'm considering updating the patch to make it simpler.
Please see below.

On 05/09/2009 09:49 AM +0900, Alasdair G Kergon wrote:
> On Fri, Apr 24, 2009 at 05:08:02PM +0900, Kiyoshi Ueda wrote:
>> +	struct selector *s = (struct selector *) ps->context;
>> +		     status_type_t type, char *result, unsigned int maxlen)
>> +	int sz = 0;
>> +			DMEMIT("%u %lu ", atomic_read(&pi->in_flight_size),
> 
> (similar comments)
 
OK, you have already fixed the pointer cast, the sz type and
the print format for in_flight_size.
I'll fix the type of maxlen.


>> +/*
>> + * Returns:
>> + * < 0 : pi1 is better
>> + * 0   : no difference between pi1 and pi2
>> + * > 0 : pi2 is better
>> + */
> 
> Please document the parameters and algorithm.
> Nothing explains what the performance value is for and examples of anticipated values.

OK, I'll add comments and documentation like the attached patch.
 

>> +			DMEMIT("%u %lu ", pi->repeat_count, pi->perf);
> 
> Need to handle both 32 and 64 bit archs: cast to llu.
>
>> +	if ((argc == 2) && (sscanf(argv[1], "%lu", &perf) != 1)) {
> 
> Please do sscanf for size_t the same way as dm-crypt does.
> (Or scan for lu, if the intent is to impose a size limit ahead of the later
> do_div() then cast explicitly.)
>
>> +	do_div(sz1, pi1->perf);
>> +	do_div(sz2, pi2->perf);
> 
> Or sector_div ?
>
> But what's the performance of those divisions like?
> Is there a better way of getting the same answer?
> Is there validation limiting the size of 'perf'?

I tried to use multiplication before, but it didn't work well,
because overflow happened when 'in_flight_size' and 'perf' had
pretty big values, and then I got wrong comparison results.
So I decided to use division.

However, if I limit the 'perf' value to a smaller range (e.g. 0 to 100),
such overflow shouldn't happen.  So I should be able to use
multiplication.  Also, I don't need to use size_t for 'perf',
because 'unsigned' should be enough for such small values.
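
In outline (a sketch of the idea, not the final patch): since both
'perf' values are positive in that case, the comparison

	(sz1 + incoming) / pi1->perf  <  (sz2 + incoming) / pi2->perf

is equivalent to the cross-multiplied form

	(sz1 + incoming) * pi2->perf  <  (sz2 + incoming) * pi1->perf

and with 'perf' capped at a small value such as 100, the sizes only
need to be shifted down when they approach overflow, at the cost of
a little precision.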

 
> (Also, did you check there's adequate locking & memory barriers around all the
> atomics?)

Yes, I did and I found no problem in both dm-queue-length and
dm-service-time.

Thanks,
Kiyoshi Ueda

[-- Attachment #2: rqdm-dlb-04-service-time-dlb-document.patch --]
[-- Type: text/plain, Size: 5834 bytes --]

Add documents/comments for dm-service-time.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
 Documentation/device-mapper/dm-service-time.txt |   87 ++++++++++++++++++++++++
 drivers/md/dm-service-time.c                    |   26 +++++--
 2 files changed, 107 insertions(+), 6 deletions(-)

Index: 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
===================================================================
--- /dev/null
+++ 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
@@ -0,0 +1,87 @@
+dm-service-time
+===============
+
+dm-service-time is a path selector module for device-mapper targets,
+which selects a path with the shortest estimated service time for
+the incoming I/O.
+
+The service time for each path is estimated by dividing the total size
+of in-flight I/Os on a path by the performance value of the path.
+The performance value is a relative throughput value among all paths
+in a path-group, and it can be specified as a table argument.
+
+The path selector name is 'service-time'.
+
+Table parameters for each path: [<repeat_count> [<performance>]]
+	<repeat_count>: The number of I/Os to dispatch using the selected
+			path before switching to the next path.
+			If not given, internal default is used.  To check
+			the default value, see the activated table.
+	<performance>: The relative throughput value of the path
+		       among all paths in the path-group.
+		       If not given, minimum value '1' is used.
+		       If '0' is given, the path isn't selected while
+		       other paths having a positive value are available.
+
+Status for each path: <status> <fail-count> <in-flight-size> <performance>
+	<status>: 'A' if the path is active, 'F' if the path is failed.
+	<fail-count>: The number of path failures.
+	<in-flight-size>: The size of in-flight I/Os on the path.
+	<performance>: The relative throughput value of the path
+		       among all paths in the path-group.
+
+
+Algorithm
+=========
+
+dm-service-time adds the I/O size to 'in-flight-size' when the I/O is
+dispatched and subtracts it when completed.
+Basically, dm-service-time selects the path having the minimum service
+time, which is calculated by:
+
+	('in-flight-size' + 'size-of-incoming-io') / 'performance'
+
+However, some optimizations below are used to reduce the calculation
+as much as possible.
+
+	1. If the paths have the same 'performance', skip the division
+	   and just compare the 'in-flight-size'.
+
+	2. If the paths have the same 'in-flight-size', skip the division
+	   and just compare the 'performance'.
+
+	3. If some paths have non-zero 'performance' and others have zero
+	   'performance', ignore those paths with zero 'performance'.
+
+If such optimizations can't be applied, calculate the service times and
+compare them.
+If the calculated service times are equal, the path having the maximum
+'performance' may be better, so compare 'performance' in that case.
+
+
+Examples
+========
+An example where 2 paths (sda and sdb) are used with repeat_count == 128,
+sda has an average throughput of 1GB/s and sdb has 4GB/s; the 'performance'
+value may then be '1' for sda and '4' for sdb.
+
+# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
+  dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4
+
+
+Or '2' for sda and '8' for sdb would also work.
+
+# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \
+  dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8
Index: 2.6.30-rc5/drivers/md/dm-service-time.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-service-time.c
+++ 2.6.30-rc5/drivers/md/dm-service-time.c
@@ -105,22 +105,27 @@ static int st_add_path(struct path_selec
 	unsigned repeat_count = ST_MIN_IO;
 	unsigned long long tmpll = 1;
 
+	/*
+	 * Arguments: [<repeat_count> [<performance>]]
+	 * 	<repeat_count>: The number of I/Os before switching path.
+	 * 			If not given, default (ST_MIN_IO) is used.
+	 * 	<performance>: The relative throughput value of the path
+	 *		       among all paths in the path-group.
+	 *		       If not given, minimum value '1' is used.
+	 *		       If '0' is given, the path isn't selected while
+	 * 		       other paths having a positive value are
+	 * 		       available.
+	 */
 	if (argc > 2) {
 		*error = "service-time ps: incorrect number of arguments";
 		return -EINVAL;
 	}
 
-	/* First path argument is number of I/Os before switching path. */
 	if ((argc > 0) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
 		*error = "service-time ps: invalid repeat count";
 		return -EINVAL;
 	}
 
-	/*
-	 * Second path argument is a relative performance value.
-	 * If 0 is given, the path isn't used while other paths having
-	 * a positive value are available.
-	 */
 	if ((argc == 2) && (sscanf(argv[1], "%llu", &tmpll) != 1)) {
 		*error = "service-time ps: invalid performance value";
 		return -EINVAL;
@@ -164,10 +169,19 @@ static int st_reinstate_path(struct path
 }
 
 /*
+ * Compare the estimated service time of 2 paths, pi1 and pi2,
+ * for the incoming I/O.
+ *
  * Returns:
  * < 0 : pi1 is better
  * 0   : no difference between pi1 and pi2
  * > 0 : pi2 is better
+ *
+ * Description:
+ * Basically, the service time is estimated by:
+ *     ('pi->in-flight-size' + 'incoming') / 'pi->perf'
+ * To reduce the calculation, some optimizations are made.
+ * (See comments inline)
  */
 static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
 			   size_t incoming)


* Re: Re: [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
  2009-05-15  3:09     ` Kiyoshi Ueda
@ 2009-05-18 11:46       ` Alasdair G Kergon
  2009-05-19  2:59         ` Kiyoshi Ueda
  0 siblings, 1 reply; 12+ messages in thread
From: Alasdair G Kergon @ 2009-05-18 11:46 UTC (permalink / raw)
  To: device-mapper development

On Fri, May 15, 2009 at 12:09:21PM +0900, Kiyoshi Ueda wrote:
> However, if I limit the 'perf' value to a smaller range (e.g. 0 to 100),
> such overflow shouldn't happen.  So I should be able to use
> multiplication.  Also, I don't need to use size_t for 'perf',
> because 'unsigned' should be enough for such small values.
 
I think a small range would be adequate - look at the number sizes and
go 0-100 or 0-1000 as appropriate.

Thinking of naming, is it 'relative_throughput' ?

However, is it also going to be possible to extend this target to measure
the actual throughput?  If we did that, would we use a special value to
indicate it or add a new table parameter and keep the 'perf' ones as a
manual adjustment factor applied on top of the calculated one?

Alasdair


* Re: Re: [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
  2009-05-18 11:46       ` Alasdair G Kergon
@ 2009-05-19  2:59         ` Kiyoshi Ueda
  2009-05-19  8:13           ` Kiyoshi Ueda
  0 siblings, 1 reply; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-05-19  2:59 UTC (permalink / raw)
  To: Alasdair Kergon; +Cc: device-mapper development

Hi Alasdair,

On 05/18/2009 08:46 PM +0900, Alasdair G Kergon wrote:
> On Fri, May 15, 2009 at 12:09:21PM +0900, Kiyoshi Ueda wrote:
>> However, if I limit the 'perf' value to a smaller range (e.g. 0 to 100),
>> such overflow shouldn't happen.  So I should be able to use
>> multiplication.  Also, I don't need to use size_t for 'perf',
>> because 'unsigned' should be enough for such small values.
>  
> I think a small range would be adequate - look at the number sizes and
> go 0-100 or 0-1000 as appropriate.
> 
> Thinking of naming, is it 'relative_throughput' ?

Yes, 'relative_throughput' may be better naming.  I'll take it.


> However, is it also going to be possible to extend this target to measure
> the actual throughput?  If we did that, would we use a special value to
> indicate it or add a new table parameter and keep the 'perf' ones as a
> manual adjustment factor applied on top of the calculated one?

The older version of dm-service-time calculated the actual throughput
dynamically using part_stat:
    http://marc.info/?l=dm-devel&m=123321314426149&w=2

But the calculated values were often noisy and it didn't work well.
(e.g. once the throughput value of a path was calculated to be very small,
      the path was never used, so the small throughput value was never
      updated, so the path was never used, and so on.)

So I think doing such calculations in user space and keeping the
kernel code simple is better.
For dynamically changing path bandwidth, a user-space daemon can
measure the throughput periodically and notify the dm device.
I've been thinking of adding a new dm message handler to update
'relative_throughput' in the future.
What do you think?

Thanks,
Kiyoshi Ueda


* Re: Re: [PATCH 3/3] dm-mpath: add service-time oriented dynamic load balancer
  2009-05-19  2:59         ` Kiyoshi Ueda
@ 2009-05-19  8:13           ` Kiyoshi Ueda
  0 siblings, 0 replies; 12+ messages in thread
From: Kiyoshi Ueda @ 2009-05-19  8:13 UTC (permalink / raw)
  To: Alasdair Kergon; +Cc: device-mapper development

[-- Attachment #1: Type: text/plain, Size: 1351 bytes --]

Hi Alasdair,

I attached 3 patches below.
  - rqdm-dlb-04-service-time-dlb-maxlen-type-fix.patch:
    Use 'unsigned' instead of 'unsigned int' for maxlen.

  - rqdm-dlb-04-service-time-dlb-add-perf-limit.patch:
    Limit the 'perf' value within the range of 0-100.

  - rqdm-dlb-04-service-time-dlb-naming-change.patch:
    Change the 'perf' naming to 'relative_throughput'.

Please review and apply if you have no problem.

On 05/19/2009 11:59 AM +0900, Kiyoshi Ueda wrote:
> Hi Alasdair,
> 
> On 05/18/2009 08:46 PM +0900, Alasdair G Kergon wrote:
>> On Fri, May 15, 2009 at 12:09:21PM +0900, Kiyoshi Ueda wrote:
>>> However, if I limit the 'perf' value to a smaller range (e.g. 0 to 100),
>>> such overflow shouldn't happen.  So I should be able to use
>>> multiplication.  Also, I don't need to use size_t for 'perf',
>>> because 'unsigned' should be enough for such small values.
>>  
>> I think a small range would be adequate - look at the number sizes and
>> go 0-100 or 0-1000 as appropriate.

See rqdm-dlb-04-service-time-dlb-add-perf-limit.patch.


>> Thinking of naming, is it 'relative_throughput' ?
> 
> Yes, 'relative_throughput' may be better naming.  I'll take it.

See rqdm-dlb-04-service-time-dlb-naming-change.patch.
Maybe the string is too long.  If you think so too,
'throughput' may be an alternative?

Thanks,
Kiyoshi Ueda

[-- Attachment #2: rqdm-dlb-04-service-time-dlb-maxlen-type-fix.patch --]
[-- Type: text/plain, Size: 782 bytes --]

Use 'unsigned' instead of 'unsigned int' for maxlen in dm-service-time.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
 drivers/md/dm-service-time.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: 2.6.30-rc5/drivers/md/dm-service-time.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-service-time.c
+++ 2.6.30-rc5/drivers/md/dm-service-time.c
@@ -72,7 +72,7 @@ static void st_destroy(struct path_selec
 }
 
 static int st_status(struct path_selector *ps, struct dm_path *path,
-		     status_type_t type, char *result, unsigned int maxlen)
+		     status_type_t type, char *result, unsigned maxlen)
 {
 	unsigned sz = 0;
 	struct path_info *pi;

[-- Attachment #3: rqdm-dlb-04-service-time-dlb-add-perf-limit.patch --]
[-- Type: text/plain, Size: 5554 bytes --]

o Limited the second table argument (relative throughput value)
  to the range 0-100.
  As a result, no need to use 'size_t' for ->perf.  Use 'unsigned'.
  Updated comments/documents.

o Converted the service time calculation from division to
  multiplication.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
 Documentation/device-mapper/dm-service-time.txt |    1 
 drivers/md/dm-service-time.c                    |   57 +++++++++++++++---------
 2 files changed, 37 insertions(+), 21 deletions(-)

Index: 2.6.30-rc5/drivers/md/dm-service-time.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-service-time.c
+++ 2.6.30-rc5/drivers/md/dm-service-time.c
@@ -13,7 +13,10 @@
 
 #define DM_MSG_PREFIX	"multipath service-time"
 #define ST_MIN_IO	1
-#define ST_VERSION	"0.1.0"
+#define ST_MAX_PERF	100
+#define ST_MAX_PERF_SHIFT	7
+#define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_PERF_SHIFT)
+#define ST_VERSION	"0.2.0"
 
 struct selector {
 	struct list_head valid_paths;
@@ -24,7 +27,7 @@ struct path_info {
 	struct list_head list;
 	struct dm_path *path;
 	unsigned repeat_count;
-	size_t perf;
+	unsigned perf;
 	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
 };
 
@@ -84,12 +87,11 @@ static int st_status(struct path_selecto
 
 		switch (type) {
 		case STATUSTYPE_INFO:
-			DMEMIT("%d %llu ", atomic_read(&pi->in_flight_size),
-			       (unsigned long long)pi->perf);
+			DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
+			       pi->perf);
 			break;
 		case STATUSTYPE_TABLE:
-			DMEMIT("%u %llu ", pi->repeat_count,
-			       (unsigned long long)pi->perf);
+			DMEMIT("%u %u ", pi->repeat_count, pi->perf);
 			break;
 		}
 	}
@@ -103,7 +105,7 @@ static int st_add_path(struct path_selec
 	struct selector *s = ps->context;
 	struct path_info *pi;
 	unsigned repeat_count = ST_MIN_IO;
-	unsigned long long tmpll = 1;
+	unsigned perf = 1;
 
 	/*
 	 * Arguments: [<repeat_count> [<performance>]]
@@ -111,6 +113,7 @@ static int st_add_path(struct path_selec
 	 * 			If not given, default (ST_MIN_IO) is used.
 	 * 	<performance>: The relative throughput value of the path
 	 *		       among all paths in the path-group.
+	 * 		       The valid range: 0-<ST_MAX_PERF>
 	 *		       If not given, minimum value '1' is used.
 	 *		       If '0' is given, the path isn't selected while
 	 * 		       other paths having a positive value are
@@ -126,7 +129,8 @@ static int st_add_path(struct path_selec
 		return -EINVAL;
 	}
 
-	if ((argc == 2) && (sscanf(argv[1], "%llu", &tmpll) != 1)) {
+	if ((argc == 2) &&
+	    (sscanf(argv[1], "%u", &perf) != 1 || perf > ST_MAX_PERF)) {
 		*error = "service-time ps: invalid performance value";
 		return -EINVAL;
 	}
@@ -140,7 +144,7 @@ static int st_add_path(struct path_selec
 
 	pi->path = path;
 	pi->repeat_count = repeat_count;
-	pi->perf = tmpll;
+	pi->perf = perf;
 	atomic_set(&pi->in_flight_size, 0);
 
 	path->pscontext = pi;
@@ -186,7 +190,7 @@ static int st_reinstate_path(struct path
 static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
 			   size_t incoming)
 {
-	size_t sz1, sz2;
+	size_t sz1, sz2, st1, st2;
 
 	sz1 = atomic_read(&pi1->in_flight_size);
 	sz2 = atomic_read(&pi2->in_flight_size);
@@ -206,21 +210,32 @@ static int st_compare_load(struct path_i
 
 	/*
 	 * Case 3: Calculate service time. Choose faster path.
-	 *         if ((sz1+incoming)/pi1->perf < (sz2+incoming)/pi2->perf) pi1
-	 *         if ((sz1+incoming)/pi1->perf > (sz2+incoming)/pi2->perf) pi2
+	 *         Service time using pi1: st1 = (sz1 + incoming) / pi1->perf
+	 *         Service time using pi2: st2 = (sz2 + incoming) / pi2->perf
+	 *
+	 *         To avoid the division, transform the expression to use
+	 *         multiplication.
+	 *         Because ->perf > 0 here, the comparison
+	 *         (sz1 + incoming) / pi1->perf < (sz2 + incoming) / pi2->perf
+	 *         is equivalent to
+	 *         (sz1 + incoming) * pi2->perf < (sz2 + incoming) * pi1->perf
+	 *         so use the latter form.
 	 */
 	sz1 += incoming;
 	sz2 += incoming;
-	while (sz1 && sz2 && (sz1 < pi1->perf) && (sz2 < pi2->perf)) {
-		/* Size is not big enough to compare by division. Shift up */
-		sz1 <<= 2;
-		sz2 <<= 2;
+	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
+		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
+		/*
+		 * Size may be too big for multiplying pi->perf and overflow.
+		 * To avoid the overflow and mis-selection, shift down both.
+		 */
+		sz1 >>= ST_MAX_PERF_SHIFT;
+		sz2 >>= ST_MAX_PERF_SHIFT;
 	}
-	do_div(sz1, pi1->perf);
-	do_div(sz2, pi2->perf);
-
-	if (sz1 != sz2)
-		return sz1 - sz2;
+	st1 = sz1 * pi2->perf;
+	st2 = sz2 * pi1->perf;
+	if (st1 != st2)
+		return st1 - st2;
 
 	/*
 	 * Case 4: Service time is equal. Choose higher performance path.
Index: 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
===================================================================
--- 2.6.30-rc5.orig/Documentation/device-mapper/dm-service-time.txt
+++ 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
@@ -19,6 +19,7 @@ Table parameters for each path: [<repeat
 			the default value, see the activated table.
 	<performance>: The relative throughput value of the path
 		       among all paths in the path-group.
+		       The valid range is 0-100.
 		       If not given, minimum value '1' is used.
 		       If '0' is given, the path isn't selected while
 		       other paths having a positive value are available.

[-- Attachment #4: rqdm-dlb-04-service-time-dlb-naming-change.patch --]
[-- Type: text/plain, Size: 10844 bytes --]

Changed the name of the member 'perf' in the path_info structure
to 'relative_throughput' for readability.

Also, updated the related comments and documentation.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
---
 Documentation/device-mapper/dm-service-time.txt |   43 ++++++------
 drivers/md/dm-service-time.c                    |   84 +++++++++++++-----------
 2 files changed, 69 insertions(+), 58 deletions(-)

Index: 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
===================================================================
--- 2.6.30-rc5.orig/Documentation/device-mapper/dm-service-time.txt
+++ 2.6.30-rc5/Documentation/device-mapper/dm-service-time.txt
@@ -12,24 +12,25 @@ in a path-group, and it can be specified
 
 The path selector name is 'service-time'.
 
-Table parameters for each path: [<repeat_count> [<performance>]]
+Table parameters for each path: [<repeat_count> [<relative_throughput>]]
 	<repeat_count>: The number of I/Os to dispatch using the selected
 			path before switching to the next path.
 			If not given, internal default is used.  To check
 			the default value, see the activated table.
-	<performance>: The relative throughput value of the path
-		       among all paths in the path-group.
-		       The valid range is 0-100.
-		       If not given, minimum value '1' is used.
-		       If '0' is given, the path isn't selected while
-		       other paths having a positive value are available.
+	<relative_throughput>: The relative throughput value of the path
+			among all paths in the path-group.
+			The valid range is 0-100.
+			If not given, minimum value '1' is used.
+			If '0' is given, the path isn't selected while
+			other paths having a positive value are available.
 
-Status for each path: <status> <fail-count> <in-flight-size> <performance>
+Status for each path: <status> <fail-count> <in-flight-size> \
+		      <relative_throughput>
 	<status>: 'A' if the path is active, 'F' if the path is failed.
 	<fail-count>: The number of path failures.
 	<in-flight-size>: The size of in-flight I/Os on the path.
-	<performance>: The relative throughput value of the path
-		       among all paths in the path-group.
+	<relative_throughput>: The relative throughput value of the path
+			among all paths in the path-group.
 
 
 Algorithm
@@ -40,31 +41,33 @@ dispatched and subtracts when completed.
 Basically, dm-service-time selects a path having minimum service time
 which is calculated by:
 
-	('in-flight-size' + 'size-of-incoming-io') / 'performance'
+	('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput'
 
 However, some optimizations below are used to reduce the calculation
 as much as possible.
 
-	1. If the paths have the same 'performance', skip the division
-	   and just compare the 'in-flight-size'.
+	1. If the paths have the same 'relative_throughput', skip
+	   the division and just compare the 'in-flight-size'.
 
 	2. If the paths have the same 'in-flight-size', skip the division
-	   and just compare the 'performance'.
+	   and just compare the 'relative_throughput'.
 
-	3. If some paths have non-zero 'performance' and others have zero
-	   'performance', ignore those paths with zero 'performance'.
+	3. If some paths have non-zero 'relative_throughput' and others
+	   have zero 'relative_throughput', ignore those paths with zero
+	   'relative_throughput'.
 
 If such optimizations can't be applied, calculate service time, and
 compare service time.
-If calculated service time is equal, the path having maximum 'performance'
-may be better.  So compare 'performance' then.
+If calculated service time is equal, the path having maximum
+'relative_throughput' may be better.  So compare 'relative_throughput'
+then.
 
 
 Examples
 ========
 In case that 2 paths (sda and sdb) are used with repeat_count == 128
-and sda has an average throughput 1GB/s and sdb has 4GB/s, 'performance'
-value may be '1' for sda and '4' for sdb.
+and sda has an average throughput 1GB/s and sdb has 4GB/s,
+'relative_throughput' value may be '1' for sda and '4' for sdb.
 
 # echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
   | dmsetup create test
Index: 2.6.30-rc5/drivers/md/dm-service-time.c
===================================================================
--- 2.6.30-rc5.orig/drivers/md/dm-service-time.c
+++ 2.6.30-rc5/drivers/md/dm-service-time.c
@@ -13,9 +13,9 @@
 
 #define DM_MSG_PREFIX	"multipath service-time"
 #define ST_MIN_IO	1
-#define ST_MAX_PERF	100
-#define ST_MAX_PERF_SHIFT	7
-#define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_PERF_SHIFT)
+#define ST_MAX_RELATIVE_THROUGHPUT	100
+#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT	7
+#define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
 #define ST_VERSION	"0.2.0"
 
 struct selector {
@@ -27,7 +27,7 @@ struct path_info {
 	struct list_head list;
 	struct dm_path *path;
 	unsigned repeat_count;
-	unsigned perf;
+	unsigned relative_throughput;
 	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
 };
 
@@ -88,10 +88,11 @@ static int st_status(struct path_selecto
 		switch (type) {
 		case STATUSTYPE_INFO:
 			DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
-			       pi->perf);
+			       pi->relative_throughput);
 			break;
 		case STATUSTYPE_TABLE:
-			DMEMIT("%u %u ", pi->repeat_count, pi->perf);
+			DMEMIT("%u %u ", pi->repeat_count,
+			       pi->relative_throughput);
 			break;
 		}
 	}
@@ -105,19 +106,19 @@ static int st_add_path(struct path_selec
 	struct selector *s = ps->context;
 	struct path_info *pi;
 	unsigned repeat_count = ST_MIN_IO;
-	unsigned perf = 1;
+	unsigned relative_throughput = 1;
 
 	/*
-	 * Arguments: [<repeat_count> [<performance>]]
+	 * Arguments: [<repeat_count> [<relative_throughput>]]
 	 * 	<repeat_count>: The number of I/Os before switching path.
 	 * 			If not given, default (ST_MIN_IO) is used.
-	 * 	<performance>: The relative throughput value of the path
-	 *		       among all paths in the path-group.
-	 * 		       The valid range: 0-<ST_MAX_PERF>
-	 *		       If not given, minimum value '1' is used.
-	 *		       If '0' is given, the path isn't selected while
-	 * 		       other paths having a positive value are
-	 * 		       available.
+	 * 	<relative_throughput>: The relative throughput value of
+	 *			the path among all paths in the path-group.
+	 * 			The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
+	 *			If not given, minimum value '1' is used.
+	 *			If '0' is given, the path isn't selected while
+	 * 			other paths having a positive value are
+	 * 			available.
 	 */
 	if (argc > 2) {
 		*error = "service-time ps: incorrect number of arguments";
@@ -130,8 +131,9 @@ static int st_add_path(struct path_selec
 	}
 
 	if ((argc == 2) &&
-	    (sscanf(argv[1], "%u", &perf) != 1 || perf > ST_MAX_PERF)) {
-		*error = "service-time ps: invalid performance value";
+	    (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
+	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
+		*error = "service-time ps: invalid relative_throughput value";
 		return -EINVAL;
 	}
 
@@ -144,7 +146,7 @@ static int st_add_path(struct path_selec
 
 	pi->path = path;
 	pi->repeat_count = repeat_count;
-	pi->perf = perf;
+	pi->relative_throughput = relative_throughput;
 	atomic_set(&pi->in_flight_size, 0);
 
 	path->pscontext = pi;
@@ -183,7 +185,7 @@ static int st_reinstate_path(struct path
  *
  * Description:
  * Basically, the service time is estimated by:
- *     ('pi->in-flight-size' + 'incoming') / 'pi->perf'
+ *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
  * To reduce the calculation, some optimizations are made.
  * (See comments inline)
  */
@@ -196,29 +198,34 @@ static int st_compare_load(struct path_i
 	sz2 = atomic_read(&pi2->in_flight_size);
 
 	/*
-	 * Case 1: Both have same performance value. Choose less loaded path.
+	 * Case 1: Both have same throughput value. Choose less loaded path.
 	 */
-	if (pi1->perf == pi2->perf)
+	if (pi1->relative_throughput == pi2->relative_throughput)
 		return sz1 - sz2;
 
 	/*
-	 * Case 2a: Both have same load. Choose higher performance path.
-	 * Case 2b: One path has no performance value. Choose the other one.
+	 * Case 2a: Both have same load. Choose higher throughput path.
+	 * Case 2b: One path has no throughput value. Choose the other one.
 	 */
-	if (sz1 == sz2 || !pi1->perf || !pi2->perf)
-		return pi2->perf - pi1->perf;
+	if (sz1 == sz2 ||
+	    !pi1->relative_throughput || !pi2->relative_throughput)
+		return pi2->relative_throughput - pi1->relative_throughput;
 
 	/*
 	 * Case 3: Calculate service time. Choose faster path.
-	 *         Service time using pi1: st1 = (sz1 + incoming) / pi1->perf
-	 *         Service time using pi2: st2 = (sz2 + incoming) / pi2->perf
+	 *         Service time using pi1:
+	 *             st1 = (sz1 + incoming) / pi1->relative_throughput
+	 *         Service time using pi2:
+	 *             st2 = (sz2 + incoming) / pi2->relative_throughput
 	 *
 	 *         To avoid the division, transform the expression to use
 	 *         multiplication.
-	 *         Because ->perf > 0 here, if st1 < st2, the expressions
-	 *         below are equivalent:
-	 *         (sz1 + incoming) / pi1->perf < (sz2 + incoming) / pi2->perf
-	 *         (sz1 + incoming) * pi2->perf < (sz2 + incoming) * pi1->perf
+	 *         Because ->relative_throughput > 0 here, if st1 < st2,
+	 *         the expressions below are equivalent:
+	 *             (sz1 + incoming) / pi1->relative_throughput <
+	 *                 (sz2 + incoming) / pi2->relative_throughput
+	 *             (sz1 + incoming) * pi2->relative_throughput <
+	 *                 (sz2 + incoming) * pi1->relative_throughput
 	 *         So use the latter one.
 	 */
 	sz1 += incoming;
@@ -226,21 +233,22 @@ static int st_compare_load(struct path_i
 	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
 		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
 		/*
-		 * Size may be too big for multiplying pi->perf and overflow.
+		 * Size may be too big for multiplying pi->relative_throughput
+		 * and overflow.
 		 * To avoid the overflow and mis-selection, shift down both.
 		 */
-		sz1 >>= ST_MAX_PERF_SHIFT;
-		sz2 >>= ST_MAX_PERF_SHIFT;
+		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
 	}
-	st1 = sz1 * pi2->perf;
-	st2 = sz2 * pi1->perf;
+	st1 = sz1 * pi2->relative_throughput;
+	st2 = sz2 * pi1->relative_throughput;
 	if (st1 != st2)
 		return st1 - st2;
 
 	/*
-	 * Case 4: Service time is equal. Choose higher performance path.
+	 * Case 4: Service time is equal. Choose higher throughput path.
 	 */
-	return pi2->perf - pi1->perf;
+	return pi2->relative_throughput - pi1->relative_throughput;
 }
 
 static struct dm_path *st_select_path(struct path_selector *ps,
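
For completeness, a hedged shell sketch of how the table line from the
Examples section above would typically be loaded and inspected. It assumes a
kernel with this patch set (plus the request-based dm patches) applied and
that 8:0 and 8:16 really are two paths to the same LUN; the map name 'test'
follows the document's example, everything else is standard dmsetup usage.

# Two paths: 8:0 with relative_throughput 1, 8:16 with 4,
# repeat_count 128 on both.
echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
	| dmsetup create test

# Read back the loaded table and the per-path status fields
# (<status> <fail-count> <in-flight-size> <relative_throughput>):
dmsetup table test
dmsetup status test

The status output should show the in-flight byte count per path changing
under load, while the table output echoes back the repeat_count and
relative_throughput values that were passed in.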


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2009-05-19  8:13 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-04-24  8:04 [PATCH 0/3] dm-mpath: dynamic load balancers (v2) Kiyoshi Ueda
2009-04-24  8:06 ` [PATCH 1/3] dm-mpath: interface change for dynamic load balancers Kiyoshi Ueda
2009-04-24  8:07 ` [PATCH 2/3] dm-mpath: add queue-length oriented dynamic load balancer Kiyoshi Ueda
2009-05-09  0:31   ` Alasdair G Kergon
2009-05-14  6:14     ` Kiyoshi Ueda
2009-05-14 12:34       ` Alasdair G Kergon
2009-04-24  8:08 ` [PATCH 3/3] dm-mpath: add service-time " Kiyoshi Ueda
2009-05-09  0:49   ` Alasdair G Kergon
2009-05-15  3:09     ` Kiyoshi Ueda
2009-05-18 11:46       ` Alasdair G Kergon
2009-05-19  2:59         ` Kiyoshi Ueda
2009-05-19  8:13           ` Kiyoshi Ueda

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.