All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] dm-ioband: I/O bandwidth controller v1.10.0: Introduction
@ 2009-01-20  5:10 Ryo Tsuruta
  2009-01-20  5:11 ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Ryo Tsuruta
  2009-01-20 15:04 ` [PATCH 0/2] dm-ioband: I/O bandwidth controller v1.10.0: Introduction Alasdair G Kergon
  0 siblings, 2 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-20  5:10 UTC (permalink / raw)
  To: agk, dm-devel

Hi Alasdair and everyone,

This is the dm-ioband version 1.10.0 release. The patch is created
against the dm quilt tree so that it gets into Patchwork.

Dm-ioband is an I/O bandwidth controller implemented as a device-mapper
driver, which gives specified bandwidth to each job running on the same
physical device.

A lot more information (manual, benchmark results, all-in-one patch
and so on) is available at http://people.valinux.co.jp/~ryov/dm-ioband/ .
I welcome any feedback and suggestions.

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-20  5:10 [PATCH 0/2] dm-ioband: I/O bandwidth controller v1.10.0: Introduction Ryo Tsuruta
@ 2009-01-20  5:11 ` Ryo Tsuruta
  2009-01-20  5:12   ` [PATCH 2/2] dm-ioband: I/O bandwidth controller v1.10.0: Document Ryo Tsuruta
                     ` (3 more replies)
  2009-01-20 15:04 ` [PATCH 0/2] dm-ioband: I/O bandwidth controller v1.10.0: Introduction Alasdair G Kergon
  1 sibling, 4 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-20  5:11 UTC (permalink / raw)
  To: agk, dm-devel

This patch is the dm-ioband version 1.10.0 release.

Signed-off-by: Ryo Tsuruta <ryov@valinux.co.jp>
Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>

---
 drivers/md/Kconfig            |   13 
 drivers/md/Makefile           |    2 
 drivers/md/dm-ioband-ctl.c    | 1326 ++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-ioband-policy.c |  460 ++++++++++++++
 drivers/md/dm-ioband-type.c   |   76 ++
 drivers/md/dm-ioband.h        |  194 ++++++
 6 files changed, 2071 insertions(+)

Index: linux-2.6/drivers/md/Kconfig
===================================================================
--- linux-2.6.orig/drivers/md/Kconfig
+++ linux-2.6/drivers/md/Kconfig
@@ -289,4 +289,17 @@ config DM_UEVENT
 	---help---
 	Generate udev events for DM events.
 
+config DM_IOBAND
+	tristate "I/O bandwidth control (EXPERIMENTAL)"
+	depends on BLK_DEV_DM && EXPERIMENTAL
+	---help---
+	This device-mapper target allows you to define how the
+	available bandwidth of a storage device should be
+	shared between processes, cgroups, partitions or LUNs.
+
+	Information on how to use dm-ioband is available in:
+	   <file:Documentation/device-mapper/ioband.txt>.
+
+	If unsure, say N.
+
 endif # MD
Index: linux-2.6/drivers/md/Makefile
===================================================================
--- linux-2.6.orig/drivers/md/Makefile
+++ linux-2.6/drivers/md/Makefile
@@ -8,6 +8,7 @@ dm-multipath-objs := dm-path-selector.o 
 dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
 dm-mirror-objs	:= dm-raid1.o
+dm-ioband-objs	:= dm-ioband-ctl.o dm-ioband-policy.o dm-ioband-type.o
 md-mod-objs     := md.o bitmap.o
 raid456-objs	:= raid5.o raid6algos.o raid6recov.o raid6tables.o \
 		   raid6int1.o raid6int2.o raid6int4.o \
@@ -37,6 +38,7 @@ obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipa
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
+obj-$(CONFIG_DM_IOBAND)		+= dm-ioband.o
 
 quiet_cmd_unroll = UNROLL  $@
       cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
Index: linux-2.6/drivers/md/dm-ioband-ctl.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/md/dm-ioband-ctl.c
@@ -0,0 +1,1326 @@
+/*
+ * Copyright (C) 2008 VA Linux Systems Japan K.K.
+ * Authors: Hirokazu Takahashi <taka@valinux.co.jp>
+ *          Ryo Tsuruta <ryov@valinux.co.jp>
+ *
+ *  I/O bandwidth control
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/bio.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/raid/md.h>
+#include <linux/rbtree.h>
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "dm-ioband.h"
+
+#define DM_MSG_PREFIX "ioband"
+#define POLICY_PARAM_START 6
+#define POLICY_PARAM_DELIM "=:,"
+
+static LIST_HEAD(ioband_device_list);
+/* to protect ioband_device_list */
+static DEFINE_SPINLOCK(ioband_devicelist_lock);
+
+static void suspend_ioband_device(struct ioband_device *, unsigned long, int);
+static void resume_ioband_device(struct ioband_device *);
+static void ioband_conduct(struct work_struct *);
+static void ioband_hold_bio(struct ioband_group *, struct bio *);
+static struct bio *ioband_pop_bio(struct ioband_group *);
+static int ioband_set_param(struct ioband_group *, char *, char *);
+static int ioband_group_attach(struct ioband_group *, int, char *);
+static int ioband_group_type_select(struct ioband_group *, char *);
+
+long ioband_debug;	/* just for debugging */
+
+/* Empty statement passed as the "cmd" argument of wait_event_lock_irq(). */
+static void do_nothing(void) {}
+
+/*
+ * Look up the bandwidth policy named @name in dm_ioband_policy_type and
+ * install it on @dp, passing @argc/@argv to the policy's own init hook.
+ *
+ * Returns 0 on success (or if @name is already the active policy),
+ * -EINVAL for an unknown policy name, or the error returned by
+ * p_policy_init().  The device is suspended while the per-group state
+ * is torn down and rebuilt for the new policy.
+ */
+static int policy_init(struct ioband_device *dp, char *name,
+						int argc, char **argv)
+{
+	struct policy_type *p;
+	struct ioband_group *gp;
+	unsigned long flags;
+	int r;
+
+	/* linear scan of the NULL-terminated policy table */
+	for (p = dm_ioband_policy_type; p->p_name; p++) {
+		if (!strcmp(name, p->p_name))
+			break;
+	}
+	if (!p->p_name)
+		return -EINVAL;
+
+	spin_lock_irqsave(&dp->g_lock, flags);
+	if (dp->g_policy == p) {
+		/* do nothing if the same policy is already set */
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		return 0;
+	}
+
+	/*
+	 * Quiesce the device, then drop each group's old-policy state.
+	 * NOTE: suspend_ioband_device() temporarily releases g_lock.
+	 */
+	suspend_ioband_device(dp, flags, 1);
+	list_for_each_entry(gp, &dp->g_groups, c_list)
+		dp->g_group_dtr(gp);
+
+	/* switch to the new policy */
+	dp->g_policy = p;
+	r = p->p_policy_init(dp, argc, argv);
+	if (!r) {
+		/* fall back to the generic hold/pop hooks if the policy
+		 * did not install its own */
+		if (!dp->g_hold_bio)
+			dp->g_hold_bio = ioband_hold_bio;
+		if (!dp->g_pop_bio)
+			dp->g_pop_bio = ioband_pop_bio;
+
+		list_for_each_entry(gp, &dp->g_groups, c_list)
+			dp->g_group_ctr(gp, NULL);
+	}
+	resume_ioband_device(dp);
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+	return r;
+}
+
+/*
+ * Find the ioband_device named @name on ioband_device_list and take a
+ * reference, or create a fresh one with the given throttle/limit values.
+ *
+ * The new device (and its workqueue) is allocated *before* taking the
+ * list spinlock so no allocation happens under the lock; if an existing
+ * device is found the speculative allocation is discarded.
+ * Returns NULL on allocation failure.
+ */
+static struct ioband_device *alloc_ioband_device(char *name,
+					int io_throttle, int io_limit)
+
+{
+	struct ioband_device *dp, *new;
+	unsigned long flags;
+
+	new = kzalloc(sizeof(struct ioband_device), GFP_KERNEL);
+	if (!new)
+		return NULL;
+
+	/*
+	 * Prepare its own workqueue as generic_make_request() may
+	 * potentially block the workqueue when submitting BIOs.
+	 */
+	new->g_ioband_wq = create_workqueue("kioband");
+	if (!new->g_ioband_wq) {
+		kfree(new);
+		return NULL;
+	}
+
+	spin_lock_irqsave(&ioband_devicelist_lock, flags);
+	list_for_each_entry(dp, &ioband_device_list, g_list) {
+		if (!strcmp(dp->g_name, name)) {
+			/* already exists: share it and drop our copy */
+			dp->g_ref++;
+			spin_unlock_irqrestore(&ioband_devicelist_lock, flags);
+			destroy_workqueue(new->g_ioband_wq);
+			kfree(new);
+			return dp;
+		}
+	}
+
+	/* first user of this name: finish initializing and publish it */
+	INIT_DELAYED_WORK(&new->g_conductor, ioband_conduct);
+	INIT_LIST_HEAD(&new->g_groups);
+	INIT_LIST_HEAD(&new->g_list);
+	spin_lock_init(&new->g_lock);
+	mutex_init(&new->g_lock_device);
+	bio_list_init(&new->g_urgent_bios);
+	new->g_io_throttle = io_throttle;
+	new->g_io_limit[0] = io_limit;	/* index 0/1 = read/write direction */
+	new->g_io_limit[1] = io_limit;
+	new->g_issued[0] = 0;
+	new->g_issued[1] = 0;
+	new->g_blocked = 0;
+	new->g_ref = 1;
+	new->g_flags = 0;
+	strlcpy(new->g_name, name, sizeof(new->g_name));
+	new->g_policy = NULL;
+	new->g_hold_bio = NULL;
+	new->g_pop_bio = NULL;
+	init_waitqueue_head(&new->g_waitq);
+	init_waitqueue_head(&new->g_waitq_suspend);
+	init_waitqueue_head(&new->g_waitq_flush);
+	list_add_tail(&new->g_list, &ioband_device_list);
+
+	spin_unlock_irqrestore(&ioband_devicelist_lock, flags);
+	return new;
+}
+
+/*
+ * Drop one reference to @dp; on the last reference, unlink it from
+ * ioband_device_list and free it together with its workqueue.
+ * Counterpart of alloc_ioband_device().
+ */
+static void release_ioband_device(struct ioband_device *dp)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ioband_devicelist_lock, flags);
+	dp->g_ref--;
+	if (dp->g_ref > 0) {
+		spin_unlock_irqrestore(&ioband_devicelist_lock, flags);
+		return;
+	}
+	list_del(&dp->g_list);
+	spin_unlock_irqrestore(&ioband_devicelist_lock, flags);
+	/* teardown outside the spinlock; destroy_workqueue may sleep */
+	destroy_workqueue(dp->g_ioband_wq);
+	kfree(dp);
+}
+
+/*
+ * Return 1 once nothing is pending on @dp: no blocked bios, no waiters
+ * on the device or any group waitqueue, and (when @wait_completion is
+ * set) no bios still in flight.  Used as the wait condition below.
+ */
+static int is_ioband_device_flushed(struct ioband_device *dp,
+						int wait_completion)
+{
+	struct ioband_group *gp;
+
+	if (wait_completion && dp->g_issued[0] + dp->g_issued[1] > 0)
+		return 0;
+	if (dp->g_blocked || waitqueue_active(&dp->g_waitq))
+		return 0;
+	list_for_each_entry(gp, &dp->g_groups, c_list)
+		if (waitqueue_active(&gp->c_waitq))
+			return 0;
+	return 1;
+}
+
+/*
+ * Quiesce @dp: block new bios, push all groups down, and wait until the
+ * device is flushed (see above).
+ *
+ * Must be called with dp->g_lock held (irqsave with @flags); the lock is
+ * dropped while the workqueue is flushed and re-taken before waiting, so
+ * the caller's @flags are reused intentionally.  Returns with the lock
+ * held again.
+ */
+static void suspend_ioband_device(struct ioband_device *dp,
+				unsigned long flags, int wait_completion)
+{
+	struct ioband_group *gp;
+
+	/* block incoming bios */
+	set_device_suspended(dp);
+
+	/* wake up all blocked processes and go down all ioband groups */
+	wake_up_all(&dp->g_waitq);
+	list_for_each_entry(gp, &dp->g_groups, c_list) {
+		if (!is_group_down(gp)) {
+			set_group_down(gp);
+			set_group_need_up(gp);	/* remember to bring it back */
+		}
+		wake_up_all(&gp->c_waitq);
+	}
+
+	/* flush the already mapped bios */
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+	queue_delayed_work(dp->g_ioband_wq, &dp->g_conductor, 0);
+	flush_workqueue(dp->g_ioband_wq);
+
+	/* wait for all processes to wake up and bios to release */
+	spin_lock_irqsave(&dp->g_lock, flags);
+	wait_event_lock_irq(dp->g_waitq_flush,
+			is_ioband_device_flushed(dp, wait_completion),
+			dp->g_lock, do_nothing());
+}
+
+/*
+ * Undo suspend_ioband_device(): bring back up only the groups that
+ * suspend marked with "need up" (groups that were already down before
+ * the suspend stay down), then let incoming bios through again.
+ * Caller holds dp->g_lock.
+ */
+static void resume_ioband_device(struct ioband_device *dp)
+{
+	struct ioband_group *gp;
+
+	/* go up ioband groups */
+	list_for_each_entry(gp, &dp->g_groups, c_list) {
+		if (group_need_up(gp)) {
+			clear_group_need_up(gp);
+			clear_group_down(gp);
+		}
+	}
+
+	/* accept incoming bios */
+	wake_up_all(&dp->g_waitq_suspend);
+	clear_device_suspended(dp);
+}
+
+/*
+ * Binary-search @head's rbtree of sub-groups for the group with @id.
+ * Passing IOBAND_ID_ANY matches the first node visited (used to walk
+ * and drain the tree).  Returns NULL if no group matches.
+ * Caller holds dp->g_lock.
+ */
+static struct ioband_group *ioband_group_find(
+					struct ioband_group *head, int id)
+{
+	struct rb_node *node = head->c_group_root.rb_node;
+
+	while (node) {
+		struct ioband_group *p =
+			container_of(node, struct ioband_group, c_group_node);
+
+		if (p->c_id == id || id == IOBAND_ID_ANY)
+			return p;
+		node = (id < p->c_id) ? node->rb_left : node->rb_right;
+	}
+	return NULL;
+}
+
+/*
+ * Insert @gp into @root, keyed and ordered by c_id.  Standard rbtree
+ * insertion: walk down to the leaf position, then link and rebalance.
+ */
+static void ioband_group_add_node(struct rb_root *root,
+						struct ioband_group *gp)
+{
+	struct rb_node **new = &root->rb_node, *parent = NULL;
+	struct ioband_group *p;
+
+	while (*new) {
+		p = container_of(*new, struct ioband_group, c_group_node);
+		parent = *new;
+		new = (gp->c_id < p->c_id) ?
+					&(*new)->rb_left : &(*new)->rb_right;
+	}
+
+	rb_link_node(&gp->c_group_node, parent, new);
+	rb_insert_color(&gp->c_group_node, root);
+}
+
+/*
+ * Initialize @gp, run the active policy's per-group constructor with
+ * @param, link the group onto @dp's group list and — when @head is a
+ * parent group — into @head's rbtree (inheriting the parent's dm device
+ * and target).
+ *
+ * Returns 0, -EEXIST if @head already has a group with @id, or the
+ * error from g_group_ctr() (in which case @gp is unlinked again; the
+ * caller still owns and frees @gp).
+ */
+static int ioband_group_init(struct ioband_group *gp,
+    struct ioband_group *head, struct ioband_device *dp, int id, char *param)
+{
+	unsigned long flags;
+	int r;
+
+	INIT_LIST_HEAD(&gp->c_list);
+	bio_list_init(&gp->c_blocked_bios);
+	bio_list_init(&gp->c_prio_bios);
+	gp->c_id = id;	/* should be verified */
+	gp->c_blocked = 0;
+	gp->c_prio_blocked = 0;
+	memset(gp->c_stat, 0, sizeof(gp->c_stat));
+	init_waitqueue_head(&gp->c_waitq);
+	gp->c_flags = 0;
+	gp->c_group_root = RB_ROOT;
+	gp->c_banddev = dp;
+
+	spin_lock_irqsave(&dp->g_lock, flags);
+	if (head && ioband_group_find(head, id)) {
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		DMWARN("ioband_group: id=%d already exists.", id);
+		return -EEXIST;
+	}
+
+	list_add_tail(&gp->c_list, &dp->g_groups);
+
+	r = dp->g_group_ctr(gp, param);
+	if (r) {
+		/* roll back the list insertion on constructor failure */
+		list_del(&gp->c_list);
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		return r;
+	}
+
+	if (head) {
+		/* sub-group: index under the parent and share its device */
+		ioband_group_add_node(&head->c_group_root, gp);
+		gp->c_dev = head->c_dev;
+		gp->c_target = head->c_target;
+	}
+
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+
+	return 0;
+}
+
+/*
+ * Unlink @gp from the device's group list (and from @head's rbtree when
+ * @head is non-NULL), run the policy's per-group destructor, and free it.
+ * Caller holds dp->g_lock.
+ */
+static void ioband_group_release(struct ioband_group *head,
+						struct ioband_group *gp)
+{
+	struct ioband_device *dp = gp->c_banddev;
+
+	list_del(&gp->c_list);
+	if (head)
+		rb_erase(&gp->c_group_node, &head->c_group_root);
+	dp->g_group_dtr(gp);
+	kfree(gp);
+}
+
+/*
+ * Release every sub-group hanging off @gp (drained via IOBAND_ID_ANY
+ * lookups) and finally @gp itself.
+ */
+static void ioband_group_destroy_all(struct ioband_group *gp)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	struct ioband_group *group;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dp->g_lock, flags);
+	while ((group = ioband_group_find(gp, IOBAND_ID_ANY)))
+		ioband_group_release(gp, group);
+	ioband_group_release(NULL, gp);
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+}
+
+/*
+ * Mark @head and all of its sub-groups down (and, when @suspend is set,
+ * suspended), then kick the conductor and flush the workqueue so any
+ * held bios are dealt with.  Used from dtr and presuspend.
+ */
+static void ioband_group_stop_all(struct ioband_group *head, int suspend)
+{
+	struct ioband_device *dp = head->c_banddev;
+	struct ioband_group *p;
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dp->g_lock, flags);
+	for (node = rb_first(&head->c_group_root); node; node = rb_next(node)) {
+		p = rb_entry(node, struct ioband_group, c_group_node);
+		set_group_down(p);
+		if (suspend) {
+			set_group_suspended(p);
+			dprintk(KERN_ERR "ioband suspend: gp(%p)\n", p);
+		}
+	}
+	set_group_down(head);
+	if (suspend) {
+		set_group_suspended(head);
+		dprintk(KERN_ERR "ioband suspend: gp(%p)\n", head);
+	}
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+	/* flush outside the lock: the conductor takes g_lock itself */
+	queue_delayed_work(dp->g_ioband_wq, &dp->g_conductor, 0);
+	flush_workqueue(dp->g_ioband_wq);
+}
+
+/*
+ * Inverse of ioband_group_stop_all(): clear the down/suspended flags on
+ * @head and every sub-group.  Used from the resume hook.
+ */
+static void ioband_group_resume_all(struct ioband_group *head)
+{
+	struct ioband_device *dp = head->c_banddev;
+	struct ioband_group *p;
+	struct rb_node *node;
+	unsigned long flags;
+
+	spin_lock_irqsave(&dp->g_lock, flags);
+	for (node = rb_first(&head->c_group_root); node;
+							node = rb_next(node)) {
+		p = rb_entry(node, struct ioband_group, c_group_node);
+		clear_group_down(p);
+		clear_group_suspended(p);
+		dprintk(KERN_ERR "ioband resume: gp(%p)\n", p);
+	}
+	clear_group_down(head);
+	clear_group_suspended(head);
+	dprintk(KERN_ERR "ioband resume: gp(%p)\n", head);
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+}
+
+/*
+ * Parse a "<id>:<value>" (or bare "<value>") token from @s, using the
+ * delimiter set POLICY_PARAM_DELIM ("=:,").  On return *id is the parsed
+ * group id or IOBAND_ID_ANY when no id part was present, and *v points
+ * at the value part.  Returns 0 or the strict_strtol() error.
+ * Note: strsep() modifies @s in place.
+ */
+static int split_string(char *s, long *id, char **v)
+{
+	char *p, *q;
+	int r = 0;
+
+	*id = IOBAND_ID_ANY;
+	p = strsep(&s, POLICY_PARAM_DELIM);
+	q = strsep(&s, POLICY_PARAM_DELIM);
+	if (!q) {
+		*v = p;
+	} else {
+		r = strict_strtol(p, 0, id);
+		*v = q;
+	}
+	return r;
+}
+
+/*
+ * Create a new band device:
+ *   parameters:  <device> <device-group-id> <io_throttle> <io_limit>
+ *     <type> <policy> <policy-param...> <group-id:group-param...>
+ */
+/*
+ * Device-mapper constructor for an ioband target.
+ *
+ * argv layout (see the comment above): <device> <device-group-id>
+ * <io_throttle> <io_limit> <type> <policy> [policy params] [:default
+ * group param] [<id>:<param> ...].  io_throttle/io_limit of 0 select
+ * defaults (io_limit falls back to the queue's nr_requests).
+ *
+ * On failure every acquired resource is unwound via the goto ladder at
+ * the bottom — note the labels deliberately fall through into each
+ * other (group -> device -> dm device).
+ */
+static int ioband_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct ioband_group *gp;
+	struct ioband_device *dp;
+	struct dm_dev *dev;
+	int io_throttle;
+	int io_limit;
+	int i, r, start;
+	long val, id;
+	char *param;
+
+	if (argc < POLICY_PARAM_START) {
+		ti->error = "Requires " __stringify(POLICY_PARAM_START)
+							" or more arguments";
+		return -EINVAL;
+	}
+
+	if (strlen(argv[1]) > IOBAND_NAME_MAX) {
+		ti->error = "Ioband device name is too long";
+		return -EINVAL;
+	}
+	dprintk(KERN_ERR "ioband_ctr ioband device name:%s\n", argv[1]);
+
+	r = strict_strtol(argv[2], 0, &val);
+	if (r || val < 0) {
+		ti->error = "Invalid io_throttle";
+		return -EINVAL;
+	}
+	io_throttle = (val == 0) ? DEFAULT_IO_THROTTLE : val;
+
+	r = strict_strtol(argv[3], 0, &val);
+	if (r || val < 0) {
+		ti->error = "Invalid io_limit";
+		return -EINVAL;
+	}
+	io_limit = val;
+
+	r = dm_get_device(ti, argv[0], 0, ti->len,
+				dm_table_get_mode(ti->table), &dev);
+	if (r) {
+		ti->error = "Device lookup failed";
+		return r;
+	}
+
+	if (io_limit == 0) {
+		/* default the limit to the underlying queue depth */
+		struct request_queue *q;
+
+		q = bdev_get_queue(dev->bdev);
+		if (!q) {
+			ti->error = "Can't get queue size";
+			r = -ENXIO;
+			goto release_dm_device;
+		}
+		dprintk(KERN_ERR "ioband_ctr nr_requests:%lu\n",
+							q->nr_requests);
+		io_limit = q->nr_requests;
+	}
+
+	/* the limit must never be below the throttle threshold */
+	if (io_limit < io_throttle)
+		io_limit = io_throttle;
+	dprintk(KERN_ERR "ioband_ctr io_throttle:%d io_limit:%d\n",
+						io_throttle, io_limit);
+
+	dp = alloc_ioband_device(argv[1], io_throttle, io_limit);
+	if (!dp) {
+		ti->error = "Cannot create ioband device";
+		r = -EINVAL;
+		goto release_dm_device;
+	}
+
+	mutex_lock(&dp->g_lock_device);
+	r = policy_init(dp, argv[POLICY_PARAM_START - 1],
+			argc - POLICY_PARAM_START, &argv[POLICY_PARAM_START]);
+	if (r) {
+		ti->error = "Invalid policy parameter";
+		goto release_ioband_device;
+	}
+
+	gp = kzalloc(sizeof(struct ioband_group), GFP_KERNEL);
+	if (!gp) {
+		ti->error = "Cannot allocate memory for ioband group";
+		r = -ENOMEM;
+		goto release_ioband_device;
+	}
+
+	ti->private = gp;
+	gp->c_target = ti;
+	gp->c_dev = dev;
+
+	/* Find a default group parameter (first arg starting with ':') */
+	for (start = POLICY_PARAM_START; start < argc; start++)
+		if (argv[start][0] == ':')
+			break;
+	param = (start < argc) ? &argv[start][1] : NULL;
+
+	/* Create a default ioband group */
+	r = ioband_group_init(gp, NULL, dp, IOBAND_ID_ANY, param);
+	if (r) {
+		kfree(gp);
+		ti->error = "Cannot create default ioband group";
+		goto release_ioband_device;
+	}
+
+	r = ioband_group_type_select(gp, argv[4]);
+	if (r) {
+		ti->error = "Cannot set ioband group type";
+		goto release_ioband_group;
+	}
+
+	/* Create sub ioband groups from the remaining "<id>:<param>" args */
+	for (i = start + 1; i < argc; i++) {
+		r = split_string(argv[i], &id, &param);
+		if (r) {
+			ti->error = "Invalid ioband group parameter";
+			goto release_ioband_group;
+		}
+		r = ioband_group_attach(gp, id, param);
+		if (r) {
+			ti->error = "Cannot create ioband group";
+			goto release_ioband_group;
+		}
+	}
+	mutex_unlock(&dp->g_lock_device);
+	return 0;
+
+release_ioband_group:
+	ioband_group_destroy_all(gp);
+release_ioband_device:
+	mutex_unlock(&dp->g_lock_device);
+	release_ioband_device(dp);
+release_dm_device:
+	dm_put_device(ti, dev);
+	return r;
+}
+
+/*
+ * Device-mapper destructor: stop all groups, cancel any pending
+ * conductor work, release the underlying dm device, tear down the group
+ * tree and finally drop our reference on the shared ioband_device.
+ */
+static void ioband_dtr(struct dm_target *ti)
+{
+	struct ioband_group *gp = ti->private;
+	struct ioband_device *dp = gp->c_banddev;
+
+	mutex_lock(&dp->g_lock_device);
+	ioband_group_stop_all(gp, 0);
+	cancel_delayed_work_sync(&dp->g_conductor);
+	dm_put_device(ti, gp->c_dev);
+	ioband_group_destroy_all(gp);
+	mutex_unlock(&dp->g_lock_device);
+	release_ioband_device(dp);
+}
+
+/* Default g_hold_bio hook: queue @bio on the group's blocked list. */
+static void ioband_hold_bio(struct ioband_group *gp, struct bio *bio)
+{
+	/* Todo: The list should be split into a read list and a write list */
+	bio_list_add(&gp->c_blocked_bios, bio);
+}
+
+/* Default g_pop_bio hook: dequeue the next blocked bio (or NULL). */
+static struct bio *ioband_pop_bio(struct ioband_group *gp)
+{
+	return bio_list_pop(&gp->c_blocked_bios);
+}
+
+/*
+ * Heuristic urgency test based on the bio's first page: 0 = not urgent,
+ * 1 = urgent (page under reclaim), 2 = urgent swap I/O.
+ * NOTE(review): assumes the bio has at least one bvec with a valid
+ * bv_page — confirm for page-less (e.g. empty barrier) bios.
+ */
+static int is_urgent_bio(struct bio *bio)
+{
+	struct page *page = bio_iovec_idx(bio, 0)->bv_page;
+	/*
+	 * ToDo: A new flag should be added to struct bio, which indicates
+	 * 	it contains urgent I/O requests.
+	 */
+	if (!PageReclaim(page))
+		return 0;
+	if (PageSwapCache(page))
+		return 2;
+	return 1;
+}
+
+/*
+ * Decide whether a submitter must wait at device scope: blocks once the
+ * number of held bios reaches the combined read+write io_limit, and
+ * latches the device-blocked flag.  Never blocks for a group that is
+ * going down.  Caller holds dp->g_lock.
+ */
+static inline int device_should_block(struct ioband_group *gp)
+{
+	struct ioband_device *dp = gp->c_banddev;
+
+	if (is_group_down(gp))
+		return 0;
+	if (is_device_blocked(dp))
+		return 1;
+	if (dp->g_blocked >= dp->g_io_limit[0] + dp->g_io_limit[1]) {
+		set_device_blocked(dp);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Same idea at group scope: the active policy's g_should_block() hook
+ * decides, and the result is latched in the group-blocked flag until
+ * make_issue_list() clears it.  Caller holds dp->g_lock.
+ */
+static inline int group_should_block(struct ioband_group *gp)
+{
+	struct ioband_device *dp = gp->c_banddev;
+
+	if (is_group_down(gp))
+		return 0;
+	if (is_group_blocked(gp))
+		return 1;
+	if (dp->g_should_block(gp)) {
+		set_group_blocked(gp);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Throttle the submitting task.  Kernel threads and urgent bios only
+ * wait on the device-wide limit; ordinary tasks wait on their group's
+ * limit.  Called (and returns) with dp->g_lock held; the lock is
+ * dropped while sleeping by wait_event_lock_irq().
+ */
+static void prevent_burst_bios(struct ioband_group *gp, struct bio *bio)
+{
+	struct ioband_device *dp = gp->c_banddev;
+
+	if (current->flags & PF_KTHREAD || is_urgent_bio(bio)) {
+		/*
+		 * Kernel threads shouldn't be blocked easily since each of
+		 * them may handle BIOs for several groups on several
+		 * partitions.
+		 */
+		wait_event_lock_irq(dp->g_waitq, !device_should_block(gp),
+						dp->g_lock, do_nothing());
+	} else {
+		wait_event_lock_irq(gp->c_waitq, !group_should_block(gp),
+						dp->g_lock, do_nothing());
+	}
+}
+
+/* True when the bio should be requeued to dm instead of failed:
+ * the group is suspended and the target is in noflush suspend. */
+static inline int should_pushback_bio(struct ioband_group *gp)
+{
+	return is_group_suspended(gp) && dm_noflush_suspending(gp->c_target);
+}
+
+/* Count the bio as issued for its direction and run the policy's
+ * prepare hook.  Caller holds dp->g_lock. */
+static inline int prepare_to_issue(struct ioband_group *gp, struct bio *bio)
+{
+	struct ioband_device *dp = gp->c_banddev;
+
+	dp->g_issued[bio_data_dir(bio)]++;
+	return dp->g_prepare_bio(gp, bio, 0);
+}
+
+/* True while at least one direction (read or write) is below its
+ * in-flight limit. */
+static inline int room_for_bio(struct ioband_device *dp)
+{
+	return dp->g_issued[0] < dp->g_io_limit[0]
+		|| dp->g_issued[1] < dp->g_io_limit[1];
+}
+
+/*
+ * Park @bio until the conductor releases it.  Urgent bios go on the
+ * device-wide urgent list (and are prepared with IOBAND_URGENT);
+ * everything else is handed to the policy's hold hook and counted
+ * against the group.  Caller holds dp->g_lock.
+ */
+static void hold_bio(struct ioband_group *gp, struct bio *bio)
+{
+	struct ioband_device *dp = gp->c_banddev;
+
+	dp->g_blocked++;
+	if (is_urgent_bio(bio)) {
+		/*
+		 * ToDo:
+		 * When barrier mode is supported, write bios sharing the same
+		 * file system with the current one would be all moved
+		 * to g_urgent_bios list.
+		 * You don't have to care about barrier handling if the bio
+		 * is for swapping.
+		 */
+		dp->g_prepare_bio(gp, bio, IOBAND_URGENT);
+		bio_list_add(&dp->g_urgent_bios, bio);
+	} else {
+		gp->c_blocked++;
+		dp->g_hold_bio(gp, bio);
+	}
+}
+
+/* Per-direction variant of room_for_bio(): @direct is 0/1 (read/write). */
+static inline int room_for_bio_rw(struct ioband_device *dp, int direct)
+{
+	return dp->g_issued[direct] < dp->g_io_limit[direct];
+}
+
+/*
+ * Defer @bio onto the group's priority queue.  The queue holds bios of
+ * a single direction at a time; the direction is recorded when the
+ * first bio goes on an empty queue.
+ */
+static void push_prio_bio(struct ioband_group *gp, struct bio *bio, int direct)
+{
+	if (bio_list_empty(&gp->c_prio_bios))
+		set_prio_queue(gp, direct);
+	bio_list_add(&gp->c_prio_bios, bio);
+	gp->c_prio_blocked++;
+}
+
+/* Pop from the priority queue, clearing its direction marker when it
+ * becomes empty, and keeping c_prio_blocked in sync. */
+static struct bio *pop_prio_bio(struct ioband_group *gp)
+{
+	struct bio *bio = bio_list_pop(&gp->c_prio_bios);
+
+	if (bio_list_empty(&gp->c_prio_bios))
+		clear_prio_queue(gp);
+
+	if (bio)
+		gp->c_prio_blocked--;
+	return bio;
+}
+
+/*
+ * Move one formerly-blocked @bio to either @issue_list (normal resubmit,
+ * updating the per-direction deferred/sector stats) or @pushback_list
+ * (noflush suspend).  Unblocks waiters on the group once its blocked
+ * count drains to zero.  Returns the policy's prepare_to_issue() result.
+ * Caller holds dp->g_lock.
+ */
+static int make_issue_list(struct ioband_group *gp, struct bio *bio,
+		 struct bio_list *issue_list, struct bio_list *pushback_list)
+{
+	struct ioband_device *dp = gp->c_banddev;
+
+	dp->g_blocked--;
+	gp->c_blocked--;
+	if (!gp->c_blocked && is_group_blocked(gp)) {
+		clear_group_blocked(gp);
+		wake_up_all(&gp->c_waitq);
+	}
+	if (should_pushback_bio(gp))
+		bio_list_add(pushback_list, bio);
+	else {
+		int rw = bio_data_dir(bio);
+
+		gp->c_stat[rw].deferred++;
+		gp->c_stat[rw].sectors += bio_sectors(bio);
+		bio_list_add(issue_list, bio);
+	}
+	return prepare_to_issue(gp, bio);
+}
+
+/*
+ * Drain the device-wide urgent list onto @issue_list while the write
+ * direction still has room.  Urgent bios bypass the per-group
+ * accounting done in make_issue_list().
+ */
+static void release_urgent_bios(struct ioband_device *dp,
+		struct bio_list *issue_list, struct bio_list *pushback_list)
+{
+	struct bio *bio;
+
+	if (bio_list_empty(&dp->g_urgent_bios))
+		return;
+	while (room_for_bio_rw(dp, 1)) {
+		bio = bio_list_pop(&dp->g_urgent_bios);
+		if (!bio)
+			return;
+		dp->g_blocked--;
+		dp->g_issued[bio_data_dir(bio)]++;
+		bio_list_add(issue_list, bio);
+	}
+}
+
+/*
+ * Release as many priority-queued bios of @gp as the policy's token
+ * budget (g_can_submit) and the direction's in-flight room allow.
+ * Returns R_OK when done/out of room, R_BLOCK when the policy refuses,
+ * or the status propagated from make_issue_list().
+ * Caller holds dp->g_lock.
+ */
+static int release_prio_bios(struct ioband_group *gp,
+		struct bio_list *issue_list, struct bio_list *pushback_list)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	struct bio *bio;
+	int direct;
+	int ret;
+
+	if (bio_list_empty(&gp->c_prio_bios))
+		return R_OK;
+	direct = prio_queue_direct(gp);
+	while (gp->c_prio_blocked) {
+		if (!dp->g_can_submit(gp))
+			return R_BLOCK;
+		if (!room_for_bio_rw(dp, direct))
+			return R_OK;
+		bio = pop_prio_bio(gp);
+		if (!bio)
+			return R_OK;
+		ret = make_issue_list(gp, bio, issue_list, pushback_list);
+		if (ret)
+			return ret;
+	}
+	return R_OK;
+}
+
+/*
+ * Release the group's ordinary blocked bios (those not on the priority
+ * queue).  A bio whose direction has no room is parked on the priority
+ * queue instead of stalling the whole loop.  Same return convention as
+ * release_prio_bios().  Caller holds dp->g_lock.
+ */
+static int release_norm_bios(struct ioband_group *gp,
+		struct bio_list *issue_list, struct bio_list *pushback_list)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	struct bio *bio;
+	int direct;
+	int ret;
+
+	while (gp->c_blocked - gp->c_prio_blocked) {
+		if (!dp->g_can_submit(gp))
+			return R_BLOCK;
+		if (!room_for_bio(dp))
+			return R_OK;
+		bio = dp->g_pop_bio(gp);
+		if (!bio)
+			return R_OK;
+
+		direct = bio_data_dir(bio);
+		if (!room_for_bio_rw(dp, direct)) {
+			push_prio_bio(gp, bio, direct);
+			continue;
+		}
+		ret = make_issue_list(gp, bio, issue_list, pushback_list);
+		if (ret)
+			return ret;
+	}
+	return R_OK;
+}
+
+/* Priority bios first, then the normal queue; first error wins. */
+static inline int release_bios(struct ioband_group *gp,
+		struct bio_list *issue_list, struct bio_list *pushback_list)
+{
+	int ret = release_prio_bios(gp, issue_list, pushback_list);
+	if (ret)
+		return ret;
+	return release_norm_bios(gp, issue_list, pushback_list);
+}
+
+/*
+ * Map @bio to the sub-group chosen by the group type's t_getid()
+ * classifier.  Falls back to @head when the type has no classifier or
+ * no sub-group matches the id.
+ */
+static struct ioband_group *ioband_group_get(struct ioband_group *head,
+							struct bio *bio)
+{
+	struct ioband_group *gp;
+
+	if (!head->c_type->t_getid)
+		return head;
+
+	gp = ioband_group_find(head, head->c_type->t_getid(bio));
+
+	if (!gp)
+		gp = head;
+	return gp;
+}
+
+/*
+ * Start to control the bandwidth once the number of uncompleted BIOs
+ * exceeds the value of "io_throttle".
+ */
+/*
+ * Device-mapper map hook.  Waits out a device suspend, throttles the
+ * submitter, remaps the bio to the underlying device, and then either
+ * lets it through immediately (policy permitting and within limits) or
+ * holds it for the conductor.  Returns DM_MAPIO_REMAPPED,
+ * DM_MAPIO_SUBMITTED or DM_MAPIO_REQUEUE.
+ */
+static int ioband_map(struct dm_target *ti, struct bio *bio,
+						union map_info *map_context)
+{
+	struct ioband_group *gp = ti->private;
+	struct ioband_device *dp = gp->c_banddev;
+	unsigned long flags;
+	int rw;
+
+	spin_lock_irqsave(&dp->g_lock, flags);
+
+	/*
+	 * The device is suspended while some of the ioband device
+	 * configurations are being changed.
+	 */
+	if (is_device_suspended(dp))
+		wait_event_lock_irq(dp->g_waitq_suspend,
+			!is_device_suspended(dp), dp->g_lock, do_nothing());
+
+	/* classify the bio into its sub-group, then throttle */
+	gp = ioband_group_get(gp, bio);
+	prevent_burst_bios(gp, bio);
+	if (should_pushback_bio(gp)) {
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		return DM_MAPIO_REQUEUE;
+	}
+
+	/* remap to the underlying device */
+	bio->bi_bdev = gp->c_dev->bdev;
+	bio->bi_sector -= ti->begin;
+	rw = bio_data_dir(bio);
+
+	if (!gp->c_blocked && room_for_bio_rw(dp, rw)) {
+		if (dp->g_can_submit(gp)) {
+			/* fast path: issue immediately */
+			prepare_to_issue(gp, bio);
+			gp->c_stat[rw].immediate++;
+			gp->c_stat[rw].sectors += bio_sectors(bio);
+			spin_unlock_irqrestore(&dp->g_lock, flags);
+			return DM_MAPIO_REMAPPED;
+		} else if (!dp->g_blocked
+				&& dp->g_issued[0] + dp->g_issued[1] == 0) {
+			/* nothing in flight to replenish tokens via
+			 * end_io: let the conductor restart things */
+			dprintk(KERN_ERR "ioband_map: token expired "
+					"gp:%p bio:%p\n", gp, bio);
+			queue_delayed_work(dp->g_ioband_wq,
+							&dp->g_conductor, 1);
+		}
+	}
+	hold_bio(gp, bio);
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+
+	return DM_MAPIO_SUBMITTED;
+}
+
+/*
+ * Select the best group to resubmit its BIOs.
+ */
+/*
+ * Pick the blocked group with the highest g_can_submit() priority that
+ * still has room to issue in its pending direction.  Returns NULL when
+ * no group qualifies.  Caller holds dp->g_lock.
+ */
+static struct ioband_group *choose_best_group(struct ioband_device *dp)
+{
+	struct ioband_group *gp;
+	struct ioband_group *best = NULL;
+	int	highest = 0;
+	int	pri;
+
+	/* Todo: The algorithm should be optimized.
+	 *       It would be better to use rbtree.
+	 */
+	list_for_each_entry(gp, &dp->g_groups, c_list) {
+		if (!gp->c_blocked || !room_for_bio(dp))
+			continue;
+		/* only prio bios left, and their direction is full */
+		if (gp->c_blocked == gp->c_prio_blocked
+			&& !room_for_bio_rw(dp, prio_queue_direct(gp))) {
+			continue;
+		}
+		pri = dp->g_can_submit(gp);
+		if (pri > highest) {
+			highest = pri;
+			best = gp;
+		}
+	}
+
+	return best;
+}
+
+/*
+ * This function is called right after it becomes able to resubmit BIOs.
+ * It selects the best BIOs and passes them to the underlying layer.
+ */
+/*
+ * Workqueue handler (dp->g_conductor).  Collects releasable bios into
+ * local lists under the spinlock, wakes device-level waiters once below
+ * the blocked limit, possibly reschedules itself (R_YIELD or expired
+ * tokens), then submits/fails the collected bios outside the lock.
+ */
+static void ioband_conduct(struct work_struct *work)
+{
+	struct ioband_device *dp =
+		container_of(work, struct ioband_device, g_conductor.work);
+	struct ioband_group *gp = NULL;
+	struct bio *bio;
+	unsigned long flags;
+	struct bio_list issue_list, pushback_list;
+
+	bio_list_init(&issue_list);
+	bio_list_init(&pushback_list);
+
+	spin_lock_irqsave(&dp->g_lock, flags);
+	release_urgent_bios(dp, &issue_list, &pushback_list);
+	if (dp->g_blocked) {
+		gp = choose_best_group(dp);
+		if (gp && release_bios(gp, &issue_list, &pushback_list)
+								== R_YIELD)
+			/* group yielded: give others a turn, run again */
+			queue_delayed_work(dp->g_ioband_wq,
+							&dp->g_conductor, 0);
+	}
+
+	if (is_device_blocked(dp)
+	    && dp->g_blocked < dp->g_io_limit[0]+dp->g_io_limit[1]) {
+		/* back below the device-wide limit: release waiters */
+		clear_device_blocked(dp);
+		wake_up_all(&dp->g_waitq);
+	}
+
+	/* nothing released this round but bios remain blocked: ask the
+	 * policy to restart/refill and try again */
+	if (dp->g_blocked && room_for_bio_rw(dp, 0) && room_for_bio_rw(dp, 1) &&
+		bio_list_empty(&issue_list) && bio_list_empty(&pushback_list) &&
+		dp->g_restart_bios(dp)) {
+		dprintk(KERN_ERR "ioband_conduct: token expired dp:%p "
+			"issued(%d,%d) g_blocked(%d)\n", dp,
+			 dp->g_issued[0], dp->g_issued[1], dp->g_blocked);
+		queue_delayed_work(dp->g_ioband_wq, &dp->g_conductor, 0);
+	}
+
+
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+
+	/* submit outside the lock; pushed-back bios fail with -EIO */
+	while ((bio = bio_list_pop(&issue_list)))
+		generic_make_request(bio);
+	while ((bio = bio_list_pop(&pushback_list)))
+		bio_endio(bio, -EIO);
+}
+
+/*
+ * Device-mapper end_io hook.  Converts -EIO into DM_ENDIO_REQUEUE while
+ * the group is noflush-suspending, decrements the in-flight count for
+ * the bio's direction, and kicks the conductor (or the flush waiter
+ * during a suspend) as appropriate.
+ */
+static int ioband_end_io(struct dm_target *ti, struct bio *bio,
+				int error, union map_info *map_context)
+{
+	struct ioband_group *gp = ti->private;
+	struct ioband_device *dp = gp->c_banddev;
+	unsigned long flags;
+	int r = error;
+
+	/*
+	 *  XXX: A new error code for device mapper devices should be used
+	 *       rather than EIO.
+	 */
+	if (error == -EIO && should_pushback_bio(gp)) {
+		/* This ioband device is suspending */
+		r = DM_ENDIO_REQUEUE;
+	}
+	/*
+	 * Todo: The algorithm should be optimized to eliminate the spinlock.
+	 */
+	spin_lock_irqsave(&dp->g_lock, flags);
+	dp->g_issued[bio_data_dir(bio)]--;
+
+	/*
+	 * Todo: It would be better to introduce high/low water marks here
+	 * 	 not to kick the workqueues so often.
+	 */
+	if (dp->g_blocked)
+		queue_delayed_work(dp->g_ioband_wq, &dp->g_conductor, 0);
+	else if (is_device_suspended(dp)
+				&& dp->g_issued[0] + dp->g_issued[1] == 0)
+		/* last in-flight bio completed: let suspend finish */
+		wake_up_all(&dp->g_waitq_flush);
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+	return r;
+}
+
+/* Device-mapper presuspend hook: mark all groups down and suspended,
+ * flushing held bios via ioband_group_stop_all(). */
+static void ioband_presuspend(struct dm_target *ti)
+{
+	struct ioband_group *gp = ti->private;
+	struct ioband_device *dp = gp->c_banddev;
+
+	mutex_lock(&dp->g_lock_device);
+	ioband_group_stop_all(gp, 1);
+	mutex_unlock(&dp->g_lock_device);
+}
+
+/* Device-mapper resume hook: clear the down/suspended state set by
+ * presuspend on every group. */
+static void ioband_resume(struct dm_target *ti)
+{
+	struct ioband_group *gp = ti->private;
+	struct ioband_device *dp = gp->c_banddev;
+
+	mutex_lock(&dp->g_lock_device);
+	ioband_group_resume_all(gp);
+	mutex_unlock(&dp->g_lock_device);
+}
+
+
+/*
+ * Append one group's statistics to the status buffer: group id, then
+ * for each direction "total deferred sectors".  *szp is the running
+ * DMEMIT offset, updated on return.
+ */
+static void ioband_group_status(struct ioband_group *gp, int *szp,
+					char *result, unsigned int maxlen)
+{
+	struct ioband_group_stat *stat;
+	int i, sz = *szp; /* used in DMEMIT() */
+
+	DMEMIT(" %d", gp->c_id);
+	for (i = 0; i < 2; i++) {
+		stat = &gp->c_stat[i];
+		DMEMIT(" %lu %lu %lu",
+			stat->immediate + stat->deferred, stat->deferred,
+			stat->sectors);
+	}
+	*szp = sz;
+}
+
+/*
+ * Device-mapper status hook.  INFO emits the device name followed by
+ * the stats of the default group and each sub-group; TABLE emits the
+ * constructor-style parameters plus the policy's own g_show() output.
+ */
+static int ioband_status(struct dm_target *ti, status_type_t type,
+					char *result, unsigned int maxlen)
+{
+	struct ioband_group *gp = ti->private, *p;
+	struct ioband_device *dp = gp->c_banddev;
+	struct rb_node *node;
+	int sz = 0;	/* used in DMEMIT() */
+	unsigned long flags;
+
+	mutex_lock(&dp->g_lock_device);
+
+	switch (type) {
+	case STATUSTYPE_INFO:
+		spin_lock_irqsave(&dp->g_lock, flags);
+		DMEMIT("%s", dp->g_name);
+		ioband_group_status(gp, &sz, result, maxlen);
+		for (node = rb_first(&gp->c_group_root); node;
+						node = rb_next(node)) {
+			p = rb_entry(node, struct ioband_group, c_group_node);
+			ioband_group_status(p, &sz, result, maxlen);
+		}
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		break;
+
+	case STATUSTYPE_TABLE:
+		spin_lock_irqsave(&dp->g_lock, flags);
+		DMEMIT("%s %s %d %d %s %s",
+				gp->c_dev->name, dp->g_name,
+				dp->g_io_throttle, dp->g_io_limit[0],
+				gp->c_type->t_name, dp->g_policy->p_name);
+		dp->g_show(gp, &sz, result, maxlen);
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		break;
+	}
+
+	mutex_unlock(&dp->g_lock_device);
+	return 0;
+}
+
+/*
+ * Select the group classification type (pid/pgrp/cgroup/...) by @name
+ * from the dm_ioband_group_type table.  Refused with -EBUSY once
+ * sub-groups exist, since they were classified under the old type;
+ * -EINVAL for an unknown name.
+ */
+static int ioband_group_type_select(struct ioband_group *gp, char *name)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	struct group_type *t;
+	unsigned long flags;
+
+	for (t = dm_ioband_group_type; (t->t_name); t++) {
+		if (!strcmp(name, t->t_name))
+			break;
+	}
+	if (!t->t_name) {
+		DMWARN("ioband type select: %s isn't supported.", name);
+		return -EINVAL;
+	}
+	spin_lock_irqsave(&dp->g_lock, flags);
+	if (!RB_EMPTY_ROOT(&gp->c_group_root)) {
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		return -EBUSY;
+	}
+	gp->c_type = t;
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+
+	return 0;
+}
+
+/*
+ * Forward a "<cmd> [<id>:]<value>" message to the policy's g_set_param
+ * hook, targeting the sub-group named by the optional id prefix (or the
+ * default group for IOBAND_ID_ANY).
+ */
+static int ioband_set_param(struct ioband_group *gp, char *cmd, char *value)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	char *val_str;
+	long id;
+	unsigned long flags;
+	int r;
+
+	r = split_string(value, &id, &val_str);
+	if (r)
+		return r;
+
+	spin_lock_irqsave(&dp->g_lock, flags);
+	if (id != IOBAND_ID_ANY) {
+		gp = ioband_group_find(gp, id);
+		if (!gp) {
+			spin_unlock_irqrestore(&dp->g_lock, flags);
+			DMWARN("ioband_set_param: id=%ld not found.", id);
+			return -EINVAL;
+		}
+	}
+	r = dp->g_set_param(gp, cmd, val_str);
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+	return r;
+}
+
+/*
+ * Create a subgroup with the given ID under this ioband group.
+ * Requires a grouping type with a t_getid callback to be configured.
+ * Returns 0 on success, -EINVAL for a bad ID or missing type,
+ * -ENOMEM on allocation failure, or the error from group init.
+ */
+static int ioband_group_attach(struct ioband_group *gp, int id, char *param)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	struct ioband_group *new_gp;
+	int r;
+
+	/* Subgroups must carry a non-negative ID. */
+	if (id < 0) {
+		DMWARN("ioband_group_attach: invalid id:%d", id);
+		return -EINVAL;
+	}
+	/* Without a grouping type no bio can be mapped to an ID. */
+	if (!gp->c_type->t_getid) {
+		DMWARN("ioband_group_attach: "
+		       "no ioband group type is specified");
+		return -EINVAL;
+	}
+
+	new_gp = kzalloc(sizeof(struct ioband_group), GFP_KERNEL);
+	if (!new_gp)
+		return -ENOMEM;
+
+	r = ioband_group_init(new_gp, gp, dp, id, param);
+	if (r < 0) {
+		/* Undo the allocation; init failed before registration. */
+		kfree(new_gp);
+		return r;
+	}
+	return 0;
+}
+
+/*
+ * Remove the subgroup with the given ID from this ioband group.
+ * The whole ioband device is briefly suspended while the group is
+ * released so no bio is in flight for it (see the Todo below).
+ * Returns -EINVAL when the ID is negative or no such subgroup exists.
+ */
+static int ioband_group_detach(struct ioband_group *gp, int id)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	struct ioband_group *sub_gp;
+	unsigned long flags;
+
+	if (id < 0) {
+		DMWARN("ioband_group_detach: invalid id:%d", id);
+		return -EINVAL;
+	}
+	spin_lock_irqsave(&dp->g_lock, flags);
+	sub_gp = ioband_group_find(gp, id);
+	if (!sub_gp) {
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		DMWARN("ioband_group_detach: invalid id:%d", id);
+		return -EINVAL;
+	}
+
+	/*
+	 * Todo: Calling suspend_ioband_device() before releasing the
+	 *       ioband group has a large overhead. Need improvement.
+	 */
+	suspend_ioband_device(dp, flags, 0);
+	ioband_group_release(gp, sub_gp);
+	resume_ioband_device(dp);
+	spin_unlock_irqrestore(&dp->g_lock, flags);
+	return 0;
+}
+
+/*
+ * Message parameters:
+ *	"policy"      <name>
+ *       ex)
+ *		"policy" "weight"
+ *	"type"        "none"|"pid"|"pgrp"|"node"|"cpuset"|"cgroup"|"user"|"gid"
+ * 	"io_throttle" <value>
+ * 	"io_limit"    <value>
+ *	"attach"      <group id>
+ *	"detach"      <group id>
+ *	"any-command" <group id>:<value>
+ *       ex)
+ *		"weight" 0:<value>
+ *		"token"  24:<value>
+ */
+/*
+ * Handle a "dmsetup message" for this target.  Recognized commands:
+ * "reset" (clear statistics), "debug", "io_throttle", "io_limit",
+ * "type", "attach", "detach", "policy"; any other command is forwarded
+ * to the active policy as a "<group id>:<value>" parameter.
+ * The caller (ioband_message) holds dp->g_lock_device.
+ */
+static int __ioband_message(struct dm_target *ti,
+					unsigned int argc, char **argv)
+{
+	struct ioband_group *gp = ti->private, *p;
+	struct ioband_device *dp = gp->c_banddev;
+	struct rb_node *node;
+	long val;
+	int r = 0;
+	unsigned long flags;
+
+	/* "reset" clears the statistics of this group and all subgroups. */
+	if (argc == 1 && !strcmp(argv[0], "reset")) {
+		spin_lock_irqsave(&dp->g_lock, flags);
+		memset(gp->c_stat, 0, sizeof(gp->c_stat));
+		for (node = rb_first(&gp->c_group_root); node;
+						 node = rb_next(node)) {
+			p = rb_entry(node, struct ioband_group, c_group_node);
+			memset(p->c_stat, 0, sizeof(p->c_stat));
+		}
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		return 0;
+	}
+
+	/* Every remaining command takes exactly one argument. */
+	if (argc != 2) {
+		DMWARN("Unrecognised band message received.");
+		return -EINVAL;
+	}
+	if (!strcmp(argv[0], "debug")) {
+		r = strict_strtol(argv[1], 0, &val);
+		if (r || val < 0)
+			return -EINVAL;
+		ioband_debug = val;
+		return 0;
+	} else if (!strcmp(argv[0], "io_throttle")) {
+		r = strict_strtol(argv[1], 0, &val);
+		spin_lock_irqsave(&dp->g_lock, flags);
+		/* The throttle must not exceed either direction's limit. */
+		if (r || val < 0 ||
+			val > dp->g_io_limit[0] || val > dp->g_io_limit[1]) {
+			spin_unlock_irqrestore(&dp->g_lock, flags);
+			return -EINVAL;
+		}
+		dp->g_io_throttle = (val == 0) ? DEFAULT_IO_THROTTLE : val;
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		ioband_set_param(gp, argv[0], argv[1]);
+		return 0;
+	} else if (!strcmp(argv[0], "io_limit")) {
+		r = strict_strtol(argv[1], 0, &val);
+		if (r || val < 0)
+			return -EINVAL;
+		spin_lock_irqsave(&dp->g_lock, flags);
+		if (val == 0) {
+			/* 0 means "use the underlying queue's nr_requests". */
+			struct request_queue *q;
+
+			q = bdev_get_queue(gp->c_dev->bdev);
+			if (!q) {
+				spin_unlock_irqrestore(&dp->g_lock, flags);
+				return -ENXIO;
+			}
+			val = q->nr_requests;
+		}
+		if (val < dp->g_io_throttle) {
+			spin_unlock_irqrestore(&dp->g_lock, flags);
+			return -EINVAL;
+		}
+		dp->g_io_limit[0] = dp->g_io_limit[1] = val;
+		spin_unlock_irqrestore(&dp->g_lock, flags);
+		ioband_set_param(gp, argv[0], argv[1]);
+		return 0;
+	} else if (!strcmp(argv[0], "type")) {
+		return ioband_group_type_select(gp, argv[1]);
+	} else if (!strcmp(argv[0], "attach")) {
+		r = strict_strtol(argv[1], 0, &val);
+		if (r)
+			return r;
+		return ioband_group_attach(gp, val, NULL);
+	} else if (!strcmp(argv[0], "detach")) {
+		r = strict_strtol(argv[1], 0, &val);
+		if (r)
+			return r;
+		return ioband_group_detach(gp, val);
+	} else if (!strcmp(argv[0], "policy")) {
+		/* argc passed here is 0, so &argv[2] is never dereferenced. */
+		r = policy_init(dp, argv[1], 0, &argv[2]);
+		return r;
+	} else {
+		/* message anycommand <group-id>:<value> */
+		r = ioband_set_param(gp, argv[0], argv[1]);
+		if (r < 0)
+			DMWARN("Unrecognised band message received.");
+		return r;
+	}
+	return 0;	/* not reached: every branch above returns */
+}
+
+/*
+ * Device-mapper "message" callback: serialize message handling
+ * against other device-level operations, then dispatch.
+ */
+static int ioband_message(struct dm_target *ti, unsigned int argc, char **argv)
+{
+	struct ioband_group *gp = ti->private;
+	struct ioband_device *dp = gp->c_banddev;
+	int ret;
+
+	mutex_lock(&dp->g_lock_device);
+	ret = __ioband_message(ti, argc, argv);
+	mutex_unlock(&dp->g_lock_device);
+
+	return ret;
+}
+
+/*
+ * Device-mapper "merge" callback: ask the underlying queue how many
+ * bytes may be added to a bio at the remapped sector.  When the lower
+ * device has no merge_bvec_fn, any size up to max_size is acceptable.
+ */
+static int ioband_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
+					struct bio_vec *biovec, int max_size)
+{
+	struct ioband_group *gp = ti->private;
+	struct request_queue *q = bdev_get_queue(gp->c_dev->bdev);
+
+	if (!q->merge_bvec_fn)
+		return max_size;
+
+	/* Remap to the underlying device before consulting its queue. */
+	bvm->bi_bdev = gp->c_dev->bdev;
+	bvm->bi_sector -= ti->begin;
+
+	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
+}
+
+/* Device-mapper registration table for the "ioband" target. */
+static struct target_type ioband_target = {
+	.name	     = "ioband",
+	.module      = THIS_MODULE,
+	.version     = {1, 10, 0},
+	.ctr	     = ioband_ctr,
+	.dtr	     = ioband_dtr,
+	.map	     = ioband_map,
+	.end_io	     = ioband_end_io,
+	.presuspend  = ioband_presuspend,
+	.resume	     = ioband_resume,
+	.status	     = ioband_status,
+	.message     = ioband_message,
+	.merge       = ioband_merge,
+};
+
+/* Module entry point: register the "ioband" target with device-mapper. */
+static int __init dm_ioband_init(void)
+{
+	int r = dm_register_target(&ioband_target);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+	return r;
+}
+
+/* Module exit point: remove the "ioband" target registration. */
+static void __exit dm_ioband_exit(void)
+{
+	dm_unregister_target(&ioband_target);
+}
+
+module_init(dm_ioband_init);
+module_exit(dm_ioband_exit);
+
+MODULE_DESCRIPTION(DM_NAME " I/O bandwidth control");
+/* Fixed: the second author string was missing the closing '>'. */
+MODULE_AUTHOR("Hirokazu Takahashi <taka@valinux.co.jp>, "
+	      "Ryo Tsuruta <ryov@valinux.co.jp>");
+MODULE_LICENSE("GPL");
Index: linux-2.6/drivers/md/dm-ioband-policy.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/md/dm-ioband-policy.c
@@ -0,0 +1,460 @@
+/*
+ * Copyright (C) 2008 VA Linux Systems Japan K.K.
+ *
+ *  I/O bandwidth control
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/bio.h>
+#include <linux/workqueue.h>
+#include <linux/rbtree.h>
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "dm-ioband.h"
+
+/*
+ * The following functions determine when and which BIOs should
+ * be submitted to control the I/O flow.
+ * It is possible to add a new BIO scheduling policy with it.
+ */
+
+
+/*
+ * Functions for weight balancing policy based on the number of I/Os.
+ */
+#define DEFAULT_WEIGHT		100
+#define DEFAULT_TOKENPOOL	2048
+#define DEFAULT_BUCKET		2
+#define IOBAND_IOPRIO_BASE	100
+#define TOKEN_BATCH_UNIT	20
+#define PROCEED_THRESHOLD	8
+#define	LOCAL_ACTIVE_RATIO	8
+#define	GLOBAL_ACTIVE_RATIO	16
+#define OVERCOMMIT_RATE		4
+
+/*
+ * Calculate the effective number of tokens this group has.
+ */
+/*
+ * Calculate the effective number of tokens this group has: the tokens
+ * on hand plus credit for epochs it has not yet caught up with
+ * (capped at g_carryover), plus a bonus when the group is going down
+ * so its remaining I/Os drain quickly.
+ */
+static int get_token(struct ioband_group *gp)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	int epochs_behind = dp->g_epoch - gp->c_my_epoch;
+	int token = gp->c_token;
+
+	if (epochs_behind) {
+		if (epochs_behind > dp->g_carryover)
+			epochs_behind = dp->g_carryover;
+		token += gp->c_token_initial * epochs_behind;
+	}
+	if (is_group_down(gp))
+		token += gp->c_token_initial * dp->g_carryover * 2;
+
+	return token;
+}
+
+/*
+ * Calculate the priority of a given group.
+ */
+static int iopriority(struct ioband_group *gp)
+{
+	/* Normalize remaining tokens by the group's per-epoch share so
+	   groups of different weights compare fairly; +1 keeps the
+	   result non-zero. */
+	return get_token(gp) * IOBAND_IOPRIO_BASE / gp->c_token_initial + 1;
+}
+
+/*
+ * This function is called when all the active group on the same ioband
+ * device has used up their tokens. It makes a new global epoch so that
+ * all groups on this device will get freshly assigned tokens.
+ */
+/*
+ * Returns 1 when a new epoch was started, 0 when the dominant group
+ * should keep running on its remaining tokens.
+ */
+static int make_global_epoch(struct ioband_device *dp)
+{
+	struct ioband_group *gp = dp->g_dominant;
+
+	/*
+	 * Don't make a new epoch if the dominant group still has a lot of
+	 * tokens, except when the I/O load is low.
+	 */
+	if (gp) {
+		int iopri = iopriority(gp);
+		if (iopri * PROCEED_THRESHOLD > IOBAND_IOPRIO_BASE &&
+			dp->g_issued[0] + dp->g_issued[1] >= dp->g_io_throttle)
+			return 0;
+	}
+
+	dp->g_epoch++;
+	dprintk(KERN_ERR "make_epoch %d --> %d\n",
+						dp->g_epoch-1, dp->g_epoch);
+
+	/* The leftover tokens will be used in the next epoch. */
+	dp->g_token_extra = dp->g_token_left;
+	if (dp->g_token_extra < 0)
+		dp->g_token_extra = 0;
+	dp->g_token_left = dp->g_token_bucket;
+
+	/* Start afresh: no group has expired or dominates yet. */
+	dp->g_expired = NULL;
+	dp->g_dominant = NULL;
+
+	return 1;
+}
+
+/*
+ * This function is called when this group has used up its own tokens.
+ * It will check whether it's possible to make a new epoch of this group.
+ */
+/*
+ * Advance this group to the current device epoch.  Returns how many
+ * epochs' worth of tokens it may credit itself (at most g_carryover),
+ * or 0 when it is already up to date.
+ */
+static inline int make_epoch(struct ioband_group *gp)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	int behind = dp->g_epoch - gp->c_my_epoch;
+
+	if (!behind)
+		return 0;
+	gp->c_my_epoch = dp->g_epoch;
+	return behind > dp->g_carryover ? dp->g_carryover : behind;
+}
+
+/*
+ * Check whether this group has tokens to issue an I/O. Return 0 if it
+ * doesn't have any, otherwise return the priority of this group.
+ */
+static int is_token_left(struct ioband_group *gp)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	int allowance;
+	int delta;
+	int extra;
+
+	if (gp->c_token > 0)
+		return iopriority(gp);
+
+	/* A group going down is always refilled so it can drain. */
+	if (is_group_down(gp)) {
+		gp->c_token = gp->c_token_initial;
+		return iopriority(gp);
+	}
+	allowance = make_epoch(gp);
+	if (!allowance)
+		return 0;
+	/*
+	 * If this group has the right to get tokens for several epochs,
+	 * give all of them to the group here.
+	 */
+	delta = gp->c_token_initial * allowance;
+	dp->g_token_left -= delta;
+	/*
+	 * Give some extra tokens to this group when there have left unused
+	 * tokens on this ioband device from the previous epoch.
+	 */
+	extra = dp->g_token_extra * gp->c_token_initial /
+				 (dp->g_token_bucket - dp->g_token_extra/2);
+	delta += extra;
+	gp->c_token += delta;
+	gp->c_consumed = 0;
+
+	/* Push the yield mark out so the refill doesn't trigger a yield. */
+	if (gp == dp->g_current)
+		dp->g_yield_mark += delta;
+	dprintk(KERN_ERR "refill token: "
+		"gp:%p token:%d->%d extra(%d) allowance(%d)\n",
+		gp, gp->c_token - delta, gp->c_token, extra, allowance);
+	if (gp->c_token > 0)
+		return iopriority(gp);
+	dprintk(KERN_ERR "refill token: yet empty gp:%p token:%d\n",
+						gp, gp->c_token);
+	return 0;
+}
+
+/*
+ * Use tokens to issue an I/O. After the operation, the number of tokens left
+ * on this group may become negative value, which will be treated as debt.
+ */
+static int consume_token(struct ioband_group *gp, int count, int flag)
+{
+	struct ioband_device *dp = gp->c_banddev;
+
+	if (gp->c_consumed * LOCAL_ACTIVE_RATIO < gp->c_token_initial &&
+		gp->c_consumed * GLOBAL_ACTIVE_RATIO < dp->g_token_bucket) {
+		; /* Do nothing unless this group is really active. */
+	} else if (!dp->g_dominant ||
+			get_token(gp) > get_token(dp->g_dominant)) {
+		/*
+		 * Regard this group as the dominant group on this
+		 * ioband device when it has larger number of tokens
+		 * than those of the previous one.
+		 */
+		dp->g_dominant = gp;
+	}
+	if (dp->g_epoch == gp->c_my_epoch &&
+			gp->c_token > 0 && gp->c_token - count <= 0) {
+		/* Remember the last group which used up its own tokens. */
+		dp->g_expired = gp;
+		if (dp->g_dominant == gp)
+			dp->g_dominant = NULL;
+	}
+
+	if (gp != dp->g_current) {
+		/* This group now becomes the current group and gets a
+		   fresh yield mark.  (The old comment here claimed the
+		   opposite of what the condition tests.) */
+		dp->g_current = gp;
+		dp->g_yield_mark =
+			gp->c_token - (TOKEN_BATCH_UNIT << dp->g_token_unit);
+	}
+	gp->c_token -= count;
+	gp->c_consumed += count;
+	if (gp->c_token <= dp->g_yield_mark && !(flag & IOBAND_URGENT)) {
+		/*
+		 * Return-value 1 means that this policy requests dm-ioband
+		 * to give a chance to another group to be selected since
+		 * this group has already issued enough amount of I/Os.
+		 */
+		dp->g_current = NULL;
+		return R_YIELD;
+	}
+	/*
+	 * Return-value 0 means that this policy allows dm-ioband to select
+	 * this group to issue I/Os without a break.
+	 */
+	return R_OK;
+}
+
+/*
+ * Consume one token on each I/O.
+ */
+/* Weight policy: charge a flat one token per bio. */
+static int prepare_token(struct ioband_group *gp, struct bio *bio, int flag)
+{
+	return consume_token(gp, 1, flag);
+}
+
+/*
+ * Check if this group is able to receive a new bio.
+ */
+static int is_queue_full(struct ioband_group *gp)
+{
+	/* A group may hold at most c_limit blocked bios. */
+	return gp->c_blocked >= gp->c_limit;
+}
+
+/*
+ * Set a new weight for this group and redistribute the token shares
+ * and blocked-bio limits of every group on the device proportionally
+ * to their weights.
+ */
+static void set_weight(struct ioband_group *gp, int new)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	struct ioband_group *p;
+
+	dp->g_weight_total += (new - gp->c_weight);
+	gp->c_weight = new;
+
+	if (dp->g_weight_total == 0) {
+		/* No weight left anywhere: give each group a minimal share. */
+		list_for_each_entry(p, &dp->g_groups, c_list)
+			p->c_token = p->c_token_initial = p->c_limit = 1;
+	} else {
+		list_for_each_entry(p, &dp->g_groups, c_list) {
+			/* Token share proportional to weight; +1 avoids 0. */
+			p->c_token = p->c_token_initial =
+				dp->g_token_bucket * p->c_weight /
+				dp->g_weight_total + 1;
+			p->c_limit = (dp->g_io_limit[0] + dp->g_io_limit[1]) *
+				p->c_weight / dp->g_weight_total /
+				OVERCOMMIT_RATE + 1;
+		}
+	}
+}
+
+/*
+ * (Re)initialize the per-epoch token bucket and the carryover limit.
+ * A zero token_bucket derives the size from the device's I/O limits;
+ * a zero carryover derives it from DEFAULT_TOKENPOOL.  The carryover
+ * is clamped to at least 1 and the epoch's token counter is reset.
+ */
+static void init_token_bucket(struct ioband_device *dp,
+					int token_bucket, int carryover)
+{
+	dp->g_token_bucket = token_bucket ? token_bucket :
+		((dp->g_io_limit[0] + dp->g_io_limit[1]) *
+					DEFAULT_BUCKET) << dp->g_token_unit;
+
+	dp->g_carryover = carryover ? carryover :
+		(DEFAULT_TOKENPOOL << dp->g_token_unit) / dp->g_token_bucket;
+	if (dp->g_carryover < 1)
+		dp->g_carryover = 1;
+
+	dp->g_token_left = 0;
+}
+
+/*
+ * g_set_param callback of the weight policy.  Handles "weight",
+ * "token", "carryover" and "io_limit"; returns -EINVAL for anything
+ * else or for an out-of-range value.
+ */
+static int policy_weight_param(struct ioband_group *gp, char *cmd, char *value)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	long val;
+	int err = strict_strtol(value, 0, &val);
+
+	if (!strcmp(cmd, "weight")) {
+		if (err || val <= 0 || val > SHORT_MAX)
+			return -EINVAL;
+		set_weight(gp, val);
+		return 0;
+	}
+	if (!strcmp(cmd, "token")) {
+		if (err || val < 0)
+			return -EINVAL;
+		init_token_bucket(dp, val, 0);
+		set_weight(gp, gp->c_weight);
+		dp->g_token_extra = 0;
+		return 0;
+	}
+	if (!strcmp(cmd, "carryover")) {
+		if (err || val < 0)
+			return -EINVAL;
+		init_token_bucket(dp, dp->g_token_bucket, val);
+		set_weight(gp, gp->c_weight);
+		dp->g_token_extra = 0;
+		return 0;
+	}
+	if (!strcmp(cmd, "io_limit")) {
+		/* The value is ignored; rescale from the current limits. */
+		init_token_bucket(dp, 0, 0);
+		set_weight(gp, gp->c_weight);
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/*
+ * g_group_ctr callback of the weight policy: start the group in the
+ * current epoch with zero weight and consumption, then assign the
+ * requested (or default) weight through the common parameter path.
+ */
+static int policy_weight_ctr(struct ioband_group *gp, char *arg)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	char *weight = arg ? arg : __stringify(DEFAULT_WEIGHT);
+
+	gp->c_my_epoch = dp->g_epoch;
+	gp->c_weight = 0;
+	gp->c_consumed = 0;
+	return policy_weight_param(gp, "weight", weight);
+}
+
+/*
+ * g_group_dtr callback: return the group's share to the pool and drop
+ * any cached scheduling references the device may hold to it.
+ */
+static void policy_weight_dtr(struct ioband_group *gp)
+{
+	struct ioband_device *dp = gp->c_banddev;
+	set_weight(gp, 0);
+	dp->g_dominant = NULL;
+	dp->g_expired = NULL;
+}
+
+/*
+ * g_show callback: emit the token bucket size, this group's weight
+ * (" :<weight>"), then "<id>:<weight>" for every subgroup.
+ */
+static void policy_weight_show(struct ioband_group *gp, int *szp,
+					char *result, unsigned int maxlen)
+{
+	struct ioband_group *p;
+	struct ioband_device *dp = gp->c_banddev;
+	struct rb_node *node;
+	int sz = *szp; /* used in DMEMIT() */
+
+	DMEMIT(" %d :%d", dp->g_token_bucket, gp->c_weight);
+
+	for (node = rb_first(&gp->c_group_root); node; node = rb_next(node)) {
+		p = rb_entry(node, struct ioband_group, c_group_node);
+		DMEMIT(" %d:%d", p->c_id, p->c_weight);
+	}
+	*szp = sz;
+}
+
+/*
+ *  <Method>      <description>
+ * g_can_submit   : To determine whether a given group has the right to
+ *                  submit BIOs. The larger the return value the higher the
+ *                  priority to submit. Zero means it has no right.
+ * g_prepare_bio  : Called right before submitting each BIO.
+ * g_restart_bios : Called if this ioband device has some BIOs blocked but none
+ *                  of them can be submitted now. This method has to
+ *                  reinitialize the data to restart to submit BIOs and return
+ *                  0 or 1.
+ *                  The return value 0 means that it has become able to submit
+ *                  them now so that this ioband device will continue its work.
+ *                  The return value 1 means that it is still unable to submit
+ *                  them so that this device will stop its work. And this
+ *                  policy module has to reactivate the device when it gets
+ *                  to be able to submit BIOs.
+ * g_hold_bio     : To hold a given BIO until it is submitted.
+ *                  The default function is used when this method is undefined.
+ * g_pop_bio      : To select and get the best BIO to submit.
+ * g_group_ctr    : To initialize the policy's own members of struct ioband_group.
+ * g_group_dtr    : Called when struct ioband_group is removed.
+ * g_set_param    : To update the policy's own data.
+ *                  The parameters can be passed through "dmsetup message"
+ *                  command.
+ * g_should_block : Called every time this ioband device receive a BIO.
+ *                  Return 1 if a given group can't receive any more BIOs,
+ *                  otherwise return 0.
+ * g_show         : Show the configuration.
+ */
+/*
+ * Initialize the weight-balancing policy: install the policy
+ * callbacks, reset the device-wide scheduling state and size the
+ * token bucket (optional first argument, 0 = use the default).
+ */
+static int policy_weight_init(struct ioband_device *dp, int argc, char **argv)
+{
+	long bucket = 0;
+
+	if (argc >= 1) {
+		int err = strict_strtol(argv[0], 0, &bucket);
+		if (err || bucket < 0)
+			return -EINVAL;
+	}
+
+	/* Hook up the weight-policy callbacks. */
+	dp->g_can_submit = is_token_left;
+	dp->g_prepare_bio = prepare_token;
+	dp->g_restart_bios = make_global_epoch;
+	dp->g_group_ctr = policy_weight_ctr;
+	dp->g_group_dtr = policy_weight_dtr;
+	dp->g_set_param = policy_weight_param;
+	dp->g_should_block = is_queue_full;
+	dp->g_show  = policy_weight_show;
+
+	/* Reset the device-wide scheduling state. */
+	dp->g_epoch = 0;
+	dp->g_weight_total = 0;
+	dp->g_current = NULL;
+	dp->g_dominant = NULL;
+	dp->g_expired = NULL;
+	dp->g_token_extra = 0;
+	dp->g_token_unit = 0;
+	init_token_bucket(dp, bucket, 0);
+	dp->g_token_left = dp->g_token_bucket;
+
+	return 0;
+}
+/* weight balancing policy based on the number of I/Os. --- End --- */
+
+
+/*
+ * Functions for weight balancing policy based on I/O size.
+ * It just borrows a lot of functions from the regular weight balancing policy.
+ */
+/* Size-based weight policy: charge tokens per sector of the bio. */
+static int w2_prepare_token(struct ioband_group *gp, struct bio *bio, int flag)
+{
+	/* Consume tokens depending on the size of a given bio. */
+	return consume_token(gp, bio_sectors(bio), flag);
+}
+
+/*
+ * Initialize the "weight-iosize" policy: set up the regular weight
+ * policy first, then override the pieces that differ — per-sector
+ * token accounting and a token unit scaled to page-sized I/O.
+ */
+static int w2_policy_weight_init(struct ioband_device *dp,
+							int argc, char **argv)
+{
+	long val;
+	int r = 0;
+
+	/* Optional first argument: the token bucket size. */
+	if (argc < 1)
+		val = 0;
+	else {
+		r = strict_strtol(argv[0], 0, &val);
+		if (r || val < 0)
+			return -EINVAL;
+	}
+
+	r = policy_weight_init(dp, argc, argv);
+	if (r < 0)
+		return r;
+
+	dp->g_prepare_bio = w2_prepare_token;
+	dp->g_token_unit = PAGE_SHIFT - 9;
+	/* Recompute the bucket with the new token unit in effect. */
+	init_token_bucket(dp, val, 0);
+	dp->g_token_left = dp->g_token_bucket;
+	return 0;
+}
+/* weight balancing policy based on I/O size. --- End --- */
+
+
+/* The "default" policy is an alias for the weight policy. */
+static int policy_default_init(struct ioband_device *dp,
+					int argc, char **argv)
+{
+	return policy_weight_init(dp, argc, argv);
+}
+
+/* Policy table, looked up by name; the NULL-named entry terminates it
+   and presumably acts as the fallback — confirm in policy_init(). */
+struct policy_type dm_ioband_policy_type[] = {
+	{"default", policy_default_init},
+	{"weight", policy_weight_init},
+	{"weight-iosize", w2_policy_weight_init},
+	{NULL,     policy_default_init}
+};
Index: linux-2.6/drivers/md/dm-ioband-type.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/md/dm-ioband-type.c
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2008 VA Linux Systems Japan K.K.
+ *
+ *  I/O bandwidth control
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/bio.h>
+#include "dm.h"
+#include "dm-bio-list.h"
+#include "dm-ioband.h"
+
+/*
+ * Any I/O bandwidth can be divided into several bandwidth groups, each of which
+ * has its own unique ID. The following functions are called to determine
+ * which group a given BIO belongs to and return the ID of the group.
+ */
+
+/* ToDo: unsigned long value would be better for group ID */
+
+/* Group by the submitting task's thread-group ID (i.e. per process). */
+static int ioband_process_id(struct bio *bio)
+{
+	/*
+	 * This function will work for KVM and Xen.
+	 */
+	return (int)current->tgid;
+}
+
+/* Group by the submitting task's process group ID. */
+static int ioband_process_group(struct bio *bio)
+{
+	return (int)task_pgrp_nr(current);
+}
+
+/* Group by the UID of the submitting task. */
+static int ioband_uid(struct bio *bio)
+{
+	return (int)current_uid();
+}
+
+/* Group by the GID of the submitting task. */
+static int ioband_gid(struct bio *bio)
+{
+	return (int)current_gid();
+}
+
+/* Group by cpuset — placeholder, maps everything to group 0. */
+static int ioband_cpuset(struct bio *bio)
+{
+	return 0;	/* not implemented yet */
+}
+
+/* Group by NUMA node — placeholder, maps everything to group 0. */
+static int ioband_node(struct bio *bio)
+{
+	return 0;	/* not implemented yet */
+}
+
+static int ioband_cgroup(struct bio *bio)
+{
+	/*
+	 * This function should return the ID of the cgroup which issued
+	 * "bio".  The ID of the cgroup which the current process belongs
+	 * to won't be a suitable ID for this purpose, since some BIOs
+	 * will be handled by kernel threads like aio or pdflush on
+	 * behalf of the process requesting the BIOs.
+	 */
+	return 0;	/* not implemented yet */
+}
+
+/* Grouping-type table, looked up by name.  "none" has no t_getid and
+   disables ID-based grouping; "user" is an alias for "uid"; the NULL
+   entry terminates the table. */
+struct group_type dm_ioband_group_type[] = {
+	{"none",   NULL},
+	{"pgrp",   ioband_process_group},
+	{"pid",    ioband_process_id},
+	{"node",   ioband_node},
+	{"cpuset", ioband_cpuset},
+	{"cgroup", ioband_cgroup},
+	{"user",   ioband_uid},
+	{"uid",    ioband_uid},
+	{"gid",    ioband_gid},
+	{NULL,     NULL}
+};
Index: linux-2.6/drivers/md/dm-ioband.h
===================================================================
--- /dev/null
+++ linux-2.6/drivers/md/dm-ioband.h
@@ -0,0 +1,194 @@
+/*
+ * Copyright (C) 2008 VA Linux Systems Japan K.K.
+ *
+ *  I/O bandwidth control
+ *
+ * This file is released under the GPL.
+ */
+
+#include <linux/version.h>
+#include <linux/wait.h>
+
+#define DEFAULT_IO_THROTTLE	4
+#define DEFAULT_IO_LIMIT	128
+#define IOBAND_NAME_MAX 31
+#define IOBAND_ID_ANY (-1)
+
+struct ioband_group;
+
+/* Per-physical-device state shared by all ioband groups on it. */
+struct ioband_device {
+	/* all ioband groups attached to this device */
+	struct list_head	g_groups;
+	/* deferred work — presumably issues queued bios; confirm in ctl.c */
+	struct delayed_work     g_conductor;
+	struct workqueue_struct	*g_ioband_wq;
+	struct	bio_list	g_urgent_bios;
+	/* in-flight I/O threshold used to judge whether load is low */
+	int	g_io_throttle;
+	/* in-flight limits; [0]/[1] presumably read/write — confirm */
+	int	g_io_limit[2];
+	/* in-flight I/O counts, indexed like g_io_limit */
+	int	g_issued[2];
+	int	g_blocked;
+	/* protects the scheduling state of the device and its groups */
+	spinlock_t	g_lock;
+	/* serializes device-level operations (status, message, ...) */
+	struct mutex	g_lock_device;
+	wait_queue_head_t g_waitq;
+	wait_queue_head_t g_waitq_suspend;
+	wait_queue_head_t g_waitq_flush;
+
+	int	g_ref;
+	struct	list_head g_list;
+	/* DEV_* flags (blocked/suspended) */
+	int	g_flags;
+	/* device name reported by the status callback */
+	char	g_name[IOBAND_NAME_MAX + 1];
+	/* the active bio scheduling policy */
+	struct	policy_type *g_policy;
+
+	/* policy dependent */
+	int	(*g_can_submit)(struct ioband_group *);
+	int	(*g_prepare_bio)(struct ioband_group *, struct bio *, int);
+	int	(*g_restart_bios)(struct ioband_device *);
+	void	(*g_hold_bio)(struct ioband_group *, struct bio *);
+	struct bio * (*g_pop_bio)(struct ioband_group *);
+	int	(*g_group_ctr)(struct ioband_group *, char *);
+	void	(*g_group_dtr)(struct ioband_group *);
+	int	(*g_set_param)(struct ioband_group *, char *cmd, char *value);
+	int	(*g_should_block)(struct ioband_group *);
+	void	(*g_show)(struct ioband_group *, int *, char *, unsigned int);
+
+	/* members for weight balancing policy */
+	int	g_epoch;
+	int	g_weight_total;
+		/* the number of tokens which can be used in every epoch */
+	int	g_token_bucket;
+		/* how many epochs tokens can be carried over */
+	int	g_carryover;
+		/* how many tokens should be used for one page-sized I/O */
+	int	g_token_unit;
+		/* the last group which used a token */
+	struct ioband_group *g_current;
+		/* give another group a chance to be scheduled when the rest
+		   of tokens of the current group reaches this mark */
+	int	g_yield_mark;
+		/* the latest group which used up its tokens */
+	struct ioband_group *g_expired;
+		/* the group which has the largest number of tokens in the
+		   active groups */
+	struct ioband_group *g_dominant;
+		/* the number of unused tokens in this epoch */
+	int	g_token_left;
+		/* left-over tokens from the previous epoch */
+	int	g_token_extra;
+};
+
+/* Per-direction I/O statistics of a group, reported via "status". */
+struct ioband_group_stat {
+	unsigned long	sectors;	/* total sectors transferred */
+	unsigned long	immediate;	/* bios issued without delay */
+	unsigned long	deferred;	/* bios that had to be queued */
+};
+
+/* One bandwidth group (a dm target or one of its subgroups). */
+struct ioband_group {
+	/* membership in the owning device's g_groups list */
+	struct	list_head c_list;
+	/* the ioband device this group belongs to */
+	struct ioband_device *c_banddev;
+	struct dm_dev *c_dev;
+	struct dm_target *c_target;
+	struct	bio_list c_blocked_bios;
+	struct	bio_list c_prio_bios;
+	/* rbtree of subgroups, keyed by ID (see ioband_group_find) */
+	struct	rb_root c_group_root;
+	struct  rb_node c_group_node;
+	int	c_id;	/* should be unsigned long or unsigned long long */
+	char	c_name[IOBAND_NAME_MAX + 1];	/* rfu */
+	/* number of blocked bios; compared against c_limit */
+	int	c_blocked;
+	int	c_prio_blocked;
+	wait_queue_head_t c_waitq;
+	/* IOG_* flags */
+	int	c_flags;
+	struct	ioband_group_stat c_stat[2];	/* hold rd/wr status */
+	/* how bios are mapped to subgroup IDs */
+	struct	group_type *c_type;
+
+	/* members for weight balancing policy */
+	int	c_weight;	/* this group's share of the bandwidth */
+	int	c_my_epoch;	/* last epoch tokens were credited for */
+	int	c_token;	/* tokens on hand (may go negative: debt) */
+	int	c_token_initial;	/* tokens granted per epoch */
+	int	c_limit;	/* max blocked bios (see is_queue_full) */
+	int     c_consumed;	/* tokens used since the last refill */
+
+	/* rfu */
+	/* struct bio_list	c_ordered_tag_bios; */
+};
+
+#define IOBAND_URGENT 1
+
+#define DEV_BIO_BLOCKED		1
+#define DEV_SUSPENDED		2
+
+#define set_device_blocked(dp)		((dp)->g_flags |= DEV_BIO_BLOCKED)
+#define clear_device_blocked(dp)	((dp)->g_flags &= ~DEV_BIO_BLOCKED)
+#define is_device_blocked(dp)		((dp)->g_flags & DEV_BIO_BLOCKED)
+
+#define set_device_suspended(dp)	((dp)->g_flags |= DEV_SUSPENDED)
+#define clear_device_suspended(dp)	((dp)->g_flags &= ~DEV_SUSPENDED)
+#define is_device_suspended(dp)		((dp)->g_flags & DEV_SUSPENDED)
+
+#define IOG_PRIO_BIO_WRITE	1
+#define IOG_PRIO_QUEUE		2
+#define IOG_BIO_BLOCKED		4
+#define IOG_GOING_DOWN		8
+#define IOG_SUSPENDED		16
+#define IOG_NEED_UP		32
+
+/* return codes of the policy callbacks */
+#define R_OK		0
+#define R_BLOCK		1
+#define R_YIELD		2
+
+#define set_group_blocked(gp)		((gp)->c_flags |= IOG_BIO_BLOCKED)
+#define clear_group_blocked(gp)		((gp)->c_flags &= ~IOG_BIO_BLOCKED)
+#define is_group_blocked(gp)		((gp)->c_flags & IOG_BIO_BLOCKED)
+
+#define set_group_down(gp)		((gp)->c_flags |= IOG_GOING_DOWN)
+#define clear_group_down(gp)		((gp)->c_flags &= ~IOG_GOING_DOWN)
+#define is_group_down(gp)		((gp)->c_flags & IOG_GOING_DOWN)
+
+#define set_group_suspended(gp)		((gp)->c_flags |= IOG_SUSPENDED)
+#define clear_group_suspended(gp)	((gp)->c_flags &= ~IOG_SUSPENDED)
+#define is_group_suspended(gp)		((gp)->c_flags & IOG_SUSPENDED)
+
+#define set_group_need_up(gp)		((gp)->c_flags |= IOG_NEED_UP)
+#define clear_group_need_up(gp)		((gp)->c_flags &= ~IOG_NEED_UP)
+#define group_need_up(gp)		((gp)->c_flags & IOG_NEED_UP)
+
+#define set_prio_read(gp)		((gp)->c_flags |= IOG_PRIO_QUEUE)
+#define clear_prio_read(gp)		((gp)->c_flags &= ~IOG_PRIO_QUEUE)
+/*
+ * Note: "==" binds more tightly than "&" in C, so the mask expression
+ * must be parenthesized before the comparison.  Without the extra
+ * parentheses, is_prio_read()/is_prio_write() reduced to
+ * "c_flags & 0"/"c_flags & 1" and never tested the intended bits.
+ */
+#define is_prio_read(gp) \
+	(((gp)->c_flags & (IOG_PRIO_QUEUE|IOG_PRIO_BIO_WRITE)) == \
+		IOG_PRIO_QUEUE)
+
+#define set_prio_write(gp) \
+	((gp)->c_flags |= (IOG_PRIO_QUEUE|IOG_PRIO_BIO_WRITE))
+#define clear_prio_write(gp) \
+	((gp)->c_flags &= ~(IOG_PRIO_QUEUE|IOG_PRIO_BIO_WRITE))
+#define is_prio_write(gp) \
+	(((gp)->c_flags & (IOG_PRIO_QUEUE|IOG_PRIO_BIO_WRITE)) == \
+		(IOG_PRIO_QUEUE|IOG_PRIO_BIO_WRITE))
+
+#define set_prio_queue(gp, direct) \
+	((gp)->c_flags |= (IOG_PRIO_QUEUE|direct))
+#define clear_prio_queue(gp)		clear_prio_write(gp)
+#define is_prio_queue(gp)		((gp)->c_flags & IOG_PRIO_QUEUE)
+#define prio_queue_direct(gp)		((gp)->c_flags & IOG_PRIO_BIO_WRITE)
+
+
+/* A bio scheduling policy, selected by name; p_policy_init installs
+   the policy's g_* callbacks on the ioband device. */
+struct policy_type {
+	const char *p_name;
+	int	  (*p_policy_init)(struct ioband_device *, int, char **);
+};
+
+extern struct policy_type dm_ioband_policy_type[];
+
+/* A grouping type: t_getid maps a bio to the ID of the group that
+   should be charged for it (NULL means no ID-based grouping). */
+struct group_type {
+	const char *t_name;
+	int	  (*t_getid)(struct bio *);
+};
+
+extern struct group_type dm_ioband_group_type[];
+
+/* Just for debugging */
+extern long ioband_debug;
+/* Self-limiting debug printk: each message decrements ioband_debug,
+   so setting it to N (via "message ... debug N") prints at most N
+   further messages. */
+#define dprintk(format, a...) do { \
+	if (ioband_debug > 0) {	\
+		ioband_debug--; \
+		printk(format, ##a); \
+	} \
+} while (0)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 2/2] dm-ioband: I/O bandwidth controller v1.10.0: Document
  2009-01-20  5:11 ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Ryo Tsuruta
@ 2009-01-20  5:12   ` Ryo Tsuruta
  2009-01-20 14:52   ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Alasdair G Kergon
                     ` (2 subsequent siblings)
  3 siblings, 0 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-20  5:12 UTC (permalink / raw)
  To: agk, dm-devel

This patch is the documentation of dm-ioband, design overview,
installation, command, reference and examples.

Signed-off-by: Ryo Tsuruta <ryov@valinux.co.jp>
Signed-off-by: Hirokazu Takahashi <taka@valinux.co.jp>

---
 Documentation/device-mapper/ioband.txt |  976 +++++++++++++++++++++++++++++++++
 1 file changed, 976 insertions(+)

Index: linux-2.6/Documentation/device-mapper/ioband.txt
===================================================================
--- /dev/null
+++ linux-2.6/Documentation/device-mapper/ioband.txt
@@ -0,0 +1,976 @@
+                     Block I/O bandwidth control: dm-ioband
+
+            -------------------------------------------------------
+
+   Table of Contents
+
+   [1]What's dm-ioband all about?
+
+   [2]Differences from the CFQ I/O scheduler
+
+   [3]How dm-ioband works.
+
+   [4]Setup and Installation
+
+   [5]Getting started
+
+   [6]Command Reference
+
+   [7]Examples
+
+What's dm-ioband all about?
+
+     dm-ioband is an I/O bandwidth controller implemented as a device-mapper
+   driver. Several jobs using the same block device have to share the
+   bandwidth of the device. dm-ioband gives bandwidth to each job according
+   to its weight, and each job can set its own weight value.
+
+     A job is a group of processes with the same pid or pgrp or uid or a
+   virtual machine such as KVM or Xen. A job can also be a cgroup by applying
+   the bio-cgroup patch, which can be found at
+   [8]http://people.valinux.co.jp/~ryov/bio-cgroup/.
+
+     +------+ +------+ +------+   +------+ +------+ +------+
+     |cgroup| |cgroup| | the  |   | pid  | | pid  | | the  |  jobs
+     |  A   | |  B   | |others|   |  X   | |  Y   | |others|
+     +--|---+ +--|---+ +--|---+   +--|---+ +--|---+ +--|---+
+     +--V----+---V---+----V---+   +--V----+---V---+----V---+
+     | group | group | default|   | group | group | default|  ioband groups
+     |       |       |  group |   |       |       |  group |
+     +-------+-------+--------+   +-------+-------+--------+
+     |        ioband1         |   |       ioband2          |  ioband devices
+     +-----------|------------+   +-----------|------------+
+     +-----------V--------------+-------------V------------+
+     |                          |                          |
+     |          sdb1            |           sdb2           |  block devices
+     +--------------------------+--------------------------+
+
+
+   --------------------------------------------------------------------------
+
+Differences from the CFQ I/O scheduler
+
+     Dm-ioband is flexible to configure the bandwidth settings.
+
+     Dm-ioband can work with any type of I/O scheduler such as the NOOP
+   scheduler, which is often chosen for high-end storages, since it is
+   implemented outside the I/O scheduling layer. It allows both partition
+   based bandwidth control and job --- a group of processes --- based
+   control. In addition, it can apply a different configuration to each
+   block device to control its bandwidth.
+
+     Meanwhile the current implementation of the CFQ scheduler has 8 IO
+   priority levels and all jobs whose processes have the same IO priority
+   share the bandwidth assigned to this level between them. And IO priority
+   is an attribute of a process, so it equally affects all block
+   devices.
+
+   --------------------------------------------------------------------------
+
+How dm-ioband works.
+
+     Every ioband device has one ioband group, which by default is called the
+   default group.
+
+     Ioband devices can also have extra ioband groups in them. Each ioband
+   group has a job to support and a weight. Proportional to the weight,
+   dm-ioband gives tokens to the group.
+
+     A group passes on I/O requests that its job issues to the underlying
+   layer so long as it has tokens left, while requests are blocked if there
+   aren't any tokens left in the group. Tokens are refilled once all of
+   groups that have requests on a given underlying block device use up their
+   tokens.
+
+     There are two policies for token consumption. One is that a token is
+   consumed for each I/O request. The other is that a token is consumed for
+   each I/O sector, for example, one I/O request which consists of
+   4Kbytes(512bytes * 8 sectors) read consumes 8 tokens. A user can choose
+   either policy.
+
+     With this approach, a job running on an ioband group with large weight
+   is guaranteed a wide I/O bandwidth.
+
+   --------------------------------------------------------------------------
+
+Setup and Installation
+
+     Build a kernel with these options enabled:
+
+     CONFIG_MD
+     CONFIG_BLK_DEV_DM
+     CONFIG_DM_IOBAND
+
+
+     If compiled as module, use modprobe to load dm-ioband.
+
+     # make modules
+     # make modules_install
+     # depmod -a
+     # modprobe dm-ioband
+
+
+     "dmsetup targets" command shows all available device-mapper targets.
+   "ioband" and the version number are displayed when dm-ioband has been
+   loaded.
+
+     # dmsetup targets | grep ioband
+     ioband           v1.10.0
+
+
+   --------------------------------------------------------------------------
+
+Getting started
+
+     The following is a brief description how to control the I/O bandwidth of
+   disks. In this description, we'll take one disk with two partitions as an
+   example target.
+
+   --------------------------------------------------------------------------
+
+  Create and map ioband devices
+
+     Create two ioband devices "ioband1" and "ioband2". "ioband1" is mapped
+   to "/dev/sda1" and has a weight of 40. "ioband2" is mapped to "/dev/sda2"
+   and has a weight of 10. "ioband1" can use 80% --- 40/(40+10)*100 --- of
+   the bandwidth of "/dev/sda" while "ioband2" can use 20%.
+
+    # echo "0 $(blockdev --getsize /dev/sda1) ioband /dev/sda1 1 0 0 none" \
+        "weight 0 :40" | dmsetup create ioband1
+    # echo "0 $(blockdev --getsize /dev/sda2) ioband /dev/sda2 1 0 0 none" \
+        "weight 0 :10" | dmsetup create ioband2
+
+
+     If the commands are successful then the device files
+   "/dev/mapper/ioband1" and "/dev/mapper/ioband2" will have been created.
+
+   --------------------------------------------------------------------------
+
+  Additional bandwidth control
+
+     In this example two extra ioband groups are created on "ioband1."
+
+     First, set the ioband group type as user. Next, create two ioband groups
+   that have id 1000 and 2000. Then, give weights of 30 and 20 to the ioband
+   groups respectively.
+
+    # dmsetup message ioband1 0 type user
+    # dmsetup message ioband1 0 attach 1000
+    # dmsetup message ioband1 0 attach 2000
+    # dmsetup message ioband1 0 weight 1000:30
+    # dmsetup message ioband1 0 weight 2000:20
+
+
+     Now the processes owned by uid 1000 can use 30% --- 30/(30+20+40+10)*100
+   --- of the bandwidth of "/dev/sda" when the processes issue I/O requests
+   through "ioband1." The processes owned by uid 2000 can use 20% of the
+   bandwidth likewise.
+
+   Table 1. Weight assignments
+
+   +----------------------------------------------------------------+
+   | ioband device |          ioband group          | ioband weight |
+   |---------------+--------------------------------+---------------|
+   | ioband1       | user id 1000                   | 30            |
+   |---------------+--------------------------------+---------------|
+   | ioband1       | user id 2000                   | 20            |
+   |---------------+--------------------------------+---------------|
+   | ioband1       | default group(the other users) | 40            |
+   |---------------+--------------------------------+---------------|
+   | ioband2       | default group                  | 10            |
+   +----------------------------------------------------------------+
+
+   --------------------------------------------------------------------------
+
+  Remove the ioband devices
+
+     Remove the ioband devices when no longer used.
+
+     # dmsetup remove ioband1
+     # dmsetup remove ioband2
+
+
+   --------------------------------------------------------------------------
+
+Command Reference
+
+  Create an ioband device
+
+   SYNOPSIS
+
+           dmsetup create IOBAND_DEVICE
+
+   DESCRIPTION
+
+             Create an ioband device with the given name IOBAND_DEVICE.
+           Generally, dmsetup reads a table from standard input. Each line of
+           the table specifies a single target and is of the form:
+
+             start_sector num_sectors "ioband" device_file ioband_device_id \
+                 io_throttle io_limit ioband_group_type policy token_base \
+                 :weight [ioband_group_id:weight...]
+
+
+                start_sector, num_sectors
+
+                          The sector range of the underlying device where
+                        dm-ioband maps.
+
+                ioband
+
+                          Specify the string "ioband" as a target type.
+
+                device_file
+
+                          Underlying device name.
+
+                ioband_device_id
+
+                          The ID number for an ioband device. The same ID
+                        must be set among the ioband devices that share the
+                        same bandwidth. This is useful for grouping disk
+                        drives partitioned from one disk drive such as RAID
+                        drive or LVM logical striped volume.
+
+                io_throttle
+
+                          Dm-ioband starts to control the bandwidth when the
+                        number of BIOs in progress exceeds this value. If 0
+                        is specified, the default value is used. This setting
+                        applies to all ioband devices which have the same
+                        ioband device ID as you specified by "ioband_device_id."
+
+                io_limit
+
+                          Dm-ioband blocks all I/O requests for IOBAND_DEVICE
+                        when the number of BIOs in progress exceeds this
+                        value. If 0 is specified, the default value is used.
+                        This setting applies all ioband devices which has the
+                        same ioband device ID as you specified by
+                        "ioband_device_id."
+
+                ioband_group_type
+
+                          Specify how to evaluate the ioband group ID. The
+                        type must be one of "none", "user", "gid", "pid" or
+                        "pgrp." The type "cgroup" is enabled by applying the
+                        bio-cgroup patch. Specify "none" if you don't need
+                        any ioband groups other than the default ioband
+                        group.
+
+                policy
+
+                          Specify a bandwidth control policy. A user can
+                        choose either policy "weight" or "weight-iosize."
+                        This setting applies all ioband devices which has the
+                        same ioband device ID as you specified by
+                        "ioband_device_id."
+
+                             weight
+
+                                       This policy controls bandwidth
+                                     in proportion to the weight of each
+                                     ioband group based on the number of
+                                     I/O requests.
+
+                             weight-iosize
+
+                                       This policy controls bandwidth
+                                     in proportion to the weight of each
+                                     ioband group based on the number of
+                                     I/O sectors.
+
+                token_base
+
+                          The number of tokens specified by token_base
+                        will be distributed to all ioband groups in
+                        proportion to the weight of each ioband group.
+                        If 0 is specified, the default value is used. This
+                        setting applies all ioband devices which has the same
+                        ioband device ID as you specified by
+                        "ioband_device_id."
+
+                ioband_group_id:weight
+
+                          Set the weight of the ioband group specified by
+                        ioband_group_id. If ioband_group_id is omitted, the
+                        weight is assigned to the default ioband group.
+
+   EXAMPLE
+
+             Create an ioband device with the following parameters:
+
+              *   Starting sector = "0"
+
+              *   The number of sectors = "$(blockdev --getsize /dev/sda1)"
+
+              *   Target type = "ioband"
+
+              *   Underlying device name = "/dev/sda1"
+
+              *   Ioband device ID = "128"
+
+              *   I/O throttle = "10"
+
+              *   I/O limit = "400"
+
+              *   Ioband group type = "user"
+
+              *   Bandwidth control policy = "weight"
+
+              *   Token base = "2048"
+
+              *   Weight for the default ioband group = "100"
+
+              *   Weight for the ioband group 1000 = "80"
+
+              *   Weight for the ioband group 2000 = "20"
+
+              *   Ioband device name = "ioband1"
+
+             # echo "0 $(blockdev --getsize /dev/sda1) ioband" \
+               "/dev/sda1 128 10 400 user weight 2048 :100 1000:80 2000:20" \
+               | dmsetup create ioband1
+
+
+             Create two device groups (ID=1,2). The bandwidths of these
+           device groups will be individually controlled.
+
+             # echo "0 $(blockdev --getsize /dev/sda1) ioband /dev/sda1 1" \
+               "0 0 none weight 0 :80" | dmsetup create ioband1
+             # echo "0 $(blockdev --getsize /dev/sda2) ioband /dev/sda2 1" \
+               "0 0 none weight 0 :20" | dmsetup create ioband2
+             # echo "0 $(blockdev --getsize /dev/sdb3) ioband /dev/sdb3 2" \
+               "0 0 none weight 0 :60" | dmsetup create ioband3
+             # echo "0 $(blockdev --getsize /dev/sdb4) ioband /dev/sdb4 2" \
+               "0 0 none weight 0 :40" | dmsetup create ioband4
+
+
+   --------------------------------------------------------------------------
+
+  Remove the ioband device
+
+   SYNOPSIS
+
+           dmsetup remove IOBAND_DEVICE
+
+   DESCRIPTION
+
+             Remove the specified ioband device IOBAND_DEVICE. All the band
+           groups attached to the ioband device are also removed
+           automatically.
+
+   EXAMPLE
+
+             Remove ioband device "ioband1."
+
+             # dmsetup remove ioband1
+
+
+   --------------------------------------------------------------------------
+
+  Set an ioband group type
+
+   SYNOPSIS
+
+           dmsetup message IOBAND_DEVICE 0 type TYPE
+
+   DESCRIPTION
+
+             Set an ioband group type of IOBAND_DEVICE. TYPE must be one of
+           "none", "user", "gid", "pid" or "pgrp." The type "cgroup" is
+           enabled by applying the bio-cgroup patch. Once the type is set,
+           new ioband groups can be created on IOBAND_DEVICE.
+
+   EXAMPLE
+
+             Set the ioband group type of ioband device "ioband1" to "user."
+
+             # dmsetup message ioband1 0 type user
+
+
+   --------------------------------------------------------------------------
+
+  Create an ioband group
+
+   SYNOPSIS
+
+           dmsetup message IOBAND_DEVICE 0 attach ID
+
+   DESCRIPTION
+
+             Create an ioband group and attach it to IOBAND_DEVICE. ID
+           specifies user-id, group-id, process-id or process-group-id
+           depending the ioband group type of IOBAND_DEVICE.
+
+   EXAMPLE
+
+             Create an ioband group which consists of all processes with
+           user-id 1000 and attach it to ioband device "ioband1."
+
+             # dmsetup message ioband1 0 type user
+             # dmsetup message ioband1 0 attach 1000
+
+
+   --------------------------------------------------------------------------
+
+  Detach the ioband group
+
+   SYNOPSIS
+
+           dmsetup message IOBAND_DEVICE 0 detach ID
+
+   DESCRIPTION
+
+             Detach the ioband group specified by ID from ioband device
+           IOBAND_DEVICE.
+
+   EXAMPLE
+
+             Detach the ioband group with ID "2000" from ioband device
+           "ioband2."
+
+             # dmsetup message ioband2 0 detach 2000
+
+
+   --------------------------------------------------------------------------
+
+  Set bandwidth control policy
+
+   SYNOPSIS
+
+           dmsetup message IOBAND_DEVICE 0 policy policy
+
+   DESCRIPTION
+
+             Set a bandwidth control policy. A user can choose either policy
+           "weight" or "weight-iosize." This setting applies all ioband
+           devices which has the same ioband device ID as IOBAND_DEVICE.
+
+                weight
+
+                          This policy controls bandwidth in proportion
+                        to the weight of each ioband group based on the
+                        number of I/O requests.
+
+                weight-iosize
+
+                          This policy controls bandwidth in proportion
+                        to the weight of each ioband group based on the
+                        number of I/O sectors.
+
+   EXAMPLE
+
+             Set bandwidth control policy of ioband devices which have the
+           same ioband device ID as "ioband1" to "weight-iosize."
+
+             # dmsetup message ioband1 0 policy weight-iosize
+
+
+   --------------------------------------------------------------------------
+
+  Set the weight of an ioband group
+
+   SYNOPSIS
+
+           dmsetup message IOBAND_DEVICE 0 weight VAL
+
+           dmsetup message IOBAND_DEVICE 0 weight ID:VAL
+
+   DESCRIPTION
+
+             Set the weight of the ioband group specified by ID. Set the
+           weight of the default ioband group of IOBAND_DEVICE if ID isn't
+           specified.
+
+             The following example means that "ioband1" can use 80% ---
+           40/(40+10)*100 --- of the bandwidth of the underlying block device
+           while "ioband2" can use 20%.
+
+             # dmsetup message ioband1 0 weight 40
+             # dmsetup message ioband2 0 weight 10
+
+
+             The following lines have the same effect as the above:
+
+             # dmsetup message ioband1 0 weight 4
+             # dmsetup message ioband2 0 weight 1
+
+
+             VAL must be an integer larger than 0. The default value, which
+           is assigned to newly created ioband groups, is 100.
+
+   EXAMPLE
+
+             Set the weight of the default ioband group of "ioband1" to 40.
+
+             # dmsetup message ioband1 0 weight 40
+
+
+             Set the weight of the ioband group of "ioband1" with ID "1000"
+           to 10.
+
+             # dmsetup message ioband1 0 weight 1000:10
+
+
+   --------------------------------------------------------------------------
+
+  Set the number of tokens
+
+   SYNOPSIS
+
+           dmsetup message IOBAND_DEVICE 0 token VAL
+
+   DESCRIPTION
+
+             The number of tokens will be distributed to all ioband groups
+           in proportion to the weight of each ioband group. If 0 is
+           specified, the default value is used. This setting applies to
+           all ioband devices which have the same ioband device ID as
+           IOBAND_DEVICE.
+
+   EXAMPLE
+
+             Set the number of tokens to 256.
+
+             # dmsetup message ioband1 0 token 256
+
+
+   --------------------------------------------------------------------------
+
+  Set a limit of how many tokens are carried over
+
+   SYNOPSIS
+
+           dmsetup message IOBAND_DEVICE 0 carryover VAL
+
+   DESCRIPTION
+
+             When dm-ioband tries to refill an ioband group with tokens after
+           another ioband group is already refilled several times, dm-ioband
+           determines the number of tokens to refill by multiplying the
+           number of tokens refilled once by the smaller of how many times
+           the other group is already refilled or this limit. If 0 is
+           specified, the default value is used. This setting applies all
+           ioband devices which has the same ioband device ID as
+           IOBAND_DEVICE.
+
+   EXAMPLE
+
+             Set a limit for "ioband1" to 2.
+
+             # dmsetup message ioband1 0 carryover 2
+
+
+   --------------------------------------------------------------------------
+
+  Set I/O throttling
+
+   SYNOPSIS
+
+           dmsetup message IOBAND_DEVICE 0 io_throttle VAL
+
+   DESCRIPTION
+
+             Dm-ioband starts to control the bandwidth when the number of
+           BIOs in progress exceeds this value. If 0 is specified, the
+           default value is used. This setting applies all ioband devices
+           which has the same ioband device ID as IOBAND_DEVICE.
+
+   EXAMPLE
+
+             Set the I/O throttling value of "ioband1" to 16.
+
+             # dmsetup message ioband1 0 io_throttle 16
+
+
+   --------------------------------------------------------------------------
+
+  Set I/O limiting
+
+   SYNOPSIS
+
+           dmsetup message IOBAND_DEVICE 0 io_limit VAL
+
+   DESCRIPTION
+
+             Dm-ioband blocks all I/O requests for IOBAND_DEVICE when the
+           number of BIOs in progress exceeds this value. If 0 is specified,
+           the default value is used. This setting applies all ioband devices
+           which has the same ioband device ID as IOBAND_DEVICE.
+
+   EXAMPLE
+
+             Set the I/O limiting value of "ioband1" to 128.
+
+             # dmsetup message ioband1 0 io_limit 128
+
+
+   --------------------------------------------------------------------------
+
+  Display settings
+
+   SYNOPSIS
+
+           dmsetup table --target ioband
+
+   DESCRIPTION
+
+             Display the current table for the ioband device. See the
+           "dmsetup create" command for information on the table format.
+
+   EXAMPLE
+
+             The following output shows the current table of "ioband1."
+
+             # dmsetup table --target ioband
+             ioband: 0 32129937 ioband1 8:29 128 10 400 user weight \
+               2048 :100 1000:80 2000:20
+
+
+   --------------------------------------------------------------------------
+
+  Display Statistics
+
+   SYNOPSIS
+
+           dmsetup status --target ioband
+
+   DESCRIPTION
+
+             Display the statistics of all the ioband devices whose target
+           type is "ioband."
+
+             The output format is as below. The first five columns show:
+
+              *   ioband device name
+
+              *   logical start sector of the device (must be 0)
+
+              *   device size in sectors
+
+              *   target type (must be "ioband")
+
+              *   device group ID
+
+             The remaining columns show the statistics of each ioband group
+           on the band device. Each group uses seven columns for its
+           statistics.
+
+              *   ioband group ID (-1 means default)
+
+              *   total read requests
+
+              *   delayed read requests
+
+              *   total read sectors
+
+              *   total write requests
+
+              *   delayed write requests
+
+              *   total write sectors
+
+   EXAMPLE
+
+             The following output shows the statistics of two ioband devices.
+           Ioband2 only has the default ioband group and ioband1 has three
+           (default, 1001, 1002) ioband groups.
+
+             # dmsetup status
+             ioband2: 0 44371467 ioband 128 -1 143 90 424 122 78 352
+             ioband1: 0 44371467 ioband 128 -1 223 172 408 211 136 600 1001 \
+             166 107 472 139 95 352 1002 211 146 520 210 147 504
+
+
+   --------------------------------------------------------------------------
+
+  Reset status counter
+
+   SYNOPSIS
+
+           dmsetup message IOBAND_DEVICE 0 reset
+
+   DESCRIPTION
+
+             Reset the statistics of ioband device IOBAND_DEVICE.
+
+   EXAMPLE
+
+             Reset the statistics of "ioband1."
+
+             # dmsetup message ioband1 0 reset
+
+
+   --------------------------------------------------------------------------
+
+Examples
+
+  Example #1: Bandwidth control on Partitions
+
+     This example describes how to control the bandwidth with disk
+   partitions. The following diagram illustrates the configuration of this
+   example. You may want to run a database on /dev/mapper/ioband1 and web
+   applications on /dev/mapper/ioband2.
+
+                 /mnt1                        /mnt2            mount points
+                   |                              |
+     +-------------V------------+ +-------------V------------+
+     |   /dev/mapper/ioband1    | |   /dev/mapper/ioband2    | ioband devices
+     +--------------------------+ +--------------------------+
+     |       default group      | |       default group      | ioband groups
+     |           (80)           | |           (40)           |    (weight)
+     +-------------|------------+ +-------------|------------+
+                   |                            |
+     +-------------V-------------+--------------V------------+
+     |         /dev/sda1         |          /dev/sda2        | partitions
+     +---------------------------+---------------------------+
+
+
+     To setup the above configuration, follow these steps:
+
+    1.   Create ioband devices with the same device group ID and assign
+       weights of 80 and 40 to the default ioband groups respectively.
+
+         # echo "0 $(blockdev --getsize /dev/sda1) ioband /dev/sda1 1 0 0" \
+             "none weight 0 :80" | dmsetup create ioband1
+         # echo "0 $(blockdev --getsize /dev/sda2) ioband /dev/sda2 1 0 0" \
+             "none weight 0 :40" | dmsetup create ioband2
+
+
+    2.   Create filesystems on the ioband devices and mount them.
+
+         # mkfs.ext3 /dev/mapper/ioband1
+         # mount /dev/mapper/ioband1 /mnt1
+
+         # mkfs.ext3 /dev/mapper/ioband2
+         # mount /dev/mapper/ioband2 /mnt2
+
+
+   --------------------------------------------------------------------------
+
+  Example #2: Bandwidth control on Logical Volumes
+
+     This example is similar to the example #1 but it uses LVM logical
+   volumes instead of disk partitions. This example shows how to configure
+   ioband devices on two striped logical volumes.
+
+                 /mnt1                        /mnt2            mount points
+                   |                            |
+     +-------------V------------+ +-------------V------------+
+     |   /dev/mapper/ioband1    | |   /dev/mapper/ioband2    | ioband devices
+     +--------------------------+ +--------------------------+
+     |       default group      | |       default group      | ioband groups
+     |           (80)           | |           (40)           |    (weight)
+     +-------------|------------+ +-------------|------------+
+                   |                            |
+     +-------------V------------+ +-------------V------------+
+     |      /dev/mapper/lv0     | |     /dev/mapper/lv1      | striped logical
+     |                          | |                          | volumes
+     +-------------------------------------------------------+
+     |                          vg0                          | volume group
+     +-------------|----------------------------|------------+
+                   |                            |
+     +-------------V------------+ +-------------V------------+
+     |         /dev/sdb         | |         /dev/sdc         | physical disks
+     +--------------------------+ +--------------------------+
+
+
+     To setup the above configuration, follow these steps:
+
+    1.   Initialize the partitions for use by LVM.
+
+         # pvcreate /dev/sdb
+         # pvcreate /dev/sdc
+
+
+    2.   Create a new volume group named "vg0" with /dev/sdb and /dev/sdc.
+
+         # vgcreate vg0 /dev/sdb /dev/sdc
+
+
+    3.   Create two logical volumes in "vg0." The volumes have to be striped.
+
+         # lvcreate -n lv0 -i 2 -I 64 vg0 -L 1024M
+         # lvcreate -n lv1 -i 2 -I 64 vg0 -L 1024M
+
+
+         The rest is the same as the example #1.
+
+    4.   Create ioband devices corresponding to each logical volume and
+       assign weights of 80 and 40 to the default ioband groups respectively.
+
+         # echo "0 $(blockdev --getsize /dev/mapper/vg0-lv0)" \
+            "ioband /dev/mapper/vg0-lv0 1 0 0 none weight 0 :80" | \
+            dmsetup create ioband1
+         # echo "0 $(blockdev --getsize /dev/mapper/vg0-lv1)" \
+            "ioband /dev/mapper/vg0-lv1 1 0 0 none weight 0 :40" | \
+            dmsetup create ioband2
+
+
+    5.   Create filesystems on the ioband devices and mount them.
+
+         # mkfs.ext3 /dev/mapper/ioband1
+         # mount /dev/mapper/ioband1 /mnt1
+
+         # mkfs.ext3 /dev/mapper/ioband2
+         # mount /dev/mapper/ioband2 /mnt2
+
+
+   --------------------------------------------------------------------------
+
+  Example #3: Bandwidth control on processes
+
+     This example describes how to control the bandwidth with groups of
+   processes. You may also want to run an additional application on the same
+   machine described in the example #1. This example shows how to add a new
+   ioband group for this application.
+
+                 /mnt1                        /mnt2            mount points
+                   |                            |
+     +-------------V------------+ +-------------V------------+
+     |   /dev/mapper/ioband1    | |   /dev/mapper/ioband2    | ioband devices
+     +-------------+------------+ +-------------+------------+
+     |          default         | |  user=1000  |   default  | ioband groups
+     |           (80)           | |     (20)    |    (40)    |   (weight)
+     +-------------+------------+ +-------------+------------+
+                   |                            |
+     +-------------V-------------+--------------V------------+
+     |         /dev/sda1         |          /dev/sda2        | partitions
+     +---------------------------+---------------------------+
+
+
+     The following shows to set up a new ioband group on the machine that is
+   already configured as the example #1. The application will have a weight
+   of 20 and run with user-id 1000 on /dev/mapper/ioband2.
+
+    1.   Set the type of ioband2 to "user."
+
+         # dmsetup message ioband2 0 type user
+
+
+    2.   Create a new ioband group on ioband2.
+
+         # dmsetup message ioband2 0 attach 1000
+
+
+    3.   Assign a weight of 20 to this newly created ioband group.
+
+         # dmsetup message ioband2 0 weight 1000:20
+
+
+   --------------------------------------------------------------------------
+
+  Example #4: Bandwidth control for Xen virtual block devices
+
+     This example describes how to control the bandwidth for Xen virtual
+   block devices. The following diagram illustrates the configuration of this
+   example.
+
+           Virtual Machine 1            Virtual Machine 2      virtual machines
+                   |                            |
+     +-------------V------------+ +-------------V------------+
+     |         /dev/xvda1       | |         /dev/xvda1       | virtual block
+     +-------------|------------+ +-------------|------------+    devices
+                   |                            |
+     +-------------V------------+ +-------------V------------+
+     |   /dev/mapper/ioband1    | |   /dev/mapper/ioband2    | ioband devices
+     +--------------------------+ +--------------------------+
+     |       default group      | |       default group      | ioband groups
+     |           (80)           | |           (40)           |    (weight)
+     +-------------|------------+ +-------------|------------+
+                   |                            |
+     +-------------V-------------+--------------V------------+
+     |         /dev/sda1         |          /dev/sda2        | partitions
+     +---------------------------+---------------------------+
+
+
+     The following shows how to map ioband devices "ioband1" and "ioband2" to
+   virtual block device "/dev/xvda1 on Virtual Machine 1" and "/dev/xvda1 on
+   Virtual Machine 2" respectively on the machine configured as the example
+   #1. Add the following lines to the configuration files that are referenced
+   when creating "Virtual Machine 1" and "Virtual Machine 2."
+
+       For "Virtual Machine 1"
+       disk = [ 'phy:/dev/mapper/ioband1,xvda,w' ]
+
+       For "Virtual Machine 2"
+       disk = [ 'phy:/dev/mapper/ioband2,xvda,w' ]
+
+
+   --------------------------------------------------------------------------
+
+  Example #5: Bandwidth control for Xen blktap devices
+
+     This example describes how to control the bandwidth for Xen virtual
+   block devices when Xen blktap devices are used. The following diagram
+   illustrates the configuration of this example.
+
+           Virtual Machine 1            Virtual Machine 2      virtual machines
+                   |                            |
+     +-------------V------------+ +-------------V------------+
+     |         /dev/xvda1       | |         /dev/xvda1       | virtual block
+     +-------------|------------+ +-------------|------------+    devices
+                   |                            |
+     +-------------V----------------------------V------------+
+     |                  /dev/mapper/ioband1                  | ioband device
+     +---------------------------+---------------------------+
+     |       default group       |        default group      | ioband groups
+     |           (80)            |            (40)           |    (weight)
+     +-------------|-------------+--------------|------------+
+                   |                            |
+     +-------------|----------------------------|------------+
+     |  +----------V----------+      +----------V---------+  |
+     |  |       vm1.img       |      |       vm2.img      |  | disk image files
+     |  +---------------------+      +--------------------+  |
+     |                        /vmdisk                        | mount point
+     +---------------------------|---------------------------+
+                                 |
+     +---------------------------V---------------------------+
+     |                       /dev/sda1                       | partition
+     +-------------------------------------------------------+
+
+
+     To setup the above configuration, follow these steps:
+
+    1.   Create an ioband device.
+
+         # echo "0 $(blockdev --getsize /dev/sda1) ioband /dev/sda1" \
+             "1 0 0 none weight 0 :100" | dmsetup create ioband1
+
+
+    2.   Add the following lines to the configuration files that are
+       referenced when creating "Virtual Machine 1" and "Virtual Machine 2."
+       Disk image files "/vmdisk/vm1.img" and "/vmdisk/vm2.img" will be used.
+
+         For "Virtual Machine 1"
+         disk = [ 'tap:aio:/vmdisk/vm1.img,xvda,w', ]
+
+         For "Virtual Machine 2"
+         disk = [ 'tap:aio:/vmdisk/vm2.img,xvda,w', ]
+
+
+    3.   Run the virtual machines.
+
+         # xm create vm1
+         # xm create vm2
+
+
+    4.   Find out the process IDs of the daemons which control the blktap
+       devices.
+
+         # lsof /vmdisk/vm[12].img
+         COMMAND   PID USER   FD   TYPE DEVICE       SIZE  NODE NAME
+         tapdisk 15011 root   11u   REG  253,0 2147483648 48961 /vmdisk/vm1.img
+         tapdisk 15276 root   13u   REG  253,0 2147483648 48962 /vmdisk/vm2.img
+
+
+    5.   Create new ioband groups of pid 15011 and pid 15276, which are
+       process IDs of the tapdisks, and assign weight of 80 and 40 to the
+       groups respectively.
+
+         # dmsetup message ioband1 0 type pid
+         # dmsetup message ioband1 0 attach 15011
+         # dmsetup message ioband1 0 weight 15011:80
+         # dmsetup message ioband1 0 attach 15276
+         # dmsetup message ioband1 0 weight 15276:40

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-20  5:11 ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Ryo Tsuruta
  2009-01-20  5:12   ` [PATCH 2/2] dm-ioband: I/O bandwidth controller v1.10.0: Document Ryo Tsuruta
@ 2009-01-20 14:52   ` Alasdair G Kergon
  2009-01-21 13:03     ` Ryo Tsuruta
  2009-01-20 15:19   ` Alasdair G Kergon
  2009-01-20 15:53   ` Alasdair G Kergon
  3 siblings, 1 reply; 23+ messages in thread
From: Alasdair G Kergon @ 2009-01-20 14:52 UTC (permalink / raw)
  To: Ryo Tsuruta; +Cc: dm-devel

On Tue, Jan 20, 2009 at 02:11:14PM +0900, Ryo Tsuruta wrote:
> This patch is the dm-ioband version 1.10.0 release.
 
drivers/md/dm-ioband-ctl.c:194:2: warning: context problem in 'suspend_ioband_device': '_spin_unlock_irqrestore' expected different c
ontext
drivers/md/dm-ioband-ctl.c:194:2:    context 'lock': wanted >= 1, got 0
drivers/md/dm-ioband-ctl.c:608:3: warning: context problem in 'prevent_burst_bios': '_spin_unlock_irq' expected different context
drivers/md/dm-ioband-ctl.c:608:3:    context 'lock': wanted >= 1, got 0


Last time we had something like that, it was straightforward to restructure the
functions to avoid it - see if you can manage that here too.

Alasdair

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/2] dm-ioband: I/O bandwidth controller v1.10.0: Introduction
  2009-01-20  5:10 [PATCH 0/2] dm-ioband: I/O bandwidth controller v1.10.0: Introduction Ryo Tsuruta
  2009-01-20  5:11 ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Ryo Tsuruta
@ 2009-01-20 15:04 ` Alasdair G Kergon
  1 sibling, 0 replies; 23+ messages in thread
From: Alasdair G Kergon @ 2009-01-20 15:04 UTC (permalink / raw)
  To: Ryo Tsuruta; +Cc: dm-devel

By the way, you'll notice I've combined the three messages into a single
patch: there seems little point applying the documentation as a separate
patch.

  http://www.kernel.org/pub/linux/kernel/people/agk/patches/2.6/editing/dm-add-ioband.patch

When sending revisions, either send a fresh patch that is a drop-in
replacement for what I have included, or alternatively send a patch to
be applied after what I have got.  (The second way helps people on the
list see what you are changing without needing to perform their own
diffs, and I'll simply fold it into the patch I have.)

Alasdair
-- 
agk@redhat.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-20  5:11 ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Ryo Tsuruta
  2009-01-20  5:12   ` [PATCH 2/2] dm-ioband: I/O bandwidth controller v1.10.0: Document Ryo Tsuruta
  2009-01-20 14:52   ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Alasdair G Kergon
@ 2009-01-20 15:19   ` Alasdair G Kergon
  2009-01-20 15:53   ` Alasdair G Kergon
  3 siblings, 0 replies; 23+ messages in thread
From: Alasdair G Kergon @ 2009-01-20 15:19 UTC (permalink / raw)
  To: Ryo Tsuruta; +Cc: dm-devel

OK, an easy thing first.

Please review all the debugging messages and decide whether they will really
still be necessary in an upstream kernel.

Then convert them to DM_DEBUG or DM_DEBUG_LIMIT as appropriate and remove
the 'debug' option.

If you genuinely believe you need to be able to enable some of these at runtime
on a live machine - and so far no other part of dm has found that necessary -
then you should offer some justification and extend the DM_DEBUG* macros (in a
separate patch) to support that.

If however this is more about statistics/verbosity - i.e. information the
user might want to see about how the target is behaving, then consider
alternative ways of making that information available to userspace.

Alasdair

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-20  5:11 ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Ryo Tsuruta
                     ` (2 preceding siblings ...)
  2009-01-20 15:19   ` Alasdair G Kergon
@ 2009-01-20 15:53   ` Alasdair G Kergon
       [not found]     ` <20090120155334.GH9859-swAlYijrCMMf7BdofF/totBPR1lH4CV8@public.gmane.org>
  3 siblings, 1 reply; 23+ messages in thread
From: Alasdair G Kergon @ 2009-01-20 15:53 UTC (permalink / raw)
  To: Ryo Tsuruta; +Cc: dm-devel

So, what needs to be reviewed?


1. General style/layout/naming cleanup.
- It's pretty good compared to a lot of patches that get sent, but there are
still a few things we can improve.

Lindent is throwing up some stuff (but remember it doesn't get things
perfect so don't make all the changes it recommends).
Remove double blank lines, plenty of unnecessary braces, "unsigned int" ->
"unsigned".  
Review the names - should a few more things get a DM_ or dm_ prefix?
Are all the names consistent and as self-explanatory as they can reasonably be?
(e.g. a variable called 'new' - new what?)


2. A high-level review.
Review the documentation and what the code does and how it does it.
- Does it make sense to add this to the kernel in this way?


3. A device-mapper review.
- Does the code fit into device-mapper correctly, using all the dm interfaces properly?


4. A detailed code review.
- Checking for correctness, fixing typos, improving messages, looking for ways
  to improve the code structure etc.


So, it needs volunteers on this list to take on aspects of that and lend their
support to the patch.

I consider items 1, 2 and 3 as blockers that must be completed satisfactorily before
I can push it upstream.  (Items 1 and 3 should be quick; 2 may take a bit
longer.)  Item 4 is of course open-ended so does not have to be finished before
it can be included.

Alasdair
-- 
agk@redhat.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-20 14:52   ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Alasdair G Kergon
@ 2009-01-21 13:03     ` Ryo Tsuruta
  2009-01-21 17:18       ` Alasdair G Kergon
  0 siblings, 1 reply; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-21 13:03 UTC (permalink / raw)
  To: agk; +Cc: dm-devel

Hi Alasdair,

Thank you for reviewing and giving me suggestions.

> On Tue, Jan 20, 2009 at 02:11:14PM +0900, Ryo Tsuruta wrote:
> > This patch is the dm-ioband version 1.10.0 release.
>  
> drivers/md/dm-ioband-ctl.c:194:2: warning: context problem in 'suspend_ioband_device': '_spin_unlock_irqrestore' expected different c
> ontext
> drivers/md/dm-ioband-ctl.c:194:2:    context 'lock': wanted >= 1, got 0
> drivers/md/dm-ioband-ctl.c:608:3: warning: context problem in 'prevent_burst_bios': '_spin_unlock_irq' expected different context
> drivers/md/dm-ioband-ctl.c:608:3:    context 'lock': wanted >= 1, got 0
> 
> 
> Last time we had something like that, it was straightforward to restructure the
> functions to avoid it - see if you can manage that here too.

I'm trying to suppress these warnings, but the latest sparse command
(2009-01-21) doesn't seem to interpret __acquires() and __releases()
macros properly.

I wrote the following sample code and tested with sparse-0.4.1 and
sparse-2009-01-21.

  static void foo(struct ioband_device *dp, unsigned long flags)
  {
          spin_unlock_irqrestore(&dp->g_lock, flags);
  }

The both sparse commands issued warnings as I expected.

  sparse-0.4.1
    CHECK   /home/ryov/work/dm-ioband/dm-ioband/src/dm-ioband-ctl.c
  /home/ryov/work/dm-ioband/dm-ioband/src/dm-ioband-ctl.c:171:13:
    warning: context imbalance in 'foo' - unexpected unlock

  sparse-2009-01-21
  /home/ryov/work/dm-ioband/dm-ioband/src/dm-ioband-ctl.c:173:2:
  warning: context problem in 'foo': '_spin_unlock_irqrestore'
    expected different context
  /home/ryov/work/dm-ioband/dm-ioband/src/dm-ioband-ctl.c:173:2:
    context 'lock': wanted >= 1, got 0

Next, I added a __releases() macro and tested again.

  static void foo(struct ioband_device *dp, unsigned long flags)
	__releases(dp->g_lock)
  {
          spin_unlock_irqrestore(&dp->g_lock, flags);
  }

Only sparse-2009-01-21 still issued the warning.

  CHECK   /home/ryov/work/dm-ioband/dm-ioband/src/dm-ioband-ctl.c
  /home/ryov/work/dm-ioband/dm-ioband/src/dm-ioband-ctl.c:174:2:
    warning: context problem in 'foo': '_spin_unlock_irqrestore'
    expected different context
  /home/ryov/work/dm-ioband/dm-ioband/src/dm-ioband-ctl.c:174:2:
    context 'lock': wanted >= 1, got 0

Could you tell me which sparse version are you using?

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-21 13:03     ` Ryo Tsuruta
@ 2009-01-21 17:18       ` Alasdair G Kergon
  2009-01-22 12:05         ` Ryo Tsuruta
  0 siblings, 1 reply; 23+ messages in thread
From: Alasdair G Kergon @ 2009-01-21 17:18 UTC (permalink / raw)
  To: Ryo Tsuruta; +Cc: dm-devel

I'm asking whether or not restructuring that little bit of code would improve
clarity, or whether it would become more convoluted.

Last time we had this, changing the functions so locks were acquired then
released actually improved clarity - I'm asking if the same is true here
or not.

Alasdair
-- 
agk@redhat.com

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-21 17:18       ` Alasdair G Kergon
@ 2009-01-22 12:05         ` Ryo Tsuruta
  2009-02-04  5:07           ` Ryo Tsuruta
  0 siblings, 1 reply; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-22 12:05 UTC (permalink / raw)
  To: agk; +Cc: dm-devel

Hi Alasdair and all,

> I'm asking whether or not restructuring that little bit of code would improve
> clarity, or whether it would become more convoluted.
>
> Last time we had this, changing the functions so locks were acquired then
> released actually improved clarity - I'm asking if the same is true here
> or not.

I've checked the whole code but there is no lock imbalance. It is not
easy to change the functions at this time.
# The latest sparse command seems to have a bug, the sparse v0.4.1
# doesn't issue any warnings.

BTW, I've attached a patch against the dm-add-ioband.patch in your
quilt tree. The patch is cleaned up and reflected some Lindent's
outputs and some points suggested by the previous email.
I would appreciate it if you merge the patch into your tree.

Thanks,
Ryo Tsuruta

--- dm-add-ioband.patch.orig	2009-01-22 16:36:29.000000000 +0900
+++ dm-add-ioband.patch	2009-01-22 20:18:37.000000000 +0900
@@ -15,16 +15,16 @@
  Documentation/device-mapper/ioband.txt |  976 ++++++++++++++++++++++++
  drivers/md/Kconfig                     |   13 
  drivers/md/Makefile                    |    2 
- drivers/md/dm-ioband-ctl.c             | 1326 +++++++++++++++++++++++++++++++++
- drivers/md/dm-ioband-policy.c          |  460 +++++++++++
- drivers/md/dm-ioband-type.c            |   76 +
- drivers/md/dm-ioband.h                 |  194 ++++
- 7 files changed, 3047 insertions(+)
+ drivers/md/dm-ioband-ctl.c             | 1308 +++++++++++++++++++++++++++++++++
+ drivers/md/dm-ioband-policy.c          |  457 +++++++++++
+ drivers/md/dm-ioband-type.c            |   77 +
+ drivers/md/dm-ioband.h                 |  186 ++++
+ 7 files changed, 3019 insertions(+)
 
-Index: linux/Documentation/device-mapper/ioband.txt
+Index: linux-2.6.29-rc2/Documentation/device-mapper/ioband.txt
 ===================================================================
---- /dev/null	1970-01-01 00:00:00.000000000 +0000
-+++ linux/Documentation/device-mapper/ioband.txt	2009-01-20 14:43:46.000000000 +0000
+--- /dev/null
++++ linux-2.6.29-rc2/Documentation/device-mapper/ioband.txt
 @@ -0,0 +1,976 @@
 +                     Block I/O bandwidth control: dm-ioband
 +
@@ -143,7 +143,7 @@
 +   loaded.
 +
 +     # dmsetup targets | grep ioband
-+     ioband           v1.10.0
++     ioband           v1.10.1
 +
 +
 +   --------------------------------------------------------------------------
@@ -1002,10 +1002,10 @@
 +         # dmsetup message ioband1 0 weight 15011:80
 +         # dmsetup message ioband1 0 attach 15276
 +         # dmsetup message ioband1 0 weight 15276:40
-Index: linux/drivers/md/Kconfig
+Index: linux-2.6.29-rc2/drivers/md/Kconfig
 ===================================================================
---- linux.orig/drivers/md/Kconfig	2008-10-21 17:40:56.000000000 +0100
-+++ linux/drivers/md/Kconfig	2009-01-20 14:43:16.000000000 +0000
+--- linux-2.6.29-rc2.orig/drivers/md/Kconfig
++++ linux-2.6.29-rc2/drivers/md/Kconfig
 @@ -289,4 +289,17 @@ config DM_UEVENT
  	---help---
  	Generate udev events for DM events.
@@ -1024,10 +1024,10 @@
 +	If unsure, say N.
 +
  endif # MD
-Index: linux/drivers/md/Makefile
+Index: linux-2.6.29-rc2/drivers/md/Makefile
 ===================================================================
---- linux.orig/drivers/md/Makefile	2009-01-06 03:57:48.000000000 +0000
-+++ linux/drivers/md/Makefile	2009-01-20 14:43:16.000000000 +0000
+--- linux-2.6.29-rc2.orig/drivers/md/Makefile
++++ linux-2.6.29-rc2/drivers/md/Makefile
 @@ -8,6 +8,7 @@ dm-multipath-objs := dm-path-selector.o 
  dm-snapshot-objs := dm-snap.o dm-exception-store.o dm-snap-transient.o \
  		    dm-snap-persistent.o
@@ -1044,11 +1044,11 @@
  
  quiet_cmd_unroll = UNROLL  $@
        cmd_unroll = $(PERL) $(srctree)/$(src)/unroll.pl $(UNROLL) \
-Index: linux/drivers/md/dm-ioband-ctl.c
+Index: linux-2.6.29-rc2/drivers/md/dm-ioband-ctl.c
 ===================================================================
---- /dev/null	1970-01-01 00:00:00.000000000 +0000
-+++ linux/drivers/md/dm-ioband-ctl.c	2009-01-20 14:43:16.000000000 +0000
-@@ -0,0 +1,1326 @@
+--- /dev/null
++++ linux-2.6.29-rc2/drivers/md/dm-ioband-ctl.c
+@@ -0,0 +1,1308 @@
 +/*
 + * Copyright (C) 2008 VA Linux Systems Japan K.K.
 + * Authors: Hirokazu Takahashi <taka@valinux.co.jp>
@@ -1069,7 +1069,6 @@
 +#include "dm-bio-list.h"
 +#include "dm-ioband.h"
 +
-+#define DM_MSG_PREFIX "ioband"
 +#define POLICY_PARAM_START 6
 +#define POLICY_PARAM_DELIM "=:,"
 +
@@ -1086,12 +1085,10 @@
 +static int ioband_group_attach(struct ioband_group *, int, char *);
 +static int ioband_group_type_select(struct ioband_group *, char *);
 +
-+long ioband_debug;	/* just for debugging */
-+
 +static void do_nothing(void) {}
 +
 +static int policy_init(struct ioband_device *dp, char *name,
-+						int argc, char **argv)
++		       int argc, char **argv)
 +{
 +	struct policy_type *p;
 +	struct ioband_group *gp;
@@ -1134,23 +1131,22 @@
 +}
 +
 +static struct ioband_device *alloc_ioband_device(char *name,
-+					int io_throttle, int io_limit)
-+
++						 int io_throttle, int io_limit)
 +{
-+	struct ioband_device *dp, *new;
++	struct ioband_device *dp, *new_dp;
 +	unsigned long flags;
 +
-+	new = kzalloc(sizeof(struct ioband_device), GFP_KERNEL);
-+	if (!new)
++	new_dp = kzalloc(sizeof(struct ioband_device), GFP_KERNEL);
++	if (!new_dp)
 +		return NULL;
 +
 +	/*
 +	 * Prepare its own workqueue as generic_make_request() may
 +	 * potentially block the workqueue when submitting BIOs.
 +	 */
-+	new->g_ioband_wq = create_workqueue("kioband");
-+	if (!new->g_ioband_wq) {
-+		kfree(new);
++	new_dp->g_ioband_wq = create_workqueue("kioband");
++	if (!new_dp->g_ioband_wq) {
++		kfree(new_dp);
 +		return NULL;
 +	}
 +
@@ -1159,37 +1155,37 @@
 +		if (!strcmp(dp->g_name, name)) {
 +			dp->g_ref++;
 +			spin_unlock_irqrestore(&ioband_devicelist_lock, flags);
-+			destroy_workqueue(new->g_ioband_wq);
-+			kfree(new);
++			destroy_workqueue(new_dp->g_ioband_wq);
++			kfree(new_dp);
 +			return dp;
 +		}
 +	}
 +
-+	INIT_DELAYED_WORK(&new->g_conductor, ioband_conduct);
-+	INIT_LIST_HEAD(&new->g_groups);
-+	INIT_LIST_HEAD(&new->g_list);
-+	spin_lock_init(&new->g_lock);
-+	mutex_init(&new->g_lock_device);
-+	bio_list_init(&new->g_urgent_bios);
-+	new->g_io_throttle = io_throttle;
-+	new->g_io_limit[0] = io_limit;
-+	new->g_io_limit[1] = io_limit;
-+	new->g_issued[0] = 0;
-+	new->g_issued[1] = 0;
-+	new->g_blocked = 0;
-+	new->g_ref = 1;
-+	new->g_flags = 0;
-+	strlcpy(new->g_name, name, sizeof(new->g_name));
-+	new->g_policy = NULL;
-+	new->g_hold_bio = NULL;
-+	new->g_pop_bio = NULL;
-+	init_waitqueue_head(&new->g_waitq);
-+	init_waitqueue_head(&new->g_waitq_suspend);
-+	init_waitqueue_head(&new->g_waitq_flush);
-+	list_add_tail(&new->g_list, &ioband_device_list);
++	INIT_DELAYED_WORK(&new_dp->g_conductor, ioband_conduct);
++	INIT_LIST_HEAD(&new_dp->g_groups);
++	INIT_LIST_HEAD(&new_dp->g_list);
++	spin_lock_init(&new_dp->g_lock);
++	mutex_init(&new_dp->g_lock_device);
++	bio_list_init(&new_dp->g_urgent_bios);
++	new_dp->g_io_throttle = io_throttle;
++	new_dp->g_io_limit[READ] = io_limit;
++	new_dp->g_io_limit[WRITE] = io_limit;
++	new_dp->g_issued[READ] = 0;
++	new_dp->g_issued[WRITE] = 0;
++	new_dp->g_blocked = 0;
++	new_dp->g_ref = 1;
++	new_dp->g_flags = 0;
++	strlcpy(new_dp->g_name, name, sizeof(new_dp->g_name));
++	new_dp->g_policy = NULL;
++	new_dp->g_hold_bio = NULL;
++	new_dp->g_pop_bio = NULL;
++	init_waitqueue_head(&new_dp->g_waitq);
++	init_waitqueue_head(&new_dp->g_waitq_suspend);
++	init_waitqueue_head(&new_dp->g_waitq_flush);
++	list_add_tail(&new_dp->g_list, &ioband_device_list);
 +
 +	spin_unlock_irqrestore(&ioband_devicelist_lock, flags);
-+	return new;
++	return new_dp;
 +}
 +
 +static void release_ioband_device(struct ioband_device *dp)
@@ -1209,11 +1205,11 @@
 +}
 +
 +static int is_ioband_device_flushed(struct ioband_device *dp,
-+						int wait_completion)
++				    int wait_completion)
 +{
 +	struct ioband_group *gp;
 +
-+	if (wait_completion && dp->g_issued[0] + dp->g_issued[1] > 0)
++	if (wait_completion && dp->g_issued[READ] + dp->g_issued[WRITE] > 0)
 +		return 0;
 +	if (dp->g_blocked || waitqueue_active(&dp->g_waitq))
 +		return 0;
@@ -1224,7 +1220,7 @@
 +}
 +
 +static void suspend_ioband_device(struct ioband_device *dp,
-+				unsigned long flags, int wait_completion)
++				  unsigned long flags, int wait_completion)
 +{
 +	struct ioband_group *gp;
 +
@@ -1249,8 +1245,8 @@
 +	/* wait for all processes to wake up and bios to release */
 +	spin_lock_irqsave(&dp->g_lock, flags);
 +	wait_event_lock_irq(dp->g_waitq_flush,
-+			is_ioband_device_flushed(dp, wait_completion),
-+			dp->g_lock, do_nothing());
++			    is_ioband_device_flushed(dp, wait_completion),
++			    dp->g_lock, do_nothing());
 +}
 +
 +static void resume_ioband_device(struct ioband_device *dp)
@@ -1270,8 +1266,7 @@
 +	clear_device_suspended(dp);
 +}
 +
-+static struct ioband_group *ioband_group_find(
-+					struct ioband_group *head, int id)
++static struct ioband_group *ioband_group_find(struct ioband_group *head, int id)
 +{
 +	struct rb_node *node = head->c_group_root.rb_node;
 +
@@ -1286,25 +1281,25 @@
 +	return NULL;
 +}
 +
-+static void ioband_group_add_node(struct rb_root *root,
-+						struct ioband_group *gp)
++static void ioband_group_add_node(struct rb_root *root, struct ioband_group *gp)
 +{
-+	struct rb_node **new = &root->rb_node, *parent = NULL;
++	struct rb_node **node = &root->rb_node, *parent = NULL;
 +	struct ioband_group *p;
 +
-+	while (*new) {
-+		p = container_of(*new, struct ioband_group, c_group_node);
-+		parent = *new;
-+		new = (gp->c_id < p->c_id) ?
-+					&(*new)->rb_left : &(*new)->rb_right;
++	while (*node) {
++		p = container_of(*node, struct ioband_group, c_group_node);
++		parent = *node;
++		node = (gp->c_id < p->c_id) ?
++				&(*node)->rb_left : &(*node)->rb_right;
 +	}
 +
-+	rb_link_node(&gp->c_group_node, parent, new);
++	rb_link_node(&gp->c_group_node, parent, node);
 +	rb_insert_color(&gp->c_group_node, root);
 +}
 +
 +static int ioband_group_init(struct ioband_group *gp,
-+    struct ioband_group *head, struct ioband_device *dp, int id, char *param)
++			     struct ioband_group *head,
++			     struct ioband_device *dp, int id, char *param)
 +{
 +	unsigned long flags;
 +	int r;
@@ -1349,7 +1344,7 @@
 +}
 +
 +static void ioband_group_release(struct ioband_group *head,
-+						struct ioband_group *gp)
++				 struct ioband_group *gp)
 +{
 +	struct ioband_device *dp = gp->c_banddev;
 +
@@ -1363,12 +1358,12 @@
 +static void ioband_group_destroy_all(struct ioband_group *gp)
 +{
 +	struct ioband_device *dp = gp->c_banddev;
-+	struct ioband_group *group;
++	struct ioband_group *p;
 +	unsigned long flags;
 +
 +	spin_lock_irqsave(&dp->g_lock, flags);
-+	while ((group = ioband_group_find(gp, IOBAND_ID_ANY)))
-+		ioband_group_release(gp, group);
++	while ((p = ioband_group_find(gp, IOBAND_ID_ANY)))
++		ioband_group_release(gp, p);
 +	ioband_group_release(NULL, gp);
 +	spin_unlock_irqrestore(&dp->g_lock, flags);
 +}
@@ -1384,16 +1379,12 @@
 +	for (node = rb_first(&head->c_group_root); node; node = rb_next(node)) {
 +		p = rb_entry(node, struct ioband_group, c_group_node);
 +		set_group_down(p);
-+		if (suspend) {
++		if (suspend)
 +			set_group_suspended(p);
-+			dprintk(KERN_ERR "ioband suspend: gp(%p)\n", p);
-+		}
 +	}
 +	set_group_down(head);
-+	if (suspend) {
++	if (suspend)
 +		set_group_suspended(head);
-+		dprintk(KERN_ERR "ioband suspend: gp(%p)\n", head);
-+	}
 +	spin_unlock_irqrestore(&dp->g_lock, flags);
 +	queue_delayed_work(dp->g_ioband_wq, &dp->g_conductor, 0);
 +	flush_workqueue(dp->g_ioband_wq);
@@ -1407,16 +1398,13 @@
 +	unsigned long flags;
 +
 +	spin_lock_irqsave(&dp->g_lock, flags);
-+	for (node = rb_first(&head->c_group_root); node;
-+							node = rb_next(node)) {
++	for (node = rb_first(&head->c_group_root); node; node = rb_next(node)) {
 +		p = rb_entry(node, struct ioband_group, c_group_node);
 +		clear_group_down(p);
 +		clear_group_suspended(p);
-+		dprintk(KERN_ERR "ioband resume: gp(%p)\n", p);
 +	}
 +	clear_group_down(head);
 +	clear_group_suspended(head);
-+	dprintk(KERN_ERR "ioband resume: gp(%p)\n", head);
 +	spin_unlock_irqrestore(&dp->g_lock, flags);
 +}
 +
@@ -1442,7 +1430,7 @@
 + *   parameters:  <device> <device-group-id> <io_throttle> <io_limit>
 + *     <type> <policy> <policy-param...> <group-id:group-param...>
 + */
-+static int ioband_ctr(struct dm_target *ti, unsigned int argc, char **argv)
++static int ioband_ctr(struct dm_target *ti, unsigned argc, char **argv)
 +{
 +	struct ioband_group *gp;
 +	struct ioband_device *dp;
@@ -1451,7 +1439,7 @@
 +	int io_limit;
 +	int i, r, start;
 +	long val, id;
-+	char *param;
++	char *param, *s;
 +
 +	if (argc < POLICY_PARAM_START) {
 +		ti->error = "Requires " __stringify(POLICY_PARAM_START)
@@ -1463,7 +1451,6 @@
 +		ti->error = "Ioband device name is too long";
 +		return -EINVAL;
 +	}
-+	dprintk(KERN_ERR "ioband_ctr ioband device name:%s\n", argv[1]);
 +
 +	r = strict_strtol(argv[2], 0, &val);
 +	if (r || val < 0) {
@@ -1480,7 +1467,7 @@
 +	io_limit = val;
 +
 +	r = dm_get_device(ti, argv[0], 0, ti->len,
-+				dm_table_get_mode(ti->table), &dev);
++			  dm_table_get_mode(ti->table), &dev);
 +	if (r) {
 +		ti->error = "Device lookup failed";
 +		return r;
@@ -1495,15 +1482,11 @@
 +			r = -ENXIO;
 +			goto release_dm_device;
 +		}
-+		dprintk(KERN_ERR "ioband_ctr nr_requests:%lu\n",
-+							q->nr_requests);
 +		io_limit = q->nr_requests;
 +	}
 +
 +	if (io_limit < io_throttle)
 +		io_limit = io_throttle;
-+	dprintk(KERN_ERR "ioband_ctr io_throttle:%d io_limit:%d\n",
-+						io_throttle, io_limit);
 +
 +	dp = alloc_ioband_device(argv[1], io_throttle, io_limit);
 +	if (!dp) {
@@ -1532,9 +1515,11 @@
 +	gp->c_dev = dev;
 +
 +	/* Find a default group parameter */
-+	for (start = POLICY_PARAM_START; start < argc; start++)
-+		if (argv[start][0] == ':')
++	for (start = POLICY_PARAM_START; start < argc; start++) {
++		s = strpbrk(argv[start], POLICY_PARAM_DELIM);
++		if (s == argv[start])
 +			break;
++	}
 +	param = (start < argc) ? &argv[start][1] : NULL;
 +
 +	/* Create a default ioband group */
@@ -1607,7 +1592,7 @@
 +	struct page *page = bio_iovec_idx(bio, 0)->bv_page;
 +	/*
 +	 * ToDo: A new flag should be added to struct bio, which indicates
-+	 * 	it contains urgent I/O requests.
++	 *       it contains urgent I/O requests.
 +	 */
 +	if (!PageReclaim(page))
 +		return 0;
@@ -1624,7 +1609,7 @@
 +		return 0;
 +	if (is_device_blocked(dp))
 +		return 1;
-+	if (dp->g_blocked >= dp->g_io_limit[0] + dp->g_io_limit[1]) {
++	if (dp->g_blocked >= dp->g_io_limit[READ] + dp->g_io_limit[WRITE]) {
 +		set_device_blocked(dp);
 +		return 1;
 +	}
@@ -1657,10 +1642,10 @@
 +		 * partitions.
 +		 */
 +		wait_event_lock_irq(dp->g_waitq, !device_should_block(gp),
-+						dp->g_lock, do_nothing());
++				    dp->g_lock, do_nothing());
 +	} else {
 +		wait_event_lock_irq(gp->c_waitq, !group_should_block(gp),
-+						dp->g_lock, do_nothing());
++				    dp->g_lock, do_nothing());
 +	}
 +}
 +
@@ -1679,8 +1664,8 @@
 +
 +static inline int room_for_bio(struct ioband_device *dp)
 +{
-+	return dp->g_issued[0] < dp->g_io_limit[0]
-+		|| dp->g_issued[1] < dp->g_io_limit[1];
++	return dp->g_issued[READ] < dp->g_io_limit[READ]
++		|| dp->g_issued[WRITE] < dp->g_io_limit[WRITE];
 +}
 +
 +static void hold_bio(struct ioband_group *gp, struct bio *bio)
@@ -1731,7 +1716,8 @@
 +}
 +
 +static int make_issue_list(struct ioband_group *gp, struct bio *bio,
-+		 struct bio_list *issue_list, struct bio_list *pushback_list)
++			   struct bio_list *issue_list,
++			   struct bio_list *pushback_list)
 +{
 +	struct ioband_device *dp = gp->c_banddev;
 +
@@ -1754,13 +1740,14 @@
 +}
 +
 +static void release_urgent_bios(struct ioband_device *dp,
-+		struct bio_list *issue_list, struct bio_list *pushback_list)
++				struct bio_list *issue_list,
++				struct bio_list *pushback_list)
 +{
 +	struct bio *bio;
 +
 +	if (bio_list_empty(&dp->g_urgent_bios))
 +		return;
-+	while (room_for_bio_rw(dp, 1)) {
++	while (room_for_bio_rw(dp, WRITE)) {
 +		bio = bio_list_pop(&dp->g_urgent_bios);
 +		if (!bio)
 +			return;
@@ -1771,7 +1758,8 @@
 +}
 +
 +static int release_prio_bios(struct ioband_group *gp,
-+		struct bio_list *issue_list, struct bio_list *pushback_list)
++			     struct bio_list *issue_list,
++			     struct bio_list *pushback_list)
 +{
 +	struct ioband_device *dp = gp->c_banddev;
 +	struct bio *bio;
@@ -1797,7 +1785,8 @@
 +}
 +
 +static int release_norm_bios(struct ioband_group *gp,
-+		struct bio_list *issue_list, struct bio_list *pushback_list)
++			     struct bio_list *issue_list,
++			     struct bio_list *pushback_list)
 +{
 +	struct ioband_device *dp = gp->c_banddev;
 +	struct bio *bio;
@@ -1826,7 +1815,8 @@
 +}
 +
 +static inline int release_bios(struct ioband_group *gp,
-+		struct bio_list *issue_list, struct bio_list *pushback_list)
++			       struct bio_list *issue_list,
++			       struct bio_list *pushback_list)
 +{
 +	int ret = release_prio_bios(gp, issue_list, pushback_list);
 +	if (ret)
@@ -1835,7 +1825,7 @@
 +}
 +
 +static struct ioband_group *ioband_group_get(struct ioband_group *head,
-+							struct bio *bio)
++					     struct bio *bio)
 +{
 +	struct ioband_group *gp;
 +
@@ -1854,12 +1844,12 @@
 + * exceeds the value of "io_throttle".
 + */
 +static int ioband_map(struct dm_target *ti, struct bio *bio,
-+						union map_info *map_context)
++		      union map_info *map_context)
 +{
 +	struct ioband_group *gp = ti->private;
 +	struct ioband_device *dp = gp->c_banddev;
 +	unsigned long flags;
-+	int rw;
++	int direct;
 +
 +	spin_lock_irqsave(&dp->g_lock, flags);
 +
@@ -1869,7 +1859,8 @@
 +	 */
 +	if (is_device_suspended(dp))
 +		wait_event_lock_irq(dp->g_waitq_suspend,
-+			!is_device_suspended(dp), dp->g_lock, do_nothing());
++				    !is_device_suspended(dp), dp->g_lock,
++				    do_nothing());
 +
 +	gp = ioband_group_get(gp, bio);
 +	prevent_burst_bios(gp, bio);
@@ -1880,21 +1871,20 @@
 +
 +	bio->bi_bdev = gp->c_dev->bdev;
 +	bio->bi_sector -= ti->begin;
-+	rw = bio_data_dir(bio);
++	direct = bio_data_dir(bio);
 +
-+	if (!gp->c_blocked && room_for_bio_rw(dp, rw)) {
++	if (!gp->c_blocked && room_for_bio_rw(dp, direct)) {
 +		if (dp->g_can_submit(gp)) {
 +			prepare_to_issue(gp, bio);
-+			gp->c_stat[rw].immediate++;
-+			gp->c_stat[rw].sectors += bio_sectors(bio);
++			gp->c_stat[direct].immediate++;
++			gp->c_stat[direct].sectors += bio_sectors(bio);
 +			spin_unlock_irqrestore(&dp->g_lock, flags);
 +			return DM_MAPIO_REMAPPED;
-+		} else if (!dp->g_blocked
-+				&& dp->g_issued[0] + dp->g_issued[1] == 0) {
-+			dprintk(KERN_ERR "ioband_map: token expired "
-+					"gp:%p bio:%p\n", gp, bio);
++		} else if (!dp->g_blocked &&
++			   dp->g_issued[READ] + dp->g_issued[WRITE] == 0) {
++			DMDEBUG("%s: token expired gp:%p", __func__, gp);
 +			queue_delayed_work(dp->g_ioband_wq,
-+							&dp->g_conductor, 1);
++					   &dp->g_conductor, 1);
 +		}
 +	}
 +	hold_bio(gp, bio);
@@ -1910,8 +1900,8 @@
 +{
 +	struct ioband_group *gp;
 +	struct ioband_group *best = NULL;
-+	int	highest = 0;
-+	int	pri;
++	int highest = 0;
++	int pri;
 +
 +	/* Todo: The algorithm should be optimized.
 +	 *       It would be better to use rbtree.
@@ -1919,8 +1909,8 @@
 +	list_for_each_entry(gp, &dp->g_groups, c_list) {
 +		if (!gp->c_blocked || !room_for_bio(dp))
 +			continue;
-+		if (gp->c_blocked == gp->c_prio_blocked
-+			&& !room_for_bio_rw(dp, prio_queue_direct(gp))) {
++		if (gp->c_blocked == gp->c_prio_blocked &&
++		    !room_for_bio_rw(dp, prio_queue_direct(gp))) {
 +			continue;
 +		}
 +		pri = dp->g_can_submit(gp);
@@ -1953,28 +1943,28 @@
 +	release_urgent_bios(dp, &issue_list, &pushback_list);
 +	if (dp->g_blocked) {
 +		gp = choose_best_group(dp);
-+		if (gp && release_bios(gp, &issue_list, &pushback_list)
-+								== R_YIELD)
++		if (gp &&
++		    release_bios(gp, &issue_list, &pushback_list) == R_YIELD)
 +			queue_delayed_work(dp->g_ioband_wq,
-+							&dp->g_conductor, 0);
++					   &dp->g_conductor, 0);
 +	}
 +
-+	if (is_device_blocked(dp)
-+	    && dp->g_blocked < dp->g_io_limit[0]+dp->g_io_limit[1]) {
++	if (is_device_blocked(dp) &&
++	    dp->g_blocked < dp->g_io_limit[READ] + dp->g_io_limit[WRITE]) {
 +		clear_device_blocked(dp);
 +		wake_up_all(&dp->g_waitq);
 +	}
 +
-+	if (dp->g_blocked && room_for_bio_rw(dp, 0) && room_for_bio_rw(dp, 1) &&
-+		bio_list_empty(&issue_list) && bio_list_empty(&pushback_list) &&
-+		dp->g_restart_bios(dp)) {
-+		dprintk(KERN_ERR "ioband_conduct: token expired dp:%p "
-+			"issued(%d,%d) g_blocked(%d)\n", dp,
-+			 dp->g_issued[0], dp->g_issued[1], dp->g_blocked);
++	if (dp->g_blocked &&
++	    room_for_bio_rw(dp, READ) && room_for_bio_rw(dp, WRITE) &&
++	    bio_list_empty(&issue_list) && bio_list_empty(&pushback_list) &&
++	    dp->g_restart_bios(dp)) {
++		DMDEBUG("%s: token expired dp:%p issued(%d,%d) g_blocked(%d)",
++			__func__, dp, dp->g_issued[READ], dp->g_issued[WRITE],
++			dp->g_blocked);
 +		queue_delayed_work(dp->g_ioband_wq, &dp->g_conductor, 0);
 +	}
 +
-+
 +	spin_unlock_irqrestore(&dp->g_lock, flags);
 +
 +	while ((bio = bio_list_pop(&issue_list)))
@@ -1984,7 +1974,7 @@
 +}
 +
 +static int ioband_end_io(struct dm_target *ti, struct bio *bio,
-+				int error, union map_info *map_context)
++			 int error, union map_info *map_context)
 +{
 +	struct ioband_group *gp = ti->private;
 +	struct ioband_device *dp = gp->c_banddev;
@@ -2007,12 +1997,12 @@
 +
 +	/*
 +	 * Todo: It would be better to introduce high/low water marks here
-+	 * 	 not to kick the workqueues so often.
++	 *       not to kick the workqueues so often.
 +	 */
 +	if (dp->g_blocked)
 +		queue_delayed_work(dp->g_ioband_wq, &dp->g_conductor, 0);
-+	else if (is_device_suspended(dp)
-+				&& dp->g_issued[0] + dp->g_issued[1] == 0)
++	else if (is_device_suspended(dp) &&
++		 dp->g_issued[READ] + dp->g_issued[WRITE] == 0)
 +		wake_up_all(&dp->g_waitq_flush);
 +	spin_unlock_irqrestore(&dp->g_lock, flags);
 +	return r;
@@ -2038,9 +2028,8 @@
 +	mutex_unlock(&dp->g_lock_device);
 +}
 +
-+
 +static void ioband_group_status(struct ioband_group *gp, int *szp,
-+					char *result, unsigned int maxlen)
++				char *result, unsigned maxlen)
 +{
 +	struct ioband_group_stat *stat;
 +	int i, sz = *szp; /* used in DMEMIT() */
@@ -2049,14 +2038,14 @@
 +	for (i = 0; i < 2; i++) {
 +		stat = &gp->c_stat[i];
 +		DMEMIT(" %lu %lu %lu",
-+			stat->immediate + stat->deferred, stat->deferred,
-+			stat->sectors);
++		       stat->immediate + stat->deferred, stat->deferred,
++		       stat->sectors);
 +	}
 +	*szp = sz;
 +}
 +
 +static int ioband_status(struct dm_target *ti, status_type_t type,
-+					char *result, unsigned int maxlen)
++			 char *result, unsigned maxlen)
 +{
 +	struct ioband_group *gp = ti->private, *p;
 +	struct ioband_device *dp = gp->c_banddev;
@@ -2072,7 +2061,7 @@
 +		DMEMIT("%s", dp->g_name);
 +		ioband_group_status(gp, &sz, result, maxlen);
 +		for (node = rb_first(&gp->c_group_root); node;
-+						node = rb_next(node)) {
++		     node = rb_next(node)) {
 +			p = rb_entry(node, struct ioband_group, c_group_node);
 +			ioband_group_status(p, &sz, result, maxlen);
 +		}
@@ -2082,9 +2071,9 @@
 +	case STATUSTYPE_TABLE:
 +		spin_lock_irqsave(&dp->g_lock, flags);
 +		DMEMIT("%s %s %d %d %s %s",
-+				gp->c_dev->name, dp->g_name,
-+				dp->g_io_throttle, dp->g_io_limit[0],
-+				gp->c_type->t_name, dp->g_policy->p_name);
++		       gp->c_dev->name, dp->g_name,
++		       dp->g_io_throttle, dp->g_io_limit[READ],
++		       gp->c_type->t_name, dp->g_policy->p_name);
 +		dp->g_show(gp, &sz, result, maxlen);
 +		spin_unlock_irqrestore(&dp->g_lock, flags);
 +		break;
@@ -2217,8 +2206,7 @@
 + *		"weight" 0:<value>
 + *		"token"  24:<value>
 + */
-+static int __ioband_message(struct dm_target *ti,
-+					unsigned int argc, char **argv)
++static int __ioband_message(struct dm_target *ti, unsigned argc, char **argv)
 +{
 +	struct ioband_group *gp = ti->private, *p;
 +	struct ioband_device *dp = gp->c_banddev;
@@ -2231,7 +2219,7 @@
 +		spin_lock_irqsave(&dp->g_lock, flags);
 +		memset(gp->c_stat, 0, sizeof(gp->c_stat));
 +		for (node = rb_first(&gp->c_group_root); node;
-+						 node = rb_next(node)) {
++		     node = rb_next(node)) {
 +			p = rb_entry(node, struct ioband_group, c_group_node);
 +			memset(p->c_stat, 0, sizeof(p->c_stat));
 +		}
@@ -2243,17 +2231,11 @@
 +		DMWARN("Unrecognised band message received.");
 +		return -EINVAL;
 +	}
-+	if (!strcmp(argv[0], "debug")) {
-+		r = strict_strtol(argv[1], 0, &val);
-+		if (r || val < 0)
-+			return -EINVAL;
-+		ioband_debug = val;
-+		return 0;
-+	} else if (!strcmp(argv[0], "io_throttle")) {
++	if (!strcmp(argv[0], "io_throttle")) {
 +		r = strict_strtol(argv[1], 0, &val);
 +		spin_lock_irqsave(&dp->g_lock, flags);
 +		if (r || val < 0 ||
-+			val > dp->g_io_limit[0] || val > dp->g_io_limit[1]) {
++		    val > dp->g_io_limit[READ] || val > dp->g_io_limit[WRITE]) {
 +			spin_unlock_irqrestore(&dp->g_lock, flags);
 +			return -EINVAL;
 +		}
@@ -2280,7 +2262,7 @@
 +			spin_unlock_irqrestore(&dp->g_lock, flags);
 +			return -EINVAL;
 +		}
-+		dp->g_io_limit[0] = dp->g_io_limit[1] = val;
++		dp->g_io_limit[READ] = dp->g_io_limit[WRITE] = val;
 +		spin_unlock_irqrestore(&dp->g_lock, flags);
 +		ioband_set_param(gp, argv[0], argv[1]);
 +		return 0;
@@ -2309,7 +2291,7 @@
 +	return 0;
 +}
 +
-+static int ioband_message(struct dm_target *ti, unsigned int argc, char **argv)
++static int ioband_message(struct dm_target *ti, unsigned argc, char **argv)
 +{
 +	struct ioband_group *gp = ti->private;
 +	struct ioband_device *dp = gp->c_banddev;
@@ -2322,7 +2304,7 @@
 +}
 +
 +static int ioband_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
-+					struct bio_vec *biovec, int max_size)
++			struct bio_vec *biovec, int max_size)
 +{
 +	struct ioband_group *gp = ti->private;
 +	struct request_queue *q = bdev_get_queue(gp->c_dev->bdev);
@@ -2339,7 +2321,7 @@
 +static struct target_type ioband_target = {
 +	.name	     = "ioband",
 +	.module      = THIS_MODULE,
-+	.version     = {1, 10, 0},
++	.version     = {1, 10, 1},
 +	.ctr	     = ioband_ctr,
 +	.dtr	     = ioband_dtr,
 +	.map	     = ioband_map,
@@ -2375,11 +2357,11 @@
 +MODULE_AUTHOR("Hirokazu Takahashi <taka@valinux.co.jp>, "
 +	      "Ryo Tsuruta <ryov@valinux.co.jp");
 +MODULE_LICENSE("GPL");
-Index: linux/drivers/md/dm-ioband-policy.c
+Index: linux-2.6.29-rc2/drivers/md/dm-ioband-policy.c
 ===================================================================
---- /dev/null	1970-01-01 00:00:00.000000000 +0000
-+++ linux/drivers/md/dm-ioband-policy.c	2009-01-20 14:43:16.000000000 +0000
-@@ -0,0 +1,460 @@
+--- /dev/null
++++ linux-2.6.29-rc2/drivers/md/dm-ioband-policy.c
+@@ -0,0 +1,457 @@
 +/*
 + * Copyright (C) 2008 VA Linux Systems Japan K.K.
 + *
@@ -2400,7 +2382,6 @@
 + * It is possible to add a new BIO scheduling policy with it.
 + */
 +
-+
 +/*
 + * Functions for weight balancing policy based on the number of I/Os.
 + */
@@ -2458,13 +2439,13 @@
 +	if (gp) {
 +		int iopri = iopriority(gp);
 +		if (iopri * PROCEED_THRESHOLD > IOBAND_IOPRIO_BASE &&
-+			dp->g_issued[0] + dp->g_issued[1] >= dp->g_io_throttle)
++		    dp->g_issued[READ] + dp->g_issued[WRITE] >=
++		    dp->g_io_throttle)
 +			return 0;
 +	}
 +
 +	dp->g_epoch++;
-+	dprintk(KERN_ERR "make_epoch %d --> %d\n",
-+						dp->g_epoch-1, dp->g_epoch);
++	DMDEBUG("make_epoch %d", dp->g_epoch);
 +
 +	/* The leftover tokens will be used in the next epoch. */
 +	dp->g_token_extra = dp->g_token_left;
@@ -2527,20 +2508,18 @@
 +	 * tokens on this ioband device from the previous epoch.
 +	 */
 +	extra = dp->g_token_extra * gp->c_token_initial /
-+				 (dp->g_token_bucket - dp->g_token_extra/2);
++	    (dp->g_token_bucket - dp->g_token_extra / 2);
 +	delta += extra;
 +	gp->c_token += delta;
 +	gp->c_consumed = 0;
 +
 +	if (gp == dp->g_current)
 +		dp->g_yield_mark += delta;
-+	dprintk(KERN_ERR "refill token: "
-+		"gp:%p token:%d->%d extra(%d) allowance(%d)\n",
++	DMDEBUG("refill token: gp:%p token:%d->%d extra(%d) allowance(%d)",
 +		gp, gp->c_token - delta, gp->c_token, extra, allowance);
 +	if (gp->c_token > 0)
 +		return iopriority(gp);
-+	dprintk(KERN_ERR "refill token: yet empty gp:%p token:%d\n",
-+						gp, gp->c_token);
++	DMDEBUG("refill token: yet empty gp:%p token:%d", gp, gp->c_token);
 +	return 0;
 +}
 +
@@ -2553,10 +2532,10 @@
 +	struct ioband_device *dp = gp->c_banddev;
 +
 +	if (gp->c_consumed * LOCAL_ACTIVE_RATIO < gp->c_token_initial &&
-+		gp->c_consumed * GLOBAL_ACTIVE_RATIO < dp->g_token_bucket) {
++	    gp->c_consumed * GLOBAL_ACTIVE_RATIO < dp->g_token_bucket) {
 +		; /* Do nothing unless this group is really active. */
 +	} else if (!dp->g_dominant ||
-+			get_token(gp) > get_token(dp->g_dominant)) {
++		   get_token(gp) > get_token(dp->g_dominant)) {
 +		/*
 +		 * Regard this group as the dominant group on this
 +		 * ioband device when it has larger number of tokens
@@ -2565,7 +2544,7 @@
 +		dp->g_dominant = gp;
 +	}
 +	if (dp->g_epoch == gp->c_my_epoch &&
-+			gp->c_token > 0 && gp->c_token - count <= 0) {
++	    gp->c_token > 0 && gp->c_token - count <= 0) {
 +		/* Remember the last group which used up its own tokens. */
 +		dp->g_expired = gp;
 +		if (dp->g_dominant == gp)
@@ -2576,7 +2555,7 @@
 +		/* This group is the current already. */
 +		dp->g_current = gp;
 +		dp->g_yield_mark =
-+			gp->c_token - (TOKEN_BATCH_UNIT << dp->g_token_unit);
++		    gp->c_token - (TOKEN_BATCH_UNIT << dp->g_token_unit);
 +	}
 +	gp->c_token -= count;
 +	gp->c_consumed += count;
@@ -2628,19 +2607,20 @@
 +			p->c_token = p->c_token_initial =
 +				dp->g_token_bucket * p->c_weight /
 +				dp->g_weight_total + 1;
-+			p->c_limit = (dp->g_io_limit[0] + dp->g_io_limit[1]) *
-+				p->c_weight / dp->g_weight_total /
-+				OVERCOMMIT_RATE + 1;
++			p->c_limit = (dp->g_io_limit[READ] +
++				dp->g_io_limit[WRITE]) * p->c_weight /
++				dp->g_weight_total / OVERCOMMIT_RATE + 1;
 +		}
 +	}
 +}
 +
 +static void init_token_bucket(struct ioband_device *dp,
-+					int token_bucket, int carryover)
++			      int token_bucket, int carryover)
 +{
 +	if (!token_bucket)
-+		dp->g_token_bucket = ((dp->g_io_limit[0] + dp->g_io_limit[1]) *
-+					DEFAULT_BUCKET) << dp->g_token_unit;
++		dp->g_token_bucket =
++			((dp->g_io_limit[READ] + dp->g_io_limit[WRITE]) *
++			DEFAULT_BUCKET) << dp->g_token_unit;
 +	else
 +		dp->g_token_bucket = token_bucket;
 +	if (!carryover)
@@ -2709,12 +2689,12 @@
 +}
 +
 +static void policy_weight_show(struct ioband_group *gp, int *szp,
-+					char *result, unsigned int maxlen)
++			       char *result, unsigned maxlen)
 +{
 +	struct ioband_group *p;
 +	struct ioband_device *dp = gp->c_banddev;
 +	struct rb_node *node;
-+	int sz = *szp; /* used in DMEMIT() */
++	int sz = *szp;	/* used in DMEMIT() */
 +
 +	DMEMIT(" %d :%d", dp->g_token_bucket, gp->c_weight);
 +
@@ -2774,7 +2754,7 @@
 +	dp->g_group_dtr = policy_weight_dtr;
 +	dp->g_set_param = policy_weight_param;
 +	dp->g_should_block = is_queue_full;
-+	dp->g_show  = policy_weight_show;
++	dp->g_show = policy_weight_show;
 +
 +	dp->g_epoch = 0;
 +	dp->g_weight_total = 0;
@@ -2788,8 +2768,8 @@
 +
 +	return 0;
 +}
-+/* weight balancing policy based on the number of I/Os. --- End --- */
 +
++/* weight balancing policy based on the number of I/Os. --- End --- */
 +
 +/*
 + * Functions for weight balancing policy based on I/O size.
@@ -2802,7 +2782,7 @@
 +}
 +
 +static int w2_policy_weight_init(struct ioband_device *dp,
-+							int argc, char **argv)
++				 int argc, char **argv)
 +{
 +	long val;
 +	int r = 0;
@@ -2825,11 +2805,10 @@
 +	dp->g_token_left = dp->g_token_bucket;
 +	return 0;
 +}
-+/* weight balancing policy based on I/O size. --- End --- */
 +
++/* weight balancing policy based on I/O size. --- End --- */
 +
-+static int policy_default_init(struct ioband_device *dp,
-+					int argc, char **argv)
++static int policy_default_init(struct ioband_device *dp, int argc, char **argv)
 +{
 +	return policy_weight_init(dp, argc, argv);
 +}
@@ -2838,13 +2817,13 @@
 +	{"default", policy_default_init},
 +	{"weight", policy_weight_init},
 +	{"weight-iosize", w2_policy_weight_init},
-+	{NULL,     policy_default_init}
++	{NULL, policy_default_init}
 +};
-Index: linux/drivers/md/dm-ioband-type.c
+Index: linux-2.6.29-rc2/drivers/md/dm-ioband-type.c
 ===================================================================
---- /dev/null	1970-01-01 00:00:00.000000000 +0000
-+++ linux/drivers/md/dm-ioband-type.c	2009-01-20 14:43:16.000000000 +0000
-@@ -0,0 +1,76 @@
+--- /dev/null
++++ linux-2.6.29-rc2/drivers/md/dm-ioband-type.c
+@@ -0,0 +1,77 @@
 +/*
 + * Copyright (C) 2008 VA Linux Systems Japan K.K.
 + *
@@ -2900,32 +2879,33 @@
 +
 +static int ioband_cgroup(struct bio *bio)
 +{
-+  /*
-+   * This function should return the ID of the cgroup which issued "bio".
-+   * The ID of the cgroup which the current process belongs to won't be
-+   * suitable ID for this purpose, since some BIOs will be handled by kernel
-+   * threads like aio or pdflush on behalf of the process requesting the BIOs.
-+   */
++	/*
++	 * This function should return the ID of the cgroup which
++	 * issued "bio". The ID of the cgroup which the current
++	 * process belongs to won't be suitable ID for this purpose,
++	 * since some BIOs will be handled by kernel threads like aio
++	 * or pdflush on behalf of the process requesting the BIOs.
++	 */
 +	return 0;	/* not implemented yet */
 +}
 +
 +struct group_type dm_ioband_group_type[] = {
-+	{"none",   NULL},
-+	{"pgrp",   ioband_process_group},
-+	{"pid",    ioband_process_id},
-+	{"node",   ioband_node},
++	{"none", NULL},
++	{"pgrp", ioband_process_group},
++	{"pid", ioband_process_id},
++	{"node", ioband_node},
 +	{"cpuset", ioband_cpuset},
 +	{"cgroup", ioband_cgroup},
-+	{"user",   ioband_uid},
-+	{"uid",    ioband_uid},
-+	{"gid",    ioband_gid},
-+	{NULL,     NULL}
++	{"user", ioband_uid},
++	{"uid", ioband_uid},
++	{"gid", ioband_gid},
++	{NULL, NULL}
 +};
-Index: linux/drivers/md/dm-ioband.h
+Index: linux-2.6.29-rc2/drivers/md/dm-ioband.h
 ===================================================================
---- /dev/null	1970-01-01 00:00:00.000000000 +0000
-+++ linux/drivers/md/dm-ioband.h	2009-01-20 14:43:16.000000000 +0000
-@@ -0,0 +1,194 @@
+--- /dev/null
++++ linux-2.6.29-rc2/drivers/md/dm-ioband.h
+@@ -0,0 +1,186 @@
 +/*
 + * Copyright (C) 2008 VA Linux Systems Japan K.K.
 + *
@@ -2937,6 +2917,8 @@
 +#include <linux/version.h>
 +#include <linux/wait.h>
 +
++#define DM_MSG_PREFIX "ioband"
++
 +#define DEFAULT_IO_THROTTLE	4
 +#define DEFAULT_IO_LIMIT	128
 +#define IOBAND_NAME_MAX 31
@@ -2945,94 +2927,94 @@
 +struct ioband_group;
 +
 +struct ioband_device {
-+	struct list_head	g_groups;
-+	struct delayed_work     g_conductor;
-+	struct workqueue_struct	*g_ioband_wq;
-+	struct	bio_list	g_urgent_bios;
-+	int	g_io_throttle;
-+	int	g_io_limit[2];
-+	int	g_issued[2];
-+	int	g_blocked;
-+	spinlock_t	g_lock;
-+	struct mutex	g_lock_device;
++	struct list_head g_groups;
++	struct delayed_work g_conductor;
++	struct workqueue_struct *g_ioband_wq;
++	struct bio_list g_urgent_bios;
++	int g_io_throttle;
++	int g_io_limit[2];
++	int g_issued[2];
++	int g_blocked;
++	spinlock_t g_lock;
++	struct mutex g_lock_device;
 +	wait_queue_head_t g_waitq;
 +	wait_queue_head_t g_waitq_suspend;
 +	wait_queue_head_t g_waitq_flush;
 +
-+	int	g_ref;
-+	struct	list_head g_list;
-+	int	g_flags;
-+	char	g_name[IOBAND_NAME_MAX + 1];
-+	struct	policy_type *g_policy;
++	int g_ref;
++	struct list_head g_list;
++	int g_flags;
++	char g_name[IOBAND_NAME_MAX + 1];
++	struct policy_type *g_policy;
 +
 +	/* policy dependent */
-+	int	(*g_can_submit)(struct ioband_group *);
-+	int	(*g_prepare_bio)(struct ioband_group *, struct bio *, int);
-+	int	(*g_restart_bios)(struct ioband_device *);
-+	void	(*g_hold_bio)(struct ioband_group *, struct bio *);
-+	struct bio * (*g_pop_bio)(struct ioband_group *);
-+	int	(*g_group_ctr)(struct ioband_group *, char *);
-+	void	(*g_group_dtr)(struct ioband_group *);
-+	int	(*g_set_param)(struct ioband_group *, char *cmd, char *value);
-+	int	(*g_should_block)(struct ioband_group *);
-+	void	(*g_show)(struct ioband_group *, int *, char *, unsigned int);
++	int (*g_can_submit) (struct ioband_group *);
++	int (*g_prepare_bio) (struct ioband_group *, struct bio *, int);
++	int (*g_restart_bios) (struct ioband_device *);
++	void (*g_hold_bio) (struct ioband_group *, struct bio *);
++	struct bio *(*g_pop_bio) (struct ioband_group *);
++	int (*g_group_ctr) (struct ioband_group *, char *);
++	void (*g_group_dtr) (struct ioband_group *);
++	int (*g_set_param) (struct ioband_group *, char *cmd, char *value);
++	int (*g_should_block) (struct ioband_group *);
++	void (*g_show) (struct ioband_group *, int *, char *, unsigned);
 +
 +	/* members for weight balancing policy */
-+	int	g_epoch;
-+	int	g_weight_total;
-+		/* the number of tokens which can be used in every epoch */
-+	int	g_token_bucket;
-+		/* how many epochs tokens can be carried over */
-+	int	g_carryover;
-+		/* how many tokens should be used for one page-sized I/O */
-+	int	g_token_unit;
-+		/* the last group which used a token */
++	int g_epoch;
++	int g_weight_total;
++	/* the number of tokens which can be used in every epoch */
++	int g_token_bucket;
++	/* how many epochs tokens can be carried over */
++	int g_carryover;
++	/* how many tokens should be used for one page-sized I/O */
++	int g_token_unit;
++	/* the last group which used a token */
 +	struct ioband_group *g_current;
-+		/* give another group a chance to be scheduled when the rest
-+		   of tokens of the current group reaches this mark */
-+	int	g_yield_mark;
-+		/* the latest group which used up its tokens */
++	/* give another group a chance to be scheduled when the rest
++	   of tokens of the current group reaches this mark */
++	int g_yield_mark;
++	/* the latest group which used up its tokens */
 +	struct ioband_group *g_expired;
-+		/* the group which has the largest number of tokens in the
-+		   active groups */
++	/* the group which has the largest number of tokens in the
++	   active groups */
 +	struct ioband_group *g_dominant;
-+		/* the number of unused tokens in this epoch */
-+	int	g_token_left;
-+		/* left-over tokens from the previous epoch */
-+	int	g_token_extra;
++	/* the number of unused tokens in this epoch */
++	int g_token_left;
++	/* left-over tokens from the previous epoch */
++	int g_token_extra;
 +};
 +
 +struct ioband_group_stat {
-+	unsigned long	sectors;
-+	unsigned long	immediate;
-+	unsigned long	deferred;
++	unsigned long sectors;
++	unsigned long immediate;
++	unsigned long deferred;
 +};
 +
 +struct ioband_group {
-+	struct	list_head c_list;
++	struct list_head c_list;
 +	struct ioband_device *c_banddev;
 +	struct dm_dev *c_dev;
 +	struct dm_target *c_target;
-+	struct	bio_list c_blocked_bios;
-+	struct	bio_list c_prio_bios;
-+	struct	rb_root c_group_root;
-+	struct  rb_node c_group_node;
-+	int	c_id;	/* should be unsigned long or unsigned long long */
-+	char	c_name[IOBAND_NAME_MAX + 1];	/* rfu */
-+	int	c_blocked;
-+	int	c_prio_blocked;
++	struct bio_list c_blocked_bios;
++	struct bio_list c_prio_bios;
++	struct rb_root c_group_root;
++	struct rb_node c_group_node;
++	int c_id;	/* should be unsigned long or unsigned long long */
++	char c_name[IOBAND_NAME_MAX + 1];	/* rfu */
++	int c_blocked;
++	int c_prio_blocked;
 +	wait_queue_head_t c_waitq;
-+	int	c_flags;
-+	struct	ioband_group_stat c_stat[2];	/* hold rd/wr status */
-+	struct	group_type *c_type;
++	int c_flags;
++	struct ioband_group_stat c_stat[2];	/* hold rd/wr status */
++	struct group_type *c_type;
 +
 +	/* members for weight balancing policy */
-+	int	c_weight;
-+	int	c_my_epoch;
-+	int	c_token;
-+	int	c_token_initial;
-+	int	c_limit;
-+	int     c_consumed;
++	int c_weight;
++	int c_my_epoch;
++	int c_token;
++	int c_token_initial;
++	int c_limit;
++	int c_consumed;
 +
 +	/* rfu */
 +	/* struct bio_list	c_ordered_tag_bios; */
@@ -3097,26 +3079,16 @@
 +#define is_prio_queue(gp)		((gp)->c_flags & IOG_PRIO_QUEUE)
 +#define prio_queue_direct(gp)		((gp)->c_flags & IOG_PRIO_BIO_WRITE)
 +
-+
 +struct policy_type {
 +	const char *p_name;
-+	int	  (*p_policy_init)(struct ioband_device *, int, char **);
++	int (*p_policy_init) (struct ioband_device *, int, char **);
 +};
 +
 +extern struct policy_type dm_ioband_policy_type[];
 +
 +struct group_type {
 +	const char *t_name;
-+	int	  (*t_getid)(struct bio *);
++	int (*t_getid) (struct bio *);
 +};
 +
 +extern struct group_type dm_ioband_group_type[];
-+
-+/* Just for debugging */
-+extern long ioband_debug;
-+#define dprintk(format, a...) do { \
-+	if (ioband_debug > 0) {	\
-+		ioband_debug--; \
-+		printk(format, ##a); \
-+	} \
-+} while (0)

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-20 15:53   ` Alasdair G Kergon
@ 2009-01-22 16:12         ` Vivek Goyal
  0 siblings, 0 replies; 23+ messages in thread
From: Vivek Goyal @ 2009-01-22 16:12 UTC (permalink / raw)
  To: Ryo Tsuruta, dm-devel-H+wXaHxf7aLQT0dZR+AlfA, Alasdair G Kergon
  Cc: Chris Wright, Paul Menage, paolo.valente-rcYM44yAMweonA0d6jMUrA,
	Dhaval Giani, Rik Van Riel,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux kernel mailing list, fchecconi-Re5JQEeQqe8AvxtiuMwx3w,
	arozansk-H+wXaHxf7aLQT0dZR+AlfA,
	jens.axboe-QHcLZuEGTsvQT0dZR+AlfA, jmoyer-H+wXaHxf7aLQT0dZR+AlfA,
	fernando-w0OK63jvRlAuJ+9fw/WgBHgSJqDPrsil, Balbir Singh

On Tue, Jan 20, 2009 at 03:53:34PM +0000, Alasdair G Kergon wrote:
> So, what needs to be reviewed?
> 
> 
> 1. General style/layout/naming cleanup.
> - It's pretty good compared to a lot of patches that get sent, but there are
> still a few things we can improve.
> 
> Lindent is throwing up some stuff (but remember it doesn't get things
> perfect so don't make all the changes it recommends).
> Remove double blank lines, plenty of unnecessary braces, "unsigned int" ->
> "unsigned".  
> Review the names - should a few more things get a DM_ or dm_ prefix?
> Are all the names consistent and as self-explanatory as they can reasonably be?
> (e.g. a variable called 'new' - new what?)
> 
> 
> 2. A high-level review.
> Review the documentation and what the code does and how it does it.
> - Does it make sense to add this to the kernel in this way?
> 

CCing lkml, containers mailing list and other folks who might be 
interested in the thread.

It is a long mail. You have been warned. :-)

Here are some of my thoughts and also summary of some of the past
discussions about dm-ioband either on lkml or off lkml. Following is
one of the relevant link of past discussion on lkml about this.

http://lkml.org/lkml/2008/11/6/227

At this point of time looks like there are two schools of thought regarding
how IO controller should be implemented. The first one believes (dm-ioband)
that io controller should be implemented as 2 level approach where higher
level of IO control is done by this dm-ioband driver and lower level of
scheduling is done by elevator/iosched code (noop, deadline, AS and cfq).

Second school of thought (me, Nauman from Google, and maybe others) believes
that introducing another level of IO control at a higher layer breaks the
assumptions of lower level scheduling, hence we should be doing IO control
and IO scheduling both in a single layer, and that is at the elevator layer.

Before I dive into the details of how the assumptions are broken, let me
discuss the requirement part a bit. We seem to be differing on the
requirement part also.

I think that we need IO control only at the point where the real contention
is, and not on every logical block device where there is no real contention
for the resource. Real contention for the resource is at the end-node
physical device, where the device is slow, and that is where the need for
some kind of resource control arises.

I am not very sure why dm-ioband folks want to enable IO control on any
xyz block device but in the past I got two responses.

1. Need to control end devices which don't have any elevator attached.
2. Need to do IO control for devices which are effectively network backed.
  for example, an NFS mounted file loop mounted as a block device.

I don't fully understand the first requirement. Which are the device drivers
that don't use any of the standard ioschedulers? I am not aware of any
in-kernel drivers and I am assuming it will be binary drivers.  If that's the
case, then those binary drivers need to be modified to take advantage of IO
control provided by the elevator layer.

Regarding the second requirement I think this sounds more like a network
controller issue. Again the real contention is at network layer and not
at logical block device.

So at this point of time my understanding is that the most common case for
IO resource control is at the end devices in the system and it can be
controlled by one level of IO control and scheduling. Please correct
me if that's not the case from a requirement point of view.

Having said that even if we really find genuine cases where we need to
control IO on any xyz block device, then we should be able to come
up with generic IO controller which can reuse some of the code from 1
level controller. I am not against that and I think probably 1 level IO
controller and a generic IO controller can co-exist. But there are few points
which I find little odd about dm-ioband.

Why generic IO controller is not good for every case
====================================================
To my knowledge, there have been two generic controller implementations.
One is dm-ioband and other is an RFC patch by me. Following is the link.

http://lkml.org/lkml/2008/11/6/227

The biggest issue with generic controllers is that they can buffer the
bios at a higher layer (once a cgroup is backed up) and then later release
those bios in FIFO manner. This can conflict with the underlying IO
scheduler's assumptions. The following example comes to mind.

- If there is one task of io priority 0 in a cgroup and rest of the tasks
  are of io prio 7. All the tasks belong to best effort class. If tasks of
  lower priority (7) do lot of IO, then due to buffering there is a chance
  that IO from lower prio tasks is seen by CFQ first and io from higher prio
  task is not seen by cfq for quite some time hence that task not getting it
  fair share with in the cgroup. Similar situation can arise with RT tasks
  also.
 

Some of the issues with dm-ioband implementation
===============================================
- Breaks the assumptions of underlying IO schedulers.
	- There is no notion of task classes. So tasks of all the classes are
	  at same level from resource contention point of view. The only thing
	  which differentiates them is cgroup weight. Which does not answer the
	  question that an RT task or RT cgroup should starve the peer cgroup
	  if need be as RT cgroup should get priority access.

	- Because of FIFO release of buffered bios, it is possible that task
	  of lower priority gets more IO done than the task of higher
	  priority.

- Task grouping logic
	- We already have the notion of cgroup where tasks can be grouped
	  in a hierarchical manner. dm-ioband does not make full use of that
	  and comes up with its own mechanism of grouping tasks (apart from
	  cgroup). And there are odd ways of specifying the cgroup id when
	  configuring the dm-ioband device. I think once somebody has created
	  the cgroup hierarchy, any IO controller logic should be able to
	  internally read that hierarchy and provide control. There should not
	  be a need for any other configuration utility on top of cgroup.

	  My RFC patches had done that.


- Need of a dm device for every device we want to control

	- This requirement looks odd. It forces everybody to use dm-tools
	  and if there are lots of disks in the system, configuration is
	  a pain.

- Does it support hierarchical grouping?

	- I have not looked very closely at dm-ioband patches about this and
	  had asked ryo a question about this (no response).

	  Ryo, does dm-ioband support hierarchical grouping configuration?

Summary
=======
- IMHO, for the common case we don't need a generic IO controller, and by
  implementing an IO controller at the elevator layer with close coupling
  to the io schedulers, we should be able to achieve the goal.

  Currently there is work in progress (off the list) by me, nauman, Fabio,
  Paolo and others to implement a common IO control layer which can be
  used by all the four IO schedulers without too much of code duplication.
  Hopefully in next 2-3 weeks we should be able to post the initial patches
  for RFC.

- Even if there are cases for controlling a xyz block device, we can have
  a generic io controller also to cover that case. Ideally this controller
  should not be used by devices which use standard io schedulers.

  IMHO, dm-ioband has a few odd points, as mentioned above, when it comes to
  a generic controller, and I think those should be addressed if we can
  really justify the need for a generic IO controller.

Your comments are welcome.    

Thanks
Vivek

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
@ 2009-01-22 16:12         ` Vivek Goyal
  0 siblings, 0 replies; 23+ messages in thread
From: Vivek Goyal @ 2009-01-22 16:12 UTC (permalink / raw)
  To: Ryo Tsuruta, dm-devel, Alasdair G Kergon
  Cc: linux kernel mailing list, containers, Nauman Rafique, dpshah,
	lizf, mikew, fchecconi, paolo.valente, jens.axboe, fernando,
	s-uchida, taka, guijianfeng, arozansk, jmoyer, Rik Van Riel,
	Peter Zijlstra, Paul Menage, Balbir Singh, Dhaval Giani,
	Chris Wright

On Tue, Jan 20, 2009 at 03:53:34PM +0000, Alasdair G Kergon wrote:
> So, what needs to be reviewed?
> 
> 
> 1. General style/layout/naming cleanup.
> - It's pretty good compared to a lot of patches that get sent, but there are
> still a few things we can improve.
> 
> Lindent is throwing up some stuff (but remember it doesn't get things
> perfect so don't make all the changes it recommends).
> Remove double blank lines, plenty of unnecessary braces, "unsigned int" ->
> "unsigned".  
> Review the names - should a few more things get a DM_ or dm_ prefix?
> Are all the names consistent and as self-explanatory as they can reasonably be?
> (e.g. a variable called 'new' - new what?)
> 
> 
> 2. A high-level review.
> Review the documentation and what the code does and how it does it.
> - Does it make sense to add this to the kernel in this way?
> 

CCing lkml, containers mailing list and other folks who might be 
interested in the thread.

It is a long mail. You have been warned. :-)

Here are some of my thoughts and also summary of some of the past
discussions about dm-ioband either on lkml or off lkml. Following is
one of the relevant link of past discussion on lkml about this.

http://lkml.org/lkml/2008/11/6/227

At this point of time looks like there are two schools of thought regarding
how IO controller should be implemented. The first one believes (dm-ioband)
that io controller should be implemented as 2 level approach where higher
level of IO control is done by this dm-ioband driver and lower level of
scheduling is done by elevator/iosched code (noop, deadline, AS and cfq).

The second school of thought (me, nauman from google and maybe others) believes
that introducing another level of IO control at higher layer breaks the
assumptions of lower level scheduling hence we should be doing IO control
and IO scheduling both in a single layer and that is at elevator layer.

Before I dive into details how assumptions are broken, let me discuss
the requirement part a bit. We seem to be differing on the requirement part also.

I think that we need IO control only at the point where real contention
is and not on every logical block device where there is no real contention
for the resource. Real contention for the resource is at the end node
physical device where the device is slow and then arises the need of some kind
of resource control. 

I am not very sure why dm-ioband folks want to enable IO control on any
xyz block device but in the past I got two responses.

1. Need to control end devices which don't have any elevator attached.
2. Need to do IO control for devices which are effectively network backed.
  for example, an NFS mounted file loop mounted as a block device.

I don't fully understand the first requirement. Which are the device drivers
that don't use any of the standard ioschedulers? I am not aware of any
in-kernel drivers and I am assuming it will be binary drivers.  If that's the
case, then those binary drivers need to be modified to take advantage of IO
control provided by elevator layer.

Regarding the second requirement I think this sounds more like a network
controller issue. Again the real contention is at network layer and not
at logical block device.

So at this point of time my understanding is that the most common case for
IO resource control is at the end devices in the system and it can be
controlled by one level of IO control and scheduling. Please correct
me if that's not the case from the requirement point of view.

Having said that even if we really find genuine cases where we need to
control IO on any xyz block device, then we should be able to come
up with generic IO controller which can reuse some of the code from 1
level controller. I am not against that and I think probably 1 level IO
controller and a generic IO controller can co-exist. But there are few points
which I find little odd about dm-ioband.

Why generic IO controller is not good for every case
====================================================
To my knowledge, there have been two generic controller implementations.
One is dm-ioband and other is an RFC patch by me. Following is the link.

http://lkml.org/lkml/2008/11/6/227

The biggest issue with generic controllers is that they can buffer the
bio's at higher layer (once a cgroup is backed up) and then later release
those bios in FIFO manner. This can conflict with underlying IO scheduler's
assumptions. The following example comes to mind.

- If there is one task of io priority 0 in a cgroup and rest of the tasks
  are of io prio 7. All the tasks belong to best effort class. If tasks of
  lower priority (7) do lot of IO, then due to buffering there is a chance
  that IO from lower prio tasks is seen by CFQ first and io from higher prio
  task is not seen by cfq for quite some time hence that task not getting it
  fair share with in the cgroup. Similar situation can arise with RT tasks
  also.
 

Some of the issues with dm-ioband implementation
===============================================
- Breaks the assumptions of underlying IO schedulers.
	- There is no notion of task classes. So tasks of all the classes are
	  at same level from resource contention point of view. The only thing
	  which differentiates them is cgroup weight. Which does not answer the
	  question that an RT task or RT cgroup should starve the peer cgroup
	  if need be as RT cgroup should get priority access.

	- Because of FIFO release of buffered bios, it is possible that task
	  of lower priority gets more IO done than the task of higher
	  priority.

- Task grouping logic
	- We already have the notion of cgroup where tasks can be grouped
	  in hierarhical manner. dm-ioband does not make full use of that and
	  comes up with own mechansim of grouping tasks (apart from cgroup).
	  And there are odd ways of specifying cgroup id which configuring the
	  dm-ioband device. I think once somebody has created the cgroup
	  hieararchy, any IO controller logic should be able to internally
	  read that hiearchy and provide control. There should not be need
	  of any other configuration utity on top of cgroup.

	  My RFC patches had done that.


- Need of a dm device for every device we want to control

	- This requirement looks odd. It forces everybody to use dm-tools
	  and if there are lots of disks in the system, configuation is
	  pain.

- Does it support hiearhical grouping?

	- I have not looked very closely at dm-ioband patches about this and
	  had asked ryo a question about this (no response).

	  Ryo does, dm-ioband support hierarhical grouping configuration?

Summary
=======
- IMHO, for common case we don't need a generic IO controller and by
  implementing an IO controller at elevator layer with close coupling
  to io schedulers, we should be able to achive the goal.

  Currently there is work in progress (off the list) by me, nauman, Fabio,
  Paolo and others to implement a common IO control layer which can be
  used by all the four IO schedulers without too much of code duplication.
  Hopefully in next 2-3 weeks we should be able to post the initial patches
  for RFC.

- Even if there are cases for controlling a xyz block device, we can have
  a generic io controller also to cover that case. Ideally this controller
  should not be used by devices which use standard io schedulers.

  IMHO, dm-ioband as few odd points as mentioned above when it comes to
  generic controller and I think those should be addressed if we can really
  justify the need of a generic IO controller.

Your comments are welcome.    

Thanks
Vivek

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
       [not found]         ` <20090122161218.GA28795-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2009-01-23 10:14           ` Ryo Tsuruta
  0 siblings, 0 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-23 10:14 UTC (permalink / raw)
  To: vgoyal-H+wXaHxf7aLQT0dZR+AlfA
  Cc: dhaval-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, arozansk-H+wXaHxf7aLQT0dZR+AlfA,
	jens.axboe-QHcLZuEGTsvQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	paolo.valente-rcYM44yAMweonA0d6jMUrA,
	jmoyer-H+wXaHxf7aLQT0dZR+AlfA,
	fernando-w0OK63jvRlAuJ+9fw/WgBHgSJqDPrsil,
	riel-H+wXaHxf7aLQT0dZR+AlfA, fchecconi-Re5JQEeQqe8AvxtiuMwx3w,
	chrisw-H+wXaHxf7aLQT0dZR+AlfA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	menage-hpIqsD4AKlfQT0dZR+AlfA

Hi Vivek,

Thanks for your comments.

> I am not very sure why dm-ioband folks want to enable IO control on any
> xyz block device but in the past I got two responses.
> 
> 1. Need to control end devices which don't have any elevator attached.
> 2. Need to do IO control for devices which are effectively network backed.
>   for example, an NFS mounted file loop mounted as a block device.

The two responses are issues of IO scheduler based controllers, not
reasons why we implement the IO controller as a device mapper driver.
The reasons of that are:
- A user have a choice whether to use dm-ioband or not, and dm-ioband
  doesn't make any effects on the system if a user doesn't want to
  use it.
- The dm device is highly independent module, so we don't need to modify
  the existing kernel code including the IO schedulers. It can keep
  the IO scheduler implementation simple.

So, dm-ioband can co-exist with any other IO controllers from a
user's and kernel developer's perspective.

> Why generic IO controller is not good for every case
> ====================================================
> To my knowledge, there have been two generic controller implementations.
> One is dm-ioband and other is an RFC patch by me. Following is the link.
> 
> http://lkml.org/lkml/2008/11/6/227
> 
> The biggest issue with generic controller is that they can buffer the
> bio's at higher layer (once a cgroup is backed up) and then later release
> those bios in FIFO manner. This can conflict with unerlying IO scheduler's
> assumptions. Following  example comes to mind.

I don't think you are completely right.

> - If there is one task of io priority 0 in a cgroup and rest of the tasks
>   are of io prio 7. All the tasks belong to best effort class. If tasks of
>   lower priority (7) do lot of IO, then due to buffering there is a chance
>   that IO from lower prio tasks is seen by CFQ first and io from higher prio
>   task is not seen by cfq for quite some time hence that task not getting it
>   fair share with in the cgroup. Similar situation can arise with RT tasks
>   also.

Whether using dm-ioband or not, if the tasks of IO priority 7 do a lot
of IO, then the device queue is going to be full and tasks which try
to issue IOs are blocked until the queue gets a slot. The IOs are
backlogged even if they are issued from the task of IO priority 0.
I don't understand why you think it's the biggest issue. The same
thing is going to happen without dm-ioband. 

If I were you, I create two cgroups and let tasks of lower priority
belong to one cgroup and tasks of higher priority belong to another,
and give higher bandwidth to the cgroup to which the higher priority
tasks belong. What do you think about this way?

> - Task grouping logic
> 	- We already have the notion of cgroup where tasks can be grouped
> 	  in hierarhical manner. dm-ioband does not make full use of that and
> 	  comes up with own mechansim of grouping tasks (apart from cgroup).
> 	  And there are odd ways of specifying cgroup id which configuring the
> 	  dm-ioband device. I think once somebody has created the cgroup
> 	  hieararchy, any IO controller logic should be able to internally
> 	  read that hiearchy and provide control. There should not be need
> 	  of any other configuration utity on top of cgroup.
> 
> 	  My RFC patches had done that.

Dm-ioband can work with the bio-cgroup mechanism, which makes task groups
in manner of the cgroup, of course.
I already have a basic design to make dm-ioband support the cgroup
hierarchy. This should be started after the core code of bio-cgroup,
which helps trace each I/O requests, is merged in -mm tree.

And the reason dm-ioband uses cgroup id to specify a cgroup is that
the current cgroup infrastructure lacks features to manage resources
placed in the kernel modules.

> - Need of a dm device for every device we want to control
> 
> 	- This requirement looks odd. It forces everybody to use dm-tools
> 	  and if there are lots of disks in the system, configuation is
> 	  pain.

I don't think it's so pain. I think you are already using LVM devices on
your boxes. Setting up dm-ioband is the same as that for LVM. And some
scripts or something similar will help you set up them.

And it is also possible this algorithm can be directly implemented in the
block layer if this is really needed.

> - Does it support hiearhical grouping?
> 
> 	- I have not looked very closely at dm-ioband patches about this and
> 	  had asked ryo a question about this (no response).
> 
> 	  Ryo does, dm-ioband support hierarhical grouping configuration?

I'm sorry I missed your email with the question.
I already have a design plan for it and I will start to implement it
if there are a lot of requests for this. But I doubt this should be
implemented in kernel, which can be placed in user-land, such as
a daemon program.

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-22 16:12         ` Vivek Goyal
  (?)
  (?)
@ 2009-01-23 10:14         ` Ryo Tsuruta
       [not found]           ` <20090123.191404.39168431.ryov-jCdQPDEk3idL9jVzuh4AOg@public.gmane.org>
  2009-01-26 16:29           ` Vivek Goyal
  -1 siblings, 2 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-23 10:14 UTC (permalink / raw)
  To: vgoyal
  Cc: dm-devel, agk, linux-kernel, containers, nauman, dpshah, lizf,
	mikew, fchecconi, paolo.valente, jens.axboe, fernando, s-uchida,
	taka, guijianfeng, arozansk, jmoyer, riel, peterz, menage,
	balbir, dhaval, chrisw

Hi Vivek,

Thanks for your comments.

> I am not very sure why dm-ioband folks want to enable IO control on any
> xyz block device but in the past I got two responses.
> 
> 1. Need to control end devices which don't have any elevator attached.
> 2. Need to do IO control for devices which are effectively network backed.
>   for example, an NFS mounted file loop mounted as a block device.

The two responses are issues of IO scheduler based controllers, not
reasons why we implement the IO controller as a device mapper driver.
The reasons of that are:
- A user have a choice whether to use dm-ioband or not, and dm-ioband
  doesn't make any effects on the system if a user doesn't want to
  use it.
- The dm device is highly independent module, so we don't need to modify
  the existing kernel code including the IO schedulers. It can keep
  the IO scheduler implementation simple.

So, dm-ioband can co-exist with any other IO controllers from a
user's and kernel developer's perspective.

> Why generic IO controller is not good for every case
> ====================================================
> To my knowledge, there have been two generic controller implementations.
> One is dm-ioband and other is an RFC patch by me. Following is the link.
> 
> http://lkml.org/lkml/2008/11/6/227
> 
> The biggest issue with generic controller is that they can buffer the
> bio's at higher layer (once a cgroup is backed up) and then later release
> those bios in FIFO manner. This can conflict with unerlying IO scheduler's
> assumptions. Following  example comes to mind.

I don't think you are completely right.

> - If there is one task of io priority 0 in a cgroup and rest of the tasks
>   are of io prio 7. All the tasks belong to best effort class. If tasks of
>   lower priority (7) do lot of IO, then due to buffering there is a chance
>   that IO from lower prio tasks is seen by CFQ first and io from higher prio
>   task is not seen by cfq for quite some time hence that task not getting it
>   fair share with in the cgroup. Similar situation can arise with RT tasks
>   also.

Whether using dm-ioband or not, if the tasks of IO priority 7 do lot
of IO, then the device queue is going to be full and tasks which tries
to issue IOs are blocked until the queue get a slot. The IOs are
backlogged even if they are issued from the task of IO priority 0.
I don't understand why you think it's the biggest issue. The same
thing is going to happen without dm-ioband. 

If I were you, I create two cgroups and let tasks of lower priority
belong to one cgroup and tasks of higher priority belong to another,
and give higher bandwidth to the cgroup to which the higher priority
tasks belong. What do you think about this way?

> - Task grouping logic
> 	- We already have the notion of cgroup where tasks can be grouped
> 	  in hierarhical manner. dm-ioband does not make full use of that and
> 	  comes up with own mechansim of grouping tasks (apart from cgroup).
> 	  And there are odd ways of specifying cgroup id which configuring the
> 	  dm-ioband device. I think once somebody has created the cgroup
> 	  hieararchy, any IO controller logic should be able to internally
> 	  read that hiearchy and provide control. There should not be need
> 	  of any other configuration utity on top of cgroup.
> 
> 	  My RFC patches had done that.

Dm-ioband can work with the bio-cgroup mechanism, which makes task groups
in manner of the cgroup, of course.
I already have a basic design to make dm-ioband support the cgroup
hierarchy. This should be started after the core code of bio-cgroup,
which helps trace each I/O requests, is merged in -mm tree.

And the reason dm-ioband uses cgroup id to specify a cgroup is that
the current cgroup infrastructure lacks features to manage resources
placed in the kernel modules.

> - Need of a dm device for every device we want to control
> 
> 	- This requirement looks odd. It forces everybody to use dm-tools
> 	  and if there are lots of disks in the system, configuation is
> 	  pain.

I don't think it's so pain. I think you are already using LVM devices on
your boxes. Setting up dm-ioband is the same as that for LVM. And some
scripts or something similar will help you set up them.

And it is also possible this algorithm can be directly implemented in the
block layer if this is really needed.

> - Does it support hiearhical grouping?
> 
> 	- I have not looked very closely at dm-ioband patches about this and
> 	  had asked ryo a question about this (no response).
> 
> 	  Ryo does, dm-ioband support hierarhical grouping configuration?

I'm sorry I missed your email with the question.
I already have a design plan for it and I will start to implement it
if there are a lot of requests for this. But I doubt this should be
implemented in kernel, which can be placed in user-land, such as
a daemon program.

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
       [not found]           ` <20090123.191404.39168431.ryov-jCdQPDEk3idL9jVzuh4AOg@public.gmane.org>
@ 2009-01-26 16:29             ` Vivek Goyal
  0 siblings, 0 replies; 23+ messages in thread
From: Vivek Goyal @ 2009-01-26 16:29 UTC (permalink / raw)
  To: Ryo Tsuruta
  Cc: dhaval-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, arozansk-H+wXaHxf7aLQT0dZR+AlfA,
	jens.axboe-QHcLZuEGTsvQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	paolo.valente-rcYM44yAMweonA0d6jMUrA,
	jmoyer-H+wXaHxf7aLQT0dZR+AlfA,
	fernando-w0OK63jvRlAuJ+9fw/WgBHgSJqDPrsil,
	riel-H+wXaHxf7aLQT0dZR+AlfA, fchecconi-Re5JQEeQqe8AvxtiuMwx3w,
	chrisw-H+wXaHxf7aLQT0dZR+AlfA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	menage-hpIqsD4AKlfQT0dZR+AlfA

On Fri, Jan 23, 2009 at 07:14:04PM +0900, Ryo Tsuruta wrote:
> Hi Vivek,
> 
> Thanks for your comments.
> 
> > I am not very sure why dm-ioband folks want to enable IO control on any
> > xyz block device but in the past I got two responses.
> > 
> > 1. Need to control end devices which don't have any elevator attached.
> > 2. Need to do IO control for devices which are effectively network backed.
> >   for example, an NFS mounted file loop mounted as a block device.
> 
> The two responses are issues of IO scheduler based controllers, not
> reasons why we implement the IO controller as a device mapper driver.
> The reasons of that are:
> - A user have a choice whether to use dm-ioband or not, and dm-ioband
>   doesn't make any effects on the system if a user doesn't want to
>   use it.

Even in in-kernel solution, cgroup code will be compiled out if user
is not using IO controller. Some code might still be present in run time
but I don't think it will be any big run time penalty.
  
> - The dm device is highly independent module, so we don't need to modify
>   the existing kernel code including the IO schedulers. It can keep
>   the IO scheduler implementation simple.
> 

Agree that dm device is highly independent module but I think in this it
does not look like the right place to implement the  IO controller.

I think with the introduction of cgroup, IO scheduling has now become
hierarchical scheduling. Previously it was flat scheduling where there
was only one level. Now there can be multiple levels and each level
can have groups and queues. I don't think that we can break down the
hierarchical scheduling problem into two parts where the top level part is moved
into a module. It is something like saying that let's break out cpu group
scheduling into a separate module and it should not be part of kernel.

I think we need to implement this hierarchical IO scheduler in kernel which
can schedule groups as well as end level io queues (maintained by cfq,
deadline, as, or noop).

> So, dm-ioband can co-exist with any other IO controllers from a
> user's and kernel developer's perspective.

Just because device mapper framework allows one to implement IO controller
in a separate module, we should not implement it there. It will be
difficult to take care of issues like, configuration, breaking underlying IO
scheduler's assumptions, capability to treat tasks and groups at same level
etc.

> 
> > Why generic IO controller is not good for every case
> > ====================================================
> > To my knowledge, there have been two generic controller implementations.
> > One is dm-ioband and other is an RFC patch by me. Following is the link.
> > 
> > http://lkml.org/lkml/2008/11/6/227
> > 
> > The biggest issue with generic controller is that they can buffer the
> > bio's at higher layer (once a cgroup is backed up) and then later release
> > those bios in FIFO manner. This can conflict with unerlying IO scheduler's
> > assumptions. Following  example comes to mind.
> 
> I don't think you are completely right.
> 
> > - If there is one task of io priority 0 in a cgroup and rest of the tasks
> >   are of io prio 7. All the tasks belong to best effort class. If tasks of
> >   lower priority (7) do lot of IO, then due to buffering there is a chance
> >   that IO from lower prio tasks is seen by CFQ first and io from higher prio
> >   task is not seen by cfq for quite some time hence that task not getting it
> >   fair share with in the cgroup. Similar situation can arise with RT tasks
> >   also.
> 
> Whether using dm-ioband or not, if the tasks of IO priority 7 do lot
> of IO, then the device queue is going to be full and tasks which tries
> to issue IOs are blocked until the queue get a slot. The IOs are
> backlogged even if they are issued from the task of IO priority 0.
> I don't understand why you think it's the biggest issue. The same
> thing is going to happen without dm-ioband. 
> 

True that even limited availability of request descriptors can be a
bottleneck and can lead to same kind of issues but my contention is
that you are aggravating the problem. Putting a 2nd layer can break IO
scheduler's assumption even before underlying request queue is full.

So second level solution on top will increase the frequency of such
incidents where a lower priority task can run away with more job done than
high priority task because there are no separate queues for different
priority tasks and release of buffered bio is FIFO.

Secondly what happens to tasks of RT class? dm-ioband does not have any
notion of handling the RT cgroup or RT tasks.

Thirdly, doing any kind of resource control at higher level takes away the
capability to treat task and groups at same level. I have had this
discussion in other offline thread also where you are copied. I think
it is a good idea to treat tasks and groups at same level where possible
(depends if IO scheduler creates separate queues for tasks or not, cfq
does.) 

> If I were you, I create two cgroups and let tasks of lower priority
> belong to one cgroup and tasks of higher priority belong to another,
> and give higher bandwidth to the cgroup to which the higher priority
> tasks belong. What do you think about this way?
> 

I think this is not practical. What we are talking is that task
priority does not have any meaning. If we want service difference between
two tasks, we need to pack them in separate cgroup otherwise we can't
gurantee things. If we need to pack every task in separate cgroup then
why to even have the notion of task priority.  

> > - Task grouping logic
> > 	- We already have the notion of cgroup where tasks can be grouped
> > 	  in hierarhical manner. dm-ioband does not make full use of that and
> > 	  comes up with own mechansim of grouping tasks (apart from cgroup).
> > 	  And there are odd ways of specifying cgroup id which configuring the
> > 	  dm-ioband device. I think once somebody has created the cgroup
> > 	  hieararchy, any IO controller logic should be able to internally
> > 	  read that hiearchy and provide control. There should not be need
> > 	  of any other configuration utity on top of cgroup.
> > 
> > 	  My RFC patches had done that.
> 
> Dm-ioband can work with the bio-cgroup mechanism, which makes task groups
> in manner of the cgroup, of course.
> I already have a basic design to make dm-ioband support the cgroup
> hierarchy. This should be started after the core code of bio-cgroup,
> which helps trace each I/O requests, is merged in -mm tree.
> 

bio-cgroup patches are fine because they provide us the capability to 
map delayed writes to right cgroup. And it can be used by any IO
controller. 

> And the reason dm-ioband uses cgroup id to specify a cgroup is that
> the current cgroup infrastructure lacks features to manage resources
> placed in the kernel modules.

Can you elaborate on that please? We have heard in the past that cgroup 
does not give you enough flexibility but never got details.

In this case first you are forcing some functionalilty to go in a kernel
module and then coming up with tools for configuration. I never understood
that why don't you let the controller be inside the kernel, let it
directly interact with cgroup subsystem and work instead of first taking
the functionality out of kernel in a module and then justifying the case
that now we need new ways of configuring that module because cgroup
infrastructure is not sufficient.  

> 
> > - Need of a dm device for every device we want to control
> > 
> > 	- This requirement looks odd. It forces everybody to use dm-tools
> > 	  and if there are lots of disks in the system, configuation is
> > 	  pain.
> 
> I don't think it's so pain. I think you are already using LVM devices on
> your boxes. Setting up dm-ioband is the same as that for LVM. And some
> scripts or something similar will help you set up them.
> 

Not everybody uses LVM. Balbir had asked once, if there are thousands of 
disks in the system, does that mean I need to create this dm-ioband device
for all the disks?

> And it is also possible this algorithm can be directly implemented in the
> block layer if this is really needed.
> 
> > - Does it support hiearhical grouping?
> > 
> > 	- I have not looked very closely at dm-ioband patches about this and
> > 	  had asked ryo a question about this (no response).
> > 
> > 	  Ryo does, dm-ioband support hierarhical grouping configuration?
> 
> I'm sorry I missed your email with the question.
> I already have a design plan for it and I will start to implement it
> if there are a lot of requests for this. But I doubt this should be
> implemented in kernel, which can be placed in user-land, such as
> a daemon program.
> 

We do need hierarhical grouping facility for IO controller also.

I am not sure that I agree to the idea of implementing IO controller 
as a device mapper driver because device mapper framework allows it to
be implemented as a module and one can avoid putting code in kernel. At
this point of time, IMHO, I don't think that IO controller code living
inside the kernel is an issue. I would rather focus on rest of the issues.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-23 10:14         ` Ryo Tsuruta
       [not found]           ` <20090123.191404.39168431.ryov-jCdQPDEk3idL9jVzuh4AOg@public.gmane.org>
@ 2009-01-26 16:29           ` Vivek Goyal
       [not found]             ` <20090126162951.GI31802-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
                               ` (3 more replies)
  1 sibling, 4 replies; 23+ messages in thread
From: Vivek Goyal @ 2009-01-26 16:29 UTC (permalink / raw)
  To: Ryo Tsuruta
  Cc: dm-devel, agk, linux-kernel, containers, nauman, dpshah, lizf,
	mikew, fchecconi, paolo.valente, jens.axboe, fernando, s-uchida,
	taka, guijianfeng, arozansk, jmoyer, riel, peterz, menage,
	balbir, dhaval, chrisw

On Fri, Jan 23, 2009 at 07:14:04PM +0900, Ryo Tsuruta wrote:
> Hi Vivek,
> 
> Thanks for your comments.
> 
> > I am not very sure why dm-ioband folks want to enable IO control on any
> > xyz block device but in the past I got two responses.
> > 
> > 1. Need to control end devices which don't have any elevator attached.
> > 2. Need to do IO control for devices which are effectively network backed.
> >   for example, an NFS mounted file loop mounted as a block device.
> 
> The two responses are issues of IO scheduler based controllers, not
> reasons why we implement the IO controller as a device mapper driver.
> The reasons of that are:
> - A user have a choice whether to use dm-ioband or not, and dm-ioband
>   doesn't make any effects on the system if a user doesn't want to
>   use it.

Even in an in-kernel solution, the cgroup code will be compiled out if the user
is not using the IO controller. Some code might still be present at run time,
but I don't think it will be any big run-time penalty.
  
> - The dm device is highly independent module, so we don't need to modify
>   the existing kernel code including the IO schedulers. It can keep
>   the IO scheduler implementation simple.
> 

Agree that a dm device is a highly independent module, but I think it
does not look like the right place to implement the IO controller.

I think with the introduction of cgroups, IO scheduling has now become
hierarchical scheduling. Previously it was flat scheduling where there
was only one level. Now there can be multiple levels, and each level
can have groups and queues. I don't think that we can break down the
hierarchical scheduling problem into two parts where the top-level part is moved
into a module. It is something like saying let's break out cpu group
scheduling into a separate module so that it need not be part of the kernel.

I think we need to implement this hierarchical IO scheduler in the kernel, one
which can schedule groups as well as end-level io queues (maintained by cfq,
deadline, as, or noop).

> So, dm-ioband can co-exist with any other IO controllers from a
> user's and kernel developer's perspective.

Just because device mapper framework allows one to implement IO controller
in a separate module, we should not implement it there. It will be
difficult to take care of issues like, configuration, breaking underlying IO
scheduler's assumptions, capability to treat tasks and groups at same level
etc.

> 
> > Why generic IO controller is not good for every case
> > ====================================================
> > To my knowledge, there have been two generic controller implementations.
> > One is dm-ioband and other is an RFC patch by me. Following is the link.
> > 
> > http://lkml.org/lkml/2008/11/6/227
> > 
> > The biggest issue with generic controller is that they can buffer the
> > bio's at higher layer (once a cgroup is backed up) and then later release
> > those bios in FIFO manner. This can conflict with unerlying IO scheduler's
> > assumptions. Following  example comes to mind.
> 
> I don't think you are completely right.
> 
> > - If there is one task of io priority 0 in a cgroup and rest of the tasks
> >   are of io prio 7. All the tasks belong to best effort class. If tasks of
> >   lower priority (7) do lot of IO, then due to buffering there is a chance
> >   that IO from lower prio tasks is seen by CFQ first and io from higher prio
> >   task is not seen by cfq for quite some time hence that task not getting it
> >   fair share with in the cgroup. Similar situation can arise with RT tasks
> >   also.
> 
> Whether using dm-ioband or not, if the tasks of IO priority 7 do lot
> of IO, then the device queue is going to be full and tasks which tries
> to issue IOs are blocked until the queue get a slot. The IOs are
> backlogged even if they are issued from the task of IO priority 0.
> I don't understand why you think it's the biggest issue. The same
> thing is going to happen without dm-ioband. 
> 

True that even the limited availability of request descriptors can be a
bottleneck and can lead to the same kind of issues, but my contention is
that you are aggravating the problem. Putting in a 2nd layer can break the IO
scheduler's assumptions even before the underlying request queue is full.

So second level solution on top will increase the frequency of such
incidents where a lower priority task can run away with more job done than
high priority task because there are no separate queues for different
priority tasks and release of buffered bio is FIFO.

Secondly what happens to tasks of RT class? dm-ioband does not have any
notion of handling the RT cgroup or RT tasks.

Thirdly, doing any kind of resource control at higher level takes away the
capability to treat task and groups at same level. I have had this
discussion in other offline thread also where you are copied. I think
it is a good idea to treat tasks and groups at same level where possible
(depends if IO scheduler creates separate queues for tasks or not, cfq
does.) 

> If I were you, I create two cgroups and let tasks of lower priority
> belong to one cgroup and tasks of higher priority belong to another,
> and give higher bandwidth to the cgroup to which the higher priority
> tasks belong. What do you think about this way?
> 

I think this is not practical. What we are saying is that task
priority does not have any meaning. If we want a service difference between
two tasks, we need to pack them into separate cgroups, otherwise we can't
guarantee things. If we need to pack every task into a separate cgroup, then
why even have the notion of task priority?

> > - Task grouping logic
> > 	- We already have the notion of cgroup where tasks can be grouped
> > 	  in hierarhical manner. dm-ioband does not make full use of that and
> > 	  comes up with own mechansim of grouping tasks (apart from cgroup).
> > 	  And there are odd ways of specifying cgroup id which configuring the
> > 	  dm-ioband device. I think once somebody has created the cgroup
> > 	  hieararchy, any IO controller logic should be able to internally
> > 	  read that hiearchy and provide control. There should not be need
> > 	  of any other configuration utity on top of cgroup.
> > 
> > 	  My RFC patches had done that.
> 
> Dm-ioband can work with the bio-cgroup mechanism, which makes task groups
> in manner of the cgroup, of course.
> I already have a basic design to make dm-ioband support the cgroup
> hierarchy. This should be started after the core code of bio-cgroup,
> which helps trace each I/O requests, is merged in -mm tree.
> 

bio-cgroup patches are fine because they provide us the capability to 
map delayed writes to right cgroup. And it can be used by any IO
controller. 

> And the reason dm-ioband uses cgroup id to specify a cgroup is that
> the current cgroup infrastructure lacks features to manage resources
> placed in the kernel modules.

Can you elaborate on that please? We have heard in the past that cgroup 
does not give you enough flexibility but never got details.

In this case, first you are forcing some functionality to go into a kernel
module and then coming up with tools for configuration. I never understood
why you don't let the controller be inside the kernel and let it
directly interact with the cgroup subsystem, instead of first taking
the functionality out of the kernel into a module and then justifying the case
that now we need new ways of configuring that module because the cgroup
infrastructure is not sufficient.

> 
> > - Need of a dm device for every device we want to control
> > 
> > 	- This requirement looks odd. It forces everybody to use dm-tools
> > 	  and if there are lots of disks in the system, configuation is
> > 	  pain.
> 
> I don't think it's so pain. I think you are already using LVM devices on
> your boxes. Setting up dm-ioband is the same as that for LVM. And some
> scripts or something similar will help you set up them.
> 

Not everybody uses LVM. Balbir had asked once, if there are thousands of 
disks in the system, does that mean I need to create this dm-ioband device
for all the disks?

> And it is also possible this algorithm can be directly implemented in the
> block layer if this is really needed.
> 
> > - Does it support hiearhical grouping?
> > 
> > 	- I have not looked very closely at dm-ioband patches about this and
> > 	  had asked ryo a question about this (no response).
> > 
> > 	  Ryo does, dm-ioband support hierarhical grouping configuration?
> 
> I'm sorry I missed your email with the question.
> I already have a design plan for it and I will start to implement it
> if there are a lot of requests for this. But I doubt this should be
> implemented in kernel, which can be placed in user-land, such as
> a daemon program.
> 

We do need a hierarchical grouping facility for the IO controller as well.

I am not sure that I agree to the idea of implementing IO controller 
as a device mapper driver because device mapper framework allows it to
be implemented as a module and one can avoid putting code in kernel. At
this point of time, IMHO, I don't think that IO controller code living
inside the kernel is an issue. I would rather focus on rest of the issues.

Thanks
Vivek

^ permalink raw reply	[flat|nested] 23+ messages in thread

* 2-Level IO scheduling (Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch)
       [not found]             ` <20090126162951.GI31802-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2009-01-29  3:36               ` Ryo Tsuruta
  2009-01-29  3:39               ` Hierarchical grouping facility for IO controller " Ryo Tsuruta
  2009-01-29  3:41               ` Implementation of dm-ioband as a dm-driver " Ryo Tsuruta
  2 siblings, 0 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-29  3:36 UTC (permalink / raw)
  To: vgoyal-H+wXaHxf7aLQT0dZR+AlfA
  Cc: dhaval-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, arozansk-H+wXaHxf7aLQT0dZR+AlfA,
	jens.axboe-QHcLZuEGTsvQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	paolo.valente-rcYM44yAMweonA0d6jMUrA,
	jmoyer-H+wXaHxf7aLQT0dZR+AlfA,
	fernando-w0OK63jvRlAuJ+9fw/WgBHgSJqDPrsil,
	riel-H+wXaHxf7aLQT0dZR+AlfA, fchecconi-Re5JQEeQqe8AvxtiuMwx3w,
	chrisw-H+wXaHxf7aLQT0dZR+AlfA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	menage-hpIqsD4AKlfQT0dZR+AlfA

Hi Vivek,

I split this mail thread into three topics:
  o 2-Level IO scheduling
  o Hierarchical grouping facility for IO controller
  o Implement IO controller as a dm-driver

This mail is about 2-Level IO scheduling.

> Just because device mapper framework allows one to implement IO controller
> in a separate module, we should not implement it there. It will be
> difficult to take care of issues like, configuration, breaking underlying IO
> scheduler's assumptions, capability to treat tasks and groups at same level
> etc.

If you are satisfied with low-accuracy bandwidth control by an IO
scheduler, you don't need to use dm-ioband. If you want to use
dm-ioband with an IO scheduler, dm-ioband can work with any type of IO
scheduler, of course dm-ioband can work with your own IO scheduler
which you are developing.

> > > - If there is one task of io priority 0 in a cgroup and rest of the tasks
> > >   are of io prio 7. All the tasks belong to best effort class. If tasks of
> > >   lower priority (7) do lot of IO, then due to buffering there is a chance
> > >   that IO from lower prio tasks is seen by CFQ first and io from higher prio
> > >   task is not seen by cfq for quite some time hence that task not getting it
> > >   fair share with in the cgroup. Similar situation can arise with RT tasks
> > >   also.
> > 
> > Whether using dm-ioband or not, if the tasks of IO priority 7 do lot
> > of IO, then the device queue is going to be full and tasks which tries
> > to issue IOs are blocked until the queue get a slot. The IOs are
> > backlogged even if they are issued from the task of IO priority 0.
> > I don't understand why you think it's the biggest issue. The same
> > thing is going to happen without dm-ioband. 
> > 
> 
> True that even limited availability of request descriptors can be a
> bottleneck and can lead to same kind of issues but my contention is
> that you are aggravating the problem. Putting a 2nd layer can break IO
> scheduler's assumption even before underlying request queue is full.

I don't think so. Dm-ioband doesn't break IO scheduler's assumptions.
In CFQ's case, the priority order is not changed within a cgroup.

> So second level solution on top will increase the frequency of such
> incidents where a lower priority task can run away with more job done than
> high priority task because there are no separate queues for different
> priority tasks and release of buffered bio is FIFO.
> 
> Secondly what happens to tasks of RT class? dm-ioband does not have any
> notion of handling the RT cgroup or RT tasks.

It's not an issue, it's a talk about how to determine a policy.
I think giving priority to cgroup policy rather than I/O scheduler
policy is more flexible.

> Thirdly, doing any kind of resource control at higher level takes away the
> capability to treat task and groups at same level. I have had this
> discussion in other offline thread also where you are copied. I think
> it is a good idea to treat tasks and groups at same level where possible
> (depends if IO scheduler creates separate queues for tasks or not, cfq
> does.) 
> 
> > If I were you, I create two cgroups and let tasks of lower priority
> > belong to one cgroup and tasks of higher priority belong to another,
> > and give higher bandwidth to the cgroup to which the higher priority
> > tasks belong. What do you think about this way?
> 
> I think this is not practical. What we are talking is that task
> priority does not have any meaning. If we want service difference between
> two tasks, we need to pack them in separate cgroup otherwise we can't
> gurantee things. If we need to pack every task in separate cgroup then
> why to even have the notion of task priority.  

It is possible to modify dm-ioband to cooperate with CFQ, but I'm not
sure it's really meaningful. What do you do when a task of the RT class
issues a lot of I/O? Do you always give priority to the I/Os from the
RT-class task regardless of the assigned bandwidth? Which one do you
give priority to, bandwidth or RT class?

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

* 2-Level IO scheduling (Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch)
  2009-01-26 16:29           ` Vivek Goyal
       [not found]             ` <20090126162951.GI31802-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2009-01-29  3:36             ` Ryo Tsuruta
  2009-01-29  3:39             ` Hierarchical grouping facility for IO controller " Ryo Tsuruta
  2009-01-29  3:41             ` Implementation of dm-ioband as a dm-driver " Ryo Tsuruta
  3 siblings, 0 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-29  3:36 UTC (permalink / raw)
  To: vgoyal
  Cc: dm-devel, agk, linux-kernel, containers, nauman, dpshah, lizf,
	mikew, fchecconi, paolo.valente, jens.axboe, fernando, s-uchida,
	taka, guijianfeng, arozansk, jmoyer, riel, peterz, menage,
	balbir, dhaval, chrisw

Hi Vivek,

I split this mail thread into three topics:
  o 2-Level IO scheduling
  o Hierarchical grouping facility for IO controller
  o Implement IO controller as a dm-driver

This mail is about 2-Level IO scheduling.

> Just because device mapper framework allows one to implement IO controller
> in a separate module, we should not implement it there. It will be
> difficult to take care of issues like, configuration, breaking underlying IO
> scheduler's assumptions, capability to treat tasks and groups at same level
> etc.

If you are satisfied with low-accuracy bandwidth control by an IO
scheduler, you don't need to use dm-ioband. If you want to use
dm-ioband with an IO scheduler, dm-ioband can work with any type of IO
scheduler, of course dm-ioband can work with your own IO scheduler
which you are developing.

> > > - If there is one task of io priority 0 in a cgroup and rest of the tasks
> > >   are of io prio 7. All the tasks belong to best effort class. If tasks of
> > >   lower priority (7) do lot of IO, then due to buffering there is a chance
> > >   that IO from lower prio tasks is seen by CFQ first and io from higher prio
> > >   task is not seen by cfq for quite some time hence that task not getting it
> > >   fair share with in the cgroup. Similar situation can arise with RT tasks
> > >   also.
> > 
> > Whether using dm-ioband or not, if the tasks of IO priority 7 do lot
> > of IO, then the device queue is going to be full and tasks which tries
> > to issue IOs are blocked until the queue get a slot. The IOs are
> > backlogged even if they are issued from the task of IO priority 0.
> > I don't understand why you think it's the biggest issue. The same
> > thing is going to happen without dm-ioband. 
> > 
> 
> True that even limited availability of request descriptors can be a
> bottleneck and can lead to same kind of issues but my contention is
> that you are aggravating the problem. Putting a 2nd layer can break IO
> scheduler's assumption even before underlying request queue is full.

I don't think so. Dm-ioband doesn't break IO scheduler's assumptions.
In CFQ's case, the priority order is not changed within a cgroup.

> So second level solution on top will increase the frequency of such
> incidents where a lower priority task can run away with more job done than
> high priority task because there are no separate queues for different
> priority tasks and release of buffered bio is FIFO.
> 
> Secondly what happens to tasks of RT class? dm-ioband does not have any
> notion of handling the RT cgroup or RT tasks.

It's not an issue, it's a talk about how to determine a policy.
I think giving priority to cgroup policy rather than I/O scheduler
policy is more flexible.

> Thirdly, doing any kind of resource control at higher level takes away the
> capability to treat task and groups at same level. I have had this
> discussion in other offline thread also where you are copied. I think
> it is a good idea to treat tasks and groups at same level where possible
> (depends if IO scheduler creates separate queues for tasks or not, cfq
> does.) 
> 
> > If I were you, I create two cgroups and let tasks of lower priority
> > belong to one cgroup and tasks of higher priority belong to another,
> > and give higher bandwidth to the cgroup to which the higher priority
> > tasks belong. What do you think about this way?
> 
> I think this is not practical. What we are talking is that task
> priority does not have any meaning. If we want service difference between
> two tasks, we need to pack them in separate cgroup otherwise we can't
> gurantee things. If we need to pack every task in separate cgroup then
> why to even have the notion of task priority.  

It is possible to modify dm-ioband to cooperate with CFQ, but I'm not
sure it's really meaningful. What do you do when a task of RT class
issues a lot of I/O? Do you always give priority to the I/Os from the
task of RT class despite of the assigned bandwidth? Which one do you
give priority bandwidth or RT class?

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Hierarchical grouping facility for IO controller (Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch)
       [not found]             ` <20090126162951.GI31802-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2009-01-29  3:36               ` 2-Level IO scheduling (Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch) Ryo Tsuruta
@ 2009-01-29  3:39               ` Ryo Tsuruta
  2009-01-29  3:41               ` Implementation of dm-ioband as a dm-driver " Ryo Tsuruta
  2 siblings, 0 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-29  3:39 UTC (permalink / raw)
  To: vgoyal-H+wXaHxf7aLQT0dZR+AlfA
  Cc: dhaval-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, arozansk-H+wXaHxf7aLQT0dZR+AlfA,
	jens.axboe-QHcLZuEGTsvQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	paolo.valente-rcYM44yAMweonA0d6jMUrA,
	jmoyer-H+wXaHxf7aLQT0dZR+AlfA,
	fernando-w0OK63jvRlAuJ+9fw/WgBHgSJqDPrsil,
	riel-H+wXaHxf7aLQT0dZR+AlfA, fchecconi-Re5JQEeQqe8AvxtiuMwx3w,
	chrisw-H+wXaHxf7aLQT0dZR+AlfA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	menage-hpIqsD4AKlfQT0dZR+AlfA

Hi Vivek,

This mail is about hierarchical grouping facility for IO controller.

> I think with the introduction of cgroup, IO scheduling has become now
> a hierarhical scheduling. Previously it was flat scheduling where there
> was only one level. Not there can be multiple levels and each level
> can have groups and queues. I don't think that we can break down
> hiearchical scheduling problem in two parts where top level part is moved
> into a module. It is something like saying that lets break out cpu group
> schedling into a separate module and it should not be part of kernel.
> 
> I think we need to implement this hiearchical IO scheduler in kernel which
> can schedule groups as well as end level io queues. (maintained by cfq,
> deadline, as, or noop).

I can implement the hierarchical grouping facility if really necessary
and the patch will be released after dm-ioband is merged into the
kernel. But do you believe the hierarchical grouping facility should
be implemented in the kernel, even if we can do it in the userland?
I think disk bandwidth control doesn't need responsiveness like the
CPU scheduler. I know it's possible to implement in the kernel, but
does it only make the kernel complex?

> > > - Task grouping logic
> > > 	- We already have the notion of cgroup where tasks can be grouped
> > > 	  in hierarhical manner. dm-ioband does not make full use of that and
> > > 	  comes up with own mechansim of grouping tasks (apart from cgroup).
> > > 	  And there are odd ways of specifying cgroup id which configuring the
> > > 	  dm-ioband device. I think once somebody has created the cgroup
> > > 	  hieararchy, any IO controller logic should be able to internally
> > > 	  read that hiearchy and provide control. There should not be need
> > > 	  of any other configuration utity on top of cgroup.
> > > 
> > > 	  My RFC patches had done that.
> > 
> > Dm-ioband can work with the bio-cgroup mechanism, which makes task groups
> > in manner of the cgroup, of course.
> > I already have a basic design to make dm-ioband support the cgroup
> > hierarchy. This should be started after the core code of bio-cgroup,
> > which helps trace each I/O requests, is merged in -mm tree.
> 
> bio-cgroup patches are fine because they provide us the capability to 
> map delayed writes to right cgroup. And it can be used by any IO
> controller. 
> 
> > And the reason dm-ioband uses cgroup id to specify a cgroup is that
> > the current cgroup infrastructure lacks features to manage resources
> > placed in the kernel modules.
> 
> Can you elaborate on that please? We have heard in the past that cgroup 
> does not give you enough flexibility but never got details.

The current cgroup framework can't manage resources dynamically. The
reason for using a cgroup ID to specify a cgroup is that it makes the
implementation quite simple. Although it is possible to implement the
function without using the ID, it would make the kernel complex because the
function would have to be implemented outside of the kernel.

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Hierarchical grouping facility for IO controller (Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch)
  2009-01-26 16:29           ` Vivek Goyal
       [not found]             ` <20090126162951.GI31802-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2009-01-29  3:36             ` 2-Level IO scheduling " Ryo Tsuruta
@ 2009-01-29  3:39             ` Ryo Tsuruta
  2009-01-29  3:41             ` Implementation of dm-ioband as a dm-driver " Ryo Tsuruta
  3 siblings, 0 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-29  3:39 UTC (permalink / raw)
  To: vgoyal
  Cc: dm-devel, agk, linux-kernel, containers, nauman, dpshah, lizf,
	mikew, fchecconi, paolo.valente, jens.axboe, fernando, s-uchida,
	taka, guijianfeng, arozansk, jmoyer, riel, peterz, menage,
	balbir, dhaval, chrisw

Hi Vivek,

This mail is about hierarchical grouping facility for IO controller.

> I think with the introduction of cgroup, IO scheduling has become now
> a hierarhical scheduling. Previously it was flat scheduling where there
> was only one level. Not there can be multiple levels and each level
> can have groups and queues. I don't think that we can break down
> hiearchical scheduling problem in two parts where top level part is moved
> into a module. It is something like saying that lets break out cpu group
> schedling into a separate module and it should not be part of kernel.
> 
> I think we need to implement this hiearchical IO scheduler in kernel which
> can schedule groups as well as end level io queues. (maintained by cfq,
> deadline, as, or noop).

I can implement the hierarchical grouping facility if really necessary
and the patch will be released after dm-ioband is merged into the
kernel. But do you believe the hierarchical grouping facility should
be implemented in the kernel, even if we can do it in the userland?
I think disk bandwidth control doesn't need responsiveness like the
CPU scheduler. I know it's possible to implement in the kernel, but
does it only make the kernel complex?

> > > - Task grouping logic
> > > 	- We already have the notion of cgroup where tasks can be grouped
> > > 	  in hierarhical manner. dm-ioband does not make full use of that and
> > > 	  comes up with own mechansim of grouping tasks (apart from cgroup).
> > > 	  And there are odd ways of specifying cgroup id which configuring the
> > > 	  dm-ioband device. I think once somebody has created the cgroup
> > > 	  hieararchy, any IO controller logic should be able to internally
> > > 	  read that hiearchy and provide control. There should not be need
> > > 	  of any other configuration utity on top of cgroup.
> > > 
> > > 	  My RFC patches had done that.
> > 
> > Dm-ioband can work with the bio-cgroup mechanism, which makes task groups
> > in manner of the cgroup, of course.
> > I already have a basic design to make dm-ioband support the cgroup
> > hierarchy. This should be started after the core code of bio-cgroup,
> > which helps trace each I/O requests, is merged in -mm tree.
> 
> bio-cgroup patches are fine because they provide us the capability to 
> map delayed writes to right cgroup. And it can be used by any IO
> controller. 
> 
> > And the reason dm-ioband uses cgroup id to specify a cgroup is that
> > the current cgroup infrastructure lacks features to manage resources
> > placed in the kernel modules.
> 
> Can you elaborate on that please? We have heard in the past that cgroup 
> does not give you enough flexibility but never got details.

The current cgroup framework can't manage resources dynamically. The
reason for using a cgroup ID to specify a cgroup is that it makes the
implementation quite simple. Although it is possible to implement the
function without using the ID, it makes the kernel complex due to the
function has to be implemented outside of the kernel.

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Implementation of dm-ioband as a dm-driver (Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch)
       [not found]             ` <20090126162951.GI31802-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2009-01-29  3:36               ` 2-Level IO scheduling (Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch) Ryo Tsuruta
  2009-01-29  3:39               ` Hierarchical grouping facility for IO controller " Ryo Tsuruta
@ 2009-01-29  3:41               ` Ryo Tsuruta
  2 siblings, 0 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-29  3:41 UTC (permalink / raw)
  To: vgoyal-H+wXaHxf7aLQT0dZR+AlfA
  Cc: dhaval-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	dm-devel-H+wXaHxf7aLQT0dZR+AlfA, arozansk-H+wXaHxf7aLQT0dZR+AlfA,
	jens.axboe-QHcLZuEGTsvQT0dZR+AlfA, agk-H+wXaHxf7aLQT0dZR+AlfA,
	balbir-23VcF4HTsmIX0ybBhKVfKdBPR1lH4CV8,
	paolo.valente-rcYM44yAMweonA0d6jMUrA,
	jmoyer-H+wXaHxf7aLQT0dZR+AlfA,
	fernando-w0OK63jvRlAuJ+9fw/WgBHgSJqDPrsil,
	riel-H+wXaHxf7aLQT0dZR+AlfA, fchecconi-Re5JQEeQqe8AvxtiuMwx3w,
	chrisw-H+wXaHxf7aLQT0dZR+AlfA,
	containers-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	menage-hpIqsD4AKlfQT0dZR+AlfA

Hi Vivek,

This mail is about implement IO controller as a dm-driver.

> In this case first you are forcing some functionalilty to go in a kernel
> module and then coming up with tools for configuration. I never understood
> that why don't you let the controller be inside the kernel, let it
> directly interact with cgroup subsystem and work instead of first taking
> the functionality out of kernel in a module and then justifying the case
> that now we need new ways of configuring that module because cgroup
> infrastructure is not sufficient.  

It is possible the algorithm of dm-ioband can be directly implemented
in the kernel. I've been investigating how to do it.

> > > - Need of a dm device for every device we want to control
> > > 
> > > 	- This requirement looks odd. It forces everybody to use dm-tools
> > > 	  and if there are lots of disks in the system, configuation is
> > > 	  pain.
> > 
> > I don't think it's so pain. I think you are already using LVM devices on
> > your boxes. Setting up dm-ioband is the same as that for LVM. And some
> > scripts or something similar will help you set up them.
> 
> Not everybody uses LVM. Balbir had asked once, if there are thousands of 
> disks in the system, does that mean I need to create this dm-ioband device
> for all the disks?

I think it could be easily done by a small script of several lines.

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Implementation of dm-ioband as a dm-driver (Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch)
  2009-01-26 16:29           ` Vivek Goyal
                               ` (2 preceding siblings ...)
  2009-01-29  3:39             ` Hierarchical grouping facility for IO controller " Ryo Tsuruta
@ 2009-01-29  3:41             ` Ryo Tsuruta
  3 siblings, 0 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-01-29  3:41 UTC (permalink / raw)
  To: vgoyal
  Cc: dm-devel, agk, linux-kernel, containers, nauman, dpshah, lizf,
	mikew, fchecconi, paolo.valente, jens.axboe, fernando, s-uchida,
	taka, guijianfeng, arozansk, jmoyer, riel, peterz, menage,
	balbir, dhaval, chrisw

Hi Vivek,

This mail is about implementing an IO controller as a dm-driver.

> In this case first you are forcing some functionalilty to go in a kernel
> module and then coming up with tools for configuration. I never understood
> that why don't you let the controller be inside the kernel, let it
> directly interact with cgroup subsystem and work instead of first taking
> the functionality out of kernel in a module and then justifying the case
> that now we need new ways of configuring that module because cgroup
> infrastructure is not sufficient.  

It is possible that the algorithm of dm-ioband could be implemented
directly in the kernel. I've been investigating how to do it.

> > > - Need of a dm device for every device we want to control
> > > 
> > > 	- This requirement looks odd. It forces everybody to use dm-tools
> > > 	  and if there are lots of disks in the system, configuation is
> > > 	  pain.
> > 
> > I don't think it's so pain. I think you are already using LVM devices on
> > your boxes. Setting up dm-ioband is the same as that for LVM. And some
> > scripts or something similar will help you set up them.
> 
> Not everybody uses LVM. Balbir had asked once, if there are thousands of 
> disks in the system, does that mean I need to create this dm-ioband device
> for all the disks?

I think it could be easily done by a small script of several lines.

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch
  2009-01-22 12:05         ` Ryo Tsuruta
@ 2009-02-04  5:07           ` Ryo Tsuruta
  0 siblings, 0 replies; 23+ messages in thread
From: Ryo Tsuruta @ 2009-02-04  5:07 UTC (permalink / raw)
  To: agk; +Cc: dm-devel

Hi Alasdair,

> BTW, I've attached a patch against the dm-add-ioband.patch in your
> quilt tree. The patch is cleaned up and reflected some Lindent's
> outputs and some points suggested by the previous email.

The patch (http://patchwork.kernel.org/patch/3564/) hasn't been applied
to dm-add-ioband.patch in your editing tree yet.
Could you please update dm-add-ioband.patch?

Thanks,
Ryo Tsuruta

^ permalink raw reply	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2009-02-04  5:07 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-01-20  5:10 [PATCH 0/2] dm-ioband: I/O bandwidth controller v1.10.0: Introduction Ryo Tsuruta
2009-01-20  5:11 ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Ryo Tsuruta
2009-01-20  5:12   ` [PATCH 2/2] dm-ioband: I/O bandwidth controller v1.10.0: Document Ryo Tsuruta
2009-01-20 14:52   ` [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch Alasdair G Kergon
2009-01-21 13:03     ` Ryo Tsuruta
2009-01-21 17:18       ` Alasdair G Kergon
2009-01-22 12:05         ` Ryo Tsuruta
2009-02-04  5:07           ` Ryo Tsuruta
2009-01-20 15:19   ` Alasdair G Kergon
2009-01-20 15:53   ` Alasdair G Kergon
     [not found]     ` <20090120155334.GH9859-swAlYijrCMMf7BdofF/totBPR1lH4CV8@public.gmane.org>
2009-01-22 16:12       ` [dm-devel] " Vivek Goyal
2009-01-22 16:12         ` Vivek Goyal
     [not found]         ` <20090122161218.GA28795-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2009-01-23 10:14           ` Ryo Tsuruta
2009-01-23 10:14         ` Ryo Tsuruta
     [not found]           ` <20090123.191404.39168431.ryov-jCdQPDEk3idL9jVzuh4AOg@public.gmane.org>
2009-01-26 16:29             ` Vivek Goyal
2009-01-26 16:29           ` Vivek Goyal
     [not found]             ` <20090126162951.GI31802-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2009-01-29  3:36               ` 2-Level IO scheduling (Re: [dm-devel] [PATCH 1/2] dm-ioband: I/O bandwidth controller v1.10.0: Source code and patch) Ryo Tsuruta
2009-01-29  3:39               ` Hierarchical grouping facility for IO controller " Ryo Tsuruta
2009-01-29  3:41               ` Implementation of dm-ioband as a dm-driver " Ryo Tsuruta
2009-01-29  3:36             ` 2-Level IO scheduling " Ryo Tsuruta
2009-01-29  3:39             ` Hierarchical grouping facility for IO controller " Ryo Tsuruta
2009-01-29  3:41             ` Implementation of dm-ioband as a dm-driver " Ryo Tsuruta
2009-01-20 15:04 ` [PATCH 0/2] dm-ioband: I/O bandwidth controller v1.10.0: Introduction Alasdair G Kergon

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.