linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] cgroup: limit block I/O bandwidth
@ 2008-01-18 11:41 Andrea Righi
  2008-01-18 12:36 ` Dhaval Giani
  2008-01-18 15:50 ` Andrea Righi
  0 siblings, 2 replies; 22+ messages in thread
From: Andrea Righi @ 2008-01-18 11:41 UTC (permalink / raw)
  To: Balbir Singh, Paul Menage; +Cc: LKML

Allow limiting the block I/O bandwidth of specific process containers
(cgroups) by imposing additional delays on the I/O requests of processes
that exceed the limits defined in the control group filesystem.

Example:
  # mkdir /dev/cgroup
  # mount -t cgroup -oio-throttle io-throttle /dev/cgroup
  # cd /dev/cgroup
  # mkdir foo
  --> the cgroup foo has been created
  # /bin/echo $$ > foo/tasks
  # /bin/echo 1024 > foo/io-throttle.io-rate
  # sh
  --> the subshell 'sh' is running in cgroup "foo" and it can use a maximum I/O
      bandwidth of 1MB/s (io-throttle.io-rate is expressed in KB/s).

Future improvements:
* also allow limiting I/O operations per second (instead of KB/s only)

Signed-off-by: Andrea Righi <a.righi@cineca.it>
---

diff -urpN linux-2.6.24-rc8/block/io-throttle.c linux-2.6.24-rc8-cgroup-io-throttling/block/io-throttle.c
--- linux-2.6.24-rc8/block/io-throttle.c	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.24-rc8-cgroup-io-throttling/block/io-throttle.c	2008-01-17 23:16:58.000000000 +0100
@@ -0,0 +1,250 @@
+/*
+ * io-throttle.c
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2008 Andrea Righi <a.righi@cineca.it>
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/jiffies.h>
+#include <linux/spinlock.h>
+#include <linux/io-throttle.h>
+
+struct iothrottle {
+	struct cgroup_subsys_state css;
+	spinlock_t lock;
+	unsigned long iorate;
+	unsigned long req;
+	unsigned long last_request;
+};
+
+static inline struct iothrottle *cgroup_to_iothrottle(struct cgroup *cont)
+{
+	return container_of(cgroup_subsys_state(cont, iothrottle_subsys_id),
+			    struct iothrottle, css);
+}
+
+static inline struct iothrottle *task_to_iothrottle(struct task_struct *task)
+{
+	return container_of(task_subsys_state(task, iothrottle_subsys_id),
+			    struct iothrottle, css);
+}
+
+/*
+ * Allocate the throttling state for a new cgroup. Creation is allowed only if:
+ *   1. you are capable(CAP_SYS_ADMIN)
+ *   2. the target cgroup is a descendant of your own cgroup
+ * Returns the embedded css, or ERR_PTR(-EPERM / -ENOMEM) on failure.
+ * Note: called from kernel/cgroup.c with cgroup_lock() held.
+ */
+static struct cgroup_subsys_state *iothrottle_create(
+			struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	struct iothrottle *iot;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
+	if (!cgroup_is_descendant(cont))
+		return ERR_PTR(-EPERM);
+
+	iot = kzalloc(sizeof(*iot), GFP_KERNEL);
+	if (unlikely(!iot))
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock_init(&iot->lock);
+	iot->last_request = jiffies;
+
+	return &iot->css;
+}
+
+/*
+ * Note: called from kernel/cgroup.c with cgroup_lock() held.
+ */
+static void iothrottle_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	kfree(cgroup_to_iothrottle(cont));
+}
+
+static ssize_t iothrottle_read(struct cgroup *cont, struct cftype *cft,
+			       struct file *file, char __user *buf,
+			       size_t nbytes, loff_t *ppos)
+{
+	ssize_t count, ret;
+	unsigned long delta, iorate, req, last_request;
+	struct iothrottle *iot;
+	char *page;
+
+	page = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!page)
+		return -ENOMEM;
+
+	cgroup_lock();
+	if (cgroup_is_removed(cont)) {
+		cgroup_unlock();
+		ret = -ENODEV;
+		goto out;
+	}
+
+	iot = cgroup_to_iothrottle(cont);
+	spin_lock_irq(&iot->lock);
+	/* Snapshot under the lock; iot->req is in sectors, two per KiB. */
+	delta = (long)jiffies - (long)iot->last_request;
+	iorate = iot->iorate;
+	req = iot->req >> 1;
+	last_request = iot->last_request;
+
+	spin_unlock_irq(&iot->lock);
+	cgroup_unlock();
+
+	/* print the limit plus additional debugging stuff from the snapshot */
+	count = sprintf(page, "     io-rate: %lu KiB/sec\n"
+			      "   requested: %lu KiB\n"
+			      "last_request: %lu jiffies\n"
+			      "       delta: %lu jiffies\n",
+			iorate, req, last_request, delta);
+
+	ret = simple_read_from_buffer(buf, nbytes, ppos, page, count);
+
+out:
+	free_page((unsigned long)page);
+	return ret;
+}
+
+static int iothrottle_write_uint(struct cgroup *cont, struct cftype *cft,
+				 u64 val)
+{
+	struct iothrottle *iot;
+	int ret = 0;
+
+	cgroup_lock();
+	if (cgroup_is_removed(cont)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	iot = cgroup_to_iothrottle(cont);
+	/* NOTE(review): the cast below drops the high bits of val on 32-bit */
+	spin_lock_irq(&iot->lock);
+	iot->iorate = (unsigned long)val;
+	spin_unlock_irq(&iot->lock);
+
+out:
+	cgroup_unlock();
+	return ret;
+}
+
+static struct cftype files[] = {
+	{
+		.name = "io-rate",
+		.read = iothrottle_read,
+		.write_uint = iothrottle_write_uint,
+	},
+};
+
+static int iothrottle_populate(struct cgroup_subsys *ss, struct cgroup *cont)
+{
+	return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
+}
+
+struct cgroup_subsys iothrottle_subsys = {
+	.name = "io-throttle",
+	.create = iothrottle_create,
+	.destroy = iothrottle_destroy,
+	.populate = iothrottle_populate,
+	.subsys_id = iothrottle_subsys_id,
+};
+
+void io_throttle(int nr_sectors)
+{
+	struct iothrottle *iot;
+	unsigned long delta, n;
+	long sleep;
+
+	cgroup_lock();
+	iot = task_to_iothrottle(current);
+	if (!iot)
+		goto out;
+
+	spin_lock_irq(&iot->lock);
+	if (!iot->iorate)
+		goto out2;
+
+	/*
+	 * The concept is the following: evaluate the actual I/O rate of a
+	 * process, looking at the sectors requested over the time elapsed from
+	 * the last request. If the actual I/O rate is beyond the maximum
+	 * allowed I/O rate then sleep the current task for the correct amount
+	 * of time, in order to reduce the actual I/O rate under the allowed
+	 * limit.
+	 *
+	 * The time to sleep is evaluated as:
+	 *
+	 *   sleep = (sectors_requested / allowed_iorate) - time_elapsed
+	 */
+	delta = (long)jiffies - (long)iot->last_request;
+	iot->req += nr_sectors;
+	n = iot->req / iot->iorate;
+
+	spin_unlock_irq(&iot->lock);
+	cgroup_unlock();
+
+	/*
+	 * If it's not possible to evaluate delta (due to a too small interval
+	 * of time between two requests) or n (due to a too small request),
+	 * account the requested sectors in iot->req and sum them to the
+	 * sectors of the next request.
+	 */
+	if (!delta || !n)
+		return;
+
+	/*
+	 * Convert n to jiffies (remember that iot->iorate is in KB/s and we
+	 * need to convert it to sectors/jiffies)
+	 */
+	sleep = msecs_to_jiffies(n * 1000 / 2) - delta;
+	if (sleep > 0) {
+		pr_debug("io-throttle: task %p (%s) must sleep %ld jiffies\n",
+			 current, current->comm, sleep);
+		schedule_timeout_uninterruptible(sleep);
+	}
+
+	/*
+	 * Note: iothrottle element could be changed during the sleep, so
+	 * we must refresh it before resetting statistics.
+	 */
+	cgroup_lock();
+	iot = task_to_iothrottle(current);
+	if (!iot)
+		goto out;
+
+	spin_lock_irq(&iot->lock);
+	iot->req = 0;
+	iot->last_request = jiffies;
+out2:
+	spin_unlock_irq(&iot->lock);
+out:
+	cgroup_unlock();
+}
+EXPORT_SYMBOL(io_throttle);
diff -urpN linux-2.6.24-rc8/block/ll_rw_blk.c linux-2.6.24-rc8-cgroup-io-throttling/block/ll_rw_blk.c
--- linux-2.6.24-rc8/block/ll_rw_blk.c	2008-01-16 05:22:48.000000000 +0100
+++ linux-2.6.24-rc8-cgroup-io-throttling/block/ll_rw_blk.c	2008-01-17 12:35:13.000000000 +0100
@@ -31,6 +31,7 @@
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 #include <linux/scatterlist.h>
+#include <linux/io-throttle.h>
 
 /*
  * for max sense size
@@ -3221,6 +3222,8 @@ static inline void __generic_make_reques
 	if (bio_check_eod(bio, nr_sectors))
 		goto end_io;
 
+	io_throttle(nr_sectors);
+
 	/*
 	 * Resolve the mapping until finished. (drivers are
 	 * still free to implement/resolve their own stacking
diff -urpN linux-2.6.24-rc8/block/Makefile linux-2.6.24-rc8-cgroup-io-throttling/block/Makefile
--- linux-2.6.24-rc8/block/Makefile	2008-01-16 05:22:48.000000000 +0100
+++ linux-2.6.24-rc8-cgroup-io-throttling/block/Makefile	2008-01-17 12:35:13.000000000 +0100
@@ -12,3 +12,5 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched
 
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
+
+obj-$(CONFIG_CGROUP_IO_THROTTLE)	+= io-throttle.o
diff -urpN linux-2.6.24-rc8/include/linux/cgroup_subsys.h linux-2.6.24-rc8-cgroup-io-throttling/include/linux/cgroup_subsys.h
--- linux-2.6.24-rc8/include/linux/cgroup_subsys.h	2008-01-16 05:22:48.000000000 +0100
+++ linux-2.6.24-rc8-cgroup-io-throttling/include/linux/cgroup_subsys.h	2008-01-17 12:35:13.000000000 +0100
@@ -37,3 +37,9 @@ SUBSYS(cpuacct)
 
 /* */
 
+#ifdef CONFIG_CGROUP_IO_THROTTLE
+SUBSYS(iothrottle)
+#endif
+
+/* */
+
diff -urpN linux-2.6.24-rc8/include/linux/io-throttle.h linux-2.6.24-rc8-cgroup-io-throttling/include/linux/io-throttle.h
--- linux-2.6.24-rc8/include/linux/io-throttle.h	1970-01-01 01:00:00.000000000 +0100
+++ linux-2.6.24-rc8-cgroup-io-throttling/include/linux/io-throttle.h	2008-01-17 12:35:13.000000000 +0100
@@ -0,0 +1,10 @@
+#ifndef IO_THROTTLE_H
+#define IO_THROTTLE_H
+
+#ifdef CONFIG_CGROUP_IO_THROTTLE
+extern void io_throttle(int nr_sectors);
+#else
+static inline void io_throttle(int nr_sectors) { }
+#endif /* CONFIG_CGROUP_IO_THROTTLE */
+
+#endif
diff -urpN linux-2.6.24-rc8/init/Kconfig linux-2.6.24-rc8-cgroup-io-throttling/init/Kconfig
--- linux-2.6.24-rc8/init/Kconfig	2008-01-16 05:22:48.000000000 +0100
+++ linux-2.6.24-rc8-cgroup-io-throttling/init/Kconfig	2008-01-17 12:35:13.000000000 +0100
@@ -313,6 +313,15 @@ config CGROUP_NS
           for instance virtual servers and checkpoint/restart
           jobs.
 
+config CGROUP_IO_THROTTLE
+        bool "Enable cgroup I/O throttling (EXPERIMENTAL)"
+        depends on EXPERIMENTAL && CGROUPS
+        help
+	  This allows limiting the maximum I/O bandwidth of specific
+	  cgroup(s).
+
+          Say N if unsure.
+
 config CPUSETS
 	bool "Cpuset support"
 	depends on SMP && CGROUPS

^ permalink raw reply	[flat|nested] 22+ messages in thread
* Re: [PATCH] cgroup: limit block I/O bandwidth
@ 2008-01-18 22:39 Naveen Gupta
  2008-01-19 11:17 ` Andrea Righi
  0 siblings, 1 reply; 22+ messages in thread
From: Naveen Gupta @ 2008-01-18 22:39 UTC (permalink / raw)
  To: Andrea Righi; +Cc: Paul Menage, Dhaval Giani, Balbir Singh, LKML

>Paul Menage wrote:
>> On Jan 18,  2008 7:36 AM, Dhaval Giani <dhaval@linux.vnet.ibm.com>  wrote:
>>> On Fri, Jan 18, 2008 at 12:41:03PM +0100, Andrea Righi  wrote:
>>>> Allow to limit the  block I/O bandwidth for  specific process containers
>>>> (cgroups) imposing additional delays  on I/O requests for those processes
>>>> that exceed the  limits defined in the control group filesystem.
>>>>
>>>>  Example:
>>>>   # mkdir /dev/cgroup
>>>>   # mount -t cgroup -oio-throttle io-throttle /dev/cgroup
>>> Just a minor nit, can't we name it as io,  keeping in mind that other
>>> controllers are known as cpu and  memory?
>>
>> Or maybe "blockio"?
>
>Agree, blockio seems better. Not all I/O is performed on  block devices
>and in this case we're  considering block devices only.

Here we want to rate limit in the block layer; I would think the I/O
scheduler is the place where we are in a much better position to do this
kind of limiting.

Also, we are changing the behavior of the application by adding sleeps to
it during request submission. Moreover, we will prevent requests from
being merged since we won't allow them to be submitted in this case.

Since the bulk of write submission is done by background kernel threads,
and we throttle based on the limits of current, we will end up throttling
those threads and not the actual processes submitting I/O.

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2008-01-25  8:46 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2008-01-18 11:41 [PATCH] cgroup: limit block I/O bandwidth Andrea Righi
2008-01-18 12:36 ` Dhaval Giani
2008-01-18 12:41   ` Paul Menage
2008-01-18 13:02     ` Andrea Righi
2008-01-18 15:50 ` Andrea Righi
2008-01-18 22:39 Naveen Gupta
2008-01-19 11:17 ` Andrea Righi
2008-01-20 13:45   ` Andrea Righi
2008-01-20 14:32     ` Jens Axboe
2008-01-20 14:58       ` Balbir Singh
2008-01-20 15:41       ` Andrea Righi
2008-01-20 16:06         ` Jens Axboe
2008-01-20 23:59           ` Andrea Righi
2008-01-22 19:02             ` Naveen Gupta
2008-01-22 23:11               ` Andrea Righi
2008-01-23  1:17                 ` Naveen Gupta
2008-01-23 15:23                   ` Andrea Righi
2008-01-23 15:38                     ` Balbir Singh
2008-01-23 20:55                       ` Andrea Righi
2008-01-24  9:05                         ` Pavel Emelyanov
2008-01-24 13:48                           ` Andrea Righi
2008-01-24 13:50                             ` Balbir Singh

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).