* VM disk I/O limit patch
@ 2011-06-21  8:29 Andrew Xu
  2011-06-21 13:33 ` Konrad Rzeszutek Wilk
  2011-06-23 20:45 ` Shaun Reitan
  0 siblings, 2 replies; 14+ messages in thread
From: Andrew Xu @ 2011-06-21  8:29 UTC (permalink / raw)
  To: xen-devel, xen-users

[-- Attachment #1: Type: text/plain, Size: 13311 bytes --]

Hi all,

I have written a blkback QoS patch.
With this patch you can configure (dynamically or statically) a different
I/O speed for each VM disk.
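
For example, to limit vm17's first disk to 4 MB/s and its second disk to
1 MB/s (tokens-rate is given in KB/s; the backend doubles it into 512-byte
sectors per second):

	/local/domain/17/device/vbd/768/tokens-rate = "4096"
	/local/domain/17/device/vbd/2048/tokens-rate = "1024"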

----------------------------------------------------------------------------

diff -urNp blkback/blkback.c blkback-qos/blkback.c
--- blkback/blkback.c	2011-06-22 07:54:19.000000000 +0800
+++ blkback-qos/blkback.c	2011-06-22 07:53:18.000000000 +0800
@@ -44,6 +44,11 @@
 #include <asm/hypervisor.h>
 #include "common.h"
 
+#undef DPRINTK
+#define DPRINTK(fmt, args...)				\
+	printk("blkback/blkback (%s:%d) " fmt ".\n",	\
+		 __FUNCTION__, __LINE__, ##args)
+
 /*
  * These are rather arbitrary. They are fairly large because adjacent requests
  * pulled from a communication ring are quite likely to end up being part of
@@ -110,7 +115,8 @@ static inline unsigned long vaddr(pendin
 static int do_block_io_op(blkif_t *blkif);
 static int dispatch_rw_block_io(blkif_t *blkif,
 				 blkif_request_t *req,
-				 pending_req_t *pending_req);
+				 pending_req_t *pending_req,
+				 int *done_nr_sects);
 static void make_response(blkif_t *blkif, u64 id,
 			  unsigned short op, int st);
 
@@ -206,10 +212,20 @@ static void print_stats(blkif_t *blkif)
 	blkif->st_pk_req = 0;
 }
 
+static void refill_reqcount(blkif_t *blkif)
+{
+	blkif->reqtime = jiffies + msecs_to_jiffies(1000);
+ 	blkif->reqcount = blkif->reqrate;
+ 	if (blkif->reqcount < blkif->reqmin)
+ 		blkif->reqcount = blkif->reqmin;
+}
+
 int blkif_schedule(void *arg)
 {
 	blkif_t *blkif = arg;
 	struct vbd *vbd = &blkif->vbd;
+	int	ret = 0;
+	struct timeval cur_time;
 
 	blkif_get(blkif);
 
@@ -232,12 +248,34 @@ int blkif_schedule(void *arg)
 		blkif->waiting_reqs = 0;
 		smp_mb(); /* clear flag *before* checking for work */
 
-		if (do_block_io_op(blkif))
+		ret = do_block_io_op(blkif);
+		if (ret)
 			blkif->waiting_reqs = 1;
 		unplug_queue(blkif);
 
+		if(blkif->reqmin){
+			if(2 == ret && (blkif->reqtime > jiffies)){
+				jiffies_to_timeval(jiffies, &cur_time);
+				if(log_stats && (cur_time.tv_sec % 10 ==1 ))
+					printk(KERN_DEBUG "%s: going to sleep %d millsecs(rate=%d)\n",
+							current->comm,
+							jiffies_to_msecs(blkif->reqtime - jiffies),
+							blkif->reqrate);
+				
+				set_current_state(TASK_INTERRUPTIBLE);
+				schedule_timeout(blkif->reqtime - jiffies);
+				
+				if(log_stats && (cur_time.tv_sec % 10 ==1 ))
+					printk(KERN_DEBUG "%s: sleep end(rate=%d)\n",
+							current->comm,blkif->reqrate);
+			}
+			if (time_after(jiffies, blkif->reqtime))
+				refill_reqcount(blkif);
+		}
+
 		if (log_stats && time_after(jiffies, blkif->st_print))
 			print_stats(blkif);
+		
 	}
 
 	if (log_stats)
@@ -306,7 +344,6 @@ irqreturn_t blkif_be_int(int irq, void *
 /******************************************************************
  * DOWNWARD CALLS -- These interface with the block-device layer proper.
  */
-
 static int do_block_io_op(blkif_t *blkif)
 {
 	blkif_back_rings_t *blk_rings = &blkif->blk_rings;
@@ -314,15 +351,27 @@ static int do_block_io_op(blkif_t *blkif
 	pending_req_t *pending_req;
 	RING_IDX rc, rp;
 	int more_to_do = 0, ret;
+	static int last_done_nr_sects = 0;	
 
 	rc = blk_rings->common.req_cons;
 	rp = blk_rings->common.sring->req_prod;
 	rmb(); /* Ensure we see queued requests up to 'rp'. */
+	
+	if (blkif->reqmin && blkif->reqcount <= 0)
+		return (rc != rp) ? 2 : 0;
 
 	while ((rc != rp) || (blkif->is_suspended_req)) {
 
 		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
 			break;
+		
+		if(blkif->reqmin){
+			blkif->reqcount -= last_done_nr_sects;
+			if (blkif->reqcount <= 0) {
+				more_to_do = 2;
+				break;
+			}
+		}
 
 		if (kthread_should_stop()) {
 			more_to_do = 1;
@@ -367,14 +416,14 @@ handle_request:
 		switch (req.operation) {
 		case BLKIF_OP_READ:
 			blkif->st_rd_req++;
-			ret = dispatch_rw_block_io(blkif, &req, pending_req); 
+			ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects); 
 			break;
 		case BLKIF_OP_WRITE_BARRIER:
 			blkif->st_br_req++;
 			/* fall through */
 		case BLKIF_OP_WRITE:
 			blkif->st_wr_req++;
-			ret = dispatch_rw_block_io(blkif, &req, pending_req);
+			ret = dispatch_rw_block_io(blkif, &req, pending_req,&last_done_nr_sects);
 			break;
 		case BLKIF_OP_PACKET:
 			DPRINTK("error: block operation BLKIF_OP_PACKET not implemented\n");
@@ -412,9 +461,29 @@ handle_request:
 	return more_to_do;
 }
 
+static char* operation2str(int operation)
+{
+	char* ret_str = NULL;
+	switch (operation) {
+	case BLKIF_OP_READ:
+		ret_str = "READ";
+		break;
+	case BLKIF_OP_WRITE:
+		ret_str = "WRITE";
+		break;
+	case BLKIF_OP_WRITE_BARRIER:
+		ret_str = "WRITE_BARRIER";
+		break;
+	default:
+		ret_str = "0";
+	}
+	return ret_str;
+}
+
 static int dispatch_rw_block_io(blkif_t *blkif,
 				 blkif_request_t *req,
-				 pending_req_t *pending_req)
+				 pending_req_t *pending_req,
+				 int *done_nr_sects)
 {
 	extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
 	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -426,6 +495,9 @@ static int dispatch_rw_block_io(blkif_t
 	struct bio *bio = NULL;
 	int ret, i;
 	int operation;
+	struct timeval cur_time;
+
+	*done_nr_sects = 0;
 
 	switch (req->operation) {
 	case BLKIF_OP_READ:
@@ -582,6 +654,12 @@ static int dispatch_rw_block_io(blkif_t
 	else if (operation == WRITE || operation == WRITE_BARRIER)
 		blkif->st_wr_sect += preq.nr_sects;
 
+	*done_nr_sects = preq.nr_sects;
+	jiffies_to_timeval(jiffies, &cur_time);
+	if ((log_stats == 2) && (cur_time.tv_sec % 10 ==1 ))
+		printk(KERN_DEBUG "  operation=%s sects=%d\n",
+			operation2str(req->operation),preq.nr_sects);
+
 	return 0;
 
  fail_flush:
@@ -695,6 +773,8 @@ static int __init blkif_init(void)
 
 	blkif_xenbus_init();
 
+	DPRINTK("blkif_inited\n");
+
 	return 0;
 
  out_of_memory:
diff -urNp blkback/cdrom.c blkback-qos/cdrom.c
--- blkback/cdrom.c	2010-05-20 18:07:00.000000000 +0800
+++ blkback-qos/cdrom.c	2011-06-22 07:34:50.000000000 +0800
@@ -35,9 +35,9 @@
 #include "common.h"
 
 #undef DPRINTK
-#define DPRINTK(_f, _a...)			\
-	printk("(%s() file=%s, line=%d) " _f "\n",	\
-		 __PRETTY_FUNCTION__, __FILE__ , __LINE__ , ##_a )
+#define DPRINTK(fmt, args...)				\
+	printk("blkback/cdrom (%s:%d) " fmt ".\n",	\
+		 __FUNCTION__, __LINE__, ##args)
 
 
 #define MEDIA_PRESENT "media-present"
diff -urNp blkback/common.h blkback-qos/common.h
--- blkback/common.h	2010-05-20 18:07:00.000000000 +0800
+++ blkback-qos/common.h	2011-06-22 07:34:50.000000000 +0800
@@ -100,8 +100,17 @@ typedef struct blkif_st {
 
 	grant_handle_t shmem_handle;
 	grant_ref_t    shmem_ref;
+
+	/* qos information */
+	unsigned long   reqtime;
+	int    reqcount;
+	int    reqmin;
+	int    reqrate; 
+
 } blkif_t;
 
+#define VBD_QOS_MIN_RATE_LIMIT			2*1024		/* 	1MBs 	*/
+
 struct backend_info
 {
 	struct xenbus_device *dev;
@@ -111,6 +120,8 @@ struct backend_info
 	unsigned major;
 	unsigned minor;
 	char *mode;
+  	struct xenbus_watch rate_watch;
+	int have_rate_watch; 
 };
 
 blkif_t *blkif_alloc(domid_t domid);
diff -urNp blkback/vbd.c blkback-qos/vbd.c
--- blkback/vbd.c	2010-05-20 18:07:00.000000000 +0800
+++ blkback-qos/vbd.c	2011-06-22 07:34:50.000000000 +0800
@@ -35,6 +35,11 @@
 #define vbd_sz(_v)   ((_v)->bdev->bd_part ?				\
 	(_v)->bdev->bd_part->nr_sects : get_capacity((_v)->bdev->bd_disk))
 
+#undef DPRINTK
+#define DPRINTK(fmt, args...)				\
+	printk("blkback/vbd (%s:%d) " fmt ".\n",	\
+		 __FUNCTION__, __LINE__, ##args)
+
 unsigned long long vbd_size(struct vbd *vbd)
 {
 	return vbd_sz(vbd);
@@ -87,7 +92,7 @@ int vbd_create(blkif_t *blkif, blkif_vde
 	if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
 		vbd->type |= VDISK_REMOVABLE;
 
-	DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
+	DPRINTK("Successful creation of handle=%04x (dom=%u)",
 		handle, blkif->domid);
 	return 0;
 }
diff -urNp blkback/xenbus.c blkback-qos/xenbus.c
--- blkback/xenbus.c	2010-05-20 18:07:00.000000000 +0800
+++ blkback-qos/xenbus.c	2011-06-22 07:34:50.000000000 +0800
@@ -25,13 +25,14 @@
 
 #undef DPRINTK
 #define DPRINTK(fmt, args...)				\
-	pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",	\
+	printk("blkback/xenbus (%s:%d) " fmt ".\n",	\
 		 __FUNCTION__, __LINE__, ##args)
 
 static void connect(struct backend_info *);
 static int connect_ring(struct backend_info *);
 static void backend_changed(struct xenbus_watch *, const char **,
 			    unsigned int);
+static void unregister_rate_watch(struct backend_info *be);
 
 static int blkback_name(blkif_t *blkif, char *buf)
 {
@@ -59,8 +60,10 @@ static void update_blkif_status(blkif_t
 	char name[TASK_COMM_LEN];
 
 	/* Not ready to connect? */
-	if (!blkif->irq || !blkif->vbd.bdev)
+	if (!blkif->irq || !blkif->vbd.bdev){
+		DPRINTK("Not ready to connect");
 		return;
+	}
 
 	/* Already connected? */
 	if (blkif->be->dev->state == XenbusStateConnected)
@@ -193,6 +196,8 @@ static int blkback_remove(struct xenbus_
 		be->cdrom_watch.node = NULL;
 	}
 
+	unregister_rate_watch(be);
+
 	if (be->blkif) {
 		blkif_disconnect(be->blkif);
 		vbd_free(&be->blkif->vbd);
@@ -251,6 +256,10 @@ static int blkback_probe(struct xenbus_d
 
 	err = xenbus_watch_path2(dev, dev->nodename, "physical-device",
 				 &be->backend_watch, backend_changed);
+
+	DPRINTK("blkback_probe called");
+	DPRINTK("dev->nodename=%s/physical-device",dev->nodename);
+	
 	if (err)
 		goto fail;
 
@@ -266,7 +275,6 @@ fail:
 	return err;
 }
 
-
 /**
  * Callback received when the hotplug scripts have placed the physical-device
  * node.  Read it and the mode node, and create a vbd.  If the frontend is
@@ -283,8 +291,9 @@ static void backend_changed(struct xenbu
 	struct xenbus_device *dev = be->dev;
 	int cdrom = 0;
 	char *device_type;
+	char name[TASK_COMM_LEN];
 
-	DPRINTK("");
+	DPRINTK("backend_changed called");
 
 	err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
 			   &major, &minor);
@@ -322,6 +331,34 @@ static void backend_changed(struct xenbu
 		kfree(device_type);
 	}
 
+	/* gather information about QoS policy for this device. */
+	err = blkback_name(be->blkif, name);
+	if (err) {
+		xenbus_dev_error(be->dev, err, "get blkback dev name");
+		return;
+	}
+	
+	err = xenbus_gather(XBT_NIL, dev->otherend,
+				"tokens-rate", "%d", &be->blkif->reqrate, 
+				NULL);
+	if(err){
+		DPRINTK("%s xenbus_gather(tokens-min,tokens-rate) error",name);
+	}else{
+		if(be->blkif->reqrate <= 0){
+			be->blkif->reqmin = 0 ;
+			DPRINTK("%s tokens-rate == 0,no limit",name);	
+		}else{
+			DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate);
+			be->blkif->reqrate *= 2;
+			be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT;
+			if(be->blkif->reqmin > be->blkif->reqrate){
+				be->blkif->reqrate = be->blkif->reqmin;
+				DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate);
+			}
+		}
+	}
+	be->blkif->reqtime = jiffies;
+
 	if (be->major == 0 && be->minor == 0) {
 		/* Front end dir is a number, which is used as the handle. */
 
@@ -414,6 +451,49 @@ static void frontend_changed(struct xenb
 
 /* ** Connection ** */
 
+static void unregister_rate_watch(struct backend_info *be)
+{
+	if (be->have_rate_watch) {
+		unregister_xenbus_watch(&be->rate_watch);
+		kfree(be->rate_watch.node);
+	}
+	be->have_rate_watch = 0;
+}
+
+static void rate_changed(struct xenbus_watch *watch,
+                       const char **vec, unsigned int len)
+{
+
+	struct backend_info *be=container_of(watch,struct backend_info, rate_watch);
+	int err;
+	char name[TASK_COMM_LEN];
+
+	err = blkback_name(be->blkif, name);
+	if (err) {
+		xenbus_dev_error(be->dev, err, "get blkback dev name");
+		return;
+	}
+
+	err = xenbus_gather(XBT_NIL,be->dev->otherend, 
+					"tokens-rate",	"%d", 
+					&be->blkif->reqrate,NULL);
+	if(err){
+		DPRINTK("%s xenbus_gather(tokens-rate) error",name);
+	}else{
+		if(be->blkif->reqrate <= 0){
+			be->blkif->reqmin = 0;
+			DPRINTK("%s tokens-rate == 0,no limit",name);	
+		}else{
+			DPRINTK("%s xenbus_gather(tokens-rate=%d)",name,be->blkif->reqrate);
+			be->blkif->reqrate *= 2;
+			be->blkif->reqmin = VBD_QOS_MIN_RATE_LIMIT;
+			if(be->blkif->reqmin > be->blkif->reqrate){
+				be->blkif->reqrate = be->blkif->reqmin;
+				DPRINTK("%s reset default value(tokens-rate=%d)",name,be->blkif->reqrate);
+			}
+		}
+	}
+}
 
 /**
  * Write the physical details regarding the block device to the store, and
@@ -439,6 +519,14 @@ again:
 	if (err)
 		goto abort;
 
+	/*add by andrew for centos pv*/
+	err = xenbus_printf(xbt, dev->nodename,"feature-flush-cache", "1");
+	if (err){
+		xenbus_dev_fatal(dev, err, "writing %s/feature-flush-cache",
+			dev->nodename);
+		goto abort;
+	}
+
 	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
 			    vbd_size(&be->blkif->vbd));
 	if (err) {
@@ -469,11 +557,22 @@ again:
 	if (err)
 		xenbus_dev_fatal(dev, err, "ending transaction");
 
+	DPRINTK("xenbus_switch_to XenbusStateConnected");
+
 	err = xenbus_switch_state(dev, XenbusStateConnected);
 	if (err)
 		xenbus_dev_fatal(dev, err, "switching to Connected state",
 				 dev->nodename);
 
+	unregister_rate_watch(be);
+	err=xenbus_watch_path2(dev, dev->otherend, "tokens-rate",
+								&be->rate_watch,rate_changed);
+	if (!err)
+		be->have_rate_watch = 1;
+	else
+		xenbus_dev_fatal(dev, err, "watching tokens-rate",
+				 dev->nodename);
+
 	return;
  abort:
 	xenbus_transaction_end(xbt, 1);

[-- Attachment #2: blkback-qos-20110621.diff --]
[-- Type: application/octet-stream, Size: 13108 bytes --]

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel

* Re: VM disk I/O limit patch
  2011-06-21  8:29 VM disk I/O limit patch Andrew Xu
@ 2011-06-21 13:33 ` Konrad Rzeszutek Wilk
  2011-06-22 12:06   ` [Xen-users] " Andrew Xu
  2011-06-22 12:16   ` Re: [Xen-devel] " Florian Heigl
  2011-06-23 20:45 ` Shaun Reitan
  1 sibling, 2 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-06-21 13:33 UTC (permalink / raw)
  To: Andrew Xu; +Cc: xen-devel, xen-users

On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote:
> Hi all,
> 
> I have written a blkback QoS patch.

What tree is this against? There is a xen-blkback in 3.0-rc4; can you
rebase it against that, please?

What is the patch solving? Why can't it be done with dm-ioband?

> With this patch you can configure (dynamically or statically) a different
> I/O speed for each VM disk.
> 
> [...]

* Re: [Xen-users] Re: VM disk I/O limit patch
  2011-06-21 13:33 ` Konrad Rzeszutek Wilk
@ 2011-06-22 12:06   ` Andrew Xu
  2011-06-22 13:11     ` Konrad Rzeszutek Wilk
  2011-06-22 12:16   ` Re: [Xen-devel] " Florian Heigl
  1 sibling, 1 reply; 14+ messages in thread
From: Andrew Xu @ 2011-06-22 12:06 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: xen-devel, xen-users


On Tue, 21 Jun 2011 09:33:37 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote:
> > Hi all,
> > 
> > I have written a blkback QoS patch.
> 
> What tree is this against? 
This patch is based on the SUSE 11 SP1 (2.6.32) xen-blkback source
(i.e. the 2.6.18 "Xenlinux"-based source trees?).

> There is a xen-blkback in 3.0-rc4; can you
> rebase it against that, please?
> 
OK, I will rebase it.

> What is the patch solving? 
> 
With this patch, you can set a different I/O speed for each VM disk.
For example, I set vm17-disk1 to 4 MB/s,
                   vm17-disk2 to 1 MB/s, and
                   vm18-disk3 to 3 MB/s
by writing the following xenstore key-values:
	/local/domain/17/device/vbd/768/tokens-rate = "4096"
	/local/domain/17/device/vbd/2048/tokens-rate = "1024"
	/local/domain/18/device/vbd/768/tokens-rate = "3096"

> Why can't it be done with dm-ioband?
Of course, I/O speed limiting can also be done with dm-ioband.
But with my patch there is no need to load dm-ioband at all.
This patch applies the speed limit closer to the disk and is more
lightweight.


How does it work?
1) It records how many sectors (512 bytes each) have been submitted to
   the disk within the current second.
2) If that count exceeds 2*tokens-rate, it sleeps for the rest of the
   second (one second minus the time already elapsed).
3) When the next second begins, it goes back to 1).
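
In other words, it is a simple token bucket refilled once per second. A
minimal userspace sketch of the same accounting (the names mirror the
patch; the request sizes and the sleep are simulated here, so this is an
illustration of the logic, not the kernel code):

	#include <stdio.h>
	#include <time.h>
	#include <unistd.h>

	#define VBD_QOS_MIN_RATE_LIMIT	(2 * 1024)  /* 2048 sectors/s == 1 MB/s */

	struct blkif_qos {
		time_t reqtime;   /* end of the current 1-second window        */
		int    reqcount;  /* tokens (sectors) left in this window      */
		int    reqmin;    /* floor rate; 0 means no limiting           */
		int    reqrate;   /* sectors per second (2 * tokens-rate KB/s) */
	};

	/* Mirrors refill_reqcount(): open a new window with a full bucket. */
	static void refill_reqcount(struct blkif_qos *b)
	{
		b->reqtime  = time(NULL) + 1;
		b->reqcount = b->reqrate < b->reqmin ? b->reqmin : b->reqrate;
	}

	/*
	 * Mirrors the accounting split between do_block_io_op() and
	 * blkif_schedule(): charge each finished request against the bucket,
	 * sleep out the window when the bucket is empty, and refill once the
	 * window has passed.
	 */
	static void account(struct blkif_qos *b, int done_nr_sects)
	{
		if (!b->reqmin)
			return;                 /* no limit configured */
		b->reqcount -= done_nr_sects;
		if (b->reqcount <= 0 && b->reqtime > time(NULL))
			sleep(b->reqtime - time(NULL)); /* kernel: schedule_timeout() */
		if (time(NULL) >= b->reqtime)
			refill_reqcount(b);
	}

	int main(void)
	{
		/* tokens-rate = 1024 KB/s -> reqrate = 2048 sectors/s = 1 MB/s */
		struct blkif_qos b = { .reqmin = VBD_QOS_MIN_RATE_LIMIT,
				       .reqrate = 2 * 1024 };
		refill_reqcount(&b);
		for (int i = 0; i < 8; i++) {
			account(&b, 512); /* pretend each request moved 512 sectors */
			printf("req %d done, %d tokens left\n", i, b.reqcount);
		}
		return 0;
	}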

> > [...]

**************************************************
Xu An (Andrew Xu)
Dept.  : CloudEx - Operations Support Center - R&D Center
Mobile : 18910391796
E-mail : xu.an@cloudex.cn
Address: Building M5, No. 1 Jiuxianqiao East Road, Chaoyang District, Beijing
**************************************************


* Re: Re: [Xen-devel] VM disk I/O limit patch
  2011-06-21 13:33 ` Konrad Rzeszutek Wilk
  2011-06-22 12:06   ` [Xen-users] " Andrew Xu
@ 2011-06-22 12:16   ` Florian Heigl
  2011-06-22 13:07     ` [Xen-users] " Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 14+ messages in thread
From: Florian Heigl @ 2011-06-22 12:16 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: Andrew Xu, xen-devel, xen-users

Hi,

relying on DM is not advisable, since it is yet another layer that
breaks write barriers, and it also kills portability to non-Linux OSes.

btw:
It would be cool if a block-shaping patch finally made it into the Xen
mainline, since the last one, submitted in 2009, was apparently
forgotten...

Regards,
Florian

2011/6/21 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:
> On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote:
>> Hi all,
>>
>> I add a blkback QoS patch.
>
> What tree is this against? There is a xen-blkback in 3.0-rc4, can you rebase
> it against that please.
>
> What is the patch solving? Why can't it be done with dm-ioband?
>
>> You can config(dynamic/static) different I/O speed for different VM disk
>> by this patch.
>>
>> [quoted patch and mailing-list footers snipped]



-- 
the purpose of libvirt is to provide an abstraction layer hiding all
xen features added since 2006 until they were finally understood and
copied by the kvm devs.


* Re: [Xen-users] Re: VM disk I/O limit patch
  2011-06-22 12:16   ` Re: [Xen-devel] " Florian Heigl
@ 2011-06-22 13:07     ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-06-22 13:07 UTC (permalink / raw)
  To: Florian Heigl; +Cc: Andrew Xu, xen-devel, xen-users

On Wed, Jun 22, 2011 at 02:16:09PM +0200, Florian Heigl wrote:
> Hi,
> 
> relying on DM is not advisable, since it is yet another layer that
> breaks write barriers, and it also kills portability to non-Linux OSes.

How so? Write barriers don't exist anymore - I think you mean
flush support. And I believe those are fixed now: flush requests
are passed through to the bottom layer (the real disk). But let
me double-check that shortly.

Portability to non-Linux OSes? What do you mean by that? As in using
this patch on a NetBSD or OpenSolaris backend? The backend in those
OSes is quite different, so it wouldn't be "portable" anyway.

> 
> btw:
> It would be cool if a block-shaping patch finally made it into the Xen
> mainline, since the last one, submitted in 2009, was apparently
> forgotten...

Why can't dm-ioband do this for you? Hm, I don't remember seeing that patch
at all.


* Re: [Xen-users] Re: VM disk I/O limit patch
  2011-06-22 12:06   ` [Xen-users] " Andrew Xu
@ 2011-06-22 13:11     ` Konrad Rzeszutek Wilk
  2011-06-22 14:12       ` Re[2]: " Andrew Xu
  0 siblings, 1 reply; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-06-22 13:11 UTC (permalink / raw)
  To: Andrew Xu; +Cc: xen-devel, xen-users

On Wed, Jun 22, 2011 at 08:06:23PM +0800, Andrew Xu wrote:
> 
> On Tue, 21 Jun 2011 09:33:37 -0400
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> 
> > On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote:
> > > Hi all,
> > > 
> > > I add a blkback QoS patch.
> > 
> > What tree is this against? 
> This patch is based on the SUSE 11 SP1 (2.6.32) xen-blkback source.
> (the 2.6.18 "Xenlinux"-based source trees?)
> 
> > There is a xen-blkback in 3.0-rc4, can you rebase
> > it against that please.
> > 
> Ok, I will rebase it.

Hold on, let's talk about the problem you are trying to solve first.
> 
> > What is the patch solving? 
> > 
> With this patch, you can set a different I/O speed for each VM disk.
> For example, I set vm17-disk1 to 4MB/s,
>                    vm17-disk2 to 1MB/s, and
>                    vm18-disk3 to 3MB/s
> by writing the following xenstore key-values:
> 	/local/domain/17/device/vbd/768/tokens-rate = "4096"
> 	/local/domain/17/device/vbd/2048/tokens-rate = "1024"
> 	/local/domain/18/device/vbd/768/tokens-rate = "3096"
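> These can be set from dom0 with the standard xenstore-write tool,
> for instance (tokens-rate is in KB/s; the patch doubles the value
> internally to count 512-byte sectors):
> 	xenstore-write /local/domain/17/device/vbd/768/tokens-rate 4096
> 	xenstore-write /local/domain/17/device/vbd/2048/tokens-rate 1024
> 	xenstore-write /local/domain/18/device/vbd/768/tokens-rate 3096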
> 
> > Why can't it be done with dm-ioband?
> Of course, I/O speed limiting can also be done with dm-ioband.
> But with my patch, there is no need to load dm-ioband at all.
> This patch does the speed limiting closer to the disk, and is more lightweight.

I am not convinced this will be easier to maintain than
using existing code (dm-ioband) that the Linux kernel already provides.

Are there other technical reasons 'dm-ioband' is not sufficient?
Could it be possible to fix 'dm-ioband' to not have those
bugs? Florian mentioned flush requests not passing through
the DM layers, but I am pretty sure those have been fixed.


* Re[2]: [Xen-users] Re: VM disk I/O limit patch
  2011-06-22 13:11     ` Konrad Rzeszutek Wilk
@ 2011-06-22 14:12       ` Andrew Xu
  2011-06-22 14:39         ` Konrad Rzeszutek Wilk
  2011-06-24 14:29         ` Re[2]: " Ian Jackson
  0 siblings, 2 replies; 14+ messages in thread
From: Andrew Xu @ 2011-06-22 14:12 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: xen-devel, xen-users


On Wed, 22 Jun 2011 09:11:21 -0400
Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:

> On Wed, Jun 22, 2011 at 08:06:23PM +0800, Andrew Xu wrote:
> > 
> > On Tue, 21 Jun 2011 09:33:37 -0400
> > Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> > 
> > > On Tue, Jun 21, 2011 at 04:29:35PM +0800, Andrew Xu wrote:
> > > > Hi all,
> > > > 
> > > > I add a blkback QoS patch.
> > > 
> > > What tree is this against? 
> > This patch is based on the SUSE 11 SP1 (2.6.32) xen-blkback source.
> > (the 2.6.18 "Xenlinux"-based source trees?)
> > 
> > > There is a xen-blkback in 3.0-rc4, can you rebase
> > > it against that please.
> > > 
> > Ok, I will rebase it.
> 
> Hold on, let's talk about the problem you are trying to solve first.
> > 
> > > What is the patch solving? 
> > > 
> > With this patch, you can set a different I/O speed for each VM disk.
> > For example, I set vm17-disk1 to 4MB/s,
> >                    vm17-disk2 to 1MB/s, and
> >                    vm18-disk3 to 3MB/s
> > by writing the following xenstore key-values:
> > 	/local/domain/17/device/vbd/768/tokens-rate = "4096"
> > 	/local/domain/17/device/vbd/2048/tokens-rate = "1024"
> > 	/local/domain/18/device/vbd/768/tokens-rate = "3096"
> > 
> > > Why can't it be done with dm-ioband?
> > Of course, I/O speed limiting can also be done with dm-ioband.
> > But with my patch, there is no need to load dm-ioband at all.
> > This patch does the speed limiting closer to the disk, and is more lightweight.
> 
> I am not convinced this will be easier to maintain than
> using existing code (dm-ioband) that the Linux kernel already provides.
> 
> Are there other technical reasons 'dm-ioband' is not sufficient?
> Could it be possible to fix 'dm-ioband' to not have those
> bugs? Florian mentioned flush requests not passing through
> the DM layers, but I am pretty sure those have been fixed.
> 
I have not found dm-ioband's bugs, so I can't answer your question.

But shouldn't xen-vm-disk I/O limiting be done by a Xen module?





* Re: [Xen-users] Re: VM disk I/O limit patch
  2011-06-22 14:12       ` Re[2]: " Andrew Xu
@ 2011-06-22 14:39         ` Konrad Rzeszutek Wilk
  2011-06-24 14:29         ` Re[2]: " Ian Jackson
  1 sibling, 0 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-06-22 14:39 UTC (permalink / raw)
  To: Andrew Xu; +Cc: xen-devel, xen-users

> > I am not convinced this will be easier to maintain than
> > using existing code (dm-ioband) that the Linux kernel already provides.
> > 
> > Are there other technical reasons 'dm-ioband' is not sufficient?
> > Could it be possible to fix 'dm-ioband' to not have those
> > bugs? Florian mentioned flush requests not passing through
> > the DM layers, but I am pretty sure those have been fixed.
> > 
> I have not found dm-ioband's bugs, so I can't answer your question.
> 
> But shouldn't xen-vm-disk I/O limiting be done by a Xen module?

Not all I/Os for a guest go through xen-blkback. For example,
the tap and qcow backends are all inside of QEMU, which has no notion
of xen-blkback (this is upstream QEMU, BTW) - so they would not benefit
from this at all.

Whereas if all of the "disks" that are assigned to a guest go
through dm-ioband, the system admin has only one interface that can cover
_all_ of the I/O sinks. And it is standard enough that a system admin
switching over from KVM to Xen doesn't have to
alter a lot of their infrastructure.
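
For reference, setting up dm-ioband is a single dmsetup table. This is
a sketch from memory of the dm-ioband documentation - the exact table
syntax should be checked against the version in use:

	# Wrap /dev/sda1 in an ioband device using the "weight" policy,
	# giving the default group a weight of 40:
	echo "0 $(blockdev --getsize /dev/sda1) ioband /dev/sda1 1 0 0 none" \
	     "weight 0 :40" | dmsetup create ioband1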


* Re: VM disk I/O limit patch
  2011-06-21  8:29 VM disk I/O limit patch Andrew Xu
  2011-06-21 13:33 ` Konrad Rzeszutek Wilk
@ 2011-06-23 20:45 ` Shaun Reitan
  2011-06-27 15:41   ` Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 14+ messages in thread
From: Shaun Reitan @ 2011-06-23 20:45 UTC (permalink / raw)
  To: xen-devel; +Cc: xen-users

Does this patch only limit throughput, or can it also limit the guest by
disk IOPS?  Christopher Aker had a patch way back for UML that did
disk-based QoS.  What I really liked about that patch was that it allowed
for bursting by using a bucket.  If I remember correctly, you specified
that a guest's bucket could hold, say, 4000 tokens, and the bucket would
be filled with 10 tokens a second.  Each I/O took one token from the
bucket.  When the bucket was empty, I/O was paused and processed as the
bucket was refilled.  This allowed a guest to burst for a short period of
time until the bucket was empty, and then it would slowly be filled back
up.

Also, what was nice is that the guest had a /proc entry that told the
customer how many tokens they currently had in their bucket.
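
For illustration, the core of such a limiter might look like this in C
(a sketch only - the names and numbers are hypothetical, not taken from
the UML patch):

	/* One bucket per guest; capacity bounds the burst size. */
	struct token_bucket {
		long tokens;      /* tokens currently in the bucket */
		long capacity;    /* e.g. 4000 */
		long refill_rate; /* e.g. 10 tokens added per second */
	};

	/* Called once a second (e.g. from a timer) to top up the bucket. */
	static void bucket_refill(struct token_bucket *b)
	{
		b->tokens += b->refill_rate;
		if (b->tokens > b->capacity)
			b->tokens = b->capacity;
	}

	/* Each I/O takes one token; returns 0 if the I/O must wait. */
	static int bucket_try_io(struct token_bucket *b)
	{
		if (b->tokens <= 0)
			return 0; /* empty: pause I/O until the next refill */
		b->tokens--;
		return 1;
	}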

I would like to see something like this in Xen; I've even thought about
posting to the devel forums to see if somebody wanted to write one for $$$.

~Shaun


* Re[2]: [Xen-users] Re: VM disk I/O limit patch
  2011-06-22 14:12       ` Re[2]: " Andrew Xu
  2011-06-22 14:39         ` Konrad Rzeszutek Wilk
@ 2011-06-24 14:29         ` Ian Jackson
  1 sibling, 0 replies; 14+ messages in thread
From: Ian Jackson @ 2011-06-24 14:29 UTC (permalink / raw)
  To: Andrew Xu; +Cc: xen-devel, xen-users, Konrad Rzeszutek Wilk

Andrew Xu writes ("Re[2]: [Xen-users] Re: [Xen-devel] VM disk I/O limit patch"):
> On Wed, 22 Jun 2011 09:11:21 -0400
> Konrad Rzeszutek Wilk <konrad.wilk@oracle.com> wrote:
> > I am not convinced this will be easier to maintain than
> > using existing code (dm-ioband) that the Linux kernel already provides.
> 
> But shouldn't xen-vm-disk I/O limiting be done by a Xen module?

Certainly not.  It is much better for Xen systems to use general
facilities where those are sufficient.

Ian.


* Re: Re: VM disk I/O limit patch
  2011-06-23 20:45 ` Shaun Reitan
@ 2011-06-27 15:41   ` Konrad Rzeszutek Wilk
  2011-06-27 19:22     ` Shaun Reitan
  2011-06-28 11:32     ` Re: [Xen-devel] " Florian Heigl
  0 siblings, 2 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-06-27 15:41 UTC (permalink / raw)
  To: Shaun Reitan; +Cc: xen-devel, xen-users

On Thu, Jun 23, 2011 at 01:45:36PM -0700, Shaun Reitan wrote:
> Does this patch only limit throughput, or can it also limit the guest
> by disk IOPS?  Christopher Aker had a patch way back for UML that

Just throughput.

> did disk-based QoS.  What I really liked about that patch was that
> it allowed for bursting by using a bucket.  If I remember correctly,
> you specified that a guest's bucket could hold, say, 4000 tokens, and
> the bucket would be filled with 10 tokens a second.  Each I/O took
> one token from the bucket.  When the bucket was empty, I/O was paused
> and processed as the bucket was refilled.  This allowed a guest to
> burst for a short period of time until the bucket was empty, and
> then it would slowly be filled back up.

Uhhh... are you sure you are talking about the same patch?
> 
> Also, what was nice is that the guest had a /proc entry that told
> the customer how many tokens they currently had in their bucket.

OK... but how would this help the customers? They don't have access
to /proc in Dom0.
> 
> I would like to see something like this in Xen; I've even thought
> about posting to the devel forums to see if somebody wanted to write
> one for $$$.

Why not use dm-ioband (here is a doc about it:
http://lwn.net/Articles/344441/), which has many more options and also
provides the bucket and tokens you are looking for?

[edit: Looks like dm-ioband never made it into the Linux kernel. But there
was something that I thought Vivek wrote that was superior to dm-ioband...
Ah, yes: blkio-controller.txt.

Look in Documentation/cgroups/blkio-controller.txt]
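
As a sketch of the throttling side of that controller (the device
numbers, cgroup name, and PID here are illustrative, not from this
thread):

	mount -t cgroup -o blkio none /sys/fs/cgroup/blkio
	mkdir /sys/fs/cgroup/blkio/vm17
	# "major:minor bytes-per-second" for the backing device (8:16 = sdb)
	echo "8:16 4194304" > /sys/fs/cgroup/blkio/vm17/blkio.throttle.read_bps_device
	echo $BLKBACK_THREAD_PID > /sys/fs/cgroup/blkio/vm17/tasks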


* Re: VM disk I/O limit patch
  2011-06-27 15:41   ` Konrad Rzeszutek Wilk
@ 2011-06-27 19:22     ` Shaun Reitan
  2011-06-28 11:32     ` Re: [Xen-devel] " Florian Heigl
  1 sibling, 0 replies; 14+ messages in thread
From: Shaun Reitan @ 2011-06-27 19:22 UTC (permalink / raw)
  To: xen-devel; +Cc: xen-users

The patch I'm talking about can be found here:
http://theshore.net/~caker/uml/patches/  It's called the token limiter.

The /proc entries were under the guest's own /proc.

dm-ioband is something I just found a few weeks ago; I do plan to do
some testing with it.

~Shaun


* Re: Re: [Xen-devel] Re: VM disk I/O limit patch
  2011-06-27 15:41   ` Konrad Rzeszutek Wilk
  2011-06-27 19:22     ` Shaun Reitan
@ 2011-06-28 11:32     ` Florian Heigl
  2011-06-28 13:29       ` [Xen-users] " Konrad Rzeszutek Wilk
  1 sibling, 1 reply; 14+ messages in thread
From: Florian Heigl @ 2011-06-28 11:32 UTC (permalink / raw)
  To: Konrad Rzeszutek Wilk; +Cc: Shaun Reitan, xen-devel, xen-users

Hi Konrad,

Thanks for providing links to info about both dm-ioband and blkio.
This is surely something to test with and might be the best choice for
Xen from 2.6.34 and up.

Question/food for thought:
since:
- 2.6.18 still has CFQ1, which has notable issues with processes
starving each other (some people have seen this and some haven't, but
it exists and it's one of the worst issues around. Normally people
will switch to the deadline scheduler and ... find they can no longer
prioritize, and even then they'll still see their dom0 go
sluggish if a domU is too I/O-heavy)
- Both the blkio patch and dm-ioband are not in 2.6.18 and not even in 2.6.32(!!!)
- The patch from last week was for 2.6.18...

would it be possible to add the patch to the 2.6.18-ish Xen trees and
not the 3.x one?
We could have a (hopefully) working solution for a problem that exists
on the deployments that are in use now, and it could easily go
into a XenServer 5.6 Patch123456 or XCP or OracleVM.


This might also be the more time-conserving way to do it: the cgroups
mechanisms in Linux are nice, but it should be obvious
that there's still a year or two to go from setting up every single
thing via /sys after a process has started to a working solution that
can be pre-configured for all VMs.

Unless anybody thinks this is enough ;)


2011/6/27 Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>:
> On Thu, Jun 23, 2011 at 01:45:36PM -0700, Shaun Reitan wrote:
>> Does this patch only limit throughput, or can it also limit the guest
>> by disk IOPS?  Christopher Aker had a patch way back for UML that
>
> Just throughput.
>
>> did disk-based QoS.  What I really liked about that patch was that
>> it allowed for bursting by using a bucket.  If I remember correctly,

anything that is able to employ limits and keeps them burstable is
just perfect :)


-- 
the purpose of libvirt is to provide an abstraction layer hiding all
xen features added since 2006 until they were finally understood and
copied by the kvm devs.


* Re: [Xen-users] Re: Re: VM disk I/O limit patch
  2011-06-28 11:32     ` Re: [Xen-devel] " Florian Heigl
@ 2011-06-28 13:29       ` Konrad Rzeszutek Wilk
  0 siblings, 0 replies; 14+ messages in thread
From: Konrad Rzeszutek Wilk @ 2011-06-28 13:29 UTC (permalink / raw)
  To: Florian Heigl; +Cc: Shaun Reitan, xen-devel, xen-users

On Tue, Jun 28, 2011 at 01:32:14PM +0200, Florian Heigl wrote:
> Hi Konrad,
> 
> Thanks for providing links to info about both dm-ioband and blkio.
> This is surely something to test with and might be the best choice for
> Xen from 2.6.34 and up.
> 
> Question/food for thought:
> since:
> - 2.6.18 still has CFQ1, which has notable issues with processes
> starving each other (some people have seen this and some haven't, but
> it exists and it's one of the worst issues around. Normally people
> will switch to the deadline scheduler and ... find they can no longer
> prioritize, and even then they'll still see their dom0 go
> sluggish if a domU is too I/O-heavy)
> - Both the blkio patch and dm-ioband are not in 2.6.18 and not even in 2.6.32(!!!)

Right, so you can upgrade to 3.0 or 2.6.39.

> - The patch from last week was for 2.6.18...

Ah, no idea who the maintainer of the 2.6.18 tree is anymore.

> 
> would it be possible to add the patch to the 2.6.18-ish Xen trees and
> not the 3.x one?

You are welcome to do this, but I don't think anybody else is going to.
I am definitely not going to take the patch for the 3.0 tree.

> We could have a (hopefully) working solution for a problem that exists
> on the deployments that are in use now, and it could easily go
> into a XenServer 5.6 Patch123456 or XCP or OracleVM.
> 
> 
> This might also be the more time-conserving way to do it: the cgroups
> mechanisms in Linux are nice, but it should be obvious
> that there's still a year or two to go from setting up every single
> thing via /sys after a process has started to a working solution that
> can be pre-configured for all VMs.
> 
> Unless anybody thinks this is enough ;)


Thread overview: 14+ messages
2011-06-21  8:29 VM disk I/O limit patch Andrew Xu
2011-06-21 13:33 ` Konrad Rzeszutek Wilk
2011-06-22 12:06   ` [Xen-users] " Andrew Xu
2011-06-22 13:11     ` Konrad Rzeszutek Wilk
2011-06-22 14:12       ` Re[2]: " Andrew Xu
2011-06-22 14:39         ` Konrad Rzeszutek Wilk
2011-06-24 14:29         ` Re[2]: " Ian Jackson
2011-06-22 12:16   ` Re: [Xen-devel] " Florian Heigl
2011-06-22 13:07     ` [Xen-users] " Konrad Rzeszutek Wilk
2011-06-23 20:45 ` Shaun Reitan
2011-06-27 15:41   ` Konrad Rzeszutek Wilk
2011-06-27 19:22     ` Shaun Reitan
2011-06-28 11:32     ` Re: [Xen-devel] " Florian Heigl
2011-06-28 13:29       ` [Xen-users] " Konrad Rzeszutek Wilk
