* [PATCH 3/3] netxen: qlogic ethernet : Fix Endian Bug.
@ 2012-03-02 15:11 santosh nayak
  2012-03-02 17:22 ` Rajesh Borundia
  2012-03-05 21:49 ` [PATCH 0001/001] xen: multi page ring support for block devices Santosh Jodh
  0 siblings, 2 replies; 15+ messages in thread
From: santosh nayak @ 2012-03-02 15:11 UTC (permalink / raw)
  To: sony.chacko
  Cc: rajesh.borundia, netdev, linux-kernel, kernel-janitors, Santosh Nayak

From: Santosh Nayak <santoshprasadnayak@gmail.com>

Fix an endian bug: 'ifa->ifa_address' is big-endian (__be32) but is
compared with and stored into host-order values, so convert it with
be32_to_cpu() at the points of use, and drop a cpu_to_le32() on a
CPU-order value in netxen_check_options().
Add a default case in 'netxen_list_config_vlan_ip'.

Signed-off-by: Santosh Nayak <santoshprasadnayak@gmail.com>
---
 .../net/ethernet/qlogic/netxen/netxen_nic_main.c   |   20 ++++++++++++--------
 1 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
index 8dc4a134..971b286 100644
--- a/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
+++ b/drivers/net/ethernet/qlogic/netxen/netxen_nic_main.c
@@ -818,7 +818,7 @@ netxen_check_options(struct netxen_adapter *adapter)
 			adapter->driver_mismatch = 1;
 			return;
 		}
-		ptr32[i] = cpu_to_le32(val);
+		ptr32[i] = val;
 		offset += sizeof(u32);
 	}
 
@@ -3028,7 +3028,7 @@ netxen_list_config_vlan_ip(struct netxen_adapter *adapter,
 		list_for_each(head, &adapter->vlan_ip_list) {
 			cur = list_entry(head, struct nx_vlan_ip_list, list);
 
-			if (cur->ip_addr == ifa->ifa_address)
+			if (cur->ip_addr == be32_to_cpu(ifa->ifa_address))
 				return;
 		}
 
@@ -3039,18 +3039,22 @@ netxen_list_config_vlan_ip(struct netxen_adapter *adapter,
 			return;
 		}
 
-		cur->ip_addr = ifa->ifa_address;
+		cur->ip_addr = be32_to_cpu(ifa->ifa_address);
 		list_add_tail(&cur->list, &adapter->vlan_ip_list);
 		break;
 	case NX_IP_DOWN:
 		list_for_each_entry_safe(cur, tmp_cur,
 					&adapter->vlan_ip_list, list) {
-			if (cur->ip_addr == ifa->ifa_address) {
+			if (cur->ip_addr == be32_to_cpu(ifa->ifa_address)) {
 				list_del(&cur->list);
 				kfree(cur);
 				break;
 			}
 		}
+		break;
+	default:
+		printk(KERN_ERR "%ld: Wrong event id \n", event);
+		break;
 	}
 }
 static void
@@ -3070,12 +3074,12 @@ netxen_config_indev_addr(struct netxen_adapter *adapter,
 		switch (event) {
 		case NETDEV_UP:
 			netxen_config_ipaddr(adapter,
-					ifa->ifa_address, NX_IP_UP);
+					be32_to_cpu(ifa->ifa_address), NX_IP_UP);
 			netxen_list_config_vlan_ip(adapter, ifa, NX_IP_UP);
 			break;
 		case NETDEV_DOWN:
 			netxen_config_ipaddr(adapter,
-					ifa->ifa_address, NX_IP_DOWN);
+					be32_to_cpu(ifa->ifa_address), NX_IP_DOWN);
 			netxen_list_config_vlan_ip(adapter, ifa, NX_IP_DOWN);
 			break;
 		default:
@@ -3167,11 +3171,11 @@ recheck:
 
 	switch (event) {
 	case NETDEV_UP:
-		netxen_config_ipaddr(adapter, ifa->ifa_address, NX_IP_UP);
+		netxen_config_ipaddr(adapter, be32_to_cpu(ifa->ifa_address), NX_IP_UP);
 		netxen_list_config_vlan_ip(adapter, ifa, NX_IP_UP);
 		break;
 	case NETDEV_DOWN:
-		netxen_config_ipaddr(adapter, ifa->ifa_address, NX_IP_DOWN);
+		netxen_config_ipaddr(adapter, be32_to_cpu(ifa->ifa_address), NX_IP_DOWN);
 		netxen_list_config_vlan_ip(adapter, ifa, NX_IP_DOWN);
 		break;
 	default:
-- 
1.7.4.4


* RE: [PATCH 3/3] netxen: qlogic ethernet : Fix Endian Bug.
  2012-03-02 15:11 [PATCH 3/3] netxen: qlogic ethernet : Fix Endian Bug santosh nayak
@ 2012-03-02 17:22 ` Rajesh Borundia
  2012-03-05 21:49 ` [PATCH 0001/001] xen: multi page ring support for block devices Santosh Jodh
  1 sibling, 0 replies; 15+ messages in thread
From: Rajesh Borundia @ 2012-03-02 17:22 UTC (permalink / raw)
  To: santosh nayak, Sony Chacko; +Cc: netdev, linux-kernel, kernel-janitors

Santosh,

Thanks for pointing out the bug. But the adapter takes the IP address in
big-endian format, so there is no need to use be32_to_cpu; instead, the
data type of the ip fields should be changed to __be32.
Also in netxen_config_ipaddr():
-      req.words[1] = cpu_to_le64(ip);
+      req.words[1] = ip;
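
As a rough sketch of that direction (illustrative only, not a complete
patch; the struct is abbreviated):

       struct nx_vlan_ip_list {
               struct list_head list;
               __be32 ip_addr;         /* kept in network byte order */
       };

       /* Comparisons and stores then need no byte swapping: */
       if (cur->ip_addr == ifa->ifa_address)
               return;
       ...
       cur->ip_addr = ifa->ifa_address;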

Rest looks fine.

Rajesh

* [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-02 15:11 [PATCH 3/3] netxen: qlogic ethernet : Fix Endian Bug santosh nayak
  2012-03-02 17:22 ` Rajesh Borundia
@ 2012-03-05 21:49 ` Santosh Jodh
  2012-03-06  2:42   ` Rusty Russell
                     ` (3 more replies)
  1 sibling, 4 replies; 15+ messages in thread
From: Santosh Jodh @ 2012-03-05 21:49 UTC (permalink / raw)
  To: konrad.wilk, jeremy, Ian Campbell, jbarnes, jbeulich, joe.jin,
	lersek, weiyi.huang, rusty, dgdegra, David Vrabel,
	paul.gortmaker, akpm, waldi, virtualization, netdev, linux-pci,
  Cc: Santosh Jodh, Paul Durrant

From: Santosh Jodh <santosh.jodh@citrix.com>

Add support for multi page ring for block devices.
The number of pages is configurable for blkback via module parameter.
blkback reports max-ring-page-order to blkfront via xenstore.
blkfront reports its supported ring-page-order to blkback via xenstore.
blkfront reports multi page ring references via ring-refNN in xenstore.
The change allows newer blkfront to work with older blkback and
vice-versa.
Based on original patch by Paul Durrant.
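
For illustration only (xenstore paths abbreviated, example values), a
connection negotiated at ring-page-order 2 would leave keys like:

    .../backend/vbd/<domid>/<dev>/max-ring-page-order = "2"
    .../device/vbd/<dev>/ring-page-order = "2"
    .../device/vbd/<dev>/ring-ref0 = "832"
    .../device/vbd/<dev>/ring-ref1 = "833"
    .../device/vbd/<dev>/ring-ref2 = "834"
    .../device/vbd/<dev>/ring-ref3 = "835"
    .../device/vbd/<dev>/event-channel = "11"

A frontend that does not write ring-page-order is treated as legacy and the
backend falls back to reading the single-page "ring-ref" key.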

Signed-off-by: Santosh Jodh <santosh.jodh@citrix.com>
---
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 0088bf6..72f2e18 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -60,6 +60,39 @@ static int xen_blkif_reqs = 64;
 module_param_named(reqs, xen_blkif_reqs, int, 0);
 MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");

+/* Order of maximum shared ring size advertised to the front end. */
+int xen_blkif_max_ring_order = XENBUS_MAX_RING_ORDER;
+
+#define BLK_RING_SIZE(_order) __CONST_RING_SIZE(blkif, PAGE_SIZE << (_order))
+
+static int set_max_ring_order(const char *buf, struct kernel_param *kp)
+{
+       int err;
+       unsigned long order;
+
+       err = kstrtol(buf, 0, &order);
+       if (err ||
+           order < 0 ||
+           order > XENBUS_MAX_RING_ORDER)
+               return -EINVAL;
+
+       if (xen_blkif_reqs < BLK_RING_SIZE(order))
+               printk(KERN_WARNING "WARNING: "
+                      "I/O request space (%d reqs) < ring order %ld, "
+                      "consider increasing %s.reqs to >= %ld.",
+                      xen_blkif_reqs, order, KBUILD_MODNAME,
+                      roundup_pow_of_two(BLK_RING_SIZE(order)));
+
+       xen_blkif_max_ring_order = order;
+
+       return 0;
+}
+
+module_param_call(max_ring_order,
+                 set_max_ring_order, param_get_int,
+                 &xen_blkif_max_ring_order, 0644);
+MODULE_PARM_DESC(max_ring_order, "log2 of maximum ring size, in pages.");
+
 /* Run-time switchable: /sys/module/blkback/parameters/ */
 static unsigned int log_stats;
 module_param(log_stats, int, 0644);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index d0ee7ed..5f33a1a 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -126,6 +126,8 @@ struct blkif_x86_64_response {
        int16_t         status;          /* BLKIF_RSP_???       */
 };

+extern int xen_blkif_max_ring_order;
+
 DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
                  struct blkif_common_response);
 DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 24a2fb5..7a9d71d 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -122,8 +122,8 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
        return blkif;
 }

-static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
-                        unsigned int evtchn)
+static int xen_blkif_map(struct xen_blkif *blkif, int ring_ref[],
+                        unsigned int ring_order, unsigned int evtchn)
 {
        int err;

@@ -131,7 +131,8 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
        if (blkif->irq)
                return 0;

-       err = xenbus_map_ring_valloc(blkif->be->dev, shared_page, &blkif->blk_ring);
+       err = xenbus_map_ring_valloc(blkif->be->dev, ring_ref, 1 << ring_order,
+                                    &blkif->blk_ring);
        if (err < 0)
                return err;

@@ -140,21 +141,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
        {
                struct blkif_sring *sring;
                sring = (struct blkif_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+               BACK_RING_INIT(&blkif->blk_rings.native, sring,
+                              PAGE_SIZE << ring_order);
                break;
        }
        case BLKIF_PROTOCOL_X86_32:
        {
                struct blkif_x86_32_sring *sring_x86_32;
                sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+               BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32,
+                              PAGE_SIZE << ring_order);
                break;
        }
        case BLKIF_PROTOCOL_X86_64:
        {
                struct blkif_x86_64_sring *sring_x86_64;
                sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+               BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64,
+                              PAGE_SIZE << ring_order);
                break;
        }
        default:
@@ -497,6 +501,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
        if (err)
                goto fail;

+       err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order",
+                           "%u", xen_blkif_max_ring_order);
+       if (err)
+               goto fail;
+
        err = xenbus_switch_state(dev, XenbusStateInitWait);
        if (err)
                goto fail;
@@ -744,22 +753,80 @@ again:
 static int connect_ring(struct backend_info *be)
 {
        struct xenbus_device *dev = be->dev;
-       unsigned long ring_ref;
+       int ring_ref[XENBUS_MAX_RING_PAGES];
+       unsigned int ring_order;
        unsigned int evtchn;
        char protocol[64] = "";
        int err;

        DPRINTK("%s", dev->otherend);

-       err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
-                           &ring_ref, "event-channel", "%u", &evtchn, NULL);
-       if (err) {
-               xenbus_dev_fatal(dev, err,
-                                "reading %s/ring-ref and event-channel",
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
+                          &evtchn);
+       if (err != 1) {
+               err = -EINVAL;
+
+               xenbus_dev_fatal(dev, err, "reading %s/event-channel",
                                 dev->otherend);
                return err;
        }

+       printk(KERN_INFO "blkback: event-channel %u\n", evtchn);
+
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
+                          &ring_order);
+       if (err != 1) {
+               DPRINTK("%s: using single page handshake", dev->otherend);
+
+               ring_order = 0;
+
+               err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
+                                  "%d", &ring_ref[0]);
+               if (err != 1) {
+                       err = -EINVAL;
+
+                       xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
+                                        dev->otherend);
+                       return err;
+               }
+
+               printk(KERN_INFO "blkback: ring-ref %d\n", ring_ref[0]);
+       } else {
+               unsigned int i;
+
+               if (ring_order > xen_blkif_max_ring_order) {
+                       err = -EINVAL;
+
+                       xenbus_dev_fatal(dev, err,
+                                        "%s/ring-page-order too big",
+                                        dev->otherend);
+                       return err;
+               }
+
+               for (i = 0; i < (1u << ring_order); i++) {
+                       char ring_ref_name[10];
+
+                       snprintf(ring_ref_name, sizeof(ring_ref_name),
+                                "ring-ref%u", i);
+
+                       err = xenbus_scanf(XBT_NIL, dev->otherend,
+                                          ring_ref_name, "%d",
+                                          &ring_ref[i]);
+                       if (err != 1) {
+                               err = -EINVAL;
+
+                               xenbus_dev_fatal(dev, err,
+                                                "reading %s/%s",
+                                                dev->otherend,
+                                                ring_ref_name);
+                               return err;
+                       }
+
+                       printk(KERN_INFO "blkback: ring-ref%u %d\n", i,
+                              ring_ref[i]);
+               }
+       }
+
        be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
        err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
                            "%63s", protocol, NULL);
@@ -775,14 +842,11 @@ static int connect_ring(struct backend_info *be)
                xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
                return -1;
        }
-       pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n",
-               ring_ref, evtchn, be->blkif->blk_protocol, protocol);

        /* Map the shared frame, irq etc. */
-       err = xen_blkif_map(be->blkif, ring_ref, evtchn);
+       err = xen_blkif_map(be->blkif, ring_ref, ring_order, evtchn);
        if (err) {
-               xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
-                                ring_ref, evtchn);
+               xenbus_dev_fatal(dev, err, "mapping ring-refs and evtchn");
                return err;
        }

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2f22874..485813a 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -57,6 +57,10 @@

 #include <asm/xen/hypervisor.h>

+static int xen_blkif_ring_order;
+module_param_named(reqs, xen_blkif_ring_order, int, 0);
+MODULE_PARM_DESC(reqs, "log2 of requested ring size, in pages.");
+
 enum blkif_state {
        BLKIF_STATE_DISCONNECTED,
        BLKIF_STATE_CONNECTED,
@@ -72,7 +76,8 @@ struct blk_shadow {
 static DEFINE_MUTEX(blkfront_mutex);
 static const struct block_device_operations xlvbd_block_fops;

-#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+#define BLK_RING_SIZE(_order)  __CONST_RING_SIZE(blkif, PAGE_SIZE << (_order))
+#define BLK_MAX_RING_SIZE      BLK_RING_SIZE(XENBUS_MAX_RING_ORDER)

 /*
  * We have one of these per vbd, whether ide, scsi or 'other'.  They
@@ -87,14 +92,15 @@ struct blkfront_info
        int vdevice;
        blkif_vdev_t handle;
        enum blkif_state connected;
-       int ring_ref;
+       int ring_ref[XENBUS_MAX_RING_PAGES];
+       int ring_order;
        struct blkif_front_ring ring;
        struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int evtchn, irq;
        struct request_queue *rq;
        struct work_struct work;
        struct gnttab_free_callback callback;
-       struct blk_shadow shadow[BLK_RING_SIZE];
+       struct blk_shadow shadow[BLK_MAX_RING_SIZE];
        unsigned long shadow_free;
        unsigned int feature_flush;
        unsigned int flush_op;
@@ -111,9 +117,7 @@ static unsigned int nr_minors;
 static unsigned long *minors;
 static DEFINE_SPINLOCK(minor_lock);

-#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
-       (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
-#define GRANT_INVALID_REF      0
+#define GRANT_INVALID_REF      0

 #define PARTS_PER_DISK         16
 #define PARTS_PER_EXT_DISK      256
@@ -135,7 +139,7 @@ static DEFINE_SPINLOCK(minor_lock);
 static int get_id_from_freelist(struct blkfront_info *info)
 {
        unsigned long free = info->shadow_free;
-       BUG_ON(free >= BLK_RING_SIZE);
+       BUG_ON(free >= BLK_MAX_RING_SIZE);
        info->shadow_free = info->shadow[free].req.u.rw.id;
        info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
        return free;
@@ -683,6 +687,8 @@ static void blkif_restart_queue(struct work_struct *work)

 static void blkif_free(struct blkfront_info *info, int suspend)
 {
+       int i;
+
        /* Prevent new requests being issued until we fix things up. */
        spin_lock_irq(&blkif_io_lock);
        info->connected = suspend ?
@@ -698,16 +704,19 @@ static void blkif_free(struct blkfront_info *info, int suspend)
        flush_work_sync(&info->work);

        /* Free resources associated with old device channel. */
-       if (info->ring_ref != GRANT_INVALID_REF) {
-               gnttab_end_foreign_access(info->ring_ref, 0,
-                                         (unsigned long)info->ring.sring);
-               info->ring_ref = GRANT_INVALID_REF;
-               info->ring.sring = NULL;
+       for (i = 0; i < (1 << info->ring_order); i++) {
+               if (info->ring_ref[i] != GRANT_INVALID_REF) {
+                       gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
+                       info->ring_ref[i] = GRANT_INVALID_REF;
+               }
        }
+
+       free_pages((unsigned long)info->ring.sring, info->ring_order);
+       info->ring.sring = NULL;
+
        if (info->irq)
                unbind_from_irqhandler(info->irq, info);
        info->evtchn = info->irq = 0;
-
 }

 static void blkif_completion(struct blk_shadow *s)
@@ -828,25 +837,24 @@ static int setup_blkring(struct xenbus_device *dev,
        struct blkif_sring *sring;
        int err;

-       info->ring_ref = GRANT_INVALID_REF;
-
-       sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+       sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
+                                                      info->ring_order);
        if (!sring) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
                return -ENOMEM;
        }
        SHARED_RING_INIT(sring);
-       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE << info->ring_order);

        sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);

-       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
+       err = xenbus_grant_ring(dev, info->ring.sring, 1 << info->ring_order,
+                               info->ring_ref);
        if (err < 0) {
-               free_page((unsigned long)sring);
+               free_pages((unsigned long)sring, info->ring_order);
                info->ring.sring = NULL;
                goto fail;
        }
-       info->ring_ref = err;

        err = xenbus_alloc_evtchn(dev, &info->evtchn);
        if (err)
@@ -875,8 +883,27 @@ static int talk_to_blkback(struct xenbus_device *dev,
 {
        const char *message = NULL;
        struct xenbus_transaction xbt;
+       unsigned int ring_order;
+       int legacy_backend;
+       int i;
        int err;

+       for (i = 0; i < (1 << info->ring_order); i++)
+               info->ring_ref[i] = GRANT_INVALID_REF;
+
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-page-order", "%u",
+                          &ring_order);
+
+       legacy_backend = !(err == 1);
+
+       if (legacy_backend) {
+               info->ring_order = 0;
+       } else {
+               info->ring_order = (ring_order <= xen_blkif_ring_order) ?
+                                  ring_order :
+                                  xen_blkif_ring_order;
+       }
+
        /* Create shared ring, alloc event channel. */
        err = setup_blkring(dev, info);
        if (err)
@@ -889,12 +916,35 @@ again:
                goto destroy_blkring;
        }

-       err = xenbus_printf(xbt, dev->nodename,
-                           "ring-ref", "%u", info->ring_ref);
-       if (err) {
-               message = "writing ring-ref";
-               goto abort_transaction;
+       if (legacy_backend) {
+               err = xenbus_printf(xbt, dev->nodename,
+                                   "ring-ref", "%d", info->ring_ref[0]);
+               if (err) {
+                       message = "writing ring-ref";
+                       goto abort_transaction;
+               }
+       } else {
+               for (i = 0; i < (1 << info->ring_order); i++) {
+                       char key[sizeof("ring-ref") + 2];
+
+                       sprintf(key, "ring-ref%d", i);
+
+                       err = xenbus_printf(xbt, dev->nodename,
+                                           key, "%d", info->ring_ref[i]);
+                       if (err) {
+                               message = "writing ring-ref";
+                               goto abort_transaction;
+                       }
+               }
+
+               err = xenbus_printf(xbt, dev->nodename,
+                                   "ring-page-order", "%u", info->ring_order);
+               if (err) {
+                       message = "writing ring-order";
+                       goto abort_transaction;
+               }
        }
+
        err = xenbus_printf(xbt, dev->nodename,
                            "event-channel", "%u", info->evtchn);
        if (err) {
@@ -996,21 +1046,14 @@ static int blkfront_probe(struct xenbus_device *dev,
        info->connected = BLKIF_STATE_DISCONNECTED;
        INIT_WORK(&info->work, blkif_restart_queue);

-       for (i = 0; i < BLK_RING_SIZE; i++)
+       for (i = 0; i < BLK_MAX_RING_SIZE; i++)
                info->shadow[i].req.u.rw.id = i+1;
-       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
+       info->shadow[BLK_MAX_RING_SIZE-1].req.u.rw.id = 0x0fffffff;

        /* Front end dir is a number, which is used as the id. */
        info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
        dev_set_drvdata(&dev->dev, info);

-       err = talk_to_blkback(dev, info);
-       if (err) {
-               kfree(info);
-               dev_set_drvdata(&dev->dev, NULL);
-               return err;
-       }
-
        return 0;
 }

@@ -1031,13 +1074,13 @@ static int blkif_recover(struct blkfront_info *info)

        /* Stage 2: Set up free list. */
        memset(&info->shadow, 0, sizeof(info->shadow));
-       for (i = 0; i < BLK_RING_SIZE; i++)
+       for (i = 0; i < BLK_MAX_RING_SIZE; i++)
                info->shadow[i].req.u.rw.id = i+1;
        info->shadow_free = info->ring.req_prod_pvt;
-       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
+       info->shadow[BLK_MAX_RING_SIZE-1].req.u.rw.id = 0x0fffffff;

        /* Stage 3: Find pending requests and requeue them. */
-       for (i = 0; i < BLK_RING_SIZE; i++) {
+       for (i = 0; i < BLK_RING_SIZE(info->ring_order); i++) {
                /* Not in use? */
                if (!copy[i].request)
                        continue;
@@ -1299,7 +1342,6 @@ static void blkback_changed(struct xenbus_device *dev,

        switch (backend_state) {
        case XenbusStateInitialising:
-       case XenbusStateInitWait:
        case XenbusStateInitialised:
        case XenbusStateReconfiguring:
        case XenbusStateReconfigured:
@@ -1307,6 +1349,10 @@ static void blkback_changed(struct xenbus_device *dev,
        case XenbusStateClosed:
                break;

+       case XenbusStateInitWait:
+               talk_to_blkback(dev, info);
+               break;
+
        case XenbusStateConnected:
                blkfront_connect(info);
                break;
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 94b79c3..f93b59a 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -130,8 +130,8 @@ int xen_netbk_must_stop_queue(struct xenvif *vif);
 /* (Un)Map communication rings. */
 void xen_netbk_unmap_frontend_rings(struct xenvif *vif);
 int xen_netbk_map_frontend_rings(struct xenvif *vif,
-                                grant_ref_t tx_ring_ref,
-                                grant_ref_t rx_ring_ref);
+                                int tx_ring_ref,
+                                int rx_ring_ref);

 /* (De)Register a xenvif with the netback backend. */
 void xen_netbk_add_xenvif(struct xenvif *vif);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 59effac..0b014cf 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1594,8 +1594,8 @@ void xen_netbk_unmap_frontend_rings(struct xenvif *vif)
 }

 int xen_netbk_map_frontend_rings(struct xenvif *vif,
-                                grant_ref_t tx_ring_ref,
-                                grant_ref_t rx_ring_ref)
+                                int tx_ring_ref,
+                                int rx_ring_ref)
 {
        void *addr;
        struct xen_netif_tx_sring *txs;
@@ -1604,7 +1604,7 @@ int xen_netbk_map_frontend_rings(struct xenvif *vif,
        int err = -ENOMEM;

        err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(vif),
-                                    tx_ring_ref, &addr);
+                                    &tx_ring_ref, 1, &addr);
        if (err)
                goto err;

@@ -1612,7 +1612,7 @@ int xen_netbk_map_frontend_rings(struct xenvif *vif,
        BACK_RING_INIT(&vif->tx, txs, PAGE_SIZE);

        err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(vif),
-                                    rx_ring_ref, &addr);
+                                    &rx_ring_ref, 1, &addr);
        if (err)
                goto err;

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 698b905..521a595 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1496,13 +1496,12 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
        SHARED_RING_INIT(txs);
        FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);

-       err = xenbus_grant_ring(dev, virt_to_mfn(txs));
+       err = xenbus_grant_ring(dev, txs, 1, &info->tx_ring_ref);
        if (err < 0) {
                free_page((unsigned long)txs);
                goto fail;
        }

-       info->tx_ring_ref = err;
        rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
        if (!rxs) {
                err = -ENOMEM;
@@ -1512,12 +1511,11 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
        SHARED_RING_INIT(rxs);
        FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);

-       err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
+       err = xenbus_grant_ring(dev, rxs, 1, &info->rx_ring_ref);
        if (err < 0) {
                free_page((unsigned long)rxs);
                goto fail;
        }
-       info->rx_ring_ref = err;

        err = xenbus_alloc_evtchn(dev, &info->evtchn);
        if (err)
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 1620088..95109d8 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -768,12 +768,10 @@ static int pcifront_publish_info(struct pcifront_device *pdev)
        int err = 0;
        struct xenbus_transaction trans;

-       err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
+       err = xenbus_grant_ring(pdev->xdev, pdev->sh_info, 1, &pdev->gnt_ref);
        if (err < 0)
                goto out;

-       pdev->gnt_ref = err;
-
        err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
        if (err)
                goto out;
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
index 64b11f9..e0834cd 100644
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -108,7 +108,7 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
                "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
                gnt_ref, remote_evtchn);

-       err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
+       err = xenbus_map_ring_valloc(pdev->xdev, &gnt_ref, 1, &vaddr);
        if (err < 0) {
                xenbus_dev_fatal(pdev->xdev, err,
                                "Error mapping other domain page in ours.");
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 566d2ad..3a14524 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -53,14 +53,16 @@ struct xenbus_map_node {
                struct vm_struct *area; /* PV */
                struct page *page;     /* HVM */
        };
-       grant_handle_t handle;
+       grant_handle_t handle[XENBUS_MAX_RING_PAGES];
+       unsigned int   nr_handles;
 };

 static DEFINE_SPINLOCK(xenbus_valloc_lock);
 static LIST_HEAD(xenbus_valloc_pages);

 struct xenbus_ring_ops {
-       int (*map)(struct xenbus_device *dev, int gnt, void **vaddr);
+       int (*map)(struct xenbus_device *dev, int gnt[], int nr_gnts,
+                  void **vaddr);
        int (*unmap)(struct xenbus_device *dev, void *vaddr);
 };

@@ -356,17 +358,38 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
 /**
  * xenbus_grant_ring
  * @dev: xenbus device
- * @ring_mfn: mfn of ring to grant
-
- * Grant access to the given @ring_mfn to the peer of the given device.  Return
- * 0 on success, or -errno on error.  On error, the device will switch to
- * XenbusStateClosing, and the error will be saved in the store.
+ * @vaddr: starting virtual address of the ring
+ * @nr_pages: number of page to be granted
+ * @grefs: grant reference array to be filled in
+ * Grant access to the given @vaddr to the peer of the given device.
+ * Then fill in @grefs with grant references.  Return 0 on success, or
+ * -errno on error.  On error, the device will switch to
+ * XenbusStateClosing, and the first error will be saved in the store.
  */
-int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
+int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr,
+                     int nr_pages, int grefs[])
 {
-       int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
-       if (err < 0)
-               xenbus_dev_fatal(dev, err, "granting access to ring page");
+       int i;
+       int err;
+
+       for (i = 0; i < nr_pages; i++) {
+               unsigned long addr = (unsigned long)vaddr +
+                       (PAGE_SIZE * i);
+               err = gnttab_grant_foreign_access(dev->otherend_id,
+                                                 virt_to_mfn(addr), 0);
+               if (err < 0) {
+                       xenbus_dev_fatal(dev, err,
+                                        "granting access to ring page");
+                       goto fail;
+               }
+               grefs[i] = err;
+       }
+
+       return 0;
+
+fail:
+       for ( ; i >= 0; i--)
+               gnttab_end_foreign_access_ref(grefs[i], 0);
        return err;
 }
 EXPORT_SYMBOL_GPL(xenbus_grant_ring);
@@ -447,7 +470,8 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
 /**
  * xenbus_map_ring_valloc
  * @dev: xenbus device
- * @gnt_ref: grant reference
+ * @gnt_ref: grant reference array
+ * @nr_grefs: number of grant reference
  * @vaddr: pointer to address to be filled out by mapping
  *
  * Based on Rusty Russell's skeleton driver's map_page.
@@ -458,23 +482,28 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
  * or -ENOMEM on error. If an error is returned, device will switch to
  * XenbusStateClosing and the error message will be saved in XenStore.
  */
-int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
+int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref[],
+                          int nr_grefs, void **vaddr)
 {
-       return ring_ops->map(dev, gnt_ref, vaddr);
+       return ring_ops->map(dev, gnt_ref, nr_grefs, vaddr);
 }
 EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);

+static int __xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev,
+                                       struct xenbus_map_node *node);
+
 static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
-                                    int gnt_ref, void **vaddr)
+                                    int gnt_ref[], int nr_grefs, void **vaddr)
 {
-       struct gnttab_map_grant_ref op = {
-               .flags = GNTMAP_host_map | GNTMAP_contains_pte,
-               .ref   = gnt_ref,
-               .dom   = dev->otherend_id,
-       };
+       struct gnttab_map_grant_ref op[XENBUS_MAX_RING_PAGES];
        struct xenbus_map_node *node;
        struct vm_struct *area;
-       pte_t *pte;
+       pte_t *pte[XENBUS_MAX_RING_PAGES];
+       int i;
+       int err = 0;
+
+       if (nr_grefs > XENBUS_MAX_RING_PAGES)
+               return -EINVAL;

        *vaddr = NULL;

@@ -482,28 +511,44 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
        if (!node)
                return -ENOMEM;

-       area = alloc_vm_area(PAGE_SIZE, &pte);
+       area = alloc_vm_area(PAGE_SIZE * nr_grefs, pte);
        if (!area) {
                kfree(node);
                return -ENOMEM;
        }

-       op.host_addr = arbitrary_virt_to_machine(pte).maddr;
+       for (i = 0; i < nr_grefs; i++) {
+               op[i].flags = GNTMAP_host_map | GNTMAP_contains_pte,
+               op[i].ref   = gnt_ref[i],
+               op[i].dom   = dev->otherend_id,
+               op[i].host_addr = arbitrary_virt_to_machine(pte[i]).maddr;
+       };

        if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
                BUG();

-       if (op.status != GNTST_okay) {
-               free_vm_area(area);
-               kfree(node);
-               xenbus_dev_fatal(dev, op.status,
-                                "mapping in shared page %d from domain %d",
-                                gnt_ref, dev->otherend_id);
-               return op.status;
+       node->nr_handles = nr_grefs;
+       node->area = area;
+
+       for (i = 0; i < nr_grefs; i++) {
+               if (op[i].status != GNTST_okay) {
+                       err = op[i].status;
+                       node->handle[i] = INVALID_GRANT_HANDLE;
+                       continue;
+               }
+               node->handle[i] = op[i].handle;
        }

-       node->handle = op.handle;
-       node->area = area;
+       if (err != 0) {
+               for (i = 0; i < nr_grefs; i++)
+                       xenbus_dev_fatal(dev, op[i].status,
+                               "mapping in shared page %d from domain %d",
+                               gnt_ref[i], dev->otherend_id);
+
+                __xenbus_unmap_ring_vfree_pv(dev, node);
+
+               return err;
+       }

        spin_lock(&xenbus_valloc_lock);
        list_add(&node->next, &xenbus_valloc_pages);
@@ -514,25 +559,29 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
 }

 static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
-                                     int gnt_ref, void **vaddr)
+                                     int gnt_ref[], int nr_grefs, void **vaddr)
 {
        struct xenbus_map_node *node;
        int err;
        void *addr;

+       if (nr_grefs > XENBUS_MAX_RING_PAGES)
+               return -EINVAL;
+
        *vaddr = NULL;

        node = kzalloc(sizeof(*node), GFP_KERNEL);
        if (!node)
                return -ENOMEM;

-       err = alloc_xenballooned_pages(1, &node->page, false /* lowmem */);
+       err = alloc_xenballooned_pages(nr_grefs, &node->page,
+                                      false /* lowmem */);
        if (err)
                goto out_err;

        addr = pfn_to_kaddr(page_to_pfn(node->page));

-       err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr);
+       err = xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handle, addr);
        if (err)
                goto out_err;

@@ -544,7 +593,7 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
        return 0;

  out_err:
-       free_xenballooned_pages(1, &node->page);
+       free_xenballooned_pages(nr_grefs, &node->page);
        kfree(node);
        return err;
 }
@@ -553,36 +602,51 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
 /**
  * xenbus_map_ring
  * @dev: xenbus device
- * @gnt_ref: grant reference
- * @handle: pointer to grant handle to be filled
+ * @gnt_ref: grant reference array
+ * @nr_grefs: number of grant references
+ * @handle: pointer to grant handle array to be filled, mind the size
  * @vaddr: address to be mapped to
  *
- * Map a page of memory into this domain from another domain's grant table.
+ * Map pages of memory into this domain from another domain's grant table.
  * xenbus_map_ring does not allocate the virtual address space (you must do
- * this yourself!). It only maps in the page to the specified address.
+ * this yourself!). It only maps in the pages to the specified address.
  * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
  * or -ENOMEM on error. If an error is returned, device will switch to
- * XenbusStateClosing and the error message will be saved in XenStore.
+ * XenbusStateClosing and the last error message will be saved in XenStore.
  */
-int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
-                   grant_handle_t *handle, void *vaddr)
+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref[], int nr_grefs,
+                   grant_handle_t handle[], void *vaddr)
 {
-       struct gnttab_map_grant_ref op;
-
-       gnttab_set_map_op(&op, (phys_addr_t)vaddr, GNTMAP_host_map, gnt_ref,
-                         dev->otherend_id);
+       struct gnttab_map_grant_ref op[XENBUS_MAX_RING_PAGES];
+       int i;
+       int err = GNTST_okay;   /* 0 */
+
+       for (i = 0; i < nr_grefs; i++) {
+               unsigned long addr = (unsigned long)vaddr +
+                       (PAGE_SIZE * i);
+               gnttab_set_map_op(&op[i], (phys_addr_t)addr,
+                                 GNTMAP_host_map, gnt_ref[i],
+                                 dev->otherend_id);
+       }

-       if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+       if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, nr_grefs))
                BUG();

-       if (op.status != GNTST_okay) {
-               xenbus_dev_fatal(dev, op.status,
-                                "mapping in shared page %d from domain %d",
-                                gnt_ref, dev->otherend_id);
-       } else
-               *handle = op.handle;
+       for (i = 0; i < nr_grefs; i++) {
+               if (op[i].status != GNTST_okay) {
+                       err = op[i].status;
+                       xenbus_dev_fatal(dev, err,
+                               "mapping in shared page %d from domain %d",
+                               gnt_ref[i], dev->otherend_id);
+                       handle[i] = INVALID_GRANT_HANDLE;
+               } else
+                       handle[i] = op[i].handle;
+       }

-       return op.status;
+       if (err != GNTST_okay)
+               xenbus_unmap_ring(dev, handle, nr_grefs, vaddr);
+
+       return err;
 }
 EXPORT_SYMBOL_GPL(xenbus_map_ring);

@@ -605,13 +669,53 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
 }
 EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);

+static int __xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev,
+                                       struct xenbus_map_node *node)
+{
+       struct gnttab_unmap_grant_ref op[XENBUS_MAX_RING_PAGES];
+       unsigned int level;
+       int i, j;
+       int err = GNTST_okay;
+
+       j = 0;
+       for (i = 0; i < node->nr_handles; i++) {
+               unsigned long vaddr = (unsigned long)node->area->addr +
+                       (PAGE_SIZE * i);
+               if (node->handle[i] != INVALID_GRANT_HANDLE) {
+                       memset(&op[j], 0, sizeof(op[0]));
+                       op[j].host_addr = arbitrary_virt_to_machine(
+                                       lookup_address(vaddr, &level)).maddr;
+                       op[j].handle = node->handle[i];
+                       j++;
+                       node->handle[i] = INVALID_GRANT_HANDLE;
+               }
+       }
+
+       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, op, j))
+               BUG();
+
+       node->nr_handles = 0;
+
+       for (i = 0; i < j; i++) {
+               if (op[i].status != GNTST_okay) {
+                       err = op[i].status;
+                       xenbus_dev_error(dev, err,
+                               "unmapping page %d at handle %d error %d",
+                               i, op[i].handle, err);
+               }
+       }
+
+       if (err == GNTST_okay)
+               free_vm_area(node->area);
+
+       kfree(node);
+
+       return err;
+}
+
 static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)
 {
        struct xenbus_map_node *node;
-       struct gnttab_unmap_grant_ref op = {
-               .host_addr = (unsigned long)vaddr,
-       };
-       unsigned int level;

        spin_lock(&xenbus_valloc_lock);
        list_for_each_entry(node, &xenbus_valloc_pages, next) {
@@ -626,33 +730,18 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)

        if (!node) {
                xenbus_dev_error(dev, -ENOENT,
-                                "can't find mapped virtual address %p", vaddr);
+                               "can't find mapped virtual address %p", vaddr);
                return GNTST_bad_virt_addr;
        }

-       op.handle = node->handle;
-       op.host_addr = arbitrary_virt_to_machine(
-               lookup_address((unsigned long)vaddr, &level)).maddr;
-
-       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
-               BUG();
-
-       if (op.status == GNTST_okay)
-               free_vm_area(node->area);
-       else
-               xenbus_dev_error(dev, op.status,
-                                "unmapping page at handle %d error %d",
-                                node->handle, op.status);
-
-       kfree(node);
-       return op.status;
+       return __xenbus_unmap_ring_vfree_pv(dev, node);
 }

 static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
 {
        int rv;
        struct xenbus_map_node *node;
-       void *addr;
+       void *addr = NULL;

        spin_lock(&xenbus_valloc_lock);
        list_for_each_entry(node, &xenbus_valloc_pages, next) {
@@ -668,14 +757,14 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)

        if (!node) {
                xenbus_dev_error(dev, -ENOENT,
-                                "can't find mapped virtual address %p", vaddr);
+                               "can't find mapped virtual address %p", vaddr);
                return GNTST_bad_virt_addr;
        }

-       rv = xenbus_unmap_ring(dev, node->handle, addr);
+       rv = xenbus_unmap_ring(dev, node->handle, node->nr_handles, addr);

        if (!rv)
-               free_xenballooned_pages(1, &node->page);
+               free_xenballooned_pages(node->nr_handles, &node->page);
        else
                WARN(1, "Leaking %p\n", vaddr);

@@ -687,6 +776,7 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
  * xenbus_unmap_ring
  * @dev: xenbus device
  * @handle: grant handle
+ * @nr_handles: number of grant handle
  * @vaddr: addr to unmap
  *
  * Unmap a page of memory in this domain that was imported from another domain.
@@ -694,21 +784,37 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
  * (see xen/include/interface/grant_table.h).
  */
 int xenbus_unmap_ring(struct xenbus_device *dev,
-                     grant_handle_t handle, void *vaddr)
+                       grant_handle_t handle[], int nr_handles,
+                       void *vaddr)
 {
-       struct gnttab_unmap_grant_ref op;
-
-       gnttab_set_unmap_op(&op, (phys_addr_t)vaddr, GNTMAP_host_map, handle);
+       struct gnttab_unmap_grant_ref op[XENBUS_MAX_RING_PAGES];
+       int i, j;
+       int err = GNTST_okay;
+
+       j = 0;
+       for (i = 0; i < nr_handles; i++) {
+               unsigned long addr = (unsigned long)vaddr +
+                       (PAGE_SIZE * i);
+               if (handle[i] != INVALID_GRANT_HANDLE) {
+                       gnttab_set_unmap_op(&op[j++], (phys_addr_t)addr,
+                                           GNTMAP_host_map, handle[i]);
+                       handle[i] = INVALID_GRANT_HANDLE;
+               }
+       }

-       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, op, j))
                BUG();

-       if (op.status != GNTST_okay)
-               xenbus_dev_error(dev, op.status,
-                                "unmapping page at handle %d error %d",
-                                handle, op.status);
+       for (i = 0; i < j; i++) {
+               if (op[i].status != GNTST_okay) {
+                       err = op[i].status;
+                       xenbus_dev_error(dev, err,
+                               "unmapping page at handle %d error %d",
+                               handle[i], err);
+               }
+       }

-       return op.status;
+       return err;
 }
 EXPORT_SYMBOL_GPL(xenbus_unmap_ring);

diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 3864967..62b92d2 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -718,6 +718,7 @@ static int __init xenstored_local_init(void)
        return err;
 }

+extern void xenbus_ring_ops_init(void);
 static int __init xenbus_init(void)
 {
        int err = 0;
@@ -767,6 +768,8 @@ static int __init xenbus_init(void)
        proc_mkdir("xen", NULL);
 #endif

+       xenbus_ring_ops_init();
+
 out_error:
        return err;
 }
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index e8c599b..cdbd948 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -195,15 +195,23 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
                         const char *pathfmt, ...);

 int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
-int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
-int xenbus_map_ring_valloc(struct xenbus_device *dev,
-                          int gnt_ref, void **vaddr);
-int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
-                          grant_handle_t *handle, void *vaddr);
+
+#define        XENBUS_MAX_RING_ORDER   2
+#define        XENBUS_MAX_RING_PAGES   (1 << XENBUS_MAX_RING_ORDER)
+
+#define INVALID_GRANT_HANDLE           (~0U)
+
+int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr,
+                     int nr_pages, int grefs[]);
+int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref[],
+                          int nr_grefs, void **vaddr);
+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref[], int nr_grefs,
+                   grant_handle_t handle[], void *vaddr);

 int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
 int xenbus_unmap_ring(struct xenbus_device *dev,
-                     grant_handle_t handle, void *vaddr);
+                     grant_handle_t handle[], int nr_handles,
+                     void *vaddr);

 int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
 int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);


* Re: [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-05 21:49 ` [PATCH 0001/001] xen: multi page ring support for block devices Santosh Jodh
@ 2012-03-06  2:42   ` Rusty Russell
  2012-03-06  6:21     ` Santosh Jodh
  2012-03-06  8:34   ` Jan Beulich
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 15+ messages in thread
From: Rusty Russell @ 2012-03-06  2:42 UTC (permalink / raw)
  To: Santosh Jodh, konrad.wilk, jeremy, Ian Campbell, jbarnes,
	jbeulich, joe.jin, lersek, weiyi.huang, dgdegra, David Vrabel,
	paul.gortmaker, akpm, waldi, virtualization, netdev, linux-pci,
  Cc: Santosh Jodh, Paul Durrant

On Mon, 5 Mar 2012 13:49:07 -0800, Santosh Jodh <Santosh.Jodh@citrix.com> wrote:
> +/* Order of maximum shared ring size advertised to the front end. */
> +int xen_blkif_max_ring_order = XENBUS_MAX_RING_ORDER;
> +
> +#define BLK_RING_SIZE(_order) __CONST_RING_SIZE(blkif, PAGE_SIZE << (_order))
> +
> +static int set_max_ring_order(const char *buf, struct kernel_param *kp)
> +{
> +       int err;
> +       unsigned long order;
> +
> +       err = kstrtol(buf, 0, &order);
> +       if (err ||
> +           order < 0 ||
> +           order > XENBUS_MAX_RING_ORDER)
> +               return -EINVAL;

Hmm, order can't be < 0, since it's unsigned.  So did you mean
kstrtoull?

And I think returning err is cleaner (it's -EINVAL for malformed
strings, -ERANGE for ones too big).
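
For concreteness, a sketch combining both points (keep the type signed so
the range check works, and propagate kstrtol's own error):

        long order;
        int err;

        err = kstrtol(buf, 0, &order);
        if (err)
                return err;
        if (order < 0 || order > XENBUS_MAX_RING_ORDER)
                return -ERANGE;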

> +       if (xen_blkif_reqs < BLK_RING_SIZE(order))
> +               printk(KERN_WARNING "WARNING: "
> +                      "I/O request space (%d reqs) < ring order %ld, "
> +                      "consider increasing %s.reqs to >= %ld.",
> +                      xen_blkif_reqs, order, KBUILD_MODNAME,
> +                      roundup_pow_of_two(BLK_RING_SIZE(order)));

This message doesn't mention the module name or parameter name
anywhere.  Think of the poor sysadmins!

Thanks,
Rusty.
-- 
  How could I marry someone with more hair than me?  http://baldalex.org


* RE: [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-06  2:42   ` Rusty Russell
@ 2012-03-06  6:21     ` Santosh Jodh
  0 siblings, 0 replies; 15+ messages in thread
From: Santosh Jodh @ 2012-03-06  6:21 UTC (permalink / raw)
  To: Rusty Russell, konrad.wilk, jeremy, Ian Campbell, jbarnes,
	jbeulich, joe.jin, lersek, weiyi.huang, dgdegra, David Vrabel,
	paul.gortmaker, akpm, waldi, virtualization, netdev, linux-pci,
	linux-kernel@vger.kernel.org
  Cc: Paul Durrant

Great feedback. I removed the unsigned qualifier for the first issue, changed the error code, and added the module parameter name to the printk.

Please see latest patch:
---

diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 0088bf6..cc238e7 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -60,6 +60,40 @@ static int xen_blkif_reqs = 64;
 module_param_named(reqs, xen_blkif_reqs, int, 0);
 MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");

+/* Order of maximum shared ring size advertised to the front end. */
+int xen_blkif_max_ring_order = XENBUS_MAX_RING_ORDER;
+
+#define BLK_RING_SIZE(_order) __CONST_RING_SIZE(blkif, PAGE_SIZE << (_order))
+
+static int set_max_ring_order(const char *buf, struct kernel_param *kp)
+{
+       int err;
+       long order;
+
+       err = kstrtol(buf, 0, &order);
+       if (err ||
+           order < 0 ||
+           order > XENBUS_MAX_RING_ORDER)
+               return -ERANGE;
+
+       if (xen_blkif_reqs < BLK_RING_SIZE(order))
+               printk(KERN_WARNING "WARNING: "
+                      "I/O request space (%d reqs) < ring order %ld "
+                       "set by module parameter %s.max_ring_order, "
+                      "consider increasing %s.reqs to >= %ld.",
+                      xen_blkif_reqs, order, KBUILD_MODNAME, KBUILD_MODNAME,
+                      roundup_pow_of_two(BLK_RING_SIZE(order)));
+
+       xen_blkif_max_ring_order = order;
+
+       return 0;
+}
+
+module_param_call(max_ring_order,
+                 set_max_ring_order, param_get_int,
+                 &xen_blkif_max_ring_order, 0644);
+MODULE_PARM_DESC(max_ring_order, "log2 of maximum ring size, in pages.");
+
 /* Run-time switchable: /sys/module/blkback/parameters/ */
 static unsigned int log_stats;
 module_param(log_stats, int, 0644);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index d0ee7ed..5f33a1a 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -126,6 +126,8 @@ struct blkif_x86_64_response {
        int16_t         status;          /* BLKIF_RSP_???       */
 };

+extern int xen_blkif_max_ring_order;
+
 DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
                  struct blkif_common_response);
 DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 24a2fb5..7a9d71d 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -122,8 +122,8 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
        return blkif;
 }

-static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
-                        unsigned int evtchn)
+static int xen_blkif_map(struct xen_blkif *blkif, int ring_ref[],
+                        unsigned int ring_order, unsigned int evtchn)
 {
        int err;

@@ -131,7 +131,8 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
        if (blkif->irq)
                return 0;

-       err = xenbus_map_ring_valloc(blkif->be->dev, shared_page, &blkif->blk_ring);
+       err = xenbus_map_ring_valloc(blkif->be->dev, ring_ref, 1 << ring_order,
+                                    &blkif->blk_ring);
        if (err < 0)
                return err;

@@ -140,21 +141,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
        {
                struct blkif_sring *sring;
                sring = (struct blkif_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
+               BACK_RING_INIT(&blkif->blk_rings.native, sring,
+                              PAGE_SIZE << ring_order);
                break;
        }
        case BLKIF_PROTOCOL_X86_32:
        {
                struct blkif_x86_32_sring *sring_x86_32;
                sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
+               BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32,
+                              PAGE_SIZE << ring_order);
                break;
        }
        case BLKIF_PROTOCOL_X86_64:
        {
                struct blkif_x86_64_sring *sring_x86_64;
                sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
-               BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
+               BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64,
+                              PAGE_SIZE << ring_order);
                break;
        }
        default:
@@ -497,6 +501,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
        if (err)
                goto fail;

+       err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order",
+                           "%u", xen_blkif_max_ring_order);
+       if (err)
+               goto fail;
+
        err = xenbus_switch_state(dev, XenbusStateInitWait);
        if (err)
                goto fail;
@@ -744,22 +753,80 @@ again:
 static int connect_ring(struct backend_info *be)
 {
        struct xenbus_device *dev = be->dev;
-       unsigned long ring_ref;
+       int ring_ref[XENBUS_MAX_RING_PAGES];
+       unsigned int ring_order;
        unsigned int evtchn;
        char protocol[64] = "";
        int err;

        DPRINTK("%s", dev->otherend);

-       err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
-                           &ring_ref, "event-channel", "%u", &evtchn, NULL);
-       if (err) {
-               xenbus_dev_fatal(dev, err,
-                                "reading %s/ring-ref and event-channel",
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
+                          &evtchn);
+       if (err != 1) {
+               err = -EINVAL;
+
+               xenbus_dev_fatal(dev, err, "reading %s/event-channel",
                                 dev->otherend);
                return err;
        }

+       printk(KERN_INFO "blkback: event-channel %u\n", evtchn);
+
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
+                          &ring_order);
+       if (err != 1) {
+               DPRINTK("%s: using single page handshake", dev->otherend);
+
+               ring_order = 0;
+
+               err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
+                                  "%d", &ring_ref[0]);
+               if (err != 1) {
+                       err = -EINVAL;
+
+                       xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
+                                        dev->otherend);
+                       return err;
+               }
+
+               printk(KERN_INFO "blkback: ring-ref %d\n", ring_ref[0]);
+       } else {
+               unsigned int i;
+
+               if (ring_order > xen_blkif_max_ring_order) {
+                       err = -EINVAL;
+
+                       xenbus_dev_fatal(dev, err,
+                                        "%s/ring-page-order too big",
+                                        dev->otherend);
+                       return err;
+               }
+
+               for (i = 0; i < (1u << ring_order); i++) {
+                       char ring_ref_name[10];
+
+                       snprintf(ring_ref_name, sizeof(ring_ref_name),
+                                "ring-ref%u", i);
+
+                       err = xenbus_scanf(XBT_NIL, dev->otherend,
+                                          ring_ref_name, "%d",
+                                          &ring_ref[i]);
+                       if (err != 1) {
+                               err = -EINVAL;
+
+                               xenbus_dev_fatal(dev, err,
+                                                "reading %s/%s",
+                                                dev->otherend,
+                                                ring_ref_name);
+                               return err;
+                       }
+
+                       printk(KERN_INFO "blkback: ring-ref%u %d\n", i,
+                              ring_ref[i]);
+               }
+       }
+
        be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
        err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
                            "%63s", protocol, NULL);
@@ -775,14 +842,11 @@ static int connect_ring(struct backend_info *be)
                xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
                return -1;
        }
-       pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n",
-               ring_ref, evtchn, be->blkif->blk_protocol, protocol);

        /* Map the shared frame, irq etc. */
-       err = xen_blkif_map(be->blkif, ring_ref, evtchn);
+       err = xen_blkif_map(be->blkif, ring_ref, ring_order, evtchn);
        if (err) {
-               xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
-                                ring_ref, evtchn);
+               xenbus_dev_fatal(dev, err, "mapping ring-refs and evtchn");
                return err;
        }

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2f22874..485813a 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -57,6 +57,10 @@

 #include <asm/xen/hypervisor.h>

+static int xen_blkif_ring_order;
+module_param_named(ring_order, xen_blkif_ring_order, int, 0);
+MODULE_PARM_DESC(ring_order, "log2 of requested ring size, in pages.");
+
 enum blkif_state {
        BLKIF_STATE_DISCONNECTED,
        BLKIF_STATE_CONNECTED,
@@ -72,7 +76,8 @@ struct blk_shadow {
 static DEFINE_MUTEX(blkfront_mutex);
 static const struct block_device_operations xlvbd_block_fops;

-#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+#define BLK_RING_SIZE(_order)  __CONST_RING_SIZE(blkif, PAGE_SIZE << (_order))
+#define BLK_MAX_RING_SIZE      BLK_RING_SIZE(XENBUS_MAX_RING_ORDER)

 /*
  * We have one of these per vbd, whether ide, scsi or 'other'.  They
@@ -87,14 +92,15 @@ struct blkfront_info
        int vdevice;
        blkif_vdev_t handle;
        enum blkif_state connected;
-       int ring_ref;
+       int ring_ref[XENBUS_MAX_RING_PAGES];
+       int ring_order;
        struct blkif_front_ring ring;
        struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
        unsigned int evtchn, irq;
        struct request_queue *rq;
        struct work_struct work;
        struct gnttab_free_callback callback;
-       struct blk_shadow shadow[BLK_RING_SIZE];
+       struct blk_shadow shadow[BLK_MAX_RING_SIZE];
        unsigned long shadow_free;
        unsigned int feature_flush;
        unsigned int flush_op;
@@ -111,9 +117,7 @@ static unsigned int nr_minors;
 static unsigned long *minors;
 static DEFINE_SPINLOCK(minor_lock);

-#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
-       (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
-#define GRANT_INVALID_REF      0
+#define GRANT_INVALID_REF      0

 #define PARTS_PER_DISK         16
 #define PARTS_PER_EXT_DISK      256
@@ -135,7 +139,7 @@ static DEFINE_SPINLOCK(minor_lock);
 static int get_id_from_freelist(struct blkfront_info *info)
 {
        unsigned long free = info->shadow_free;
-       BUG_ON(free >= BLK_RING_SIZE);
+       BUG_ON(free >= BLK_MAX_RING_SIZE);
        info->shadow_free = info->shadow[free].req.u.rw.id;
        info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
        return free;
@@ -683,6 +687,8 @@ static void blkif_restart_queue(struct work_struct *work)

 static void blkif_free(struct blkfront_info *info, int suspend)
 {
+       int i;
+
        /* Prevent new requests being issued until we fix things up. */
        spin_lock_irq(&blkif_io_lock);
        info->connected = suspend ?
@@ -698,16 +704,19 @@ static void blkif_free(struct blkfront_info *info, int suspend)
        flush_work_sync(&info->work);

        /* Free resources associated with old device channel. */
-       if (info->ring_ref != GRANT_INVALID_REF) {
-               gnttab_end_foreign_access(info->ring_ref, 0,
-                                         (unsigned long)info->ring.sring);
-               info->ring_ref = GRANT_INVALID_REF;
-               info->ring.sring = NULL;
+       for (i = 0; i < (1 << info->ring_order); i++) {
+               if (info->ring_ref[i] != GRANT_INVALID_REF) {
+                       gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
+                       info->ring_ref[i] = GRANT_INVALID_REF;
+               }
        }
+
+       free_pages((unsigned long)info->ring.sring, info->ring_order);
+       info->ring.sring = NULL;
+
        if (info->irq)
                unbind_from_irqhandler(info->irq, info);
        info->evtchn = info->irq = 0;
-
 }

 static void blkif_completion(struct blk_shadow *s)
@@ -828,25 +837,24 @@ static int setup_blkring(struct xenbus_device *dev,
        struct blkif_sring *sring;
        int err;

-       info->ring_ref = GRANT_INVALID_REF;
-
-       sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+       sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
+                                                      info->ring_order);
        if (!sring) {
                xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
                return -ENOMEM;
        }
        SHARED_RING_INIT(sring);
-       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
+       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE << info->ring_order);

        sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);

-       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
+       err = xenbus_grant_ring(dev, info->ring.sring, 1 << info->ring_order,
+                               info->ring_ref);
        if (err < 0) {
-               free_page((unsigned long)sring);
+               free_pages((unsigned long)sring, info->ring_order);
                info->ring.sring = NULL;
                goto fail;
        }
-       info->ring_ref = err;

        err = xenbus_alloc_evtchn(dev, &info->evtchn);
        if (err)
@@ -875,8 +883,27 @@ static int talk_to_blkback(struct xenbus_device *dev,
 {
        const char *message = NULL;
        struct xenbus_transaction xbt;
+       unsigned int ring_order;
+       int legacy_backend;
+       int i;
        int err;

+       for (i = 0; i < XENBUS_MAX_RING_PAGES; i++)
+               info->ring_ref[i] = GRANT_INVALID_REF;
+
+       err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-page-order", "%u",
+                          &ring_order);
+
+       legacy_backend = !(err == 1);
+
+       if (legacy_backend) {
+               info->ring_order = 0;
+       } else {
+               info->ring_order = (ring_order <= xen_blkif_ring_order) ?
+                                  ring_order :
+                                  xen_blkif_ring_order;
+       }
+
        /* Create shared ring, alloc event channel. */
        err = setup_blkring(dev, info);
        if (err)
@@ -889,12 +916,35 @@ again:
                goto destroy_blkring;
        }

-       err = xenbus_printf(xbt, dev->nodename,
-                           "ring-ref", "%u", info->ring_ref);
-       if (err) {
-               message = "writing ring-ref";
-               goto abort_transaction;
+       if (legacy_backend) {
+               err = xenbus_printf(xbt, dev->nodename,
+                                   "ring-ref", "%d", info->ring_ref[0]);
+               if (err) {
+                       message = "writing ring-ref";
+                       goto abort_transaction;
+               }
+       } else {
+               for (i = 0; i < (1 << info->ring_order); i++) {
+                       char key[sizeof("ring-ref") + 2];
+
+                       sprintf(key, "ring-ref%d", i);
+
+                       err = xenbus_printf(xbt, dev->nodename,
+                                           key, "%d", info->ring_ref[i]);
+                       if (err) {
+                               message = "writing ring-ref";
+                               goto abort_transaction;
+                       }
+               }
+
+               err = xenbus_printf(xbt, dev->nodename,
+                                   "ring-page-order", "%u", info->ring_order);
+               if (err) {
+                       message = "writing ring-page-order";
+                       goto abort_transaction;
+               }
        }
+
        err = xenbus_printf(xbt, dev->nodename,
                            "event-channel", "%u", info->evtchn);
        if (err) {
@@ -996,21 +1046,14 @@ static int blkfront_probe(struct xenbus_device *dev,
        info->connected = BLKIF_STATE_DISCONNECTED;
        INIT_WORK(&info->work, blkif_restart_queue);

-       for (i = 0; i < BLK_RING_SIZE; i++)
+       for (i = 0; i < BLK_MAX_RING_SIZE; i++)
                info->shadow[i].req.u.rw.id = i+1;
-       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
+       info->shadow[BLK_MAX_RING_SIZE-1].req.u.rw.id = 0x0fffffff;

        /* Front end dir is a number, which is used as the id. */
        info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
        dev_set_drvdata(&dev->dev, info);

-       err = talk_to_blkback(dev, info);
-       if (err) {
-               kfree(info);
-               dev_set_drvdata(&dev->dev, NULL);
-               return err;
-       }
-
        return 0;
 }

@@ -1031,13 +1074,13 @@ static int blkif_recover(struct blkfront_info *info)

        /* Stage 2: Set up free list. */
        memset(&info->shadow, 0, sizeof(info->shadow));
-       for (i = 0; i < BLK_RING_SIZE; i++)
+       for (i = 0; i < BLK_MAX_RING_SIZE; i++)
                info->shadow[i].req.u.rw.id = i+1;
        info->shadow_free = info->ring.req_prod_pvt;
-       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
+       info->shadow[BLK_MAX_RING_SIZE-1].req.u.rw.id = 0x0fffffff;

        /* Stage 3: Find pending requests and requeue them. */
-       for (i = 0; i < BLK_RING_SIZE; i++) {
+       for (i = 0; i < BLK_RING_SIZE(info->ring_order); i++) {
                /* Not in use? */
                if (!copy[i].request)
                        continue;
@@ -1299,7 +1342,6 @@ static void blkback_changed(struct xenbus_device *dev,

        switch (backend_state) {
        case XenbusStateInitialising:
-       case XenbusStateInitWait:
        case XenbusStateInitialised:
        case XenbusStateReconfiguring:
        case XenbusStateReconfigured:
@@ -1307,6 +1349,10 @@ static void blkback_changed(struct xenbus_device *dev,
        case XenbusStateClosed:
                break;

+       case XenbusStateInitWait:
+               talk_to_blkback(dev, info);
+               break;
+
        case XenbusStateConnected:
                blkfront_connect(info);
                break;
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index 94b79c3..f93b59a 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -130,8 +130,8 @@ int xen_netbk_must_stop_queue(struct xenvif *vif);
 /* (Un)Map communication rings. */
 void xen_netbk_unmap_frontend_rings(struct xenvif *vif);
 int xen_netbk_map_frontend_rings(struct xenvif *vif,
-                                grant_ref_t tx_ring_ref,
-                                grant_ref_t rx_ring_ref);
+                                int tx_ring_ref,
+                                int rx_ring_ref);

 /* (De)Register a xenvif with the netback backend. */
 void xen_netbk_add_xenvif(struct xenvif *vif);
diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
index 59effac..0b014cf 100644
--- a/drivers/net/xen-netback/netback.c
+++ b/drivers/net/xen-netback/netback.c
@@ -1594,8 +1594,8 @@ void xen_netbk_unmap_frontend_rings(struct xenvif *vif)
 }

 int xen_netbk_map_frontend_rings(struct xenvif *vif,
-                                grant_ref_t tx_ring_ref,
-                                grant_ref_t rx_ring_ref)
+                                int tx_ring_ref,
+                                int rx_ring_ref)
 {
        void *addr;
        struct xen_netif_tx_sring *txs;
@@ -1604,7 +1604,7 @@ int xen_netbk_map_frontend_rings(struct xenvif *vif,
        int err = -ENOMEM;

        err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(vif),
-                                    tx_ring_ref, &addr);
+                                    &tx_ring_ref, 1, &addr);
        if (err)
                goto err;

@@ -1612,7 +1612,7 @@ int xen_netbk_map_frontend_rings(struct xenvif *vif,
        BACK_RING_INIT(&vif->tx, txs, PAGE_SIZE);

        err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(vif),
-                                    rx_ring_ref, &addr);
+                                    &rx_ring_ref, 1, &addr);
        if (err)
                goto err;

diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
index 698b905..521a595 100644
--- a/drivers/net/xen-netfront.c
+++ b/drivers/net/xen-netfront.c
@@ -1496,13 +1496,12 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
        SHARED_RING_INIT(txs);
        FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);

-       err = xenbus_grant_ring(dev, virt_to_mfn(txs));
+       err = xenbus_grant_ring(dev, txs, 1, &info->tx_ring_ref);
        if (err < 0) {
                free_page((unsigned long)txs);
                goto fail;
        }

-       info->tx_ring_ref = err;
        rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
        if (!rxs) {
                err = -ENOMEM;
@@ -1512,12 +1511,11 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
        SHARED_RING_INIT(rxs);
        FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);

-       err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
+       err = xenbus_grant_ring(dev, rxs, 1, &info->rx_ring_ref);
        if (err < 0) {
                free_page((unsigned long)rxs);
                goto fail;
        }
-       info->rx_ring_ref = err;

        err = xenbus_alloc_evtchn(dev, &info->evtchn);
        if (err)
diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
index 1620088..95109d8 100644
--- a/drivers/pci/xen-pcifront.c
+++ b/drivers/pci/xen-pcifront.c
@@ -768,12 +768,10 @@ static int pcifront_publish_info(struct pcifront_device *pdev)
        int err = 0;
        struct xenbus_transaction trans;

-       err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
+       err = xenbus_grant_ring(pdev->xdev, pdev->sh_info, 1, &pdev->gnt_ref);
        if (err < 0)
                goto out;

-       pdev->gnt_ref = err;
-
        err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
        if (err)
                goto out;
diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
index 64b11f9..e0834cd 100644
--- a/drivers/xen/xen-pciback/xenbus.c
+++ b/drivers/xen/xen-pciback/xenbus.c
@@ -108,7 +108,7 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
                "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
                gnt_ref, remote_evtchn);

-       err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
+       err = xenbus_map_ring_valloc(pdev->xdev, &gnt_ref, 1, &vaddr);
        if (err < 0) {
                xenbus_dev_fatal(pdev->xdev, err,
                                "Error mapping other domain page in ours.");
diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
index 566d2ad..3a14524 100644
--- a/drivers/xen/xenbus/xenbus_client.c
+++ b/drivers/xen/xenbus/xenbus_client.c
@@ -53,14 +53,16 @@ struct xenbus_map_node {
                struct vm_struct *area; /* PV */
                struct page *page;     /* HVM */
        };
-       grant_handle_t handle;
+       grant_handle_t handle[XENBUS_MAX_RING_PAGES];
+       unsigned int   nr_handles;
 };

 static DEFINE_SPINLOCK(xenbus_valloc_lock);
 static LIST_HEAD(xenbus_valloc_pages);

 struct xenbus_ring_ops {
-       int (*map)(struct xenbus_device *dev, int gnt, void **vaddr);
+       int (*map)(struct xenbus_device *dev, int gnt[], int nr_gnts,
+                  void **vaddr);
        int (*unmap)(struct xenbus_device *dev, void *vaddr);
 };

@@ -356,17 +358,38 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
 /**
  * xenbus_grant_ring
  * @dev: xenbus device
- * @ring_mfn: mfn of ring to grant
-
- * Grant access to the given @ring_mfn to the peer of the given device.  Return
- * 0 on success, or -errno on error.  On error, the device will switch to
- * XenbusStateClosing, and the error will be saved in the store.
+ * @vaddr: starting virtual address of the ring
+ * @nr_pages: number of page to be granted
+ * @grefs: grant reference array to be filled in
+ * Grant access to the given @vaddr to the peer of the given device.
+ * Then fill in @grefs with grant references.  Return 0 on success, or
+ * -errno on error.  On error, the device will switch to
+ * XenbusStateClosing, and the first error will be saved in the store.
  */
-int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
+int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr,
+                     int nr_pages, int grefs[])
 {
-       int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
-       if (err < 0)
-               xenbus_dev_fatal(dev, err, "granting access to ring page");
+       int i;
+       int err;
+
+       for (i = 0; i < nr_pages; i++) {
+               unsigned long addr = (unsigned long)vaddr +
+                       (PAGE_SIZE * i);
+               err = gnttab_grant_foreign_access(dev->otherend_id,
+                                                 virt_to_mfn(addr), 0);
+               if (err < 0) {
+                       xenbus_dev_fatal(dev, err,
+                                        "granting access to ring page");
+                       goto fail;
+               }
+               grefs[i] = err;
+       }
+
+       return 0;
+
+fail:
+       for (i--; i >= 0; i--)
+               gnttab_end_foreign_access_ref(grefs[i], 0);
        return err;
 }
 EXPORT_SYMBOL_GPL(xenbus_grant_ring);
@@ -447,7 +470,8 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
 /**
  * xenbus_map_ring_valloc
  * @dev: xenbus device
- * @gnt_ref: grant reference
+ * @gnt_ref: grant reference array
+ * @nr_grefs: number of grant reference
  * @vaddr: pointer to address to be filled out by mapping
  *
  * Based on Rusty Russell's skeleton driver's map_page.
@@ -458,23 +482,28 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
  * or -ENOMEM on error. If an error is returned, device will switch to
  * XenbusStateClosing and the error message will be saved in XenStore.
  */
-int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
+int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref[],
+                          int nr_grefs, void **vaddr)
 {
-       return ring_ops->map(dev, gnt_ref, vaddr);
+       return ring_ops->map(dev, gnt_ref, nr_grefs, vaddr);
 }
 EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);

+static int __xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev,
+                                       struct xenbus_map_node *node);
+
 static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
-                                    int gnt_ref, void **vaddr)
+                                    int gnt_ref[], int nr_grefs, void **vaddr)
 {
-       struct gnttab_map_grant_ref op = {
-               .flags = GNTMAP_host_map | GNTMAP_contains_pte,
-               .ref   = gnt_ref,
-               .dom   = dev->otherend_id,
-       };
+       struct gnttab_map_grant_ref op[XENBUS_MAX_RING_PAGES];
        struct xenbus_map_node *node;
        struct vm_struct *area;
-       pte_t *pte;
+       pte_t *pte[XENBUS_MAX_RING_PAGES];
+       int i;
+       int err = 0;
+
+       if (nr_grefs > XENBUS_MAX_RING_PAGES)
+               return -EINVAL;

        *vaddr = NULL;

@@ -482,28 +511,44 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
        if (!node)
                return -ENOMEM;

-       area = alloc_vm_area(PAGE_SIZE, &pte);
+       area = alloc_vm_area(PAGE_SIZE * nr_grefs, pte);
        if (!area) {
                kfree(node);
                return -ENOMEM;
        }

-       op.host_addr = arbitrary_virt_to_machine(pte).maddr;
+       for (i = 0; i < nr_grefs; i++) {
+               op[i].flags = GNTMAP_host_map | GNTMAP_contains_pte;
+               op[i].ref   = gnt_ref[i];
+               op[i].dom   = dev->otherend_id;
+               op[i].host_addr = arbitrary_virt_to_machine(pte[i]).maddr;
+       }

-       if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+       if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, nr_grefs))
                BUG();

-       if (op.status != GNTST_okay) {
-               free_vm_area(area);
-               kfree(node);
-               xenbus_dev_fatal(dev, op.status,
-                                "mapping in shared page %d from domain %d",
-                                gnt_ref, dev->otherend_id);
-               return op.status;
+       node->nr_handles = nr_grefs;
+       node->area = area;
+
+       for (i = 0; i < nr_grefs; i++) {
+               if (op[i].status != GNTST_okay) {
+                       err = op[i].status;
+                       node->handle[i] = INVALID_GRANT_HANDLE;
+                       continue;
+               }
+               node->handle[i] = op[i].handle;
        }

-       node->handle = op.handle;
-       node->area = area;
+       if (err != 0) {
+               for (i = 0; i < nr_grefs; i++)
+                       if (op[i].status != GNTST_okay)
+                               xenbus_dev_fatal(dev, op[i].status,
+                                       "mapping in shared page %d from domain %d",
+                                       gnt_ref[i], dev->otherend_id);
+
+               __xenbus_unmap_ring_vfree_pv(dev, node);
+
+               return err;
+       }

        spin_lock(&xenbus_valloc_lock);
        list_add(&node->next, &xenbus_valloc_pages);
@@ -514,25 +559,29 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
 }

 static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
-                                     int gnt_ref, void **vaddr)
+                                     int gnt_ref[], int nr_grefs, void **vaddr)
 {
        struct xenbus_map_node *node;
        int err;
        void *addr;

+       if (nr_grefs > XENBUS_MAX_RING_PAGES)
+               return -EINVAL;
+
        *vaddr = NULL;

        node = kzalloc(sizeof(*node), GFP_KERNEL);
        if (!node)
                return -ENOMEM;

-       err = alloc_xenballooned_pages(1, &node->page, false /* lowmem */);
+       err = alloc_xenballooned_pages(nr_grefs, &node->page,
+                                      false /* lowmem */);
        if (err)
                goto out_err;

        addr = pfn_to_kaddr(page_to_pfn(node->page));

-       err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr);
+       err = xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handle, addr);
        if (err)
                goto out_err;

@@ -544,7 +593,7 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
        return 0;

  out_err:
-       free_xenballooned_pages(1, &node->page);
+       free_xenballooned_pages(nr_grefs, &node->page);
        kfree(node);
        return err;
 }
@@ -553,36 +602,51 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
 /**
  * xenbus_map_ring
  * @dev: xenbus device
- * @gnt_ref: grant reference
- * @handle: pointer to grant handle to be filled
+ * @gnt_ref: grant reference array
+ * @nr_grefs: number of grant references
+ * @handle: pointer to grant handle array to be filled, mind the size
  * @vaddr: address to be mapped to
  *
- * Map a page of memory into this domain from another domain's grant table.
+ * Map pages of memory into this domain from another domain's grant table.
  * xenbus_map_ring does not allocate the virtual address space (you must do
- * this yourself!). It only maps in the page to the specified address.
+ * this yourself!). It only maps in the pages to the specified address.
  * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
  * or -ENOMEM on error. If an error is returned, device will switch to
- * XenbusStateClosing and the error message will be saved in XenStore.
+ * XenbusStateClosing and the last error message will be saved in XenStore.
  */
-int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
-                   grant_handle_t *handle, void *vaddr)
+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref[], int nr_grefs,
+                   grant_handle_t handle[], void *vaddr)
 {
-       struct gnttab_map_grant_ref op;
-
-       gnttab_set_map_op(&op, (phys_addr_t)vaddr, GNTMAP_host_map, gnt_ref,
-                         dev->otherend_id);
+       struct gnttab_map_grant_ref op[XENBUS_MAX_RING_PAGES];
+       int i;
+       int err = GNTST_okay;   /* 0 */
+
+       for (i = 0; i < nr_grefs; i++) {
+               unsigned long addr = (unsigned long)vaddr +
+                       (PAGE_SIZE * i);
+               gnttab_set_map_op(&op[i], (phys_addr_t)addr,
+                                 GNTMAP_host_map, gnt_ref[i],
+                                 dev->otherend_id);
+       }

-       if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
+       if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, nr_grefs))
                BUG();

-       if (op.status != GNTST_okay) {
-               xenbus_dev_fatal(dev, op.status,
-                                "mapping in shared page %d from domain %d",
-                                gnt_ref, dev->otherend_id);
-       } else
-               *handle = op.handle;
+       for (i = 0; i < nr_grefs; i++) {
+               if (op[i].status != GNTST_okay) {
+                       err = op[i].status;
+                       xenbus_dev_fatal(dev, err,
+                               "mapping in shared page %d from domain %d",
+                               gnt_ref[i], dev->otherend_id);
+                       handle[i] = INVALID_GRANT_HANDLE;
+               } else
+                       handle[i] = op[i].handle;
+       }

-       return op.status;
+       if (err != GNTST_okay)
+               xenbus_unmap_ring(dev, handle, nr_grefs, vaddr);
+
+       return err;
 }
 EXPORT_SYMBOL_GPL(xenbus_map_ring);

@@ -605,13 +669,53 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
 }
 EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);

+static int __xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev,
+                                       struct xenbus_map_node *node)
+{
+       struct gnttab_unmap_grant_ref op[XENBUS_MAX_RING_PAGES];
+       unsigned int level;
+       int i, j;
+       int err = GNTST_okay;
+
+       j = 0;
+       for (i = 0; i < node->nr_handles; i++) {
+               unsigned long vaddr = (unsigned long)node->area->addr +
+                       (PAGE_SIZE * i);
+               if (node->handle[i] != INVALID_GRANT_HANDLE) {
+                       memset(&op[j], 0, sizeof(op[0]));
+                       op[j].host_addr = arbitrary_virt_to_machine(
+                                       lookup_address(vaddr, &level)).maddr;
+                       op[j].handle = node->handle[i];
+                       j++;
+                       node->handle[i] = INVALID_GRANT_HANDLE;
+               }
+       }
+
+       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, op, j))
+               BUG();
+
+       node->nr_handles = 0;
+
+       for (i = 0; i < j; i++) {
+               if (op[i].status != GNTST_okay) {
+                       err = op[i].status;
+                       xenbus_dev_error(dev, err,
+                               "unmapping page %d at handle %d error %d",
+                               i, op[i].handle, err);
+               }
+       }
+
+       if (err == GNTST_okay)
+               free_vm_area(node->area);
+
+       kfree(node);
+
+       return err;
+}
+
 static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)
 {
        struct xenbus_map_node *node;
-       struct gnttab_unmap_grant_ref op = {
-               .host_addr = (unsigned long)vaddr,
-       };
-       unsigned int level;

        spin_lock(&xenbus_valloc_lock);
        list_for_each_entry(node, &xenbus_valloc_pages, next) {
@@ -626,33 +730,18 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)

        if (!node) {
                xenbus_dev_error(dev, -ENOENT,
-                                "can't find mapped virtual address %p", vaddr);
+                               "can't find mapped virtual address %p", vaddr);
                return GNTST_bad_virt_addr;
        }

-       op.handle = node->handle;
-       op.host_addr = arbitrary_virt_to_machine(
-               lookup_address((unsigned long)vaddr, &level)).maddr;
-
-       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
-               BUG();
-
-       if (op.status == GNTST_okay)
-               free_vm_area(node->area);
-       else
-               xenbus_dev_error(dev, op.status,
-                                "unmapping page at handle %d error %d",
-                                node->handle, op.status);
-
-       kfree(node);
-       return op.status;
+       return __xenbus_unmap_ring_vfree_pv(dev, node);
 }

 static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
 {
        int rv;
        struct xenbus_map_node *node;
-       void *addr;
+       void *addr = NULL;

        spin_lock(&xenbus_valloc_lock);
        list_for_each_entry(node, &xenbus_valloc_pages, next) {
@@ -668,14 +757,14 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)

        if (!node) {
                xenbus_dev_error(dev, -ENOENT,
-                                "can't find mapped virtual address %p", vaddr);
+                               "can't find mapped virtual address %p", vaddr);
                return GNTST_bad_virt_addr;
        }

-       rv = xenbus_unmap_ring(dev, node->handle, addr);
+       rv = xenbus_unmap_ring(dev, node->handle, node->nr_handles, addr);

        if (!rv)
-               free_xenballooned_pages(1, &node->page);
+               free_xenballooned_pages(node->nr_handles, &node->page);
        else
                WARN(1, "Leaking %p\n", vaddr);

@@ -687,6 +776,7 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
  * xenbus_unmap_ring
  * @dev: xenbus device
  * @handle: grant handle
+ * @nr_handles: number of grant handle
  * @vaddr: addr to unmap
  *
  * Unmap a page of memory in this domain that was imported from another domain.
@@ -694,21 +784,37 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
  * (see xen/include/interface/grant_table.h).
  */
 int xenbus_unmap_ring(struct xenbus_device *dev,
-                     grant_handle_t handle, void *vaddr)
+                       grant_handle_t handle[], int nr_handles,
+                       void *vaddr)
 {
-       struct gnttab_unmap_grant_ref op;
-
-       gnttab_set_unmap_op(&op, (phys_addr_t)vaddr, GNTMAP_host_map, handle);
+       struct gnttab_unmap_grant_ref op[XENBUS_MAX_RING_PAGES];
+       int i, j;
+       int err = GNTST_okay;
+
+       j = 0;
+       for (i = 0; i < nr_handles; i++) {
+               unsigned long addr = (unsigned long)vaddr +
+                       (PAGE_SIZE * i);
+               if (handle[i] != INVALID_GRANT_HANDLE) {
+                       gnttab_set_unmap_op(&op[j++], (phys_addr_t)addr,
+                                           GNTMAP_host_map, handle[i]);
+                       handle[i] = INVALID_GRANT_HANDLE;
+               }
+       }

-       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
+       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, op, j))
                BUG();

-       if (op.status != GNTST_okay)
-               xenbus_dev_error(dev, op.status,
-                                "unmapping page at handle %d error %d",
-                                handle, op.status);
+       for (i = 0; i < j; i++) {
+               if (op[i].status != GNTST_okay) {
+                       err = op[i].status;
+                       xenbus_dev_error(dev, err,
+                               "unmapping page at handle %d error %d",
+                               handle[i], err);
+               }
+       }

-       return op.status;
+       return err;
 }
 EXPORT_SYMBOL_GPL(xenbus_unmap_ring);

diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 3864967..62b92d2 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -718,6 +718,7 @@ static int __init xenstored_local_init(void)
        return err;
 }

+extern void xenbus_ring_ops_init(void);
 static int __init xenbus_init(void)
 {
        int err = 0;
@@ -767,6 +768,8 @@ static int __init xenbus_init(void)
        proc_mkdir("xen", NULL);
 #endif

+       xenbus_ring_ops_init();
+
 out_error:
        return err;
 }
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index e8c599b..cdbd948 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -195,15 +195,23 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
                         const char *pathfmt, ...);

 int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
-int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
-int xenbus_map_ring_valloc(struct xenbus_device *dev,
-                          int gnt_ref, void **vaddr);
-int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
-                          grant_handle_t *handle, void *vaddr);
+
+#define        XENBUS_MAX_RING_ORDER   2
+#define        XENBUS_MAX_RING_PAGES   (1 << XENBUS_MAX_RING_ORDER)
+
+#define INVALID_GRANT_HANDLE           (~0U)
+
+int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr,
+                     int nr_pages, int grefs[]);
+int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref[],
+                          int nr_grefs, void **vaddr);
+int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref[], int nr_grefs,
+                   grant_handle_t handle[], void *vaddr);

 int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
 int xenbus_unmap_ring(struct xenbus_device *dev,
-                     grant_handle_t handle, void *vaddr);
+                     grant_handle_t handle[], int nr_handles,
+                     void *vaddr);

 int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
 int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-05 21:49 ` [PATCH 0001/001] xen: multi page ring support for block devices Santosh Jodh
  2012-03-06  2:42   ` Rusty Russell
@ 2012-03-06  8:34   ` Jan Beulich
  2012-03-14 17:17     ` [Xen-devel] " Justin T. Gibbs
  2012-03-06 11:16   ` Wei Liu
  2012-03-06 17:20   ` [Xen-devel] " Konrad Rzeszutek Wilk
  3 siblings, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2012-03-06  8:34 UTC (permalink / raw)
  To: Santosh Jodh
  Cc: David Vrabel, Ian Campbell, Paul Durrant, waldi, weiyi.huang,
	jeremy, akpm, virtualization, xen-devel, joe.jin, konrad.wilk,
	lersek, rusty, dgdegra, linux-kernel, linux-pci, netdev,

>>> On 05.03.12 at 22:49, Santosh Jodh <Santosh.Jodh@citrix.com> wrote:

Could this be split up into 3 patches, for easier reviewing:
- one adjusting the xenbus interface to allow for multiple ring pages (and
  maybe even that one should be split into the backend and frontend
  related parts), syncing with the similar netback effort?
- one for the blkback changes
- one for the blkfront changes?

> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -122,8 +122,8 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
>         return blkif;
>  }
> 
> -static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
> -                        unsigned int evtchn)
> +static int xen_blkif_map(struct xen_blkif *blkif, int ring_ref[],

As you need to touch this anyway, can you please switch this to the
proper type (grant_ref_t) rather than using plain "int" (not just here)?

> +                        unsigned int ring_order, unsigned int evtchn)
>  {
>         int err;
> 
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -135,7 +139,7 @@ static DEFINE_SPINLOCK(minor_lock);
>  static int get_id_from_freelist(struct blkfront_info *info)
>  {
>         unsigned long free = info->shadow_free;
> -       BUG_ON(free >= BLK_RING_SIZE);
> +       BUG_ON(free >= BLK_MAX_RING_SIZE);

Wouldn't you better check against the actual limit here?
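
I.e. (sketch):

	BUG_ON(free >= BLK_RING_SIZE(info->ring_order));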

>         info->shadow_free = info->shadow[free].req.u.rw.id;
>         info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
>         return free;
> @@ -698,16 +704,19 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>         flush_work_sync(&info->work);
> 
>         /* Free resources associated with old device channel. */
> -       if (info->ring_ref != GRANT_INVALID_REF) {
> -               gnttab_end_foreign_access(info->ring_ref, 0,
> -                                         (unsigned long)info->ring.sring);
> -               info->ring_ref = GRANT_INVALID_REF;
> -               info->ring.sring = NULL;
> +       for (i = 0; i < (1 << info->ring_order); i++) {
> +               if (info->ring_ref[i] != GRANT_INVALID_REF) {
> +                       gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
> +                       info->ring_ref[i] = GRANT_INVALID_REF;
> +               }
>         }
> +
> +       free_pages((unsigned long)info->ring.sring, info->ring_order);

No. The freeing must continue to happen in gnttab_end_foreign_access()
(with the sole exception when a page was allocated but the grant
didn't get established), since it must be suppressed/delayed when the
grant is still in use (otherwise the kernel will die on the first re-use of
the page). I just happened to fix that problem at the end of last week
in the variant of the patch that we pulled into our tree.

Further, rather than doing a non-zero order allocation here, I'd
suggest allocating individual pages and vmap()-ing them.
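
Roughly (untested sketch, error unwinding omitted):

	struct page *pages[XENBUS_MAX_RING_PAGES];
	int i, nr = 1 << info->ring_order;

	for (i = 0; i < nr; i++)
		pages[i] = alloc_page(GFP_NOIO | __GFP_HIGH);
	info->ring.sring = vmap(pages, nr, VM_MAP, PAGE_KERNEL);

with vunmap()/__free_page() on teardown, so each page's grant can be
ended (and its freeing deferred) individually.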

> +       info->ring.sring = NULL;
> +
>         if (info->irq)
>                 unbind_from_irqhandler(info->irq, info);
>         info->evtchn = info->irq = 0;
> -
>  }
> 
>  static void blkif_completion(struct blk_shadow *s)
> @@ -875,8 +883,27 @@ static int talk_to_blkback(struct xenbus_device *dev,
>  {
>         const char *message = NULL;
>         struct xenbus_transaction xbt;
> +       unsigned int ring_order;
> +       int legacy_backend;
> +       int i;
>         int err;
> 
> +       for (i = 0; i < (1 << info->ring_order); i++)
> +               info->ring_ref[i] = GRANT_INVALID_REF;
> +
> +       err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-page-order", "%u",
> +                          &ring_order);

At least the frontend should imo also support the alternative interface
(using "max-ring-pages" etc).
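
E.g. (sketch; "max-ring-pages" as the alternative key name is an
assumption based on the classic-Xen convention):

	err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-page-order",
			   "%u", &ring_order);
	if (err != 1) {
		unsigned int ring_pages;

		err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-pages",
				   "%u", &ring_pages);
		if (err == 1)
			ring_order = order_base_2(ring_pages);
	}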

> +
> +       legacy_backend = !(err == 1);
> +
> +       if (legacy_backend) {
> +               info->ring_order = 0;
> +       } else {
> +               info->ring_order = (ring_order <= xen_blkif_ring_order) ?
> +                                  ring_order :
> +                                  xen_blkif_ring_order;

min()?
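
I.e., something like:

	info->ring_order = min_t(unsigned int, ring_order,
				 xen_blkif_ring_order);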

> +       }
> +
>         /* Create shared ring, alloc event channel. */
>         err = setup_blkring(dev, info);
>         if (err)
> @@ -889,12 +916,35 @@ again:
>                 goto destroy_blkring;
>         }
> 
> -       err = xenbus_printf(xbt, dev->nodename,
> -                           "ring-ref", "%u", info->ring_ref);
> -       if (err) {
> -               message = "writing ring-ref";
> -               goto abort_transaction;
> +       if (legacy_backend) {

Why not use the simpler interface always when info->ring_order == 0?

> +               err = xenbus_printf(xbt, dev->nodename,
> +                                   "ring-ref", "%d", info->ring_ref[0]);
> +               if (err) {
> +                       message = "writing ring-ref";
> +                       goto abort_transaction;
> +               }
> +       } else {
> +               for (i = 0; i < (1 << info->ring_order); i++) {
> +                       char key[sizeof("ring-ref") + 2];
> +
> +                       sprintf(key, "ring-ref%d", i);
> +
> +                       err = xenbus_printf(xbt, dev->nodename,
> +                                           key, "%d", info->ring_ref[i]);
> +                       if (err) {
> +                               message = "writing ring-ref";
> +                               goto abort_transaction;
> +                       }
> +               }
> +
> +               err = xenbus_printf(xbt, dev->nodename,
> +                                   "ring-page-order", "%u", info->ring_order);
> +               if (err) {
> +                       message = "writing ring-order";
> +                       goto abort_transaction;
> +               }
>         }
> +
>         err = xenbus_printf(xbt, dev->nodename,
>                             "event-channel", "%u", info->evtchn);
>         if (err) {
> @@ -996,21 +1046,14 @@ static int blkfront_probe(struct xenbus_device *dev,
>         info->connected = BLKIF_STATE_DISCONNECTED;
>         INIT_WORK(&info->work, blkif_restart_queue);
> 
> -       for (i = 0; i < BLK_RING_SIZE; i++)
> +       for (i = 0; i < BLK_MAX_RING_SIZE; i++)
>                 info->shadow[i].req.u.rw.id = i+1;
> -       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> +       info->shadow[BLK_MAX_RING_SIZE-1].req.u.rw.id = 0x0fffffff;

A proper terminator must also be written in talk_to_blkback() once
the actual ring size is known.

Further, blkif_recover() must be able to deal with a change of the
allowed upper bound.
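
Something like this in talk_to_blkback(), once info->ring_order is
settled (sketch):

	for (i = 0; i < BLK_RING_SIZE(info->ring_order); i++)
		info->shadow[i].req.u.rw.id = i + 1;
	info->shadow[BLK_RING_SIZE(info->ring_order) - 1].req.u.rw.id =
		0x0fffffff;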

>         /* Front end dir is a number, which is used as the id. */
>         info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
>         dev_set_drvdata(&dev->dev, info);
> 
> -       err = talk_to_blkback(dev, info);

Completely removing this here is wrong afaict - what if the backend
already is in InitWait when the frontend starts?

Further, whatever is done to this call here also needs to be done in
blkfront_resume().
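
E.g. (sketch, in blkfront_probe(), mirroring the removed error path):

	if (xenbus_read_driver_state(dev->otherend) ==
	    XenbusStateInitWait) {
		err = talk_to_blkback(dev, info);
		if (err) {
			kfree(info);
			dev_set_drvdata(&dev->dev, NULL);
			return err;
		}
	}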

> -       if (err) {
> -               kfree(info);
> -               dev_set_drvdata(&dev->dev, NULL);
> -               return err;
> -       }
> -
>         return 0;
>  }
> 
> @@ -1307,6 +1349,10 @@ static void blkback_changed(struct xenbus_device *dev,
>         case XenbusStateClosed:
>                 break;
> 
> +       case XenbusStateInitWait:
> +               talk_to_blkback(dev, info);

This call can return an error.
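
E.g. (sketch; err would be a new local in blkback_changed()):

	err = talk_to_blkback(dev, info);
	if (err)
		xenbus_dev_fatal(dev, err, "talking to backend");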

> +               break;
> +
>         case XenbusStateConnected:
>                 blkfront_connect(info);
>                 break;
> --- a/include/xen/xenbus.h
> +++ b/include/xen/xenbus.h
> @@ -195,15 +195,23 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
>                          const char *pathfmt, ...);
> 
>  int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
> -int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
> -int xenbus_map_ring_valloc(struct xenbus_device *dev,
> -                          int gnt_ref, void **vaddr);
> -int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
> -                          grant_handle_t *handle, void *vaddr);
> +
> +#define        XENBUS_MAX_RING_ORDER   2
> +#define        XENBUS_MAX_RING_PAGES   (1 << XENBUS_MAX_RING_ORDER)

Why do you need an artificial global limit here? Each driver can decide
individually what its limit should be.

Jan

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-05 21:49 ` [PATCH 0001/001] xen: multi page ring support for block devices Santosh Jodh
  2012-03-06  2:42   ` Rusty Russell
  2012-03-06  8:34   ` Jan Beulich
@ 2012-03-06 11:16   ` Wei Liu
  2012-03-06 17:20   ` [Xen-devel] " Konrad Rzeszutek Wilk
  3 siblings, 0 replies; 15+ messages in thread
From: Wei Liu @ 2012-03-06 11:16 UTC (permalink / raw)
  To: Santosh Jodh
  Cc: jeremy, wei.liu2, Ian Campbell, konrad.wilk, waldi, weiyi.huang,
	rusty, joe.jin, linux-kernel, virtualization, paul.gortmaker,
	linux-pci, Paul Durrant, jbarnes, netdev, dgdegra, xen-devel,
	lersek@redhat.com

On Mon, 2012-03-05 at 21:49 +0000, Santosh Jodh wrote:
> From: Santosh Jodh <santosh.jodh@citrix.com>
> 
> Add support for multi page ring for block devices.
> The number of pages is configurable for blkback via module parameter.
> blkback reports max-ring-page-order to blkfront via xenstore.
> blkfront reports its supported ring-page-order to blkback via xenstore.
> blkfront reports multi page ring references via ring-refNN in xenstore.
> The change allows newer blkfront to work with older blkback and
> vice-versa.
> Based on original patch by Paul Durrant.
> 
> Signed-off-by: Santosh Jodh <santosh.jodh@citrix.com>


Doesn't the xenbus interface change deserve a separate patch (as a
prerequisite for the block device changes)? Or at least please mention
the change in the commit message?


Wei.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Xen-devel] [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-05 21:49 ` [PATCH 0001/001] xen: multi page ring support for block devices Santosh Jodh
                     ` (2 preceding siblings ...)
  2012-03-06 11:16   ` Wei Liu
@ 2012-03-06 17:20   ` Konrad Rzeszutek Wilk
  2012-03-07  9:33     ` Jan Beulich
  3 siblings, 1 reply; 15+ messages in thread
From: Konrad Rzeszutek Wilk @ 2012-03-06 17:20 UTC (permalink / raw)
  To: Santosh Jodh
  Cc: jeremy, Ian Campbell, konrad.wilk, waldi, weiyi.huang, joe.jin,
	linux-kernel, jbeulich, virtualization, paul.gortmaker,
	linux-pci, Paul Durrant, jbarnes, netdev, dgdegra, xen-devel,
	lersek,

On Mon, Mar 5, 2012 at 4:49 PM, Santosh Jodh <Santosh.Jodh@citrix.com> wrote:
> From: Santosh Jodh <santosh.jodh@citrix.com>
>
> Add support for multi page ring for block devices.
> The number of pages is configurable for blkback via module parameter.
> blkback reports max-ring-page-order to blkfront via xenstore.
> blkfront reports its supported ring-page-order to blkback via xenstore.
> blkfront reports multi page ring references via ring-refNN in xenstore.
> The change allows newer blkfront to work with older blkback and
> vice-versa.
> Based on original patch by Paul Durrant.

you should include his SoB in this patch.

The patch overall looks OK, though I do have some comments:

 -> the call to "xenbus_ring_ops_init();" looks like a bug-fix? If so,
it should be a separate patch.
 -> the usage of XenbusStateInitWait? Why do we introduce that? Looks
like a fix to something.
 -> XENBUS_MAX_RING_PAGES - why 2? Why not 4? What is the optimal
default size for SSD usage? 16?
 -> don't do sprintf, use snprintf (see the sketch after this list).
 -> don't use printk(KERN_..), use pr_info or the variants pr_err,
pr_debug, etc.
 -> don't split the printk contents. It is OK for them to be more than
80 characters long.
 -> check that xen_blkif_ring_order is under XENBUS_MAX_RING_ORDER.
Otherwise a joker could do = 9999999999999999999 for ring size and we
would try to use that (again, see the sketch after this list).
 -> Separate out the patch that introduces the changes to the XenBus
infrastructure; the changes to net* and blk* to use the extra
arguments would be folded into that patch. The patch that implements
the multi page ring for blkback then depends on that XenBus
modifications patch. Also make sure you CC David Miller and Jens
Axboe on the XenBus patch as it modifies the net-* side which
requires Ian's and David's Ack.
 -> Have you done a sanity/test check where the backend and frontend
have different size rings? Just to make sure nothing explodes.
>
> Signed-off-by: Santosh Jodh <santosh.jodh@citrix.com>
> ---
> diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
> index 0088bf6..72f2e18 100644
> --- a/drivers/block/xen-blkback/blkback.c
> +++ b/drivers/block/xen-blkback/blkback.c
> @@ -60,6 +60,39 @@ static int xen_blkif_reqs = 64;
>  module_param_named(reqs, xen_blkif_reqs, int, 0);
>  MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
>
> +/* Order of maximum shared ring size advertised to the front end. */
> +int xen_blkif_max_ring_order = XENBUS_MAX_RING_ORDER;
> +
> +#define BLK_RING_SIZE(_order) __CONST_RING_SIZE(blkif, PAGE_SIZE << (_order))
> +
> +static int set_max_ring_order(const char *buf, struct kernel_param *kp)
> +{
> +       int err;
> +       unsigned long order;
> +
> +       err = kstrtol(buf, 0, &order);
> +       if (err ||
> +           order < 0 ||
> +           order > XENBUS_MAX_RING_ORDER)
> +               return -EINVAL;
> +
> +       if (xen_blkif_reqs < BLK_RING_SIZE(order))
> +               printk(KERN_WARNING "WARNING: "
> +                      "I/O request space (%d reqs) < ring order %ld, "
> +                      "consider increasing %s.reqs to >= %ld.",
> +                      xen_blkif_reqs, order, KBUILD_MODNAME,
> +                      roundup_pow_of_two(BLK_RING_SIZE(order)));
> +
> +       xen_blkif_max_ring_order = order;
> +
> +       return 0;
> +}
> +
> +module_param_call(max_ring_order,
> +                 set_max_ring_order, param_get_int,
> +                 &xen_blkif_max_ring_order, 0644);
> +MODULE_PARM_DESC(max_ring_order, "log2 of maximum ring size, in pages.");
> +
>  /* Run-time switchable: /sys/module/blkback/parameters/ */
>  static unsigned int log_stats;
>  module_param(log_stats, int, 0644);
> diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
> index d0ee7ed..5f33a1a 100644
> --- a/drivers/block/xen-blkback/common.h
> +++ b/drivers/block/xen-blkback/common.h
> @@ -126,6 +126,8 @@ struct blkif_x86_64_response {
>        int16_t         status;          /* BLKIF_RSP_???       */
>  };
>
> +extern int xen_blkif_max_ring_order;
> +
>  DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
>                  struct blkif_common_response);
>  DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
> diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
> index 24a2fb5..7a9d71d 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -122,8 +122,8 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
>        return blkif;
>  }
>
> -static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
> -                        unsigned int evtchn)
> +static int xen_blkif_map(struct xen_blkif *blkif, int ring_ref[],
> +                        unsigned int ring_order, unsigned int evtchn)
>  {
>        int err;
>
> @@ -131,7 +131,8 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
>        if (blkif->irq)
>                return 0;
>
> -       err = xenbus_map_ring_valloc(blkif->be->dev, shared_page, &blkif->blk_ring);
> +       err = xenbus_map_ring_valloc(blkif->be->dev, ring_ref, 1 << ring_order,
> +                                    &blkif->blk_ring);
>        if (err < 0)
>                return err;
>
> @@ -140,21 +141,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
>        {
>                struct blkif_sring *sring;
>                sring = (struct blkif_sring *)blkif->blk_ring;
> -               BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
> +               BACK_RING_INIT(&blkif->blk_rings.native, sring,
> +                              PAGE_SIZE << ring_order);
>                break;
>        }
>        case BLKIF_PROTOCOL_X86_32:
>        {
>                struct blkif_x86_32_sring *sring_x86_32;
>                sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
> -               BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
> +               BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32,
> +                              PAGE_SIZE << ring_order);
>                break;
>        }
>        case BLKIF_PROTOCOL_X86_64:
>        {
>                struct blkif_x86_64_sring *sring_x86_64;
>                sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
> -               BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
> +               BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64,
> +                              PAGE_SIZE << ring_order);
>                break;
>        }
>        default:
> @@ -497,6 +501,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
>        if (err)
>                goto fail;
>
> +       err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order",
> +                           "%u", xen_blkif_max_ring_order);
> +       if (err)
> +               goto fail;
> +
>        err = xenbus_switch_state(dev, XenbusStateInitWait);
>        if (err)
>                goto fail;
> @@ -744,22 +753,80 @@ again:
>  static int connect_ring(struct backend_info *be)
>  {
>        struct xenbus_device *dev = be->dev;
> -       unsigned long ring_ref;
> +       int ring_ref[XENBUS_MAX_RING_PAGES];
> +       unsigned int ring_order;
>        unsigned int evtchn;
>        char protocol[64] = "";
>        int err;
>
>        DPRINTK("%s", dev->otherend);
>
> -       err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
> -                           &ring_ref, "event-channel", "%u", &evtchn, NULL);
> -       if (err) {
> -               xenbus_dev_fatal(dev, err,
> -                                "reading %s/ring-ref and event-channel",
> +       err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
> +                          &evtchn);
> +       if (err != 1) {
> +               err = -EINVAL;
> +
> +               xenbus_dev_fatal(dev, err, "reading %s/event-channel",
>                                 dev->otherend);
>                return err;
>        }
>
> +       printk(KERN_INFO "blkback: event-channel %u\n", evtchn);
> +
> +       err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
> +                          &ring_order);
> +       if (err != 1) {
> +               DPRINTK("%s: using single page handshake", dev->otherend);
> +
> +               ring_order = 0;
> +
> +               err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
> +                                  "%d", &ring_ref[0]);
> +               if (err != 1) {
> +                       err = -EINVAL;
> +
> +                       xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
> +                                        dev->otherend);
> +                       return err;
> +               }
> +
> +               printk(KERN_INFO "blkback: ring-ref %d\n", ring_ref[0]);
> +       } else {
> +               unsigned int i;
> +
> +               if (ring_order > xen_blkif_max_ring_order) {
> +                       err = -EINVAL;
> +
> +                       xenbus_dev_fatal(dev, err,
> +                                        "%s/ring-page-order too big",
> +                                        dev->otherend);
> +                       return err;
> +               }
> +
> +               for (i = 0; i < (1u << ring_order); i++) {
> +                       char ring_ref_name[10];
> +
> +                       snprintf(ring_ref_name, sizeof(ring_ref_name),
> +                                "ring-ref%u", i);
> +
> +                       err = xenbus_scanf(XBT_NIL, dev->otherend,
> +                                          ring_ref_name, "%d",
> +                                          &ring_ref[i]);
> +                       if (err != 1) {
> +                               err = -EINVAL;
> +
> +                               xenbus_dev_fatal(dev, err,
> +                                                "reading %s/%s",
> +                                                dev->otherend,
> +                                                ring_ref_name);
> +                               return err;
> +                       }
> +
> +                       printk(KERN_INFO "blkback: ring-ref%u %d\n", i,
> +                              ring_ref[i]);
> +               }
> +       }
> +
>        be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
>        err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
>                            "%63s", protocol, NULL);
> @@ -775,14 +842,11 @@ static int connect_ring(struct backend_info *be)
>                xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
>                return -1;
>        }
> -       pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n",
> -               ring_ref, evtchn, be->blkif->blk_protocol, protocol);
>
>        /* Map the shared frame, irq etc. */
> -       err = xen_blkif_map(be->blkif, ring_ref, evtchn);
> +       err = xen_blkif_map(be->blkif, ring_ref, ring_order, evtchn);
>        if (err) {
> -               xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
> -                                ring_ref, evtchn);
> +               xenbus_dev_fatal(dev, err, "mapping ring-refs and evtchn");
>                return err;
>        }
>
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 2f22874..485813a 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -57,6 +57,10 @@
>
>  #include <asm/xen/hypervisor.h>
>
> +static int xen_blkif_ring_order;
> +module_param_named(ring_order, xen_blkif_ring_order, int, 0);
> +MODULE_PARM_DESC(ring_order, "log2 of requested ring size, in pages.");
> +
>  enum blkif_state {
>        BLKIF_STATE_DISCONNECTED,
>        BLKIF_STATE_CONNECTED,
> @@ -72,7 +76,8 @@ struct blk_shadow {
>  static DEFINE_MUTEX(blkfront_mutex);
>  static const struct block_device_operations xlvbd_block_fops;
>
> -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
> +#define BLK_RING_SIZE(_order)  __CONST_RING_SIZE(blkif, PAGE_SIZE << (_order))
> +#define BLK_MAX_RING_SIZE      BLK_RING_SIZE(XENBUS_MAX_RING_ORDER)
>
>  /*
>  * We have one of these per vbd, whether ide, scsi or 'other'.  They
> @@ -87,14 +92,15 @@ struct blkfront_info
>        int vdevice;
>        blkif_vdev_t handle;
>        enum blkif_state connected;
> -       int ring_ref;
> +       int ring_ref[XENBUS_MAX_RING_PAGES];
> +       int ring_order;
>        struct blkif_front_ring ring;
>        struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
>        unsigned int evtchn, irq;
>        struct request_queue *rq;
>        struct work_struct work;
>        struct gnttab_free_callback callback;
> -       struct blk_shadow shadow[BLK_RING_SIZE];
> +       struct blk_shadow shadow[BLK_MAX_RING_SIZE];
>        unsigned long shadow_free;
>        unsigned int feature_flush;
>        unsigned int flush_op;
> @@ -111,9 +117,7 @@ static unsigned int nr_minors;
>  static unsigned long *minors;
>  static DEFINE_SPINLOCK(minor_lock);
>
> -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
> -       (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
> -#define GRANT_INVALID_REF      0
> +#define GRANT_INVALID_REF      0
>
>  #define PARTS_PER_DISK         16
>  #define PARTS_PER_EXT_DISK      256
> @@ -135,7 +139,7 @@ static DEFINE_SPINLOCK(minor_lock);
>  static int get_id_from_freelist(struct blkfront_info *info)
>  {
>        unsigned long free = info->shadow_free;
> -       BUG_ON(free >= BLK_RING_SIZE);
> +       BUG_ON(free >= BLK_MAX_RING_SIZE);
>        info->shadow_free = info->shadow[free].req.u.rw.id;
>        info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
>        return free;
> @@ -683,6 +687,8 @@ static void blkif_restart_queue(struct work_struct *work)
>
>  static void blkif_free(struct blkfront_info *info, int suspend)
>  {
> +       int i;
> +
>        /* Prevent new requests being issued until we fix things up. */
>        spin_lock_irq(&blkif_io_lock);
>        info->connected = suspend ?
> @@ -698,16 +704,19 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>        flush_work_sync(&info->work);
>
>        /* Free resources associated with old device channel. */
> -       if (info->ring_ref != GRANT_INVALID_REF) {
> -               gnttab_end_foreign_access(info->ring_ref, 0,
> -                                         (unsigned long)info->ring.sring);
> -               info->ring_ref = GRANT_INVALID_REF;
> -               info->ring.sring = NULL;
> +       for (i = 0; i < (1 << info->ring_order); i++) {
> +               if (info->ring_ref[i] != GRANT_INVALID_REF) {
> +                       gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
> +                       info->ring_ref[i] = GRANT_INVALID_REF;
> +               }
>        }
> +
> +       free_pages((unsigned long)info->ring.sring, info->ring_order);
> +       info->ring.sring = NULL;
> +
>        if (info->irq)
>                unbind_from_irqhandler(info->irq, info);
>        info->evtchn = info->irq = 0;
> -
>  }
>
>  static void blkif_completion(struct blk_shadow *s)
> @@ -828,25 +837,24 @@ static int setup_blkring(struct xenbus_device *dev,
>        struct blkif_sring *sring;
>        int err;
>
> -       info->ring_ref = GRANT_INVALID_REF;
> -
> -       sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
> +       sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
> +                                                      info->ring_order);
>        if (!sring) {
>                xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
>                return -ENOMEM;
>        }
>        SHARED_RING_INIT(sring);
> -       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
> +       FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE << info->ring_order);
>
>        sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
>
> -       err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
> +       err = xenbus_grant_ring(dev, info->ring.sring, 1 << info->ring_order,
> +                               info->ring_ref);
>        if (err < 0) {
> -               free_page((unsigned long)sring);
> +               free_pages((unsigned long)sring, info->ring_order);
>                info->ring.sring = NULL;
>                goto fail;
>        }
> -       info->ring_ref = err;
>
>        err = xenbus_alloc_evtchn(dev, &info->evtchn);
>        if (err)
> @@ -875,8 +883,27 @@ static int talk_to_blkback(struct xenbus_device *dev,
>  {
>        const char *message = NULL;
>        struct xenbus_transaction xbt;
> +       unsigned int ring_order;
> +       int legacy_backend;
> +       int i;
>        int err;
>
> +       for (i = 0; i < XENBUS_MAX_RING_PAGES; i++)
> +               info->ring_ref[i] = GRANT_INVALID_REF;
> +
> +       err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-page-order", "%u",
> +                          &ring_order);
> +
> +       legacy_backend = !(err == 1);
> +
> +       if (legacy_backend) {
> +               info->ring_order = 0;
> +       } else {
> +               info->ring_order = (ring_order <= xen_blkif_ring_order) ?
> +                                  ring_order :
> +                                  xen_blkif_ring_order;
> +       }
> +
>        /* Create shared ring, alloc event channel. */
>        err = setup_blkring(dev, info);
>        if (err)
> @@ -889,12 +916,35 @@ again:
>                goto destroy_blkring;
>        }
>
> -       err = xenbus_printf(xbt, dev->nodename,
> -                           "ring-ref", "%u", info->ring_ref);
> -       if (err) {
> -               message = "writing ring-ref";
> -               goto abort_transaction;
> +       if (legacy_backend) {
> +               err = xenbus_printf(xbt, dev->nodename,
> +                                   "ring-ref", "%d", info->ring_ref[0]);
> +               if (err) {
> +                       message = "writing ring-ref";
> +                       goto abort_transaction;
> +               }
> +       } else {
> +               for (i = 0; i < (1 << info->ring_order); i++) {
> +                       char key[sizeof("ring-ref") + 2];
> +
> +                       sprintf(key, "ring-ref%d", i);
> +
> +                       err = xenbus_printf(xbt, dev->nodename,
> +                                           key, "%d", info->ring_ref[i]);
> +                       if (err) {
> +                               message = "writing ring-ref";
> +                               goto abort_transaction;
> +                       }
> +               }
> +
> +               err = xenbus_printf(xbt, dev->nodename,
> +                                   "ring-page-order", "%u", info->ring_order);
> +               if (err) {
> +                               message = "writing ring-page-order";
> +                       goto abort_transaction;
> +               }
>        }
> +
>        err = xenbus_printf(xbt, dev->nodename,
>                            "event-channel", "%u", info->evtchn);
>        if (err) {
> @@ -996,21 +1046,14 @@ static int blkfront_probe(struct xenbus_device *dev,
>        info->connected = BLKIF_STATE_DISCONNECTED;
>        INIT_WORK(&info->work, blkif_restart_queue);
>
> -       for (i = 0; i < BLK_RING_SIZE; i++)
> +       for (i = 0; i < BLK_MAX_RING_SIZE; i++)
>                info->shadow[i].req.u.rw.id = i+1;
> -       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> +       info->shadow[BLK_MAX_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>
>        /* Front end dir is a number, which is used as the id. */
>        info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
>        dev_set_drvdata(&dev->dev, info);
>
> -       err = talk_to_blkback(dev, info);
> -       if (err) {
> -               kfree(info);
> -               dev_set_drvdata(&dev->dev, NULL);
> -               return err;
> -       }
> -
>        return 0;
>  }
>
> @@ -1031,13 +1074,13 @@ static int blkif_recover(struct blkfront_info *info)
>
>        /* Stage 2: Set up free list. */
>        memset(&info->shadow, 0, sizeof(info->shadow));
> -       for (i = 0; i < BLK_RING_SIZE; i++)
> +       for (i = 0; i < BLK_MAX_RING_SIZE; i++)
>                info->shadow[i].req.u.rw.id = i+1;
>        info->shadow_free = info->ring.req_prod_pvt;
> -       info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> +       info->shadow[BLK_MAX_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>
>        /* Stage 3: Find pending requests and requeue them. */
> -       for (i = 0; i < BLK_RING_SIZE; i++) {
> +       for (i = 0; i < BLK_RING_SIZE(info->ring_order); i++) {
>                /* Not in use? */
>                if (!copy[i].request)
>                        continue;
> @@ -1299,7 +1342,6 @@ static void blkback_changed(struct xenbus_device *dev,
>
>        switch (backend_state) {
>        case XenbusStateInitialising:
> -       case XenbusStateInitWait:
>        case XenbusStateInitialised:
>        case XenbusStateReconfiguring:
>        case XenbusStateReconfigured:
> @@ -1307,6 +1349,10 @@ static void blkback_changed(struct xenbus_device *dev,
>        case XenbusStateClosed:
>                break;
>
> +       case XenbusStateInitWait:
> +               talk_to_blkback(dev, info);
> +               break;
> +
>        case XenbusStateConnected:
>                blkfront_connect(info);
>                break;
> diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
> index 94b79c3..f93b59a 100644
> --- a/drivers/net/xen-netback/common.h
> +++ b/drivers/net/xen-netback/common.h
> @@ -130,8 +130,8 @@ int xen_netbk_must_stop_queue(struct xenvif *vif);
>  /* (Un)Map communication rings. */
>  void xen_netbk_unmap_frontend_rings(struct xenvif *vif);
>  int xen_netbk_map_frontend_rings(struct xenvif *vif,
> -                                grant_ref_t tx_ring_ref,
> -                                grant_ref_t rx_ring_ref);
> +                                int tx_ring_ref,
> +                                int rx_ring_ref);
>
>  /* (De)Register a xenvif with the netback backend. */
>  void xen_netbk_add_xenvif(struct xenvif *vif);
> diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c
> index 59effac..0b014cf 100644
> --- a/drivers/net/xen-netback/netback.c
> +++ b/drivers/net/xen-netback/netback.c
> @@ -1594,8 +1594,8 @@ void xen_netbk_unmap_frontend_rings(struct xenvif *vif)
>  }
>
>  int xen_netbk_map_frontend_rings(struct xenvif *vif,
> -                                grant_ref_t tx_ring_ref,
> -                                grant_ref_t rx_ring_ref)
> +                                int tx_ring_ref,
> +                                int rx_ring_ref)
>  {
>        void *addr;
>        struct xen_netif_tx_sring *txs;
> @@ -1604,7 +1604,7 @@ int xen_netbk_map_frontend_rings(struct xenvif *vif,
>        int err = -ENOMEM;
>
>        err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(vif),
> -                                    tx_ring_ref, &addr);
> +                                    &tx_ring_ref, 1, &addr);
>        if (err)
>                goto err;
>
> @@ -1612,7 +1612,7 @@ int xen_netbk_map_frontend_rings(struct xenvif *vif,
>        BACK_RING_INIT(&vif->tx, txs, PAGE_SIZE);
>
>        err = xenbus_map_ring_valloc(xenvif_to_xenbus_device(vif),
> -                                    rx_ring_ref, &addr);
> +                                    &rx_ring_ref, 1, &addr);
>        if (err)
>                goto err;
>
> diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c
> index 698b905..521a595 100644
> --- a/drivers/net/xen-netfront.c
> +++ b/drivers/net/xen-netfront.c
> @@ -1496,13 +1496,12 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
>        SHARED_RING_INIT(txs);
>        FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE);
>
> -       err = xenbus_grant_ring(dev, virt_to_mfn(txs));
> +       err = xenbus_grant_ring(dev, txs, 1, &info->tx_ring_ref);
>        if (err < 0) {
>                free_page((unsigned long)txs);
>                goto fail;
>        }
>
> -       info->tx_ring_ref = err;
>        rxs = (struct xen_netif_rx_sring *)get_zeroed_page(GFP_NOIO | __GFP_HIGH);
>        if (!rxs) {
>                err = -ENOMEM;
> @@ -1512,12 +1511,11 @@ static int setup_netfront(struct xenbus_device *dev, struct netfront_info *info)
>        SHARED_RING_INIT(rxs);
>        FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE);
>
> -       err = xenbus_grant_ring(dev, virt_to_mfn(rxs));
> +       err = xenbus_grant_ring(dev, rxs, 1, &info->rx_ring_ref);
>        if (err < 0) {
>                free_page((unsigned long)rxs);
>                goto fail;
>        }
> -       info->rx_ring_ref = err;
>
>        err = xenbus_alloc_evtchn(dev, &info->evtchn);
>        if (err)
> diff --git a/drivers/pci/xen-pcifront.c b/drivers/pci/xen-pcifront.c
> index 1620088..95109d8 100644
> --- a/drivers/pci/xen-pcifront.c
> +++ b/drivers/pci/xen-pcifront.c
> @@ -768,12 +768,10 @@ static int pcifront_publish_info(struct pcifront_device *pdev)
>        int err = 0;
>        struct xenbus_transaction trans;
>
> -       err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info));
> +       err = xenbus_grant_ring(pdev->xdev, pdev->sh_info, 1, &pdev->gnt_ref);
>        if (err < 0)
>                goto out;
>
> -       pdev->gnt_ref = err;
> -
>        err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn);
>        if (err)
>                goto out;
> diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c
> index 64b11f9..e0834cd 100644
> --- a/drivers/xen/xen-pciback/xenbus.c
> +++ b/drivers/xen/xen-pciback/xenbus.c
> @@ -108,7 +108,7 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref,
>                "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n",
>                gnt_ref, remote_evtchn);
>
> -       err = xenbus_map_ring_valloc(pdev->xdev, gnt_ref, &vaddr);
> +       err = xenbus_map_ring_valloc(pdev->xdev, &gnt_ref, 1, &vaddr);
>        if (err < 0) {
>                xenbus_dev_fatal(pdev->xdev, err,
>                                "Error mapping other domain page in ours.");
> diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c
> index 566d2ad..3a14524 100644
> --- a/drivers/xen/xenbus/xenbus_client.c
> +++ b/drivers/xen/xenbus/xenbus_client.c
> @@ -53,14 +53,16 @@ struct xenbus_map_node {
>                struct vm_struct *area; /* PV */
>                struct page *page;     /* HVM */
>        };
> -       grant_handle_t handle;
> +       grant_handle_t handle[XENBUS_MAX_RING_PAGES];
> +       unsigned int   nr_handles;
>  };
>
>  static DEFINE_SPINLOCK(xenbus_valloc_lock);
>  static LIST_HEAD(xenbus_valloc_pages);
>
>  struct xenbus_ring_ops {
> -       int (*map)(struct xenbus_device *dev, int gnt, void **vaddr);
> +       int (*map)(struct xenbus_device *dev, int gnt[], int nr_gnts,
> +                  void **vaddr);
>        int (*unmap)(struct xenbus_device *dev, void *vaddr);
>  };
>
> @@ -356,17 +358,38 @@ static void xenbus_switch_fatal(struct xenbus_device *dev, int depth, int err,
>  /**
>  * xenbus_grant_ring
>  * @dev: xenbus device
> - * @ring_mfn: mfn of ring to grant
> -
> - * Grant access to the given @ring_mfn to the peer of the given device.  Return
> - * 0 on success, or -errno on error.  On error, the device will switch to
> - * XenbusStateClosing, and the error will be saved in the store.
> + * @vaddr: starting virtual address of the ring
> + * @nr_pages: number of page to be granted
> + * @grefs: grant reference array to be filled in
> + * Grant access to the given @vaddr to the peer of the given device.
> + * Then fill in @grefs with grant references.  Return 0 on success, or
> + * -errno on error.  On error, the device will switch to
> + * XenbusStateClosing, and the first error will be saved in the store.
>  */
> -int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
> +int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr,
> +                     int nr_pages, int grefs[])
>  {
> -       int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
> -       if (err < 0)
> -               xenbus_dev_fatal(dev, err, "granting access to ring page");
> +       int i;
> +       int err;
> +
> +       for (i = 0; i < nr_pages; i++) {
> +               unsigned long addr = (unsigned long)vaddr +
> +                       (PAGE_SIZE * i);
> +               err = gnttab_grant_foreign_access(dev->otherend_id,
> +                                                 virt_to_mfn(addr), 0);
> +               if (err < 0) {
> +                       xenbus_dev_fatal(dev, err,
> +                                        "granting access to ring page");
> +                       goto fail;
> +               }
> +               grefs[i] = err;
> +       }
> +
> +       return 0;
> +
> +fail:
> +       for (i--; i >= 0; i--)
> +               gnttab_end_foreign_access_ref(grefs[i], 0);
>        return err;
>  }
>  EXPORT_SYMBOL_GPL(xenbus_grant_ring);
> @@ -447,7 +470,8 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
>  /**
>  * xenbus_map_ring_valloc
>  * @dev: xenbus device
> - * @gnt_ref: grant reference
> + * @gnt_ref: grant reference array
> + * @nr_grefs: number of grant reference
>  * @vaddr: pointer to address to be filled out by mapping
>  *
>  * Based on Rusty Russell's skeleton driver's map_page.
> @@ -458,23 +482,28 @@ EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
>  * or -ENOMEM on error. If an error is returned, device will switch to
>  * XenbusStateClosing and the error message will be saved in XenStore.
>  */
> -int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr)
> +int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref[],
> +                          int nr_grefs, void **vaddr)
>  {
> -       return ring_ops->map(dev, gnt_ref, vaddr);
> +       return ring_ops->map(dev, gnt_ref, nr_grefs, vaddr);
>  }
>  EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc);
>
> +static int __xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev,
> +                                       struct xenbus_map_node *node);
> +
>  static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
> -                                    int gnt_ref, void **vaddr)
> +                                    int gnt_ref[], int nr_grefs, void **vaddr)
>  {
> -       struct gnttab_map_grant_ref op = {
> -               .flags = GNTMAP_host_map | GNTMAP_contains_pte,
> -               .ref   = gnt_ref,
> -               .dom   = dev->otherend_id,
> -       };
> +       struct gnttab_map_grant_ref op[XENBUS_MAX_RING_PAGES];
>        struct xenbus_map_node *node;
>        struct vm_struct *area;
> -       pte_t *pte;
> +       pte_t *pte[XENBUS_MAX_RING_PAGES];
> +       int i;
> +       int err = 0;
> +
> +       if (nr_grefs > XENBUS_MAX_RING_PAGES)
> +               return -EINVAL;
>
>        *vaddr = NULL;
>
> @@ -482,28 +511,44 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
>        if (!node)
>                return -ENOMEM;
>
> -       area = alloc_vm_area(PAGE_SIZE, &pte);
> +       area = alloc_vm_area(PAGE_SIZE * nr_grefs, pte);
>        if (!area) {
>                kfree(node);
>                return -ENOMEM;
>        }
>
> -       op.host_addr = arbitrary_virt_to_machine(pte).maddr;
> +       for (i = 0; i < nr_grefs; i++) {
> +               op[i].flags = GNTMAP_host_map | GNTMAP_contains_pte;
> +               op[i].ref   = gnt_ref[i];
> +               op[i].dom   = dev->otherend_id;
> +               op[i].host_addr = arbitrary_virt_to_machine(pte[i]).maddr;
> +       }
>
> -       if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
> +       if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, nr_grefs))
>                BUG();
>
> -       if (op.status != GNTST_okay) {
> -               free_vm_area(area);
> -               kfree(node);
> -               xenbus_dev_fatal(dev, op.status,
> -                                "mapping in shared page %d from domain %d",
> -                                gnt_ref, dev->otherend_id);
> -               return op.status;
> +       node->nr_handles = nr_grefs;
> +       node->area = area;
> +
> +       for (i = 0; i < nr_grefs; i++) {
> +               if (op[i].status != GNTST_okay) {
> +                       err = op[i].status;
> +                       node->handle[i] = INVALID_GRANT_HANDLE;
> +                       continue;
> +               }
> +               node->handle[i] = op[i].handle;
>        }
>
> -       node->handle = op.handle;
> -       node->area = area;
> +       if (err != 0) {
> +               for (i = 0; i < nr_grefs; i++)
> +                       xenbus_dev_fatal(dev, op[i].status,
> +                               "mapping in shared page %d from domain %d",
> +                               gnt_ref[i], dev->otherend_id);
> +
> +                __xenbus_unmap_ring_vfree_pv(dev, node);
> +
> +               return err;
> +       }
>
>        spin_lock(&xenbus_valloc_lock);
>        list_add(&node->next, &xenbus_valloc_pages);
> @@ -514,25 +559,29 @@ static int xenbus_map_ring_valloc_pv(struct xenbus_device *dev,
>  }
>
>  static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
> -                                     int gnt_ref, void **vaddr)
> +                                     int gnt_ref[], int nr_grefs, void **vaddr)
>  {
>        struct xenbus_map_node *node;
>        int err;
>        void *addr;
>
> +       if (nr_grefs > XENBUS_MAX_RING_PAGES)
> +               return -EINVAL;
> +
>        *vaddr = NULL;
>
>        node = kzalloc(sizeof(*node), GFP_KERNEL);
>        if (!node)
>                return -ENOMEM;
>
> -       err = alloc_xenballooned_pages(1, &node->page, false /* lowmem */);
> +       err = alloc_xenballooned_pages(nr_grefs, &node->page,
> +                                      false /* lowmem */);
>        if (err)
>                goto out_err;
>
>        addr = pfn_to_kaddr(page_to_pfn(node->page));
>
> -       err = xenbus_map_ring(dev, gnt_ref, &node->handle, addr);
> +       err = xenbus_map_ring(dev, gnt_ref, nr_grefs, node->handle, addr);
>        if (err)
>                goto out_err;
>
> @@ -544,7 +593,7 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
>        return 0;
>
>  out_err:
> -       free_xenballooned_pages(1, &node->page);
> +       free_xenballooned_pages(nr_grefs, &node->page);
>        kfree(node);
>        return err;
>  }
> @@ -553,36 +602,51 @@ static int xenbus_map_ring_valloc_hvm(struct xenbus_device *dev,
>  /**
>  * xenbus_map_ring
>  * @dev: xenbus device
> - * @gnt_ref: grant reference
> - * @handle: pointer to grant handle to be filled
> + * @gnt_ref: grant reference array
> + * @nr_grefs: number of grant references
> + * @handle: pointer to grant handle array to be filled, mind the size
>  * @vaddr: address to be mapped to
>  *
> - * Map a page of memory into this domain from another domain's grant table.
> + * Map pages of memory into this domain from another domain's grant table.
>  * xenbus_map_ring does not allocate the virtual address space (you must do
> - * this yourself!). It only maps in the page to the specified address.
> + * this yourself!). It only maps in the pages to the specified address.
>  * Returns 0 on success, and GNTST_* (see xen/include/interface/grant_table.h)
>  * or -ENOMEM on error. If an error is returned, device will switch to
> - * XenbusStateClosing and the error message will be saved in XenStore.
> + * XenbusStateClosing and the last error message will be saved in XenStore.
>  */
> -int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
> -                   grant_handle_t *handle, void *vaddr)
> +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref[], int nr_grefs,
> +                   grant_handle_t handle[], void *vaddr)
>  {
> -       struct gnttab_map_grant_ref op;
> -
> -       gnttab_set_map_op(&op, (phys_addr_t)vaddr, GNTMAP_host_map, gnt_ref,
> -                         dev->otherend_id);
> +       struct gnttab_map_grant_ref op[XENBUS_MAX_RING_PAGES];
> +       int i;
> +       int err = GNTST_okay;   /* 0 */
> +
> +       for (i = 0; i < nr_grefs; i++) {
> +               unsigned long addr = (unsigned long)vaddr +
> +                       (PAGE_SIZE * i);
> +               gnttab_set_map_op(&op[i], (phys_addr_t)addr,
> +                                 GNTMAP_host_map, gnt_ref[i],
> +                                 dev->otherend_id);
> +       }
>
> -       if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1))
> +       if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, op, nr_grefs))
>                BUG();
>
> -       if (op.status != GNTST_okay) {
> -               xenbus_dev_fatal(dev, op.status,
> -                                "mapping in shared page %d from domain %d",
> -                                gnt_ref, dev->otherend_id);
> -       } else
> -               *handle = op.handle;
> +       for (i = 0; i < nr_grefs; i++) {
> +               if (op[i].status != GNTST_okay) {
> +                       err = op[i].status;
> +                       xenbus_dev_fatal(dev, err,
> +                               "mapping in shared page %d from domain %d",
> +                               gnt_ref[i], dev->otherend_id);
> +                       handle[i] = INVALID_GRANT_HANDLE;
> +               } else
> +                       handle[i] = op[i].handle;
> +       }
>
> -       return op.status;
> +       if (err != GNTST_okay)
> +               xenbus_unmap_ring(dev, handle, nr_grefs, vaddr);
> +
> +       return err;
>  }
>  EXPORT_SYMBOL_GPL(xenbus_map_ring);
>
> @@ -605,13 +669,53 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr)
>  }
>  EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree);
>
> +static int __xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev,
> +                                       struct xenbus_map_node *node)
> +{
> +       struct gnttab_unmap_grant_ref op[XENBUS_MAX_RING_PAGES];
> +       unsigned int level;
> +       int i, j;
> +       int err = GNTST_okay;
> +
> +       j = 0;
> +       for (i = 0; i < node->nr_handles; i++) {
> +               unsigned long vaddr = (unsigned long)node->area->addr +
> +                       (PAGE_SIZE * i);
> +               if (node->handle[i] != INVALID_GRANT_HANDLE) {
> +                       memset(&op[j], 0, sizeof(op[0]));
> +                       op[j].host_addr = arbitrary_virt_to_machine(
> +                                       lookup_address(vaddr, &level)).maddr;
> +                       op[j].handle = node->handle[i];
> +                       j++;
> +                       node->handle[i] = INVALID_GRANT_HANDLE;
> +               }
> +       }
> +
> +       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, op, j))
> +               BUG();
> +
> +       node->nr_handles = 0;
> +
> +       for (i = 0; i < j; i++) {
> +               if (op[i].status != GNTST_okay) {
> +                       err = op[i].status;
> +                       xenbus_dev_error(dev, err,
> +                               "unmapping page %d at handle %d error %d",
> +                               i, op[i].handle, err);
> +               }
> +       }
> +
> +       if (err == GNTST_okay)
> +               free_vm_area(node->area);
> +
> +       kfree(node);
> +
> +       return err;
> +}
> +
>  static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)
>  {
>        struct xenbus_map_node *node;
> -       struct gnttab_unmap_grant_ref op = {
> -               .host_addr = (unsigned long)vaddr,
> -       };
> -       unsigned int level;
>
>        spin_lock(&xenbus_valloc_lock);
>        list_for_each_entry(node, &xenbus_valloc_pages, next) {
> @@ -626,33 +730,18 @@ static int xenbus_unmap_ring_vfree_pv(struct xenbus_device *dev, void *vaddr)
>
>        if (!node) {
>                xenbus_dev_error(dev, -ENOENT,
> -                                "can't find mapped virtual address %p", vaddr);
> +                               "can't find mapped virtual address %p", vaddr);
>                return GNTST_bad_virt_addr;
>        }
>
> -       op.handle = node->handle;
> -       op.host_addr = arbitrary_virt_to_machine(
> -               lookup_address((unsigned long)vaddr, &level)).maddr;
> -
> -       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
> -               BUG();
> -
> -       if (op.status == GNTST_okay)
> -               free_vm_area(node->area);
> -       else
> -               xenbus_dev_error(dev, op.status,
> -                                "unmapping page at handle %d error %d",
> -                                node->handle, op.status);
> -
> -       kfree(node);
> -       return op.status;
> +       return __xenbus_unmap_ring_vfree_pv(dev, node);
>  }
>
>  static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
>  {
>        int rv;
>        struct xenbus_map_node *node;
> -       void *addr;
> +       void *addr = NULL;
>
>        spin_lock(&xenbus_valloc_lock);
>        list_for_each_entry(node, &xenbus_valloc_pages, next) {
> @@ -668,14 +757,14 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
>
>        if (!node) {
>                xenbus_dev_error(dev, -ENOENT,
> -                                "can't find mapped virtual address %p", vaddr);
> +                               "can't find mapped virtual address %p", vaddr);
>                return GNTST_bad_virt_addr;
>        }
>
> -       rv = xenbus_unmap_ring(dev, node->handle, addr);
> +       rv = xenbus_unmap_ring(dev, node->handle, node->nr_handles, addr);
>
>        if (!rv)
> -               free_xenballooned_pages(1, &node->page);
> +               free_xenballooned_pages(node->nr_handles, &node->page);
>        else
>                WARN(1, "Leaking %p\n", vaddr);
>
> @@ -687,6 +776,7 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
>  * xenbus_unmap_ring
>  * @dev: xenbus device
>  * @handle: grant handle
> + * @nr_handles: number of grant handle
>  * @vaddr: addr to unmap
>  *
>  * Unmap a page of memory in this domain that was imported from another domain.
> @@ -694,21 +784,37 @@ static int xenbus_unmap_ring_vfree_hvm(struct xenbus_device *dev, void *vaddr)
>  * (see xen/include/interface/grant_table.h).
>  */
>  int xenbus_unmap_ring(struct xenbus_device *dev,
> -                     grant_handle_t handle, void *vaddr)
> +                       grant_handle_t handle[], int nr_handles,
> +                       void *vaddr)
>  {
> -       struct gnttab_unmap_grant_ref op;
> -
> -       gnttab_set_unmap_op(&op, (phys_addr_t)vaddr, GNTMAP_host_map, handle);
> +       struct gnttab_unmap_grant_ref op[XENBUS_MAX_RING_PAGES];
> +       int i, j;
> +       int err = GNTST_okay;
> +
> +       j = 0;
> +       for (i = 0; i < nr_handles; i++) {
> +               unsigned long addr = (unsigned long)vaddr +
> +                       (PAGE_SIZE * i);
> +               if (handle[i] != INVALID_GRANT_HANDLE) {
> +                       gnttab_set_unmap_op(&op[j++], (phys_addr_t)addr,
> +                                           GNTMAP_host_map, handle[i]);
> +                       handle[i] = INVALID_GRANT_HANDLE;
> +               }
> +       }
>
> -       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1))
> +       if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, op, j))
>                BUG();
>
> -       if (op.status != GNTST_okay)
> -               xenbus_dev_error(dev, op.status,
> -                                "unmapping page at handle %d error %d",
> -                                handle, op.status);
> +       for (i = 0; i < j; i++) {
> +               if (op[i].status != GNTST_okay) {
> +                       err = op[i].status;
> +                       xenbus_dev_error(dev, err,
> +                               "unmapping page at handle %d error %d",
> +                               handle[i], err);
> +               }
> +       }
>
> -       return op.status;
> +       return err;
>  }
>  EXPORT_SYMBOL_GPL(xenbus_unmap_ring);
>
> diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
> index 3864967..62b92d2 100644
> --- a/drivers/xen/xenbus/xenbus_probe.c
> +++ b/drivers/xen/xenbus/xenbus_probe.c
> @@ -718,6 +718,7 @@ static int __init xenstored_local_init(void)
>        return err;
>  }
>
> +extern void xenbus_ring_ops_init(void);
>  static int __init xenbus_init(void)
>  {
>        int err = 0;
> @@ -767,6 +768,8 @@ static int __init xenbus_init(void)
>        proc_mkdir("xen", NULL);
>  #endif
>
> +       xenbus_ring_ops_init();
> +
>  out_error:
>        return err;
>  }
> diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
> index e8c599b..cdbd948 100644
> --- a/include/xen/xenbus.h
> +++ b/include/xen/xenbus.h
> @@ -195,15 +195,23 @@ int xenbus_watch_pathfmt(struct xenbus_device *dev, struct xenbus_watch *watch,
>                         const char *pathfmt, ...);
>
>  int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state new_state);
> -int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn);
> -int xenbus_map_ring_valloc(struct xenbus_device *dev,
> -                          int gnt_ref, void **vaddr);
> -int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref,
> -                          grant_handle_t *handle, void *vaddr);
> +
> +#define        XENBUS_MAX_RING_ORDER   2
> +#define        XENBUS_MAX_RING_PAGES   (1 << XENBUS_MAX_RING_ORDER)
> +
> +#define INVALID_GRANT_HANDLE           (~0U)
> +
> +int xenbus_grant_ring(struct xenbus_device *dev, void *vaddr,
> +                     int nr_pages, int grefs[]);
> +int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref[],
> +                          int nr_grefs, void **vaddr);
> +int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref[], int nr_grefs,
> +                   grant_handle_t handle[], void *vaddr);
>
>  int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr);
>  int xenbus_unmap_ring(struct xenbus_device *dev,
> -                     grant_handle_t handle, void *vaddr);
> +                     grant_handle_t handle[], int nr_handles,
> +                     void *vaddr);
>
>  int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port);
>  int xenbus_bind_evtchn(struct xenbus_device *dev, int remote_port, int *port);
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Xen-devel] [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-06 17:20   ` [Xen-devel] " Konrad Rzeszutek Wilk
@ 2012-03-07  9:33     ` Jan Beulich
  2012-03-07 15:15       ` Konrad Rzeszutek Wilk
  2012-03-14  6:32       ` Justin Gibbs
  0 siblings, 2 replies; 15+ messages in thread
From: Jan Beulich @ 2012-03-07  9:33 UTC (permalink / raw)
  To: Santosh Jodh, konrad
  Cc: jeremy, Ian Campbell, netdev, konrad.wilk, waldi, joe.jin,
	weiyi.huang, linux-kernel, jbarnes, virtualization,
	paul.gortmaker, Paul Durrant, David Vrabel, linux-pci, akpm,
	xen-devel, lersek,

>>> On 06.03.12 at 18:20, Konrad Rzeszutek Wilk <konrad@darnok.org> wrote:
>  -> the usage of XenbusStateInitWait? Why do we introduce that? Looks
> like a fix to something.

No, this is required to get the negotiation working (the frontend must
not try to read the new nodes until it can be certain that the backend
populated them). However, as already pointed out in an earlier reply
to Santosh, the way this is done here doesn't appear to allow for the
backend to already be in InitWait state when the frontend gets
invoked.
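
As I read the intended handshake (my reconstruction, not something the
patch spells out):

	backend:  writes max-ring-page-order
	backend:  switches to XenbusStateInitWait
	frontend: sees InitWait, reads max-ring-page-order
	frontend: writes ring-ref%u, ring-page-order, event-channel
	frontend: switches to XenbusStateInitialised
	backend:  maps the ring pages, switches to XenbusStateConnected

The patch only calls talk_to_blkback() from the InitWait state-change
callback, so a backend that is already in InitWait by the time the
frontend comes up may never produce that transition.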

> -> XENBUS_MAX_RING_PAGES - why 2? Why not 4? What is the optimal
> default size for SSD usage? 16?

What do SSDs have to do with a XenBus definition? Imo it's wrong (and
unnecessary) to introduce a limit at the XenBus level at all - each driver
can do this for itself.

As to the limit for SSDs in the block interface - I don't think the number
of possibly simultaneous requests has anything to do with this. Instead,
I'd expect the request number/size/segments extension that NetBSD
apparently implements to possibly have an effect.

Jan

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Xen-devel] [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-07  9:33     ` Jan Beulich
@ 2012-03-07 15:15       ` Konrad Rzeszutek Wilk
  2012-03-14  6:32       ` Justin Gibbs
  1 sibling, 0 replies; 15+ messages in thread
From: Konrad Rzeszutek Wilk @ 2012-03-07 15:15 UTC (permalink / raw)
  To: Jan Beulich
  Cc: jeremy, Ian Campbell, konrad.wilk, waldi, netdev, joe.jin,
	linux-kernel, jbarnes, xen-devel, paul.gortmaker, Paul Durrant,
	weiyi.huang, Santosh Jodh, linux-pci, dgdegra, virtualization,
	lersek, akpm, David


On Mar 7, 2012 4:33 AM, "Jan Beulich" <JBeulich@suse.com> wrote:
>
> >>> On 06.03.12 at 18:20, Konrad Rzeszutek Wilk <konrad@darnok.org> wrote:
> >  -> the usage of XenbusStateInitWait? Why do we introduce that? Looks
> > like a fix to something.
>
> No, this is required to get the negotiation working (the frontend must
> not try to read the new nodes until it can be certain that the backend
> populated them). However, as already pointed out in an earlier reply
> to Santosh, the way this is done here doesn't appear to allow for the
> backend to already be in InitWait state when the frontend gets
> invoked.

OK.
>
> > -> XENBUS_MAX_RING_PAGES - why 2? Why not 4? What is the optimal
> > default size for SSD usage? 16?
>
> What do SSDs have to do with a XenBus definition? Imo it's wrong (and
> unnecessary) to introduce a limit at the XenBus level at all - each driver
> can do this for itself.

The patch description should mention what the benefit of the multi-page ring is.
>
> As to the limit for SSDs in the block interface - I don't think the number
> of possibly simultaneous requests has anything to do with this. Instead,
> I'd expect the request number/size/segments extension that NetBSD
> apparently implements to possibly have an effect.

.. which sounds to me like increasing the bandwidth of the protocol. Should
be mentioned somewhere in the git description.
>
> Jan
>
>


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Xen-devel] [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-07  9:33     ` Jan Beulich
  2012-03-07 15:15       ` Konrad Rzeszutek Wilk
@ 2012-03-14  6:32       ` Justin Gibbs
  2012-03-14  8:35         ` Jan Beulich
  2012-03-14 15:34         ` Jan Beulich
  1 sibling, 2 replies; 15+ messages in thread
From: Justin Gibbs @ 2012-03-14  6:32 UTC (permalink / raw)
  To: Jan Beulich
  Cc: jeremy, xen-devel, Ian Campbell, konrad.wilk, waldi, netdev,
	joe.jin, linux-kernel, jbarnes, weiyi.huang, paul.gortmaker,
	Paul Durrant, David Vrabel, Santosh Jodh, linux-pci,
	<konrad@darnok.org>,
	akpm, virtualization@lists.linux-foundation.org


On Mar 7, 2012, at 2:33 AM, Jan Beulich wrote:

>>>> On 06.03.12 at 18:20, Konrad Rzeszutek Wilk <konrad@darnok.org> wrote:
>> -> XENBUS_MAX_RING_PAGES - why 2? Why not 4? What is the optimal
>> default size for SSD usage? 16?
> 
> What do SSDs have to do with a XenBus definition? Imo it's wrong (and
> unnecessary) to introduce a limit at the XenBus level at all - each driver
> can do this for itself.
> 
> As to the limit for SSDs in the block interface - I don't think the number
> of possibly simultaneous requests has anything to do with this. Instead,
> I'd expect the request number/size/segments extension that NetBSD
> apparently implements to possibly have an effect.
> 
> Jan

There's another problem here that I brought up during the Xen
Hack-a-thon.  The ring macros require that the ring element count
be a power of two.  This doesn't mean that the ring will be a power
of 2 pages in size.  To illustrate this point, I modified the FreeBSD
blkback driver to provide negotiated ring stats via sysctl.

Here's a connection to a Windows VM running the Citrix PV drivers:

    dev.xbbd.2.max_requests: 128
    dev.xbbd.2.max_request_segments: 11
    dev.xbbd.2.max_request_size: 45056
    dev.xbbd.2.ring_elem_size: 108  <= 32bit ABI
    dev.xbbd.2.ring_pages: 4
    dev.xbbd.2.ring_elements: 128
    dev.xbbd.2.ring_waste: 2496

Over half a page is wasted when ring-page-order is 2.  I'm sure you
can see where this is going.  :-)
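
For the curious, the waste follows directly from the standard ring
macros - a back-of-the-envelope sketch (hypothetical helper, assuming
4k pages and the usual 64-byte sring header):

static unsigned int ring_waste(unsigned int ring_pages,
			       unsigned int elem_size)
{
	/* The ring macros round the element count down to a power of
	 * two; everything past the last element is dead space. */
	unsigned int space = ring_pages * 4096 - 64;
	unsigned int elems = rounddown_pow_of_two(space / elem_size);

	return space - elems * elem_size;
}

ring_waste(4, 108) = (16384 - 64) - 128 * 108 = 2496, matching the
sysctl output above.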

Here are the limits published by our backend to the XenStore:

    max-ring-pages = "113"
    max-ring-page-order = "7"
    max-requests = "256"
    max-request-segments = "129"
    max-request-size = "524288"

Because we allow so many concurrent, large requests in our product,
the ring wastage really adds up if the front end doesn't support
the "ring-pages" variant of the extension.  However, you only need
a ring-page-order of 3 with this protocol to start seeing pages of
wasted ring space.

You don't really want to negotiate "ring-pages" either.  The backends
often need to support multiple ABIs.  I can easily construct a set
of limits for the FreeBSD blkback driver which will cause the ring
limits to vary by a page between the 32bit and 64bit ABIs.

With all this in mind, the backend must do a dance of rounding up,
taking the max of the ring sizes for the different ABIs, and then
validating the front-end published limits taking its ABI into
account.  The front-end does some of this too.  It's way too messy
and error-prone because we don't communicate the ring element limit
directly.

"max-ring-element-order" anyone? :-)

--
Justin

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Xen-devel] [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-14  6:32       ` Justin Gibbs
@ 2012-03-14  8:35         ` Jan Beulich
  2012-03-14 15:34         ` Jan Beulich
  1 sibling, 0 replies; 15+ messages in thread
From: Jan Beulich @ 2012-03-14  8:35 UTC (permalink / raw)
  To: Justin Gibbs
  Cc: jeremy, Ian Campbell, netdev, konrad.wilk, waldi, joe.jin,
	weiyi.huang, linux-kernel, jbarnes, virtualization,
	paul.gortmaker, Paul Durrant, David Vrabel, Santosh Jodh,
	linux-pci, <konrad@darnok.org>, akpm,

>>> On 14.03.12 at 07:32, Justin Gibbs <justing@spectralogic.com> wrote:
> There's another problem here that I brought up during the Xen
> Hack-a-thon.  The ring macros require that the ring element count
> be a power of two.  This doesn't mean that the ring will be a power
> of 2 pages in size.  To illustrate this point, I modified the FreeBSD
> blkback driver to provide negotiated ring stats via sysctl.
> 
> Here's a connection to a Windows VM running the Citrix PV drivers:
> 
>     dev.xbbd.2.max_requests: 128
>     dev.xbbd.2.max_request_segments: 11
>     dev.xbbd.2.max_request_size: 45056
>     dev.xbbd.2.ring_elem_size: 108  <= 32bit ABI
>     dev.xbbd.2.ring_pages: 4
>     dev.xbbd.2.ring_elements: 128
>     dev.xbbd.2.ring_waste: 2496
> 
> Over half a page is wasted when ring-page-order is 2.  I'm sure you
> can see where this is going.  :-)
> 
> Here are the limits published by our backend to the XenStore:
> 
>     max-ring-pages = "113"
>     max-ring-page-order = "7"
>     max-requests = "256"
>     max-request-segments = "129"
>     max-request-size = "524288"
> 
> Because we allow so many concurrent, large requests in our product,
> the ring wastage really adds up if the front end doesn't support
> the "ring-pages" variant of the extension.  However, you only need
> a ring-page-order of 3 with this protocol to start seeing pages of
> wasted ring space.
> 
> You don't really want to negotiate "ring-pages" either.  The backends
> often need to support multiple ABIs.  I can easily construct a set
> of limits for the FreeBSD blkback driver which will cause the ring
> limits to vary by a page between the 32bit and 64bit ABIs.
> 
> With all this in mind, the backend must do a dance of rounding up,
> taking the max of the ring sizes for the different ABIs, and then
> validating the front-end published limits taking its ABI into
> account.  The front-end does some of this too.  Its way too messy
> and error prone because we don't communicate the ring element limit
> directly.
> 
> "max-ring-element-order" anyone? :-)

Interesting observation - yes, I think deprecating both pre-existing
methods in favor of something along those lines would be desirable.
(But I'd favor not using the term "order" here as it is - at least in
Linux - usually implied to be used on pages. "max-ringent-log2"
perhaps?)
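
(To make the suggestion concrete: a backend advertising, say,
max-ringent-log2 = "8" would be promising space for up to 256 ring
entries, and each side would derive its own page count from the
element size of the ABI it speaks - no page-based limit needed.)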

What you say also implies that all of the Linux backend patches
currently floating around are flawed in how they calculate the number
of ring entries, as this number really depends on the protocol the
frontend advertises.

Further, if you're concerned about wasting ring space (and
particularly in the context of your request number/size/segments
extension), shouldn't we bother to define pairs (or larger groups)
of struct blkif_request_segment (as currently a quarter of the space
is mere padding)? Or split grefs from {first,last}_sect altogether?
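
(The layout in question, from the public blkif interface:

	struct blkif_request_segment {
		grant_ref_t gref;	/* 4 bytes */
		uint8_t first_sect;	/* 1 byte  */
		uint8_t last_sect;	/* 1 byte  */
		/* + 2 bytes of tail padding to round the size up to 8 */
	};

i.e. 2 of every 8 bytes in the segment array go unused.)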

Finally, while looking at all this again, I stumbled across the use
of blkif_vdev_t in the ring structures: At least Linux'es blkback
completely ignores this field - {xen_,}vbd_translate() simply
overwrites what dispatch_rw_block_io() put there (and with this,
struct phys_req's dev and bdev members seem rather pointless too).
Does anyone recall what the original intention with this request field
was? Allowing I/O on multiple devices over a single ring?

Bottom line - shouldn't we define a blkif2 interface to cleanly
accommodate all the various extensions (and do away with the
protocol variations)?

Jan

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Xen-devel] [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-14  6:32       ` Justin Gibbs
  2012-03-14  8:35         ` Jan Beulich
@ 2012-03-14 15:34         ` Jan Beulich
  2012-03-14 17:01           ` Justin Gibbs
  1 sibling, 1 reply; 15+ messages in thread
From: Jan Beulich @ 2012-03-14 15:34 UTC (permalink / raw)
  To: Justin Gibbs
  Cc: jeremy, Ian Campbell, netdev, konrad.wilk, waldi, joe.jin,
	weiyi.huang, linux-kernel, jbarnes, virtualization,
	paul.gortmaker, Paul Durrant, David Vrabel, Santosh Jodh,
	linux-pci, <konrad@darnok.org>, akpm,

>>> On 14.03.12 at 07:32, Justin Gibbs <justing@spectralogic.com> wrote:
> There's another problem here that I brought up during the Xen
> Hack-a-thon.  The ring macros require that the ring element count
> be a power of two.  This doesn't mean that the ring will be a power
> of 2 pages in size.  To illustrate this point, I modified the FreeBSD
> blkback driver to provide negotiated ring stats via sysctl.
> 
> Here's a connection to a Windows VM running the Citrix PV drivers:
> 
>     dev.xbbd.2.max_requests: 128
>     dev.xbbd.2.max_request_segments: 11
>     dev.xbbd.2.max_request_size: 45056
>     dev.xbbd.2.ring_elem_size: 108  <= 32bit ABI
>     dev.xbbd.2.ring_pages: 4
>     dev.xbbd.2.ring_elements: 128
>     dev.xbbd.2.ring_waste: 2496
> 
> Over half a page is wasted when ring-page-order is 2.  I'm sure you
> can see where this is going.  :-)

Having looked a little more closely at how the wasted space grows,
I find myself in the odd position that I can't explain the original (and
still active) definition of BLKIF_MAX_SEGMENTS_PER_REQUEST (11):
with ring-order zero, there are 0x240/0x1c0 bytes (32/64-bit
respectively) unused. With 32 requests fitting in the ring, and with
each segment occupying 6 bytes (padded to 8), in the 64-bit variant
there's enough space for a 12th segment (32-bit would even have
space for a 13th). Am I missing anything here?
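
(For reference, my arithmetic: the 64-bit request is 24 bytes of header
plus 11 segments of 8 bytes each = 112 bytes, so (4096 - 64) / 112 = 36
slots, rounded down to 32 by the ring macros, leaving
4032 - 32 * 112 = 448 = 0x1c0 bytes; the 32-bit request comes to 108
bytes, leaving 4032 - 32 * 108 = 576 = 0x240. A 12th segment would make
the 64-bit request 120 bytes, and (4096 - 64) / 120 = 33 still rounds
down to 32 entries.)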

Plus all this assumes a page size of 4k, yet ia64 has always been using
pages of 16k iirc.

> Here are the limits published by our backend to the XenStore:
>
>     max-ring-pages = "113"
>     max-ring-page-order = "7"
>     max-requests = "256"
>     max-request-segments = "129"
>     max-request-size = "524288"

Oh, so this protocol doesn't require ring-pages (and max-ring-pages)
to be a power of two? In which case I think it is a mistake to also
advertise max-ring-page-order, as at least the (Linux) frontend code
I know of interprets this as being able to set up a ring of (using the
numbers above) 128 pages (unless, of course, your backend can deal
with this regardless of the max-ring-pages value it announces).

Jan

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Xen-devel] [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-14 15:34         ` Jan Beulich
@ 2012-03-14 17:01           ` Justin Gibbs
  0 siblings, 0 replies; 15+ messages in thread
From: Justin Gibbs @ 2012-03-14 17:01 UTC (permalink / raw)
  To: Jan Beulich
  Cc: jeremy, Ian Campbell, netdev, konrad.wilk, waldi, joe.jin,
	weiyi.huang, linux-kernel, jbarnes, virtualization,
	paul.gortmaker, Paul Durrant, David Vrabel, Santosh Jodh,
	linux-pci, <konrad@darnok.org>, akpm,

On Mar 14, 2012, at 9:34 AM, Jan Beulich wrote:

>>>> On 14.03.12 at 07:32, Justin Gibbs <justing@spectralogic.com> wrote:
>> There's another problem here that I brought up during the Xen
>> Hack-a-thon.  The ring macros require that the ring element count
>> be a power of two.  This doesn't mean that the ring will be a power
>> of 2 pages in size.  To illustrate this point, I modified the FreeBSD
>> blkback driver to provide negotiated ring stats via sysctl.
>> 
>> Here's a connection to a Windows VM running the Citrix PV drivers:
>> 
>>    dev.xbbd.2.max_requests: 128
>>    dev.xbbd.2.max_request_segments: 11
>>    dev.xbbd.2.max_request_size: 45056
>>    dev.xbbd.2.ring_elem_size: 108  <= 32bit ABI
>>    dev.xbbd.2.ring_pages: 4
>>    dev.xbbd.2.ring_elements: 128
>>    dev.xbbd.2.ring_waste: 2496
>> 
>> Over half a page is wasted when ring-page-order is 2.  I'm sure you
>> can see where this is going.  :-)
> 
> Having looked a little closer on how the wasted space is progressing,
> I find myself in the odd position that I can't explain the original (and
> still active) definition of BLKIF_MAX_SEGMENTS_PER_REQUEST (11):
> With ring-order zero, there are 0x240/0x1c0 bytes (32/64-bit
> respectively) unused. With 32 requests fitting in the ring, and with
> each segment occupying 6 bytes (padded to 8), in the 64-bit variant
> there's enough space for a 12th segment (32-bit would even have
> space for a 13th). Am I missing anything here?

I don't profess to know the real reason, but the only thing I can come up
with is a requirement/desire on some platforms for 16-byte alignment
of the request structures.  This would make the largest possible structure
112 bytes, not the 120 that would allow for more elements.
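
If alignment really is the reason, it is cheap to pin down at build
time. A compile-time sketch (C11; the 24-byte request header and 8-byte
padded segment are inferred from the arithmetic above, not read out of
blkif.h):

#include <assert.h>	/* C11 static_assert */

/* sizes inferred from the thread, not from blkif.h */
enum { REQ_HDR = 24, SEG = 8 };

static_assert((REQ_HDR + 11 * SEG) % 16 == 0,
	      "11 segments keep the request 16-byte aligned (112 bytes)");
static_assert((REQ_HDR + 12 * SEG) % 16 != 0,
	      "a 12th segment (120 bytes) would break 16-byte alignment");

int main(void) { return 0; }	/* nothing to check at run time */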

While we're talking about fixing ring data structures, can RING_IDX
be defined as a "uint32_t" instead of "unsigned int"?  The structure
padding in the ring macros assumes RING_IDX is exactly 4 bytes,
so this should be made explicit.  ILP64 machines may still be a ways
off, but using non-fixed-width types in places where size really
matters just isn't clean.
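
A sketch of what that could look like, with a guard so the padding
assumption fails at build time rather than on the wire (this is not the
current xen/interface definition, just the suggested shape):

#include <stdint.h>

typedef uint32_t RING_IDX;	/* instead of: unsigned int */

/* the ring macros compute structure padding assuming a 4-byte index */
_Static_assert(sizeof(RING_IDX) == 4, "RING_IDX must be exactly 4 bytes");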

> 
>> Here are the limits published by our backend to the XenStore:
>> 
>>    max-ring-pages = "113"
>>    max-ring-page-order = "7"
>>    max-requests = "256"
>>    max-request-segments = "129"
>>    max-request-size = "524288"
> 
> Oh, so this protocol doesn't require ring-pages (and max-ring-pages)
> to be a power of two? In which case I think it is a mistake to also
> advertise max-ring-page-order, as at least the (Linux) frontend code
> I know of interprets this as being able to set up a ring of (using the
> numbers above) 128 pages (unless, of course, your backend can deal
> with this regardless of the max-ring-pages value it announces).

The advertised max-ring-pages is sufficient to hold the maximum allowed
number of ring elements regardless of ABI.  This is then rounded up to the
next power-of-2 page count to get max-ring-page-order.  When the front-end
negotiates, the backend just verifies that the maximum number of ring
elements in the specified ring size doesn't exceed the backend's limit.
Fortunately, even with this large of a ring, regardless of ABI, a given
page order computes to the same number of ring elements.  You just have
more wasted space.
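
A sketch of that arithmetic, under the same hypothetical slot sizes as
before: 113 pages rounds up to order 7 (128 pages), and for any given
order both ABIs round down to the same power-of-2 element count:

#include <stdio.h>

/* 4k pages, 64-byte header, 108/112-byte slots, as assumed earlier */
static unsigned ring_elems(unsigned order, unsigned slot)
{
	unsigned bytes = (4096u << order) - 64;
	unsigned fit = bytes / slot, n;

	for (n = 1; n * 2 <= fit; n *= 2)	/* power-of-2 round-down */
		;
	return n;
}

int main(void)
{
	unsigned pages = 113, order = 0;

	while ((1u << order) < pages)	/* round up to a power of 2 */
		order++;
	printf("max-ring-pages %u -> max-ring-page-order %u (%u pages)\n",
	       pages, order, 1u << order);
	printf("order %u: %u (32-bit) vs %u (64-bit) elements\n",
	       order, ring_elems(order, 108), ring_elems(order, 112));
	return 0;
}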

--
Justin


* Re: [Xen-devel] [PATCH 0001/001] xen: multi page ring support for block devices
  2012-03-06  8:34   ` Jan Beulich
@ 2012-03-14 17:17     ` Justin T. Gibbs
  0 siblings, 0 replies; 15+ messages in thread
From: Justin T. Gibbs @ 2012-03-14 17:17 UTC (permalink / raw)
  To: Jan Beulich
  Cc: jeremy, xen-devel, Ian Campbell, konrad.wilk, waldi, netdev,
	joe.jin, linux-kernel, jbarnes, weiyi.huang, paul.gortmaker,
	Paul Durrant, David Vrabel, Santosh Jodh, linux-pci, akpm,
	virtualization,

On Mar 6, 2012, at 1:34 AM, Jan Beulich wrote:

>>>> On 05.03.12 at 22:49, Santosh Jodh <Santosh.Jodh@citrix.com> wrote:
>> +       }
>> +
>>        /* Create shared ring, alloc event channel. */
>>        err = setup_blkring(dev, info);
>>        if (err)
>> @@ -889,12 +916,35 @@ again:
>>                goto destroy_blkring;
>>        }
>> 
>> -       err = xenbus_printf(xbt, dev->nodename,
>> -                           "ring-ref", "%u", info->ring_ref);
>> -       if (err) {
>> -               message = "writing ring-ref";
>> -               goto abort_transaction;
>> +       if (legacy_backend) {
> 
> Why not use the simpler interface always when info->ring_order == 0?

Because, as I just found out today via a FreeBSD bug report, that's
not how XenServer works.  If the front-end publishes "ring-page-order",
the backend assumes the "ring-refNN" XenStore nodes are in effect,
even if the order is 0.
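
To spell out the rule with a hypothetical example (made-up grant
reference, not read from any real driver), the frontend's nodes end up
looking like this even at order 0:

#include <stdio.h>

int main(void)
{
	unsigned order = 0;		/* negotiated ring-page-order */
	unsigned refs[] = { 8 };	/* made-up grant reference */

	printf("ring-page-order = \"%u\"\n", order);
	for (unsigned i = 0; i < (1u << order); i++)
		printf("ring-ref%u = \"%u\"\n", i, refs[i]);
	/* no bare "ring-ref" node: that spelling is legacy-only */
	return 0;
}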

I'm working on a documentation update for blkif.h now.

<sigh>

--
Justin


end of thread (newest: 2012-03-14 17:17 UTC)

Thread overview: 15+ messages
2012-03-02 15:11 [PATCH 3/3] netxen: qlogic ethernet : Fix Endian Bug santosh nayak
2012-03-02 17:22 ` Rajesh Borundia
2012-03-05 21:49 ` [PATCH 0001/001] xen: multi page ring support for block devices Santosh Jodh
2012-03-06  2:42   ` Rusty Russell
2012-03-06  6:21     ` Santosh Jodh
2012-03-06  8:34   ` Jan Beulich
2012-03-14 17:17     ` [Xen-devel] " Justin T. Gibbs
2012-03-06 11:16   ` Wei Liu
2012-03-06 17:20   ` [Xen-devel] " Konrad Rzeszutek Wilk
2012-03-07  9:33     ` Jan Beulich
2012-03-07 15:15       ` Konrad Rzeszutek Wilk
2012-03-14  6:32       ` Justin Gibbs
2012-03-14  8:35         ` Jan Beulich
2012-03-14 15:34         ` Jan Beulich
2012-03-14 17:01           ` Justin Gibbs
