From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S1753776AbbDAO41 (ORCPT ); Wed, 1 Apr 2015 10:56:27 -0400 Received: from aserp1040.oracle.com ([141.146.126.69]:47183 "EHLO aserp1040.oracle.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1753734AbbDAO4F (ORCPT ); Wed, 1 Apr 2015 10:56:05 -0400 Date: Wed, 1 Apr 2015 10:55:57 -0400 From: Konrad Rzeszutek Wilk To: Bob Liu Cc: xen-devel@lists.xenproject.org, paul.durrant@citrix.com, david.vrabel@citrix.com, roger.pau@citrix.com, linux-kernel@vger.kernel.org Subject: Re: [PATCH v2 2/2] xen/block: add multi-page ring support Message-ID: <20150401145557.GC28938@l.oracle.com> References: <1427804127-11372-1-git-send-email-bob.liu@oracle.com> <1427804127-11372-2-git-send-email-bob.liu@oracle.com> <551BDBD3.2080200@oracle.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <551BDBD3.2080200@oracle.com> User-Agent: Mutt/1.5.23 (2014-03-12) X-Source-IP: aserv0021.oracle.com [141.146.126.233] Sender: linux-kernel-owner@vger.kernel.org List-ID: X-Mailing-List: linux-kernel@vger.kernel.org On Wed, Apr 01, 2015 at 07:51:47PM +0800, Bob Liu wrote: > > On 03/31/2015 08:15 PM, Bob Liu wrote: > > Extend xen/block to support multi-page ring, so that more requests can be issued > > by using more than one pages as the request ring between blkfront and backend. > > As a result, the performance can get improved significantly. > > > > We got some impressive improvements on our highend iscsi storage cluster backend. > > If using 64 pages as the ring, the IOPS increased about 15 times for the > > throughput testing and above doubled for the latency testing. > > > > The reason was the limit on outstanding requests is 32 if use only one-page > > ring, but in our case the iscsi lun was spread across about 100 physical drives, > > 32 was really not enough to keep them busy. 
> > > > Change in V2: > > * Rebased to 4.0-rc6 > > * Add description on how this protocol works into io/blkif.h > > > > I found there were already some related descriptions in file > xen/include/public/io/blkif.h. I'll send an new patch following the > protocol in that file, and leave file include/xen/interface/io/blkif.h > unchanged. > > 190 * max-ring-pages > 191 * Values: > 192 * Default Value: 1 > 193 * Notes: DEPRECATED, 2, 3 Well, perhaps remove the 'DEPRECATED' part? > 194 * > 195 * The maximum supported size of the request ring buffer in > units of > 196 * machine pages. The value must be a power of 2. > > 302 * num-ring-pages > 303 * Values: > 304 * Default Value: 1 > 305 * Maximum Value: MAX(max-ring-pages,(0x1 << max-ring-page-order)) > 306 * Notes: DEPRECATED, 2, 3 > 307 * > 308 * The size of the frontend allocated request ring buffer in > units of > 309 * machine pages. The value must be a power of 2. > 310 * > > Regards, > -Bob > > > Signed-off-by: Bob Liu > > --- > > drivers/block/xen-blkback/blkback.c | 2 +- > > drivers/block/xen-blkback/common.h | 3 +- > > drivers/block/xen-blkback/xenbus.c | 83 ++++++++++++++++++++------- > > drivers/block/xen-blkfront.c | 111 +++++++++++++++++++++++++----------- > > include/xen/interface/io/blkif.h | 13 +++++ > > 5 files changed, 156 insertions(+), 56 deletions(-) > > > > diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c > > index 0c8da82..072d15b 100644 > > --- a/drivers/block/xen-blkback/blkback.c > > +++ b/drivers/block/xen-blkback/blkback.c > > @@ -628,7 +628,7 @@ purge_gnt_list: > > } > > > > /* Shrink if we have more than xen_blkif_max_buffer_pages */ > > - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); > > + shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages * blkif->nr_ring_pages); > > > > if (log_stats && time_after(jiffies, blkif->st_print)) > > print_stats(blkif); > > diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h 
> > index 375d288..320ca35 100644 > > --- a/drivers/block/xen-blkback/common.h > > +++ b/drivers/block/xen-blkback/common.h > > @@ -254,7 +254,7 @@ struct backend_info; > > #define PERSISTENT_GNT_WAS_ACTIVE 1 > > > > /* Number of requests that we can fit in a ring */ > > -#define XEN_BLKIF_REQS 32 > > +#define XEN_BLKIF_REQS (32 * XENBUS_MAX_RING_PAGES) > > > > struct persistent_gnt { > > struct page *page; > > @@ -326,6 +326,7 @@ struct xen_blkif { > > struct work_struct free_work; > > /* Thread shutdown wait queue. */ > > wait_queue_head_t shutdown_wq; > > + int nr_ring_pages; > > }; > > > > struct seg_buf { > > diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c > > index ff30259..b75be66 100644 > > --- a/drivers/block/xen-blkback/xenbus.c > > +++ b/drivers/block/xen-blkback/xenbus.c > > @@ -193,8 +193,8 @@ fail: > > return ERR_PTR(-ENOMEM); > > } > > > > -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, > > - unsigned int evtchn) > > +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, > > + unsigned int nr_grefs, unsigned int evtchn) > > { > > int err; > > > > @@ -202,7 +202,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, > > if (blkif->irq) > > return 0; > > > > - err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1, > > + err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, > > &blkif->blk_ring); > > if (err < 0) > > return err; > > @@ -212,21 +212,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref, > > { > > struct blkif_sring *sring; > > sring = (struct blkif_sring *)blkif->blk_ring; > > - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); > > + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs); > > break; > > } > > case BLKIF_PROTOCOL_X86_32: > > { > > struct blkif_x86_32_sring *sring_x86_32; > > sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; > > - BACK_RING_INIT(&blkif->blk_rings.x86_32, 
sring_x86_32, PAGE_SIZE); > > + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs); > > break; > > } > > case BLKIF_PROTOCOL_X86_64: > > { > > struct blkif_x86_64_sring *sring_x86_64; > > sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; > > - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); > > + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs); > > break; > > } > > default: > > @@ -588,6 +588,10 @@ static int xen_blkbk_probe(struct xenbus_device *dev, > > if (err) > > goto fail; > > > > + err = xenbus_printf(XBT_NIL, dev->nodename, "feature-multi-page-ring", "%u", 1); > > + if (err) > > + goto fail; > > + > > err = xenbus_switch_state(dev, XenbusStateInitWait); > > if (err) > > goto fail; > > @@ -852,22 +856,61 @@ again: > > static int connect_ring(struct backend_info *be) > > { > > struct xenbus_device *dev = be->dev; > > - unsigned long ring_ref; > > - unsigned int evtchn; > > + unsigned int ring_ref[XENBUS_MAX_RING_PAGES]; > > + unsigned int evtchn, nr_grefs; > > unsigned int pers_grants; > > char protocol[64] = ""; > > int err; > > > > DPRINTK("%s", dev->otherend); > > - > > - err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", > > - &ring_ref, "event-channel", "%u", &evtchn, NULL); > > - if (err) { > > - xenbus_dev_fatal(dev, err, > > - "reading %s/ring-ref and event-channel", > > + err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u", > > + &evtchn); > > + if (err != 1) { > > + err = -EINVAL; > > + xenbus_dev_fatal(dev, err, "reading %s/event-channel", > > dev->otherend); > > return err; > > } > > + pr_info(DRV_PFX "event-channel %u\n", evtchn); > > + > > + err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-pages", "%u", > > + &nr_grefs); > > + if (err != 1) { > > + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", > > + "%u", &ring_ref[0]); > > + if (err != 1) { > > + err = -EINVAL; > > + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", > > + 
dev->otherend); > > + return err; > > + } > > + nr_grefs = 1; > > + pr_info(DRV_PFX "%s:using single page: ring-ref %d\n", > > + dev->otherend, ring_ref[0]); > > + } else { > > + unsigned int i; > > + > > + if (nr_grefs > XENBUS_MAX_RING_PAGES) { > > + err = -EINVAL; > > + xenbus_dev_fatal(dev, err, "%s/request %d ring pages exceed max", > > + dev->otherend, nr_grefs); > > + return err; > > + } > > + for (i = 0; i < nr_grefs; i++) { > > + char ring_ref_name[15]; > > + > > + snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i); > > + err = xenbus_scanf(XBT_NIL, dev->otherend, > > + ring_ref_name, "%d", &ring_ref[i]); > > + if (err != 1) { > > + err = -EINVAL; > > + xenbus_dev_fatal(dev, err, "reading %s/%s", > > + dev->otherend, ring_ref_name); > > + return err; > > + } > > + pr_info(DRV_PFX "ring-ref%u: %d\n", i, ring_ref[i]); > > + } > > + } > > > > be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; > > err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", > > @@ -892,16 +935,16 @@ static int connect_ring(struct backend_info *be) > > > > be->blkif->vbd.feature_gnt_persistent = pers_grants; > > be->blkif->vbd.overflow_max_grants = 0; > > + be->blkif->nr_ring_pages = nr_grefs; > > > > - pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n", > > - ring_ref, evtchn, be->blkif->blk_protocol, protocol, > > - pers_grants ? "persistent grants" : ""); > > + pr_info(DRV_PFX "ring-pages:%d, event-channel %d, protocol %d (%s) %s\n", > > + nr_grefs, evtchn, be->blkif->blk_protocol, protocol, > > + pers_grants ? "persistent grants" : ""); > > > > /* Map the shared frame, irq etc. 
*/ > > - err = xen_blkif_map(be->blkif, ring_ref, evtchn); > > + err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); > > if (err) { > > - xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", > > - ring_ref, evtchn); > > + xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); > > return err; > > } > > > > diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c > > index 2c61cf8..528e4f6 100644 > > --- a/drivers/block/xen-blkfront.c > > +++ b/drivers/block/xen-blkfront.c > > @@ -98,7 +98,12 @@ static unsigned int xen_blkif_max_segments = 32; > > module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); > > MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); > > > > -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE) > > +static unsigned int xen_blkif_max_ring_pages = 1; > > +module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, 0644); > > +MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as the ring"); > > + > > +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * info->nr_ring_pages) > > +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES) > > > > /* > > * We have one of these per vbd, whether ide, scsi or 'other'. 
They > > @@ -114,13 +119,14 @@ struct blkfront_info > > int vdevice; > > blkif_vdev_t handle; > > enum blkif_state connected; > > - int ring_ref; > > + int ring_ref[XENBUS_MAX_RING_PAGES]; > > + int nr_ring_pages; > > struct blkif_front_ring ring; > > unsigned int evtchn, irq; > > struct request_queue *rq; > > struct work_struct work; > > struct gnttab_free_callback callback; > > - struct blk_shadow shadow[BLK_RING_SIZE]; > > + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; > > struct list_head grants; > > struct list_head indirect_pages; > > unsigned int persistent_gnts_c; > > @@ -139,8 +145,6 @@ static unsigned int nr_minors; > > static unsigned long *minors; > > static DEFINE_SPINLOCK(minor_lock); > > > > -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ > > - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) > > #define GRANT_INVALID_REF 0 > > > > #define PARTS_PER_DISK 16 > > @@ -1033,12 +1037,15 @@ free_shadow: > > flush_work(&info->work); > > > > /* Free resources associated with old device channel. 
*/ > > - if (info->ring_ref != GRANT_INVALID_REF) { > > - gnttab_end_foreign_access(info->ring_ref, 0, > > - (unsigned long)info->ring.sring); > > - info->ring_ref = GRANT_INVALID_REF; > > - info->ring.sring = NULL; > > + for (i = 0; i < info->nr_ring_pages; i++) { > > + if (info->ring_ref[i] != GRANT_INVALID_REF) { > > + gnttab_end_foreign_access(info->ring_ref[i], 0, 0); > > + info->ring_ref[i] = GRANT_INVALID_REF; > > + } > > } > > + free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); > > + info->ring.sring = NULL; > > + > > if (info->irq) > > unbind_from_irqhandler(info->irq, info); > > info->evtchn = info->irq = 0; > > @@ -1245,26 +1252,30 @@ static int setup_blkring(struct xenbus_device *dev, > > struct blkfront_info *info) > > { > > struct blkif_sring *sring; > > - grant_ref_t gref; > > - int err; > > + int err, i; > > + unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE; > > + grant_ref_t gref[XENBUS_MAX_RING_PAGES]; > > > > - info->ring_ref = GRANT_INVALID_REF; > > + for (i = 0; i < XENBUS_MAX_RING_PAGES; i++) > > + info->ring_ref[i] = GRANT_INVALID_REF; > > > > - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH); > > + sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, > > + get_order(ring_size)); > > if (!sring) { > > xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); > > return -ENOMEM; > > } > > SHARED_RING_INIT(sring); > > - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); > > + FRONT_RING_INIT(&info->ring, sring, ring_size); > > > > - err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref); > > + err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); > > if (err < 0) { > > free_page((unsigned long)sring); > > info->ring.sring = NULL; > > goto fail; > > } > > - info->ring_ref = gref; > > + for (i = 0; i < info->nr_ring_pages; i++) > > + info->ring_ref[i] = gref[i]; > > > > err = xenbus_alloc_evtchn(dev, &info->evtchn); > > if (err) > > @@ 
-1292,7 +1303,14 @@ static int talk_to_blkback(struct xenbus_device *dev, > > { > > const char *message = NULL; > > struct xenbus_transaction xbt; > > - int err; > > + int err, i, multi_ring_pages = 0; > > + > > + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, > > + "feature-multi-page-ring", "%u", &multi_ring_pages); > > + if (err != 1) > > + info->nr_ring_pages = 1; > > + else > > + info->nr_ring_pages = xen_blkif_max_ring_pages; > > > > /* Create shared ring, alloc event channel. */ > > err = setup_blkring(dev, info); > > @@ -1306,11 +1324,31 @@ again: > > goto destroy_blkring; > > } > > > > - err = xenbus_printf(xbt, dev->nodename, > > - "ring-ref", "%u", info->ring_ref); > > - if (err) { > > - message = "writing ring-ref"; > > - goto abort_transaction; > > + if (info->nr_ring_pages == 1) { > > + err = xenbus_printf(xbt, dev->nodename, > > + "ring-ref", "%u", info->ring_ref[0]); > > + if (err) { > > + message = "writing ring-ref"; > > + goto abort_transaction; > > + } > > + } else { > > + err = xenbus_printf(xbt, dev->nodename, > > + "max-ring-pages", "%u", info->nr_ring_pages); > > + if (err) { > > + message = "writing max-ring-pages"; > > + goto abort_transaction; > > + } > > + > > + for (i = 0; i < info->nr_ring_pages; i++) { > > + char ring_ref_name[15]; > > + sprintf(ring_ref_name, "ring-ref%d", i); > > + err = xenbus_printf(xbt, dev->nodename, > > + ring_ref_name, "%d", info->ring_ref[i]); > > + if (err) { > > + message = "writing ring-ref"; > > + goto abort_transaction; > > + } > > + } > > } > > err = xenbus_printf(xbt, dev->nodename, > > "event-channel", "%u", info->evtchn); > > @@ -1338,6 +1376,7 @@ again: > > goto destroy_blkring; > > } > > > > + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; > > xenbus_switch_state(dev, XenbusStateInitialised); > > > > return 0; > > @@ -1422,21 +1461,13 @@ static int blkfront_probe(struct xenbus_device *dev, > > info->connected = BLKIF_STATE_DISCONNECTED; > > INIT_WORK(&info->work, blkif_restart_queue); 
> > > > - for (i = 0; i < BLK_RING_SIZE; i++) > > + for (i = 0; i < BLK_MAX_RING_SIZE; i++) > > info->shadow[i].req.u.rw.id = i+1; > > - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; > > > > /* Front end dir is a number, which is used as the id. */ > > info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); > > dev_set_drvdata(&dev->dev, info); > > > > - err = talk_to_blkback(dev, info); > > - if (err) { > > - kfree(info); > > - dev_set_drvdata(&dev->dev, NULL); > > - return err; > > - } > > - > > return 0; > > } > > > > @@ -1476,7 +1507,7 @@ static int blkif_recover(struct blkfront_info *info) > > > > /* Stage 2: Set up free list. */ > > memset(&info->shadow, 0, sizeof(info->shadow)); > > - for (i = 0; i < BLK_RING_SIZE; i++) > > + for (i = 0; i < BLK_MAX_RING_SIZE; i++) > > info->shadow[i].req.u.rw.id = i+1; > > info->shadow_free = info->ring.req_prod_pvt; > > info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; > > @@ -1906,8 +1937,17 @@ static void blkback_changed(struct xenbus_device *dev, > > dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state); > > > > switch (backend_state) { > > - case XenbusStateInitialising: > > case XenbusStateInitWait: > > + /* > > + * Talk to blkback after the backend enters InitWait so we can know > > + * whether the backend supports feature-multi-page-ring. 
> > + */ > > + if (talk_to_blkback(dev, info)) { > > + kfree(info); > > + dev_set_drvdata(&dev->dev, NULL); > > + } > > + break; > > + case XenbusStateInitialising: > > case XenbusStateInitialised: > > case XenbusStateReconfiguring: > > case XenbusStateReconfigured: > > @@ -2088,6 +2128,9 @@ static int __init xlblk_init(void) > > { > > int ret; > > > > + if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES) > > + return -EINVAL; > > + > > if (!xen_domain()) > > return -ENODEV; > > > > diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h > > index c33e1c4..9c2c9a7 100644 > > --- a/include/xen/interface/io/blkif.h > > +++ b/include/xen/interface/io/blkif.h > > @@ -127,6 +127,19 @@ typedef uint64_t blkif_sector_t; > > #define BLKIF_OP_INDIRECT 6 > > > > /* > > + * "feature-multi-page-ring" is introduced to use more than one page as the > > + * request ring between blkfront and backend. As a result, more requests can be > > + * issued from blkfront and performance improves accordingly. > > + * > > + * If supported, the backend will write the key 'feature-multi-page-ring' > > + * for that vbd. > > + * Frontends that are aware of this feature and wish to use it will write the key > > + * "max-ring-pages" to set the number of pages that they wish to be used as the > > + * ring. The 'xen_blkif_max_ring_pages' parameter is introduced; its default > > + * is still one page and its maximum is XENBUS_MAX_RING_PAGES. > > + */ > > + > > +/* > > * Maximum scatter/gather segments per request. > > * This is carefully chosen so that sizeof(struct blkif_ring) <= PAGE_SIZE. > > * NB. This could be 12 if the ring indexes weren't stored in the same page. > >