From mboxrd@z Thu Jan 1 00:00:00 1970
From: Daniel Stodden <daniel.stodden@citrix.com>
Subject: [PATCH 2/5] blktap: Make VMAs non-foreign and bounce buffered.
Date: Fri, 12 Nov 2010 15:31:44 -0800
Message-ID: <1289604707-13378-3-git-send-email-daniel.stodden@citrix.com>
In-Reply-To: <1289604707-13378-1-git-send-email-daniel.stodden@citrix.com>
References: <1289604707-13378-1-git-send-email-daniel.stodden@citrix.com>
Sender: xen-devel-bounces@lists.xensource.com
Errors-To: xen-devel-bounces@lists.xensource.com
To: Xen <xen-devel@lists.xensource.com>
Cc: Jeremy Fitzhardinge, Daniel Stodden
List-Id: xen-devel@lists.xenproject.org

Drop zero-copy I/O. This removes the entire grant-mapping mechanism
from blktap, instead bouncing I/O between a pool of plain dom0 memory
and the request SGs. It essentially leaves blktap without any residual
dependency on Xen whatsoever.

Signed-off-by: Daniel Stodden <daniel.stodden@citrix.com>
---
 drivers/xen/blktap/blktap.h  |   43 ++--
 drivers/xen/blktap/control.c |    8 +-
 drivers/xen/blktap/device.c  |  564 ++++++------------------------------------
 drivers/xen/blktap/request.c |   20 ++-
 drivers/xen/blktap/ring.c    |  319 +++++++++++++------------
 5 files changed, 285 insertions(+), 669 deletions(-)
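For orientation, the new submission path reduces to the sketch below. This is
illustrative only, not part of the patch; error handling is elided, and all
names are those introduced in the hunks that follow.

    /* Sketch: bounce-buffered submission, as wired up in device.c below. */
    int sketch_make_request(struct blktap *tap, struct request *rq)
    {
            struct blktap_request *request;
            int nsegs, err;

            /* reserve a slot in the pending table and the shared ring */
            request = blktap_ring_make_request(tap);
            if (IS_ERR(request))
                    return PTR_ERR(request);

            /* describe the bio pages with an ordinary scatterlist */
            nsegs = blk_rq_map_sg(rq->q, rq, request->sg_table);

            /* back every segment with a page from the dom0 pool */
            err = blktap_request_get_pages(tap, request, nsegs);
            if (err)
                    return err;

            /* bounce write data into the pool pages and vm_insert_page()
             * them into the tapdisk VMA -- no grant mappings involved */
            err = blktap_ring_map_request(tap, request);
            if (err)
                    return err;

            /* publish the blkif request on the shared ring */
            blktap_ring_submit_request(tap, request);
            return 0;
    }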
diff --git a/drivers/xen/blktap/blktap.h b/drivers/xen/blktap/blktap.h
index ad79c15..fe63fc9 100644
--- a/drivers/xen/blktap/blktap.h
+++ b/drivers/xen/blktap/blktap.h
@@ -7,7 +7,6 @@
 #include 
 #include 
 #include 
-#include 
 
 extern int blktap_debug_level;
 extern int blktap_ring_major;
@@ -27,7 +26,6 @@ extern int blktap_device_major;
 
 #define MAX_BLKTAP_DEVICE            1024
 
-#define BLKTAP_CONTROL               1
 #define BLKTAP_DEVICE                4
 #define BLKTAP_DEVICE_CLOSED         5
 #define BLKTAP_SHUTDOWN_REQUESTED    8
@@ -94,11 +92,13 @@ struct blktap_ring {
 	struct task_struct *task;
 
 	struct vm_area_struct *vma;
-	struct blkif_front_ring ring;
-	struct vm_foreign_map foreign_map;
+	struct blkif_front_ring ring;
 	unsigned long ring_vstart;
 	unsigned long user_vstart;
 
+	int n_pending;
+	struct blktap_request *pending[MAX_PENDING_REQS];
+
 	wait_queue_head_t poll_wait;
 
 	dev_t devno;
@@ -123,19 +123,21 @@ struct blktap_statistics {
 struct blktap_request {
 	struct blktap *tap;
 	struct request *rq;
-	uint16_t usr_idx;
-
-	uint8_t status;
-	atomic_t pendcnt;
-	unsigned short operation;
+	int usr_idx;
+	int operation;
 
 	struct timeval time;
-	struct grant_handle_pair handles[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct scatterlist sg_table[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	int nr_pages;
 };
 
+#define blktap_for_each_sg(_sg, _req, _i)	\
+	for (_sg = (_req)->sg_table, _i = 0;	\
+	     _i < (_req)->nr_pages;		\
+	     (_sg)++, (_i)++)
+
 struct blktap {
 	int minor;
 	unsigned long dev_inuse;
@@ -144,10 +146,6 @@ struct blktap {
 	struct blktap_device device;
 	struct blktap_page_pool *pool;
 
-	int pending_cnt;
-	struct blktap_request *pending_requests[MAX_PENDING_REQS];
-	struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-
 	wait_queue_head_t remove_wait;
 	struct work_struct remove_work;
 	char name[BLKTAP2_MAX_MESSAGE_LEN];
@@ -174,6 +172,13 @@ void blktap_ring_exit(void);
 size_t blktap_ring_debug(struct blktap *, char *, size_t);
 int blktap_ring_create(struct blktap *);
 int blktap_ring_destroy(struct blktap *);
+struct blktap_request *blktap_ring_make_request(struct blktap *);
+void blktap_ring_free_request(struct blktap *, struct blktap_request *);
+void blktap_ring_submit_request(struct blktap *, struct blktap_request *);
+int blktap_ring_map_segment(struct blktap *, struct blktap_request *, int);
+int blktap_ring_map_request(struct blktap *, struct blktap_request *);
+void blktap_ring_unmap_request(struct blktap *, struct blktap_request *);
+void blktap_ring_set_message(struct blktap *, int);
 void blktap_ring_kick_user(struct blktap *);
 
 int blktap_sysfs_init(void);
@@ -187,7 +192,7 @@ size_t blktap_device_debug(struct blktap *, char *, size_t);
 int blktap_device_create(struct blktap *, struct blktap_params *);
 int blktap_device_destroy(struct blktap *);
 void blktap_device_destroy_sync(struct blktap *);
-int blktap_device_run_queue(struct blktap *);
+void blktap_device_run_queue(struct blktap *);
 void blktap_device_end_request(struct blktap *, struct blktap_request *, int);
 
 int blktap_page_pool_init(struct kobject *);
@@ -200,13 +205,5 @@ int blktap_request_get_pages(struct blktap *, struct blktap_request *, int);
 void blktap_request_free(struct blktap *, struct blktap_request *);
 void blktap_request_bounce(struct blktap *, struct blktap_request *, int, int);
 
-static inline unsigned long
-request_to_kaddr(struct blktap_request *req, int seg)
-{
-	return (unsigned long)page_address(req->pages[seg]);
-}
-
-#define request_to_page(_request, _seg) ((_request)->pages[_seg])
-
 #endif
diff --git a/drivers/xen/blktap/control.c b/drivers/xen/blktap/control.c
index 8652e07..f339bba 100644
--- a/drivers/xen/blktap/control.c
+++ b/drivers/xen/blktap/control.c
@@ -18,13 +18,10 @@ blktap_control_get_minor(void)
 	int minor;
 	struct blktap *tap;
 
-	tap = kmalloc(sizeof(*tap), GFP_KERNEL);
+	tap = kzalloc(sizeof(*tap), GFP_KERNEL);
 	if (unlikely(!tap))
 		return NULL;
 
-	memset(tap, 0, sizeof(*tap));
-	sg_init_table(tap->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
 	mutex_lock(&blktap_lock);
 
 	for (minor = 0; minor < blktap_max_minor; minor++)
@@ -290,9 +287,6 @@ blktap_init(void)
 {
 	int err;
 
-	if (!xen_pv_domain())
-		return -ENODEV;
-
 	err = blktap_device_init();
 	if (err)
 		goto fail;
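The blktap_for_each_sg() iterator added to blktap.h above walks a request's
scatterlist the same way for_each_sg() would; a minimal, hypothetical consumer
looks like this:

    static unsigned int sketch_count_sectors(struct blktap_request *request)
    {
            struct scatterlist *sg;
            unsigned int i, nsecs = 0;

            blktap_for_each_sg(sg, request, i)
                    nsecs += sg->length >> 9;       /* 512-byte sectors */

            return nsecs;
    }

blktap_ring_submit_request() in ring.c below uses exactly this pattern to fill
in the blkif segment descriptors.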
diff --git a/drivers/xen/blktap/device.c b/drivers/xen/blktap/device.c
index ed95548..02e1fc8 100644
--- a/drivers/xen/blktap/device.c
+++ b/drivers/xen/blktap/device.c
@@ -2,27 +2,11 @@
 #include 
 #include 
 #include 
-#include 
-#include 
-
 #include 
 #include 
-#include 
-#include 
-
-#include 
-#include 
-
 #include "blktap.h"
-#include "../blkback/blkback-pagemap.h"
-
-struct blktap_grant_table {
-	int cnt;
-	struct gnttab_map_grant_ref grants[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
-};
-
 int blktap_device_major;
 
 #define dev_to_blktap(_dev) container_of(_dev, struct blktap, device)
@@ -119,526 +103,136 @@ static struct block_device_operations blktap_device_file_operations = {
 	.getgeo = blktap_device_getgeo
 };
 
-static int
-blktap_map_uaddr_fn(pte_t *ptep, struct page *pmd_page,
-		    unsigned long addr, void *data)
-{
-	pte_t *pte = (pte_t *)data;
-
-	BTDBG("ptep %p -> %012llx\n", ptep, pte_val(*pte));
-	set_pte(ptep, *pte);
-	return 0;
-}
-
-static int
-blktap_map_uaddr(struct mm_struct *mm, unsigned long address, pte_t pte)
-{
-	return apply_to_page_range(mm, address,
-				   PAGE_SIZE, blktap_map_uaddr_fn, &pte);
-}
-
-static int
-blktap_umap_uaddr_fn(pte_t *ptep, struct page *pmd_page,
-		     unsigned long addr, void *data)
-{
-	struct mm_struct *mm = (struct mm_struct *)data;
-
-	BTDBG("ptep %p\n", ptep);
-	pte_clear(mm, addr, ptep);
-	return 0;
-}
-
-static int
-blktap_umap_uaddr(struct mm_struct *mm, unsigned long address)
-{
-	return apply_to_page_range(mm, address,
-				   PAGE_SIZE, blktap_umap_uaddr_fn, mm);
-}
-
-static inline void
-flush_tlb_kernel_page(unsigned long kvaddr)
-{
-	flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE);
-}
-
-static void
-blktap_device_end_dequeued_request(struct blktap_device *dev,
-				   struct request *req, int error)
-{
-	unsigned long flags;
-	int ret;
-
-	//spin_lock_irq(&dev->lock);
-	spin_lock_irqsave(dev->gd->queue->queue_lock, flags);
-	ret = __blk_end_request(req, error, blk_rq_bytes(req));
-	spin_unlock_irqrestore(dev->gd->queue->queue_lock, flags);
-	//spin_unlock_irq(&dev->lock);
-
-	BUG_ON(ret);
-}
-
-static void
-blktap_device_fast_flush(struct blktap *tap, struct blktap_request *request)
-{
-	uint64_t ptep;
-	int ret, usr_idx;
-	unsigned int i, cnt;
-	struct page **map, *page;
-	struct blktap_ring *ring;
-	struct grant_handle_pair *khandle;
-	unsigned long kvaddr, uvaddr, offset;
-	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST * 2];
-
-	cnt = 0;
-	ring = &tap->ring;
-	usr_idx = request->usr_idx;
-	map = ring->foreign_map.map;
-
-	if (!ring->vma)
-		return;
-
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		zap_page_range(ring->vma,
-			       MMAP_VADDR(ring->user_vstart, usr_idx, 0),
-			       request->nr_pages << PAGE_SHIFT, NULL);
-
-	for (i = 0; i < request->nr_pages; i++) {
-		kvaddr = request_to_kaddr(request, i);
-		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
-
-		khandle = request->handles + i;
-
-		if (khandle->kernel != INVALID_GRANT_HANDLE) {
-			gnttab_set_unmap_op(&unmap[cnt], kvaddr,
-					    GNTMAP_host_map, khandle->kernel);
-			cnt++;
-			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
-					    INVALID_P2M_ENTRY);
-		}
-
-		if (khandle->user != INVALID_GRANT_HANDLE) {
-			BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-			if (create_lookup_pte_addr(ring->vma->vm_mm,
-						   uvaddr, &ptep) != 0) {
-				BTERR("Couldn't get a pte addr!\n");
-				return;
-			}
-
-			gnttab_set_unmap_op(&unmap[cnt], ptep,
-					    GNTMAP_host_map
-					    | GNTMAP_application_map
-					    | GNTMAP_contains_pte,
-					    khandle->user);
-			cnt++;
-		}
-
-		offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
-
-		BTDBG("offset: 0x%08lx, page: %p, request: %p, usr_idx: %d, "
-		      "seg: %d, kvaddr: 0x%08lx, khandle: %u, uvaddr: "
-		      "0x%08lx, handle: %u\n", offset, map[offset], request,
-		      usr_idx, i, kvaddr, khandle->kernel, uvaddr,
-		      khandle->user);
-
-		page = map[offset];
-		if (page && blkback_pagemap_contains_page(page))
-			set_page_private(page, 0);
-
-		map[offset] = NULL;
-
-		khandle->kernel = INVALID_GRANT_HANDLE;
-		khandle->user   = INVALID_GRANT_HANDLE;
-	}
-
-	if (cnt) {
-		ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
-						unmap, cnt);
-		BUG_ON(ret);
-	}
-
-	if (!xen_feature(XENFEAT_auto_translated_physmap))
-		zap_page_range(ring->vma,
-			       MMAP_VADDR(ring->user_vstart, usr_idx, 0),
-			       request->nr_pages << PAGE_SHIFT, NULL);
-}
-
-static void
-blktap_unmap(struct blktap *tap, struct blktap_request *request)
-{
-	int i, usr_idx;
-	unsigned long kvaddr;
-
-	usr_idx = request->usr_idx;
-
-	for (i = 0; i < request->nr_pages; i++) {
-		kvaddr = request_to_kaddr(request, i);
-		BTDBG("request: %p, seg: %d, kvaddr: 0x%08lx, khandle: %u, "
-		      "uvaddr: 0x%08lx, uhandle: %u\n", request, i,
-		      kvaddr, request->handles[i].kernel,
-		      MMAP_VADDR(tap->ring.user_vstart, usr_idx, i),
-		      request->handles[i].user);
-
-		if (request->handles[i].kernel == INVALID_GRANT_HANDLE) {
-			blktap_umap_uaddr(current->mm, kvaddr);
-			flush_tlb_kernel_page(kvaddr);
-			set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
-					    INVALID_P2M_ENTRY);
-		}
-	}
-
-	blktap_device_fast_flush(tap, request);
-}
-
 void
 blktap_device_end_request(struct blktap *tap,
 			  struct blktap_request *request,
 			  int error)
 {
 	struct blktap_device *tapdev = &tap->device;
+	struct request *rq = request->rq;
+
+	blktap_ring_unmap_request(tap, request);
+
+	blktap_ring_free_request(tap, request);
 
-	blktap_unmap(tap, request);
+	dev_dbg(&tapdev->gd->dev,
+		"end_request: op=%d error=%d bytes=%d\n",
+		rq_data_dir(rq), error, blk_rq_bytes(rq));
 
 	spin_lock_irq(&tapdev->lock);
-	end_request(request->rq, !error);
+	end_request(rq, !error);
 	spin_unlock_irq(&tapdev->lock);
-
-	blktap_request_free(tap, request);
 }
 
-static int
-blktap_prep_foreign(struct blktap *tap,
-		    struct blktap_request *request,
-		    struct blkif_request *blkif_req,
-		    unsigned int seg, struct page *page,
-		    struct blktap_grant_table *table)
-{
-	uint64_t ptep;
-	uint32_t flags;
-#ifdef BLKTAP_CHAINED_BLKTAP
-	struct page *tap_page;
-#endif
-	struct blktap_ring *ring;
-	struct blkback_pagemap map;
-	unsigned long uvaddr, kvaddr;
-
-	ring = &tap->ring;
-	map = blkback_pagemap_read(page);
-	blkif_req->seg[seg].gref = map.gref;
-
-	uvaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
-	kvaddr = request_to_kaddr(request, seg);
-	flags = GNTMAP_host_map |
-		(request->operation == BLKIF_OP_WRITE ? GNTMAP_readonly : 0);
-
-	gnttab_set_map_op(&table->grants[table->cnt],
-			  kvaddr, flags, map.gref, map.domid);
-	table->cnt++;
-
-
-#ifdef BLKTAP_CHAINED_BLKTAP
-	/* enable chained tap devices */
-	tap_page = request_to_page(request, seg);
-	set_page_private(tap_page, page_private(page));
-	SetPageBlkback(tap_page);
-#endif
-
-	if (xen_feature(XENFEAT_auto_translated_physmap))
-		return 0;
-
-	if (create_lookup_pte_addr(ring->vma->vm_mm, uvaddr, &ptep)) {
-		BTERR("couldn't get a pte addr!\n");
-		return -1;
-	}
-
-	flags |= GNTMAP_application_map | GNTMAP_contains_pte;
-	gnttab_set_map_op(&table->grants[table->cnt],
-			  ptep, flags, map.gref, map.domid);
-	table->cnt++;
-
-	return 0;
-}
-
-static int
-blktap_map_foreign(struct blktap *tap,
-		   struct blktap_request *request,
-		   struct blkif_request *blkif_req,
-		   struct blktap_grant_table *table)
+int
+blktap_device_make_request(struct blktap *tap, struct request *rq)
 {
-	struct page *page;
-	int i, grant, err, usr_idx;
-	struct blktap_ring *ring;
-	unsigned long uvaddr, foreign_mfn;
-
-	if (!table->cnt)
-		return 0;
+	struct blktap_device *tapdev = &tap->device;
+	struct blktap_request *request;
+	int write, nsegs;
+	int err;
 
-	err = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
-					table->grants, table->cnt);
-	BUG_ON(err);
+	request = blktap_ring_make_request(tap);
+	if (IS_ERR(request)) {
+		err = PTR_ERR(request);
+		request = NULL;
 
-	grant = 0;
-	usr_idx = request->usr_idx;
-	ring = &tap->ring;
+		if (err == -ENOSPC || err == -ENOMEM)
+			goto stop;
 
-	for (i = 0; i < request->nr_pages; i++) {
-		if (!blkif_req->seg[i].gref)
-			continue;
+		goto fail;
+	}
 
-		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
+	write = rq_data_dir(rq) == WRITE;
+	nsegs = blk_rq_map_sg(rq->q, rq, request->sg_table);
 
-		if (unlikely(table->grants[grant].status)) {
-			BTERR("invalid kernel buffer: could not remap it\n");
-			err |= 1;
-			table->grants[grant].handle = INVALID_GRANT_HANDLE;
-		}
+	dev_dbg(&tapdev->gd->dev,
+		"make_request: op=%c bytes=%d nsegs=%d\n",
+		write ? 'w' : 'r', blk_rq_bytes(rq), nsegs);
 
-		request->handles[i].kernel = table->grants[grant].handle;
-		foreign_mfn = table->grants[grant].dev_bus_addr >> PAGE_SHIFT;
-		grant++;
+	request->rq = rq;
+	request->operation = write ? BLKIF_OP_WRITE : BLKIF_OP_READ;
 
-		if (xen_feature(XENFEAT_auto_translated_physmap))
-			goto done;
-
-		if (unlikely(table->grants[grant].status)) {
-			BTERR("invalid user buffer: could not remap it\n");
-			err |= 1;
-			table->grants[grant].handle = INVALID_GRANT_HANDLE;
-		}
+	err = blktap_request_get_pages(tap, request, nsegs);
+	if (err)
+		goto stop;
 
-		request->handles[i].user = table->grants[grant].handle;
-		grant++;
+	err = blktap_ring_map_request(tap, request);
+	if (err)
+		goto fail;
 
-	done:
-		if (err)
-			continue;
+	blktap_ring_submit_request(tap, request);
 
-		page = request_to_page(request, i);
+	return 0;
 
-		if (!xen_feature(XENFEAT_auto_translated_physmap))
-			set_phys_to_machine(page_to_pfn(page),
-					    FOREIGN_FRAME(foreign_mfn));
-		else if (vm_insert_page(ring->vma, uvaddr, page))
-			err |= 1;
+stop:
+	tap->stats.st_oo_req++;
+	err = -EBUSY;
 
-		BTDBG("pending_req: %p, seg: %d, page: %p, "
-		      "kvaddr: 0x%p, khandle: %u, uvaddr: 0x%08lx, "
-		      "uhandle: %u\n", request, i, page,
-		      pfn_to_kaddr(page_to_pfn(page)),
-		      request->handles[i].kernel,
-		      uvaddr, request->handles[i].user);
-	}
+_out:
+	if (request)
+		blktap_ring_free_request(tap, request);
 
 	return err;
-}
-
-static void
-blktap_map(struct blktap *tap,
-	   struct blktap_request *request,
-	   unsigned int seg, struct page *page)
-{
-	pte_t pte;
-	int usr_idx;
-	struct blktap_ring *ring;
-	unsigned long uvaddr, kvaddr;
-
-	ring = &tap->ring;
-	usr_idx = request->usr_idx;
-	uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, seg);
-	kvaddr = request_to_kaddr(request, seg);
-
-	pte = mk_pte(page, ring->vma->vm_page_prot);
-	blktap_map_uaddr(ring->vma->vm_mm, uvaddr, pte_mkwrite(pte));
-	flush_tlb_page(ring->vma, uvaddr);
-	blktap_map_uaddr(ring->vma->vm_mm, kvaddr, mk_pte(page, PAGE_KERNEL));
-	flush_tlb_kernel_page(kvaddr);
-
-	set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, pte_mfn(pte));
-	request->handles[seg].kernel = INVALID_GRANT_HANDLE;
-	request->handles[seg].user   = INVALID_GRANT_HANDLE;
-
-	BTDBG("pending_req: %p, seg: %d, page: %p, kvaddr: 0x%08lx, "
-	      "uvaddr: 0x%08lx\n", request, seg, page, kvaddr,
-	      uvaddr);
-}
-
-static int
-blktap_device_process_request(struct blktap *tap,
-			      struct blktap_request *request,
-			      struct request *req)
-{
-	struct page *page;
-	int i, usr_idx, err;
-	struct blktap_ring *ring;
-	struct scatterlist *sg;
-	struct blktap_grant_table table;
-	unsigned int fsect, lsect, nr_sects;
-	unsigned long offset, uvaddr;
-	struct blkif_request blkif_req, *target;
-
-	err = -1;
-	memset(&table, 0, sizeof(table));
-
-	ring = &tap->ring;
-	usr_idx = request->usr_idx;
-	blkif_req.id = usr_idx;
-	blkif_req.sector_number = (blkif_sector_t)req->sector;
-	blkif_req.handle = 0;
-	blkif_req.operation = rq_data_dir(req) ?
-		BLKIF_OP_WRITE : BLKIF_OP_READ;
-
-	request->rq = req;
-	request->operation = blkif_req.operation;
-	request->status = BLKTAP_REQUEST_PENDING;
-	do_gettimeofday(&request->time);
-
-	nr_sects = 0;
-	request->nr_pages = 0;
-	blkif_req.nr_segments = blk_rq_map_sg(req->q, req, tap->sg);
-	BUG_ON(blkif_req.nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
-	for (i = 0; i < blkif_req.nr_segments; ++i) {
-		sg = tap->sg + i;
-		fsect = sg->offset >> 9;
-		lsect = fsect + (sg->length >> 9) - 1;
-		nr_sects += sg->length >> 9;
-
-		blkif_req.seg[i] =
-			(struct blkif_request_segment) {
-				.gref       = 0,
-				.first_sect = fsect,
-				.last_sect  = lsect };
-
-		if (blkback_pagemap_contains_page(sg_page(sg))) {
-			/* foreign page -- use xen */
-			if (blktap_prep_foreign(tap,
-						request,
-						&blkif_req,
-						i,
-						sg_page(sg),
-						&table))
-				goto out;
-		} else {
-			/* do it the old fashioned way */
-			blktap_map(tap,
-				   request,
-				   i,
-				   sg_page(sg));
-		}
-
-		uvaddr = MMAP_VADDR(ring->user_vstart, usr_idx, i);
-		offset = (uvaddr - ring->vma->vm_start) >> PAGE_SHIFT;
-		page = request_to_page(request, i);
-		ring->foreign_map.map[offset] = page;
-		SetPageReserved(page);
-
-		BTDBG("mapped uaddr %08lx to page %p pfn 0x%lx\n",
-		      uvaddr, page, page_to_pfn(page));
-		BTDBG("offset: 0x%08lx, pending_req: %p, seg: %d, "
-		      "page: %p, kvaddr: %p, uvaddr: 0x%08lx\n",
-		      offset, request, i,
-		      page, pfn_to_kaddr(page_to_pfn(page)), uvaddr);
-
-		request->nr_pages++;
-	}
-
-	if (blktap_map_foreign(tap, request, &blkif_req, &table))
-		goto out;
-
-	/* Finally, write the request message to the user ring. */
-	target = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
-	memcpy(target, &blkif_req, sizeof(blkif_req));
-	target->id = request->usr_idx;
-	wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */
-	ring->ring.req_prod_pvt++;
-
-	if (rq_data_dir(req)) {
-		tap->stats.st_wr_sect += nr_sects;
-		tap->stats.st_wr_req++;
-	} else {
-		tap->stats.st_rd_sect += nr_sects;
-		tap->stats.st_rd_req++;
-	}
-
-	err = 0;
-
-out:
-	if (err)
-		blktap_device_fast_flush(tap, request);
-	return err;
+fail:
+	if (printk_ratelimit())
+		dev_warn(&tapdev->gd->dev,
+			 "make request: %d, failing\n", err);
+	goto _out;
 }
 
 /*
  * called from tapdisk context
  */
-int
+void
 blktap_device_run_queue(struct blktap *tap)
 {
-	int err, rv;
-	struct request_queue *rq;
-	struct request *req;
-	struct blktap_ring *ring;
-	struct blktap_device *dev;
-	struct blktap_request *request;
-
-	ring = &tap->ring;
-	dev = &tap->device;
-	rq = dev->gd->queue;
+	struct blktap_device *tapdev = &tap->device;
+	struct request_queue *q;
+	struct request *rq;
+	int err;
 
-	BTDBG("running queue for %d\n", tap->minor);
-	spin_lock_irq(&dev->lock);
-	queue_flag_clear(QUEUE_FLAG_STOPPED, rq);
+	if (!tapdev->gd)
+		return;
 
-	while ((req = elv_next_request(rq)) != NULL) {
-		if (!blk_fs_request(req)) {
-			end_request(req, 0);
-			continue;
-		}
+	q = tapdev->gd->queue;
 
-		if (blk_empty_barrier_rq(req)) {
-			end_request(req, 1);
-			continue;
-		}
+	spin_lock_irq(&tapdev->lock);
+	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
 
-		if (RING_FULL(&ring->ring)) {
-		wait:
-			/* Avoid pointless unplugs. */
-			blk_stop_queue(rq);
-			break;
+	do {
+		rq = elv_next_request(q);
+		if (!rq)
+			break;
+
+		if (!blk_fs_request(rq)) {
+			end_queued_request(rq, 0);
+			continue;
 		}
 
-		request = blktap_request_alloc(tap);
-		if (!request) {
-			tap->stats.st_oo_req++;
-			goto wait;
+		if (blk_empty_barrier(rq)) {
+			end_queued_request(rq, 1);
+			continue;
 		}
 
-		BTDBG("req %p: dev %d cmd %p, sec 0x%llx, (0x%x/0x%lx) "
-		      "buffer:%p [%s], pending: %p\n", req, tap->minor,
-		      req->cmd, (unsigned long long)req->sector,
-		      req->current_nr_sectors,
-		      req->nr_sectors, req->buffer,
-		      rq_data_dir(req) ? "write" : "read", request);
+		spin_unlock_irq(&tapdev->lock);
 
-		blkdev_dequeue_request(req);
+		err = blktap_device_make_request(tap, rq);
 
-		spin_unlock_irq(&dev->lock);
+		spin_lock_irq(&tapdev->lock);
 
-		err = blktap_device_process_request(tap, request, req);
-		if (err) {
-			blktap_device_end_dequeued_request(dev, req, -EIO);
-			blktap_request_free(tap, request);
+		if (err == -EBUSY) {
+			blk_stop_queue(q);
+			break;
 		}
 
-		spin_lock_irq(&dev->lock);
-	}
-
-	spin_unlock_irq(&dev->lock);
-
-	rv = ring->ring.req_prod_pvt -
-		ring->ring.sring->req_prod;
+		blkdev_dequeue_request(rq);
 
-	RING_PUSH_REQUESTS(&ring->ring);
+		if (unlikely(err))
+			end_request(rq, 0);
+	} while (1);
 
-	return rv;
+	spin_unlock_irq(&tapdev->lock);
 }
 
 static void
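With the grant machinery gone, the tapdisk side sees nothing but an ordinary
character device: mmap the ring page plus the segment window, then poll. A
hypothetical userspace skeleton follows; the device path and page layout are
assumptions for illustration, not defined by this patch:

    /* Hypothetical tapdisk main loop; error handling elided. */
    #include <fcntl.h>
    #include <poll.h>
    #include <sys/mman.h>

    int fd = open("/dev/xen/blktap-2/blktap0", O_RDWR);
    void *vstart = mmap(NULL, (RING_PAGES + MMAP_PAGES) << PAGE_SHIFT,
                        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    struct blkif_sring *sring = vstart;      /* ring page comes first */

    for (;;) {
            struct pollfd pfd = { .fd = fd, .events = POLLIN };

            /* poll() is what drives blktap_device_run_queue() above */
            poll(&pfd, 1, -1);

            /* consume sring requests, do the I/O, produce responses */
    }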
diff --git a/drivers/xen/blktap/request.c b/drivers/xen/blktap/request.c
index ca12442..9bef48c 100644
--- a/drivers/xen/blktap/request.c
+++ b/drivers/xen/blktap/request.c
@@ -3,7 +3,6 @@
 #include 
 #include 
 #include 
-#include 
 
 #include "blktap.h"
 
@@ -129,6 +128,25 @@ blktap_request_free(struct blktap *tap,
 	__page_pool_wake(tap->pool);
 }
 
+void
+blktap_request_bounce(struct blktap *tap,
+		      struct blktap_request *request,
+		      int seg, int write)
+{
+	struct scatterlist *sg = &request->sg_table[seg];
+	void *s, *p;
+
+	BUG_ON(seg >= request->nr_pages);
+
+	s = sg_virt(sg);
+	p = page_address(request->pages[seg]) + sg->offset;
+
+	if (write)
+		memcpy(p, s, sg->length);
+	else
+		memcpy(s, p, sg->length);
+}
+
 static void
 blktap_request_ctor(void *obj)
 {
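blktap_request_bounce() above is now the entire data path: one memcpy per
segment. Its direction convention follows the blkif operation, which the
ring.c changes below exploit — writes are bounced into the pool page before
it is inserted into the tapdisk VMA, reads are bounced back out at completion
time, just before the mapping is zapped. In sketch form (mirrors request.c,
illustration only):

    static void sketch_bounce(struct scatterlist *sg, struct page *pool_page,
                              int write)
    {
            void *bio_data  = sg_virt(sg);               /* original bio page */
            void *pool_data = page_address(pool_page) + sg->offset;

            if (write)
                    memcpy(pool_data, bio_data, sg->length); /* submit path */
            else
                    memcpy(bio_data, pool_data, sg->length); /* completion  */
    }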
diff --git a/drivers/xen/blktap/ring.c b/drivers/xen/blktap/ring.c
index a72a1b3..38896e7 100644
--- a/drivers/xen/blktap/ring.c
+++ b/drivers/xen/blktap/ring.c
@@ -1,30 +1,15 @@
+
 #include 
 #include 
 #include 
 #include 
-
-#include 
-#include 
+#include 
 
 #include "blktap.h"
 
-#ifdef CONFIG_XEN_BLKDEV_BACKEND
-#include "../blkback/blkback-pagemap.h"
-#else
-#define blkback_pagemap_contains_page(page) 0
-#endif
-
 int blktap_ring_major;
 static struct cdev blktap_ring_cdev;
 
-static inline struct blktap *
-vma_to_blktap(struct vm_area_struct *vma)
-{
-	struct vm_foreign_map *m = vma->vm_private_data;
-	struct blktap_ring *r = container_of(m, struct blktap_ring, foreign_map);
-	return container_of(r, struct blktap, ring);
-}
-
 /*
  * BLKTAP - immediately before the mmap area,
  * we have a bunch of pages reserved for shared memory rings.
  */
@@ -47,7 +32,7 @@ blktap_ring_read_response(struct blktap *tap,
 		goto invalid;
 	}
 
-	request = tap->pending_requests[usr_idx];
+	request = ring->pending[usr_idx];
 	if (!request) {
 		err = -ESRCH;
@@ -110,90 +95,15 @@ static int blktap_ring_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return VM_FAULT_SIGBUS;
 }
 
-static pte_t
-blktap_ring_clear_pte(struct vm_area_struct *vma,
-		      unsigned long uvaddr,
-		      pte_t *ptep, int is_fullmm)
-{
-	pte_t copy;
-	struct blktap *tap;
-	unsigned long kvaddr;
-	struct page **map, *page;
-	struct blktap_ring *ring;
-	struct blktap_request *request;
-	struct grant_handle_pair *khandle;
-	struct gnttab_unmap_grant_ref unmap[2];
-	int offset, seg, usr_idx, count = 0;
-
-	tap = vma_to_blktap(vma);
-	ring = &tap->ring;
-	map = ring->foreign_map.map;
-	BUG_ON(!map);	/* TODO Should this be changed to if statement? */
-
-	/*
-	 * Zap entry if the address is before the start of the grant
-	 * mapped region.
-	 */
-	if (uvaddr < ring->user_vstart)
-		return ptep_get_and_clear_full(vma->vm_mm, uvaddr,
-					       ptep, is_fullmm);
-
-	offset = (int)((uvaddr - ring->user_vstart) >> PAGE_SHIFT);
-	usr_idx = offset / BLKIF_MAX_SEGMENTS_PER_REQUEST;
-	seg = offset % BLKIF_MAX_SEGMENTS_PER_REQUEST;
-
-	offset = (int)((uvaddr - vma->vm_start) >> PAGE_SHIFT);
-	page = map[offset];
-	if (page && blkback_pagemap_contains_page(page))
-		set_page_private(page, 0);
-	map[offset] = NULL;
-
-	request = tap->pending_requests[usr_idx];
-	kvaddr = request_to_kaddr(request, seg);
-	khandle = request->handles + seg;
-
-	if (khandle->kernel != INVALID_GRANT_HANDLE) {
-		gnttab_set_unmap_op(&unmap[count], kvaddr,
-				    GNTMAP_host_map, khandle->kernel);
-		count++;
-
-		set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT,
-				    INVALID_P2M_ENTRY);
-	}
-
-	if (khandle->user != INVALID_GRANT_HANDLE) {
-		BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
-
-		copy = *ptep;
-		gnttab_set_unmap_op(&unmap[count], virt_to_machine(ptep).maddr,
-				    GNTMAP_host_map
-				    | GNTMAP_application_map
-				    | GNTMAP_contains_pte,
-				    khandle->user);
-		count++;
-	} else
-		copy = ptep_get_and_clear_full(vma->vm_mm, uvaddr, ptep,
-					       is_fullmm);
-
-	if (count)
-		if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
-					      unmap, count))
-			BUG();
-
-	khandle->kernel = INVALID_GRANT_HANDLE;
-	khandle->user   = INVALID_GRANT_HANDLE;
-
-	return copy;
-}
-
 static void
 blktap_ring_fail_pending(struct blktap *tap)
 {
+	struct blktap_ring *ring = &tap->ring;
 	struct blktap_request *request;
 	int usr_idx;
 
 	for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
-		request = tap->pending_requests[usr_idx];
+		request = ring->pending[usr_idx];
 		if (!request)
 			continue;
 
@@ -204,15 +114,12 @@ blktap_ring_fail_pending(struct blktap *tap)
 static void
 blktap_ring_vm_close(struct vm_area_struct *vma)
 {
-	struct blktap *tap = vma_to_blktap(vma);
+	struct blktap *tap = vma->vm_private_data;
 	struct blktap_ring *ring = &tap->ring;
 	struct page *page = virt_to_page(ring->ring.sring);
 
 	blktap_ring_fail_pending(tap);
 
-	kfree(ring->foreign_map.map);
-	ring->foreign_map.map = NULL;
-
 	zap_page_range(vma, vma->vm_start, PAGE_SIZE, NULL);
 	ClearPageReserved(page);
 	__free_page(page);
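The hunk below replaces grant handles with plain bookkeeping: a request's
usr_idx is nothing more than the index of its slot in ring->pending[]. Since
the pending table and the ring are equally sized, the linear scan in
blktap_ring_make_request() cannot fail once RING_FULL() has been checked.
In isolation (sketch of the allocation step only):

    /* Sketch: usr_idx allocation, as in blktap_ring_make_request() below. */
    static int sketch_find_usr_idx(struct blktap_ring *ring)
    {
            int usr_idx;

            for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++)
                    if (!ring->pending[usr_idx])
                            return usr_idx;

            return -EBUSY;  /* unreachable while the ring is not full */
    }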
@@ -226,9 +133,154 @@ blktap_ring_vm_close(struct vm_area_struct *vma)
 static struct vm_operations_struct blktap_ring_vm_operations = {
 	.close   = blktap_ring_vm_close,
 	.fault   = blktap_ring_fault,
-	.zap_pte = blktap_ring_clear_pte,
 };
 
+int
+blktap_ring_map_segment(struct blktap *tap,
+			struct blktap_request *request,
+			int seg)
+{
+	struct blktap_ring *ring = &tap->ring;
+	unsigned long uaddr;
+
+	uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, seg);
+	return vm_insert_page(ring->vma, uaddr, request->pages[seg]);
+}
+
+int
+blktap_ring_map_request(struct blktap *tap,
+			struct blktap_request *request)
+{
+	int seg, err = 0;
+	int write;
+
+	write = request->operation == BLKIF_OP_WRITE;
+
+	for (seg = 0; seg < request->nr_pages; seg++) {
+		if (write)
+			blktap_request_bounce(tap, request, seg, write);
+
+		err = blktap_ring_map_segment(tap, request, seg);
+		if (err)
+			break;
+	}
+
+	if (err)
+		blktap_ring_unmap_request(tap, request);
+
+	return err;
+}
+
+void
+blktap_ring_unmap_request(struct blktap *tap,
+			  struct blktap_request *request)
+{
+	struct blktap_ring *ring = &tap->ring;
+	unsigned long uaddr;
+	unsigned size;
+	int seg, read;
+
+	uaddr = MMAP_VADDR(ring->user_vstart, request->usr_idx, 0);
+	size = request->nr_pages << PAGE_SHIFT;
+	read = request->operation == BLKIF_OP_READ;
+
+	if (read)
+		for (seg = 0; seg < request->nr_pages; seg++)
+			blktap_request_bounce(tap, request, seg, !read);
+
+	zap_page_range(ring->vma, uaddr, size, NULL);
+}
+
+void
+blktap_ring_free_request(struct blktap *tap,
+			 struct blktap_request *request)
+{
+	struct blktap_ring *ring = &tap->ring;
+
+	ring->pending[request->usr_idx] = NULL;
+	ring->n_pending--;
+
+	blktap_request_free(tap, request);
+}
+
+struct blktap_request*
+blktap_ring_make_request(struct blktap *tap)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct blktap_request *request;
+	int usr_idx;
+
+	if (RING_FULL(&ring->ring))
+		return ERR_PTR(-ENOSPC);
+
+	request = blktap_request_alloc(tap);
+	if (!request)
+		return ERR_PTR(-ENOMEM);
+
+	for (usr_idx = 0; usr_idx < BLK_RING_SIZE; usr_idx++)
+		if (!ring->pending[usr_idx])
+			break;
+
+	BUG_ON(usr_idx >= BLK_RING_SIZE);
+
+	request->tap = tap;
+	request->usr_idx = usr_idx;
+
+	ring->pending[usr_idx] = request;
+	ring->n_pending++;
+
+	return request;
+}
+
+void
+blktap_ring_submit_request(struct blktap *tap,
+			   struct blktap_request *request)
+{
+	struct blktap_ring *ring = &tap->ring;
+	struct blkif_request *breq;
+	struct scatterlist *sg;
+	int i, nsecs = 0;
+
+	dev_dbg(ring->dev,
+		"request %d [%p] submit\n", request->usr_idx, request);
+
+	breq = RING_GET_REQUEST(&ring->ring, ring->ring.req_prod_pvt);
+
+	breq->id = request->usr_idx;
+	breq->sector_number = request->rq->sector;
+	breq->handle = 0;
+	breq->operation = request->operation;
+	breq->nr_segments = request->nr_pages;
+
+	blktap_for_each_sg(sg, request, i) {
+		struct blkif_request_segment *seg = &breq->seg[i];
+		int first, count;
+
+		count = sg->length >> 9;
+		first = sg->offset >> 9;
+
+		seg->first_sect = first;
+		seg->last_sect = first + count - 1;
+
+		nsecs += count;
+	}
+
+	ring->ring.req_prod_pvt++;
+
+	do_gettimeofday(&request->time);
+
+	if (request->operation == BLKIF_OP_WRITE) {
+		tap->stats.st_wr_sect += nsecs;
+		tap->stats.st_wr_req++;
+	}
+
+	if (request->operation == BLKIF_OP_READ) {
+		tap->stats.st_rd_sect += nsecs;
+		tap->stats.st_rd_req++;
+	}
+}
+
 static int
 blktap_ring_open(struct inode *inode, struct file *filp)
 {
@@ -270,51 +322,21 @@ blktap_ring_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-/* Note on mmap:
- * We need to map pages to user space in a way that will allow the block
- * subsystem set up direct IO to them. This couldn't be done before, because
- * there isn't really a sane way to translate a user virtual address down to a
- * physical address when the page belongs to another domain.
- *
- * My first approach was to map the page in to kernel memory, add an entry
- * for it in the physical frame list (using alloc_lomem_region as in blkback)
- * and then attempt to map that page up to user space. This is disallowed
- * by xen though, which realizes that we don't really own the machine frame
- * underlying the physical page.
- *
- * The new approach is to provide explicit support for this in xen linux.
- * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages
- * mapped from other vms. vma->vm_private_data is set up as a mapping
- * from pages to actual page structs. There is a new clause in get_user_pages
- * that does the right thing for this sort of mapping.
- */
 static int
 blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct blktap *tap = filp->private_data;
 	struct blktap_ring *ring = &tap->ring;
 	struct blkif_sring *sring;
-	struct page *page;
-	int size, err;
-	struct page **map;
-
-	map = NULL;
-	sring = NULL;
+	struct page *page = NULL;
+	int err;
 
 	if (ring->vma)
 		return -EBUSY;
 
-	size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
-	if (size != (MMAP_PAGES + RING_PAGES)) {
-		BTERR("you _must_ map exactly %lu pages!\n",
-		      MMAP_PAGES + RING_PAGES);
-		return -EAGAIN;
-	}
-
-	/* allocate the shared ring */
 	page = alloc_page(GFP_KERNEL|__GFP_ZERO);
 	if (!page)
-		goto fail;
+		return -ENOMEM;
 
 	SetPageReserved(page);
 
@@ -329,22 +351,12 @@ blktap_ring_mmap(struct file *filp, struct vm_area_struct *vma)
 	ring->ring_vstart = vma->vm_start;
 	ring->user_vstart = ring->ring_vstart + PAGE_SIZE;
 
-	/* allocate the foreign map */
-	map = kzalloc(size * sizeof(struct page *), GFP_KERNEL);
-	if (!map)
-		goto fail;
+	vma->vm_private_data = tap;
 
-	/* Mark this VM as containing foreign pages, and set up mappings. */
-	ring->foreign_map.map = map;
-	vma->vm_private_data = &ring->foreign_map;
-	vma->vm_flags |= VM_FOREIGN;
 	vma->vm_flags |= VM_DONTCOPY;
 	vma->vm_flags |= VM_RESERVED;
 
-	vma->vm_ops = &blktap_ring_vm_operations;
-#ifdef CONFIG_X86
-	vma->vm_mm->context.has_foreign_mappings = 1;
-#endif
+	vma->vm_ops = &blktap_ring_vm_operations;
 
 	ring->vma = vma;
 	return 0;
 
@@ -356,10 +368,7 @@ fail:
 		__free_page(page);
 	}
 
-	if (map)
-		kfree(map);
-
-	return -ENOMEM;
+	return err;
 }
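The poll() rework below also changes who publishes ring requests:
blktap_device_run_queue() now only advances req_prod_pvt, and poll pushes the
result out to the shared ring. The "work" computation simply checks whether
the private producer ran ahead of the shared one; as a sketch, using the
standard Xen ring macros:

    /* Sketch: what "work" means in blktap_ring_poll() below. */
    static RING_IDX sketch_poll_work(struct blktap_ring *ring)
    {
            /* requests produced by run_queue, not yet visible to tapdisk */
            RING_IDX work = ring->ring.req_prod_pvt -
                            ring->ring.sring->req_prod;

            RING_PUSH_REQUESTS(&ring->ring); /* sring->req_prod catches up */

            return work;   /* non-zero: report POLLIN to wake tapdisk */
    }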
@@ -405,16 +414,19 @@
 static unsigned int blktap_ring_poll(struct file *filp, poll_table *wait)
 {
 	struct blktap *tap = filp->private_data;
 	struct blktap_ring *ring = &tap->ring;
-	int work = 0;
+	int work;
 
 	poll_wait(filp, &tap->pool->wait, wait);
 	poll_wait(filp, &ring->poll_wait, wait);
 
 	down_read(&current->mm->mmap_sem);
 	if (ring->vma && tap->device.gd)
-		work = blktap_device_run_queue(tap);
+		blktap_device_run_queue(tap);
 	up_read(&current->mm->mmap_sem);
 
+	work = ring->ring.req_prod_pvt - ring->ring.sring->req_prod;
+	RING_PUSH_REQUESTS(&ring->ring);
+
 	if (work ||
 	    ring->ring.sring->private.tapif_user.msg ||
 	    test_and_clear_bit(BLKTAP_DEVICE_CLOSED, &tap->dev_inuse))
@@ -463,18 +475,19 @@ blktap_ring_create(struct blktap *tap)
 size_t
 blktap_ring_debug(struct blktap *tap, char *buf, size_t size)
 {
+	struct blktap_ring *ring = &tap->ring;
 	char *s = buf, *end = buf + size;
 	int usr_idx;
 
 	s += snprintf(s, end - s,
-		      "begin pending:%d\n", tap->pending_cnt);
+		      "begin pending:%d\n", ring->n_pending);
 
 	for (usr_idx = 0; usr_idx < MAX_PENDING_REQS; usr_idx++) {
 		struct blktap_request *request;
 		struct timeval *time;
 		int write;
 
-		request = tap->pending_requests[usr_idx];
+		request = ring->pending[usr_idx];
 		if (!request)
 			continue;
-- 
1.7.0.4