* [PATCH, RFC] block: don't allocate a payload for discard request @ 2010-06-18 14:59 Christoph Hellwig 2010-06-19 4:25 ` Mike Snitzer 0 siblings, 1 reply; 60+ messages in thread From: Christoph Hellwig @ 2010-06-18 14:59 UTC (permalink / raw) To: axboe, snitzer; +Cc: martin.petersen, James.Bottomley, linux-kernel Allocating a fixed payload for discard requests always was a horrible hack, and it's not coming to byte us when adding support for discard in DM/MD. So change the code to leave the allocation of a payload to the lowlevel driver. Unfortunately that means we'll need another hack, which allows us to update the various block layer length fields indicating that we have a payload. Instead of hiding this in sd.c, which we already partially do for UNMAP support add a documented helper in the core block layer for it. Signed-off-by: Christoph Hellwig <hch@lst.de> Index: linux-2.6/block/blk-lib.c =================================================================== --- linux-2.6.orig/block/blk-lib.c 2010-06-18 11:33:37.313260457 +0200 +++ linux-2.6/block/blk-lib.c 2010-06-18 16:49:20.854319055 +0200 @@ -19,7 +19,6 @@ static void blkdev_discard_end_io(struct if (bio->bi_private) complete(bio->bi_private); - __free_page(bio_page(bio)); bio_put(bio); } @@ -43,7 +42,6 @@ int blkdev_issue_discard(struct block_de int type = flags & BLKDEV_IFL_BARRIER ? DISCARD_BARRIER : DISCARD_NOBARRIER; struct bio *bio; - struct page *page; int ret = 0; if (!q) @@ -53,35 +51,21 @@ int blkdev_issue_discard(struct block_de return -EOPNOTSUPP; while (nr_sects && !ret) { - unsigned int sector_size = q->limits.logical_block_size; unsigned int max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); bio = bio_alloc(gfp_mask, 1); - if (!bio) - goto out; + if (!bio) { + ret = -ENOMEM; + break; + } + bio->bi_sector = sector; bio->bi_end_io = blkdev_discard_end_io; bio->bi_bdev = bdev; if (flags & BLKDEV_IFL_WAIT) bio->bi_private = &wait; - /* - * Add a zeroed one-sector payload as that's what - * our current implementations need. If we'll ever need - * more the interface will need revisiting. - */ - page = alloc_page(gfp_mask | __GFP_ZERO); - if (!page) - goto out_free_bio; - if (bio_add_pc_page(q, bio, page, sector_size, 0) < sector_size) - goto out_free_page; - - /* - * And override the bio size - the way discard works we - * touch many more blocks on disk than the actual payload - * length. - */ if (nr_sects > max_discard_sectors) { bio->bi_size = max_discard_sectors << 9; nr_sects -= max_discard_sectors; @@ -103,13 +87,8 @@ int blkdev_issue_discard(struct block_de ret = -EIO; bio_put(bio); } + return ret; -out_free_page: - __free_page(page); -out_free_bio: - bio_put(bio); -out: - return -ENOMEM; } EXPORT_SYMBOL(blkdev_issue_discard); Index: linux-2.6/drivers/scsi/sd.c =================================================================== --- linux-2.6.orig/drivers/scsi/sd.c 2010-06-18 11:33:37.322253892 +0200 +++ linux-2.6/drivers/scsi/sd.c 2010-06-18 16:50:04.790255778 +0200 @@ -411,22 +411,25 @@ static void sd_prot_op(struct scsi_cmnd } /** - * sd_prepare_discard - unmap blocks on thinly provisioned device + * scsi_setup_discard_cmnd - unmap blocks on thinly provisioned device + * @sdp: scsi device to operate one * @rq: Request to prepare * * Will issue either UNMAP or WRITE SAME(16) depending on preference * indicated by target device. **/ -static int sd_prepare_discard(struct request *rq) +static int scsi_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) { struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); struct bio *bio = rq->bio; sector_t sector = bio->bi_sector; - unsigned int num = bio_sectors(bio); + unsigned int nr_sectors = bio_sectors(bio); + unsigned int len; + struct page *page; if (sdkp->device->sector_size == 4096) { sector >>= 3; - num >>= 3; + nr_sectors >>= 3; } rq->cmd_type = REQ_TYPE_BLOCK_PC; @@ -434,31 +437,35 @@ static int sd_prepare_discard(struct req memset(rq->cmd, 0, rq->cmd_len); + page = alloc_page(GFP_ATOMIC | __GFP_ZERO); + if (!page) + return BLKPREP_DEFER; + if (sdkp->unmap) { - char *buf = kmap_atomic(bio_page(bio), KM_USER0); + char *buf = page_address(page); + rq->cmd_len = 10; rq->cmd[0] = UNMAP; rq->cmd[8] = 24; - rq->cmd_len = 10; - - /* Ensure that data length matches payload */ - rq->__data_len = bio->bi_size = bio->bi_io_vec->bv_len = 24; put_unaligned_be16(6 + 16, &buf[0]); put_unaligned_be16(16, &buf[2]); put_unaligned_be64(sector, &buf[8]); - put_unaligned_be32(num, &buf[16]); + put_unaligned_be32(nr_sectors, &buf[16]); - kunmap_atomic(buf, KM_USER0); + len = 24; } else { + rq->cmd_len = 16; rq->cmd[0] = WRITE_SAME_16; rq->cmd[1] = 0x8; /* UNMAP */ put_unaligned_be64(sector, &rq->cmd[2]); - put_unaligned_be32(num, &rq->cmd[10]); - rq->cmd_len = 16; + put_unaligned_be32(nr_sectors, &rq->cmd[10]); + + len = sdkp->device->sector_size; } - return BLKPREP_OK; + blk_add_request_payload(rq, page, len); + return scsi_setup_blk_pc_cmnd(sdp, rq); } /** @@ -485,10 +492,10 @@ static int sd_prep_fn(struct request_que * Discard request come in as REQ_TYPE_FS but we turn them into * block PC requests to make life easier. */ - if (rq->cmd_flags & REQ_DISCARD) - ret = sd_prepare_discard(rq); - - if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { + if (rq->cmd_flags & REQ_DISCARD) { + ret = scsi_setup_discard_cmnd(sdp, rq); + goto out; + } else if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { ret = scsi_setup_blk_pc_cmnd(sdp, rq); goto out; } else if (rq->cmd_type != REQ_TYPE_FS) { @@ -1163,6 +1170,15 @@ static int sd_done(struct scsi_cmnd *SCp int sense_valid = 0; int sense_deferred = 0; + /* + * If this is a discard request that originated from the kernel + * we need to free our payload here. Note that we need to check + * the request flag as the normal payload rules apply for + * pass-through UNMAP / WRITE SAME requests. + */ + if (SCpnt->request->cmd_flags & REQ_DISCARD) + __free_page(bio_page(SCpnt->request->bio)); + if (result) { sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr); if (sense_valid) Index: linux-2.6/block/blk-core.c =================================================================== --- linux-2.6.orig/block/blk-core.c 2010-06-18 16:43:23.711273168 +0200 +++ linux-2.6/block/blk-core.c 2010-06-18 16:48:17.392256056 +0200 @@ -1135,6 +1135,38 @@ void blk_put_request(struct request *req } EXPORT_SYMBOL(blk_put_request); +/** + * blk_add_request_payload - add a payload to a request + * @rq: request to update + * @page: page backing the payload + * @len: length of the payload. + * + * This allows to later add a payload to an already submitted request by + * a block driver. The driver needs to take care of freeing the payload + * itself. + * + * Note that this is a quite horrible hack and nothing but handling of + * discard requests should ever use it. + */ +void blk_add_request_payload(struct request *rq, struct page *page, + unsigned int len) +{ + struct bio *bio = rq->bio; + + bio->bi_io_vec->bv_page = page; + bio->bi_io_vec->bv_offset = 0; + bio->bi_io_vec->bv_len = len; + + bio->bi_size = len; + bio->bi_vcnt = 1; + bio->bi_phys_segments = 1; + + rq->__data_len = rq->resid_len = len; + rq->nr_phys_segments = 1; + rq->buffer = bio_data(bio); +} +EXPORT_SYMBOL_GPL(blk_add_request_payload); + void init_request_from_bio(struct request *req, struct bio *bio) { req->cpu = bio->bi_comp_cpu; Index: linux-2.6/include/linux/blkdev.h =================================================================== --- linux-2.6.orig/include/linux/blkdev.h 2010-06-18 16:45:29.223003718 +0200 +++ linux-2.6/include/linux/blkdev.h 2010-06-18 16:47:25.894005813 +0200 @@ -702,6 +702,8 @@ extern struct request *blk_make_request( gfp_t); extern void blk_insert_request(struct request_queue *, struct request *, int, void *); extern void blk_requeue_request(struct request_queue *, struct request *); +extern void blk_add_request_payload(struct request *rq, struct page *page, + unsigned int len); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH, RFC] block: don't allocate a payload for discard request 2010-06-18 14:59 [PATCH, RFC] block: don't allocate a payload for discard request Christoph Hellwig @ 2010-06-19 4:25 ` Mike Snitzer 2010-06-22 18:00 ` Mike Snitzer 0 siblings, 1 reply; 60+ messages in thread From: Mike Snitzer @ 2010-06-19 4:25 UTC (permalink / raw) To: Christoph Hellwig Cc: axboe, martin.petersen, James.Bottomley, linux-kernel, dm-devel On Fri, Jun 18 2010 at 10:59am -0400, Christoph Hellwig <hch@lst.de> wrote: > > Allocating a fixed payload for discard requests always was a horrible hack, > and it's not coming to byte us when adding support for discard in DM/MD. > > So change the code to leave the allocation of a payload to the lowlevel > driver. Unfortunately that means we'll need another hack, which allows > us to update the various block layer length fields indicating that we > have a payload. Instead of hiding this in sd.c, which we already partially > do for UNMAP support add a documented helper in the core block layer for it. > > Signed-off-by: Christoph Hellwig <hch@lst.de> Acked-by: Mike Snitzer <snitzer@redhat.com> I've built on your patch to successfully implement discard support for DM (this includes splitting discards that span targets; only the linear and striped DM targets are supported so far). The patchset is still a work in progress but is available here (against 2.6.34 at the moment): http://people.redhat.com/msnitzer/patches/dm-discard/latest/ ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH, RFC] block: don't allocate a payload for discard request 2010-06-19 4:25 ` Mike Snitzer @ 2010-06-22 18:00 ` Mike Snitzer 2010-06-26 19:56 ` [PATCH 1/2] block: fix leaks associated with discard request payload Mike Snitzer 2010-06-26 19:56 ` [PATCH 2/2] block: defer the use of inline biovecs for discard requests Mike Snitzer 0 siblings, 2 replies; 60+ messages in thread From: Mike Snitzer @ 2010-06-22 18:00 UTC (permalink / raw) To: Christoph Hellwig Cc: axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen On Sat, Jun 19 2010 at 12:25am -0400, Mike Snitzer <snitzer@redhat.com> wrote: > On Fri, Jun 18 2010 at 10:59am -0400, > Christoph Hellwig <hch@lst.de> wrote: > > > > > Allocating a fixed payload for discard requests always was a horrible hack, > > and it's not coming to byte us when adding support for discard in DM/MD. > > > > So change the code to leave the allocation of a payload to the lowlevel > > driver. Unfortunately that means we'll need another hack, which allows > > us to update the various block layer length fields indicating that we > > have a payload. Instead of hiding this in sd.c, which we already partially > > do for UNMAP support add a documented helper in the core block layer for it. > > > > Signed-off-by: Christoph Hellwig <hch@lst.de> > > Acked-by: Mike Snitzer <snitzer@redhat.com> > > I've built on your patch to successfully implement discard support for > DM (this includes splitting discards that span targets; only the linear > and striped DM targets are supported so far). I must retract my Acked-by because further testing has shown that this change results in OOM (on 2.6.34 with postmark benchmarking against ext4 w/ -o discard). The patch is _very_ desirable (simplifies DM's discard support, etc) but there is more work needed to get it stable. I'll continue tracking this OOM down. Mem-Info: Node 0 DMA per-cpu: CPU 0: hi: 0, btch: 1 usd: 0 CPU 1: hi: 0, btch: 1 usd: 0 Node 0 DMA32 per-cpu: CPU 0: hi: 186, btch: 31 usd: 91 CPU 1: hi: 186, btch: 31 usd: 92 active_anon:52 inactive_anon:65 isolated_anon:0 active_file:525 inactive_file:1275 isolated_file:0 unevictable:0 dirty:802 writeback:0 unstable:0 free:3411 slab_reclaimable:5386 slab_unreclaimable:7672 mapped:86 shmem:0 pagetables:576 bounce:0 Node 0 DMA free:8040kB min:40kB low:48kB high:60kB active_anon:0kB inactive_anon:0kB active_file:12kB inactive_file:0kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:15768kB mlocked:0kB dirty:0kB writeback:0kB mapped:8kB shmem:0kB slab_reclaimable:28kB slab_unreclaimable:16kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:22 all_unreclaimable? yes lowmem_reserve[]: 0 2003 2003 2003 Node 0 DMA32 free:5604kB min:5704kB low:7128kB high:8556kB active_anon:208kB inactive_anon:260kB active_file:2088kB inactive_file:4976kB unevictable:0kB isolated(anon):0kB isolated(file):128kB present:2052068kB mlocked:0kB dirty:3208kB writeback:0kB mapped:336kB shmem:0kB slab_reclaimable:21516kB slab_unreclaimable:30672kB kernel_stack:816kB pagetables:2304kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:608 all_unreclaimable? no lowmem_reserve[]: 0 0 0 0 Node 0 DMA: 4*4kB 3*8kB 1*16kB 2*32kB 2*64kB 3*128kB 3*256kB 3*512kB 3*1024kB 1*2048kB 0*4096kB = 8056kB Node 0 DMA32: 422*4kB 0*8kB 0*16kB 3*32kB 1*64kB 1*128kB 0*256kB 1*512kB 1*1024kB 1*2048kB 0*4096kB = 5560kB 1914 total pagecache pages 106 pages in swap cache Swap cache stats: add 499740, delete 499634, find 61395/208276 Free swap = 4177680kB Total swap = 4194300kB 524224 pages RAM 79718 pages reserved 1622 pages shared 438890 pages non-shared Out of memory: kill process 2672 (sshd) score 2515 or a child Killed process 3013 (sshd) vsz:97340kB, anon-rss:0kB, file-rss:4kB postmark used greatest stack depth: 2560 bytes left I identified one potential leak, on an error path, through code inspection but I still hit OOM even with the following patch: --- block/blk-core.c | 24 ++++++++++++++++++++++++ drivers/scsi/sd.c | 9 ++++++++- include/linux/blkdev.h | 1 + 3 files changed, 33 insertions(+), 1 deletion(-) Index: linux-2.6/drivers/scsi/sd.c =================================================================== --- linux-2.6.orig/drivers/scsi/sd.c +++ linux-2.6/drivers/scsi/sd.c @@ -424,6 +424,7 @@ static int scsi_setup_discard_cmnd(struc sector_t sector = bio->bi_sector; unsigned int nr_sectors = bio_sectors(bio); unsigned int len; + int ret; struct page *page; if (sdkp->device->sector_size == 4096) { @@ -464,7 +465,13 @@ static int scsi_setup_discard_cmnd(struc } blk_add_request_payload(rq, page, len); - return scsi_setup_blk_pc_cmnd(sdp, rq); + ret = scsi_setup_blk_pc_cmnd(sdp, rq); + if (ret != BLKPREP_OK) { + blk_clear_request_payload(rq); + __free_page(page); + } + + return ret; } /** Index: linux-2.6/block/blk-core.c =================================================================== --- linux-2.6.orig/block/blk-core.c +++ linux-2.6/block/blk-core.c @@ -1139,6 +1139,30 @@ void blk_add_request_payload(struct requ } EXPORT_SYMBOL_GPL(blk_add_request_payload); +/** + * blk_add_request_payload - add a payload to a request + * @rq: request to update + * + * This clears a request's payload. The driver needs to take care of + * freeing the payload itself. + */ +void blk_clear_request_payload(struct request *rq) +{ + struct bio *bio = rq->bio; + + rq->__data_len = rq->resid_len = 0; + rq->nr_phys_segments = 0; + rq->buffer = NULL; + + bio->bi_size = 0; + bio->bi_vcnt = 0; + bio->bi_phys_segments = 0; + + bio->bi_io_vec->bv_page = NULL; + bio->bi_io_vec->bv_len = 0; +} +EXPORT_SYMBOL_GPL(blk_clear_request_payload); + void init_request_from_bio(struct request *req, struct bio *bio) { req->cpu = bio->bi_comp_cpu; Index: linux-2.6/include/linux/blkdev.h =================================================================== --- linux-2.6.orig/include/linux/blkdev.h +++ linux-2.6/include/linux/blkdev.h @@ -779,6 +779,7 @@ extern void blk_insert_request(struct re extern void blk_requeue_request(struct request_queue *, struct request *); extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); +extern void blk_clear_request_payload(struct request *rq); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, ^ permalink raw reply [flat|nested] 60+ messages in thread
* [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-22 18:00 ` Mike Snitzer @ 2010-06-26 19:56 ` Mike Snitzer 2010-06-27 8:49 ` FUJITA Tomonori ` (2 more replies) 2010-06-26 19:56 ` [PATCH 2/2] block: defer the use of inline biovecs for discard requests Mike Snitzer 1 sibling, 3 replies; 60+ messages in thread From: Mike Snitzer @ 2010-06-26 19:56 UTC (permalink / raw) To: Christoph Hellwig Cc: axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm Fix leaks introduced via "block: don't allocate a payload for discard request" commit a1d949f5f44. sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup discard request's payload directly in scsi_finish_command(). Also cleanup page allocated for discard payload in scsi_setup_discard_cmnd's scsi_setup_blk_pc_cmnd error path. Signed-off-by: Mike Snitzer <snitzer@redhat.com> --- block/blk-core.c | 23 +++++++++++++++++++++++ drivers/scsi/scsi.c | 8 ++++++++ drivers/scsi/sd.c | 18 ++++++++---------- include/linux/blkdev.h | 1 + 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 98b4cee..07925aa 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1167,6 +1167,29 @@ void blk_add_request_payload(struct request *rq, struct page *page, } EXPORT_SYMBOL_GPL(blk_add_request_payload); +/** + * blk_clear_request_payload - clear a request's payload + * @rq: request to update + * + * The driver needs to take care of freeing the payload itself. + */ +void blk_clear_request_payload(struct request *rq) +{ + struct bio *bio = rq->bio; + + rq->__data_len = rq->resid_len = 0; + rq->nr_phys_segments = 0; + rq->buffer = NULL; + + bio->bi_size = 0; + bio->bi_vcnt = 0; + bio->bi_phys_segments = 0; + + bio->bi_io_vec->bv_page = NULL; + bio->bi_io_vec->bv_len = 0; +} +EXPORT_SYMBOL_GPL(blk_clear_request_payload); + void init_request_from_bio(struct request *req, struct bio *bio) { req->cpu = bio->bi_comp_cpu; diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index ad0ed21..69c7ea4 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -851,6 +851,14 @@ void scsi_finish_command(struct scsi_cmnd *cmd) */ if (good_bytes == old_good_bytes) good_bytes -= scsi_get_resid(cmd); + } else if (cmd->request->cmd_flags & REQ_DISCARD) { + /* + * If this is a discard request that originated from the kernel + * we need to free our payload here. Note that we need to check + * the request flag as the normal payload rules apply for + * pass-through UNMAP / WRITE SAME requests. + */ + __free_page(bio_page(cmd->request->bio)); } scsi_io_completion(cmd, good_bytes); } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 86da819..9b81dda 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -425,6 +425,7 @@ static int scsi_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) sector_t sector = bio->bi_sector; unsigned int nr_sectors = bio_sectors(bio); unsigned int len; + int ret; struct page *page; if (sdkp->device->sector_size == 4096) { @@ -465,7 +466,13 @@ static int scsi_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) } blk_add_request_payload(rq, page, len); - return scsi_setup_blk_pc_cmnd(sdp, rq); + ret = scsi_setup_blk_pc_cmnd(sdp, rq); + if (ret != BLKPREP_OK) { + blk_clear_request_payload(rq); + __free_page(page); + } + + return ret; } /** @@ -1170,15 +1177,6 @@ static int sd_done(struct scsi_cmnd *SCpnt) int sense_valid = 0; int sense_deferred = 0; - /* - * If this is a discard request that originated from the kernel - * we need to free our payload here. Note that we need to check - * the request flag as the normal payload rules apply for - * pass-through UNMAP / WRITE SAME requests. - */ - if (SCpnt->request->cmd_flags & REQ_DISCARD) - __free_page(bio_page(SCpnt->request->bio)); - if (result) { sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr); if (sense_valid) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 204fbe2..fdeef47 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -707,6 +707,7 @@ extern void blk_insert_request(struct request_queue *, struct request *, int, vo extern void blk_requeue_request(struct request_queue *, struct request *); extern void blk_add_request_payload(struct request *rq, struct page *page, unsigned int len); +extern void blk_clear_request_payload(struct request *rq); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, -- 1.6.5.rc2 ^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-26 19:56 ` [PATCH 1/2] block: fix leaks associated with discard request payload Mike Snitzer @ 2010-06-27 8:49 ` FUJITA Tomonori 2010-06-27 9:26 ` Christoph Hellwig 2010-06-27 9:38 ` Christoph Hellwig 2010-06-27 15:29 ` James Bottomley 2 siblings, 1 reply; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-27 8:49 UTC (permalink / raw) To: snitzer Cc: hch, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Sat, 26 Jun 2010 15:56:50 -0400 Mike Snitzer <snitzer@redhat.com> wrote: > Fix leaks introduced via "block: don't allocate a payload for discard > request" commit a1d949f5f44. > > sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup > discard request's payload directly in scsi_finish_command(). Instead of adding another discard hack to scsi_finish_command(), how about converting discard to REQ_TYPE_FS request? discard is FS request from the perspective of the block layer. It also fixes a problem that discard isn't retried in the case of UNIT ATTENTION. I think that we can get more cleaner code if we handle discard as normal (fs) request in the block layer (and scsi-ml). We need more changes but this patch is the first step. This patch depends on the patchset to convert flush request to REQ_TYPE_FS: http://marc.info/?l=linux-kernel&m=127730591527424&w=2 The git tree is also available: git://git.kernel.org/pub/scm/linux/kernel/git/tomo/linux-2.6-misc.git flush > Also cleanup page allocated for discard payload in > scsi_setup_discard_cmnd's scsi_setup_blk_pc_cmnd error path. This patch doesn't address on this (since it's a different problem). I think that scsi_setup_discard_cmnd() sees if it already allocates a page before allocating a page. That's similar to how other prep functions work, I think. = From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Subject: [PATCH] convert discard request to REQ_TYPE_FS instead of BLOCK_PC Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> --- block/blk-lib.c | 4 ++++ drivers/scsi/sd.c | 11 +---------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/block/blk-lib.c b/block/blk-lib.c index e16185b..9e15c46 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -20,6 +20,10 @@ static void blkdev_discard_end_io(struct bio *bio, int err) if (bio->bi_private) complete(bio->bi_private); + /* free the page that the lower layer allocated */ + if (bio_page(bio)) + __free_page(bio_page(bio)); + bio_put(bio); } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index d447726..1b182de 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -432,7 +432,7 @@ static int scsi_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) nr_sectors >>= 3; } - rq->cmd_type = REQ_TYPE_BLOCK_PC; + rq->cmd_type = REQ_TYPE_FS; rq->timeout = SD_TIMEOUT; memset(rq->cmd, 0, rq->cmd_len); @@ -1177,15 +1177,6 @@ static int sd_done(struct scsi_cmnd *SCpnt) int sense_valid = 0; int sense_deferred = 0; - /* - * If this is a discard request that originated from the kernel - * we need to free our payload here. Note that we need to check - * the request flag as the normal payload rules apply for - * pass-through UNMAP / WRITE SAME requests. - */ - if (SCpnt->request->cmd_flags & REQ_DISCARD) - __free_page(bio_page(SCpnt->request->bio)); - if (result) { sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr); if (sense_valid) -- 1.6.5 ^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 8:49 ` FUJITA Tomonori @ 2010-06-27 9:26 ` Christoph Hellwig 2010-06-27 10:01 ` FUJITA Tomonori 0 siblings, 1 reply; 60+ messages in thread From: Christoph Hellwig @ 2010-06-27 9:26 UTC (permalink / raw) To: FUJITA Tomonori Cc: snitzer, hch, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Sun, Jun 27, 2010 at 05:49:29PM +0900, FUJITA Tomonori wrote: > On Sat, 26 Jun 2010 15:56:50 -0400 > Mike Snitzer <snitzer@redhat.com> wrote: > > > Fix leaks introduced via "block: don't allocate a payload for discard > > request" commit a1d949f5f44. > > > > sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup > > discard request's payload directly in scsi_finish_command(). > > Instead of adding another discard hack to scsi_finish_command(), how > about converting discard to REQ_TYPE_FS request? discard is FS request > from the perspective of the block layer. It also fixes a problem that > discard isn't retried in the case of UNIT ATTENTION. > > I think that we can get more cleaner code if we handle discard as > normal (fs) request in the block layer (and scsi-ml). We need more > changes but this patch is the first step. Making discard a REQ_TYPE_FS inside scsi (it already is before entering sd_prep_fn) means we'll need to special case it all over the I/O submission and completion path. Having the payload length not matching the transfer length is something we don't expect for FS requests. > index e16185b..9e15c46 100644 > --- a/block/blk-lib.c > +++ b/block/blk-lib.c > @@ -20,6 +20,10 @@ static void blkdev_discard_end_io(struct bio *bio, int err) > if (bio->bi_private) > complete(bio->bi_private); > > + /* free the page that the lower layer allocated */ > + if (bio_page(bio)) > + __free_page(bio_page(bio)); > + This is exactly what this patchkit gets rid off. Having a payload page that the caller tracks (previously fully, with this patch only for freeing) makes DM's life a lot harder. Remember we don't actually store any payload in there before entering sd_prep_fn - it's just that the scsi commands implementing discards need some payload - either a sector sizes zero filled buffer for WRITE SAME, or an LBA/len encoding inside the payload for UNMAP. > - rq->cmd_type = REQ_TYPE_BLOCK_PC; > + rq->cmd_type = REQ_TYPE_FS; No need to set REQ_TYPE_FS here, it's already set up that way. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 9:26 ` Christoph Hellwig @ 2010-06-27 10:01 ` FUJITA Tomonori 2010-06-27 10:35 ` FUJITA Tomonori 0 siblings, 1 reply; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-27 10:01 UTC (permalink / raw) To: hch Cc: fujita.tomonori, snitzer, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Sun, 27 Jun 2010 11:26:52 +0200 Christoph Hellwig <hch@lst.de> wrote: > On Sun, Jun 27, 2010 at 05:49:29PM +0900, FUJITA Tomonori wrote: > > On Sat, 26 Jun 2010 15:56:50 -0400 > > Mike Snitzer <snitzer@redhat.com> wrote: > > > > > Fix leaks introduced via "block: don't allocate a payload for discard > > > request" commit a1d949f5f44. > > > > > > sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup > > > discard request's payload directly in scsi_finish_command(). > > > > Instead of adding another discard hack to scsi_finish_command(), how > > about converting discard to REQ_TYPE_FS request? discard is FS request > > from the perspective of the block layer. It also fixes a problem that > > discard isn't retried in the case of UNIT ATTENTION. > > > > I think that we can get more cleaner code if we handle discard as > > normal (fs) request in the block layer (and scsi-ml). We need more > > changes but this patch is the first step. > > Making discard a REQ_TYPE_FS inside scsi (it already is before entering > sd_prep_fn) means we'll need to special case it all over the I/O > submission and completion path. Having the payload length not matching Hmm, my patch doesn't add any special case in scsi submission and completion. sd_prep_fn already has a hack for discard to set bi->bi_size to rq->__data_size so scsi can tell the block layer to finish discard requests. Adding another special case for discard to scsi_io_completion() doesn't look good. About the block layer, we already have special case for discard everywhere (rq->cmd_flags & REQ_DISCARD). > the transfer length is something we don't expect for FS requests. Yeah, that's tricky. I'm not sure yet which is better; change how the block layer handles the transfer length or let the lower layer to add pages (as we do now). > > index e16185b..9e15c46 100644 > > --- a/block/blk-lib.c > > +++ b/block/blk-lib.c > > @@ -20,6 +20,10 @@ static void blkdev_discard_end_io(struct bio *bio, int err) > > if (bio->bi_private) > > complete(bio->bi_private); > > > > + /* free the page that the lower layer allocated */ > > + if (bio_page(bio)) > > + __free_page(bio_page(bio)); > > + > > This is exactly what this patchkit gets rid off. Having a payload > page that the caller tracks (previously fully, with this patch only for > freeing) makes DM's life a lot harder. Remember we don't actually store > any payload in there before entering sd_prep_fn - it's just that the > scsi commands implementing discards need some payload - either a sector > sizes zero filled buffer for WRITE SAME, or an LBA/len encoding inside > the payload for UNMAP. It's so bad if the block layer frees pages that the lower layer allocates? I thought it's ok if the block layer doesn't allocate. It's better if sd_done() frees a page? As my patch does, if we handle discard as FS in scsi-ml, sd_done() is called. > > - rq->cmd_type = REQ_TYPE_BLOCK_PC; > > + rq->cmd_type = REQ_TYPE_FS; > > No need to set REQ_TYPE_FS here, it's already set up that way. Oops, sure. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 10:01 ` FUJITA Tomonori @ 2010-06-27 10:35 ` FUJITA Tomonori 2010-06-27 11:07 ` Christoph Hellwig 0 siblings, 1 reply; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-27 10:35 UTC (permalink / raw) To: hch Cc: snitzer, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Sun, 27 Jun 2010 19:01:33 +0900 FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> wrote: > On Sun, 27 Jun 2010 11:26:52 +0200 > Christoph Hellwig <hch@lst.de> wrote: > > > On Sun, Jun 27, 2010 at 05:49:29PM +0900, FUJITA Tomonori wrote: > > > On Sat, 26 Jun 2010 15:56:50 -0400 > > > Mike Snitzer <snitzer@redhat.com> wrote: > > > > > > > Fix leaks introduced via "block: don't allocate a payload for discard > > > > request" commit a1d949f5f44. > > > > > > > > sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup > > > > discard request's payload directly in scsi_finish_command(). > > > > > > Instead of adding another discard hack to scsi_finish_command(), how > > > about converting discard to REQ_TYPE_FS request? discard is FS request > > > from the perspective of the block layer. It also fixes a problem that > > > discard isn't retried in the case of UNIT ATTENTION. > > > > > > I think that we can get more cleaner code if we handle discard as > > > normal (fs) request in the block layer (and scsi-ml). We need more > > > changes but this patch is the first step. > > > > Making discard a REQ_TYPE_FS inside scsi (it already is before entering > > sd_prep_fn) means we'll need to special case it all over the I/O > > submission and completion path. Having the payload length not matching > > Hmm, my patch doesn't add any special case in scsi submission and > completion. sd_prep_fn already has a hack for discard to set > bi->bi_size to rq->__data_size so scsi can tell the block layer to > finish discard requests. > > Adding another special case for discard to scsi_io_completion() > doesn't look good. > > About the block layer, we already have special case for discard > everywhere (rq->cmd_flags & REQ_DISCARD). > > > > the transfer length is something we don't expect for FS requests. > > Yeah, that's tricky. I'm not sure yet which is better; change how the > block layer handles the transfer length or let the lower layer to add > pages (as we do now). > > > > > index e16185b..9e15c46 100644 > > > --- a/block/blk-lib.c > > > +++ b/block/blk-lib.c > > > @@ -20,6 +20,10 @@ static void blkdev_discard_end_io(struct bio *bio, int err) > > > if (bio->bi_private) > > > complete(bio->bi_private); > > > > > > + /* free the page that the lower layer allocated */ > > > + if (bio_page(bio)) > > > + __free_page(bio_page(bio)); > > > + > > > > This is exactly what this patchkit gets rid off. Having a payload > > page that the caller tracks (previously fully, with this patch only for > > freeing) makes DM's life a lot harder. Remember we don't actually store > > any payload in there before entering sd_prep_fn - it's just that the > > scsi commands implementing discards need some payload - either a sector > > sizes zero filled buffer for WRITE SAME, or an LBA/len encoding inside > > the payload for UNMAP. > > It's so bad if the block layer frees pages that the lower layer > allocates? I thought it's ok if the block layer doesn't allocate. > > It's better if sd_done() frees a page? As my patch does, if we handle > discard as FS in scsi-ml, sd_done() is called. How about this? = From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> Subject: [PATCH] convert discard to REQ_TYPE_FS instead of REQ_TYPE_BLOCK_PC Fixes the two issues: - leak of pages that scsi_setup_discard_cmnd() allocates (because we don't call sd_done for pc requets). - discard requests aren't retried when possible (e.g. UNIT ATTENTION). Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> --- drivers/scsi/sd.c | 1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index d447726..056c8e1 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -432,7 +432,6 @@ static int scsi_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) nr_sectors >>= 3; } - rq->cmd_type = REQ_TYPE_BLOCK_PC; rq->timeout = SD_TIMEOUT; memset(rq->cmd, 0, rq->cmd_len); -- 1.6.5 ^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 10:35 ` FUJITA Tomonori @ 2010-06-27 11:07 ` Christoph Hellwig 2010-06-27 12:32 ` FUJITA Tomonori 0 siblings, 1 reply; 60+ messages in thread From: Christoph Hellwig @ 2010-06-27 11:07 UTC (permalink / raw) To: FUJITA Tomonori Cc: hch, snitzer, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi > How about this? As I tried to explain before this utterly confuses the I/O completion path. With the patch applied even a simple mkfs.xfs that issues discard just hangs. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 11:07 ` Christoph Hellwig @ 2010-06-27 12:32 ` FUJITA Tomonori 2010-06-27 14:16 ` Mike Snitzer ` (2 more replies) 0 siblings, 3 replies; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-27 12:32 UTC (permalink / raw) To: hch Cc: fujita.tomonori, snitzer, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Sun, 27 Jun 2010 13:07:12 +0200 Christoph Hellwig <hch@lst.de> wrote: > > How about this? > > As I tried to explain before this utterly confuses the I/O completion > path. With the patch applied even a simple mkfs.xfs that issues discard > just hangs. Wired. I just tried mkfs.xfs against scsi_debug with my block patches (I saw one discard command). Seemed that it worked fine. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 12:32 ` FUJITA Tomonori @ 2010-06-27 14:16 ` Mike Snitzer 2010-06-27 15:35 ` Christoph Hellwig 2010-06-27 15:33 ` Christoph Hellwig 2010-06-28 7:57 ` Christoph Hellwig 2 siblings, 1 reply; 60+ messages in thread From: Mike Snitzer @ 2010-06-27 14:16 UTC (permalink / raw) To: FUJITA Tomonori Cc: hch, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Sun, Jun 27 2010 at 8:32am -0400, FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> wrote: > On Sun, 27 Jun 2010 13:07:12 +0200 > Christoph Hellwig <hch@lst.de> wrote: > > > > How about this? > > > > As I tried to explain before this utterly confuses the I/O completion > > path. With the patch applied even a simple mkfs.xfs that issues discard > > just hangs. > > Wired. I just tried mkfs.xfs against scsi_debug with my block patches > (I saw one discard command). Seemed that it worked fine. My leak fixes have been tested extensively against all permuations of devices with discards (ATA trim, SCSI UNMAP, SCSI WRTIE SAME w/ unmap=1). I think we need to get Christoph's discard payload transformation complete by fixing the leaks _without_ trying to rework how discard commands are tagged, etc. E.g. fix what Jens already has staged in linux-2.6-block's 'for-next' and 'for-2.6.36'. With that sorted out we can then look at longer term changes to cleanup discard request processing. Regards, Mike ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 14:16 ` Mike Snitzer @ 2010-06-27 15:35 ` Christoph Hellwig 2010-06-27 16:23 ` FUJITA Tomonori 0 siblings, 1 reply; 60+ messages in thread From: Christoph Hellwig @ 2010-06-27 15:35 UTC (permalink / raw) To: Mike Snitzer Cc: FUJITA Tomonori, hch, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Sun, Jun 27, 2010 at 10:16:40AM -0400, Mike Snitzer wrote: > My leak fixes have been tested extensively against all permuations of > devices with discards (ATA trim, SCSI UNMAP, SCSI WRTIE SAME w/ unmap=1). > > I think we need to get Christoph's discard payload transformation > complete by fixing the leaks _without_ trying to rework how discard > commands are tagged, etc. E.g. fix what Jens already has staged in > linux-2.6-block's 'for-next' and 'for-2.6.36'. > > With that sorted out we can then look at longer term changes to cleanup > discard request processing. I tend to agree. I'll look into getting rid of treating discards as BLOCK_PC commands ontop of you patchset. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 15:35 ` Christoph Hellwig @ 2010-06-27 16:23 ` FUJITA Tomonori 0 siblings, 0 replies; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-27 16:23 UTC (permalink / raw) To: hch Cc: snitzer, fujita.tomonori, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Sun, 27 Jun 2010 17:35:45 +0200 Christoph Hellwig <hch@lst.de> wrote: > On Sun, Jun 27, 2010 at 10:16:40AM -0400, Mike Snitzer wrote: > > My leak fixes have been tested extensively against all permuations of > > devices with discards (ATA trim, SCSI UNMAP, SCSI WRTIE SAME w/ unmap=1). > > > > I think we need to get Christoph's discard payload transformation > > complete by fixing the leaks _without_ trying to rework how discard > > commands are tagged, etc. E.g. fix what Jens already has staged in > > linux-2.6-block's 'for-next' and 'for-2.6.36'. > > > > With that sorted out we can then look at longer term changes to cleanup > > discard request processing. > > I tend to agree. I'll look into getting rid of treating discards as > BLOCK_PC commands ontop of you patchset. Let's start from jens' 2.6.36 tree (or it would be better if Jens can drop Christoph's patch). We don't need to start on the top of new discard hacks (specially, the hack in scsi_finish_command looks terrible). We should not merge them into mainline. We are still in rc3. I'll see how things work with other configurations. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 12:32 ` FUJITA Tomonori 2010-06-27 14:16 ` Mike Snitzer @ 2010-06-27 15:33 ` Christoph Hellwig 2010-06-28 7:57 ` Christoph Hellwig 2 siblings, 0 replies; 60+ messages in thread From: Christoph Hellwig @ 2010-06-27 15:33 UTC (permalink / raw) To: FUJITA Tomonori Cc: hch, snitzer, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Sun, Jun 27, 2010 at 09:32:07PM +0900, FUJITA Tomonori wrote: > On Sun, 27 Jun 2010 13:07:12 +0200 > Christoph Hellwig <hch@lst.de> wrote: > > > > How about this? > > > > As I tried to explain before this utterly confuses the I/O completion > > path. With the patch applied even a simple mkfs.xfs that issues discard > > just hangs. > > Wired. I just tried mkfs.xfs against scsi_debug with my block patches > (I saw one discard command). Seemed that it worked fine. Doesn't work for me against either a real SSD or WRITE SAME support in qemu. But I think I didn't actually have your barrier patches applied, so I'll try again with a clean tree. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 12:32 ` FUJITA Tomonori 2010-06-27 14:16 ` Mike Snitzer 2010-06-27 15:33 ` Christoph Hellwig @ 2010-06-28 7:57 ` Christoph Hellwig 2010-06-28 8:14 ` FUJITA Tomonori 2 siblings, 1 reply; 60+ messages in thread From: Christoph Hellwig @ 2010-06-28 7:57 UTC (permalink / raw) To: FUJITA Tomonori Cc: hch, snitzer, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Sun, Jun 27, 2010 at 09:32:07PM +0900, FUJITA Tomonori wrote: > On Sun, 27 Jun 2010 13:07:12 +0200 > Christoph Hellwig <hch@lst.de> wrote: > > > > How about this? > > > > As I tried to explain before this utterly confuses the I/O completion > > path. With the patch applied even a simple mkfs.xfs that issues discard > > just hangs. > > Wired. I just tried mkfs.xfs against scsi_debug with my block patches > (I saw one discard command). Seemed that it worked fine. I've tracked it down to the call to scsi_requeue_command in scsi_end_request. When the command is marked BLOCK_PC we'll just get it back as such in ->prep_fn next time, but now it's reverting to the previous state. While I see the problems with leaking ressources in that case I still can't quite explain the hang I see. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-28 7:57 ` Christoph Hellwig @ 2010-06-28 8:14 ` FUJITA Tomonori 2010-06-28 8:18 ` Jens Axboe 2010-06-28 15:25 ` Christoph Hellwig 0 siblings, 2 replies; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-28 8:14 UTC (permalink / raw) To: hch Cc: fujita.tomonori, snitzer, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Mon, 28 Jun 2010 09:57:38 +0200 Christoph Hellwig <hch@lst.de> wrote: > On Sun, Jun 27, 2010 at 09:32:07PM +0900, FUJITA Tomonori wrote: > > On Sun, 27 Jun 2010 13:07:12 +0200 > > Christoph Hellwig <hch@lst.de> wrote: > > > > > > How about this? > > > > > > As I tried to explain before this utterly confuses the I/O completion > > > path. With the patch applied even a simple mkfs.xfs that issues discard > > > just hangs. > > > > Wired. I just tried mkfs.xfs against scsi_debug with my block patches > > (I saw one discard command). Seemed that it worked fine. > > I've tracked it down to the call to scsi_requeue_command in scsi_end_request. > When the command is marked BLOCK_PC we'll just get it back as such in > ->prep_fn next time, but now it's reverting to the previous state. If scsi_end_request() calls scsi_requeue_command(), the command has a left over (i.e. hasn't finished all the data), right? You hit such condition with discard commands? BLOCK_PC requests don't hit this case since blk_end_request() always return false for PC. > While I see the problems with leaking ressources in that case I still > can't quite explain the hang I see. Any way to reproduce the hang without ssd drives? ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-28 8:14 ` FUJITA Tomonori @ 2010-06-28 8:18 ` Jens Axboe 2010-06-28 8:45 ` FUJITA Tomonori 2010-06-28 15:25 ` Christoph Hellwig 1 sibling, 1 reply; 60+ messages in thread From: Jens Axboe @ 2010-06-28 8:18 UTC (permalink / raw) To: FUJITA Tomonori Cc: hch, snitzer, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On 2010-06-28 10:14, FUJITA Tomonori wrote: > On Mon, 28 Jun 2010 09:57:38 +0200 > Christoph Hellwig <hch@lst.de> wrote: > >> On Sun, Jun 27, 2010 at 09:32:07PM +0900, FUJITA Tomonori wrote: >>> On Sun, 27 Jun 2010 13:07:12 +0200 >>> Christoph Hellwig <hch@lst.de> wrote: >>> >>>>> How about this? >>>> >>>> As I tried to explain before this utterly confuses the I/O completion >>>> path. With the patch applied even a simple mkfs.xfs that issues discard >>>> just hangs. >>> >>> Wired. I just tried mkfs.xfs against scsi_debug with my block patches >>> (I saw one discard command). Seemed that it worked fine. >> >> I've tracked it down to the call to scsi_requeue_command in scsi_end_request. >> When the command is marked BLOCK_PC we'll just get it back as such in >> ->prep_fn next time, but now it's reverting to the previous state. > > If scsi_end_request() calls scsi_requeue_command(), the command has a > left over (i.e. hasn't finished all the data), right? You hit such > condition with discard commands? > > BLOCK_PC requests don't hit this case since blk_end_request() always > return false for PC. You can get requeues on the ->queuecommand() path as well, for a variety of reasons, and that would be what Christoph is hitting. -- Jens Axboe ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-28 8:18 ` Jens Axboe @ 2010-06-28 8:45 ` FUJITA Tomonori 0 siblings, 0 replies; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-28 8:45 UTC (permalink / raw) To: axboe Cc: fujita.tomonori, hch, snitzer, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Mon, 28 Jun 2010 10:18:48 +0200 Jens Axboe <axboe@kernel.dk> wrote: > On 2010-06-28 10:14, FUJITA Tomonori wrote: > > On Mon, 28 Jun 2010 09:57:38 +0200 > > Christoph Hellwig <hch@lst.de> wrote: > > > >> On Sun, Jun 27, 2010 at 09:32:07PM +0900, FUJITA Tomonori wrote: > >>> On Sun, 27 Jun 2010 13:07:12 +0200 > >>> Christoph Hellwig <hch@lst.de> wrote: > >>> > >>>>> How about this? > >>>> > >>>> As I tried to explain before this utterly confuses the I/O completion > >>>> path. With the patch applied even a simple mkfs.xfs that issues discard > >>>> just hangs. > >>> > >>> Wired. I just tried mkfs.xfs against scsi_debug with my block patches > >>> (I saw one discard command). Seemed that it worked fine. > >> > >> I've tracked it down to the call to scsi_requeue_command in scsi_end_request. > >> When the command is marked BLOCK_PC we'll just get it back as such in > >> ->prep_fn next time, but now it's reverting to the previous state. > > > > If scsi_end_request() calls scsi_requeue_command(), the command has a > > left over (i.e. hasn't finished all the data), right? You hit such > > condition with discard commands? > > > > BLOCK_PC requests don't hit this case since blk_end_request() always > > return false for PC. > > You can get requeues on the ->queuecommand() path as well, for a > variety of reasons, and that would be what Christoph is hitting. Probably, that's would be fine (we need to fix memory leak in that path). I guess that requeue with the partial completion commands might cause problems. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload @ 2010-06-28 8:45 ` FUJITA Tomonori 0 siblings, 0 replies; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-28 8:45 UTC (permalink / raw) To: axboe Cc: fujita.tomonori, hch, snitzer, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Mon, 28 Jun 2010 10:18:48 +0200 Jens Axboe <axboe@kernel.dk> wrote: > On 2010-06-28 10:14, FUJITA Tomonori wrote: > > On Mon, 28 Jun 2010 09:57:38 +0200 > > Christoph Hellwig <hch@lst.de> wrote: > > > >> On Sun, Jun 27, 2010 at 09:32:07PM +0900, FUJITA Tomonori wrote: > >>> On Sun, 27 Jun 2010 13:07:12 +0200 > >>> Christoph Hellwig <hch@lst.de> wrote: > >>> > >>>>> How about this? > >>>> > >>>> As I tried to explain before this utterly confuses the I/O completion > >>>> path. With the patch applied even a simple mkfs.xfs that issues discard > >>>> just hangs. > >>> > >>> Wired. I just tried mkfs.xfs against scsi_debug with my block patches > >>> (I saw one discard command). Seemed that it worked fine. > >> > >> I've tracked it down to the call to scsi_requeue_command in scsi_end_request. > >> When the command is marked BLOCK_PC we'll just get it back as such in > >> ->prep_fn next time, but now it's reverting to the previous state. > > > > If scsi_end_request() calls scsi_requeue_command(), the command has a > > left over (i.e. hasn't finished all the data), right? You hit such > > condition with discard commands? > > > > BLOCK_PC requests don't hit this case since blk_end_request() always > > return false for PC. > > You can get requeues on the ->queuecommand() path as well, for a > variety of reasons, and that would be what Christoph is hitting. Probably, that's would be fine (we need to fix memory leak in that path). I guess that requeue with the partial completion commands might cause problems. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-28 8:14 ` FUJITA Tomonori 2010-06-28 8:18 ` Jens Axboe @ 2010-06-28 15:25 ` Christoph Hellwig 2010-06-30 11:55 ` FUJITA Tomonori 1 sibling, 1 reply; 60+ messages in thread From: Christoph Hellwig @ 2010-06-28 15:25 UTC (permalink / raw) To: FUJITA Tomonori Cc: hch, snitzer, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Mon, Jun 28, 2010 at 05:14:28PM +0900, FUJITA Tomonori wrote: > > While I see the problems with leaking ressources in that case I still > > can't quite explain the hang I see. > > Any way to reproduce the hang without ssd drives? Actually the SSDs don't fully hang, they just causes lots of I/O errors and hit the error handler hard. The hard hang is when running under qemu. Apply the patch below, then create an if=scsi drive that resides on an XFS filesystem, and you'll have scsi TP support in the guest: Index: qemu/hw/scsi-disk.c =================================================================== --- qemu.orig/hw/scsi-disk.c 2010-06-26 14:40:42.580004436 +0200 +++ qemu/hw/scsi-disk.c 2010-06-26 14:59:20.338020011 +0200 @@ -397,6 +397,7 @@ static int scsi_disk_emulate_inquiry(SCS } case 0xb0: /* block device characteristics */ { + static const int trim_sectors = (2 * 1024 * 1024) / 512; unsigned int min_io_size = s->qdev.conf.min_io_size / s->qdev.blocksize; unsigned int opt_io_size = @@ -416,6 +417,12 @@ static int scsi_disk_emulate_inquiry(SCS outbuf[13] = (opt_io_size >> 16) & 0xff; outbuf[14] = (opt_io_size >> 8) & 0xff; outbuf[15] = opt_io_size & 0xff; + + /* optimal unmap granularity */ + outbuf[28] = (trim_sectors >> 24) & 0xff; + outbuf[29] = (trim_sectors >> 16) & 0xff; + outbuf[30] = (trim_sectors >> 8) & 0xff; + outbuf[31] = trim_sectors & 0xff; break; } default: @@ -824,6 +831,11 @@ static int scsi_disk_emulate_command(SCS outbuf[11] = 0; outbuf[12] = 0; outbuf[13] = get_physical_block_exp(&s->qdev.conf); + + /* set TPE and TPRZ bits if the format supports discard */ + if (bdrv_is_discardable(s->bs)) + outbuf[14] = 0x80 | 0x40; + /* Protection, exponent and lowest lba field left blank. */ buflen = req->cmd.xfer; break; @@ -988,6 +1000,28 @@ static int32_t scsi_send_command(SCSIDev r->sector_count = len * s->cluster_size; is_write = 1; break; + case WRITE_SAME_16: +// printf("WRITE SAME(16) (sector %" PRId64 ", count %d)\n", lba, len); + +// if (lba + len > s->max_lba) + if (lba > s->max_lba) + goto illegal_lba; // check the condition code + /* + * We only support WRITE SAME with the unmap bit set for now. + */ + if (!(buf[1] & 0x8)) + goto fail; + + rc = bdrv_discard(s->bs, lba * s->cluster_size, len * s->cluster_size); + if (rc < 0) { + /* XXX: better error code ?*/ + goto fail; + } + + scsi_req_set_status(&r->req, GOOD, NO_SENSE); + scsi_req_complete(&r->req); + scsi_remove_request(r); + return 0; default: DPRINTF("Unknown SCSI command (%2.2x)\n", buf[0]); fail: Index: qemu/hw/scsi-defs.h =================================================================== --- qemu.orig/hw/scsi-defs.h 2010-06-26 14:40:42.589025947 +0200 +++ qemu/hw/scsi-defs.h 2010-06-26 14:40:43.820253983 +0200 @@ -84,6 +84,7 @@ #define MODE_SENSE_10 0x5a #define PERSISTENT_RESERVE_IN 0x5e #define PERSISTENT_RESERVE_OUT 0x5f +#define WRITE_SAME_16 0x93 #define MAINTENANCE_IN 0xa3 #define MAINTENANCE_OUT 0xa4 #define MOVE_MEDIUM 0xa5 Index: qemu/block.c =================================================================== --- qemu.orig/block.c 2010-06-26 14:40:42.597004296 +0200 +++ qemu/block.c 2010-06-26 14:40:43.824253703 +0200 @@ -1286,6 +1286,11 @@ int bdrv_is_sg(BlockDriverState *bs) return bs->sg; } +int bdrv_is_discardable(BlockDriverState *bs) +{ + return bs->discardable; +} + int bdrv_enable_write_cache(BlockDriverState *bs) { return bs->enable_write_cache; @@ -1431,6 +1436,13 @@ int bdrv_has_zero_init(BlockDriverState return 1; } +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + if (!bs->drv || !bs->drv->bdrv_discard) + return -ENOTSUP; + return bs->drv->bdrv_discard(bs, sector_num, nb_sectors); +} + /* * Returns true iff the specified sector is present in the disk image. Drivers * not implementing the functionality are assumed to not support backing files, Index: qemu/block.h =================================================================== --- qemu.orig/block.h 2010-06-26 14:40:42.606004157 +0200 +++ qemu/block.h 2010-06-26 14:40:43.831254122 +0200 @@ -135,6 +135,7 @@ void bdrv_flush(BlockDriverState *bs); void bdrv_flush_all(void); void bdrv_close_all(void); +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); int bdrv_has_zero_init(BlockDriverState *bs); int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum); @@ -162,6 +163,7 @@ BlockErrorAction bdrv_get_on_error(Block int bdrv_is_removable(BlockDriverState *bs); int bdrv_is_read_only(BlockDriverState *bs); int bdrv_is_sg(BlockDriverState *bs); +int bdrv_is_discardable(BlockDriverState *bs); int bdrv_enable_write_cache(BlockDriverState *bs); int bdrv_is_inserted(BlockDriverState *bs); int bdrv_media_changed(BlockDriverState *bs); Index: qemu/block/raw-posix.c =================================================================== --- qemu.orig/block/raw-posix.c 2010-06-26 14:40:42.614025947 +0200 +++ qemu/block/raw-posix.c 2010-06-26 14:42:33.449255585 +0200 @@ -68,6 +68,10 @@ #include <sys/diskslice.h> #endif +#ifdef CONFIG_XFS +#include <xfs/xfs.h> +#endif + //#define DEBUG_FLOPPY //#define DEBUG_BLOCK @@ -117,6 +121,9 @@ typedef struct BDRVRawState { int use_aio; void *aio_ctx; #endif +#ifdef CONFIG_XFS + int is_xfs : 1; +#endif uint8_t* aligned_buf; } BDRVRawState; @@ -189,6 +196,13 @@ static int raw_open_common(BlockDriverSt #endif } +#ifdef CONFIG_XFS + if (platform_test_xfs_fd(s->fd)) { + s->is_xfs = 1; + bs->discardable = 1; + } +#endif + return 0; out_free_buf: @@ -731,6 +745,36 @@ static void raw_flush(BlockDriverState * qemu_fdatasync(s->fd); } +#ifdef CONFIG_XFS +static int xfs_discard(BDRVRawState *s, int64_t sector_num, int nb_sectors) +{ + struct xfs_flock64 fl; + + memset(&fl, 0, sizeof(fl)); + fl.l_whence = SEEK_SET; + fl.l_start = sector_num << 9; + fl.l_len = (int64_t)nb_sectors << 9; + + if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { + printf("cannot punch hole (%s)\n", strerror(errno)); + return -errno; + } + + return 0; +} +#endif + +static int raw_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + BDRVRawState *s = bs->opaque; + +#ifdef CONFIG_XFS + if (s->is_xfs) + return xfs_discard(s, sector_num, nb_sectors); +#endif + + return -EOPNOTSUPP; +} static QEMUOptionParameter raw_create_options[] = { { @@ -752,6 +796,7 @@ static BlockDriver bdrv_file = { .bdrv_close = raw_close, .bdrv_create = raw_create, .bdrv_flush = raw_flush, + .bdrv_discard = raw_discard, .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, Index: qemu/block_int.h =================================================================== --- qemu.orig/block_int.h 2010-06-26 14:40:42.636003738 +0200 +++ qemu/block_int.h 2010-06-26 14:40:43.843033002 +0200 @@ -73,6 +73,8 @@ struct BlockDriver { BlockDriverCompletionFunc *cb, void *opaque); BlockDriverAIOCB *(*bdrv_aio_flush)(BlockDriverState *bs, BlockDriverCompletionFunc *cb, void *opaque); + int (*bdrv_discard)(BlockDriverState *bs, int64_t sector_num, + int nb_sectors); int (*bdrv_aio_multiwrite)(BlockDriverState *bs, BlockRequest *reqs, int num_reqs); @@ -141,6 +143,7 @@ struct BlockDriverState { int encrypted; /* if true, the media is encrypted */ int valid_key; /* if true, a valid encryption key has been set */ int sg; /* if true, the device is a /dev/sg* */ + int discardable;/* if true the device supports TRIM/UNMAP */ /* event callback when inserting/removing */ void (*change_cb)(void *opaque); void *change_opaque; Index: qemu/configure =================================================================== --- qemu.orig/configure 2010-06-26 14:40:42.644003947 +0200 +++ qemu/configure 2010-06-26 14:40:43.850005973 +0200 @@ -272,6 +272,7 @@ xen="" linux_aio="" attr="" vhost_net="" +xfs="" gprof="no" debug_tcg="no" @@ -1284,6 +1285,27 @@ EOF fi ########################################## +# xfsctl() probe, used for raw-posix +if test "$xfs" != "no" ; then + cat > $TMPC << EOF +#include <xfs/xfs.h> +int main(void) +{ + xfsctl(NULL, 0, 0, NULL); + return 0; +} +EOF + if compile_prog "" "" ; then + xfs="yes" + else + if test "$xfs" = "yes" ; then + feature_not_found "uuid" + fi + xfs=no + fi +fi + +########################################## # vde libraries probe if test "$vde" != "no" ; then vde_libs="-lvdeplug" @@ -2115,6 +2137,7 @@ echo "preadv support $preadv" echo "fdatasync $fdatasync" echo "uuid support $uuid" echo "vhost-net support $vhost_net" +echo "xfsctl support $xfs" if test $sdl_too_old = "yes"; then echo "-> Your SDL version is too old - please upgrade to have SDL support" @@ -2235,6 +2258,9 @@ fi if test "$uuid" = "yes" ; then echo "CONFIG_UUID=y" >> $config_host_mak fi +if test "$xfs" = "yes" ; then + echo "CONFIG_XFS=y" >> $config_host_mak +fi qemu_version=`head $source_path/VERSION` echo "VERSION=$qemu_version" >>$config_host_mak echo "PKGVERSION=$pkgversion" >>$config_host_mak Index: qemu/block/raw.c =================================================================== --- qemu.orig/block/raw.c 2010-06-26 14:40:42.628004296 +0200 +++ qemu/block/raw.c 2010-06-26 14:40:43.852014913 +0200 @@ -6,6 +6,7 @@ static int raw_open(BlockDriverState *bs, int flags) { bs->sg = bs->file->sg; + bs->discardable = bs->file->discardable; return 0; } @@ -65,6 +66,11 @@ static int raw_probe(const uint8_t *buf, return 1; /* everything can be opened as raw image */ } +static int raw_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + return bdrv_discard(bs->file, sector_num, nb_sectors); +} + static int raw_is_inserted(BlockDriverState *bs) { return bdrv_is_inserted(bs->file); @@ -125,6 +131,7 @@ static BlockDriver bdrv_raw = { .bdrv_aio_readv = raw_aio_readv, .bdrv_aio_writev = raw_aio_writev, .bdrv_aio_flush = raw_aio_flush, + .bdrv_discard = raw_discard, .bdrv_is_inserted = raw_is_inserted, .bdrv_eject = raw_eject, ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-28 15:25 ` Christoph Hellwig @ 2010-06-30 11:55 ` FUJITA Tomonori 2010-07-01 4:21 ` FUJITA Tomonori 0 siblings, 1 reply; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-30 11:55 UTC (permalink / raw) To: hch Cc: fujita.tomonori, snitzer, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Mon, 28 Jun 2010 17:25:36 +0200 Christoph Hellwig <hch@lst.de> wrote: > On Mon, Jun 28, 2010 at 05:14:28PM +0900, FUJITA Tomonori wrote: > > > While I see the problems with leaking ressources in that case I still > > > can't quite explain the hang I see. > > > > Any way to reproduce the hang without ssd drives? > > Actually the SSDs don't fully hang, they just causes lots of I/O errors > and hit the error handler hard. The hard hang is when running under > qemu. Apply the patch below, then create an if=scsi drive that resides > on an XFS filesystem, and you'll have scsi TP support in the guest: Ok, I figured out what's wrong. As I suspected, it's due to the partial completion. qemu scsi driver tells that the WRITE_SAME command was successful but somehow the command has resid. So we retry it again and again (and leak some memory). I don't know yet why qemu scsi driver is broken. Maybe there is a bug in it or converting discard to FS sends broken commands to the driver. I'll try to figure out it tomorrow. I've put a patch to complete discard command in the all-or-nothing manner: git://git.kernel.org/pub/scm/linux/kernel/git/tomo/linux-2.6-misc.git discard At least, the guest kernel doesn't hang for me. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-30 11:55 ` FUJITA Tomonori @ 2010-07-01 4:21 ` FUJITA Tomonori 0 siblings, 0 replies; 60+ messages in thread From: FUJITA Tomonori @ 2010-07-01 4:21 UTC (permalink / raw) To: hch Cc: snitzer, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm, linux-scsi On Wed, 30 Jun 2010 20:55:09 +0900 FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> wrote: > On Mon, 28 Jun 2010 17:25:36 +0200 > Christoph Hellwig <hch@lst.de> wrote: > > > On Mon, Jun 28, 2010 at 05:14:28PM +0900, FUJITA Tomonori wrote: > > > > While I see the problems with leaking ressources in that case I still > > > > can't quite explain the hang I see. > > > > > > Any way to reproduce the hang without ssd drives? > > > > Actually the SSDs don't fully hang, they just causes lots of I/O errors > > and hit the error handler hard. The hard hang is when running under > > qemu. Apply the patch below, then create an if=scsi drive that resides > > on an XFS filesystem, and you'll have scsi TP support in the guest: > > Ok, I figured out what's wrong. > > As I suspected, it's due to the partial completion. > > qemu scsi driver tells that the WRITE_SAME command was successful but > somehow the command has resid. So we retry it again and again (and > leak some memory). > > I don't know yet why qemu scsi driver is broken. Maybe there is a bug > in it or converting discard to FS sends broken commands to the driver. looks like your qemu WRITE_SAME patch isn't completed :) You implement WRITE_SAME as if it doesn't do any data transfer. So qemu scsi driver gets resid. The reason why WRITE_SAME works now is that scsi-ml doesn't care about resid with PC commands but it cares with FS commands. I confirmed that qemu scsi driver gets the identical command with both PC and FS commands and qemu calls xfsctl. > I've put a patch to complete discard command in the all-or-nothing > manner: > > git://git.kernel.org/pub/scm/linux/kernel/git/tomo/linux-2.6-misc.git discard Seems that I finished discard FS conversion. I'll update it on the top of James' uprep patchset soon. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-26 19:56 ` [PATCH 1/2] block: fix leaks associated with discard request payload Mike Snitzer 2010-06-27 8:49 ` FUJITA Tomonori @ 2010-06-27 9:38 ` Christoph Hellwig 2010-06-27 15:29 ` James Bottomley 2 siblings, 0 replies; 60+ messages in thread From: Christoph Hellwig @ 2010-06-27 9:38 UTC (permalink / raw) To: Mike Snitzer Cc: Christoph Hellwig, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Sat, Jun 26, 2010 at 03:56:50PM -0400, Mike Snitzer wrote: > Fix leaks introduced via "block: don't allocate a payload for discard > request" commit a1d949f5f44. > > sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup > discard request's payload directly in scsi_finish_command(). > > Also cleanup page allocated for discard payload in > scsi_setup_discard_cmnd's scsi_setup_blk_pc_cmnd error path. This looks good - I've tested an equivalent patch doing xfstests runs for more than 24hours now without seeing leaks. On the long run I'd prefer not having the knowledge of the payload freeing inside the core scsi code, but that requires either calling ->done for block pc requests, or treating discards as fs requests all the way, which will require much larger changes. So for now: Reviewed-by: Christoph Hellwig <hch@lst.de> ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-26 19:56 ` [PATCH 1/2] block: fix leaks associated with discard request payload Mike Snitzer 2010-06-27 8:49 ` FUJITA Tomonori 2010-06-27 9:38 ` Christoph Hellwig @ 2010-06-27 15:29 ` James Bottomley 2010-06-28 17:16 ` Martin K. Petersen ` (2 more replies) 2 siblings, 3 replies; 60+ messages in thread From: James Bottomley @ 2010-06-27 15:29 UTC (permalink / raw) To: Mike Snitzer Cc: Christoph Hellwig, axboe, dm-devel, linux-kernel, martin.petersen, akpm, linux-scsi, FUJITA Tomonori linux-scsi cc added, since it's a SCSI patch. On Sat, 2010-06-26 at 15:56 -0400, Mike Snitzer wrote: > Fix leaks introduced via "block: don't allocate a payload for discard > request" commit a1d949f5f44. > > sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup > discard request's payload directly in scsi_finish_command(). > > Also cleanup page allocated for discard payload in > scsi_setup_discard_cmnd's scsi_setup_blk_pc_cmnd error path. > > Signed-off-by: Mike Snitzer <snitzer@redhat.com> > --- > block/blk-core.c | 23 +++++++++++++++++++++++ > drivers/scsi/scsi.c | 8 ++++++++ > drivers/scsi/sd.c | 18 ++++++++---------- > include/linux/blkdev.h | 1 + > 4 files changed, 40 insertions(+), 10 deletions(-) > > diff --git a/block/blk-core.c b/block/blk-core.c > index 98b4cee..07925aa 100644 > --- a/block/blk-core.c > +++ b/block/blk-core.c > @@ -1167,6 +1167,29 @@ void blk_add_request_payload(struct request *rq, struct page *page, > } > EXPORT_SYMBOL_GPL(blk_add_request_payload); > > +/** > + * blk_clear_request_payload - clear a request's payload > + * @rq: request to update > + * > + * The driver needs to take care of freeing the payload itself. > + */ > +void blk_clear_request_payload(struct request *rq) > +{ > + struct bio *bio = rq->bio; > + > + rq->__data_len = rq->resid_len = 0; > + rq->nr_phys_segments = 0; > + rq->buffer = NULL; > + > + bio->bi_size = 0; > + bio->bi_vcnt = 0; > + bio->bi_phys_segments = 0; > + > + bio->bi_io_vec->bv_page = NULL; > + bio->bi_io_vec->bv_len = 0; > +} > +EXPORT_SYMBOL_GPL(blk_clear_request_payload); > + > void init_request_from_bio(struct request *req, struct bio *bio) > { > req->cpu = bio->bi_comp_cpu; > diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c > index ad0ed21..69c7ea4 100644 > --- a/drivers/scsi/scsi.c > +++ b/drivers/scsi/scsi.c > @@ -851,6 +851,14 @@ void scsi_finish_command(struct scsi_cmnd *cmd) > */ > if (good_bytes == old_good_bytes) > good_bytes -= scsi_get_resid(cmd); > + } else if (cmd->request->cmd_flags & REQ_DISCARD) { > + /* > + * If this is a discard request that originated from the kernel > + * we need to free our payload here. Note that we need to check > + * the request flag as the normal payload rules apply for > + * pass-through UNMAP / WRITE SAME requests. > + */ > + __free_page(bio_page(cmd->request->bio)); This is another layering violation: the page is allocated in the Upper layer and freed in the mid-layer. I really hate these growing contortions for discard. They're a clear signal that we haven't implemented it right. So let's first work out how it should be done. I really like Tomo's idea of doing discard through the normal REQ_TYPE_FS route, which means we can control the setup in prep and the tear down in done, all confined to the ULD. Where I think I'm at is partially what Christoph says: The command transformation belongs in the ULD so that's where the allocation and deallocation should be, and partly what Tomo says in that we should eliminate the special case paths. The payload vs actual request size should be a red herring if we've got everything correct: only the ULD cares about the request parameters. Once we've got everything set up, the mid layer and LLD should only care about the parameters in the command, so we can confine the size changing part to the ULD doing the discard. Could someone take a stab at this and see if it works without layering violations or any other problematic signals? Thanks, James ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 15:29 ` James Bottomley @ 2010-06-28 17:16 ` Martin K. Petersen 2010-06-29 8:00 ` Boaz Harrosh 2010-06-29 22:28 ` [dm-devel] " Mikulas Patocka 2010-06-30 8:32 ` Boaz Harrosh 2 siblings, 1 reply; 60+ messages in thread From: Martin K. Petersen @ 2010-06-28 17:16 UTC (permalink / raw) To: James Bottomley Cc: Mike Snitzer, Christoph Hellwig, axboe, dm-devel, linux-kernel, martin.petersen, akpm, linux-scsi, FUJITA Tomonori >>>>> "James" == James Bottomley <James.Bottomley@suse.de> writes: James> I really hate these growing contortions for discard. They're a James> clear signal that we haven't implemented it right. James> So let's first work out how it should be done. I really like James> Tomo's idea of doing discard through the normal REQ_TYPE_FS James> route, which means we can control the setup in prep and the tear James> down in done, all confined to the ULD. Yeah, this is what I was trying to do a couple of months ago. Trying to make discard and write same filesystem class requests so we can split, merge, etc. like READs and WRITEs. I still think this is how we should do it but it's a lot of work. There are several challenges involved. I was doing the "payload" allocation at request allocation time by permitting a buffer trailing struct request (size defined by ULD depending on req type). However, we have a few places in the stack where we memcpy requests and assume them to be the same size. That needs to be fixed. That's also the roadblock I ran into wrt. 32-byte CDB allocation so for that I ended up allocating the command in sd. Also, another major headache of mine is WRITE SAME/UNMAP to DSM TRIM conversion. Because of the limitations of the TRIM command format a single WRITE SAME can turn into effectively hundreds of TRIM commands to be issued. I tried to limit this by using UNMAP translation instead. But we can still get into cases where we need to either loop or allocate a bunch of TRIMs in the translation layer. That leaves two options: Either pass really conservative limits up the stack and loop up there. Or deal with the allocation/translation stuff at the bottom of the pile. None of my attempts in these departments turned out to be very nice. I'm still dreaming of the day where libata moves out from under SCSI so we don't have to translate square pegs into round holes... -- Martin K. Petersen Oracle Linux Engineering ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-28 17:16 ` Martin K. Petersen @ 2010-06-29 8:00 ` Boaz Harrosh 0 siblings, 0 replies; 60+ messages in thread From: Boaz Harrosh @ 2010-06-29 8:00 UTC (permalink / raw) To: Martin K. Petersen Cc: James Bottomley, Mike Snitzer, Christoph Hellwig, axboe, dm-devel, linux-kernel, akpm, linux-scsi, FUJITA Tomonori On 06/28/2010 08:16 PM, Martin K. Petersen wrote: > I'm still dreaming of the day where libata moves out from under SCSI so > we don't have to translate square pegs into round holes... > Hi Martin. Is that on the Radar still? what would you say are the major issues holding it back? * Is it just the missing new ULD, Will we be using a new ULD? * Is it the ata midlayer that would duplicate some of scsi-midlayer. (Not so much these days, lots of good code went into blk_rq_) * Is it the distro's setup procedures changing yet again back to the hd vs sd times. Thanks Boaz ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [dm-devel] [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 15:29 ` James Bottomley 2010-06-28 17:16 ` Martin K. Petersen @ 2010-06-29 22:28 ` Mikulas Patocka 2010-06-29 23:03 ` James Bottomley 2010-06-30 8:32 ` Boaz Harrosh 2 siblings, 1 reply; 60+ messages in thread From: Mikulas Patocka @ 2010-06-29 22:28 UTC (permalink / raw) To: device-mapper development Cc: Mike Snitzer, axboe, linux-scsi, martin.petersen, linux-kernel, akpm, Christoph Hellwig On Sun, 27 Jun 2010, James Bottomley wrote: > linux-scsi cc added, since it's a SCSI patch. > > On Sat, 2010-06-26 at 15:56 -0400, Mike Snitzer wrote: > > Fix leaks introduced via "block: don't allocate a payload for discard > > request" commit a1d949f5f44. > > > > sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup > > discard request's payload directly in scsi_finish_command(). > > > > Also cleanup page allocated for discard payload in > > scsi_setup_discard_cmnd's scsi_setup_blk_pc_cmnd error path. > > > > Signed-off-by: Mike Snitzer <snitzer@redhat.com> > > --- > > block/blk-core.c | 23 +++++++++++++++++++++++ > > drivers/scsi/scsi.c | 8 ++++++++ > > drivers/scsi/sd.c | 18 ++++++++---------- > > include/linux/blkdev.h | 1 + > > 4 files changed, 40 insertions(+), 10 deletions(-) > > > > diff --git a/block/blk-core.c b/block/blk-core.c > > index 98b4cee..07925aa 100644 > > --- a/block/blk-core.c > > +++ b/block/blk-core.c > > @@ -1167,6 +1167,29 @@ void blk_add_request_payload(struct request *rq, struct page *page, > > } > > EXPORT_SYMBOL_GPL(blk_add_request_payload); > > > > +/** > > + * blk_clear_request_payload - clear a request's payload > > + * @rq: request to update > > + * > > + * The driver needs to take care of freeing the payload itself. > > + */ > > +void blk_clear_request_payload(struct request *rq) > > +{ > > + struct bio *bio = rq->bio; > > + > > + rq->__data_len = rq->resid_len = 0; > > + rq->nr_phys_segments = 0; > > + rq->buffer = NULL; > > + > > + bio->bi_size = 0; > > + bio->bi_vcnt = 0; > > + bio->bi_phys_segments = 0; > > + > > + bio->bi_io_vec->bv_page = NULL; > > + bio->bi_io_vec->bv_len = 0; > > +} > > +EXPORT_SYMBOL_GPL(blk_clear_request_payload); > > + > > void init_request_from_bio(struct request *req, struct bio *bio) > > { > > req->cpu = bio->bi_comp_cpu; > > diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c > > index ad0ed21..69c7ea4 100644 > > --- a/drivers/scsi/scsi.c > > +++ b/drivers/scsi/scsi.c > > @@ -851,6 +851,14 @@ void scsi_finish_command(struct scsi_cmnd *cmd) > > */ > > if (good_bytes == old_good_bytes) > > good_bytes -= scsi_get_resid(cmd); > > + } else if (cmd->request->cmd_flags & REQ_DISCARD) { > > + /* > > + * If this is a discard request that originated from the kernel > > + * we need to free our payload here. Note that we need to check > > + * the request flag as the normal payload rules apply for > > + * pass-through UNMAP / WRITE SAME requests. > > + */ > > + __free_page(bio_page(cmd->request->bio)); > > This is another layering violation: the page is allocated in the Upper > layer and freed in the mid-layer. > > I really hate these growing contortions for discard. They're a clear > signal that we haven't implemented it right. > > So let's first work out how it should be done. I really like Tomo's > idea of doing discard through the normal REQ_TYPE_FS route, which means > we can control the setup in prep and the tear down in done, all confined > to the ULD. > > Where I think I'm at is partially what Christoph says: The command > transformation belongs in the ULD so that's where the allocation and > deallocation should be, and partly what Tomo says in that we should > eliminate the special case paths. > > The payload vs actual request size should be a red herring if we've got > everything correct: only the ULD cares about the request parameters. > Once we've got everything set up, the mid layer and LLD should only care > about the parameters in the command, so we can confine the size changing > part to the ULD doing the discard. > > Could someone take a stab at this and see if it works without layering > violations or any other problematic signals? > > Thanks, > > James Well, I think that you overestimate the importance of scsi code too much. There is a layering violation in the code. So what --- you either fix the layering violation or let it be there and grind your teeth on it. But in either case, that layering violation won't affect anyone except scsi developers. On the other hand, if you say "because we want to avoid layering violation in SCSI, every issuer of discard request must supply an empty page", you create havoc all over the Linux codebase. md, dm, drbd, xvd, virtio --- whatever you think of, will be allocating a dummy page when constructing a discard request. If the layering violation spans only scsi code, it can be eventually fixed, but this, much worse "layering violation" that will be spanning all block device midlayers, won't ever be fixed. Imagine for example --- a discard request arrivers at a dm-snapshot device. The driver splits it into chunks, remaps each chunk to the physical chunk, submits the requests, the elevator merges adjacent requests and submits fewer bigger requests to the device. Now, if you had to allocate a zeroed page each time you are splitting the request, that would exhaust memory and burn cpu needlessly. You delete a 100MB file? --- fine, allocate a 100MB of zeroed pages. So I say --- let there be a layering violation in the scsi code, but don't put this problem with a page allocation to all the other bio midlayer developers. Mikulas ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [dm-devel] [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-29 22:28 ` [dm-devel] " Mikulas Patocka @ 2010-06-29 23:03 ` James Bottomley 2010-06-29 23:51 ` Mike Snitzer 2010-06-30 0:11 ` [dm-devel] " Mikulas Patocka 0 siblings, 2 replies; 60+ messages in thread From: James Bottomley @ 2010-06-29 23:03 UTC (permalink / raw) To: Mikulas Patocka Cc: device-mapper development, Mike Snitzer, axboe, linux-scsi, martin.petersen, linux-kernel, akpm, Christoph Hellwig On Tue, 2010-06-29 at 18:28 -0400, Mikulas Patocka wrote: > > On Sun, 27 Jun 2010, James Bottomley wrote: > > > linux-scsi cc added, since it's a SCSI patch. > > > > On Sat, 2010-06-26 at 15:56 -0400, Mike Snitzer wrote: > > > Fix leaks introduced via "block: don't allocate a payload for discard > > > request" commit a1d949f5f44. > > > > > > sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup > > > discard request's payload directly in scsi_finish_command(). > > > > > > Also cleanup page allocated for discard payload in > > > scsi_setup_discard_cmnd's scsi_setup_blk_pc_cmnd error path. > > > > > > Signed-off-by: Mike Snitzer <snitzer@redhat.com> > > > --- > > > block/blk-core.c | 23 +++++++++++++++++++++++ > > > drivers/scsi/scsi.c | 8 ++++++++ > > > drivers/scsi/sd.c | 18 ++++++++---------- > > > include/linux/blkdev.h | 1 + > > > 4 files changed, 40 insertions(+), 10 deletions(-) > > > > > > diff --git a/block/blk-core.c b/block/blk-core.c > > > index 98b4cee..07925aa 100644 > > > --- a/block/blk-core.c > > > +++ b/block/blk-core.c > > > @@ -1167,6 +1167,29 @@ void blk_add_request_payload(struct request *rq, struct page *page, > > > } > > > EXPORT_SYMBOL_GPL(blk_add_request_payload); > > > > > > +/** > > > + * blk_clear_request_payload - clear a request's payload > > > + * @rq: request to update > > > + * > > > + * The driver needs to take care of freeing the payload itself. > > > + */ > > > +void blk_clear_request_payload(struct request *rq) > > > +{ > > > + struct bio *bio = rq->bio; > > > + > > > + rq->__data_len = rq->resid_len = 0; > > > + rq->nr_phys_segments = 0; > > > + rq->buffer = NULL; > > > + > > > + bio->bi_size = 0; > > > + bio->bi_vcnt = 0; > > > + bio->bi_phys_segments = 0; > > > + > > > + bio->bi_io_vec->bv_page = NULL; > > > + bio->bi_io_vec->bv_len = 0; > > > +} > > > +EXPORT_SYMBOL_GPL(blk_clear_request_payload); > > > + > > > void init_request_from_bio(struct request *req, struct bio *bio) > > > { > > > req->cpu = bio->bi_comp_cpu; > > > diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c > > > index ad0ed21..69c7ea4 100644 > > > --- a/drivers/scsi/scsi.c > > > +++ b/drivers/scsi/scsi.c > > > @@ -851,6 +851,14 @@ void scsi_finish_command(struct scsi_cmnd *cmd) > > > */ > > > if (good_bytes == old_good_bytes) > > > good_bytes -= scsi_get_resid(cmd); > > > + } else if (cmd->request->cmd_flags & REQ_DISCARD) { > > > + /* > > > + * If this is a discard request that originated from the kernel > > > + * we need to free our payload here. Note that we need to check > > > + * the request flag as the normal payload rules apply for > > > + * pass-through UNMAP / WRITE SAME requests. > > > + */ > > > + __free_page(bio_page(cmd->request->bio)); > > > > This is another layering violation: the page is allocated in the Upper > > layer and freed in the mid-layer. > > > > I really hate these growing contortions for discard. They're a clear > > signal that we haven't implemented it right. > > > > So let's first work out how it should be done. I really like Tomo's > > idea of doing discard through the normal REQ_TYPE_FS route, which means > > we can control the setup in prep and the tear down in done, all confined > > to the ULD. > > > > Where I think I'm at is partially what Christoph says: The command > > transformation belongs in the ULD so that's where the allocation and > > deallocation should be, and partly what Tomo says in that we should > > eliminate the special case paths. > > > > The payload vs actual request size should be a red herring if we've got > > everything correct: only the ULD cares about the request parameters. > > Once we've got everything set up, the mid layer and LLD should only care > > about the parameters in the command, so we can confine the size changing > > part to the ULD doing the discard. > > > > Could someone take a stab at this and see if it works without layering > > violations or any other problematic signals? > > > > Thanks, > > > > James > > Well, I think that you overestimate the importance of scsi code too much. Not, I think, a deadly sin for a SCSI maintainer. > There is a layering violation in the code. So what --- you either fix the > layering violation or let it be there and grind your teeth on it. But in > either case, that layering violation won't affect anyone except scsi > developers. A layering violation is a signal of bad design wherever it occurs, so that wasn't a SCSI centric argument. > On the other hand, if you say "because we want to avoid layering violation > in SCSI, every issuer of discard request must supply an empty page", you > create havoc all over the Linux codebase. md, dm, drbd, xvd, virtio --- > whatever you think of, will be allocating a dummy page when constructing > a discard request. Since I didn't actually say any of that, I suggest you re-read text you quoted above. The phrase "The command transformation belongs in the ULD so that's where the allocation and deallocation should be" might be a relevant one to concentrate on. > If the layering violation spans only scsi code, it can be eventually > fixed, but this, much worse "layering violation" that will be spanning all > block device midlayers, won't ever be fixed. > > Imagine for example --- a discard request arrivers at a dm-snapshot > device. The driver splits it into chunks, remaps each chunk to the > physical chunk, submits the requests, the elevator merges adjacent > requests and submits fewer bigger requests to the device. Now, if you had > to allocate a zeroed page each time you are splitting the request, that > would exhaust memory and burn cpu needlessly. You delete a 100MB file? --- > fine, allocate a 100MB of zeroed pages. This is a straw man: You've tried to portray a position I've never taken as mine then attack it ... with what is effectively another bogus argument. It's not an either/or choice. I've asked the relevant parties to combine the approaches and see if a REQ_TYPE_FS path that does the allocations in the appropriate place, likely the ULD, produces a good design. > So I say --- let there be a layering violation in the scsi code, but don't > put this problem with a page allocation to all the other bio midlayer > developers. Thanks for explaining that you have nothing to contribute, I'll make sure you're not on my list of relevant parties. James ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-29 23:03 ` James Bottomley @ 2010-06-29 23:51 ` Mike Snitzer 2010-06-30 0:11 ` [dm-devel] " Mikulas Patocka 1 sibling, 0 replies; 60+ messages in thread From: Mike Snitzer @ 2010-06-29 23:51 UTC (permalink / raw) To: James Bottomley Cc: Mikulas Patocka, device-mapper development, axboe, linux-scsi, martin.petersen, linux-kernel, akpm, Christoph Hellwig On Tue, Jun 29 2010 at 7:03pm -0400, James Bottomley <James.Bottomley@suse.de> wrote: > On Tue, 2010-06-29 at 18:28 -0400, Mikulas Patocka wrote: > > > > On Sun, 27 Jun 2010, James Bottomley wrote: > > > > > linux-scsi cc added, since it's a SCSI patch. > > > > > > On Sat, 2010-06-26 at 15:56 -0400, Mike Snitzer wrote: > > > > Fix leaks introduced via "block: don't allocate a payload for discard > > > > request" commit a1d949f5f44. > > > > > > > > sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup > > > > discard request's payload directly in scsi_finish_command(). > > > > > > > > Also cleanup page allocated for discard payload in > > > > scsi_setup_discard_cmnd's scsi_setup_blk_pc_cmnd error path. > > > > > > > > Signed-off-by: Mike Snitzer <snitzer@redhat.com> > > > > --- > > > > block/blk-core.c | 23 +++++++++++++++++++++++ > > > > drivers/scsi/scsi.c | 8 ++++++++ > > > > drivers/scsi/sd.c | 18 ++++++++---------- > > > > include/linux/blkdev.h | 1 + > > > > 4 files changed, 40 insertions(+), 10 deletions(-) > > > > > > > > diff --git a/block/blk-core.c b/block/blk-core.c > > > > index 98b4cee..07925aa 100644 > > > > --- a/block/blk-core.c > > > > +++ b/block/blk-core.c > > > > @@ -1167,6 +1167,29 @@ void blk_add_request_payload(struct request *rq, struct page *page, > > > > } > > > > EXPORT_SYMBOL_GPL(blk_add_request_payload); > > > > > > > > +/** > > > > + * blk_clear_request_payload - clear a request's payload > > > > + * @rq: request to update > > > > + * > > > > + * The driver needs to take care of freeing the payload itself. > > > > + */ > > > > +void blk_clear_request_payload(struct request *rq) > > > > +{ > > > > + struct bio *bio = rq->bio; > > > > + > > > > + rq->__data_len = rq->resid_len = 0; > > > > + rq->nr_phys_segments = 0; > > > > + rq->buffer = NULL; > > > > + > > > > + bio->bi_size = 0; > > > > + bio->bi_vcnt = 0; > > > > + bio->bi_phys_segments = 0; > > > > + > > > > + bio->bi_io_vec->bv_page = NULL; > > > > + bio->bi_io_vec->bv_len = 0; > > > > +} > > > > +EXPORT_SYMBOL_GPL(blk_clear_request_payload); > > > > + > > > > void init_request_from_bio(struct request *req, struct bio *bio) > > > > { > > > > req->cpu = bio->bi_comp_cpu; > > > > diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c > > > > index ad0ed21..69c7ea4 100644 > > > > --- a/drivers/scsi/scsi.c > > > > +++ b/drivers/scsi/scsi.c > > > > @@ -851,6 +851,14 @@ void scsi_finish_command(struct scsi_cmnd *cmd) > > > > */ > > > > if (good_bytes == old_good_bytes) > > > > good_bytes -= scsi_get_resid(cmd); > > > > + } else if (cmd->request->cmd_flags & REQ_DISCARD) { > > > > + /* > > > > + * If this is a discard request that originated from the kernel > > > > + * we need to free our payload here. Note that we need to check > > > > + * the request flag as the normal payload rules apply for > > > > + * pass-through UNMAP / WRITE SAME requests. > > > > + */ > > > > + __free_page(bio_page(cmd->request->bio)); > > > > > > This is another layering violation: the page is allocated in the Upper > > > layer and freed in the mid-layer. > > > > > > I really hate these growing contortions for discard. They're a clear > > > signal that we haven't implemented it right. > > > > > > So let's first work out how it should be done. I really like Tomo's > > > idea of doing discard through the normal REQ_TYPE_FS route, which means > > > we can control the setup in prep and the tear down in done, all confined > > > to the ULD. > > > > > > Where I think I'm at is partially what Christoph says: The command > > > transformation belongs in the ULD so that's where the allocation and > > > deallocation should be, and partly what Tomo says in that we should > > > eliminate the special case paths. > > > > > > The payload vs actual request size should be a red herring if we've got > > > everything correct: only the ULD cares about the request parameters. > > > Once we've got everything set up, the mid layer and LLD should only care > > > about the parameters in the command, so we can confine the size changing > > > part to the ULD doing the discard. > > > > > > Could someone take a stab at this and see if it works without layering > > > violations or any other problematic signals? > > > > > > Thanks, > > > > > > James > > > > Well, I think that you overestimate the importance of scsi code too much. > > Not, I think, a deadly sin for a SCSI maintainer. Indeed ;) > > There is a layering violation in the code. So what --- you either fix the > > layering violation or let it be there and grind your teeth on it. But in > > either case, that layering violation won't affect anyone except scsi > > developers. > > A layering violation is a signal of bad design wherever it occurs, so > that wasn't a SCSI centric argument. > > > On the other hand, if you say "because we want to avoid layering violation > > in SCSI, every issuer of discard request must supply an empty page", you > > create havoc all over the Linux codebase. md, dm, drbd, xvd, virtio --- > > whatever you think of, will be allocating a dummy page when constructing > > a discard request. > > Since I didn't actually say any of that, I suggest you re-read text you > quoted above. The phrase "The command transformation belongs in the ULD > so that's where the allocation and deallocation should be" might be a > relevant one to concentrate on. Right, freeing the page, that was allocated in SCSI's ULD, from the SCSI midlayer is a SCSI layering violation. I think Mikulas was reacting to the desire to maintain the existing, arguably more problematic, layering violation that spans the block and SCSI layers. > > If the layering violation spans only scsi code, it can be eventually > > fixed, but this, much worse "layering violation" that will be spanning all > > block device midlayers, won't ever be fixed. > > > > Imagine for example --- a discard request arrivers at a dm-snapshot > > device. The driver splits it into chunks, remaps each chunk to the > > physical chunk, submits the requests, the elevator merges adjacent > > requests and submits fewer bigger requests to the device. Now, if you had > > to allocate a zeroed page each time you are splitting the request, that > > would exhaust memory and burn cpu needlessly. You delete a 100MB file? --- > > fine, allocate a 100MB of zeroed pages. > > This is a straw man: You've tried to portray a position I've never > taken as mine then attack it ... with what is effectively another bogus > argument. > > It's not an either/or choice. I've asked the relevant parties to > combine the approaches and see if a REQ_TYPE_FS path that does the > allocations in the appropriate place, likely the ULD, produces a good > design. If in the end we can fix up SCSI properly then everyone is happy. So lets just keep working toward that. The various attempts to convert discard over to REQ_TYPE_FS have fallen short. Hopefully we'll have a break through shortly. Thanks for your guidance James, Mike ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload @ 2010-06-29 23:51 ` Mike Snitzer 0 siblings, 0 replies; 60+ messages in thread From: Mike Snitzer @ 2010-06-29 23:51 UTC (permalink / raw) To: James Bottomley Cc: axboe, linux-scsi, martin.petersen, linux-kernel, device-mapper development, Mikulas Patocka, akpm, Christoph Hellwig On Tue, Jun 29 2010 at 7:03pm -0400, James Bottomley <James.Bottomley@suse.de> wrote: > On Tue, 2010-06-29 at 18:28 -0400, Mikulas Patocka wrote: > > > > On Sun, 27 Jun 2010, James Bottomley wrote: > > > > > linux-scsi cc added, since it's a SCSI patch. > > > > > > On Sat, 2010-06-26 at 15:56 -0400, Mike Snitzer wrote: > > > > Fix leaks introduced via "block: don't allocate a payload for discard > > > > request" commit a1d949f5f44. > > > > > > > > sd_done() is not called for REQ_TYPE_BLOCK_PC commands so cleanup > > > > discard request's payload directly in scsi_finish_command(). > > > > > > > > Also cleanup page allocated for discard payload in > > > > scsi_setup_discard_cmnd's scsi_setup_blk_pc_cmnd error path. > > > > > > > > Signed-off-by: Mike Snitzer <snitzer@redhat.com> > > > > --- > > > > block/blk-core.c | 23 +++++++++++++++++++++++ > > > > drivers/scsi/scsi.c | 8 ++++++++ > > > > drivers/scsi/sd.c | 18 ++++++++---------- > > > > include/linux/blkdev.h | 1 + > > > > 4 files changed, 40 insertions(+), 10 deletions(-) > > > > > > > > diff --git a/block/blk-core.c b/block/blk-core.c > > > > index 98b4cee..07925aa 100644 > > > > --- a/block/blk-core.c > > > > +++ b/block/blk-core.c > > > > @@ -1167,6 +1167,29 @@ void blk_add_request_payload(struct request *rq, struct page *page, > > > > } > > > > EXPORT_SYMBOL_GPL(blk_add_request_payload); > > > > > > > > +/** > > > > + * blk_clear_request_payload - clear a request's payload > > > > + * @rq: request to update > > > > + * > > > > + * The driver needs to take care of freeing the payload itself. > > > > + */ > > > > +void blk_clear_request_payload(struct request *rq) > > > > +{ > > > > + struct bio *bio = rq->bio; > > > > + > > > > + rq->__data_len = rq->resid_len = 0; > > > > + rq->nr_phys_segments = 0; > > > > + rq->buffer = NULL; > > > > + > > > > + bio->bi_size = 0; > > > > + bio->bi_vcnt = 0; > > > > + bio->bi_phys_segments = 0; > > > > + > > > > + bio->bi_io_vec->bv_page = NULL; > > > > + bio->bi_io_vec->bv_len = 0; > > > > +} > > > > +EXPORT_SYMBOL_GPL(blk_clear_request_payload); > > > > + > > > > void init_request_from_bio(struct request *req, struct bio *bio) > > > > { > > > > req->cpu = bio->bi_comp_cpu; > > > > diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c > > > > index ad0ed21..69c7ea4 100644 > > > > --- a/drivers/scsi/scsi.c > > > > +++ b/drivers/scsi/scsi.c > > > > @@ -851,6 +851,14 @@ void scsi_finish_command(struct scsi_cmnd *cmd) > > > > */ > > > > if (good_bytes == old_good_bytes) > > > > good_bytes -= scsi_get_resid(cmd); > > > > + } else if (cmd->request->cmd_flags & REQ_DISCARD) { > > > > + /* > > > > + * If this is a discard request that originated from the kernel > > > > + * we need to free our payload here. Note that we need to check > > > > + * the request flag as the normal payload rules apply for > > > > + * pass-through UNMAP / WRITE SAME requests. > > > > + */ > > > > + __free_page(bio_page(cmd->request->bio)); > > > > > > This is another layering violation: the page is allocated in the Upper > > > layer and freed in the mid-layer. > > > > > > I really hate these growing contortions for discard. They're a clear > > > signal that we haven't implemented it right. > > > > > > So let's first work out how it should be done. I really like Tomo's > > > idea of doing discard through the normal REQ_TYPE_FS route, which means > > > we can control the setup in prep and the tear down in done, all confined > > > to the ULD. > > > > > > Where I think I'm at is partially what Christoph says: The command > > > transformation belongs in the ULD so that's where the allocation and > > > deallocation should be, and partly what Tomo says in that we should > > > eliminate the special case paths. > > > > > > The payload vs actual request size should be a red herring if we've got > > > everything correct: only the ULD cares about the request parameters. > > > Once we've got everything set up, the mid layer and LLD should only care > > > about the parameters in the command, so we can confine the size changing > > > part to the ULD doing the discard. > > > > > > Could someone take a stab at this and see if it works without layering > > > violations or any other problematic signals? > > > > > > Thanks, > > > > > > James > > > > Well, I think that you overestimate the importance of scsi code too much. > > Not, I think, a deadly sin for a SCSI maintainer. Indeed ;) > > There is a layering violation in the code. So what --- you either fix the > > layering violation or let it be there and grind your teeth on it. But in > > either case, that layering violation won't affect anyone except scsi > > developers. > > A layering violation is a signal of bad design wherever it occurs, so > that wasn't a SCSI centric argument. > > > On the other hand, if you say "because we want to avoid layering violation > > in SCSI, every issuer of discard request must supply an empty page", you > > create havoc all over the Linux codebase. md, dm, drbd, xvd, virtio --- > > whatever you think of, will be allocating a dummy page when constructing > > a discard request. > > Since I didn't actually say any of that, I suggest you re-read text you > quoted above. The phrase "The command transformation belongs in the ULD > so that's where the allocation and deallocation should be" might be a > relevant one to concentrate on. Right, freeing the page, that was allocated in SCSI's ULD, from the SCSI midlayer is a SCSI layering violation. I think Mikulas was reacting to the desire to maintain the existing, arguably more problematic, layering violation that spans the block and SCSI layers. > > If the layering violation spans only scsi code, it can be eventually > > fixed, but this, much worse "layering violation" that will be spanning all > > block device midlayers, won't ever be fixed. > > > > Imagine for example --- a discard request arrivers at a dm-snapshot > > device. The driver splits it into chunks, remaps each chunk to the > > physical chunk, submits the requests, the elevator merges adjacent > > requests and submits fewer bigger requests to the device. Now, if you had > > to allocate a zeroed page each time you are splitting the request, that > > would exhaust memory and burn cpu needlessly. You delete a 100MB file? --- > > fine, allocate a 100MB of zeroed pages. > > This is a straw man: You've tried to portray a position I've never > taken as mine then attack it ... with what is effectively another bogus > argument. > > It's not an either/or choice. I've asked the relevant parties to > combine the approaches and see if a REQ_TYPE_FS path that does the > allocations in the appropriate place, likely the ULD, produces a good > design. If in the end we can fix up SCSI properly then everyone is happy. So lets just keep working toward that. The various attempts to convert discard over to REQ_TYPE_FS have fallen short. Hopefully we'll have a break through shortly. Thanks for your guidance James, Mike ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [dm-devel] [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-29 23:03 ` James Bottomley 2010-06-29 23:51 ` Mike Snitzer @ 2010-06-30 0:11 ` Mikulas Patocka 2010-06-30 14:22 ` James Bottomley 1 sibling, 1 reply; 60+ messages in thread From: Mikulas Patocka @ 2010-06-30 0:11 UTC (permalink / raw) To: James Bottomley Cc: device-mapper development, Mike Snitzer, axboe, linux-scsi, martin.petersen, linux-kernel, akpm, Christoph Hellwig > > If the layering violation spans only scsi code, it can be eventually > > fixed, but this, much worse "layering violation" that will be spanning all > > block device midlayers, won't ever be fixed. > > > > Imagine for example --- a discard request arrivers at a dm-snapshot > > device. The driver splits it into chunks, remaps each chunk to the > > physical chunk, submits the requests, the elevator merges adjacent > > requests and submits fewer bigger requests to the device. Now, if you had > > to allocate a zeroed page each time you are splitting the request, that > > would exhaust memory and burn cpu needlessly. You delete a 100MB file? --- > > fine, allocate a 100MB of zeroed pages. > > This is a straw man: You've tried to portray a position I've never > taken as mine then attack it ... with what is effectively another bogus > argument. > > It's not an either/or choice. It is either/or choice. If the interface isn't fixed NOW, the existing flawed zeroed-page-allocation interface gets into RHEL and I and others will have to support it for 7 years. > I've asked the relevant parties to > combine the approaches and see if a REQ_TYPE_FS path that does the > allocations in the appropriate place, likely the ULD, produces a good > design. OK, but before you do this research, fix the interface. > > So I say --- let there be a layering violation in the scsi code, but don't > > put this problem with a page allocation to all the other bio midlayer > > developers. > > Thanks for explaining that you have nothing to contribute, I'll make > sure you're not on my list of relevant parties. You misunderstand what I meant. You admit that there are design problems in SCSI. So don't burden other developers with these problems. Don't force the others to allocate dummy pages just because you want a cleaner scsi code. You intend to fix the design of SCSI and then remove the dummy pages. But by the time you finish it, it will be already late and there will be midlayer drivers allocating these dummy pages. What I mean is that "layering violation" inside one driver is smaller problem than misdesigned interface between drivers. So accept the patch that creates "layering violation" but cleans up the interface. Mikulas ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [dm-devel] [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-30 0:11 ` [dm-devel] " Mikulas Patocka @ 2010-06-30 14:22 ` James Bottomley 2010-06-30 15:36 ` Mike Snitzer 2010-07-01 12:28 ` [dm-devel] " Mikulas Patocka 0 siblings, 2 replies; 60+ messages in thread From: James Bottomley @ 2010-06-30 14:22 UTC (permalink / raw) To: Mikulas Patocka Cc: device-mapper development, Mike Snitzer, axboe, linux-scsi, martin.petersen, linux-kernel, akpm, Christoph Hellwig On Tue, 2010-06-29 at 20:11 -0400, Mikulas Patocka wrote: > > > If the layering violation spans only scsi code, it can be eventually > > > fixed, but this, much worse "layering violation" that will be spanning all > > > block device midlayers, won't ever be fixed. > > > > > > Imagine for example --- a discard request arrivers at a dm-snapshot > > > device. The driver splits it into chunks, remaps each chunk to the > > > physical chunk, submits the requests, the elevator merges adjacent > > > requests and submits fewer bigger requests to the device. Now, if you had > > > to allocate a zeroed page each time you are splitting the request, that > > > would exhaust memory and burn cpu needlessly. You delete a 100MB file? --- > > > fine, allocate a 100MB of zeroed pages. > > > > This is a straw man: You've tried to portray a position I've never > > taken as mine then attack it ... with what is effectively another bogus > > argument. > > > > It's not an either/or choice. > > It is either/or choice. If the interface isn't fixed NOW, the existing > flawed zeroed-page-allocation interface gets into RHEL That's a false dichotomy. You might see an either apply this hack now or support the interface choice with RHEL, but upstream has the option to fix stuff correctly. RHEL has never needed my blessing to apply random crap to their kernel before ... why is this patch any different? > and I and others will have to support it for 7 years. It's called a business model ... I believe it's what they pay you for. > > I've asked the relevant parties to > > combine the approaches and see if a REQ_TYPE_FS path that does the > > allocations in the appropriate place, likely the ULD, produces a good > > design. > > OK, but before you do this research, fix the interface. So even in the RHEL world, I think you'd find that analysing the problem *before* comping up with a fix is a good way of doing things. > > > So I say --- let there be a layering violation in the scsi code, but don't > > > put this problem with a page allocation to all the other bio midlayer > > > developers. > > > > Thanks for explaining that you have nothing to contribute, I'll make > > sure you're not on my list of relevant parties. > > You misunderstand what I meant. You admit that there are design problems > in SCSI. No I didn't. And the rest of this rubbish is based on that false premise. It might help you to take off your SCSI antipathy and see this as a system problem: it actually originates in block and spills out from there. Thus it requires a system solution. James > So don't burden other developers with these problems. Don't force > the others to allocate dummy pages just because you want a cleaner scsi > code. > > You intend to fix the design of SCSI and then remove the dummy pages. But > by the time you finish it, it will be already late and there will be > midlayer drivers allocating these dummy pages. > > What I mean is that "layering violation" inside one driver is smaller > problem than misdesigned interface between drivers. So accept the patch > that creates "layering violation" but cleans up the interface. > > Mikulas ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-30 14:22 ` James Bottomley @ 2010-06-30 15:36 ` Mike Snitzer 2010-06-30 16:26 ` James Bottomley 2010-07-01 12:28 ` [dm-devel] " Mikulas Patocka 1 sibling, 1 reply; 60+ messages in thread From: Mike Snitzer @ 2010-06-30 15:36 UTC (permalink / raw) To: James Bottomley Cc: Mikulas Patocka, device-mapper development, axboe, linux-scsi, martin.petersen, linux-kernel, akpm, Christoph Hellwig On Wed, Jun 30 2010 at 10:22am -0400, James Bottomley <James.Bottomley@suse.de> wrote: > On Tue, 2010-06-29 at 20:11 -0400, Mikulas Patocka wrote: > > > > If the layering violation spans only scsi code, it can be eventually > > > > fixed, but this, much worse "layering violation" that will be spanning all > > > > block device midlayers, won't ever be fixed. > > > > > > > > Imagine for example --- a discard request arrivers at a dm-snapshot > > > > device. The driver splits it into chunks, remaps each chunk to the > > > > physical chunk, submits the requests, the elevator merges adjacent > > > > requests and submits fewer bigger requests to the device. Now, if you had > > > > to allocate a zeroed page each time you are splitting the request, that > > > > would exhaust memory and burn cpu needlessly. You delete a 100MB file? --- > > > > fine, allocate a 100MB of zeroed pages. > > > > > > This is a straw man: You've tried to portray a position I've never > > > taken as mine then attack it ... with what is effectively another bogus > > > argument. > > > > > > It's not an either/or choice. > > > > It is either/or choice. If the interface isn't fixed NOW, the existing > > flawed zeroed-page-allocation interface gets into RHEL > > That's a false dichotomy. You might see an either apply this hack now > or support the interface choice with RHEL, but upstream has the option > to fix stuff correctly. RHEL has never needed my blessing to apply > random crap to their kernel before ... why is this patch any different? > > > and I and others will have to support it for 7 years. > > It's called a business model ... I believe it's what they pay you for. > > > > I've asked the relevant parties to > > > combine the approaches and see if a REQ_TYPE_FS path that does the > > > allocations in the appropriate place, likely the ULD, produces a good > > > design. > > > > OK, but before you do this research, fix the interface. > > So even in the RHEL world, I think you'd find that analysing the problem > *before* comping up with a fix is a good way of doing things. > > > > > So I say --- let there be a layering violation in the scsi code, but don't > > > > put this problem with a page allocation to all the other bio midlayer > > > > developers. > > > > > > Thanks for explaining that you have nothing to contribute, I'll make > > > sure you're not on my list of relevant parties. > > > > You misunderstand what I meant. You admit that there are design problems > > in SCSI. > > No I didn't. > > And the rest of this rubbish is based on that false premise. It might > help you to take off your SCSI antipathy and see this as a system > problem: it actually originates in block and spills out from there. > Thus it requires a system solution. As fun as it is for the others monitoring these lists to see redhat.com vs suse.de banter I think framing this discussion like you (and Mikulas) continue to do is a complete distraction. I tried to elevate (and defuse) the discussion yesterday. But simply put: patches speak volumes. I look forward to working with Tomo, hch and anyone else who has something to contribute that moves us toward a real fix for discards. Mike ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-30 15:36 ` Mike Snitzer @ 2010-06-30 16:26 ` James Bottomley 0 siblings, 0 replies; 60+ messages in thread From: James Bottomley @ 2010-06-30 16:26 UTC (permalink / raw) To: Mike Snitzer Cc: Mikulas Patocka, device-mapper development, axboe, linux-scsi, martin.petersen, linux-kernel, akpm, Christoph Hellwig On Wed, 2010-06-30 at 11:36 -0400, Mike Snitzer wrote: > On Wed, Jun 30 2010 at 10:22am -0400, > James Bottomley <James.Bottomley@suse.de> wrote: > > > On Tue, 2010-06-29 at 20:11 -0400, Mikulas Patocka wrote: > > > > > If the layering violation spans only scsi code, it can be eventually > > > > > fixed, but this, much worse "layering violation" that will be spanning all > > > > > block device midlayers, won't ever be fixed. > > > > > > > > > > Imagine for example --- a discard request arrivers at a dm-snapshot > > > > > device. The driver splits it into chunks, remaps each chunk to the > > > > > physical chunk, submits the requests, the elevator merges adjacent > > > > > requests and submits fewer bigger requests to the device. Now, if you had > > > > > to allocate a zeroed page each time you are splitting the request, that > > > > > would exhaust memory and burn cpu needlessly. You delete a 100MB file? --- > > > > > fine, allocate a 100MB of zeroed pages. > > > > > > > > This is a straw man: You've tried to portray a position I've never > > > > taken as mine then attack it ... with what is effectively another bogus > > > > argument. > > > > > > > > It's not an either/or choice. > > > > > > It is either/or choice. If the interface isn't fixed NOW, the existing > > > flawed zeroed-page-allocation interface gets into RHEL > > > > That's a false dichotomy. You might see an either apply this hack now > > or support the interface choice with RHEL, but upstream has the option > > to fix stuff correctly. RHEL has never needed my blessing to apply > > random crap to their kernel before ... why is this patch any different? > > > > > and I and others will have to support it for 7 years. > > > > It's called a business model ... I believe it's what they pay you for. > > > > > > I've asked the relevant parties to > > > > combine the approaches and see if a REQ_TYPE_FS path that does the > > > > allocations in the appropriate place, likely the ULD, produces a good > > > > design. > > > > > > OK, but before you do this research, fix the interface. > > > > So even in the RHEL world, I think you'd find that analysing the problem > > *before* comping up with a fix is a good way of doing things. > > > > > > > So I say --- let there be a layering violation in the scsi code, but don't > > > > > put this problem with a page allocation to all the other bio midlayer > > > > > developers. > > > > > > > > Thanks for explaining that you have nothing to contribute, I'll make > > > > sure you're not on my list of relevant parties. > > > > > > You misunderstand what I meant. You admit that there are design problems > > > in SCSI. > > > > No I didn't. > > > > And the rest of this rubbish is based on that false premise. It might > > help you to take off your SCSI antipathy and see this as a system > > problem: it actually originates in block and spills out from there. > > Thus it requires a system solution. > > As fun as it is for the others monitoring these lists to see redhat.com > vs suse.de banter I think framing this discussion like you (and Mikulas) > continue to do is a complete distraction. Well, it's not SUSE v Red Hat, it's upstream v Enterprise ... and it's partly my job to explain why upstream does correct fixes not enterprise workarounds (whether the enterprise is RHEL or SLES). But I agree it's becoming a pointless distraction. > I tried to elevate (and defuse) the discussion yesterday. But simply > put: patches speak volumes. I look forward to working with Tomo, hch > and anyone else who has something to contribute that moves us toward a > real fix for discards. Right, so thanks for that. Most of our problem is tied up in the fact that we need to allocate in the prepare path, but we don't have a corresponding clear unprepare path to do the deallocation in. Introducing that into block might sort out this tangle better ... error handling on the backend is very convoluted and we can't really free the page until after it's complete (and the ->done function doesn't mark completion of error handling terms). I think the bones of a solution to this might be that scsi_unprep_request() needs to call into block (instead of setting the flags itself), say blk_unprep_request. Block also needs to call blk_unprep_request based on the REQ_DONTPREP status in its completion path. This would then give us a hook to hang the deallocation correctly on. James ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [dm-devel] [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-30 14:22 ` James Bottomley 2010-06-30 15:36 ` Mike Snitzer @ 2010-07-01 12:28 ` Mikulas Patocka 2010-07-01 12:46 ` Mike Snitzer 2010-07-01 12:49 ` [dm-devel] " Alasdair G Kergon 1 sibling, 2 replies; 60+ messages in thread From: Mikulas Patocka @ 2010-07-01 12:28 UTC (permalink / raw) To: James Bottomley Cc: device-mapper development, Mike Snitzer, axboe, linux-scsi, martin.petersen, linux-kernel, akpm, Christoph Hellwig > > It is either/or choice. If the interface isn't fixed NOW, the existing > > flawed zeroed-page-allocation interface gets into RHEL > > That's a false dichotomy. You might see an either apply this hack now > or support the interface choice with RHEL, but upstream has the option > to fix stuff correctly. RHEL has never needed my blessing to apply > random crap to their kernel before ... why is this patch any different? We can't apply non-upstream patches (except few exceptions such as dm-raid45). It makes sense, non-upstream patches have smaller test coverage. > And the rest of this rubbish is based on that false premise. It might > help you to take off your SCSI antipathy and see this as a system > problem: it actually originates in block and spills out from there. > Thus it requires a system solution. > > James Imagine this: I take a FPGA PCI board, I design a storage controller on it and this controller will need 3 pages to process a discard request. Now I say: I refuse to allocate these 3 pages in the driver because the driver would look ugly --- instead, I demand that everyone in the Linux kernel who creates a discard request must attach 3 pages to the request for my driver. Do you think it is correct behavior? Would you accept such a driver? I guess you wouldn't! But this is the same thing that you are doing with SCSI. Now lets take it a bit further and I say "I may clean up the driver for my controller one day, when I do it, I remove that 3-page requirement --- and then, everyone who allocated those pages will have to change his code and remove the allocations". And this is what you are intending to do with SCSI. Mikulas ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-07-01 12:28 ` [dm-devel] " Mikulas Patocka @ 2010-07-01 12:46 ` Mike Snitzer 2010-07-01 14:03 ` Mikulas Patocka 2010-07-01 12:49 ` [dm-devel] " Alasdair G Kergon 1 sibling, 1 reply; 60+ messages in thread From: Mike Snitzer @ 2010-07-01 12:46 UTC (permalink / raw) To: Mikulas Patocka Cc: James Bottomley, device-mapper development, axboe, linux-scsi, martin.petersen, linux-kernel, akpm, Christoph Hellwig On Thu, Jul 01 2010 at 8:28am -0400, Mikulas Patocka <mpatocka@redhat.com> wrote: > > > It is either/or choice. If the interface isn't fixed NOW, the existing > > > flawed zeroed-page-allocation interface gets into RHEL > > > > That's a false dichotomy. You might see an either apply this hack now > > or support the interface choice with RHEL, but upstream has the option > > to fix stuff correctly. RHEL has never needed my blessing to apply > > random crap to their kernel before ... why is this patch any different? > > We can't apply non-upstream patches (except few exceptions such as > dm-raid45). It makes sense, non-upstream patches have smaller test > coverage. > > > And the rest of this rubbish is based on that false premise. It might > > help you to take off your SCSI antipathy and see this as a system > > problem: it actually originates in block and spills out from there. > > Thus it requires a system solution. > > > > James > > Imagine this: I take a FPGA PCI board, I design a storage controller on it > and this controller will need 3 pages to process a discard request. Now I > say: I refuse to allocate these 3 pages in the driver because the driver > would look ugly --- instead, I demand that everyone in the Linux kernel > who creates a discard request must attach 3 pages to the request for my > driver. > > Do you think it is correct behavior? Would you accept such a driver? I > guess you wouldn't! But this is the same thing that you are doing with > SCSI. > > Now lets take it a bit further and I say "I may clean up the driver for my > controller one day, when I do it, I remove that 3-page requirement --- and > then, everyone who allocated those pages will have to change his code and > remove the allocations". > > And this is what you are intending to do with SCSI. Mikulas, Jens has already queued up a comprehensive fix (3 patches) that James and Tomo developed. Please stop the hostility.. it has no place. Others, I'd encourage you to not respond to this thread further ;) Regards, Mike ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-07-01 12:46 ` Mike Snitzer @ 2010-07-01 14:03 ` Mikulas Patocka 0 siblings, 0 replies; 60+ messages in thread From: Mikulas Patocka @ 2010-07-01 14:03 UTC (permalink / raw) To: Mike Snitzer Cc: James Bottomley, device-mapper development, axboe, linux-scsi, martin.petersen, linux-kernel, akpm, Christoph Hellwig > Mikulas, > > Jens has already queued up a comprehensive fix (3 patches) that James > and Tomo developed. Please stop the hostility.. it has no place. > > Others, > I'd encourage you to not respond to this thread further ;) > > Regards, > Mike I read mailing lists at longer intervals than my mailbox, so I read the James' message I responded to but missed the patches on the list. It's good that it's fixed and there is no need to argue about it anymore. Mikulas ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload @ 2010-07-01 14:03 ` Mikulas Patocka 0 siblings, 0 replies; 60+ messages in thread From: Mikulas Patocka @ 2010-07-01 14:03 UTC (permalink / raw) To: Mike Snitzer Cc: axboe, linux-scsi, martin.petersen, linux-kernel, device-mapper development, James Bottomley, akpm, Christoph Hellwig > Mikulas, > > Jens has already queued up a comprehensive fix (3 patches) that James > and Tomo developed. Please stop the hostility.. it has no place. > > Others, > I'd encourage you to not respond to this thread further ;) > > Regards, > Mike I read mailing lists at longer intervals than my mailbox, so I read the James' message I responded to but missed the patches on the list. It's good that it's fixed and there is no need to argue about it anymore. Mikulas ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [dm-devel] [PATCH 1/2] block: fix leaks associated with discard request payload 2010-07-01 12:28 ` [dm-devel] " Mikulas Patocka 2010-07-01 12:46 ` Mike Snitzer @ 2010-07-01 12:49 ` Alasdair G Kergon 1 sibling, 0 replies; 60+ messages in thread From: Alasdair G Kergon @ 2010-07-01 12:49 UTC (permalink / raw) To: Mikulas Patocka Cc: James Bottomley, axboe, Mike Snitzer, linux-scsi, linux-kernel, device-mapper development, martin.petersen, akpm, Christoph Hellwig On Thu, Jul 01, 2010 at 08:28:02AM -0400, Mikulas Patocka wrote: > But this is the same thing that you are doing with SCSI. Have you read these patches, Mikulas? Stop digging. Alasdair ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-27 15:29 ` James Bottomley 2010-06-28 17:16 ` Martin K. Petersen 2010-06-29 22:28 ` [dm-devel] " Mikulas Patocka @ 2010-06-30 8:32 ` Boaz Harrosh 2010-06-30 8:42 ` Christoph Hellwig 2 siblings, 1 reply; 60+ messages in thread From: Boaz Harrosh @ 2010-06-30 8:32 UTC (permalink / raw) To: James Bottomley Cc: Mike Snitzer, Christoph Hellwig, axboe, dm-devel, linux-kernel, martin.petersen, akpm, linux-scsi, FUJITA Tomonori On 06/27/2010 06:29 PM, James Bottomley wrote: <snip> >> + /* >> + * If this is a discard request that originated from the kernel >> + * we need to free our payload here. Note that we need to check >> + * the request flag as the normal payload rules apply for >> + * pass-through UNMAP / WRITE SAME requests. >> + */ >> + __free_page(bio_page(cmd->request->bio)); > > This is another layering violation: the page is allocated in the Upper > layer and freed in the mid-layer. > May I ask a silly question? Why the dynamic allocation? Why not have a const-static single global page at the block-layer somewhere that will be used for all discard-type operations and be done with it once and for all. A single page can be used for any size bio , any number of concurrent discards, any ZERO needed operation. It can also be used by other operations like padding and others. In fact isn't there one for the libsata padding? (It could be dynamical allocated on first use for embedded system) just my $0.017 Boaz ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-30 8:32 ` Boaz Harrosh @ 2010-06-30 8:42 ` Christoph Hellwig 2010-06-30 10:25 ` Boaz Harrosh 0 siblings, 1 reply; 60+ messages in thread From: Christoph Hellwig @ 2010-06-30 8:42 UTC (permalink / raw) To: Boaz Harrosh Cc: James Bottomley, Mike Snitzer, Christoph Hellwig, axboe, dm-devel, linux-kernel, martin.petersen, akpm, linux-scsi, FUJITA Tomonori On Wed, Jun 30, 2010 at 11:32:43AM +0300, Boaz Harrosh wrote: > May I ask a silly question? Why the dynamic allocation? > > Why not have a const-static single global page at the block-layer somewhere > that will be used for all discard-type operations and be done with it once and > for all. A single page can be used for any size bio , any number of concurrent > discards, any ZERO needed operation. It can also be used by other operations > like padding and others. In fact isn't there one for the libsata padding? for UNMAP we need to write into the payload. And for ATA TRIM we need to write into the WRITE SAME payload. That's another layering violation for those looking for them, btw.. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-30 8:42 ` Christoph Hellwig @ 2010-06-30 10:25 ` Boaz Harrosh 2010-06-30 10:41 ` Christoph Hellwig 0 siblings, 1 reply; 60+ messages in thread From: Boaz Harrosh @ 2010-06-30 10:25 UTC (permalink / raw) To: Christoph Hellwig Cc: James Bottomley, Mike Snitzer, axboe, dm-devel, linux-kernel, martin.petersen, akpm, linux-scsi, FUJITA Tomonori On 06/30/2010 11:42 AM, Christoph Hellwig wrote: > On Wed, Jun 30, 2010 at 11:32:43AM +0300, Boaz Harrosh wrote: >> May I ask a silly question? Why the dynamic allocation? >> >> Why not have a const-static single global page at the block-layer somewhere >> that will be used for all discard-type operations and be done with it once and >> for all. A single page can be used for any size bio , any number of concurrent >> discards, any ZERO needed operation. It can also be used by other operations >> like padding and others. In fact isn't there one for the libsata padding? > > for UNMAP we need to write into the payload. And for ATA TRIM we need > to write into the WRITE SAME payload. OK, Thanks, I see. Is it one of these operations, (like we have in OSD) where the CDB information spills into the payload? like the scatter-gather and extent lists and such. Do we actually use a WRITE_SAME which is not zero? for what use? > That's another layering violation > for those looking for them, btw.. > Agreed ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-30 10:25 ` Boaz Harrosh @ 2010-06-30 10:41 ` Christoph Hellwig 2010-06-30 10:57 ` Boaz Harrosh 0 siblings, 1 reply; 60+ messages in thread From: Christoph Hellwig @ 2010-06-30 10:41 UTC (permalink / raw) To: Boaz Harrosh Cc: Christoph Hellwig, James Bottomley, Mike Snitzer, axboe, dm-devel, linux-kernel, martin.petersen, akpm, linux-scsi, FUJITA Tomonori On Wed, Jun 30, 2010 at 01:25:01PM +0300, Boaz Harrosh wrote: > OK, Thanks, I see. Is it one of these operations, (like we have in OSD) where > the CDB information spills into the payload? like the scatter-gather and extent > lists and such. For UNMAP the payload is a list of block number / length pairs, while the CDB itself doesn't contain any information like that. It's a rather awkward command. > Do we actually use a WRITE_SAME which is not zero? for what use? The kernel doesn't issue any WRITE SAME without the unmap bit set. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-30 10:41 ` Christoph Hellwig @ 2010-06-30 10:57 ` Boaz Harrosh 2010-06-30 12:18 ` Mike Snitzer 0 siblings, 1 reply; 60+ messages in thread From: Boaz Harrosh @ 2010-06-30 10:57 UTC (permalink / raw) To: Christoph Hellwig Cc: James Bottomley, Mike Snitzer, axboe, dm-devel, linux-kernel, martin.petersen, akpm, linux-scsi, FUJITA Tomonori On 06/30/2010 01:41 PM, Christoph Hellwig wrote: > On Wed, Jun 30, 2010 at 01:25:01PM +0300, Boaz Harrosh wrote: >> OK, Thanks, I see. Is it one of these operations, (like we have in OSD) where >> the CDB information spills into the payload? like the scatter-gather and extent >> lists and such. > > For UNMAP the payload is a list of block number / length pairs, while > the CDB itself doesn't contain any information like that. It's a rather > awkward command. > How big can that be? could we, maybe, use the sense_buffer, properly allocated already? >> Do we actually use a WRITE_SAME which is not zero? for what use? > > The kernel doesn't issue any WRITE SAME without the unmap bit set. So if the unmap bit is set then the page can just be zero, right? I still think a static zero-page is a worth while optimization. And block-drivers can take care with special needs with a private mem_pool or something. For the discard-type user and generic block layer the page is just an implementation specific residue, No? But don't mind me, I'm just babbling. Not that I'll do anything about it. Boaz ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 1/2] block: fix leaks associated with discard request payload 2010-06-30 10:57 ` Boaz Harrosh @ 2010-06-30 12:18 ` Mike Snitzer 0 siblings, 0 replies; 60+ messages in thread From: Mike Snitzer @ 2010-06-30 12:18 UTC (permalink / raw) To: Boaz Harrosh Cc: Christoph Hellwig, James Bottomley, axboe, dm-devel, linux-kernel, martin.petersen, akpm, linux-scsi, FUJITA Tomonori On Wed, Jun 30 2010 at 6:57am -0400, Boaz Harrosh <bharrosh@panasas.com> wrote: > On 06/30/2010 01:41 PM, Christoph Hellwig wrote: > > On Wed, Jun 30, 2010 at 01:25:01PM +0300, Boaz Harrosh wrote: > >> OK, Thanks, I see. Is it one of these operations, (like we have in OSD) where > >> the CDB information spills into the payload? like the scatter-gather and extent > >> lists and such. > > > > For UNMAP the payload is a list of block number / length pairs, while > > the CDB itself doesn't contain any information like that. It's a rather > > awkward command. > > > > How big can that be? could we, maybe, use the sense_buffer, properly allocated > already? > > >> Do we actually use a WRITE_SAME which is not zero? for what use? > > > > The kernel doesn't issue any WRITE SAME without the unmap bit set. > > So if the unmap bit is set then the page can just be zero, right? > > I still think a static zero-page is a worth while optimization. And > block-drivers can take care with special needs with a private mem_pool > or something. For the discard-type user and generic block layer the > page is just an implementation specific residue, No? Why should the block layer have any role in managing this page? Block layer doesn't care about it, SCSI does. Mike ^ permalink raw reply [flat|nested] 60+ messages in thread
* [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-22 18:00 ` Mike Snitzer 2010-06-26 19:56 ` [PATCH 1/2] block: fix leaks associated with discard request payload Mike Snitzer @ 2010-06-26 19:56 ` Mike Snitzer 2010-06-27 9:39 ` Christoph Hellwig ` (3 more replies) 1 sibling, 4 replies; 60+ messages in thread From: Mike Snitzer @ 2010-06-26 19:56 UTC (permalink / raw) To: Christoph Hellwig Cc: axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so means bio_has_data() will not be true until the SCSI layer adds the payload to the discard request via blk_add_request_payload. bio_{enable,disable}_inline_vecs are not expected to be widely used so they were exported using EXPORT_SYMBOL_GPL. This patch avoids the need for the following VM accounting fix for discards: http://lkml.org/lkml/2010/6/23/361 NOTE: Jens, you said you applied Tao Ma's fix but I cannot see it in your linux-2.6-block's for-next or for-2.6.36... as such I didn't revert it in this patch. Signed-off-by: Mike Snitzer <snitzer@redhat.com> --- block/blk-core.c | 2 ++ block/blk-lib.c | 2 +- fs/bio.c | 19 +++++++++++++++++-- include/linux/bio.h | 3 +++ 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 07925aa..457fc29 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1153,6 +1153,7 @@ void blk_add_request_payload(struct request *rq, struct page *page, { struct bio *bio = rq->bio; + bio_enable_inline_vecs(bio); bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; bio->bi_io_vec->bv_len = len; @@ -1187,6 +1188,7 @@ void blk_clear_request_payload(struct request *rq) bio->bi_io_vec->bv_page = NULL; bio->bi_io_vec->bv_len = 0; + bio_disable_inline_vecs(bio); } EXPORT_SYMBOL_GPL(blk_clear_request_payload); diff --git a/block/blk-lib.c b/block/blk-lib.c index e16185b..345a4b6 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -54,7 +54,7 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, unsigned int max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - bio = bio_alloc(gfp_mask, 1); + bio = bio_alloc(gfp_mask, 0); if (!bio) { ret = -ENOMEM; break; diff --git a/fs/bio.c b/fs/bio.c index 8abb2df..e47d0a6 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -260,6 +260,21 @@ void bio_init(struct bio *bio) } EXPORT_SYMBOL(bio_init); +void bio_enable_inline_vecs(struct bio *bio) +{ + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = BIO_INLINE_VECS; + bio->bi_io_vec = bio->bi_inline_vecs;; +} +EXPORT_SYMBOL_GPL(bio_enable_inline_vecs); + +void bio_disable_inline_vecs(struct bio *bio) +{ + bio->bi_max_vecs = 0; + bio->bi_io_vec = NULL; +} +EXPORT_SYMBOL_GPL(bio_disable_inline_vecs); + /** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator @@ -293,8 +308,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) goto out_set; if (nr_iovecs <= BIO_INLINE_VECS) { - bvl = bio->bi_inline_vecs; - nr_iovecs = BIO_INLINE_VECS; + bio_enable_inline_vecs(bio); + return bio; } else { bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) diff --git a/include/linux/bio.h b/include/linux/bio.h index 4d379c8..3135631 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -403,6 +403,9 @@ extern struct bio *bio_clone(struct bio *, gfp_t); extern void bio_init(struct bio *); +extern void bio_enable_inline_vecs(struct bio *); +extern void bio_disable_inline_vecs(struct bio *); + extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); -- 1.6.5.rc2 ^ permalink raw reply related [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-26 19:56 ` [PATCH 2/2] block: defer the use of inline biovecs for discard requests Mike Snitzer @ 2010-06-27 9:39 ` Christoph Hellwig 2010-06-27 14:00 ` Mike Snitzer 2010-06-27 14:55 ` [PATCH 2/2 v2] " Mike Snitzer ` (2 subsequent siblings) 3 siblings, 1 reply; 60+ messages in thread From: Christoph Hellwig @ 2010-06-27 9:39 UTC (permalink / raw) To: Mike Snitzer Cc: Christoph Hellwig, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Sat, Jun 26, 2010 at 03:56:51PM -0400, Mike Snitzer wrote: > Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so > means bio_has_data() will not be true until the SCSI layer adds the > payload to the discard request via blk_add_request_payload. > > bio_{enable,disable}_inline_vecs are not expected to be widely used so > they were exported using EXPORT_SYMBOL_GPL. Why do we need them exported at all? Also some comments on these functions would be useful. Otherwise it looks good to me. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-27 9:39 ` Christoph Hellwig @ 2010-06-27 14:00 ` Mike Snitzer 0 siblings, 0 replies; 60+ messages in thread From: Mike Snitzer @ 2010-06-27 14:00 UTC (permalink / raw) To: Christoph Hellwig Cc: axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Sun, Jun 27 2010 at 5:39am -0400, Christoph Hellwig <hch@lst.de> wrote: > On Sat, Jun 26, 2010 at 03:56:51PM -0400, Mike Snitzer wrote: > > Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so > > means bio_has_data() will not be true until the SCSI layer adds the > > payload to the discard request via blk_add_request_payload. > > > > bio_{enable,disable}_inline_vecs are not expected to be widely used so > > they were exported using EXPORT_SYMBOL_GPL. > > Why do we need them exported at all? Hmm, good point! > Also some comments on these functions would be useful. OK. > Otherwise it looks good to me. Thanks, I'll get a v2 of this patch out. ^ permalink raw reply [flat|nested] 60+ messages in thread
* [PATCH 2/2 v2] block: defer the use of inline biovecs for discard requests 2010-06-26 19:56 ` [PATCH 2/2] block: defer the use of inline biovecs for discard requests Mike Snitzer 2010-06-27 9:39 ` Christoph Hellwig @ 2010-06-27 14:55 ` Mike Snitzer 2010-06-27 15:33 ` Christoph Hellwig 2010-06-28 10:33 ` [PATCH 2/2] " FUJITA Tomonori 2010-06-28 12:34 ` Jens Axboe 3 siblings, 1 reply; 60+ messages in thread From: Mike Snitzer @ 2010-06-27 14:55 UTC (permalink / raw) To: Christoph Hellwig Cc: axboe, martin.petersen, linux-kernel, dm-devel, James.Bottomley, akpm Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so means bio_has_data() will not be true until the SCSI layer adds the payload to the discard request via blk_add_request_payload. This patch avoids the need for the following VM accounting fix for discards: http://lkml.org/lkml/2010/6/23/361 NOTE: Jens, you said you applied Tao Ma's fix but I cannot see it in your linux-2.6-block's for-next or for-2.6.36... as such I didn't revert it in this patch. Signed-off-by: Mike Snitzer <snitzer@redhat.com> --- block/blk-core.c | 2 ++ block/blk-lib.c | 2 +- fs/bio.c | 25 +++++++++++++++++++++++-- include/linux/bio.h | 3 +++ 4 files changed, 29 insertions(+), 3 deletions(-) Index: linux-2.6-block/block/blk-core.c =================================================================== --- linux-2.6-block.orig/block/blk-core.c +++ linux-2.6-block/block/blk-core.c @@ -1153,6 +1153,7 @@ void blk_add_request_payload(struct requ { struct bio *bio = rq->bio; + bio_enable_inline_vecs(bio); bio->bi_io_vec->bv_page = page; bio->bi_io_vec->bv_offset = 0; bio->bi_io_vec->bv_len = len; @@ -1187,6 +1188,7 @@ void blk_clear_request_payload(struct re bio->bi_io_vec->bv_page = NULL; bio->bi_io_vec->bv_len = 0; + bio_disable_inline_vecs(bio); } EXPORT_SYMBOL_GPL(blk_clear_request_payload); Index: linux-2.6-block/block/blk-lib.c =================================================================== --- linux-2.6-block.orig/block/blk-lib.c +++ linux-2.6-block/block/blk-lib.c @@ -54,7 +54,7 @@ int blkdev_issue_discard(struct block_de unsigned int max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); - bio = bio_alloc(gfp_mask, 1); + bio = bio_alloc(gfp_mask, 0); if (!bio) { ret = -ENOMEM; break; Index: linux-2.6-block/fs/bio.c =================================================================== --- linux-2.6-block.orig/fs/bio.c +++ linux-2.6-block/fs/bio.c @@ -261,6 +261,27 @@ void bio_init(struct bio *bio) EXPORT_SYMBOL(bio_init); /** + * bio_enable_inline_vecs - enable use of bio's inline iovecs + * @bio: bio that will use its inline iovecs + */ +void bio_enable_inline_vecs(struct bio *bio) +{ + bio->bi_flags |= BIO_POOL_NONE << BIO_POOL_OFFSET; + bio->bi_max_vecs = BIO_INLINE_VECS; + bio->bi_io_vec = bio->bi_inline_vecs;; +} + +/** + * bio_disable_inline_vecs - disable use of bio's inline iovecs + * @bio: bio that will no longer use its inline iovecs + */ +void bio_disable_inline_vecs(struct bio *bio) +{ + bio->bi_max_vecs = 0; + bio->bi_io_vec = NULL; +} + +/** * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate @@ -293,8 +314,8 @@ struct bio *bio_alloc_bioset(gfp_t gfp_m goto out_set; if (nr_iovecs <= BIO_INLINE_VECS) { - bvl = bio->bi_inline_vecs; - nr_iovecs = BIO_INLINE_VECS; + bio_enable_inline_vecs(bio); + return bio; } else { bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) Index: linux-2.6-block/include/linux/bio.h =================================================================== --- linux-2.6-block.orig/include/linux/bio.h +++ linux-2.6-block/include/linux/bio.h @@ -403,6 +403,9 @@ extern struct bio *bio_clone(struct bio extern void bio_init(struct bio *); +extern void bio_enable_inline_vecs(struct bio *); +extern void bio_disable_inline_vecs(struct bio *); + extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, unsigned int, unsigned int); ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2 v2] block: defer the use of inline biovecs for discard requests 2010-06-27 14:55 ` [PATCH 2/2 v2] " Mike Snitzer @ 2010-06-27 15:33 ` Christoph Hellwig 0 siblings, 0 replies; 60+ messages in thread From: Christoph Hellwig @ 2010-06-27 15:33 UTC (permalink / raw) To: Mike Snitzer Cc: Christoph Hellwig, axboe, martin.petersen, linux-kernel, dm-devel, James.Bottomley, akpm Looks good, Reviewed-by: Christoph Hellwig <hch@lst.de> ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-26 19:56 ` [PATCH 2/2] block: defer the use of inline biovecs for discard requests Mike Snitzer 2010-06-27 9:39 ` Christoph Hellwig 2010-06-27 14:55 ` [PATCH 2/2 v2] " Mike Snitzer @ 2010-06-28 10:33 ` FUJITA Tomonori 2010-06-28 12:29 ` Mike Snitzer 2010-06-28 12:34 ` Jens Axboe 3 siblings, 1 reply; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-28 10:33 UTC (permalink / raw) To: snitzer Cc: hch, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Sat, 26 Jun 2010 15:56:51 -0400 Mike Snitzer <snitzer@redhat.com> wrote: > Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so > means bio_has_data() will not be true until the SCSI layer adds the > payload to the discard request via blk_add_request_payload. > > bio_{enable,disable}_inline_vecs are not expected to be widely used so > they were exported using EXPORT_SYMBOL_GPL. > > This patch avoids the need for the following VM accounting fix for > discards: http://lkml.org/lkml/2010/6/23/361 Why do we need to avoid the above fix? Surely, the above fix is hacky but much simpler than this patch. > NOTE: Jens, you said you applied Tao Ma's fix but I cannot see it in > your linux-2.6-block's for-next or for-2.6.36... as such I didn't revert > it in this patch. > > Signed-off-by: Mike Snitzer <snitzer@redhat.com> > --- > block/blk-core.c | 2 ++ > block/blk-lib.c | 2 +- > fs/bio.c | 19 +++++++++++++++++-- > include/linux/bio.h | 3 +++ > 4 files changed, 23 insertions(+), 3 deletions(-) ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-28 10:33 ` [PATCH 2/2] " FUJITA Tomonori @ 2010-06-28 12:29 ` Mike Snitzer 2010-06-28 15:15 ` FUJITA Tomonori 0 siblings, 1 reply; 60+ messages in thread From: Mike Snitzer @ 2010-06-28 12:29 UTC (permalink / raw) To: FUJITA Tomonori Cc: hch, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Mon, Jun 28 2010 at 6:33am -0400, FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> wrote: > On Sat, 26 Jun 2010 15:56:51 -0400 > Mike Snitzer <snitzer@redhat.com> wrote: > > > Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so > > means bio_has_data() will not be true until the SCSI layer adds the > > payload to the discard request via blk_add_request_payload. > > > > bio_{enable,disable}_inline_vecs are not expected to be widely used so > > they were exported using EXPORT_SYMBOL_GPL. > > > > This patch avoids the need for the following VM accounting fix for > > discards: http://lkml.org/lkml/2010/6/23/361 > > Why do we need to avoid the above fix? We don't _need_ to. We avoid the need for it as a side-effect of the cleanup that my patch provides. > Surely, the above fix is hacky but much simpler than this patch. My patch wasn't meant as an alternative to Tao Ma's patch. Again, it just obviates the need for it. Your tolerance for "hacky" is difficult to understand. On the one-hand (PATCH 1/2) you have no tolerance for "hacky" fixes for leaks (that introduce a short-term SCSI layering violation). But in this case you're perfectly fine with BIO_RW_DISCARD special casing? Mike ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-28 12:29 ` Mike Snitzer @ 2010-06-28 15:15 ` FUJITA Tomonori 2010-06-28 15:31 ` Mike Snitzer 0 siblings, 1 reply; 60+ messages in thread From: FUJITA Tomonori @ 2010-06-28 15:15 UTC (permalink / raw) To: snitzer Cc: fujita.tomonori, hch, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Mon, 28 Jun 2010 08:29:55 -0400 Mike Snitzer <snitzer@redhat.com> wrote: > On Mon, Jun 28 2010 at 6:33am -0400, > FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> wrote: > > > On Sat, 26 Jun 2010 15:56:51 -0400 > > Mike Snitzer <snitzer@redhat.com> wrote: > > > > > Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so > > > means bio_has_data() will not be true until the SCSI layer adds the > > > payload to the discard request via blk_add_request_payload. > > > > > > bio_{enable,disable}_inline_vecs are not expected to be widely used so > > > they were exported using EXPORT_SYMBOL_GPL. > > > > > > This patch avoids the need for the following VM accounting fix for > > > discards: http://lkml.org/lkml/2010/6/23/361 > > > > Why do we need to avoid the above fix? > > We don't _need_ to. We avoid the need for it as a side-effect of the > cleanup that my patch provides. > > > Surely, the above fix is hacky but much simpler than this patch. > > My patch wasn't meant as an alternative to Tao Ma's patch. Again, it > just obviates the need for it. > > Your tolerance for "hacky" is difficult to understand. On the one-hand > (PATCH 1/2) you have no tolerance for "hacky" fixes for leaks (that > introduce a short-term SCSI layering violation). Sorry, if not clear enough. - SCSI layering violation is bad. - A 'short term' solution always turns out to be a long solution. We should have a clean solution from the start. - Complicating the SCSI I/O completion is bad (already complicated enough). ... And the 'leaks' bug is still in -next. No need to fix it in a hacky way. We can just drop it from -next. > But in this case > you're perfectly fine with BIO_RW_DISCARD special casing? BIO_RW_DISCARD special is already everywhere in the block layer. I prefer to have the less. However as long as it's in the block layer, I can live with it. After all, that's the block layer thing. At least, it looks much better this patch. This patch is really hacky (as Jens said). ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-28 15:15 ` FUJITA Tomonori @ 2010-06-28 15:31 ` Mike Snitzer 0 siblings, 0 replies; 60+ messages in thread From: Mike Snitzer @ 2010-06-28 15:31 UTC (permalink / raw) To: FUJITA Tomonori Cc: hch, axboe, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Mon, Jun 28 2010 at 11:15am -0400, FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> wrote: > On Mon, 28 Jun 2010 08:29:55 -0400 > Mike Snitzer <snitzer@redhat.com> wrote: > > > On Mon, Jun 28 2010 at 6:33am -0400, > > FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp> wrote: > > > > > On Sat, 26 Jun 2010 15:56:51 -0400 > > > Mike Snitzer <snitzer@redhat.com> wrote: > > > > > > > Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so > > > > means bio_has_data() will not be true until the SCSI layer adds the > > > > payload to the discard request via blk_add_request_payload. > > > > > > > > bio_{enable,disable}_inline_vecs are not expected to be widely used so > > > > they were exported using EXPORT_SYMBOL_GPL. > > > > > > > > This patch avoids the need for the following VM accounting fix for > > > > discards: http://lkml.org/lkml/2010/6/23/361 > > > > > > Why do we need to avoid the above fix? > > > > We don't _need_ to. We avoid the need for it as a side-effect of the > > cleanup that my patch provides. > > > > > Surely, the above fix is hacky but much simpler than this patch. > > > > My patch wasn't meant as an alternative to Tao Ma's patch. Again, it > > just obviates the need for it. > > > > Your tolerance for "hacky" is difficult to understand. On the one-hand > > (PATCH 1/2) you have no tolerance for "hacky" fixes for leaks (that > > introduce a short-term SCSI layering violation). > > Sorry, if not clear enough. > > - SCSI layering violation is bad. > > - A 'short term' solution always turns out to be a long solution. We > should have a clean solution from the start. > > - Complicating the SCSI I/O completion is bad (already complicated > enough). > > ... > > And the 'leaks' bug is still in -next. No need to fix it in a hacky > way. We can just drop it from -next. > > > > But in this case > > you're perfectly fine with BIO_RW_DISCARD special casing? > > BIO_RW_DISCARD special is already everywhere in the block layer. I > prefer to have the less. However as long as it's in the block layer, I > can live with it. After all, that's the block layer thing. > > At least, it looks much better this patch. This patch is really hacky > (as Jens said). Christoph more clearly conveyed the intent of my patch. Its focus was _not_ to eliminate the need for Tao Ma's vm accounting patch. I was attempting to have the SCSI layer more comprehensively manage the allocation and use of biovec associated with the discard payload (that the SCSI layer was now also managing rather than relying on the block layer). It is as simple as that. Berating me with "really hacky" critiques doesn't change the fact that both the block layer _and_ the SCSI layer need serious help on their implementation of discard support. The entirety of Linux's current discard support is "really hacky". I think we can all agree on that; so if any good came of the discussion over the past 24 hours it is: we now know work is needed to make Linux's discard support more capable (select few knew this, but many more are aware of that fact now). And the SCSI layer has a significant role in improving Linux's discard capabilities. So relying on all discard changes to be in the block layer isn't an option ;) Regards, Mike ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-26 19:56 ` [PATCH 2/2] block: defer the use of inline biovecs for discard requests Mike Snitzer ` (2 preceding siblings ...) 2010-06-28 10:33 ` [PATCH 2/2] " FUJITA Tomonori @ 2010-06-28 12:34 ` Jens Axboe 2010-06-28 12:37 ` Mike Snitzer 3 siblings, 1 reply; 60+ messages in thread From: Jens Axboe @ 2010-06-28 12:34 UTC (permalink / raw) To: Mike Snitzer Cc: Christoph Hellwig, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On 2010-06-26 21:56, Mike Snitzer wrote: > Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so > means bio_has_data() will not be true until the SCSI layer adds the > payload to the discard request via blk_add_request_payload. Sorry, this looks horrible. What is the point of this?! > bio_{enable,disable}_inline_vecs are not expected to be widely used so > they were exported using EXPORT_SYMBOL_GPL. Never export anything that doesn't have an in-kernel modular user. > This patch avoids the need for the following VM accounting fix for > discards: http://lkml.org/lkml/2010/6/23/361 > NOTE: Jens, you said you applied Tao Ma's fix but I cannot see it in > your linux-2.6-block's for-next or for-2.6.36... as such I didn't revert > it in this patch. It's in the for-linus branch, that is stuff headed for the current kernel. -- Jens Axboe ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-28 12:34 ` Jens Axboe @ 2010-06-28 12:37 ` Mike Snitzer 2010-06-28 12:41 ` Jens Axboe 0 siblings, 1 reply; 60+ messages in thread From: Mike Snitzer @ 2010-06-28 12:37 UTC (permalink / raw) To: Jens Axboe Cc: Christoph Hellwig, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Mon, Jun 28 2010 at 8:34am -0400, Jens Axboe <axboe@kernel.dk> wrote: > On 2010-06-26 21:56, Mike Snitzer wrote: > > Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so > > means bio_has_data() will not be true until the SCSI layer adds the > > payload to the discard request via blk_add_request_payload. > > Sorry, this looks horrible. Your judgment isn't giving me much to work with... not sure where I go with "horrible". > What is the point of this?! Enables discard requests with _not_ return true for bio_has_data(). > > bio_{enable,disable}_inline_vecs are not expected to be widely used so > > they were exported using EXPORT_SYMBOL_GPL. > > Never export anything that doesn't have an in-kernel modular user. Yeap, v2 removed the exports. > > This patch avoids the need for the following VM accounting fix for > > discards: http://lkml.org/lkml/2010/6/23/361 > > NOTE: Jens, you said you applied Tao Ma's fix but I cannot see it in > > your linux-2.6-block's for-next or for-2.6.36... as such I didn't revert > > it in this patch. > > It's in the for-linus branch, that is stuff headed for the current > kernel. OK. Mike ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-28 12:37 ` Mike Snitzer @ 2010-06-28 12:41 ` Jens Axboe 2010-06-28 12:44 ` Christoph Hellwig 2010-06-28 12:45 ` Mike Snitzer 0 siblings, 2 replies; 60+ messages in thread From: Jens Axboe @ 2010-06-28 12:41 UTC (permalink / raw) To: Mike Snitzer Cc: Christoph Hellwig, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On 2010-06-28 14:37, Mike Snitzer wrote: > On Mon, Jun 28 2010 at 8:34am -0400, > Jens Axboe <axboe@kernel.dk> wrote: > >> On 2010-06-26 21:56, Mike Snitzer wrote: >>> Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so >>> means bio_has_data() will not be true until the SCSI layer adds the >>> payload to the discard request via blk_add_request_payload. >> >> Sorry, this looks horrible. > > Your judgment isn't giving me much to work with... not sure where I go > with "horrible". The horrible part is working around that issue by fiddling with the assignment of the internal vec. THAT looks like a horrible solution to that problem. How about just adding a check to bio_has_data() for non-zero bio->bi_vcnt? -- Jens Axboe ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-28 12:41 ` Jens Axboe @ 2010-06-28 12:44 ` Christoph Hellwig 2010-06-28 12:49 ` Jens Axboe 2010-06-28 12:45 ` Mike Snitzer 1 sibling, 1 reply; 60+ messages in thread From: Christoph Hellwig @ 2010-06-28 12:44 UTC (permalink / raw) To: Jens Axboe Cc: Mike Snitzer, Christoph Hellwig, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Mon, Jun 28, 2010 at 02:41:30PM +0200, Jens Axboe wrote: > The horrible part is working around that issue by fiddling with the > assignment of the internal vec. THAT looks like a horrible solution > to that problem. > > How about just adding a check to bio_has_data() for non-zero > bio->bi_vcnt? The question is how a discard request from the block layer should look like. With Mike's patch we have the same situation as for a barrier request: absolutely no data transferred and no indicator of it. IHMO that's much better than any partially constructed request. And yes, that means enabling the payload later in the driver. The other option would be to not reuse the request at all and just allocate a new request and use that from sd_prep_fn. That's what I tried to implement first, but I couldn't get it to work. Given all the issue we have with the current approach I'm almost tempted to try that again. ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-28 12:44 ` Christoph Hellwig @ 2010-06-28 12:49 ` Jens Axboe 0 siblings, 0 replies; 60+ messages in thread From: Jens Axboe @ 2010-06-28 12:49 UTC (permalink / raw) To: Christoph Hellwig Cc: Mike Snitzer, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On 2010-06-28 14:44, Christoph Hellwig wrote: > On Mon, Jun 28, 2010 at 02:41:30PM +0200, Jens Axboe wrote: >> The horrible part is working around that issue by fiddling with the >> assignment of the internal vec. THAT looks like a horrible solution >> to that problem. >> >> How about just adding a check to bio_has_data() for non-zero >> bio->bi_vcnt? > > The question is how a discard request from the block layer should look > like. With Mike's patch we have the same situation as for a barrier > request: absolutely no data transferred and no indicator of it. IHMO > that's much better than any partially constructed request. And yes, > that means enabling the payload later in the driver. With a barrier, it's more clear I think - if it carries data, then you account that. If it's an empty barrier, then there's nothing to account. There will be an impact on the io stream, but that is indicated in blktrace for instance. > The other option would be to not reuse the request at all and just > allocate a new request and use that from sd_prep_fn. That's what > I tried to implement first, but I couldn't get it to work. Given > all the issue we have with the current approach I'm almost tempted > to try that again. That sounds way cleaner... -- Jens Axboe ^ permalink raw reply [flat|nested] 60+ messages in thread
* Re: [PATCH 2/2] block: defer the use of inline biovecs for discard requests 2010-06-28 12:41 ` Jens Axboe 2010-06-28 12:44 ` Christoph Hellwig @ 2010-06-28 12:45 ` Mike Snitzer 1 sibling, 0 replies; 60+ messages in thread From: Mike Snitzer @ 2010-06-28 12:45 UTC (permalink / raw) To: Jens Axboe Cc: Christoph Hellwig, dm-devel, James.Bottomley, linux-kernel, martin.petersen, akpm On Mon, Jun 28 2010 at 8:41am -0400, Jens Axboe <axboe@kernel.dk> wrote: > On 2010-06-28 14:37, Mike Snitzer wrote: > > On Mon, Jun 28 2010 at 8:34am -0400, > > Jens Axboe <axboe@kernel.dk> wrote: > > > >> On 2010-06-26 21:56, Mike Snitzer wrote: > >>> Don't alloc discard bio with a biovec in blkdev_issue_discard. Doing so > >>> means bio_has_data() will not be true until the SCSI layer adds the > >>> payload to the discard request via blk_add_request_payload. > >> > >> Sorry, this looks horrible. > > > > Your judgment isn't giving me much to work with... not sure where I go > > with "horrible". > > The horrible part is working around that issue by fiddling with the > assignment of the internal vec. THAT looks like a horrible solution > to that problem. > > How about just adding a check to bio_has_data() for non-zero > bio->bi_vcnt? Sure, that works. ^ permalink raw reply [flat|nested] 60+ messages in thread
end of thread, other threads:[~2010-07-01 14:04 UTC | newest] Thread overview: 60+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2010-06-18 14:59 [PATCH, RFC] block: don't allocate a payload for discard request Christoph Hellwig 2010-06-19 4:25 ` Mike Snitzer 2010-06-22 18:00 ` Mike Snitzer 2010-06-26 19:56 ` [PATCH 1/2] block: fix leaks associated with discard request payload Mike Snitzer 2010-06-27 8:49 ` FUJITA Tomonori 2010-06-27 9:26 ` Christoph Hellwig 2010-06-27 10:01 ` FUJITA Tomonori 2010-06-27 10:35 ` FUJITA Tomonori 2010-06-27 11:07 ` Christoph Hellwig 2010-06-27 12:32 ` FUJITA Tomonori 2010-06-27 14:16 ` Mike Snitzer 2010-06-27 15:35 ` Christoph Hellwig 2010-06-27 16:23 ` FUJITA Tomonori 2010-06-27 15:33 ` Christoph Hellwig 2010-06-28 7:57 ` Christoph Hellwig 2010-06-28 8:14 ` FUJITA Tomonori 2010-06-28 8:18 ` Jens Axboe 2010-06-28 8:45 ` FUJITA Tomonori 2010-06-28 8:45 ` FUJITA Tomonori 2010-06-28 15:25 ` Christoph Hellwig 2010-06-30 11:55 ` FUJITA Tomonori 2010-07-01 4:21 ` FUJITA Tomonori 2010-06-27 9:38 ` Christoph Hellwig 2010-06-27 15:29 ` James Bottomley 2010-06-28 17:16 ` Martin K. Petersen 2010-06-29 8:00 ` Boaz Harrosh 2010-06-29 22:28 ` [dm-devel] " Mikulas Patocka 2010-06-29 23:03 ` James Bottomley 2010-06-29 23:51 ` Mike Snitzer 2010-06-29 23:51 ` Mike Snitzer 2010-06-30 0:11 ` [dm-devel] " Mikulas Patocka 2010-06-30 14:22 ` James Bottomley 2010-06-30 15:36 ` Mike Snitzer 2010-06-30 16:26 ` James Bottomley 2010-07-01 12:28 ` [dm-devel] " Mikulas Patocka 2010-07-01 12:46 ` Mike Snitzer 2010-07-01 14:03 ` Mikulas Patocka 2010-07-01 14:03 ` Mikulas Patocka 2010-07-01 12:49 ` [dm-devel] " Alasdair G Kergon 2010-06-30 8:32 ` Boaz Harrosh 2010-06-30 8:42 ` Christoph Hellwig 2010-06-30 10:25 ` Boaz Harrosh 2010-06-30 10:41 ` Christoph Hellwig 2010-06-30 10:57 ` Boaz Harrosh 2010-06-30 12:18 ` Mike Snitzer 2010-06-26 19:56 ` [PATCH 2/2] block: defer the use of inline biovecs for discard requests Mike Snitzer 2010-06-27 9:39 ` Christoph Hellwig 2010-06-27 14:00 ` Mike Snitzer 2010-06-27 14:55 ` [PATCH 2/2 v2] " Mike Snitzer 2010-06-27 15:33 ` Christoph Hellwig 2010-06-28 10:33 ` [PATCH 2/2] " FUJITA Tomonori 2010-06-28 12:29 ` Mike Snitzer 2010-06-28 15:15 ` FUJITA Tomonori 2010-06-28 15:31 ` Mike Snitzer 2010-06-28 12:34 ` Jens Axboe 2010-06-28 12:37 ` Mike Snitzer 2010-06-28 12:41 ` Jens Axboe 2010-06-28 12:44 ` Christoph Hellwig 2010-06-28 12:49 ` Jens Axboe 2010-06-28 12:45 ` Mike Snitzer
This is an external index of several public inboxes, see mirroring instructions on how to clone and mirror all data and code used by this external index.