All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] block: Add discard flag to blkdev_issue_zeroout() function
@ 2015-01-21  1:06 Martin K. Petersen
  2015-01-21 10:10 ` [PATCH] block: create ioctl to discard-or-zeroout a range of blocks Darrick J. Wong
  2015-01-21 17:42 ` [PATCH] block: Add discard flag to blkdev_issue_zeroout() function Jens Axboe
  0 siblings, 2 replies; 15+ messages in thread
From: Martin K. Petersen @ 2015-01-21  1:06 UTC (permalink / raw)
  To: axboe; +Cc: linux-scsi, linux-fsdevel, Martin K. Petersen

blkdev_issue_discard() will zero a given block range. This is done by
way of explicit writing, thus provisioning or allocating the blocks on
disk.

There are use cases where the desired behavior is to zero the blocks but
unprovision them if possible. The blocks must deterministically contain
zeroes when they are subsequently read back.

This patch adds a flag to blkdev_issue_zeroout() that provides this
variant. If the discard flag is set and a block device guarantees
discard_zeroes_data we will use REQ_DISCARD to clear the block range. If
the device does not support discard_zeroes_data or if the discard
request fails we will fall back to first REQ_WRITE_SAME and then a
regular REQ_WRITE.

Also update the callers of blkdev_issue_zero() to reflect the new flag
and make sb_issue_zeroout() prefer the discard approach.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-lib.c                    | 30 ++++++++++++++++++++++++++----
 block/ioctl.c                      |  2 +-
 drivers/block/drbd/drbd_receiver.c |  2 +-
 include/linux/blkdev.h             |  4 ++--
 4 files changed, 30 insertions(+), 8 deletions(-)

diff --git a/block/blk-lib.c b/block/blk-lib.c
index 8411be3c19d3..715e948f58a4 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -283,23 +283,45 @@ static int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
  * @sector:	start sector
  * @nr_sects:	number of sectors to write
  * @gfp_mask:	memory allocation flags (for bio_alloc)
+ * @discard:	whether to discard the block range
  *
  * Description:
- *  Generate and issue number of bios with zerofiled pages.
+
+ *  Zero-fill a block range.  If the discard flag is set and the block
+ *  device guarantees that subsequent READ operations to the block range
+ *  in question will return zeroes, the blocks will be discarded. Should
+ *  the discard request fail, if the discard flag is not set, or if
+ *  discard_zeroes_data is not supported, this function will resort to
+ *  zeroing the blocks manually, thus provisioning (allocating,
+ *  anchoring) them. If the block device supports the WRITE SAME command
+ *  blkdev_issue_zeroout() will use it to optimize the process of
+ *  clearing the block range. Otherwise the zeroing will be performed
+ *  using regular WRITE calls.
  */
 
 int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			 sector_t nr_sects, gfp_t gfp_mask)
+			 sector_t nr_sects, gfp_t gfp_mask, bool discard)
 {
+	struct request_queue *q = bdev_get_queue(bdev);
+	unsigned char bdn[BDEVNAME_SIZE];
+
+	if (discard && blk_queue_discard(q) && q->limits.discard_zeroes_data) {
+
+		if (!blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, 0))
+			return 0;
+
+		bdevname(bdev, bdn);
+		pr_warn("%s: DISCARD failed. Manually zeroing.\n", bdn);
+	}
+
 	if (bdev_write_same(bdev)) {
-		unsigned char bdn[BDEVNAME_SIZE];
 
 		if (!blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask,
 					     ZERO_PAGE(0)))
 			return 0;
 
 		bdevname(bdev, bdn);
-		pr_err("%s: WRITE SAME failed. Manually zeroing.\n", bdn);
+		pr_warn("%s: WRITE SAME failed. Manually zeroing.\n", bdn);
 	}
 
 	return __blkdev_issue_zeroout(bdev, sector, nr_sects, gfp_mask);
diff --git a/block/ioctl.c b/block/ioctl.c
index 6c7bf903742f..7d8befde2aca 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -198,7 +198,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
 	if (start + len > (i_size_read(bdev->bd_inode) >> 9))
 		return -EINVAL;
 
-	return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL);
+	return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false);
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index d169b4a79267..cee20354ac37 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1388,7 +1388,7 @@ int drbd_submit_peer_request(struct drbd_device *device,
 		list_add_tail(&peer_req->w.list, &device->active_ee);
 		spin_unlock_irq(&device->resource->req_lock);
 		if (blkdev_issue_zeroout(device->ldev->backing_bdev,
-			sector, data_size >> 9, GFP_NOIO))
+			sector, data_size >> 9, GFP_NOIO, false))
 			peer_req->flags |= EE_WAS_ERROR;
 		drbd_endio_write_sec_final(peer_req);
 		return 0;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 92f4b4b288dd..669f747ce947 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1162,7 +1162,7 @@ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, struct page *page);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
-			sector_t nr_sects, gfp_t gfp_mask);
+		sector_t nr_sects, gfp_t gfp_mask, bool discard);
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
@@ -1176,7 +1176,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
 	return blkdev_issue_zeroout(sb->s_bdev,
 				    block << (sb->s_blocksize_bits - 9),
 				    nr_blocks << (sb->s_blocksize_bits - 9),
-				    gfp_mask);
+				    gfp_mask, true);
 }
 
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
-- 
1.9.3


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-01-21  1:06 [PATCH] block: Add discard flag to blkdev_issue_zeroout() function Martin K. Petersen
@ 2015-01-21 10:10 ` Darrick J. Wong
  2015-01-21 17:42 ` [PATCH] block: Add discard flag to blkdev_issue_zeroout() function Jens Axboe
  1 sibling, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2015-01-21 10:10 UTC (permalink / raw)
  To: Martin K. Petersen; +Cc: axboe, linux-scsi, linux-fsdevel

Create a new ioctl to expose the block layer's newfound ability to
issue either a zeroing discard, a WRITE SAME with a zero page, or a
regular write with the zero page.  This BLKZEROOUT2 ioctl takes
{start, length, flags} as parameters.  So far, the only flag available
is to enable the zeroing discard part -- without it, the call invokes
the old BLKZEROOUT behavior.  start and length have the same meaning
as in BLKZEROOUT.

Furthermore, because BLKZEROOUT2 issues commands directly to the
storage device, we must invalidate the page cache (as a regular
O_DIRECT write would do) to avoid returning stale cache contents at a
later time.

Depends on "block: Add discard flag to blkdev_issue_zeroout() function".

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 block/ioctl.c           |   45 ++++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/fs.h |    7 +++++++
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index 7d8befd..ff623d5 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -186,19 +186,39 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 }
 
 static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
-			     uint64_t len)
+			     uint64_t len, uint32_t flags)
 {
+	int ret;
+	struct address_space *mapping;
+	uint64_t end = start + len - 1;
+
+	if (flags & ~BLKZEROOUT2_DISCARD_OK)
+		return -EINVAL;
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
 		return -EINVAL;
-	start >>= 9;
-	len >>= 9;
-
-	if (start + len > (i_size_read(bdev->bd_inode) >> 9))
+	if (end >= i_size_read(bdev->bd_inode))
 		return -EINVAL;
 
-	return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false);
+	/* Invalidate the page cache, including dirty pages */
+	mapping = bdev->bd_inode->i_mapping;
+	truncate_inode_pages_range(mapping, start, end);
+
+	ret = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
+				   flags & BLKZEROOUT2_DISCARD_OK);
+	if (ret)
+		goto out;
+
+	/*
+	 * Invalidate again; if someone wandered in and dirtied a page,
+	 * the caller will be given -EBUSY.
+	 */
+	ret = invalidate_inode_pages2_range(mapping,
+					    start >> PAGE_CACHE_SHIFT,
+					    end >> PAGE_CACHE_SHIFT);
+out:
+	return ret;
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
@@ -326,7 +346,18 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
 			return -EFAULT;
 
-		return blk_ioctl_zeroout(bdev, range[0], range[1]);
+		return blk_ioctl_zeroout(bdev, range[0], range[1], 0);
+	}
+	case BLKZEROOUT2: {
+		struct blkzeroout2 p;
+
+		if (!(mode & FMODE_WRITE))
+			return -EBADF;
+
+		if (copy_from_user(&p, (void __user *)arg, sizeof(p)))
+			return -EFAULT;
+
+		return blk_ioctl_zeroout(bdev, p.start, p.length, p.flags);
 	}
 
 	case HDIO_GETGEO: {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0..54d24ea 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -150,6 +150,13 @@ struct inodes_stat_t {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
+struct blkzeroout2 {
+	__u64 start;
+	__u64 length;
+	__u32 flags;
+};
+#define BLKZEROOUT2_DISCARD_OK	1
+#define BLKZEROOUT2 _IOR(0x12, 127, struct blkzeroout2)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH] block: Add discard flag to blkdev_issue_zeroout() function
  2015-01-21  1:06 [PATCH] block: Add discard flag to blkdev_issue_zeroout() function Martin K. Petersen
  2015-01-21 10:10 ` [PATCH] block: create ioctl to discard-or-zeroout a range of blocks Darrick J. Wong
@ 2015-01-21 17:42 ` Jens Axboe
  1 sibling, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2015-01-21 17:42 UTC (permalink / raw)
  To: Martin K. Petersen; +Cc: linux-scsi, linux-fsdevel

On 01/20/2015 06:06 PM, Martin K. Petersen wrote:
> blkdev_issue_discard() will zero a given block range. This is done by
> way of explicit writing, thus provisioning or allocating the blocks on
> disk.
>
> There are use cases where the desired behavior is to zero the blocks but
> unprovision them if possible. The blocks must deterministically contain
> zeroes when they are subsequently read back.
>
> This patch adds a flag to blkdev_issue_zeroout() that provides this
> variant. If the discard flag is set and a block device guarantees
> discard_zeroes_data we will use REQ_DISCARD to clear the block range. If
> the device does not support discard_zeroes_data or if the discard
> request fails we will fall back to first REQ_WRITE_SAME and then a
> regular REQ_WRITE.
>
> Also update the callers of blkdev_issue_zero() to reflect the new flag
> and make sb_issue_zeroout() prefer the discard approach.

Applied for 3.20, thanks.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-11-13 15:47       ` Jens Axboe
  (?)
@ 2015-11-13 21:36       ` Darrick J. Wong
  -1 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2015-11-13 21:36 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Seymour, Shane M, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields, martin.petersen

On Fri, Nov 13, 2015 at 08:47:20AM -0700, Jens Axboe wrote:
> On 11/10/2015 11:14 PM, Darrick J. Wong wrote:
> >On Wed, Nov 11, 2015 at 05:30:07AM +0000, Seymour, Shane M wrote:
> >>A quick question about this part of the patch:
> >>
> >>>+	uint64_t end = start + len - 1;
> >>
> >>>+	if (end >= i_size_read(bdev->bd_inode))
> >>  		return -EINVAL;
> >>
> >>>+	/* Invalidate the page cache, including dirty pages */
> >>>+	mapping = bdev->bd_inode->i_mapping;
> >>>+	truncate_inode_pages_range(mapping, start, end);
> >>
> >>blk_ioctl_zeroout accepts unsigned values for start and end (uint64_t) but
> >>loff_t types are turned from i_size_read() and passed as the 2nd and 3rd
> >>values to truncate_inode_pages_range() and loff_t is a signed value. It
> >>should be possible to pass in some values would overflow the calculation of
> >>end causing the test on the value of end and the result of i_size_read to
> >>pass but then end up passing a large unsigned value for in start that would
> >>be implicitly converted to signed in truncate_inode_pages_range. I was
> >>wondering if you'd tested passing in data that would cause sign conversion
> >>issues when passed into truncate_inode_pages_range (does it handle it
> >>gracefully?) or should this code:
> >>
> >>	if (start & 511)
> >>  		return -EINVAL;
> >>  	if (len & 511)
> >>  		return -EINVAL;
> >>
> >>be something more like this (for better sanity checking of your arguments)
> >>which will ensure that you don't have implicit conversion issues from
> >>unsigned to signed and ensure that the result of adding them together won't
> >>either:
> >>
> >>	if ((start & 511) || (start > (uint64_t)LLONG_MAX))
> >>  		return -EINVAL;
> >>  	if ((len & 511) ) || (len > (uint64_t)LLONG_MAX))
> >>  		return -EINVAL;
> >>	if (end > (uint64_t)LLONG_MAX)
> >>		return -EINVAL;
> >>
> >>My apologies in advance if I've made a mistake when looking at this and my
> >>concerns about unsigned values being implicitly converted to signed are
> >>unfounded (I would have hoped for compiler warnings about any implicit
> >>conversions though).
> >
> >I don't have a device large enough to test for signedness errors, since passing
> >huge values for start and len never make it past the i_size_read check.
> >However, I do see that I forgot to check the padding values, so I'll update
> >that.
> 
> modprobe null_blk nr_devices=1 gb=512000

I tried doing that for some huge device sizes and this is what I saw:

# modprobe null_blk nr_devices=1 gb=17179869184
modprobe: ERROR: could not insert 'null_blk': Numerical result out of range
# modprobe null_blk nr_devices=1 gb=8589934591
modprobe: ERROR: could not insert 'null_blk': Numerical result out of range
# modprobe null_blk nr_devices=1 gb=8589934592
modprobe: ERROR: could not insert 'null_blk': Numerical result out of range
# modprobe null_blk nr_devices=1 gb=4294967295
modprobe: ERROR: could not insert 'null_blk': Numerical result out of range
# modprobe null_blk nr_devices=1 gb=4294967294
modprobe: ERROR: could not insert 'null_blk': Numerical result out of range
# modprobe null_blk nr_devices=1 gb=2147483647
# lsblk /dev/nullb0
NAME   MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
nullb0 249:0    0   2E  0 disk 

Looks like the biggest nullblk device I can create is 2E?

(Same result with "modprobe scsi-debug virtual_gb=...")

> will get you a /dev/nullb0 that is 500TB. Adjust 'gb' at will. Or
> use loop with a big ass sparse file.

I tried that, too.  XFS only wanted to let me create an 8E file.  So
did btrfs.  Eventually, I figured out the following:

# echo '0 18446744073709551616 zero' | dmsetup create MOO-backing
# echo '0 18446744073709551616 snapshot /dev/mapper/MOO-backing /dev/sda N 512' | dmsetup create MOO
# lsblk /dev/mapper/MOO
NAME       MAJ:MIN RM SIZE RO TYPE MOUNTPOINT
MOO (dm-1) 251:1    0  16E  0 dm   

Well, now we're getting somewhere.  It's a little funny that we asked
for 2^64 sectors, the dm table says 2^64, but the device claims a size
of 2^55... but it's 16E which exhausts our loff_t.  Let's try wrapping
around the end:

# /djwong/t/blkzero/test /dev/mapper/MOO 18446744073709543424 4096
OFFSET=-8192 BUFSZ=4096 USER_COHERENCE=0 DIRECTIO=0 EXCLUSIVE=0

# /djwong/t/blkzero/test /dev/mapper/MOO 18446744073709543424 8192
OFFSET=-8192 BUFSZ=8192 USER_COHERENCE=0 DIRECTIO=0 EXCLUSIVE=0
/dev/mapper/MOO: Invalid argument

# /djwong/t/blkzero/test /dev/mapper/MOO 18446744073709543424 12288
OFFSET=-8192 BUFSZ=12288 USER_COHERENCE=0 DIRECTIO=0 EXCLUSIVE=0

#

Hmm.  The third case should fail, since that clearly goes off the end
of the device.  However, it's not correct to compare start or end
against LLONG_MAX because the block layer clearly supports devices
that are larger than LLONG_MAX bytes.  However, the case where the
"end" calculation overflows should be easy to spot with a quick
"if (end < start) return -EINVAL;"

Curiously, if I create the DM device with 2^55 sectors, the dm
snapshot errors out:

# echo '0 36028797018963967 zero' | dmsetup create MOO-backing
# echo '0 36028797018963967 snapshot /dev/mapper/MOO-backing /dev/sda N 512' | dmsetup create MOO
# /djwong/t/blkzero/test /dev/mapper/MOO 18446744073709543424 4096
OFFSET=-8192 BUFSZ=4096 USER_COHERENCE=0 DIRECTIO=0 EXCLUSIVE=0
/dev/mapper/MOO: Input/output error

and dmesg spits out:

"device-mapper: snapshots: Invalidating snapshot: Error reading/writing."

FWIW, truncate_inode_pages_range takes the loff_t and converts that
into a pgoff_t, which is unsigned long.  If the start pgoff_t > the
end pgoff_t, the function does nothing.  On the off chance the
blkdev_issue_zeroout call doesn't fail when it's given weird
arguments, invalidate_inode_pages2_range seems to do roughly the same
thing with pgoff_t.  Will add the one check and resend.

--D

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
@ 2015-11-13 15:47       ` Jens Axboe
  0 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2015-11-13 15:47 UTC (permalink / raw)
  To: Darrick J. Wong, Seymour, Shane M
  Cc: Christoph Hellwig, linux-kernel, linux-fsdevel, linux-api,
	Jeff Layton, J. Bruce Fields, martin.petersen

On 11/10/2015 11:14 PM, Darrick J. Wong wrote:
> On Wed, Nov 11, 2015 at 05:30:07AM +0000, Seymour, Shane M wrote:
>> A quick question about this part of the patch:
>>
>>> +	uint64_t end = start + len - 1;
>>
>>> +	if (end >= i_size_read(bdev->bd_inode))
>>   		return -EINVAL;
>>
>>> +	/* Invalidate the page cache, including dirty pages */
>>> +	mapping = bdev->bd_inode->i_mapping;
>>> +	truncate_inode_pages_range(mapping, start, end);
>>
>> blk_ioctl_zeroout accepts unsigned values for start and end (uint64_t) but
>> loff_t types are turned from i_size_read() and passed as the 2nd and 3rd
>> values to truncate_inode_pages_range() and loff_t is a signed value. It
>> should be possible to pass in some values would overflow the calculation of
>> end causing the test on the value of end and the result of i_size_read to
>> pass but then end up passing a large unsigned value for in start that would
>> be implicitly converted to signed in truncate_inode_pages_range. I was
>> wondering if you'd tested passing in data that would cause sign conversion
>> issues when passed into truncate_inode_pages_range (does it handle it
>> gracefully?) or should this code:
>>
>> 	if (start & 511)
>>   		return -EINVAL;
>>   	if (len & 511)
>>   		return -EINVAL;
>>
>> be something more like this (for better sanity checking of your arguments)
>> which will ensure that you don't have implicit conversion issues from
>> unsigned to signed and ensure that the result of adding them together won't
>> either:
>>
>> 	if ((start & 511) || (start > (uint64_t)LLONG_MAX))
>>   		return -EINVAL;
>>   	if ((len & 511) ) || (len > (uint64_t)LLONG_MAX))
>>   		return -EINVAL;
>> 	if (end > (uint64_t)LLONG_MAX)
>> 		return -EINVAL;
>>
>> My apologies in advance if I've made a mistake when looking at this and my
>> concerns about unsigned values being implicitly converted to signed are
>> unfounded (I would have hoped for compiler warnings about any implicit
>> conversions though).
>
> I don't have a device large enough to test for signedness errors, since passing
> huge values for start and len never make it past the i_size_read check.
> However, I do see that I forgot to check the padding values, so I'll update
> that.

modprobe null_blk nr_devices=1 gb=512000

will get you a /dev/nullb0 that is 500TB. Adjust 'gb' at will. Or use 
loop with a big ass sparse file.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
@ 2015-11-13 15:47       ` Jens Axboe
  0 siblings, 0 replies; 15+ messages in thread
From: Jens Axboe @ 2015-11-13 15:47 UTC (permalink / raw)
  To: Darrick J. Wong, Seymour, Shane M
  Cc: Christoph Hellwig, linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Jeff Layton, J. Bruce Fields,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA

On 11/10/2015 11:14 PM, Darrick J. Wong wrote:
> On Wed, Nov 11, 2015 at 05:30:07AM +0000, Seymour, Shane M wrote:
>> A quick question about this part of the patch:
>>
>>> +	uint64_t end = start + len - 1;
>>
>>> +	if (end >= i_size_read(bdev->bd_inode))
>>   		return -EINVAL;
>>
>>> +	/* Invalidate the page cache, including dirty pages */
>>> +	mapping = bdev->bd_inode->i_mapping;
>>> +	truncate_inode_pages_range(mapping, start, end);
>>
>> blk_ioctl_zeroout accepts unsigned values for start and end (uint64_t) but
>> loff_t types are turned from i_size_read() and passed as the 2nd and 3rd
>> values to truncate_inode_pages_range() and loff_t is a signed value. It
>> should be possible to pass in some values would overflow the calculation of
>> end causing the test on the value of end and the result of i_size_read to
>> pass but then end up passing a large unsigned value for in start that would
>> be implicitly converted to signed in truncate_inode_pages_range. I was
>> wondering if you'd tested passing in data that would cause sign conversion
>> issues when passed into truncate_inode_pages_range (does it handle it
>> gracefully?) or should this code:
>>
>> 	if (start & 511)
>>   		return -EINVAL;
>>   	if (len & 511)
>>   		return -EINVAL;
>>
>> be something more like this (for better sanity checking of your arguments)
>> which will ensure that you don't have implicit conversion issues from
>> unsigned to signed and ensure that the result of adding them together won't
>> either:
>>
>> 	if ((start & 511) || (start > (uint64_t)LLONG_MAX))
>>   		return -EINVAL;
>>   	if ((len & 511) ) || (len > (uint64_t)LLONG_MAX))
>>   		return -EINVAL;
>> 	if (end > (uint64_t)LLONG_MAX)
>> 		return -EINVAL;
>>
>> My apologies in advance if I've made a mistake when looking at this and my
>> concerns about unsigned values being implicitly converted to signed are
>> unfounded (I would have hoped for compiler warnings about any implicit
>> conversions though).
>
> I don't have a device large enough to test for signedness errors, since passing
> huge values for start and len never make it past the i_size_read check.
> However, I do see that I forgot to check the padding values, so I'll update
> that.

modprobe null_blk nr_devices=1 gb=512000

will get you a /dev/nullb0 that is 500TB. Adjust 'gb' at will. Or use 
loop with a big ass sparse file.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-11-11  6:14     ` Darrick J. Wong
  (?)
  (?)
@ 2015-11-13  3:36     ` Seymour, Shane M
  -1 siblings, 0 replies; 15+ messages in thread
From: Seymour, Shane M @ 2015-11-13  3:36 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields, martin.petersen

> I don't have a device large enough to test for signedness errors, since passing 
> huge values for start and len never make it past the i_size_read check.

If you have someone trying to bypass your sanity checks then if start=18446744073709551104 and len=1024 the result of adding them together will be 512 (subtracting an extra 1 in the patched code to get 511 for end). That will pass the i_size_read check won't it? If so that would cause lstart in truncate_inode_pages_range() to be -512. I don't know what truncate_inode_pages_range() will do with a negative lstart value like that but it seems like an unusual value for your code to be willing to pass into truncate_inode_pages_range().

Shane

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-11-11 23:30     ` Seymour, Shane M
@ 2015-11-12  0:56       ` Seymour, Shane M
  0 siblings, 0 replies; 15+ messages in thread
From: Seymour, Shane M @ 2015-11-12  0:56 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields, martin.petersen

> which would make the other checks I suggested to ensure that neither start
> or end were more than (uint64_t)LLONG_MAX unnecessary.

My apologies I was wrong about what I said above - after thinking about it for longer you still need to make sure that at least len is not greater than (uint64_t)LLONG_MAX because in the calculation:

	if (start > (uint64_t)LLONG_MAX - len)
		return -EINVAL;

if len was more than (uint64_t)LLONG_MAX it would underflow and become a very large positive number and start would never be greater than that (and I said end when I should have said len).

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-11-11  6:14     ` Darrick J. Wong
  (?)
@ 2015-11-11 23:30     ` Seymour, Shane M
  2015-11-12  0:56       ` Seymour, Shane M
  -1 siblings, 1 reply; 15+ messages in thread
From: Seymour, Shane M @ 2015-11-11 23:30 UTC (permalink / raw)
  To: Darrick J. Wong
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields, martin.petersen

> I don't have a device large enough to test for signedness errors, since passing
> huge values for start and len never make it past the i_size_read check.
> However, I do see that I forgot to check the padding values, so I'll update that.

Then do you want to at least consider converting end to be of type loff_t to
match the type of value returned by i_get_size() and the type of the value
passed to truncate_inode_pages_range()? Then you only need one additional
check, something like:

/* Check for an overflow when adding start+len into end */
	if (start > (uint64_t)LLONG_MAX - len)
		return -EINVAL;

Looking at the maximum values if we have start=0 and
len=(uint64_t)LLONG_MAX we would still continue but
len=(uint64_t)LLONG_MAX+1 would return -EINVAL. Similarly for
start=(uint64_t)LLONG_MAX and len=0 it would continue
but start=(uint64_t)LLONG_MAX+1 would return -EINVAL and that 
should allow start + len to be safely added and stored into an loff_t without
overflow. With this check start+len can never be more than (uint64_t)LLONG_MAX
which would make the other checks I suggested to ensure that neither start or end
were more than (uint64_t)LLONG_MAX unnecessary.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
@ 2015-11-11  6:14     ` Darrick J. Wong
  0 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2015-11-11  6:14 UTC (permalink / raw)
  To: Seymour, Shane M
  Cc: Jens Axboe, Christoph Hellwig, linux-kernel, linux-fsdevel,
	linux-api, Jeff Layton, J. Bruce Fields, martin.petersen

On Wed, Nov 11, 2015 at 05:30:07AM +0000, Seymour, Shane M wrote:
> A quick question about this part of the patch:
> 
> > +	uint64_t end = start + len - 1;
> 
> > +	if (end >= i_size_read(bdev->bd_inode))
>  		return -EINVAL;
>  
> > +	/* Invalidate the page cache, including dirty pages */
> > +	mapping = bdev->bd_inode->i_mapping;
> > +	truncate_inode_pages_range(mapping, start, end);
> 
> blk_ioctl_zeroout accepts unsigned values for start and end (uint64_t) but
> loff_t types are turned from i_size_read() and passed as the 2nd and 3rd
> values to truncate_inode_pages_range() and loff_t is a signed value. It
> should be possible to pass in some values would overflow the calculation of
> end causing the test on the value of end and the result of i_size_read to
> pass but then end up passing a large unsigned value for in start that would
> be implicitly converted to signed in truncate_inode_pages_range. I was
> wondering if you'd tested passing in data that would cause sign conversion
> issues when passed into truncate_inode_pages_range (does it handle it
> gracefully?) or should this code:
> 
> 	if (start & 511)
>  		return -EINVAL;
>  	if (len & 511)
>  		return -EINVAL;
> 
> be something more like this (for better sanity checking of your arguments)
> which will ensure that you don't have implicit conversion issues from
> unsigned to signed and ensure that the result of adding them together won't
> either:
> 
> 	if ((start & 511) || (start > (uint64_t)LLONG_MAX))
>  		return -EINVAL;
>  	if ((len & 511) ) || (len > (uint64_t)LLONG_MAX))
>  		return -EINVAL;
> 	if (end > (uint64_t)LLONG_MAX)
> 		return -EINVAL;
> 
> My apologies in advance if I've made a mistake when looking at this and my
> concerns about unsigned values being implicitly converted to signed are
> unfounded (I would have hoped for compiler warnings about any implicit
> conversions though).

I don't have a device large enough to test for signedness errors, since passing
huge values for start and len never make it past the i_size_read check.
However, I do see that I forgot to check the padding values, so I'll update
that.

--D
> 
> Thanks
> Shane

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
@ 2015-11-11  6:14     ` Darrick J. Wong
  0 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2015-11-11  6:14 UTC (permalink / raw)
  To: Seymour, Shane M
  Cc: Jens Axboe, Christoph Hellwig,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Jeff Layton, J. Bruce Fields,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA

On Wed, Nov 11, 2015 at 05:30:07AM +0000, Seymour, Shane M wrote:
> A quick question about this part of the patch:
> 
> > +	uint64_t end = start + len - 1;
> 
> > +	if (end >= i_size_read(bdev->bd_inode))
>  		return -EINVAL;
>  
> > +	/* Invalidate the page cache, including dirty pages */
> > +	mapping = bdev->bd_inode->i_mapping;
> > +	truncate_inode_pages_range(mapping, start, end);
> 
> blk_ioctl_zeroout accepts unsigned values for start and end (uint64_t) but
> loff_t types are turned from i_size_read() and passed as the 2nd and 3rd
> values to truncate_inode_pages_range() and loff_t is a signed value. It
> should be possible to pass in some values would overflow the calculation of
> end causing the test on the value of end and the result of i_size_read to
> pass but then end up passing a large unsigned value for in start that would
> be implicitly converted to signed in truncate_inode_pages_range. I was
> wondering if you'd tested passing in data that would cause sign conversion
> issues when passed into truncate_inode_pages_range (does it handle it
> gracefully?) or should this code:
> 
> 	if (start & 511)
>  		return -EINVAL;
>  	if (len & 511)
>  		return -EINVAL;
> 
> be something more like this (for better sanity checking of your arguments)
> which will ensure that you don't have implicit conversion issues from
> unsigned to signed and ensure that the result of adding them together won't
> either:
> 
> 	if ((start & 511) || (start > (uint64_t)LLONG_MAX))
>  		return -EINVAL;
>  	if ((len & 511) ) || (len > (uint64_t)LLONG_MAX))
>  		return -EINVAL;
> 	if (end > (uint64_t)LLONG_MAX)
> 		return -EINVAL;
> 
> My apologies in advance if I've made a mistake when looking at this and my
> concerns about unsigned values being implicitly converted to signed are
> unfounded (I would have hoped for compiler warnings about any implicit
> conversions though).

I don't have a device large enough to test for signedness errors, since passing
huge values for start and len never make it past the i_size_read check.
However, I do see that I forgot to check the padding values, so I'll update
that.

--D
> 
> Thanks
> Shane

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2015-11-10  5:15 ` Darrick J. Wong
  (?)
@ 2015-11-11  5:30 ` Seymour, Shane M
  2015-11-11  6:14     ` Darrick J. Wong
  -1 siblings, 1 reply; 15+ messages in thread
From: Seymour, Shane M @ 2015-11-11  5:30 UTC (permalink / raw)
  To: Darrick J. Wong, Jens Axboe, Christoph Hellwig
  Cc: linux-kernel, linux-fsdevel, linux-api, Jeff Layton,
	J. Bruce Fields, martin.petersen

A quick question about this part of the patch:

> +	uint64_t end = start + len - 1;

> +	if (end >= i_size_read(bdev->bd_inode))
 		return -EINVAL;
 
> +	/* Invalidate the page cache, including dirty pages */
> +	mapping = bdev->bd_inode->i_mapping;
> +	truncate_inode_pages_range(mapping, start, end);

blk_ioctl_zeroout accepts unsigned values for start and end (uint64_t) but loff_t types are turned from i_size_read() and passed as the 2nd and 3rd values to truncate_inode_pages_range() and loff_t is a signed value. It should be possible to pass in some values would overflow the calculation of end causing the test on the value of end and the result of i_size_read to pass but then end up passing a large unsigned value for in start that would be implicitly converted to signed in truncate_inode_pages_range. I was wondering if you'd tested passing in data that would cause sign conversion issues when passed into truncate_inode_pages_range (does it handle it gracefully?) or should this code:

	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
 		return -EINVAL;

be something more like this (for better sanity checking of your arguments) which will ensure that you don't have implicit conversion issues from unsigned to signed and ensure that the result of adding them together won't either:

	if ((start & 511) || (start > (uint64_t)LLONG_MAX))
 		return -EINVAL;
 	if ((len & 511) ) || (len > (uint64_t)LLONG_MAX))
 		return -EINVAL;
	if (end > (uint64_t)LLONG_MAX)
		return -EINVAL;

My apologies in advance if I've made a mistake when looking at this and my concerns about unsigned values being implicitly converted to signed are unfounded (I would have hoped for compiler warnings about any implicit conversions though).

Thanks
Shane

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
@ 2015-11-10  5:15 ` Darrick J. Wong
  0 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2015-11-10  5:15 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig
  Cc: linux-kernel, linux-fsdevel, linux-api, Jeff Layton,
	J. Bruce Fields, martin.petersen

Create a new ioctl to expose the block layer's newfound ability to
issue either a zeroing discard, a WRITE SAME with a zero page, or a
regular write with the zero page.  This BLKZEROOUT2 ioctl takes
{start, length, flags} as parameters.  So far, the only flag available
is to enable the zeroing discard part -- without it, the call invokes
the old BLKZEROOUT behavior.  start and length have the same meaning
as in BLKZEROOUT.

Furthermore, because BLKZEROOUT2 issues commands directly to the
storage device, we must invalidate the page cache (as a regular
O_DIRECT write would do) to avoid returning stale cache contents at a
later time.  BLKZEROOUT never did this, which lead to coherency
problems when e2fsprogs tried to use it.

v2: Padded the structure with extra space for future fun, since the
maintainer never responded...

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 block/ioctl.c           |   45 ++++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/fs.h |    9 +++++++++
 2 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index 8061eba..79a2e1c 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -213,19 +213,39 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 }
 
 static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
-			     uint64_t len)
+			     uint64_t len, uint32_t flags)
 {
+	int ret;
+	struct address_space *mapping;
+	uint64_t end = start + len - 1;
+
+	if (flags & ~BLKZEROOUT2_DISCARD_OK)
+		return -EINVAL;
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
 		return -EINVAL;
-	start >>= 9;
-	len >>= 9;
-
-	if (start + len > (i_size_read(bdev->bd_inode) >> 9))
+	if (end >= i_size_read(bdev->bd_inode))
 		return -EINVAL;
 
-	return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false);
+	/* Invalidate the page cache, including dirty pages */
+	mapping = bdev->bd_inode->i_mapping;
+	truncate_inode_pages_range(mapping, start, end);
+
+	ret = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
+				   flags & BLKZEROOUT2_DISCARD_OK);
+	if (ret)
+		goto out;
+
+	/*
+	 * Invalidate again; if someone wandered in and dirtied a page,
+	 * the caller will be given -EBUSY.
+	 */
+	ret = invalidate_inode_pages2_range(mapping,
+					    start >> PAGE_CACHE_SHIFT,
+					    end >> PAGE_CACHE_SHIFT);
+out:
+	return ret;
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
@@ -353,7 +373,18 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
 			return -EFAULT;
 
-		return blk_ioctl_zeroout(bdev, range[0], range[1]);
+		return blk_ioctl_zeroout(bdev, range[0], range[1], 0);
+	}
+	case BLKZEROOUT2: {
+		struct blkzeroout2 p;
+
+		if (!(mode & FMODE_WRITE))
+			return -EBADF;
+
+		if (copy_from_user(&p, (void __user *)arg, sizeof(p)))
+			return -EFAULT;
+
+		return blk_ioctl_zeroout(bdev, p.start, p.length, p.flags);
 	}
 
 	case HDIO_GETGEO: {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 9b964a5..b811fa4 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -152,6 +152,15 @@ struct inodes_stat_t {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
+struct blkzeroout2 {
+	__u64 start;
+	__u64 length;
+	__u32 flags;
+	__u32 padding;
+	__u64 padding2;
+};
+#define BLKZEROOUT2_DISCARD_OK	1
+#define BLKZEROOUT2 _IOR(0x12, 127, struct blkzeroout2)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
@ 2015-11-10  5:15 ` Darrick J. Wong
  0 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2015-11-10  5:15 UTC (permalink / raw)
  To: Jens Axboe, Christoph Hellwig
  Cc: linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-fsdevel-u79uwXL29TY76Z2rM5mHXA,
	linux-api-u79uwXL29TY76Z2rM5mHXA, Jeff Layton, J. Bruce Fields,
	martin.petersen-QHcLZuEGTsvQT0dZR+AlfA

Create a new ioctl to expose the block layer's newfound ability to
issue either a zeroing discard, a WRITE SAME with a zero page, or a
regular write with the zero page.  This BLKZEROOUT2 ioctl takes
{start, length, flags} as parameters.  So far, the only flag available
is to enable the zeroing discard part -- without it, the call invokes
the old BLKZEROOUT behavior.  start and length have the same meaning
as in BLKZEROOUT.

Furthermore, because BLKZEROOUT2 issues commands directly to the
storage device, we must invalidate the page cache (as a regular
O_DIRECT write would do) to avoid returning stale cache contents at a
later time.  BLKZEROOUT never did this, which lead to coherency
problems when e2fsprogs tried to use it.

v2: Padded the structure with extra space for future fun, since the
maintainer never responded...

Signed-off-by: Darrick J. Wong <darrick.wong-QHcLZuEGTsvQT0dZR+AlfA@public.gmane.org>
---
 block/ioctl.c           |   45 ++++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/fs.h |    9 +++++++++
 2 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index 8061eba..79a2e1c 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -213,19 +213,39 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 }
 
 static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
-			     uint64_t len)
+			     uint64_t len, uint32_t flags)
 {
+	int ret;
+	struct address_space *mapping;
+	uint64_t end = start + len - 1;
+
+	if (flags & ~BLKZEROOUT2_DISCARD_OK)
+		return -EINVAL;
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
 		return -EINVAL;
-	start >>= 9;
-	len >>= 9;
-
-	if (start + len > (i_size_read(bdev->bd_inode) >> 9))
+	if (end >= i_size_read(bdev->bd_inode))
 		return -EINVAL;
 
-	return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false);
+	/* Invalidate the page cache, including dirty pages */
+	mapping = bdev->bd_inode->i_mapping;
+	truncate_inode_pages_range(mapping, start, end);
+
+	ret = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
+				   flags & BLKZEROOUT2_DISCARD_OK);
+	if (ret)
+		goto out;
+
+	/*
+	 * Invalidate again; if someone wandered in and dirtied a page,
+	 * the caller will be given -EBUSY.
+	 */
+	ret = invalidate_inode_pages2_range(mapping,
+					    start >> PAGE_CACHE_SHIFT,
+					    end >> PAGE_CACHE_SHIFT);
+out:
+	return ret;
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
@@ -353,7 +373,18 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
 			return -EFAULT;
 
-		return blk_ioctl_zeroout(bdev, range[0], range[1]);
+		return blk_ioctl_zeroout(bdev, range[0], range[1], 0);
+	}
+	case BLKZEROOUT2: {
+		struct blkzeroout2 p;
+
+		if (!(mode & FMODE_WRITE))
+			return -EBADF;
+
+		if (copy_from_user(&p, (void __user *)arg, sizeof(p)))
+			return -EFAULT;
+
+		return blk_ioctl_zeroout(bdev, p.start, p.length, p.flags);
 	}
 
 	case HDIO_GETGEO: {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 9b964a5..b811fa4 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -152,6 +152,15 @@ struct inodes_stat_t {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
+struct blkzeroout2 {
+	__u64 start;
+	__u64 length;
+	__u32 flags;
+	__u32 padding;
+	__u64 padding2;
+};
+#define BLKZEROOUT2_DISCARD_OK	1
+#define BLKZEROOUT2 _IOR(0x12, 127, struct blkzeroout2)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH] block: create ioctl to discard-or-zeroout a range of blocks
  2014-11-11  0:04   ` Darrick J. Wong
@ 2014-11-17 19:28     ` Darrick J. Wong
  0 siblings, 0 replies; 15+ messages in thread
From: Darrick J. Wong @ 2014-11-17 19:28 UTC (permalink / raw)
  To: Martin K. Petersen; +Cc: linux-scsi, linux-ide, linux-fsdevel, neilb, linux-api

Create a new ioctl to expose the block layer's newfound ability to
issue either a zeroing discard, a WRITE SAME with a zero page, or a
regular write with the zero page.  This BLKZEROOUT2 ioctl takes
{start, length, flags} as parameters.  So far, the only flag available
is to enable the zeroing discard part -- without it, the call invokes
the old BLKZEROOUT behavior.  start and length have the same meaning
as in BLKZEROOUT.

Furthermore, because BLKZEROOUT2 issues commands directly to the
storage device, we must invalidate the page cache (as a regular
O_DIRECT write would do) to avoid returning stale cache contents at a
later time.

This patch depends on mkp's earlier patch "block: Introduce
blkdev_issue_zeroout_discard() function".

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
---
 block/ioctl.c           |   45 ++++++++++++++++++++++++++++++++++++++-------
 include/uapi/linux/fs.h |    7 +++++++
 2 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/block/ioctl.c b/block/ioctl.c
index 7d8befd..ff623d5 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -186,19 +186,39 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 }
 
 static int blk_ioctl_zeroout(struct block_device *bdev, uint64_t start,
-			     uint64_t len)
+			     uint64_t len, uint32_t flags)
 {
+	int ret;
+	struct address_space *mapping;
+	uint64_t end = start + len - 1;
+
+	if (flags & ~BLKZEROOUT2_DISCARD_OK)
+		return -EINVAL;
 	if (start & 511)
 		return -EINVAL;
 	if (len & 511)
 		return -EINVAL;
-	start >>= 9;
-	len >>= 9;
-
-	if (start + len > (i_size_read(bdev->bd_inode) >> 9))
+	if (end >= i_size_read(bdev->bd_inode))
 		return -EINVAL;
 
-	return blkdev_issue_zeroout(bdev, start, len, GFP_KERNEL, false);
+	/* Invalidate the page cache, including dirty pages */
+	mapping = bdev->bd_inode->i_mapping;
+	truncate_inode_pages_range(mapping, start, end);
+
+	ret = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
+				   flags & BLKZEROOUT2_DISCARD_OK);
+	if (ret)
+		goto out;
+
+	/*
+	 * Invalidate again; if someone wandered in and dirtied a page,
+	 * the caller will be given -EBUSY.
+	 */
+	ret = invalidate_inode_pages2_range(mapping,
+					    start >> PAGE_CACHE_SHIFT,
+					    end >> PAGE_CACHE_SHIFT);
+out:
+	return ret;
 }
 
 static int put_ushort(unsigned long arg, unsigned short val)
@@ -326,7 +346,18 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		if (copy_from_user(range, (void __user *)arg, sizeof(range)))
 			return -EFAULT;
 
-		return blk_ioctl_zeroout(bdev, range[0], range[1]);
+		return blk_ioctl_zeroout(bdev, range[0], range[1], 0);
+	}
+	case BLKZEROOUT2: {
+		struct blkzeroout2 p;
+
+		if (!(mode & FMODE_WRITE))
+			return -EBADF;
+
+		if (copy_from_user(&p, (void __user *)arg, sizeof(p)))
+			return -EFAULT;
+
+		return blk_ioctl_zeroout(bdev, p.start, p.length, p.flags);
 	}
 
 	case HDIO_GETGEO: {
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 3735fa0..54d24ea 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -150,6 +150,13 @@ struct inodes_stat_t {
 #define BLKSECDISCARD _IO(0x12,125)
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
+struct blkzeroout2 {
+	__u64 start;
+	__u64 length;
+	__u32 flags;
+};
+#define BLKZEROOUT2_DISCARD_OK	1
+#define BLKZEROOUT2 _IOR(0x12, 127, struct blkzeroout2)
 
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */

^ permalink raw reply related	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2015-11-13 21:36 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-01-21  1:06 [PATCH] block: Add discard flag to blkdev_issue_zeroout() function Martin K. Petersen
2015-01-21 10:10 ` [PATCH] block: create ioctl to discard-or-zeroout a range of blocks Darrick J. Wong
2015-01-21 17:42 ` [PATCH] block: Add discard flag to blkdev_issue_zeroout() function Jens Axboe
  -- strict thread matches above, loose matches on Subject: below --
2015-11-10  5:15 [PATCH] block: create ioctl to discard-or-zeroout a range of blocks Darrick J. Wong
2015-11-10  5:15 ` Darrick J. Wong
2015-11-11  5:30 ` Seymour, Shane M
2015-11-11  6:14   ` Darrick J. Wong
2015-11-11  6:14     ` Darrick J. Wong
2015-11-11 23:30     ` Seymour, Shane M
2015-11-12  0:56       ` Seymour, Shane M
2015-11-13  3:36     ` Seymour, Shane M
2015-11-13 15:47     ` Jens Axboe
2015-11-13 15:47       ` Jens Axboe
2015-11-13 21:36       ` Darrick J. Wong
2014-11-07  5:08 [RFC] Discard update for 3.19 Martin K. Petersen
2014-11-07  5:08 ` [PATCH 3/3] block: Introduce blkdev_issue_zeroout_discard() function Martin K. Petersen
2014-11-11  0:04   ` Darrick J. Wong
2014-11-17 19:28     ` [PATCH] block: create ioctl to discard-or-zeroout a range of blocks Darrick J. Wong

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.