linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Kirill Tkhai <ktkhai@virtuozzo.com>
To: linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-ext4@vger.kernel.org
Cc: axboe@kernel.dk, tytso@mit.edu, adilger.kernel@dilger.ca,
	ming.lei@redhat.com, osandov@fb.com, jthumshirn@suse.de,
	minwoo.im.dev@gmail.com, damien.lemoal@wdc.com,
	ktkhai@virtuozzo.com, andrea.parri@amarulasolutions.com,
	hare@suse.com, tj@kernel.org, ajay.joshi@wdc.com,
	sagi@grimberg.me, dsterba@suse.com, chaitanya.kulkarni@wdc.com,
	bvanassche@acm.org, dhowells@redhat.com, asml.silence@gmail.com
Subject: [PATCH RFC 1/3] block: Add support for REQ_OP_ASSIGN_RANGE operation
Date: Tue, 10 Dec 2019 19:56:08 +0300	[thread overview]
Message-ID: <157599696813.12112.14140818972910110796.stgit@localhost.localdomain> (raw)
In-Reply-To: <157599668662.12112.10184894900037871860.stgit@localhost.localdomain>

This operation allows to notify a device about the fact,
that some sectors range was choosen by a filesystem
as a single extent, and the device should try its best
to reflect that (keep the range as a single hunk in its
internals, or represent the range as minimal set of hunks).
Speaking directly, the operation is for forwarding
fallocate(0) requests into an essence, on which the device
is based.

This may be useful for some distributed network filesystems,
providing block device interface, for optimization of their
blocks placement over the cluster nodes.

Also, block devices mapping a file (like loop) are users
of that, since this allows to allocate more continuous
extents and since this batches blocks allocation requests.
In addition, hypervisors like QEMU may use this for better
blocks placement.

The patch adds a new blkdev_issue_assign_range() primitive,
which is rather similar to existing blkdev_issue_{*} api.
Also, a new queue limit.max_assign_range_sectors is added.

Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
---
 block/blk-core.c          |    4 +++
 block/blk-lib.c           |   70 +++++++++++++++++++++++++++++++++++++++++++++
 block/blk-merge.c         |   21 ++++++++++++++
 block/bounce.c            |    1 +
 include/linux/bio.h       |    3 ++
 include/linux/blk_types.h |    2 +
 include/linux/blkdev.h    |   29 +++++++++++++++++++
 7 files changed, 130 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 8d4db6e74496..060cc0ea1246 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -978,6 +978,10 @@ generic_make_request_checks(struct bio *bio)
 		if (!q->limits.max_write_zeroes_sectors)
 			goto not_supported;
 		break;
+	case REQ_OP_ASSIGN_RANGE:
+		if (!q->limits.max_assign_range_sectors)
+			goto not_supported;
+		break;
 	default:
 		break;
 	}
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 5f2c429d4378..fbf780d3ea32 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -252,6 +252,46 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
 	return 0;
 }
 
+static int __blkdev_issue_assign_range(struct block_device *bdev,
+		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
+		struct bio **biop, unsigned flags)
+{
+	struct bio *bio = *biop;
+	unsigned int max_sectors;
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (!q)
+		return -ENXIO;
+
+	if (bdev_read_only(bdev))
+		return -EPERM;
+
+	max_sectors = bdev_assign_range_sectors(bdev);
+
+	if (max_sectors == 0)
+		return -EOPNOTSUPP;
+
+	while (nr_sects) {
+		bio = blk_next_bio(bio, 0, gfp_mask);
+		bio->bi_iter.bi_sector = sector;
+		bio_set_dev(bio, bdev);
+		bio->bi_opf = REQ_OP_ASSIGN_RANGE;
+
+		if (nr_sects > max_sectors) {
+			bio->bi_iter.bi_size = max_sectors << 9;
+			nr_sects -= max_sectors;
+			sector += max_sectors;
+		} else {
+			bio->bi_iter.bi_size = nr_sects << 9;
+			nr_sects = 0;
+		}
+		cond_resched();
+	}
+
+	*biop = bio;
+	return 0;
+}
+
 /*
  * Convert a number of 512B sectors to a number of pages.
  * The result is limited to a number of pages that can fit into a BIO.
@@ -405,3 +445,33 @@ int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
+
+int blkdev_issue_assign_range(struct block_device *bdev, sector_t sector,
+			sector_t nr_sects, gfp_t gfp_mask, unsigned flags)
+{
+	int ret = 0;
+	sector_t bs_mask;
+	struct bio *bio;
+	struct blk_plug plug;
+
+	if (bdev_assign_range_sectors(bdev) == 0)
+		return 0;
+
+	bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
+	if ((sector | nr_sects) & bs_mask)
+		return -EINVAL;
+
+	bio = NULL;
+	blk_start_plug(&plug);
+
+	ret = __blkdev_issue_assign_range(bdev, sector, nr_sects,
+					  gfp_mask, &bio, flags);
+	if (ret == 0 && bio) {
+		ret = submit_bio_wait(bio);
+		bio_put(bio);
+	}
+	blk_finish_plug(&plug);
+
+	return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_assign_range);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index d783bdc4559b..b2ae8b5acd72 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -102,6 +102,22 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
 	return bio_split(bio, split_sectors, GFP_NOIO, bs);
 }
 
+static struct bio *blk_bio_assign_range_split(struct request_queue *q,
+					      struct bio *bio,
+					      struct bio_set *bs,
+					      unsigned *nsegs)
+{
+	*nsegs = 1;
+
+	if (!q->limits.max_assign_range_sectors)
+		return NULL;
+
+	if (bio_sectors(bio) <= q->limits.max_assign_range_sectors)
+		return NULL;
+
+	return bio_split(bio, q->limits.max_assign_range_sectors, GFP_NOIO, bs);
+}
+
 static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
 		struct bio *bio, struct bio_set *bs, unsigned *nsegs)
 {
@@ -300,6 +316,10 @@ void __blk_queue_split(struct request_queue *q, struct bio **bio,
 	case REQ_OP_SECURE_ERASE:
 		split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
 		break;
+	case REQ_OP_ASSIGN_RANGE:
+		split = blk_bio_assign_range_split(q, *bio, &q->bio_split,
+				nr_segs);
+		break;
 	case REQ_OP_WRITE_ZEROES:
 		split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
 				nr_segs);
@@ -382,6 +402,7 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
 	case REQ_OP_DISCARD:
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
+	case REQ_OP_ASSIGN_RANGE:
 		return 0;
 	case REQ_OP_WRITE_SAME:
 		return 1;
diff --git a/block/bounce.c b/block/bounce.c
index f8ed677a1bf7..017bedba7b23 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -255,6 +255,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
 
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
+	case REQ_OP_ASSIGN_RANGE:
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
 		break;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 3cdb84cdc488..cf235c997e45 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -63,6 +63,7 @@ static inline bool bio_has_data(struct bio *bio)
 	if (bio &&
 	    bio->bi_iter.bi_size &&
 	    bio_op(bio) != REQ_OP_DISCARD &&
+	    bio_op(bio) != REQ_OP_ASSIGN_RANGE &&
 	    bio_op(bio) != REQ_OP_SECURE_ERASE &&
 	    bio_op(bio) != REQ_OP_WRITE_ZEROES)
 		return true;
@@ -73,6 +74,7 @@ static inline bool bio_has_data(struct bio *bio)
 static inline bool bio_no_advance_iter(struct bio *bio)
 {
 	return bio_op(bio) == REQ_OP_DISCARD ||
+	       bio_op(bio) == REQ_OP_ASSIGN_RANGE ||
 	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
 	       bio_op(bio) == REQ_OP_WRITE_SAME ||
 	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
@@ -184,6 +186,7 @@ static inline unsigned bio_segments(struct bio *bio)
 
 	switch (bio_op(bio)) {
 	case REQ_OP_DISCARD:
+	case REQ_OP_ASSIGN_RANGE:
 	case REQ_OP_SECURE_ERASE:
 	case REQ_OP_WRITE_ZEROES:
 		return 0;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 70254ae11769..f03dcf25c831 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -296,6 +296,8 @@ enum req_opf {
 	REQ_OP_ZONE_CLOSE	= 11,
 	/* Transition a zone to full */
 	REQ_OP_ZONE_FINISH	= 12,
+	/* assign sector range */
+	REQ_OP_ASSIGN_RANGE	= 15,
 
 	/* SCSI passthrough using struct scsi_request */
 	REQ_OP_SCSI_IN		= 32,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3cd1853dbdac..9af70120fe57 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -336,6 +336,7 @@ struct queue_limits {
 	unsigned int		max_hw_discard_sectors;
 	unsigned int		max_write_same_sectors;
 	unsigned int		max_write_zeroes_sectors;
+	unsigned int		max_assign_range_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
 
@@ -995,6 +996,10 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
 		return min(q->limits.max_discard_sectors,
 			   UINT_MAX >> SECTOR_SHIFT);
 
+	if (unlikely(op == REQ_OP_ASSIGN_RANGE))
+		return min(q->limits.max_assign_range_sectors,
+			   UINT_MAX >> SECTOR_SHIFT);
+
 	if (unlikely(op == REQ_OP_WRITE_SAME))
 		return q->limits.max_write_same_sectors;
 
@@ -1028,6 +1033,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
 
 	if (!q->limits.chunk_sectors ||
 	    req_op(rq) == REQ_OP_DISCARD ||
+	    req_op(rq) == REQ_OP_ASSIGN_RANGE ||
 	    req_op(rq) == REQ_OP_SECURE_ERASE)
 		return blk_queue_get_max_sectors(q, req_op(rq));
 
@@ -1225,6 +1231,8 @@ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		unsigned flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
+extern int blkdev_issue_assign_range(struct block_device *bdev, sector_t sector,
+		sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
 
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
@@ -1247,6 +1255,17 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
 				    gfp_mask, 0);
 }
 
+static inline int sb_issue_assign_range(struct super_block *sb, sector_t block,
+		sector_t nr_blocks, gfp_t gfp_mask)
+{
+	return blkdev_issue_assign_range(sb->s_bdev,
+					 block << (sb->s_blocksize_bits -
+						   SECTOR_SHIFT),
+					 nr_blocks << (sb->s_blocksize_bits -
+						       SECTOR_SHIFT),
+					 gfp_mask, 0);
+}
+
 extern int blk_verify_command(unsigned char *cmd, fmode_t mode);
 
 enum blk_default_limits {
@@ -1428,6 +1447,16 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
 	return 0;
 }
 
+static inline unsigned int bdev_assign_range_sectors(struct block_device *bdev)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+
+	if (q)
+		return q->limits.max_assign_range_sectors;
+
+	return 0;
+}
+
 static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);



  reply	other threads:[~2019-12-10 16:57 UTC|newest]

Thread overview: 21+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2019-12-10 16:56 [PATCH RFC 0/3] block,ext4: Introduce REQ_OP_ASSIGN_RANGE to reflect extents allocation in block device internals Kirill Tkhai
2019-12-10 16:56 ` Kirill Tkhai [this message]
2019-12-19  3:03   ` [PATCH RFC 1/3] block: Add support for REQ_OP_ASSIGN_RANGE operation Martin K. Petersen
2019-12-19 11:07     ` Kirill Tkhai
2019-12-19 22:03       ` Chaitanya Kulkarni
2019-12-19 22:37       ` Martin K. Petersen
2019-12-20  1:53         ` Darrick J. Wong
2019-12-20  2:22           ` Martin K. Petersen
2019-12-20 11:55         ` Kirill Tkhai
2019-12-21 18:54           ` Martin K. Petersen
2019-12-23  8:51             ` Kirill Tkhai
2020-01-07  3:24               ` Martin K. Petersen
2020-01-07 13:59                 ` Kirill Tkhai
2020-01-08  2:49                   ` Martin K. Petersen
2020-01-09  9:43                     ` Kirill Tkhai
2019-12-10 16:56 ` [PATCH RFC 2/3] loop: Forward REQ_OP_ASSIGN_RANGE into fallocate(0) Kirill Tkhai
2019-12-10 16:56 ` [PATCH RFC 3/3] ext4: Notify block device about fallocate(0)-assigned blocks Kirill Tkhai
2019-12-11 12:55   ` [PATCH RFC v2 " Kirill Tkhai
2019-12-11  7:42 ` [PATCH RFC 0/3] block,ext4: Introduce REQ_OP_ASSIGN_RANGE to reflect extents allocation in block device internals Chaitanya Kulkarni
2019-12-11  8:50   ` Kirill Tkhai
2019-12-17 14:16 ` Kirill Tkhai

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=157599696813.12112.14140818972910110796.stgit@localhost.localdomain \
    --to=ktkhai@virtuozzo.com \
    --cc=adilger.kernel@dilger.ca \
    --cc=ajay.joshi@wdc.com \
    --cc=andrea.parri@amarulasolutions.com \
    --cc=asml.silence@gmail.com \
    --cc=axboe@kernel.dk \
    --cc=bvanassche@acm.org \
    --cc=chaitanya.kulkarni@wdc.com \
    --cc=damien.lemoal@wdc.com \
    --cc=dhowells@redhat.com \
    --cc=dsterba@suse.com \
    --cc=hare@suse.com \
    --cc=jthumshirn@suse.de \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-ext4@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=ming.lei@redhat.com \
    --cc=minwoo.im.dev@gmail.com \
    --cc=osandov@fb.com \
    --cc=sagi@grimberg.me \
    --cc=tj@kernel.org \
    --cc=tytso@mit.edu \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).