Linux-Block Archive on lore.kernel.org
 help / color / Atom feed
* [PATCH] block: support arbitrary zone size
@ 2020-02-10 22:08 Alexey Dobriyan
  2020-02-10 22:20 ` [PATCH v2] " Alexey Dobriyan
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Alexey Dobriyan @ 2020-02-10 22:08 UTC (permalink / raw)
  To: axboe; +Cc: linux-block, damien.lemoal

SK hynix is going to ship ZNS device with zone size not being power of 2.

Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
---

 block/blk-settings.c           |    4 +---
 block/blk-zoned.c              |   10 +++++-----
 drivers/block/null_blk_main.c  |   11 ++++++-----
 drivers/block/null_blk_zoned.c |   10 ++--------
 4 files changed, 14 insertions(+), 21 deletions(-)

--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -206,15 +206,13 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors);
  *
  * Description:
  *    If a driver doesn't want IOs to cross a given chunk size, it can set
- *    this limit and prevent merging across chunks. Note that the chunk size
- *    must currently be a power-of-2 in sectors. Also note that the block
+ *    this limit and prevent merging across chunks. Note that the block
  *    layer must accept a page worth of data at any offset. So if the
  *    crossing of chunks is a hard limitation in the driver, it must still be
  *    prepared to split single page bios.
  **/
 void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
 {
-	BUG_ON(!is_power_of_2(chunk_sectors));
 	q->limits.chunk_sectors = chunk_sectors;
 }
 EXPORT_SYMBOL(blk_queue_chunk_sectors);
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -83,7 +83,7 @@ unsigned int blkdev_nr_zones(struct gendisk *disk)
 
 	if (!blk_queue_is_zoned(disk->queue))
 		return 0;
-	return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
+	return div64_u64(get_capacity(disk) + zone_sectors - 1, zone_sectors);
 }
 EXPORT_SYMBOL_GPL(blkdev_nr_zones);
 
@@ -363,14 +363,14 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 	 * smaller last zone.
 	 */
 	if (zone->start == 0) {
-		if (zone->len == 0 || !is_power_of_2(zone->len)) {
-			pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
-				disk->disk_name, zone->len);
+		if (zone->len == 0) {
+			pr_warn("%s: Invalid zoned device with length 0\n",
+				disk->disk_name);
 			return -ENODEV;
 		}
 
 		args->zone_sectors = zone->len;
-		args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
+		args->nr_zones = div64_u64(capacity + zone->len - 1, zone->len);
 	} else if (zone->start + args->zone_sectors < capacity) {
 		if (zone->len != args->zone_sectors) {
 			pr_warn("%s: Invalid zoned device with non constant zone size\n",
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -187,7 +187,7 @@ MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Defau
 
 static unsigned long g_zone_size = 256;
 module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
-MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
+MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Default: 256");
 
 static unsigned int g_zone_nr_conv;
 module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
@@ -1641,10 +1641,11 @@ static int null_validate_conf(struct nullb_device *dev)
 	if (dev->queue_mode == NULL_Q_BIO)
 		dev->mbps = 0;
 
-	if (dev->zoned &&
-	    (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
-		pr_err("zone_size must be power-of-two\n");
-		return -EINVAL;
+	if (dev->zoned) {
+		if (dev->zone_size == 0) {
+			pr_err("zone_size must be positive\n");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -7,7 +7,7 @@
 
 static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
 {
-	return sect >> ilog2(dev->zone_size_sects);
+	return div64_u64(sect, dev->zone_size_sects * SECTOR_SIZE);
 }
 
 int null_zone_init(struct nullb_device *dev)
@@ -16,14 +16,8 @@ int null_zone_init(struct nullb_device *dev)
 	sector_t sector = 0;
 	unsigned int i;
 
-	if (!is_power_of_2(dev->zone_size)) {
-		pr_err("zone_size must be power-of-two\n");
-		return -EINVAL;
-	}
-
 	dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
-	dev->nr_zones = dev_size >>
-				(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
+	dev->nr_zones = div64_u64(dev_size, dev->zone_size_sects * SECTOR_SIZE);
 	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
 			GFP_KERNEL | __GFP_ZERO);
 	if (!dev->zones)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2] block: support arbitrary zone size
  2020-02-10 22:08 [PATCH] block: support arbitrary zone size Alexey Dobriyan
@ 2020-02-10 22:20 ` " Alexey Dobriyan
  2020-02-10 22:57   ` Damien Le Moal
                     ` (4 more replies)
  2020-02-10 22:23 ` [PATCH] " Bart Van Assche
  2020-02-10 22:33 ` Keith Busch
  2 siblings, 5 replies; 10+ messages in thread
From: Alexey Dobriyan @ 2020-02-10 22:20 UTC (permalink / raw)
  To: axboe; +Cc: linux-block, damien.lemoal

SK hynix is going to ship ZNS device with zone size not being power of 2.

Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
---

	v2: fixup one ">>ilog2"/div conversion :-/

 block/blk-settings.c           |    4 +---
 block/blk-zoned.c              |   10 +++++-----
 drivers/block/null_blk_main.c  |   11 ++++++-----
 drivers/block/null_blk_zoned.c |   10 ++--------
 4 files changed, 14 insertions(+), 21 deletions(-)

--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -206,15 +206,13 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors);
  *
  * Description:
  *    If a driver doesn't want IOs to cross a given chunk size, it can set
- *    this limit and prevent merging across chunks. Note that the chunk size
- *    must currently be a power-of-2 in sectors. Also note that the block
+ *    this limit and prevent merging across chunks. Note that the block
  *    layer must accept a page worth of data at any offset. So if the
  *    crossing of chunks is a hard limitation in the driver, it must still be
  *    prepared to split single page bios.
  **/
 void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
 {
-	BUG_ON(!is_power_of_2(chunk_sectors));
 	q->limits.chunk_sectors = chunk_sectors;
 }
 EXPORT_SYMBOL(blk_queue_chunk_sectors);
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -83,7 +83,7 @@ unsigned int blkdev_nr_zones(struct gendisk *disk)
 
 	if (!blk_queue_is_zoned(disk->queue))
 		return 0;
-	return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
+	return div64_u64(get_capacity(disk) + zone_sectors - 1, zone_sectors);
 }
 EXPORT_SYMBOL_GPL(blkdev_nr_zones);
 
@@ -363,14 +363,14 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 	 * smaller last zone.
 	 */
 	if (zone->start == 0) {
-		if (zone->len == 0 || !is_power_of_2(zone->len)) {
-			pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
-				disk->disk_name, zone->len);
+		if (zone->len == 0) {
+			pr_warn("%s: Invalid zoned device with length 0\n",
+				disk->disk_name);
 			return -ENODEV;
 		}
 
 		args->zone_sectors = zone->len;
-		args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
+		args->nr_zones = div64_u64(capacity + zone->len - 1, zone->len);
 	} else if (zone->start + args->zone_sectors < capacity) {
 		if (zone->len != args->zone_sectors) {
 			pr_warn("%s: Invalid zoned device with non constant zone size\n",
--- a/drivers/block/null_blk_main.c
+++ b/drivers/block/null_blk_main.c
@@ -187,7 +187,7 @@ MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Defau
 
 static unsigned long g_zone_size = 256;
 module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
-MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
+MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Default: 256");
 
 static unsigned int g_zone_nr_conv;
 module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
@@ -1641,10 +1641,11 @@ static int null_validate_conf(struct nullb_device *dev)
 	if (dev->queue_mode == NULL_Q_BIO)
 		dev->mbps = 0;
 
-	if (dev->zoned &&
-	    (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
-		pr_err("zone_size must be power-of-two\n");
-		return -EINVAL;
+	if (dev->zoned) {
+		if (dev->zone_size == 0) {
+			pr_err("zone_size must be positive\n");
+			return -EINVAL;
+		}
 	}
 
 	return 0;
--- a/drivers/block/null_blk_zoned.c
+++ b/drivers/block/null_blk_zoned.c
@@ -7,7 +7,7 @@
 
 static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
 {
-	return sect >> ilog2(dev->zone_size_sects);
+	return div64_u64(sect, dev->zone_size_sects);
 }
 
 int null_zone_init(struct nullb_device *dev)
@@ -16,14 +16,8 @@ int null_zone_init(struct nullb_device *dev)
 	sector_t sector = 0;
 	unsigned int i;
 
-	if (!is_power_of_2(dev->zone_size)) {
-		pr_err("zone_size must be power-of-two\n");
-		return -EINVAL;
-	}
-
 	dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
-	dev->nr_zones = dev_size >>
-				(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
+	dev->nr_zones = div64_u64(dev_size, dev->zone_size_sects * SECTOR_SIZE);
 	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
 			GFP_KERNEL | __GFP_ZERO);
 	if (!dev->zones)

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] block: support arbitrary zone size
  2020-02-10 22:08 [PATCH] block: support arbitrary zone size Alexey Dobriyan
  2020-02-10 22:20 ` [PATCH v2] " Alexey Dobriyan
@ 2020-02-10 22:23 ` " Bart Van Assche
  2020-02-10 22:33 ` Keith Busch
  2 siblings, 0 replies; 10+ messages in thread
From: Bart Van Assche @ 2020-02-10 22:23 UTC (permalink / raw)
  To: Alexey Dobriyan, axboe; +Cc: linux-block, damien.lemoal

On 2/10/20 2:08 PM, Alexey Dobriyan wrote:
> -	return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
> +	return div64_u64(get_capacity(disk) + zone_sectors - 1, zone_sectors);

This kind of change impacts the performance negatively of ZNS devices 
for which the zone size is a power of two. I don't think it's fair to 
make others pay a price for a feature that is only needed for one single 
device.

Bart.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] block: support arbitrary zone size
  2020-02-10 22:08 [PATCH] block: support arbitrary zone size Alexey Dobriyan
  2020-02-10 22:20 ` [PATCH v2] " Alexey Dobriyan
  2020-02-10 22:23 ` [PATCH] " Bart Van Assche
@ 2020-02-10 22:33 ` Keith Busch
  2 siblings, 0 replies; 10+ messages in thread
From: Keith Busch @ 2020-02-10 22:33 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: axboe, linux-block, damien.lemoal

On Tue, Feb 11, 2020 at 01:08:16AM +0300, Alexey Dobriyan wrote:
>  void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
>  {
> -	BUG_ON(!is_power_of_2(chunk_sectors));
>  	q->limits.chunk_sectors = chunk_sectors;
>  }

This breaks blk_max_size_offset() if the value isn't a power of 2,
but I wouldn't want to replace its cheap mask with an expensive divide
operation anyway. Please keep the power-of-2 requirement.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] block: support arbitrary zone size
  2020-02-10 22:20 ` [PATCH v2] " Alexey Dobriyan
@ 2020-02-10 22:57   ` Damien Le Moal
  2020-02-10 23:06   ` Martin K. Petersen
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Damien Le Moal @ 2020-02-10 22:57 UTC (permalink / raw)
  To: Alexey Dobriyan, axboe; +Cc: linux-block

On 2020/02/11 7:20, Alexey Dobriyan wrote:
> SK hynix is going to ship ZNS device with zone size not being power of 2.
> 
> Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>

In addition to Bart and Keith comments, I will add a clear NACK on this, at
least for this patch as is. Details below inline.

> ---
> 
> 	v2: fixup one ">>ilog2"/div conversion :-/
> 
>  block/blk-settings.c           |    4 +---
>  block/blk-zoned.c              |   10 +++++-----
>  drivers/block/null_blk_main.c  |   11 ++++++-----
>  drivers/block/null_blk_zoned.c |   10 ++--------
>  4 files changed, 14 insertions(+), 21 deletions(-)
> 
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -206,15 +206,13 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors);
>   *
>   * Description:
>   *    If a driver doesn't want IOs to cross a given chunk size, it can set
> - *    this limit and prevent merging across chunks. Note that the chunk size
> - *    must currently be a power-of-2 in sectors. Also note that the block
> + *    this limit and prevent merging across chunks. Note that the block
>   *    layer must accept a page worth of data at any offset. So if the
>   *    crossing of chunks is a hard limitation in the driver, it must still be
>   *    prepared to split single page bios.
>   **/
>  void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
>  {
> -	BUG_ON(!is_power_of_2(chunk_sectors));
>  	q->limits.chunk_sectors = chunk_sectors;
>  }
>  EXPORT_SYMBOL(blk_queue_chunk_sectors);

Chunk sectors is used by the entire bio/request merging and splitting code.
For zoned devices, this ensures that requests do not cross over zones
boundaries due to merges. All that code relies on power of 2 arithmetics
(bit shifts), so allowing a non power of 2 value would mean changing all of
that. That is the hot path ! adding multiplications and divisions will slow
down all devices, including non zoned ones.

Furthermore, chunk sectors is also used to indicate the stripe size of
software or hardware raid arrays for use by file systems and device mappers
to do optimizations and also avoid merging of bios/request accross device
stripe boundaries. Not enforcing a power of 2 here also potentially breaks
all of that.


> --- a/block/blk-zoned.c
> +++ b/block/blk-zoned.c
> @@ -83,7 +83,7 @@ unsigned int blkdev_nr_zones(struct gendisk *disk)
>  
>  	if (!blk_queue_is_zoned(disk->queue))
>  		return 0;
> -	return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
> +	return div64_u64(get_capacity(disk) + zone_sectors - 1, zone_sectors);
>  }
>  EXPORT_SYMBOL_GPL(blkdev_nr_zones);

Sure, and this is the kind of modification that will be needed all over the
hot path for bio/request merging checks. Not nice at all. Slow down for
everything.

>  
> @@ -363,14 +363,14 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
>  	 * smaller last zone.
>  	 */
>  	if (zone->start == 0) {
> -		if (zone->len == 0 || !is_power_of_2(zone->len)) {
> -			pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
> -				disk->disk_name, zone->len);
> +		if (zone->len == 0) {
> +			pr_warn("%s: Invalid zoned device with length 0\n",
> +				disk->disk_name);
>  			return -ENODEV;
>  		}
>  
>  		args->zone_sectors = zone->len;
> -		args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
> +		args->nr_zones = div64_u64(capacity + zone->len - 1, zone->len);
>  	} else if (zone->start + args->zone_sectors < capacity) {
>  		if (zone->len != args->zone_sectors) {
>  			pr_warn("%s: Invalid zoned device with non constant zone size\n",

You only have these 2 changes for the generic block layer part, but there
are a lot more to add. Anything that has ilog2(chunk_sectors) (e.g.
blk_queue_zone_no()) needs to be changed. mq-deadline and zone write
locking handling, and all bio/request split & merge checks handling. And
again, even with such changes, allowing chunk_sectors to not be a power of
2 will cause problems for non-zoned use cases such as RAID and in all file
systems relying on that value to be a power of 2.

> --- a/drivers/block/null_blk_main.c
> +++ b/drivers/block/null_blk_main.c
> @@ -187,7 +187,7 @@ MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Defau
>  
>  static unsigned long g_zone_size = 256;
>  module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
> -MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
> +MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Default: 256");
>  
>  static unsigned int g_zone_nr_conv;
>  module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
> @@ -1641,10 +1641,11 @@ static int null_validate_conf(struct nullb_device *dev)
>  	if (dev->queue_mode == NULL_Q_BIO)
>  		dev->mbps = 0;
>  
> -	if (dev->zoned &&
> -	    (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
> -		pr_err("zone_size must be power-of-two\n");
> -		return -EINVAL;
> +	if (dev->zoned) {
> +		if (dev->zone_size == 0) {
> +			pr_err("zone_size must be positive\n");
> +			return -EINVAL;
> +		}
>  	}
>  
>  	return 0;
> --- a/drivers/block/null_blk_zoned.c
> +++ b/drivers/block/null_blk_zoned.c
> @@ -7,7 +7,7 @@
>  
>  static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
>  {
> -	return sect >> ilog2(dev->zone_size_sects);
> +	return div64_u64(sect, dev->zone_size_sects);
>  }
>  
>  int null_zone_init(struct nullb_device *dev)
> @@ -16,14 +16,8 @@ int null_zone_init(struct nullb_device *dev)
>  	sector_t sector = 0;
>  	unsigned int i;
>  
> -	if (!is_power_of_2(dev->zone_size)) {
> -		pr_err("zone_size must be power-of-two\n");
> -		return -EINVAL;
> -	}
> -
>  	dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
> -	dev->nr_zones = dev_size >>
> -				(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
> +	dev->nr_zones = div64_u64(dev_size, dev->zone_size_sects * SECTOR_SIZE);
>  	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
>  			GFP_KERNEL | __GFP_ZERO);
>  	if (!dev->zones)
> 

And beyond null_blk, other drivers are affected:
* DM core, dm-linear, dm-flakey and dm-zoned
* f2fs and zonefs file systems will not work as is on such drive. Ongoing
brtfs work is affected too.
* probably some things get broken in the scsi stack too.
* And user space will suffer badly too, at the very least all user space
tools associated with file systems and DM format/check and sys-utils.

-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] block: support arbitrary zone size
  2020-02-10 22:20 ` [PATCH v2] " Alexey Dobriyan
  2020-02-10 22:57   ` Damien Le Moal
@ 2020-02-10 23:06   ` Martin K. Petersen
  2020-02-10 23:28   ` Matias Bjørling
                     ` (2 subsequent siblings)
  4 siblings, 0 replies; 10+ messages in thread
From: Martin K. Petersen @ 2020-02-10 23:06 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: axboe, linux-block, damien.lemoal


Alexey,

> SK hynix is going to ship ZNS device with zone size not being power of 2.

That is going to wreak havoc all over the place. We have always stated
to device vendors that Linux only supports block and zone sizes that are
powers of two.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] block: support arbitrary zone size
  2020-02-10 22:20 ` [PATCH v2] " Alexey Dobriyan
  2020-02-10 22:57   ` Damien Le Moal
  2020-02-10 23:06   ` Martin K. Petersen
@ 2020-02-10 23:28   ` Matias Bjørling
  2020-02-11 17:16     ` Alexey Dobriyan
  2020-02-11  9:16   ` Javier González
  2020-02-12 20:00   ` Christoph Hellwig
  4 siblings, 1 reply; 10+ messages in thread
From: Matias Bjørling @ 2020-02-10 23:28 UTC (permalink / raw)
  To: Alexey Dobriyan, axboe; +Cc: linux-block, damien.lemoal

On 10/02/2020 23.20, Alexey Dobriyan wrote:
> SK hynix is going to ship ZNS device with zone size not being power of 2.
>
> Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
> ---

Nack from here as well. The power of 2 is a strict requirement. All SMR 
HDDs Zone Sizes are a power of two, and ZNS SSDs should be as well if 
they want Linux eco-system support.

The ZNS specification is specifically written to accommodate this. It 
has two type of zone sizes, the Zone Size (which is what should be a 
power of two) and the Zone Capacity (that is the writable capacity of 
the zone). For Linux support, your ZNS SSD implemention must have a zone 
size that is power of two, and instead use the Zone Capacity field to be 
the "Zone Size" that you have now. Then it works as expected, making 
both the host and device happy. The host gets power of two, and device 
can communicate the non-power of two writeable LBAs within the zone.

Best, Matias


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] block: support arbitrary zone size
  2020-02-10 22:20 ` [PATCH v2] " Alexey Dobriyan
                     ` (2 preceding siblings ...)
  2020-02-10 23:28   ` Matias Bjørling
@ 2020-02-11  9:16   ` Javier González
  2020-02-12 20:00   ` Christoph Hellwig
  4 siblings, 0 replies; 10+ messages in thread
From: Javier González @ 2020-02-11  9:16 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: axboe, linux-block, damien.lemoal

On 11.02.2020 01:20, Alexey Dobriyan wrote:
>SK hynix is going to ship ZNS device with zone size not being power of 2.
>
>Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
>---
>
>	v2: fixup one ">>ilog2"/div conversion :-/
>
> block/blk-settings.c           |    4 +---
> block/blk-zoned.c              |   10 +++++-----
> drivers/block/null_blk_main.c  |   11 ++++++-----
> drivers/block/null_blk_zoned.c |   10 ++--------
> 4 files changed, 14 insertions(+), 21 deletions(-)
>
>--- a/block/blk-settings.c
>+++ b/block/blk-settings.c
>@@ -206,15 +206,13 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors);
>  *
>  * Description:
>  *    If a driver doesn't want IOs to cross a given chunk size, it can set
>- *    this limit and prevent merging across chunks. Note that the chunk size
>- *    must currently be a power-of-2 in sectors. Also note that the block
>+ *    this limit and prevent merging across chunks. Note that the block
>  *    layer must accept a page worth of data at any offset. So if the
>  *    crossing of chunks is a hard limitation in the driver, it must still be
>  *    prepared to split single page bios.
>  **/
> void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
> {
>-	BUG_ON(!is_power_of_2(chunk_sectors));
> 	q->limits.chunk_sectors = chunk_sectors;
> }
> EXPORT_SYMBOL(blk_queue_chunk_sectors);
>--- a/block/blk-zoned.c
>+++ b/block/blk-zoned.c
>@@ -83,7 +83,7 @@ unsigned int blkdev_nr_zones(struct gendisk *disk)
>
> 	if (!blk_queue_is_zoned(disk->queue))
> 		return 0;
>-	return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
>+	return div64_u64(get_capacity(disk) + zone_sectors - 1, zone_sectors);
> }
> EXPORT_SYMBOL_GPL(blkdev_nr_zones);
>
>@@ -363,14 +363,14 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
> 	 * smaller last zone.
> 	 */
> 	if (zone->start == 0) {
>-		if (zone->len == 0 || !is_power_of_2(zone->len)) {
>-			pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
>-				disk->disk_name, zone->len);
>+		if (zone->len == 0) {
>+			pr_warn("%s: Invalid zoned device with length 0\n",
>+				disk->disk_name);
> 			return -ENODEV;
> 		}
>
> 		args->zone_sectors = zone->len;
>-		args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
>+		args->nr_zones = div64_u64(capacity + zone->len - 1, zone->len);
> 	} else if (zone->start + args->zone_sectors < capacity) {
> 		if (zone->len != args->zone_sectors) {
> 			pr_warn("%s: Invalid zoned device with non constant zone size\n",
>--- a/drivers/block/null_blk_main.c
>+++ b/drivers/block/null_blk_main.c
>@@ -187,7 +187,7 @@ MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Defau
>
> static unsigned long g_zone_size = 256;
> module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
>-MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
>+MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Default: 256");
>
> static unsigned int g_zone_nr_conv;
> module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
>@@ -1641,10 +1641,11 @@ static int null_validate_conf(struct nullb_device *dev)
> 	if (dev->queue_mode == NULL_Q_BIO)
> 		dev->mbps = 0;
>
>-	if (dev->zoned &&
>-	    (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
>-		pr_err("zone_size must be power-of-two\n");
>-		return -EINVAL;
>+	if (dev->zoned) {
>+		if (dev->zone_size == 0) {
>+			pr_err("zone_size must be positive\n");
>+			return -EINVAL;
>+		}
> 	}
>
> 	return 0;
>--- a/drivers/block/null_blk_zoned.c
>+++ b/drivers/block/null_blk_zoned.c
>@@ -7,7 +7,7 @@
>
> static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
> {
>-	return sect >> ilog2(dev->zone_size_sects);
>+	return div64_u64(sect, dev->zone_size_sects);
> }
>
> int null_zone_init(struct nullb_device *dev)
>@@ -16,14 +16,8 @@ int null_zone_init(struct nullb_device *dev)
> 	sector_t sector = 0;
> 	unsigned int i;
>
>-	if (!is_power_of_2(dev->zone_size)) {
>-		pr_err("zone_size must be power-of-two\n");
>-		return -EINVAL;
>-	}
>-
> 	dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
>-	dev->nr_zones = dev_size >>
>-				(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
>+	dev->nr_zones = div64_u64(dev_size, dev->zone_size_sects * SECTOR_SIZE);
> 	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
> 			GFP_KERNEL | __GFP_ZERO);
> 	if (!dev->zones)

Same response from here. You will need to add some changes to support
arbitrary zone capacities, but the zone size needs to be a power-of-2 to
plug into the zoned framework.

Javier

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] block: support arbitrary zone size
  2020-02-10 23:28   ` Matias Bjørling
@ 2020-02-11 17:16     ` Alexey Dobriyan
  0 siblings, 0 replies; 10+ messages in thread
From: Alexey Dobriyan @ 2020-02-11 17:16 UTC (permalink / raw)
  To: Matias Bjørling; +Cc: axboe, linux-block, damien.lemoal

On Tue, Feb 11, 2020 at 12:28:17AM +0100, Matias Bjørling wrote:
> The ZNS specification is specifically written to accommodate this. It 
> has two type of zone sizes, the Zone Size (which is what should be a 
> power of two) and the Zone Capacity (that is the writable capacity of 
> the zone).

Ouch, what an embarassing patch I've sent.
For some reason holes in LBA space completely escaped my mind.

	Alexey

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v2] block: support arbitrary zone size
  2020-02-10 22:20 ` [PATCH v2] " Alexey Dobriyan
                     ` (3 preceding siblings ...)
  2020-02-11  9:16   ` Javier González
@ 2020-02-12 20:00   ` Christoph Hellwig
  4 siblings, 0 replies; 10+ messages in thread
From: Christoph Hellwig @ 2020-02-12 20:00 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: axboe, linux-block, damien.lemoal

On Tue, Feb 11, 2020 at 01:20:45AM +0300, Alexey Dobriyan wrote:
> SK hynix is going to ship ZNS device with zone size not being power of 2.

NAK.  We clearly stated we require power of two zone sizes in Linux.
(Not Zone Capacities for ZNS, though - which is the whole point the
zone capacity concept was added).

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, back to index

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-02-10 22:08 [PATCH] block: support arbitrary zone size Alexey Dobriyan
2020-02-10 22:20 ` [PATCH v2] " Alexey Dobriyan
2020-02-10 22:57   ` Damien Le Moal
2020-02-10 23:06   ` Martin K. Petersen
2020-02-10 23:28   ` Matias Bjørling
2020-02-11 17:16     ` Alexey Dobriyan
2020-02-11  9:16   ` Javier González
2020-02-12 20:00   ` Christoph Hellwig
2020-02-10 22:23 ` [PATCH] " Bart Van Assche
2020-02-10 22:33 ` Keith Busch

Linux-Block Archive on lore.kernel.org

Archives are clonable:
	git clone --mirror https://lore.kernel.org/linux-block/0 linux-block/git/0.git

	# If you have public-inbox 1.1+ installed, you may
	# initialize and index your mirror using the following commands:
	public-inbox-init -V2 linux-block linux-block/ https://lore.kernel.org/linux-block \
		linux-block@vger.kernel.org
	public-inbox-index linux-block

Example config snippet for mirrors

Newsgroup available over NNTP:
	nntp://nntp.lore.kernel.org/org.kernel.vger.linux-block


AGPL code for this site: git clone https://public-inbox.org/public-inbox.git