All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] null_blk: zone support
@ 2018-07-06 17:38 Matias Bjørling
  2018-07-06 17:38 ` [PATCH 1/2] null_blk: move shared definitions to header file Matias Bjørling
                   ` (3 more replies)
  0 siblings, 4 replies; 25+ messages in thread
From: Matias Bjørling @ 2018-07-06 17:38 UTC (permalink / raw)
  To: axboe
  Cc: linux-block, linux-kernel, bart.vanassche, damien.lemoal,
	Matias Bjørling

This series adds support for exposing a zone block device using the
null_blk device driver.

The first patch moves core null_blk data structures to a shared header
file. The second implements the actual zone support. The patchset adds
two new options. One to enable the zone interface, and another to
define the size of the zones to expose.

Thanks,
Matias

Matias Bjørling (2):
  null_blk: move shared definitions to header file
  null_blk: add zone support

 Documentation/block/null_blk.txt |   7 ++
 drivers/block/Makefile           |   5 +-
 drivers/block/null_blk.c         | 124 ++++++++++++--------------------
 drivers/block/null_blk.h         | 108 ++++++++++++++++++++++++++++
 drivers/block/null_blk_zoned.c   | 149 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 315 insertions(+), 78 deletions(-)
 create mode 100644 drivers/block/null_blk.h
 create mode 100644 drivers/block/null_blk_zoned.c

-- 
2.11.0

^ permalink raw reply	[flat|nested] 25+ messages in thread

* [PATCH 1/2] null_blk: move shared definitions to header file
  2018-07-06 17:38 [PATCH 0/2] null_blk: zone support Matias Bjørling
@ 2018-07-06 17:38 ` Matias Bjørling
  2018-07-06 17:38 ` [PATCH 2/2] null_blk: add zone support Matias Bjørling
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 25+ messages in thread
From: Matias Bjørling @ 2018-07-06 17:38 UTC (permalink / raw)
  To: axboe
  Cc: linux-block, linux-kernel, bart.vanassche, damien.lemoal,
	Matias Bjørling

From: Matias Bjørling <matias.bjorling@wdc.com>

Split the null_blk device driver, such that it can prepare for
zoned block interface support.

Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
---
 drivers/block/null_blk.c | 76 +--------------------------------------------
 drivers/block/null_blk.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 75 deletions(-)
 create mode 100644 drivers/block/null_blk.h

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 042c778e5a4e..cd4b0849d3b4 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -7,14 +7,8 @@
 #include <linux/moduleparam.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
-#include <linux/blkdev.h>
 #include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/blk-mq.h>
-#include <linux/hrtimer.h>
-#include <linux/configfs.h>
-#include <linux/badblocks.h>
-#include <linux/fault-inject.h>
+#include "null_blk.h"
 
 #define PAGE_SECTORS_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
 #define PAGE_SECTORS		(1 << PAGE_SECTORS_SHIFT)
@@ -35,28 +29,6 @@ static inline u64 mb_per_tick(int mbps)
 	return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
 }
 
-struct nullb_cmd {
-	struct list_head list;
-	struct llist_node ll_list;
-	struct __call_single_data csd;
-	struct request *rq;
-	struct bio *bio;
-	unsigned int tag;
-	blk_status_t error;
-	struct nullb_queue *nq;
-	struct hrtimer timer;
-};
-
-struct nullb_queue {
-	unsigned long *tag_map;
-	wait_queue_head_t wait;
-	unsigned int queue_depth;
-	struct nullb_device *dev;
-	unsigned int requeue_selection;
-
-	struct nullb_cmd *cmds;
-};
-
 /*
  * Status flags for nullb_device.
  *
@@ -92,52 +64,6 @@ struct nullb_page {
 #define NULLB_PAGE_LOCK (MAP_SZ - 1)
 #define NULLB_PAGE_FREE (MAP_SZ - 2)
 
-struct nullb_device {
-	struct nullb *nullb;
-	struct config_item item;
-	struct radix_tree_root data; /* data stored in the disk */
-	struct radix_tree_root cache; /* disk cache data */
-	unsigned long flags; /* device flags */
-	unsigned int curr_cache;
-	struct badblocks badblocks;
-
-	unsigned long size; /* device size in MB */
-	unsigned long completion_nsec; /* time in ns to complete a request */
-	unsigned long cache_size; /* disk cache size in MB */
-	unsigned int submit_queues; /* number of submission queues */
-	unsigned int home_node; /* home node for the device */
-	unsigned int queue_mode; /* block interface */
-	unsigned int blocksize; /* block size */
-	unsigned int irqmode; /* IRQ completion handler */
-	unsigned int hw_queue_depth; /* queue depth */
-	unsigned int index; /* index of the disk, only valid with a disk */
-	unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
-	bool blocking; /* blocking blk-mq device */
-	bool use_per_node_hctx; /* use per-node allocation for hardware context */
-	bool power; /* power on/off the device */
-	bool memory_backed; /* if data is stored in memory */
-	bool discard; /* if support discard */
-};
-
-struct nullb {
-	struct nullb_device *dev;
-	struct list_head list;
-	unsigned int index;
-	struct request_queue *q;
-	struct gendisk *disk;
-	struct blk_mq_tag_set *tag_set;
-	struct blk_mq_tag_set __tag_set;
-	unsigned int queue_depth;
-	atomic_long_t cur_bytes;
-	struct hrtimer bw_timer;
-	unsigned long cache_flush_pos;
-	spinlock_t lock;
-
-	struct nullb_queue *queues;
-	unsigned int nr_queues;
-	char disk_name[DISK_NAME_LEN];
-};
-
 static LIST_HEAD(nullb_list);
 static struct mutex lock;
 static int null_major;
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
new file mode 100644
index 000000000000..d82c5501806d
--- /dev/null
+++ b/drivers/block/null_blk.h
@@ -0,0 +1,80 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BLK_NULL_BLK_H
+#define __BLK_NULL_BLK_H
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/blk-mq.h>
+#include <linux/hrtimer.h>
+#include <linux/configfs.h>
+#include <linux/badblocks.h>
+#include <linux/fault-inject.h>
+
+struct nullb_cmd {
+	struct list_head list;
+	struct llist_node ll_list;
+	struct __call_single_data csd;
+	struct request *rq;
+	struct bio *bio;
+	unsigned int tag;
+	blk_status_t error;
+	struct nullb_queue *nq;
+	struct hrtimer timer;
+};
+
+struct nullb_queue {
+	unsigned long *tag_map;
+	wait_queue_head_t wait;
+	unsigned int queue_depth;
+	struct nullb_device *dev;
+	unsigned int requeue_selection;
+
+	struct nullb_cmd *cmds;
+};
+
+struct nullb_device {
+	struct nullb *nullb;
+	struct config_item item;
+	struct radix_tree_root data; /* data stored in the disk */
+	struct radix_tree_root cache; /* disk cache data */
+	unsigned long flags; /* device flags */
+	unsigned int curr_cache;
+	struct badblocks badblocks;
+
+	unsigned long size; /* device size in MB */
+	unsigned long completion_nsec; /* time in ns to complete a request */
+	unsigned long cache_size; /* disk cache size in MB */
+	unsigned int submit_queues; /* number of submission queues */
+	unsigned int home_node; /* home node for the device */
+	unsigned int queue_mode; /* block interface */
+	unsigned int blocksize; /* block size */
+	unsigned int irqmode; /* IRQ completion handler */
+	unsigned int hw_queue_depth; /* queue depth */
+	unsigned int index; /* index of the disk, only valid with a disk */
+	unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
+	bool blocking; /* blocking blk-mq device */
+	bool use_per_node_hctx; /* use per-node allocation for hardware context */
+	bool power; /* power on/off the device */
+	bool memory_backed; /* if data is stored in memory */
+	bool discard; /* if support discard */
+};
+
+struct nullb {
+	struct nullb_device *dev;
+	struct list_head list;
+	unsigned int index;
+	struct request_queue *q;
+	struct gendisk *disk;
+	struct blk_mq_tag_set *tag_set;
+	struct blk_mq_tag_set __tag_set;
+	unsigned int queue_depth;
+	atomic_long_t cur_bytes;
+	struct hrtimer bw_timer;
+	unsigned long cache_flush_pos;
+	spinlock_t lock;
+
+	struct nullb_queue *queues;
+	unsigned int nr_queues;
+	char disk_name[DISK_NAME_LEN];
+};
+#endif /* __NULL_BLK_H */
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* [PATCH 2/2] null_blk: add zone support
  2018-07-06 17:38 [PATCH 0/2] null_blk: zone support Matias Bjørling
  2018-07-06 17:38 ` [PATCH 1/2] null_blk: move shared definitions to header file Matias Bjørling
@ 2018-07-06 17:38 ` Matias Bjørling
  2018-07-06 17:45 ` [PATCH 0/2] null_blk: " Laurence Oberman
  2018-07-07  2:54 ` [PATCH 0/2] null_blk: zone support Jens Axboe
  3 siblings, 0 replies; 25+ messages in thread
From: Matias Bjørling @ 2018-07-06 17:38 UTC (permalink / raw)
  To: axboe
  Cc: linux-block, linux-kernel, bart.vanassche, damien.lemoal,
	Matias Bjørling

From: Matias Bjørling <matias.bjorling@wdc.com>

Adds support for exposing a null_blk device through the zone device
interface.

The interface is managed with the parameters zoned and zone_size.
If zoned is set, the null_blk instance registers as a zoned block
device. The zone_size parameter defines how big each zone will be.

Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com>
Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com>
Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com>
---
 Documentation/block/null_blk.txt |   7 ++
 drivers/block/Makefile           |   5 +-
 drivers/block/null_blk.c         |  48 ++++++++++++-
 drivers/block/null_blk.h         |  28 ++++++++
 drivers/block/null_blk_zoned.c   | 149 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 234 insertions(+), 3 deletions(-)
 create mode 100644 drivers/block/null_blk_zoned.c

diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt
index 07f147381f32..ea2dafe49ae8 100644
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -85,3 +85,10 @@ shared_tags=[0/1]: Default: 0
   0: Tag set is not shared.
   1: Tag set shared between devices for blk-mq. Only makes sense with
      nr_devices > 1, otherwise there's no tag set to share.
+
+zoned=[0/1]: Default: 0
+  0: Block device is exposed as a random-access block device.
+  1: Block device is exposed as a host-managed zoned block device.
+
+zone_size=[MB]: Default: 256
+  Per zone size when exposed as a zoned block device. Must be a power of two.
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index dc061158b403..a0d88aa0c05d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -36,8 +36,11 @@ obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
 obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
 
 obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/
-obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk.o
 obj-$(CONFIG_ZRAM) += zram/
 
+obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk_mod.o
+null_blk_mod-objs	:= null_blk.o
+null_blk_mod-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o
+
 skd-y		:= skd_main.o
 swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index cd4b0849d3b4..99b6bfe7abd1 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -180,6 +180,14 @@ static bool g_use_per_node_hctx;
 module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
 MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
 
+static bool g_zoned;
+module_param_named(zoned, g_zoned, bool, S_IRUGO);
+MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");
+
+static unsigned long g_zone_size = 256;
+module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
+MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
+
 static struct nullb_device *null_alloc_dev(void);
 static void null_free_dev(struct nullb_device *dev);
 static void null_del_dev(struct nullb *nullb);
@@ -283,6 +291,8 @@ NULLB_DEVICE_ATTR(memory_backed, bool);
 NULLB_DEVICE_ATTR(discard, bool);
 NULLB_DEVICE_ATTR(mbps, uint);
 NULLB_DEVICE_ATTR(cache_size, ulong);
+NULLB_DEVICE_ATTR(zoned, bool);
+NULLB_DEVICE_ATTR(zone_size, ulong);
 
 static ssize_t nullb_device_power_show(struct config_item *item, char *page)
 {
@@ -394,6 +404,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
 	&nullb_device_attr_mbps,
 	&nullb_device_attr_cache_size,
 	&nullb_device_attr_badblocks,
+	&nullb_device_attr_zoned,
+	&nullb_device_attr_zone_size,
 	NULL,
 };
 
@@ -446,7 +458,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
 
 static ssize_t memb_group_features_show(struct config_item *item, char *page)
 {
-	return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n");
+	return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n");
 }
 
 CONFIGFS_ATTR_RO(memb_group_, features);
@@ -505,6 +517,8 @@ static struct nullb_device *null_alloc_dev(void)
 	dev->hw_queue_depth = g_hw_queue_depth;
 	dev->blocking = g_blocking;
 	dev->use_per_node_hctx = g_use_per_node_hctx;
+	dev->zoned = g_zoned;
+	dev->zone_size = g_zone_size;
 	return dev;
 }
 
@@ -513,6 +527,7 @@ static void null_free_dev(struct nullb_device *dev)
 	if (!dev)
 		return;
 
+	null_zone_exit(dev);
 	badblocks_exit(&dev->badblocks);
 	kfree(dev);
 }
@@ -1145,6 +1160,11 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
 	struct nullb *nullb = dev->nullb;
 	int err = 0;
 
+	if (req_op(cmd->rq) == REQ_OP_ZONE_REPORT) {
+		cmd->error = null_zone_report(nullb, cmd);
+		goto out;
+	}
+
 	if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
 		struct request *rq = cmd->rq;
 
@@ -1209,6 +1229,13 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd)
 		}
 	}
 	cmd->error = errno_to_blk_status(err);
+
+	if (!cmd->error && dev->zoned) {
+		if (req_op(cmd->rq) == REQ_OP_WRITE)
+			null_zone_write(cmd);
+		else if (req_op(cmd->rq) == REQ_OP_ZONE_RESET)
+			null_zone_reset(cmd);
+	}
 out:
 	/* Complete IO by inline, softirq or timer */
 	switch (dev->irqmode) {
@@ -1736,6 +1763,15 @@ static int null_add_dev(struct nullb_device *dev)
 		blk_queue_flush_queueable(nullb->q, true);
 	}
 
+	if (dev->zoned) {
+		rv = null_zone_init(dev);
+		if (rv)
+			goto out_cleanup_blk_queue;
+
+		blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects);
+		nullb->q->limits.zoned = BLK_ZONED_HM;
+	}
+
 	nullb->q->queuedata = nullb;
 	blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
 	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
@@ -1754,13 +1790,16 @@ static int null_add_dev(struct nullb_device *dev)
 
 	rv = null_gendisk_register(nullb);
 	if (rv)
-		goto out_cleanup_blk_queue;
+		goto out_cleanup_zone;
 
 	mutex_lock(&lock);
 	list_add_tail(&nullb->list, &nullb_list);
 	mutex_unlock(&lock);
 
 	return 0;
+out_cleanup_zone:
+	if (dev->zoned)
+		null_zone_exit(dev);
 out_cleanup_blk_queue:
 	blk_cleanup_queue(nullb->q);
 out_cleanup_tags:
@@ -1787,6 +1826,11 @@ static int __init null_init(void)
 		g_bs = PAGE_SIZE;
 	}
 
+	if (!is_power_of_2(g_zone_size)) {
+		pr_err("null_blk: zone_size must be power-of-two\n");
+		return -EINVAL;
+	}
+
 	if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
 		if (g_submit_queues != nr_online_nodes) {
 			pr_warn("null_blk: submit_queues param is set to %u.\n",
diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
index d82c5501806d..d81781f22dba 100644
--- a/drivers/block/null_blk.h
+++ b/drivers/block/null_blk.h
@@ -41,9 +41,14 @@ struct nullb_device {
 	unsigned int curr_cache;
 	struct badblocks badblocks;
 
+	unsigned int nr_zones;
+	struct blk_zone *zones;
+	sector_t zone_size_sects;
+
 	unsigned long size; /* device size in MB */
 	unsigned long completion_nsec; /* time in ns to complete a request */
 	unsigned long cache_size; /* disk cache size in MB */
+	unsigned long zone_size; /* zone size in MB if device is zoned */
 	unsigned int submit_queues; /* number of submission queues */
 	unsigned int home_node; /* home node for the device */
 	unsigned int queue_mode; /* block interface */
@@ -57,6 +62,7 @@ struct nullb_device {
 	bool power; /* power on/off the device */
 	bool memory_backed; /* if data is stored in memory */
 	bool discard; /* if support discard */
+	bool zoned; /* if device is zoned */
 };
 
 struct nullb {
@@ -77,4 +83,26 @@ struct nullb {
 	unsigned int nr_queues;
 	char disk_name[DISK_NAME_LEN];
 };
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int null_zone_init(struct nullb_device *dev);
+void null_zone_exit(struct nullb_device *dev);
+blk_status_t null_zone_report(struct nullb *nullb,
+					    struct nullb_cmd *cmd);
+void null_zone_write(struct nullb_cmd *cmd);
+void null_zone_reset(struct nullb_cmd *cmd);
+#else
+static inline int null_zone_init(struct nullb_device *dev)
+{
+	return -EINVAL;
+}
+static inline void null_zone_exit(struct nullb_device *dev) {}
+static inline blk_status_t null_zone_report(struct nullb *nullb,
+					    struct nullb_cmd *cmd)
+{
+	return BLK_STS_NOTSUPP;
+}
+static inline void null_zone_write(struct nullb_cmd *cmd) {}
+static inline void null_zone_reset(struct nullb_cmd *cmd) {}
+#endif /* CONFIG_BLK_DEV_ZONED */
 #endif /* __NULL_BLK_H */
diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
new file mode 100644
index 000000000000..a979ca00d7be
--- /dev/null
+++ b/drivers/block/null_blk_zoned.c
@@ -0,0 +1,149 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/vmalloc.h>
+#include "null_blk.h"
+
+/* zone_size in MBs to sectors. */
+#define ZONE_SIZE_SHIFT		11
+
+static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
+{
+	return sect >> ilog2(dev->zone_size_sects);
+}
+
+int null_zone_init(struct nullb_device *dev)
+{
+	sector_t dev_size = (sector_t)dev->size * 1024 * 1024;
+	sector_t sector = 0;
+	unsigned int i;
+
+	if (!is_power_of_2(dev->zone_size)) {
+		pr_err("null_blk: zone_size must be power-of-two\n");
+		return -EINVAL;
+	}
+
+	dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT;
+	dev->nr_zones = dev_size >>
+				(SECTOR_SHIFT + ilog2(dev->zone_size_sects));
+	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone),
+			GFP_KERNEL | __GFP_ZERO);
+	if (!dev->zones)
+		return -ENOMEM;
+
+	for (i = 0; i < dev->nr_zones; i++) {
+		struct blk_zone *zone = &dev->zones[i];
+
+		zone->start = zone->wp = sector;
+		zone->len = dev->zone_size_sects;
+		zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+		zone->cond = BLK_ZONE_COND_EMPTY;
+
+		sector += dev->zone_size_sects;
+	}
+
+	return 0;
+}
+
+void null_zone_exit(struct nullb_device *dev)
+{
+	kvfree(dev->zones);
+}
+
+static void null_zone_fill_rq(struct nullb_device *dev, struct request *rq,
+			      unsigned int zno, unsigned int nr_zones)
+{
+	struct blk_zone_report_hdr *hdr = NULL;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	void *addr;
+	unsigned int zones_to_cpy;
+
+	bio_for_each_segment(bvec, rq->bio, iter) {
+		addr = kmap_atomic(bvec.bv_page);
+
+		zones_to_cpy = bvec.bv_len / sizeof(struct blk_zone);
+
+		if (!hdr) {
+			hdr = (struct blk_zone_report_hdr *)addr;
+			hdr->nr_zones = nr_zones;
+			zones_to_cpy--;
+			addr += sizeof(struct blk_zone_report_hdr);
+		}
+
+		zones_to_cpy = min_t(unsigned int, zones_to_cpy, nr_zones);
+
+		memcpy(addr, &dev->zones[zno],
+				zones_to_cpy * sizeof(struct blk_zone));
+
+		kunmap_atomic(addr);
+
+		nr_zones -= zones_to_cpy;
+		zno += zones_to_cpy;
+
+		if (!nr_zones)
+			break;
+	}
+}
+
+blk_status_t null_zone_report(struct nullb *nullb,
+				     struct nullb_cmd *cmd)
+{
+	struct nullb_device *dev = nullb->dev;
+	struct request *rq = cmd->rq;
+	unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
+	unsigned int nr_zones = dev->nr_zones - zno;
+	unsigned int max_zones = (blk_rq_bytes(rq) /
+					sizeof(struct blk_zone)) - 1;
+
+	nr_zones = min_t(unsigned int, nr_zones, max_zones);
+
+	null_zone_fill_rq(nullb->dev, rq, zno, nr_zones);
+
+	return BLK_STS_OK;
+}
+
+void null_zone_write(struct nullb_cmd *cmd)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+	struct request *rq = cmd->rq;
+	sector_t sector = blk_rq_pos(rq);
+	unsigned int rq_sectors = blk_rq_sectors(rq);
+	unsigned int zno = null_zone_no(dev, sector);
+	struct blk_zone *zone = &dev->zones[zno];
+
+	switch (zone->cond) {
+	case BLK_ZONE_COND_FULL:
+		/* Cannot write to a full zone */
+		cmd->error = BLK_STS_IOERR;
+		break;
+	case BLK_ZONE_COND_EMPTY:
+	case BLK_ZONE_COND_IMP_OPEN:
+		/* Writes must be at the write pointer position */
+		if (blk_rq_pos(rq) != zone->wp) {
+			cmd->error = BLK_STS_IOERR;
+			break;
+		}
+
+		if (zone->cond == BLK_ZONE_COND_EMPTY)
+			zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+		zone->wp += rq_sectors;
+		if (zone->wp == zone->start + zone->len)
+			zone->cond = BLK_ZONE_COND_FULL;
+		break;
+	default:
+		/* Invalid zone condition */
+		cmd->error = BLK_STS_IOERR;
+		break;
+	}
+}
+
+void null_zone_reset(struct nullb_cmd *cmd)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+	struct request *rq = cmd->rq;
+	unsigned int zno = null_zone_no(dev, blk_rq_pos(rq));
+	struct blk_zone *zone = &dev->zones[zno];
+
+	zone->cond = BLK_ZONE_COND_EMPTY;
+	zone->wp = zone->start;
+}
-- 
2.11.0

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
  2018-07-06 17:38 [PATCH 0/2] null_blk: zone support Matias Bjørling
  2018-07-06 17:38 ` [PATCH 1/2] null_blk: move shared definitions to header file Matias Bjørling
  2018-07-06 17:38 ` [PATCH 2/2] null_blk: add zone support Matias Bjørling
@ 2018-07-06 17:45 ` Laurence Oberman
  2018-07-09  7:54   ` Matias Bjørling
  2018-07-07  2:54 ` [PATCH 0/2] null_blk: zone support Jens Axboe
  3 siblings, 1 reply; 25+ messages in thread
From: Laurence Oberman @ 2018-07-06 17:45 UTC (permalink / raw)
  To: Matias Bjørling, axboe
  Cc: linux-block, linux-kernel, bart.vanassche, damien.lemoal

On Fri, 2018-07-06 at 19:38 +0200, Matias Bjørling wrote:
> This series adds support for exposing a zone block device using the
> null_blk device driver.
> 
> The first patch moves core null_blk data structures to a shared
> header
> file. The second implements the actual zone support. The patchset
> adds
> two new options. One to enable the zone interface, and another to
> define the size of the zones to expose.
> 
> Thanks,
> Matias
> 
> Matias Bjørling (2):
>   null_blk: move shared definitions to header file
>   null_blk: add zone support
> 
>  Documentation/block/null_blk.txt |   7 ++
>  drivers/block/Makefile           |   5 +-
>  drivers/block/null_blk.c         | 124 ++++++++++++-----------------
> ---
>  drivers/block/null_blk.h         | 108 ++++++++++++++++++++++++++++
>  drivers/block/null_blk_zoned.c   | 149
> +++++++++++++++++++++++++++++++++++++++
>  5 files changed, 315 insertions(+), 78 deletions(-)
>  create mode 100644 drivers/block/null_blk.h
>  create mode 100644 drivers/block/null_blk_zoned.c
> 

Thank you for this will be very useful.
I will test it to add value.
I dont know the code well enough to review it
Regards
Laurence

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
  2018-07-06 17:38 [PATCH 0/2] null_blk: zone support Matias Bjørling
                   ` (2 preceding siblings ...)
  2018-07-06 17:45 ` [PATCH 0/2] null_blk: " Laurence Oberman
@ 2018-07-07  2:54 ` Jens Axboe
  3 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2018-07-07  2:54 UTC (permalink / raw)
  To: Matias Bjørling
  Cc: linux-block, linux-kernel, bart.vanassche, damien.lemoal

On 7/6/18 11:38 AM, Matias Bjørling wrote:
> This series adds support for exposing a zone block device using the
> null_blk device driver.
> 
> The first patch moves core null_blk data structures to a shared header
> file. The second implements the actual zone support. The patchset adds
> two new options. One to enable the zone interface, and another to
> define the size of the zones to expose.

Thanks, should be useful for verifying write pointer violations.
Applied for 4.19.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
  2018-07-06 17:45 ` [PATCH 0/2] null_blk: " Laurence Oberman
@ 2018-07-09  7:54   ` Matias Bjørling
  2018-07-09 16:34     ` Jens Axboe
  0 siblings, 1 reply; 25+ messages in thread
From: Matias Bjørling @ 2018-07-09  7:54 UTC (permalink / raw)
  To: loberman, axboe; +Cc: linux-block, linux-kernel, Bart.VanAssche, Damien.LeMoal

On 07/06/2018 07:45 PM, Laurence Oberman wrote:
> On Fri, 2018-07-06 at 19:38 +0200, Matias Bjørling wrote:
>> This series adds support for exposing a zone block device using the
>> null_blk device driver.
>>
>> The first patch moves core null_blk data structures to a shared
>> header
>> file. The second implements the actual zone support. The patchset
>> adds
>> two new options. One to enable the zone interface, and another to
>> define the size of the zones to expose.
>>
>> Thanks,
>> Matias
>>
>> Matias Bjørling (2):
>>    null_blk: move shared definitions to header file
>>    null_blk: add zone support
>>
>>   Documentation/block/null_blk.txt |   7 ++
>>   drivers/block/Makefile           |   5 +-
>>   drivers/block/null_blk.c         | 124 ++++++++++++-----------------
>> ---
>>   drivers/block/null_blk.h         | 108 ++++++++++++++++++++++++++++
>>   drivers/block/null_blk_zoned.c   | 149
>> +++++++++++++++++++++++++++++++++++++++
>>   5 files changed, 315 insertions(+), 78 deletions(-)
>>   create mode 100644 drivers/block/null_blk.h
>>   create mode 100644 drivers/block/null_blk_zoned.c
>>
> 
> Thank you for this will be very useful.
> I will test it to add value.
> I dont know the code well enough to review it
> Regards
> Laurence
> 

Great.

For fio, you can use the zone support here:

   https://github.com/bvanassche/fio

It is in the process of being upstreamed.

Also, you'll need the latest libzbc/util-linux (blkzone) if you want to 
report zones. A bug was fixed around detection of the drive.

   https://github.com/hgst/libzbc
   https://github.com/karelzak/util-linux

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
  2018-07-09  7:54   ` Matias Bjørling
@ 2018-07-09 16:34     ` Jens Axboe
  2018-07-10  0:05         ` Bart Van Assche
  0 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2018-07-09 16:34 UTC (permalink / raw)
  To: Matias Bjørling, loberman
  Cc: linux-block, linux-kernel, Bart.VanAssche, Damien.LeMoal

On 7/9/18 1:54 AM, Matias Bjørling wrote:
> For fio, you can use the zone support here:
> 
>    https://github.com/bvanassche/fio
> 
> It is in the process of being upstreamed.

In the spirit of making some progress on this, I just don't like how
it's done. For example, it should not be necessary to adjust what
comes out of the block generator, instead the block generator should
be told to do what we need on zbc. This is a key concept. The workload
should be defined as such that it works for zoned devices.

The trim as zone resets seems odd. What happens if you end up with a
zoned flash device in the future?

The support code looks fine.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
  2018-07-09 16:34     ` Jens Axboe
@ 2018-07-10  0:05         ` Bart Van Assche
  0 siblings, 0 replies; 25+ messages in thread
From: Bart Van Assche @ 2018-07-10  0:05 UTC (permalink / raw)
  To: mb, axboe, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

T24gTW9uLCAyMDE4LTA3LTA5IGF0IDEwOjM0IC0wNjAwLCBKZW5zIEF4Ym9lIHdyb3RlOg0KPiBP
biA3LzkvMTggMTo1NCBBTSwgTWF0aWFzIEJqw7hybGluZyB3cm90ZToNCj4gPiBGb3IgZmlvLCB5
b3UgY2FuIHVzZSB0aGUgem9uZSBzdXBwb3J0IGhlcmU6DQo+ID4gDQo+ID4gICAgaHR0cHM6Ly9n
aXRodWIuY29tL2J2YW5hc3NjaGUvZmlvDQo+ID4gDQo+ID4gSXQgaXMgaW4gdGhlIHByb2Nlc3Mg
b2YgYmVpbmcgdXBzdHJlYW1lZC4NCj4gDQo+IEluIHRoZSBzcGlyaXQgb2YgbWFraW5nIHNvbWUg
cHJvZ3Jlc3Mgb24gdGhpcywgSSBqdXN0IGRvbid0IGxpa2UgaG93DQo+IGl0J3MgZG9uZS4gRm9y
IGV4YW1wbGUsIGl0IHNob3VsZCBub3QgYmUgbmVjZXNzYXJ5IHRvIGFkanVzdCB3aGF0DQo+IGNv
bWVzIG91dCBvZiB0aGUgYmxvY2sgZ2VuZXJhdG9yLCBpbnN0ZWFkIHRoZSBibG9jayBnZW5lcmF0
b3Igc2hvdWxkDQo+IGJlIHRvbGQgdG8gZG8gd2hhdCB3ZSBuZWVkIG9uIHpiYy4gVGhpcyBpcyBh
IGtleSBjb25jZXB0LiBUaGUgd29ya2xvYWQNCj4gc2hvdWxkIGJlIGRlZmluZWQgYXMgc3VjaCB0
aGF0IGl0IHdvcmtzIGZvciB6b25lZCBkZXZpY2VzLg0KDQpIZWxsbyBKZW5zLA0KDQpIb3cgd291
bGQgeW91IGxpa2UgdG8gc2VlIGJsb2NrIGdlbmVyYXRpb24gd29yaz8gSSBkb24ndCBzZWUgYW4g
YWx0ZXJuYXRpdmUNCmZvciByYW5kb20gSS9PIG90aGVyIHN0YXJ0aW5nIGZyb20gdGhlIG91dHB1
dCBvZiBhIHJhbmRvbSBnZW5lcmF0b3IgYW5kDQp0cmFuc2xhdGluZyB0aGF0IG91dHB1dCBpbnRv
IHNvbWV0aGluZyB0aGF0IGlzIGFwcHJvcHJpYXRlIGZvciBhIHpvbmVkIGJsb2NrDQpkZXZpY2Uu
IFJhbmRvbSByZWFkcyBtdXN0IGhhcHBlbiBiZWxvdyB0aGUgem9uZSBwb2ludGVyIGlmIGZpbyBp
cyBjb25maWd1cmVkDQp0byByZWFkIGJlbG93IHRoZSB6b25lIHBvaW50ZXIuIFJhbmRvbSB3cml0
ZXMgbXVzdCBoYXBwZW4gYXQgdGhlIHdyaXRlDQpwb2ludGVyLiBUaGUgb25seSB3YXkgSSBzZWUg
dG8gaW1wbGVtZW50IHN1Y2ggYW4gSS9PIHBhdHRlcm4gaXMgdG8gc3RhcnQNCmZyb20gdGhlIG91
dHB1dCBvZiBhIHJhbmRvbSBnZW5lcmF0b3IgYW5kIHRvIGFkanVzdCB0aGUgb3V0cHV0IG9mIHRo
YXQNCnJhbmRvbSBnZW5lcmF0b3IuIEhvd2V2ZXIsIEkgZG9uJ3QgaGF2ZSBhIHN0cm9uZyBvcGlu
aW9uIHdoZXRoZXIgYWRqdXN0aW5nDQp0aGUgb3V0cHV0IG9mIGEgcmFuZG9tIGdlbmVyYXRvciBz
aG91bGQgaGFwcGVuIGJ5IHRoZSBjYWxsZXIgb2YNCmdldF9uZXh0X2J1ZmxlbigpIG9yIGluc2lk
ZSBnZXRfbmV4dF9idWZsZW4oKS4gT3IgaXMgeW91ciBjb25jZXJuIHBlcmhhcHMNCnRoYXQgdGhl
IGN1cnJlbnQgYXBwcm9hY2ggaW50ZXJmZXJlcyB3aXRoIGZpbyBqb2Igb3B0aW9ucyBsaWtlIGJz
X3VuYWxpZ25lZD8NCg0KPiBUaGUgdHJpbSBhcyB6b25lIHJlc2V0cyBzZWVtcyBvZGQuIFdoYXQg
aGFwcGVucyBpZiB5b3UgZW5kIHVwIHdpdGggYQ0KPiB6b25lZCBmbGFzaCBkZXZpY2UgaW4gdGhl
IGZ1dHVyZT8NCg0KV2UgY2FuIGxlYXZlIG91dCB0aGUgY29kZSB0aGF0IHRyYW5zbGF0ZXMgdHJp
bSBpbnRvIHpvbmUgcmVzZXRzIGZyb20gZmlvDQphbmQgZGlzY3VzcyBsYXRlciBob3cgdG8gaGFu
ZGxlIHRoaXMuIE9uZSBwb3NzaWJpbGl0eSBpcyB0byBtb2RpZnkgdGhlIHNkDQpkcml2ZXIgc3Vj
aCB0aGF0IGl0IHRyYW5zbGF0ZXMgUkVRX09QX0RJU0NBUkQgaW50byB6b25lIHJlc2V0cyB3aGVu
DQphcHByb3ByaWF0ZS4gVGhlcmUgbWF5IGJlIGJldHRlciBhbHRlcm5hdGl2ZXMuDQoNCkJhcnQu
DQoNCg0KDQoNCg==

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
@ 2018-07-10  0:05         ` Bart Van Assche
  0 siblings, 0 replies; 25+ messages in thread
From: Bart Van Assche @ 2018-07-10  0:05 UTC (permalink / raw)
  To: mb, axboe, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On Mon, 2018-07-09 at 10:34 -0600, Jens Axboe wrote:
> On 7/9/18 1:54 AM, Matias Bjørling wrote:
> > For fio, you can use the zone support here:
> > 
> >    https://github.com/bvanassche/fio
> > 
> > It is in the process of being upstreamed.
> 
> In the spirit of making some progress on this, I just don't like how
> it's done. For example, it should not be necessary to adjust what
> comes out of the block generator, instead the block generator should
> be told to do what we need on zbc. This is a key concept. The workload
> should be defined as such that it works for zoned devices.

Hello Jens,

How would you like to see block generation work? I don't see an alternative
for random I/O other starting from the output of a random generator and
translating that output into something that is appropriate for a zoned block
device. Random reads must happen below the zone pointer if fio is configured
to read below the zone pointer. Random writes must happen at the write
pointer. The only way I see to implement such an I/O pattern is to start
from the output of a random generator and to adjust the output of that
random generator. However, I don't have a strong opinion whether adjusting
the output of a random generator should happen by the caller of
get_next_buflen() or inside get_next_buflen(). Or is your concern perhaps
that the current approach interferes with fio job options like bs_unaligned?

> The trim as zone resets seems odd. What happens if you end up with a
> zoned flash device in the future?

We can leave out the code that translates trim into zone resets from fio
and discuss later how to handle this. One possibility is to modify the sd
driver such that it translates REQ_OP_DISCARD into zone resets when
appropriate. There may be better alternatives.

Bart.





^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
  2018-07-10  0:05         ` Bart Van Assche
  (?)
@ 2018-07-10 14:46         ` Jens Axboe
  2018-07-10 16:47             ` Bart Van Assche
  -1 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2018-07-10 14:46 UTC (permalink / raw)
  To: Bart Van Assche, mb, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On 7/9/18 6:05 PM, Bart Van Assche wrote:
> On Mon, 2018-07-09 at 10:34 -0600, Jens Axboe wrote:
>> On 7/9/18 1:54 AM, Matias Bjørling wrote:
>>> For fio, you can use the zone support here:
>>>
>>>    https://github.com/bvanassche/fio
>>>
>>> It is in the process of being upstreamed.
>>
>> In the spirit of making some progress on this, I just don't like how
>> it's done. For example, it should not be necessary to adjust what
>> comes out of the block generator, instead the block generator should
>> be told to do what we need on zbc. This is a key concept. The workload
>> should be defined as such that it works for zoned devices.
> 
> Hello Jens,
> 
> How would you like to see block generation work? I don't see an
> alternative for random I/O other starting from the output of a random
> generator and translating that output into something that is
> appropriate for a zoned block device. Random reads must happen below
> the zone pointer if fio is configured to read below the zone pointer.
> Random writes must happen at the write pointer. The only way I see to
> implement such an I/O pattern is to start from the output of a random
> generator and to adjust the output of that random generator. However,
> I don't have a strong opinion whether adjusting the output of a random
> generator should happen by the caller of get_next_buflen() or inside
> get_next_buflen(). Or is your concern perhaps that the current
> approach interferes with fio job options like bs_unaligned?

The main issue I have with that approach is that the core of fio is
generating the IO patterns, and then you are just changing them as you
see fit. This means that the workload definition and the resulting IO
operations are no longer matched up, since they now also depend on what
you are running on. If I take one workload and run it on a zoned drive,
and then run it on a non-zoned drive, I can't compare the results at
all. This is a showstopper.

There should be no adjusting of the output, rather it should be possible
to write zoned friendly job definitions. It should be possible to run
the same job on a non-zoned drive, and vice versa, and the resulting IO
patterns must be the same.

Fio already has some notion of zones. Maybe that could be extended to
hard zones, and some control of open zones, and patterns within those
zones?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
  2018-07-10 14:46         ` Jens Axboe
@ 2018-07-10 16:47             ` Bart Van Assche
  0 siblings, 0 replies; 25+ messages in thread
From: Bart Van Assche @ 2018-07-10 16:47 UTC (permalink / raw)
  To: mb, axboe, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

T24gVHVlLCAyMDE4LTA3LTEwIGF0IDA4OjQ2IC0wNjAwLCBKZW5zIEF4Ym9lIHdyb3RlOg0KPiBP
biA3LzkvMTggNjowNSBQTSwgQmFydCBWYW4gQXNzY2hlIHdyb3RlOg0KPiA+IE9uIE1vbiwgMjAx
OC0wNy0wOSBhdCAxMDozNCAtMDYwMCwgSmVucyBBeGJvZSB3cm90ZToNCj4gPiA+IEluIHRoZSBz
cGlyaXQgb2YgbWFraW5nIHNvbWUgcHJvZ3Jlc3Mgb24gdGhpcywgSSBqdXN0IGRvbid0IGxpa2Ug
aG93DQo+ID4gPiBpdCdzIGRvbmUuIEZvciBleGFtcGxlLCBpdCBzaG91bGQgbm90IGJlIG5lY2Vz
c2FyeSB0byBhZGp1c3Qgd2hhdA0KPiA+ID4gY29tZXMgb3V0IG9mIHRoZSBibG9jayBnZW5lcmF0
b3IsIGluc3RlYWQgdGhlIGJsb2NrIGdlbmVyYXRvciBzaG91bGQNCj4gPiA+IGJlIHRvbGQgdG8g
ZG8gd2hhdCB3ZSBuZWVkIG9uIHpiYy4gVGhpcyBpcyBhIGtleSBjb25jZXB0LiBUaGUgd29ya2xv
YWQNCj4gPiA+IHNob3VsZCBiZSBkZWZpbmVkIGFzIHN1Y2ggdGhhdCBpdCB3b3JrcyBmb3Igem9u
ZWQgZGV2aWNlcy4NCj4gPiANCj4gPiBIb3cgd291bGQgeW91IGxpa2UgdG8gc2VlIGJsb2NrIGdl
bmVyYXRpb24gd29yaz8gSSBkb24ndCBzZWUgYW4NCj4gPiBhbHRlcm5hdGl2ZSBmb3IgcmFuZG9t
IEkvTyBvdGhlciBzdGFydGluZyBmcm9tIHRoZSBvdXRwdXQgb2YgYSByYW5kb20NCj4gPiBnZW5l
cmF0b3IgYW5kIHRyYW5zbGF0aW5nIHRoYXQgb3V0cHV0IGludG8gc29tZXRoaW5nIHRoYXQgaXMN
Cj4gPiBhcHByb3ByaWF0ZSBmb3IgYSB6b25lZCBibG9jayBkZXZpY2UuIFJhbmRvbSByZWFkcyBt
dXN0IGhhcHBlbiBiZWxvdw0KPiA+IHRoZSB6b25lIHBvaW50ZXIgaWYgZmlvIGlzIGNvbmZpZ3Vy
ZWQgdG8gcmVhZCBiZWxvdyB0aGUgem9uZSBwb2ludGVyLg0KPiA+IFJhbmRvbSB3cml0ZXMgbXVz
dCBoYXBwZW4gYXQgdGhlIHdyaXRlIHBvaW50ZXIuIFRoZSBvbmx5IHdheSBJIHNlZSB0bw0KPiA+
IGltcGxlbWVudCBzdWNoIGFuIEkvTyBwYXR0ZXJuIGlzIHRvIHN0YXJ0IGZyb20gdGhlIG91dHB1
dCBvZiBhIHJhbmRvbQ0KPiA+IGdlbmVyYXRvciBhbmQgdG8gYWRqdXN0IHRoZSBvdXRwdXQgb2Yg
dGhhdCByYW5kb20gZ2VuZXJhdG9yLiBIb3dldmVyLA0KPiA+IEkgZG9uJ3QgaGF2ZSBhIHN0cm9u
ZyBvcGluaW9uIHdoZXRoZXIgYWRqdXN0aW5nIHRoZSBvdXRwdXQgb2YgYSByYW5kb20NCj4gPiBn
ZW5lcmF0b3Igc2hvdWxkIGhhcHBlbiBieSB0aGUgY2FsbGVyIG9mIGdldF9uZXh0X2J1Zmxlbigp
IG9yIGluc2lkZQ0KPiA+IGdldF9uZXh0X2J1ZmxlbigpLiBPciBpcyB5b3VyIGNvbmNlcm4gcGVy
aGFwcyB0aGF0IHRoZSBjdXJyZW50DQo+ID4gYXBwcm9hY2ggaW50ZXJmZXJlcyB3aXRoIGZpbyBq
b2Igb3B0aW9ucyBsaWtlIGJzX3VuYWxpZ25lZD8NCj4gDQo+IFRoZSBtYWluIGlzc3VlIEkgaGF2
ZSB3aXRoIHRoYXQgYXBwcm9hY2ggaXMgdGhhdCB0aGUgY29yZSBvZiBmaW8gaXMNCj4gZ2VuZXJh
dGluZyB0aGUgSU8gcGF0dGVybnMsIGFuZCB0aGVuIHlvdSBhcmUganVzdCBjaGFuZ2luZyB0aGVt
IGFzIHlvdQ0KPiBzZWUgZml0LiBUaGlzIG1lYW5zIHRoYXQgdGhlIHdvcmtsb2FkIGRlZmluaXRp
b24gYW5kIHRoZSByZXN1bHRpbmcgSU8NCj4gb3BlcmF0aW9ucyBhcmUgbm8gbG9uZ2VyIG1hdGNo
ZWQgdXAsIHNpbmNlIHRoZXkgbm93IGFsc28gZGVwZW5kIG9uIHdoYXQNCj4geW91IGFyZSBydW5u
aW5nIG9uLiBJZiBJIHRha2Ugb25lIHdvcmtsb2FkIGFuZCBydW4gaXQgb24gYSB6b25lZCBkcml2
ZSwNCj4gYW5kIHRoZW4gcnVuIGl0IG9uIGEgbm9uLXpvbmVkIGRyaXZlLCBJIGNhbid0IGNvbXBh
cmUgdGhlIHJlc3VsdHMgYXQNCj4gYWxsLiBUaGlzIGlzIGEgc2hvd3N0b3BwZXIuDQo+IA0KPiBU
aGVyZSBzaG91bGQgYmUgbm8gYWRqdXN0aW5nIG9mIHRoZSBvdXRwdXQsIHJhdGhlciBpdCBzaG91
bGQgYmUgcG9zc2libGUNCj4gdG8gd3JpdGUgem9uZWQgZnJpZW5kbHkgam9iIGRlZmluaXRpb25z
LiBJdCBzaG91bGQgYmUgcG9zc2libGUgdG8gcnVuDQo+IHRoZSBzYW1lIGpvYiBvbiBhIG5vbi16
b25lZCBkcml2ZSwgYW5kIHZpY2UgdmVyc2EsIGFuZCB0aGUgcmVzdWx0aW5nIElPDQo+IHBhdHRl
cm5zIG11c3QgYmUgdGhlIHNhbWUuDQo+IA0KPiBGaW8gYWxyZWFkeSBoYXMgc29tZSBub3Rpb24g
b2Ygem9uZXMuIE1heWJlIHRoYXQgY291bGQgYmUgZXh0ZW5kZWQgdG8NCj4gaGFyZCB6b25lcywg
YW5kIHNvbWUgY29udHJvbCBvZiBvcGVuIHpvbmVzLCBhbmQgcGF0dGVybnMgd2l0aGluIHRob3Nl
DQo+IHpvbmVzPw0KDQpIZWxsbyBKZW5zLA0KDQpIb3cgYWJvdXQgYWRkaW5nIGEgam9iIG9wdGlv
biB0aGF0IG1ha2VzIGl0IHBvc3NpYmxlIHRvIHVzZSB0aGUgem9uZWQNCmJsb2NrIGRldmljZSAo
WkJEKSBJL08gcGF0dGVybiBvbiBub24tWkJEIGRldmljZXMsIHJlcXVpcmluZyB0aGF0IHRoZQ0K
em9uZSBzaXplIGlzIHNldCBleHBsaWNpdGx5IGZvciBub24tWkJEIGRldmljZXMgYW5kIG1haW50
YWluaW5nIGEgd3JpdGUNCnBvaW50ZXIgbm90IG9ubHkgd2hlbiBwZXJmb3JtaW5nIEkvTyB0byBh
IFpCRCBkZXZpY2UgYnV0IGFsc28gaWYgYQ0KWkJELXN0eWxlIEkvTyBwYXR0ZXJuIGlzIGFwcGxp
ZWQgdG8gYSBub24tWkJEIGRpc2s/IFRoaXMgc2hvdWxkIGFsbG93IHRvDQphcHBseSBleGFjdGx5
IHRoZSBzYW1lIHdvcmtsb2FkIHRvIGEgbm9uLVpCRCBkaXNrIGFzIHRvIGEgWkJEIGRpc2suDQoN
CldoYXQgSSBkZXJpdmVkIGZyb20gdGhlIGZpbyBzb3VyY2UgY29kZSBpcyBhcyBmb2xsb3dzIChw
bGVhc2UgY29ycmVjdCBtZQ0KaWYgSSBnb3QgYW55dGhpbmcgd3JvbmcpOg0KKiBUaGUgcHVycG9z
ZSBvZiB0aGUgem9uZXNpemUsIHpvbmVyYW5nZSBhbmQgem9uZXNraXAgam9iIG9wdGlvbnMgaXMg
dG8NCiAgbGltaXQgdGhlIEkvTyByYW5nZSB0byBhIHNpbmdsZSB6b25lIHdpdGggc2l6ZSAiem9u
ZXNpemUiLiBUaGUgSS9PDQogIHBhdHRlcm4gZm9yIHpvbmVkIGJsb2NrIGRldmljZXMgaXMgZGlm
ZmVyZW50OiBJL08gaGFwcGVucyBpbiBtdWx0aXBsZQ0KICB6b25lcyBzaW11bHRhbmVvdXNseS4g
VGhlIG51bWJlciBvZiB6b25lcyB0byB3aGljaCBJL08gaGFwcGVucyBpcw0KICBjYWxsZWQgdGhl
IG51bWJlciBvZiBvcGVuIHpvbmVzLg0KKiBUaGUgcHVycG9zZSBvZiB0aGUgcmFuZG9tX2Rpc3Ry
aWJ1dGlvbj16b25lZHtfYWJzfSBqb2Igb3B0aW9uIGlzIHRvDQogIGFsbG93IHRoZSB1c2VyIHRv
IHNrZXcgYSB1bmlmb3JtIHJhbmRvbSBkaXN0cmlidXRpb24uIFRoaXMgaXMgYW5vdGhlcg0KICB3
b3JrbG9hZCBwYXR0ZXJuIHRoYW4gdGhlIHR5cGljYWwgcGF0dGVybiBmb3IgWkJEIGRyaXZlcy4N
Cg0KVGhhbmtzLA0KDQpCYXJ0Lg0KDQoNCg0KDQoNCg==

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
@ 2018-07-10 16:47             ` Bart Van Assche
  0 siblings, 0 replies; 25+ messages in thread
From: Bart Van Assche @ 2018-07-10 16:47 UTC (permalink / raw)
  To: mb, axboe, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On Tue, 2018-07-10 at 08:46 -0600, Jens Axboe wrote:
> On 7/9/18 6:05 PM, Bart Van Assche wrote:
> > On Mon, 2018-07-09 at 10:34 -0600, Jens Axboe wrote:
> > > In the spirit of making some progress on this, I just don't like how
> > > it's done. For example, it should not be necessary to adjust what
> > > comes out of the block generator, instead the block generator should
> > > be told to do what we need on zbc. This is a key concept. The workload
> > > should be defined as such that it works for zoned devices.
> > 
> > How would you like to see block generation work? I don't see an
> > alternative for random I/O other starting from the output of a random
> > generator and translating that output into something that is
> > appropriate for a zoned block device. Random reads must happen below
> > the zone pointer if fio is configured to read below the zone pointer.
> > Random writes must happen at the write pointer. The only way I see to
> > implement such an I/O pattern is to start from the output of a random
> > generator and to adjust the output of that random generator. However,
> > I don't have a strong opinion whether adjusting the output of a random
> > generator should happen by the caller of get_next_buflen() or inside
> > get_next_buflen(). Or is your concern perhaps that the current
> > approach interferes with fio job options like bs_unaligned?
> 
> The main issue I have with that approach is that the core of fio is
> generating the IO patterns, and then you are just changing them as you
> see fit. This means that the workload definition and the resulting IO
> operations are no longer matched up, since they now also depend on what
> you are running on. If I take one workload and run it on a zoned drive,
> and then run it on a non-zoned drive, I can't compare the results at
> all. This is a showstopper.
> 
> There should be no adjusting of the output, rather it should be possible
> to write zoned friendly job definitions. It should be possible to run
> the same job on a non-zoned drive, and vice versa, and the resulting IO
> patterns must be the same.
> 
> Fio already has some notion of zones. Maybe that could be extended to
> hard zones, and some control of open zones, and patterns within those
> zones?

Hello Jens,

How about adding a job option that makes it possible to use the zoned
block device (ZBD) I/O pattern on non-ZBD devices, requiring that the
zone size is set explicitly for non-ZBD devices and maintaining a write
pointer not only when performing I/O to a ZBD device but also if a
ZBD-style I/O pattern is applied to a non-ZBD disk? This should allow to
apply exactly the same workload to a non-ZBD disk as to a ZBD disk.

What I derived from the fio source code is as follows (please correct me
if I got anything wrong):
* The purpose of the zonesize, zonerange and zoneskip job options is to
  limit the I/O range to a single zone with size "zonesize". The I/O
  pattern for zoned block devices is different: I/O happens in multiple
  zones simultaneously. The number of zones to which I/O happens is
  called the number of open zones.
* The purpose of the random_distribution=zoned{_abs} job option is to
  allow the user to skew a uniform random distribution. This is another
  workload pattern than the typical pattern for ZBD drives.

Thanks,

Bart.






^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
  2018-07-10 16:47             ` Bart Van Assche
  (?)
@ 2018-07-10 18:45             ` Jens Axboe
  2018-07-10 18:49                 ` Bart Van Assche
  -1 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2018-07-10 18:45 UTC (permalink / raw)
  To: Bart Van Assche, mb, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On 7/10/18 10:47 AM, Bart Van Assche wrote:
> On Tue, 2018-07-10 at 08:46 -0600, Jens Axboe wrote:
>> On 7/9/18 6:05 PM, Bart Van Assche wrote:
>>> On Mon, 2018-07-09 at 10:34 -0600, Jens Axboe wrote:
>>>> In the spirit of making some progress on this, I just don't like how
>>>> it's done. For example, it should not be necessary to adjust what
>>>> comes out of the block generator, instead the block generator should
>>>> be told to do what we need on zbc. This is a key concept. The workload
>>>> should be defined as such that it works for zoned devices.
>>>
>>> How would you like to see block generation work? I don't see an
>>> alternative for random I/O other starting from the output of a random
>>> generator and translating that output into something that is
>>> appropriate for a zoned block device. Random reads must happen below
>>> the zone pointer if fio is configured to read below the zone pointer.
>>> Random writes must happen at the write pointer. The only way I see to
>>> implement such an I/O pattern is to start from the output of a random
>>> generator and to adjust the output of that random generator. However,
>>> I don't have a strong opinion whether adjusting the output of a random
>>> generator should happen by the caller of get_next_buflen() or inside
>>> get_next_buflen(). Or is your concern perhaps that the current
>>> approach interferes with fio job options like bs_unaligned?
>>
>> The main issue I have with that approach is that the core of fio is
>> generating the IO patterns, and then you are just changing them as you
>> see fit. This means that the workload definition and the resulting IO
>> operations are no longer matched up, since they now also depend on what
>> you are running on. If I take one workload and run it on a zoned drive,
>> and then run it on a non-zoned drive, I can't compare the results at
>> all. This is a showstopper.
>>
>> There should be no adjusting of the output, rather it should be possible
>> to write zoned friendly job definitions. It should be possible to run
>> the same job on a non-zoned drive, and vice versa, and the resulting IO
>> patterns must be the same.
>>
>> Fio already has some notion of zones. Maybe that could be extended to
>> hard zones, and some control of open zones, and patterns within those
>> zones?
> 
> Hello Jens,
> 
> How about adding a job option that makes it possible to use the zoned
> block device (ZBD) I/O pattern on non-ZBD devices, requiring that the
> zone size is set explicitly for non-ZBD devices and maintaining a write
> pointer not only when performing I/O to a ZBD device but also if a
> ZBD-style I/O pattern is applied to a non-ZBD disk? This should allow to
> apply exactly the same workload to a non-ZBD disk as to a ZBD disk.

It just doesn't make any sense to me. The source of truth is the
generator of the IO, which does exactly what it is told by the job
definition. You're proposing to mangle that somehow, to fit some
restrictions that the underlying device has. That very concept is
foreign, and adding an option to be able to do the same on some other
device is misleading. The difference between the job file and the
workload run can be huge. Consider something really basic:

[randwrites]
bs=4k
rw=randwrite

which would be 100% random 4k writes. If I run this on a zoned device,
then that'd turn into 100% sequential writes. That makes no sense at
all. And if I run it on a different devices, I'd get 100% random writes.
Except if I set some magic option. Sorry, but that concept is just too
ugly to live, it makes zero sense. Put down your zoned hat for a bit and
think about it.

> What I derived from the fio source code is as follows (please correct me
> if I got anything wrong):
> * The purpose of the zonesize, zonerange and zoneskip job options is to
>   limit the I/O range to a single zone with size "zonesize". The I/O
>   pattern for zoned block devices is different: I/O happens in multiple
>   zones simultaneously. The number of zones to which I/O happens is
>   called the number of open zones.

The only difference is that fio currently only has one zone active. When
it finishes one, it goes to the next. See my above suggestion on adding
the notion of open zones, which would extend this to more than 1.

> * The purpose of the random_distribution=zoned{_abs} job option is to
>   allow the user to skew a uniform random distribution. This is another
>   workload pattern than the typical pattern for ZBD drives.

Fio's zones were never intended to be for zoned devices. Don't get hung
up on current use cases, think about what kind of definitions would make
sense for zoned devices.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
  2018-07-10 18:45             ` Jens Axboe
@ 2018-07-10 18:49                 ` Bart Van Assche
  0 siblings, 0 replies; 25+ messages in thread
From: Bart Van Assche @ 2018-07-10 18:49 UTC (permalink / raw)
  To: mb, axboe, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

T24gVHVlLCAyMDE4LTA3LTEwIGF0IDEyOjQ1IC0wNjAwLCBKZW5zIEF4Ym9lIHdyb3RlOg0KPiBU
aGUgZGlmZmVyZW5jZSBiZXR3ZWVuIHRoZSBqb2IgZmlsZSBhbmQgdGhlDQo+IHdvcmtsb2FkIHJ1
biBjYW4gYmUgaHVnZS4gQ29uc2lkZXIgc29tZXRoaW5nIHJlYWxseSBiYXNpYzoNCj4gDQo+IFty
YW5kd3JpdGVzXQ0KPiBicz00aw0KPiBydz1yYW5kd3JpdGUNCj4gDQo+IHdoaWNoIHdvdWxkIGJl
IDEwMCUgcmFuZG9tIDRrIHdyaXRlcy4gSWYgSSBydW4gdGhpcyBvbiBhIHpvbmVkIGRldmljZSwN
Cj4gdGhlbiB0aGF0J2QgdHVybiBpbnRvIDEwMCUgc2VxdWVudGlhbCB3cml0ZXMuDQoNClRoYXQn
cyBub3QgY29ycmVjdC4gVGhlIFpCRCBjb2RlIGluIHRoZSBnaXRodWIgcHVsbCByZXF1ZXN0IHNl
cmlhbGl6ZXMgd3JpdGVzDQpwZXIgem9uZSwgbm90IGdsb2JhbGx5Lg0KDQo+ID4gV2hhdCBJIGRl
cml2ZWQgZnJvbSB0aGUgZmlvIHNvdXJjZSBjb2RlIGlzIGFzIGZvbGxvd3MgKHBsZWFzZSBjb3Jy
ZWN0IG1lDQo+ID4gaWYgSSBnb3QgYW55dGhpbmcgd3JvbmcpOg0KPiA+ICogVGhlIHB1cnBvc2Ug
b2YgdGhlIHpvbmVzaXplLCB6b25lcmFuZ2UgYW5kIHpvbmVza2lwIGpvYiBvcHRpb25zIGlzIHRv
DQo+ID4gICBsaW1pdCB0aGUgSS9PIHJhbmdlIHRvIGEgc2luZ2xlIHpvbmUgd2l0aCBzaXplICJ6
b25lc2l6ZSIuIFRoZSBJL08NCj4gPiAgIHBhdHRlcm4gZm9yIHpvbmVkIGJsb2NrIGRldmljZXMg
aXMgZGlmZmVyZW50OiBJL08gaGFwcGVucyBpbiBtdWx0aXBsZQ0KPiA+ICAgem9uZXMgc2ltdWx0
YW5lb3VzbHkuIFRoZSBudW1iZXIgb2Ygem9uZXMgdG8gd2hpY2ggSS9PIGhhcHBlbnMgaXMNCj4g
PiAgIGNhbGxlZCB0aGUgbnVtYmVyIG9mIG9wZW4gem9uZXMuDQo+IA0KPiBUaGUgb25seSBkaWZm
ZXJlbmNlIGlzIHRoYXQgZmlvIGN1cnJlbnRseSBvbmx5IGhhcyBvbmUgem9uZSBhY3RpdmUuIFdo
ZW4NCj4gaXQgZmluaXNoZXMgb25lLCBpdCBnb2VzIHRvIHRoZSBuZXh0LiBTZWUgbXkgYWJvdmUg
c3VnZ2VzdGlvbiBvbiBhZGRpbmcNCj4gdGhlIG5vdGlvbiBvZiBvcGVuIHpvbmVzLCB3aGljaCB3
b3VsZCBleHRlbmQgdGhpcyB0byBtb3JlIHRoYW4gMS4NCg0KVGhhbmtzLCBJIHdpbGwgbG9vayBp
bnRvIHRoaXMuDQoNCkJhcnQuDQoNCg0KDQoNCg0K

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
@ 2018-07-10 18:49                 ` Bart Van Assche
  0 siblings, 0 replies; 25+ messages in thread
From: Bart Van Assche @ 2018-07-10 18:49 UTC (permalink / raw)
  To: mb, axboe, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote:
> The difference between the job file and the
> workload run can be huge. Consider something really basic:
> 
> [randwrites]
> bs=4k
> rw=randwrite
> 
> which would be 100% random 4k writes. If I run this on a zoned device,
> then that'd turn into 100% sequential writes.

That's not correct. The ZBD code in the github pull request serializes writes
per zone, not globally.

> > What I derived from the fio source code is as follows (please correct me
> > if I got anything wrong):
> > * The purpose of the zonesize, zonerange and zoneskip job options is to
> >   limit the I/O range to a single zone with size "zonesize". The I/O
> >   pattern for zoned block devices is different: I/O happens in multiple
> >   zones simultaneously. The number of zones to which I/O happens is
> >   called the number of open zones.
> 
> The only difference is that fio currently only has one zone active. When
> it finishes one, it goes to the next. See my above suggestion on adding
> the notion of open zones, which would extend this to more than 1.

Thanks, I will look into this.

Bart.






^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH 0/2] null_blk: zone support
  2018-07-10 18:49                 ` Bart Van Assche
  (?)
@ 2018-07-10 18:51                 ` Jens Axboe
  2018-08-09 20:51                     ` Bart Van Assche
  -1 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2018-07-10 18:51 UTC (permalink / raw)
  To: Bart Van Assche, mb, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On 7/10/18 12:49 PM, Bart Van Assche wrote:
> On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote:
>> The difference between the job file and the
>> workload run can be huge. Consider something really basic:
>>
>> [randwrites]
>> bs=4k
>> rw=randwrite
>>
>> which would be 100% random 4k writes. If I run this on a zoned device,
>> then that'd turn into 100% sequential writes.
> 
> That's not correct. The ZBD code in the github pull request serializes writes
> per zone, not globally.

That's a totally minor detail. If all my random writes fall within a single
zone, then they'd be 100% sequential. For N open zones, you'd be 100%
sequential within the zone. The point is that the workload as defined and
the workload as run are two totally different things, and THAT is the
problem.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: Zoned block device support for fio (was: [PATCH 0/2] null_blk: zone support)
  2018-07-10 18:51                 ` Jens Axboe
@ 2018-08-09 20:51                     ` Bart Van Assche
  0 siblings, 0 replies; 25+ messages in thread
From: Bart Van Assche @ 2018-08-09 20:51 UTC (permalink / raw)
  To: mb, axboe, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On Tue, 2018-07-10 at 12:51 -0600, Jens Axboe wrote:
> On 7/10/18 12:49 PM, Bart Van Assche wrote:
> > On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote:
> > > The difference between the job file and the
> > > workload run can be huge. Consider something really basic=
:
> > >=20
> > > [randwrites]
> > > bs=4k
> > > rw=randwrite
> > >=20
> > > which would be 100% random 4k writes. If I run this o=
n a zoned device,
> > > then that'd turn into 100% sequential writes.
> >=20
> > That's not correct. The ZBD code in the github pull request ser=
ializes writes
> > per zone, not globally.
>=20
> That's a totally minor detail. If all my random writes fall within a =
single
> zone, then they'd be 100% sequential. For N open zones, you'd be =
100%
> sequential within the zone. The point is that the workload as defined=
 and
> the workload as run are two totally different things, and THAT is the
> problem.

Hello Jens,

What you proposed in a previous e-mail, namely to use the existing fio zone
concept for zoned block devices sounds interesting to me. This is something=
 I
will definitely look further into. This will help to make a given workload
that is suited for zoned block devices to behave (almost) identical when ru=
n
against a regular block device. Since fio users expect that no zones are
reset before e.g. a random write test is started, there will always be a sm=
all
difference in behavior between a workload run against a zoned block device =
and
a workload run against a regular device if some zones already contain data.

It's not clear to me how close you want the behavior of fio to be for zoned
and regular block devices. Do you e.g. want me to introduce a new I/O patte=
rn
(--rw=...) that causes fio to write sequentially inside a zone and for =
which
zones are selected randomly? I don't see any other approach that allows to
make sure that the same workload definition behaves identically against zon=
ed
and regular block devices.

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: Zoned block device support for fio (was: [PATCH 0/2] null_blk: zone support)
@ 2018-08-09 20:51                     ` Bart Van Assche
  0 siblings, 0 replies; 25+ messages in thread
From: Bart Van Assche @ 2018-08-09 20:51 UTC (permalink / raw)
  To: mb, axboe, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On Tue, 2018-07-10 at 12:51 -0600, Jens Axboe wrote:
> On 7/10/18 12:49 PM, Bart Van Assche wrote:
> > On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote:
> > > The difference between the job file and the
> > > workload run can be huge. Consider something really basic:
> > > 
> > > [randwrites]
> > > bs=4k
> > > rw=randwrite
> > > 
> > > which would be 100% random 4k writes. If I run this on a zoned device,
> > > then that'd turn into 100% sequential writes.
> > 
> > That's not correct. The ZBD code in the github pull request serializes writes
> > per zone, not globally.
> 
> That's a totally minor detail. If all my random writes fall within a single
> zone, then they'd be 100% sequential. For N open zones, you'd be 100%
> sequential within the zone. The point is that the workload as defined and
> the workload as run are two totally different things, and THAT is the
> problem.

Hello Jens,

What you proposed in a previous e-mail, namely to use the existing fio zone
concept for zoned block devices sounds interesting to me. This is something I
will definitely look further into. This will help to make a given workload
that is suited for zoned block devices to behave (almost) identical when run
against a regular block device. Since fio users expect that no zones are
reset before e.g. a random write test is started, there will always be a small
difference in behavior between a workload run against a zoned block device and
a workload run against a regular device if some zones already contain data.

It's not clear to me how close you want the behavior of fio to be for zoned
and regular block devices. Do you e.g. want me to introduce a new I/O pattern
(--rw=...) that causes fio to write sequentially inside a zone and for which
zones are selected randomly? I don't see any other approach that allows to
make sure that the same workload definition behaves identically against zoned
and regular block devices.

Thanks,

Bart.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: Zoned block device support for fio
  2018-08-09 20:51                     ` Bart Van Assche
  (?)
@ 2018-08-09 21:03                     ` Jens Axboe
  2018-08-15 18:07                         ` Bart Van Assche
  -1 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2018-08-09 21:03 UTC (permalink / raw)
  To: Bart Van Assche, mb, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On 8/9/18 2:51 PM, Bart Van Assche wrote:
> On Tue, 2018-07-10 at 12:51 -0600, Jens Axboe wrote:
>> On 7/10/18 12:49 PM, Bart Van Assche wrote:
>>> On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote:
>>>> The difference between the job file and the
>>>> workload run can be huge. Consider something really basic:
>>>>
>>>> [randwrites]
>>>> bs=4k
>>>> rw=randwrite
>>>>
>>>> which would be 100% random 4k writes. If I run this on a zoned device,
>>>> then that'd turn into 100% sequential writes.
>>>
>>> That's not correct. The ZBD code in the github pull request serializes writes
>>> per zone, not globally.
>>
>> That's a totally minor detail. If all my random writes fall within a single
>> zone, then they'd be 100% sequential. For N open zones, you'd be 100%
>> sequential within the zone. The point is that the workload as defined and
>> the workload as run are two totally different things, and THAT is the
>> problem.
> 
> Hello Jens,
> 
> What you proposed in a previous e-mail, namely to use the existing fio zone
> concept for zoned block devices sounds interesting to me. This is something I
> will definitely look further into. This will help to make a given workload
> that is suited for zoned block devices to behave (almost) identical when run
> against a regular block device. Since fio users expect that no zones are
> reset before e.g. a random write test is started, there will always be a small
> difference in behavior between a workload run against a zoned block device and
> a workload run against a regular device if some zones already contain data.
> 
> It's not clear to me how close you want the behavior of fio to be for zoned
> and regular block devices. Do you e.g. want me to introduce a new I/O pattern
> (--rw=...) that causes fio to write sequentially inside a zone and for which
> zones are selected randomly? I don't see any other approach that allows to
> make sure that the same workload definition behaves identically against zoned
> and regular block devices.

Yes exactly. Basically come up with something that just maps fio zones on
top of SMR zones. Then come up with something that allows you to have
sequential writes IO within a zone, and something that allows you to decide
how many zones to use, when to skip between zones, etc.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: Zoned block device support for fio
  2018-08-09 21:03                     ` Zoned block device support for fio Jens Axboe
@ 2018-08-15 18:07                         ` Bart Van Assche
  0 siblings, 0 replies; 25+ messages in thread
From: Bart Van Assche @ 2018-08-15 18:07 UTC (permalink / raw)
  To: mb, axboe, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On Thu, 2018-08-09 at 15:03 -0600, Jens Axboe wrote:
> Yes exactly. Basically come up with something that just maps fio zone=
s on
> top of SMR zones. Then come up with something that allows you to have
> sequential writes IO within a zone, and something that allows you to =
decide
> how many zones to use, when to skip between zones, etc.

Hello Jens,

SMR support for fio has been reworked. The reworked patches are available a=
t
https://github.com/axboe/fio/pull/585. The following two test scripts allow=
 to
verify whether the fio behavior is identical for SMR and non-SMR storage me=
dia:
* t/zbd/run-tests-against-regular-nullb
* t/zbd/run-tests-against-zoned-nullb

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: Zoned block device support for fio
@ 2018-08-15 18:07                         ` Bart Van Assche
  0 siblings, 0 replies; 25+ messages in thread
From: Bart Van Assche @ 2018-08-15 18:07 UTC (permalink / raw)
  To: mb, axboe, loberman; +Cc: linux-kernel, linux-block, Damien Le Moal

On Thu, 2018-08-09 at 15:03 -0600, Jens Axboe wrote:
> Yes exactly. Basically come up with something that just maps fio zones on
> top of SMR zones. Then come up with something that allows you to have
> sequential writes IO within a zone, and something that allows you to decide
> how many zones to use, when to skip between zones, etc.

Hello Jens,

SMR support for fio has been reworked. The reworked patches are available at
https://github.com/axboe/fio/pull/585. The following two test scripts allow to
verify whether the fio behavior is identical for SMR and non-SMR storage media:
* t/zbd/run-tests-against-regular-nullb
* t/zbd/run-tests-against-zoned-nullb

Thanks,

Bart.


^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: Zoned block device support for fio
  2018-08-24 16:42   ` Bart Van Assche
@ 2018-08-24 16:46     ` Jens Axboe
  0 siblings, 0 replies; 25+ messages in thread
From: Jens Axboe @ 2018-08-24 16:46 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: fio, Jason Jorgensen, Dmitry Fomichev, phillip.a.chen,
	Damien Le Moal, Masato Suzuki, Matias Bjorling

On 8/24/18 10:42 AM, Bart Van Assche wrote:
>> zbd_init() should not be something that
>> is unconditionally called. Only do it if ZONE_MODE_ZBD is set and make
>> that explicit.
> 
> There is a test in the zbd_init() code that makes fio report an error
> message if --zonemode=zbd has not been specified for zoned block devices:
> 
> 	if (td->o.zone_mode != ZONE_MODE_ZBD) {
>  		for_each_file(td, f, i) {
> 			if (get_zbd_model(f->file_name) != ZBD_DM_HOST_MANAGED)
> 				continue;
> 			log_err("%s: Using --zonemode=zbd is mandatory for host-managed drives\n\n",
> 				f->file_name);
> 			return 1;
> 		}
> 	}
> 
> I think that error message is very helpful for people who run fio for the
> first time against a zoned block device so I would like to keep that error
> message. However, I'm not sure where you would like me to move that code if
> zbd_init() is only called for --zonemode=zbd?

That's exactly what I'm objecting to. If someone doesn't ask for zoned
behavior, don't do any of that. If the test then fails, then the test
will fail. Don't look or parse any of that stuff. If need be, you could
introduce an error check similarly to how we add an extra information
note if the first O_DIRECT IO fails, for instance.

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: Zoned block device support for fio
  2018-08-24 13:48 ` Jens Axboe
@ 2018-08-24 16:42   ` Bart Van Assche
  2018-08-24 16:46     ` Jens Axboe
  0 siblings, 1 reply; 25+ messages in thread
From: Bart Van Assche @ 2018-08-24 16:42 UTC (permalink / raw)
  To: axboe
  Cc: fio, Jason Jorgensen, Dmitry Fomichev, phillip.a.chen,
	Damien Le Moal, Masato Suzuki, Matias Bjorling

On Fri, 2018-08-24 at 07:48 -0600, Jens Axboe wrote:
> I think this is starting to look good. For the filesetup.c changes, I think
> that this:
> 
> +       if (o->zone_mode == ZONE_MODE_NONE && o->zone_size)
> +               o->zone_mode = ZONE_MODE_STRIDED;
> 
> belongs in fixup_options().

Hello Jens,

Thanks for having taken a look. That change has been made in the patches I
just posted on the fio mailing list.

> zbd_init() should not be something that
> is unconditionally called. Only do it if ZONE_MODE_ZBD is set and make
> that explicit.

There is a test in the zbd_init() code that makes fio report an error
message if --zonemode=zbd has not been specified for zoned block devices:

	if (td->o.zone_mode != ZONE_MODE_ZBD) {
 		for_each_file(td, f, i) {
			if (get_zbd_model(f->file_name) != ZBD_DM_HOST_MANAGED)
				continue;
			log_err("%s: Using --zonemode=zbd is mandatory for host-managed drives\n\n",
				f->file_name);
			return 1;
		}
	}

I think that error message is very helpful for people who run fio for the
first time against a zoned block device so I would like to keep that error
message. However, I'm not sure where you would like me to move that code if
zbd_init() is only called for --zonemode=zbd?

> Also kill zbd querying in zbd_init() if we haven't asked for it.

Can you clarify this? parse_zone_info() and read_zone_info() are only called
if --zonemode=zbd has been specified. The only new query that is triggered
from zbd_init() for regular block devices is get_zbd_model(). That function
reads /sys/dev/block/.../queue/zoned but does not try to perform any ioctls
that are specific to zoned block devices.

Thanks,

Bart.



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: Zoned block device support for fio
  2018-08-23 23:00 Zoned block device support for fio Bart Van Assche
@ 2018-08-24 13:48 ` Jens Axboe
  2018-08-24 16:42   ` Bart Van Assche
  0 siblings, 1 reply; 25+ messages in thread
From: Jens Axboe @ 2018-08-24 13:48 UTC (permalink / raw)
  To: Bart Van Assche
  Cc: fio, Jason Jorgensen, Dmitry Fomichev, phillip.a.chen,
	Damien Le Moal, Masato Suzuki, Matias Bjorling

On 8/23/18 5:00 PM, Bart Van Assche wrote:
> Hello,
> 
> The code on the master branch of my fio repository on github has been
> updated again. The changes compared to the previous version are:
> * A new job option has been added, namely zonemode. zonemode=zbd has to be
>   specified when filename refers to a zoned block device. zonesize has a new
>   meaning for zonemode=zbd. When both zonemode and zonesize have been
>   specified and filename refers to a regular block device the I/O pattern
>   the same as for a zoned block device (at least if all write pointers of the
>   zoned block device are reset before fio is started). This behavior change
>   was requested by Jens.
> * A subtle bug has been fixed that caused I/O against a zoned block device to
>   stop early if multiple jobs were doing random writes to the same LBA range
>   and with the random map enabled.
> * The zonemode=zbd do_verify=1 combination no longer causes fio to report
>   verify errors when run against the conventional zones of a zoned block
>   device.
> * The code for skipping offline zones should now work. Offline zones were not
>   skipped in previous versions. I have not yet tested this code - testing this
>   change requires access to a real or simulated openchannel SSD.
> * Several new unit tests have been added to t/zbc/test-zbc-support.
> 
> See also https://github.com/axboe/fio/pull/585.

I'm going to reply here, since I generally hate github review for anything
but the simplest of things.

I think this is starting to look good. For the filesetup.c changes, I think
that this:

+       if (o->zone_mode == ZONE_MODE_NONE && o->zone_size)
+               o->zone_mode = ZONE_MODE_STRIDED;

belongs in fixup_options()zbd_init() should not be something that
is unconditionally called. Only do it if ZONE_MODE_ZBD is set and make
that explicit. Also kill zbd querying in zbd_init() if we haven't asked for
it.

Can you git send-email the series to me and the list? I'll take a closer
look.

-- 
Jens Axboe



^ permalink raw reply	[flat|nested] 25+ messages in thread

* Zoned block device support for fio
@ 2018-08-23 23:00 Bart Van Assche
  2018-08-24 13:48 ` Jens Axboe
  0 siblings, 1 reply; 25+ messages in thread
From: Bart Van Assche @ 2018-08-23 23:00 UTC (permalink / raw)
  To: axboe
  Cc: fio, Jason Jorgensen, Dmitry Fomichev, phillip.a.chen,
	Damien Le Moal, Masato Suzuki, Matias Bjorling

Hello,

The code on the master branch of my fio repository on github has been
updated again. The changes compared to the previous version are:
* A new job option has been added, namely zonemode. zonemode=zbd has to be
  specified when filename refers to a zoned block device. zonesize has a new
  meaning for zonemode=zbd. When both zonemode and zonesize have been
  specified and filename refers to a regular block device the I/O pattern
  the same as for a zoned block device (at least if all write pointers of the
  zoned block device are reset before fio is started). This behavior change
  was requested by Jens.
* A subtle bug has been fixed that caused I/O against a zoned block device to
  stop early if multiple jobs were doing random writes to the same LBA range
  and with the random map enabled.
* The zonemode=zbd do_verify=1 combination no longer causes fio to report
  verify errors when run against the conventional zones of a zoned block
  device.
* The code for skipping offline zones should now work. Offline zones were not
  skipped in previous versions. I have not yet tested this code - testing this
  change requires access to a real or simulated openchannel SSD.
* Several new unit tests have been added to t/zbc/test-zbc-support.

See also https://github.com/axboe/fio/pull/585.

Best regards,

Bart.


^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2018-08-24 16:46 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-06 17:38 [PATCH 0/2] null_blk: zone support Matias Bjørling
2018-07-06 17:38 ` [PATCH 1/2] null_blk: move shared definitions to header file Matias Bjørling
2018-07-06 17:38 ` [PATCH 2/2] null_blk: add zone support Matias Bjørling
2018-07-06 17:45 ` [PATCH 0/2] null_blk: " Laurence Oberman
2018-07-09  7:54   ` Matias Bjørling
2018-07-09 16:34     ` Jens Axboe
2018-07-10  0:05       ` Bart Van Assche
2018-07-10  0:05         ` Bart Van Assche
2018-07-10 14:46         ` Jens Axboe
2018-07-10 16:47           ` Bart Van Assche
2018-07-10 16:47             ` Bart Van Assche
2018-07-10 18:45             ` Jens Axboe
2018-07-10 18:49               ` Bart Van Assche
2018-07-10 18:49                 ` Bart Van Assche
2018-07-10 18:51                 ` Jens Axboe
2018-08-09 20:51                   ` Zoned block device support for fio (was: [PATCH 0/2] null_blk: zone support) Bart Van Assche
2018-08-09 20:51                     ` Bart Van Assche
2018-08-09 21:03                     ` Zoned block device support for fio Jens Axboe
2018-08-15 18:07                       ` Bart Van Assche
2018-08-15 18:07                         ` Bart Van Assche
2018-07-07  2:54 ` [PATCH 0/2] null_blk: zone support Jens Axboe
2018-08-23 23:00 Zoned block device support for fio Bart Van Assche
2018-08-24 13:48 ` Jens Axboe
2018-08-24 16:42   ` Bart Van Assche
2018-08-24 16:46     ` Jens Axboe

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.