From: Mark Ruijter

With reference to the following issue reported on the mailing list :-

http://lists.infradead.org/pipermail/linux-nvme/2019-October/027604.html

This patch adds a new attribute, use_vfs, so that any block device can be
used with the file backend. With the setup described in the link, we can
see the following performance improvement in the I/Os when the new
attribute use_vfs=1 and device_path is configured as /dev/md0.

Performance numbers :-

1. With this patch using /dev/md0 as namespace backend where use_vfs=0 :-

  write: IOPS=66.1k, BW=258MiB/s (271MB/s)(7750MiB/30002msec)
  write: IOPS=65.8k, BW=257MiB/s (269MB/s)(7709MiB/30002msec)
  write: IOPS=64.8k, BW=253MiB/s (266MB/s)(7599MiB/30002msec)

2. With this patch using /dev/md0 as namespace backend where use_vfs=1 :-

  write: IOPS=153k, BW=598MiB/s (627MB/s)(17.5GiB/30001msec)
  write: IOPS=152k, BW=594MiB/s (623MB/s)(17.4GiB/30001msec)
  write: IOPS=151k, BW=589MiB/s (617MB/s)(17.2GiB/30002msec)

Signed-off-by: Mark Ruijter
Signed-off-by: Chaitanya Kulkarni
---
Hi,

This work was originally done by Mark Ruijter (MRuijter@onestopsystems.com).
I've fixed a couple of coding style issues, tested the patch, and validated
the performance numbers with nvme-loop.

Setup Info :- md0 with 2 memory-backed null_blk devices :-

# lsblk | grep null
nullb1          252:1    0    2G  0 disk
└─nullb1p1      259:1    0    2G  0 part
nullb0          252:0    0    2G  0 disk
└─nullb0p1      259:2    0    2G  0 part

# mdadm -E /dev/nullb0
/dev/nullb0:
   MBR Magic : aa55
Partition[0] :      4192256 sectors at         2048 (type fd)

# mdadm -E /dev/nullb1
/dev/nullb1:
   MBR Magic : aa55
Partition[0] :      4192256 sectors at         2048 (type fd)

# mdadm --detail /dev/md0
/dev/md0:
           Version : 1.2
     Creation Time : Tue Oct 22 15:45:48 2019
        Raid Level : raid1
        Array Size : 2095104 (2046.00 MiB 2145.39 MB)
     Used Dev Size : 2095104 (2046.00 MiB 2145.39 MB)
      Raid Devices : 2
     Total Devices : 2
       Persistence : Superblock is persistent

       Update Time : Tue Oct 22 23:22:22 2019
             State : clean
    Active Devices : 2
   Working Devices : 2
    Failed Devices : 0
     Spare Devices : 0

Consistency Policy : resync

              Name : cvenusqemu:0  (local to host cvenusqemu)
              UUID : 28141eb1:94d31044:e2692981:08ccd882
            Events : 17

    Number   Major   Minor   RaidDevice State
       0     259        2        0      active sync   /dev/nullb0p1
       1     259        1        1      active sync   /dev/nullb1p1

Performance numbers :-

1. With this patch using /dev/md0 as namespace backend where use_vfs=0 :-

  write: IOPS=66.1k, BW=258MiB/s (271MB/s)(7750MiB/30002msec)
  write: IOPS=65.8k, BW=257MiB/s (269MB/s)(7709MiB/30002msec)
  write: IOPS=64.8k, BW=253MiB/s (266MB/s)(7599MiB/30002msec)

2. With this patch using /dev/md0 as namespace backend where use_vfs=1 :-

  write: IOPS=153k, BW=598MiB/s (627MB/s)(17.5GiB/30001msec)
  write: IOPS=152k, BW=594MiB/s (623MB/s)(17.4GiB/30001msec)
  write: IOPS=151k, BW=589MiB/s (617MB/s)(17.2GiB/30002msec)

We can see a significant performance improvement when use_vfs=1.

Note :- I've not tested the entire patch with all the corner cases. Once I
get feedback, I'll send out a well-tested version.
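For completeness, the new attribute is meant to be written through the nvmet
configfs tree while the namespace is still disabled, since the store handler
rejects the write once the namespace is enabled. A minimal usage sketch,
assuming an already created subsystem and namespace ("testnqn" and namespace
id 1 are placeholders here, and the port wiring is omitted) :-

# cd /sys/kernel/config/nvmet/subsystems/testnqn/namespaces/1
# echo -n /dev/md0 > device_path
# echo 1 > use_vfs
# echo 1 > enable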
Regards,
-Chaitanya
---
 drivers/nvme/target/configfs.c    | 29 +++++++++++++++++++++++++++++
 drivers/nvme/target/core.c        |  1 +
 drivers/nvme/target/io-cmd-bdev.c |  5 +++++
 drivers/nvme/target/io-cmd-file.c | 31 +++++++++++++++++++++----------
 drivers/nvme/target/nvmet.h       |  1 +
 5 files changed, 57 insertions(+), 10 deletions(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 98613a45bd3b..184555c19c03 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -545,12 +545,41 @@ static ssize_t nvmet_ns_buffered_io_store(struct config_item *item,
 
 CONFIGFS_ATTR(nvmet_ns_, buffered_io);
 
+static ssize_t nvmet_ns_use_vfs_show(struct config_item *item, char *page)
+{
+	return sprintf(page, "%d\n", to_nvmet_ns(item)->use_vfs);
+}
+
+static ssize_t nvmet_ns_use_vfs_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_ns *ns = to_nvmet_ns(item);
+	bool val;
+
+	if (strtobool(page, &val))
+		return -EINVAL;
+
+	mutex_lock(&ns->subsys->lock);
+	if (ns->enabled) {
+		pr_err("disable ns before setting use_vfs value.\n");
+		mutex_unlock(&ns->subsys->lock);
+		return -EINVAL;
+	}
+
+	ns->use_vfs = val;
+	mutex_unlock(&ns->subsys->lock);
+	return count;
+}
+
+CONFIGFS_ATTR(nvmet_ns_, use_vfs);
+
 static struct configfs_attribute *nvmet_ns_attrs[] = {
 	&nvmet_ns_attr_device_path,
 	&nvmet_ns_attr_device_nguid,
 	&nvmet_ns_attr_device_uuid,
 	&nvmet_ns_attr_ana_grpid,
 	&nvmet_ns_attr_enable,
+	&nvmet_ns_attr_use_vfs,
 	&nvmet_ns_attr_buffered_io,
 #ifdef CONFIG_PCI_P2PDMA
 	&nvmet_ns_attr_p2pmem,
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 6b39cfc6ade1..1d7c6310d5f0 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -653,6 +653,7 @@ struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid)
 
 	uuid_gen(&ns->uuid);
 	ns->buffered_io = false;
+	ns->use_vfs = false;
 
 	return ns;
 }
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index f2618dc2ef3a..e0d8079de5c3 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -51,6 +51,11 @@ int nvmet_bdev_ns_enable(struct nvmet_ns *ns)
 {
 	int ret;
 
+	if (ns->use_vfs) {
+		pr_info("Force using the vfs layer\n");
+		return -ENOTBLK;
+	}
+
 	ns->bdev = blkdev_get_by_path(ns->device_path,
 			FMODE_READ | FMODE_WRITE, NULL);
 	if (IS_ERR(ns->bdev)) {
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 05453f5d1448..336ffda3261b 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -30,6 +30,7 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns)
 
 int nvmet_file_ns_enable(struct nvmet_ns *ns)
 {
 	int flags = O_RDWR | O_LARGEFILE;
+	struct block_device *bdev;
 	struct kstat stat;
 	int ret;
 
@@ -45,17 +46,27 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns)
 
 	ret = vfs_getattr(&ns->file->f_path, &stat, STATX_SIZE,
 			AT_STATX_FORCE_SYNC);
-	if (ret)
-		goto err;
-
-	ns->size = stat.size;
-	/*
-	 * i_blkbits can be greater than the universally accepted upper bound,
-	 * so make sure we export a sane namespace lba_shift.
-	 */
-	ns->blksize_shift = min_t(u8,
-			file_inode(ns->file)->i_blkbits, 12);
+	if (ret) {
+		pr_err("failed to stat device file %s\n",
+			ns->device_path);
+	}
+	if (stat.size == 0 && ns->use_vfs) {
+		bdev = blkdev_get_by_path(ns->device_path,
+				FMODE_READ | FMODE_WRITE, NULL);
+		if (IS_ERR(bdev))
+			goto err;
+		ns->size = i_size_read(bdev->bd_inode);
+		ns->blksize_shift = blksize_bits(bdev_logical_block_size(bdev));
+	} else {
+		/*
+		 * i_blkbits can be greater than the universally accepted upper
+		 * bound, so make sure we export a sane namespace lba_shift.
+		 */
+		ns->size = stat.size;
+		ns->blksize_shift = min_t(u8,
+				file_inode(ns->file)->i_blkbits, 12);
+	}
 
 	ns->bvec_cache = kmem_cache_create("nvmet-bvec", NVMET_MAX_MPOOL_BVEC *
 			sizeof(struct bio_vec), 0, SLAB_HWCACHE_ALIGN, NULL);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index c51f8dd01dc4..20aa83077765 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -63,6 +63,7 @@ struct nvmet_ns {
 	u32			anagrpid;
 
 	bool			buffered_io;
+	bool			use_vfs;
 	bool			enabled;
 	struct nvmet_subsys	*subsys;
 	const char		*device_path;
-- 
2.22.1