Subject: Low md raid1 performance unless forcing to use the VFS layer (io-cmd-file)
From: Mark Ruijter @ 2019-10-17 16:20 UTC
To: linux-nvme

[-- Attachment #1: Type: text/plain, Size: 3514 bytes --]


When I export an md raid1 /dev/mdX device using nvmet, the 4k random write performance is limited to 250K IOPS, even though the same fio workload reaches 850K IOPS when run locally on the target system.
A single raid1 kernel thread consumes 100% of one CPU, and this appears to be the bottleneck.

When I format the /dev/mdX device with XFS and export a file on that filesystem instead, the write performance increases by roughly 300%.
To verify that going through the VFS makes the difference, I wrote a patch (attached) that lets me force a namespace to use the io-cmd-file code even when the backing path is a block device.
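The patch relies on how the nvmet core selects a backend: the block-device backend is tried first, and the file backend is only used as a fallback. Roughly, and paraphrased from memory of the 5.x nvmet core rather than quoted verbatim, the enable path in core.c looks like this:

	/*
	 * Sketch of the backend selection in drivers/nvme/target/core.c
	 * (paraphrased, error handling omitted). Returning -ENOTBLK from
	 * nvmet_bdev_ns_enable() pushes the namespace onto the io-cmd-file
	 * path, which is what the new use_vfs attribute triggers.
	 */
	ret = nvmet_bdev_ns_enable(ns);
	if (ret == -ENOTBLK)
		ret = nvmet_file_ns_enable(ns);
	if (ret)
		goto out_unlock;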

When the NVMe target is configured normally and all I/O is handled by io-cmd-bdev.c, the performance reported by fio is:

fio --name=nvme_tcp --rw=randwrite --bs=4k --filename=/dev/nvme0n1 --numjobs=32 --iodepth=128 --exitall --direct=1 --group_reporting --time_based --runtime=300 --size=32G --ioengine=libaio
nvme_tcp: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=128
...
fio-3.1
Starting 32 processes
^Cbs: 32 (f=32): [w(32)][2.0%][r=0KiB/s,w=638MiB/s][r=0,w=163k IOPS][eta 04m:54s]
fio: terminating on signal 2

nvme_tcp: (groupid=0, jobs=32): err= 0: pid=18081: Thu Oct 17 09:55:26 2019
  write: IOPS=174k, BW=679MiB/s (712MB/s)(4518MiB/6655msec)

On the target system, top shows a single kernel thread running at 100%:
28039 root      20   0       0      0      0 R 100.00 0.000   0:09.52 md1_raid1

With the attached patch applied, the performance goes up considerably. The namespace directory now contains an extra attribute:
/sys/kernel/config/nvmet/subsystems/clr1/namespaces/1 # ls -l
total 0
-rw-r--r-- 1 root root 4096 Oct 17 10:18 ana_grpid
-rw-r--r-- 1 root root 4096 Oct 17 10:18 buffered_io
-rw-r--r-- 1 root root 4096 Oct 17 06:59 device_nguid
-rw-r--r-- 1 root root 4096 Oct 17 09:54 device_path
-rw-r--r-- 1 root root 4096 Oct 17 06:59 device_uuid
-rw-r--r-- 1 root root 4096 Oct 17 09:59 enable
-rw-r--r-- 1 root root 4096 Oct 17 09:59 use_vfs

Note the new attribute use_vfs, which is 0 by default.

/sys/kernel/config/nvmet/subsystems/clr1/namespaces/1 # cat device_path 
/dev/md1
/sys/kernel/config/nvmet/subsystems/clr1/namespaces/1 # echo 0 > enable 
/sys/kernel/config/nvmet/subsystems/clr1/namespaces/1 # echo 1 > use_vfs 
/sys/kernel/config/nvmet/subsystems/clr1/namespaces/1 # echo 1 > enable

fio --name=nvme_tcp --rw=randwrite --bs=4k --filename=/dev/nvme0n1 --numjobs=32 --iodepth=128 --exitall --direct=1 --group_reporting --time_based --runtime=300 --size=32G --ioengine=libaio
nvme_tcp: (g=0): rw=randwrite, bs=(R) 4096B-4096B, (W) 4096B-4096B, (T) 4096B-4096B, ioengine=libaio, iodepth=128
...
fio-3.1
Starting 32 processes
^Cbs: 32 (f=32): [w(32)][8.3%][r=0KiB/s,w=2348MiB/s][r=0,w=601k IOPS][eta 04m:35s]
fio: terminating on signal 2

nvme_tcp: (groupid=0, jobs=32): err= 0: pid=18227: Thu Oct 17 10:00:51 2019
  write: IOPS=640k, BW=2500MiB/s (2622MB/s)(61.0GiB/25374msec)

The target system now shows many kernel threads, each with a low to moderate load (roughly 32% to 70%).

Since the io-cmd-file code enforces direct I/O, the performance should be at most slightly lower than that of io-cmd-bdev.c, which uses the bio interface directly.
However, when writing to an md raid1 the opposite is true.
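For reference, the open path in io-cmd-file.c only skips O_DIRECT when buffered_io is enabled, so the VFS path used here is still direct I/O. A paraphrased sketch of the relevant part of nvmet_file_ns_enable(), not the verbatim source:

	int flags = O_RDWR | O_LARGEFILE;

	/* buffered_io is 0 in all runs above, so O_DIRECT is set */
	if (!ns->buffered_io)
		flags |= O_DIRECT;

	ns->file = filp_open(ns->device_path, flags, 0);
	if (IS_ERR(ns->file))
		return PTR_ERR(ns->file);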

I still have to find out whether the md raid1 kernel module or the nvmet driver is to blame.
However, since going through the VFS 'fixes' the issue, I suspect the problem lies in the nvmet block I/O (io-cmd-bdev) implementation.
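To make the comparison concrete: io-cmd-bdev.c maps the request's scatterlist into bios and calls submit_bio() directly from the nvmet work context, while io-cmd-file.c builds a bvec iov_iter and issues the write through call_write_iter() with a kiocb. A much simplified paraphrase of the bdev write path (multi-bio chaining and error handling omitted):

	sector = le64_to_cpu(req->cmd->rw.slba) << (req->ns->blksize_shift - 9);

	bio = bio_alloc(GFP_KERNEL, min(req->sg_cnt, BIO_MAX_PAGES));
	bio_set_dev(bio, req->ns->bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_end_io = nvmet_bio_done;
	bio_set_op_attrs(bio, REQ_OP_WRITE, 0);

	for_each_sg(req->sg, sg, req->sg_cnt, i)
		bio_add_page(bio, sg_page(sg), sg->length, sg->offset);

	submit_bio(bio);

Both variants end up submitting bios to the same md device, so presumably the difference lies in how and from which context those bios are built and submitted.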

Has anyone seen this issue with md raid1 before? 
Or does anyone have ideas about this problem?

Thank you,
 
Mark Ruijter


[-- Attachment #2: vfs.patch --]
[-- Type: application/octet-stream, Size: 3700 bytes --]

--- configfs.c	2019-10-07 19:01:58.000000000 +0200
+++ /usr/src/packages/BUILD/kernel-5.3.5/drivers/nvme/target/configfs.c	2019-10-14 10:19:17.187624027 +0200
@@ -545,6 +545,34 @@
 
 CONFIGFS_ATTR(nvmet_ns_, buffered_io);
 
+static ssize_t nvmet_ns_use_vfs_show(struct config_item *item, char *page)
+{
+        return sprintf(page, "%d\n", to_nvmet_ns(item)->use_vfs);
+}
+
+static ssize_t nvmet_ns_use_vfs_store(struct config_item *item,
+                const char *page, size_t count)
+{
+        struct nvmet_ns *ns = to_nvmet_ns(item);
+        bool val;
+
+        if (strtobool(page, &val))
+                return -EINVAL;
+
+        mutex_lock(&ns->subsys->lock);
+        if (ns->enabled) {
+                pr_err("disable ns before setting use_vfs value.\n");
+                mutex_unlock(&ns->subsys->lock);
+                return -EINVAL;
+        }
+
+        ns->use_vfs = val;
+        mutex_unlock(&ns->subsys->lock);
+        return count;
+}
+
+CONFIGFS_ATTR(nvmet_ns_, use_vfs);
+
 static struct configfs_attribute *nvmet_ns_attrs[] = {
 	&nvmet_ns_attr_device_path,
 	&nvmet_ns_attr_device_nguid,
@@ -552,6 +580,7 @@
 	&nvmet_ns_attr_ana_grpid,
 	&nvmet_ns_attr_enable,
 	&nvmet_ns_attr_buffered_io,
+	&nvmet_ns_attr_use_vfs,
 #ifdef CONFIG_PCI_P2PDMA
 	&nvmet_ns_attr_p2pmem,
 #endif
--- core.c	2019-10-07 19:01:58.000000000 +0200
+++ /usr/src/packages/BUILD/kernel-5.3.5/drivers/nvme/target/core.c	2019-10-14 10:14:38.686953801 +0200
@@ -653,7 +653,7 @@
 
 	uuid_gen(&ns->uuid);
 	ns->buffered_io = false;
-
+        ns->use_vfs = false;
 	return ns;
 }
 
--- io-cmd-bdev.c	2019-10-07 19:01:58.000000000 +0200
+++ /usr/src/packages/BUILD/kernel-5.3.5/drivers/nvme/target/io-cmd-bdev.c	2019-10-14 15:06:53.347300960 +0200
@@ -51,6 +51,10 @@
 {
 	int ret;
 
+	if (ns->use_vfs) {
+		pr_info("Force using the vfs layer\n");
+		return -ENOTBLK;
+	}
 	ns->bdev = blkdev_get_by_path(ns->device_path,
 			FMODE_READ | FMODE_WRITE, NULL);
 	if (IS_ERR(ns->bdev)) {
--- io-cmd-file.c	2019-10-07 19:01:58.000000000 +0200
+++ /usr/src/packages/BUILD/kernel-5.3.5/drivers/nvme/target/io-cmd-file.c	2019-10-14 15:21:11.590260670 +0200
@@ -31,6 +31,7 @@
 {
 	int flags = O_RDWR | O_LARGEFILE;
 	struct kstat stat;
+	struct block_device *bdev;
 	int ret;
 
 	if (!ns->buffered_io)
@@ -45,16 +46,28 @@
 
 	ret = vfs_getattr(&ns->file->f_path,
 			&stat, STATX_SIZE, AT_STATX_FORCE_SYNC);
-	if (ret)
+	if (ret) {
+		pr_err("failed to stat device file %s\n",
+			ns->device_path);
 		goto err;
+	}
 
 	ns->size = stat.size;
-	/*
-	 * i_blkbits can be greater than the universally accepted upper bound,
-	 * so make sure we export a sane namespace lba_shift.
-	 */
-	ns->blksize_shift = min_t(u8,
-			file_inode(ns->file)->i_blkbits, 12);
+	if (ns->size == 0 && ns->use_vfs) {
+		bdev = blkdev_get_by_path(ns->device_path,
+		                        FMODE_READ | FMODE_WRITE, NULL);
+		if (!IS_ERR(bdev)) {
+        		ns->size = i_size_read(bdev->bd_inode);
+			ns->blksize_shift = blksize_bits(bdev_logical_block_size(bdev));
+                }
+	} else {
+		/*
+		 * i_blkbits can be greater than the universally accepted upper bound,
+		 * so make sure we export a sane namespace lba_shift.
+		 */
+		ns->blksize_shift = min_t(u8,
+				file_inode(ns->file)->i_blkbits, 12);
+	}
 
 	ns->bvec_cache = kmem_cache_create("nvmet-bvec",
 			NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec),
--- nvmet.h	2019-10-07 19:01:58.000000000 +0200
+++ /usr/src/packages/BUILD/kernel-5.3.5/drivers/nvme/target/nvmet.h	2019-10-14 10:13:27.652679601 +0200
@@ -63,6 +63,7 @@
 	u32			anagrpid;
 
 	bool			buffered_io;
+	bool			use_vfs;
 	bool			enabled;
 	struct nvmet_subsys	*subsys;
 	const char		*device_path;
