From: Anuj Gupta <anuj20.g@samsung.com>
To: axboe@kernel.dk, vincentfu@gmail.com
Cc: joshi.k@samsung.com, ankit.kumar@samsung.com,
	fio@vger.kernel.org, Anuj Gupta <anuj20.g@samsung.com>
Subject: [PATCH v2 2/2] t/io_uring: add support for async-passthru
Date: Fri, 26 Aug 2022 17:03:06 +0530
Message-ID: <20220826113306.4139-3-anuj20.g@samsung.com>
In-Reply-To: <20220826113306.4139-1-anuj20.g@samsung.com>

This patch adds support for async-passthru in t/io_uring. The user
needs to pass the -u1 option on the command line; I/O is then issued
via IORING_OP_URING_CMD to the NVMe generic character device
(/dev/ngXnY) instead of going through the block layer. A standalone
reference sketch of the submission path follows the "---" line below.

Example commandline:
t/io_uring -b512 -d128 -c32 -s32 -p0 -F1 -B0 -O0 -n1 -u1 /dev/ng0n1

Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
---
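For reviewers who want to experiment with the mechanics outside of
t/io_uring, here is a minimal standalone sketch of the same submission
path. It is illustrative only, not part of the patch: it assumes
liburing >= 2.2 (which accounts for IORING_SETUP_SQE128/CQE32 when
indexing SQEs and CQEs), a kernel >= 5.19 that exposes struct
nvme_uring_cmd and NVME_URING_CMD_IO via <linux/nvme_ioctl.h>, and it
hardcodes the device path from the example above plus a 4096-byte LBA
format; most error handling is elided.

/*
 * Sketch: read one block from /dev/ng0n1 via IORING_OP_URING_CMD,
 * mirroring what init_io_pt() below does per I/O.
 */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <liburing.h>
#include <linux/nvme_ioctl.h>

#define BS		4096	/* bytes per I/O, assumed == LBA size */
#define LBA_SHIFT	12	/* assumes a 4096-byte LBA format */

int main(void)
{
	struct io_uring_params p = { .flags = IORING_SETUP_SQE128 |
					      IORING_SETUP_CQE32 };
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	struct nvme_uring_cmd *cmd;
	void *buf;
	int fd, nsid;

	fd = open("/dev/ng0n1", O_RDONLY);
	if (fd < 0)
		return 1;
	/* the char device reports its namespace-id, as in nvme_get_info() */
	nsid = ioctl(fd, NVME_IOCTL_ID);
	if (nsid < 0 || posix_memalign(&buf, 4096, BS))
		return 1;
	if (io_uring_queue_init_params(8, &ring, &p))
		return 1;

	sqe = io_uring_get_sqe(&ring);
	memset(sqe, 0, 2 * sizeof(*sqe));	/* clear the full 128-byte SQE */
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->fd = fd;
	sqe->cmd_op = NVME_URING_CMD_IO;

	/* the NVMe command is carried in the big-SQE command area */
	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	cmd->opcode = 2;			/* nvme_cmd_read */
	cmd->nsid = nsid;
	cmd->addr = (unsigned long)buf;
	cmd->data_len = BS;
	cmd->cdw10 = 0;				/* slba, low 32 bits */
	cmd->cdw11 = 0;				/* slba, high 32 bits */
	cmd->cdw12 = (BS >> LBA_SHIFT) - 1;	/* 0-based LBA count */

	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	printf("cqe res=%d\n", cqe->res);	/* 0 on success */
	io_uring_cqe_seen(&ring, cqe);
	io_uring_queue_exit(&ring);
	return 0;
}

The patch itself does not use liburing; it sizes and mmap()s the rings
by hand in setup_ring() and derives nsid/lba_shift per file in
get_file_size(), so treat the above purely as a reference for the SQE
and command layout.
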
 t/io_uring.c | 238 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 230 insertions(+), 8 deletions(-)

diff --git a/t/io_uring.c b/t/io_uring.c
index a42abd46..4e1d617f 100644
--- a/t/io_uring.c
+++ b/t/io_uring.c
@@ -35,6 +35,7 @@
 #include "../lib/rand.h"
 #include "../minmax.h"
 #include "../os/linux/io_uring.h"
+#include "../engines/nvme.h"
 
 struct io_sq_ring {
 	unsigned *head;
@@ -67,6 +68,8 @@ struct file {
 	unsigned long max_size;
 	unsigned long cur_off;
 	unsigned pending_ios;
+	unsigned int nsid;	/* namespace-id, used for nvme passthrough */
+	unsigned int lba_shift;	/* log2(LBA size), used for nvme passthrough */
 	int real_fd;
 	int fixed_fd;
 	int fileno;
@@ -139,6 +142,7 @@ static int random_io = 1;	/* random or sequential IO */
 static int register_ring = 1;	/* register ring */
 static int use_sync = 0;	/* use preadv2 */
 static int numa_placement = 0;	/* set to node of device */
+static int pt = 0;		/* passthrough I/O or not */
 
 static unsigned long tsc_rate;
 
@@ -161,6 +165,54 @@ struct io_uring_map_buffers {
 };
 #endif
 
+static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
+			 enum nvme_csi csi, void *data)
+{
+	struct nvme_passthru_cmd cmd = {
+		.opcode         = nvme_admin_identify,
+		.nsid           = nsid,
+		.addr           = (__u64)(uintptr_t)data,
+		.data_len       = NVME_IDENTIFY_DATA_SIZE,
+		.cdw10          = cns,
+		.cdw11          = csi << NVME_IDENTIFY_CSI_SHIFT,
+		.timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
+	};
+
+	return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
+}
+
+static int nvme_get_info(int fd, __u32 *nsid, __u32 *lba_sz, __u64 *nlba)
+{
+	struct nvme_id_ns ns;
+	int namespace_id;
+	int err;
+
+	namespace_id = ioctl(fd, NVME_IOCTL_ID);
+	if (namespace_id < 0) {
+		fprintf(stderr, "error failed to fetch namespace-id\n");
+		close(fd);
+		return -errno;
+	}
+
+	/*
+	 * Identify the namespace to get the namespace size in LBAs
+	 * and the LBA data size.
+	 */
+	err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS,
+				NVME_CSI_NVM, &ns);
+	if (err) {
+		fprintf(stderr, "error failed to fetch identify namespace\n");
+		close(fd);
+		return err;
+	}
+
+	*nsid = namespace_id;
+	*lba_sz = 1 << ns.lbaf[(ns.flbas & 0x0f)].ds;
+	*nlba = ns.nsze;
+
+	return 0;
+}
+
 static unsigned long cycles_to_nsec(unsigned long cycles)
 {
 	uint64_t val;
@@ -520,6 +572,65 @@ static void init_io(struct submitter *s, unsigned index)
 		sqe->user_data |= ((uint64_t)s->clock_index << 32);
 }
 
+static void init_io_pt(struct submitter *s, unsigned index)
+{
+	struct io_uring_sqe *sqe = &s->sqes[index << 1];
+	unsigned long offset;
+	struct file *f;
+	struct nvme_uring_cmd *cmd;
+	unsigned long long slba;
+	unsigned long long nlb;
+	long r;
+
+	if (s->nr_files == 1) {
+		f = &s->files[0];
+	} else {
+		f = &s->files[s->cur_file];
+		if (f->pending_ios >= file_depth(s)) {
+			s->cur_file++;
+			if (s->cur_file == s->nr_files)
+				s->cur_file = 0;
+			f = &s->files[s->cur_file];
+		}
+	}
+	f->pending_ios++;
+
+	if (random_io) {
+		r = __rand64(&s->rand_state);
+		offset = (r % (f->max_blocks - 1)) * bs;
+	} else {
+		offset = f->cur_off;
+		f->cur_off += bs;
+		if (f->cur_off + bs > f->max_size)
+			f->cur_off = 0;
+	}
+
+	if (register_files) {
+		sqe->fd = f->fixed_fd;
+		sqe->flags = IOSQE_FIXED_FILE;
+	} else {
+		sqe->fd = f->real_fd;
+		sqe->flags = 0;
+	}
+	sqe->opcode = IORING_OP_URING_CMD;
+	sqe->user_data = (unsigned long) f->fileno;
+	if (stats)
+		sqe->user_data |= ((unsigned long)s->clock_index << 32);
+	sqe->cmd_op = NVME_URING_CMD_IO;
+	slba = offset >> f->lba_shift;
+	nlb = (bs >> f->lba_shift) - 1;
+	cmd = (struct nvme_uring_cmd *)&sqe->cmd;
+	/* cdw10 and cdw11 hold the starting LBA (slba) */
+	cmd->cdw10 = slba & 0xffffffff;
+	cmd->cdw11 = slba >> 32;
+	/* cdw12 holds the 0-based number of LBAs to read */
+	cmd->cdw12 = nlb;
+	cmd->addr = (unsigned long) s->iovecs[index].iov_base;
+	cmd->data_len = bs;
+	cmd->nsid = f->nsid;
+	cmd->opcode = 2; /* nvme_cmd_read */
+}
+
 static int prep_more_ios_uring(struct submitter *s, int max_ios)
 {
 	struct io_sq_ring *ring = &s->sq_ring;
@@ -532,7 +643,10 @@ static int prep_more_ios_uring(struct submitter *s, int max_ios)
 			break;
 
 		index = tail & sq_ring_mask;
-		init_io(s, index);
+		if (pt)
+			init_io_pt(s, index);
+		else
+			init_io(s, index);
 		ring->array[index] = index;
 		prepped++;
 		tail = next_tail;
@@ -549,7 +663,29 @@ static int get_file_size(struct file *f)
 
 	if (fstat(f->real_fd, &st) < 0)
 		return -1;
-	if (S_ISBLK(st.st_mode)) {
+	if (pt) {
+		__u64 nlba;
+		__u32 lbs;
+		int ret;
+
+		if (!S_ISCHR(st.st_mode)) {
+			fprintf(stderr, "passthrough works with only nvme-ns "
+					"generic devices (/dev/ngXnY)\n");
+			return -1;
+		}
+		ret = nvme_get_info(f->real_fd, &f->nsid, &lbs, &nlba);
+		if (ret)
+			return -1;
+		if ((bs % lbs) != 0) {
+			printf("error: bs:%d should be a multiple logical_block_size:%d\n",
+					bs, lbs);
+			return -1;
+		}
+		f->lba_shift = ilog2(lbs);
+		f->max_size = nlba << f->lba_shift;
+		f->max_blocks = f->max_size / bs;
+		return 0;
+	} else if (S_ISBLK(st.st_mode)) {
 		unsigned long long bytes;
 
 		if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
@@ -620,6 +756,60 @@ static int reap_events_uring(struct submitter *s)
 	return reaped;
 }
 
+static int reap_events_uring_pt(struct submitter *s)
+{
+	struct io_cq_ring *ring = &s->cq_ring;
+	struct io_uring_cqe *cqe;
+	unsigned head, reaped = 0;
+	int last_idx = -1, stat_nr = 0;
+	unsigned index;
+	int fileno;
+
+	head = *ring->head;
+	do {
+		struct file *f;
+
+		read_barrier();
+		if (head == atomic_load_acquire(ring->tail))
+			break;
+		index = head & cq_ring_mask;
+		cqe = &ring->cqes[index << 1];
+		fileno = cqe->user_data & 0xffffffff;
+		f = &s->files[fileno];
+		f->pending_ios--;
+
+		if (cqe->res != 0) {
+			printf("io: unexpected ret=%d\n", cqe->res);
+			if (polled && cqe->res == -EINVAL)
+				printf("passthrough doesn't support polled IO\n");
+			return -1;
+		}
+		if (stats) {
+			int clock_index = cqe->user_data >> 32;
+
+			if (last_idx != clock_index) {
+				if (last_idx != -1) {
+					add_stat(s, last_idx, stat_nr);
+					stat_nr = 0;
+				}
+				last_idx = clock_index;
+			}
+			stat_nr++;
+		}
+		reaped++;
+		head++;
+	} while (1);
+
+	if (stat_nr)
+		add_stat(s, last_idx, stat_nr);
+
+	if (reaped) {
+		s->inflight -= reaped;
+		atomic_store_release(ring->head, head);
+	}
+	return reaped;
+}
+
 static void set_affinity(struct submitter *s)
 {
 #ifdef CONFIG_LIBNUMA
@@ -697,6 +887,7 @@ static int setup_ring(struct submitter *s)
 	struct io_uring_params p;
 	int ret, fd;
 	void *ptr;
+	size_t len;
 
 	memset(&p, 0, sizeof(p));
 
@@ -709,6 +900,10 @@ static int setup_ring(struct submitter *s)
 			p.sq_thread_cpu = sq_thread_cpu;
 		}
 	}
+	if (pt) {
+		p.flags |= IORING_SETUP_SQE128;
+		p.flags |= IORING_SETUP_CQE32;
+	}
 
 	fd = io_uring_setup(depth, &p);
 	if (fd < 0) {
@@ -761,11 +956,22 @@ static int setup_ring(struct submitter *s)
 	sring->array = ptr + p.sq_off.array;
 	sq_ring_mask = *sring->ring_mask;
 
-	s->sqes = mmap(0, p.sq_entries * sizeof(struct io_uring_sqe),
+	if (p.flags & IORING_SETUP_SQE128)
+		len = 2 * p.sq_entries * sizeof(struct io_uring_sqe);
+	else
+		len = p.sq_entries * sizeof(struct io_uring_sqe);
+	s->sqes = mmap(0, len,
 			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
 			IORING_OFF_SQES);
 
-	ptr = mmap(0, p.cq_off.cqes + p.cq_entries * sizeof(struct io_uring_cqe),
+	if (p.flags & IORING_SETUP_CQE32) {
+		len = p.cq_off.cqes +
+			2 * p.cq_entries * sizeof(struct io_uring_cqe);
+	} else {
+		len = p.cq_off.cqes +
+			p.cq_entries * sizeof(struct io_uring_cqe);
+	}
+	ptr = mmap(0, len,
 			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE, fd,
 			IORING_OFF_CQ_RING);
 	cring->head = ptr + p.cq_off.head;
@@ -856,7 +1062,16 @@ static int submitter_init(struct submitter *s)
 		s->plat = NULL;
 		nr_batch = 0;
 	}
+	/*
+	 * Do the expensive passthrough command setup here, not in the fast path.
+	 */
+	if (pt) {
+		for (i = 0; i < roundup_pow2(depth); i++) {
+			struct io_uring_sqe *sqe = &s->sqes[i << 1];
 
+			memset(&sqe->cmd, 0, sizeof(struct nvme_uring_cmd));
+		}
+	}
 	return nr_batch;
 }
 
@@ -1112,7 +1327,10 @@ submit:
 		do {
 			int r;
 
-			r = reap_events_uring(s);
+			if (pt)
+				r = reap_events_uring_pt(s);
+			else
+				r = reap_events_uring(s);
 			if (r == -1) {
 				s->finish = 1;
 				break;
@@ -1306,11 +1524,12 @@ static void usage(char *argv, int status)
 		" -a <bool> : Use legacy aio, default %d\n"
 		" -S <bool> : Use sync IO (preadv2), default %d\n"
 		" -X <bool> : Use registered ring %d\n"
-		" -P <bool> : Automatically place on device home node %d\n",
+		" -P <bool> : Automatically place on device home node %d\n"
+		" -u <bool> : Use nvme-passthrough I/O, default %d\n",
 		argv, DEPTH, BATCH_SUBMIT, BATCH_COMPLETE, BS, polled,
 		fixedbufs, dma_map, register_files, nthreads, !buffered, do_nop,
 		stats, runtime == 0 ? "unlimited" : runtime_str, random_io, aio,
-		use_sync, register_ring, numa_placement);
+		use_sync, register_ring, numa_placement, pt);
 	exit(status);
 }
 
@@ -1369,7 +1588,7 @@ int main(int argc, char *argv[])
 	if (!do_nop && argc < 2)
 		usage(argv[0], 1);
 
-	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:X:S:P:h?")) != -1) {
+	while ((opt = getopt(argc, argv, "d:s:c:b:p:B:F:n:N:O:t:T:a:r:D:R:X:S:P:u:h?")) != -1) {
 		switch (opt) {
 		case 'a':
 			aio = !!atoi(optarg);
@@ -1450,6 +1669,9 @@ int main(int argc, char *argv[])
 		case 'P':
 			numa_placement = !!atoi(optarg);
 			break;
+		case 'u':
+			pt = !!atoi(optarg);
+			break;
 		case 'h':
 		case '?':
 		default:
-- 
2.25.1

