Re: [PATCH 2/2] t/io_uring: add support for async-passthru

From: Anuj Gupta <anuj20.g@samsung.com>
To: Kanchan Joshi <joshi.k@samsung.com>
Cc: axboe@kernel.dk, vincentfu@gmail.com, ankit.kumar@samsung.com,
	fio@vger.kernel.org
Subject: Re: [PATCH 2/2] t/io_uring: add support for async-passthru
Date: Thu, 25 Aug 2022 17:09:52 +0530	[thread overview]
Message-ID: <20220825113952.GB16653@test-zns> (raw)
In-Reply-To: <20220825082444.GB22496@test-zns>

[-- Attachment #1: Type: text/plain, Size: 7596 bytes --]

On Thu, Aug 25, 2022 at 01:54:44PM +0530, Kanchan Joshi wrote:
> On Wed, Aug 24, 2022 at 03:01:09PM +0530, Anuj Gupta wrote:
> > This patch adds support for async-passthru in t/io_uring. User needs to
> > specify -u1 option in the command
> > 
> > Example commandline:
> > t/io_uring -b512 -d128 -c32 -s32 -p0 -F1 -B0 -O0 -n1 -u1 /dev/ng0n1
> > 
> > Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
> > ---
> > t/io_uring.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++---
> > 1 file changed, 138 insertions(+), 8 deletions(-)
> > 
> > diff --git a/t/io_uring.c b/t/io_uring.c
> > index a42abd46..78fedaf1 100644
> > --- a/t/io_uring.c
> > +++ b/t/io_uring.c
> > @@ -35,6 +35,7 @@
> > #include "../lib/rand.h"
> > #include "../minmax.h"
> > #include "../os/linux/io_uring.h"
> > +#include "../engines/nvme.h"
> > 
> > struct io_sq_ring {
> > 	unsigned *head;
> > @@ -139,6 +140,9 @@ static int random_io = 1;	/* random or sequential IO */
> > static int register_ring = 1;	/* register ring */
> > static int use_sync = 0;	/* use preadv2 */
> > static int numa_placement = 0;	/* set to node of device */
> > +static int pt = 0;		/* passthrough I/O or not */
> > +static unsigned int nsid = 0;	/* nsid field required for nvme-passthrough */
> > +static unsigned int lbs = 0;	/* logical_block_size field required for nvme-passthrough */
> > 
> > static unsigned long tsc_rate;
> > 
> > @@ -161,6 +165,54 @@ struct io_uring_map_buffers {
> > };
> > #endif
> > 
> > +static int nvme_identify(int fd, __u32 nsid, enum nvme_identify_cns cns,
> > +			 enum nvme_csi csi, void *data)
> > +{
> > +	struct nvme_passthru_cmd cmd = {
> > +		.opcode         = nvme_admin_identify,
> > +		.nsid           = nsid,
> > +		.addr           = (__u64)(uintptr_t)data,
> > +		.data_len       = NVME_IDENTIFY_DATA_SIZE,
> > +		.cdw10          = cns,
> > +		.cdw11          = csi << NVME_IDENTIFY_CSI_SHIFT,
> > +		.timeout_ms     = NVME_DEFAULT_IOCTL_TIMEOUT,
> > +	};
> > +
> > +	return ioctl(fd, NVME_IOCTL_ADMIN_CMD, &cmd);
> > +}
> > +
> > +static int nvme_get_info(int fd, __u32 *nsid, __u32 *lba_sz, __u64 *nlba)
> > +{
> > +	struct nvme_id_ns ns;
> > +	int namespace_id;
> > +	int err;
> > +
> > +	namespace_id = ioctl(fd, NVME_IOCTL_ID);
> > +	if (namespace_id < 0) {
> > +		fprintf(stderr, "error failed to fetch namespace-id\n");
> > +		close(fd);
> > +		return -errno;
> > +	}
> > +
> > +	/*
> > +	 * Identify namespace to get namespace-id, namespace size in LBA's
> > +	 * and LBA data size.
> > +	 */
> > +	err = nvme_identify(fd, namespace_id, NVME_IDENTIFY_CNS_NS,
> > +				NVME_CSI_NVM, &ns);
> > +	if (err) {
> > +		fprintf(stderr, "error failed to fetch identify namespace\n");
> > +		close(fd);
> > +		return err;
> > +	}
> > +
> > +	*nsid = namespace_id;
> > +	*lba_sz = 1 << ns.lbaf[(ns.flbas & 0x0f)].ds;
> > +	*nlba = ns.nsze;
> > +
> > +	return 0;
> > +}
> > +
> > static unsigned long cycles_to_nsec(unsigned long cycles)
> > {
> > 	uint64_t val;
> > @@ -457,12 +509,13 @@ static unsigned file_depth(struct submitter *s)
> > 
> > static void init_io(struct submitter *s, unsigned index)
> > {
> > -	struct io_uring_sqe *sqe = &s->sqes[index];
> > +	struct io_uring_sqe *sqe = NULL;
> > 	unsigned long offset;
> > 	struct file *f;
> > 	long r;
> > 
> > 	if (do_nop) {
> > +		sqe = &s->sqes[index];
> 
> Nit: this and previous change is not must as <see below>
> 

got it, will change this

> > 		sqe->opcode = IORING_OP_NOP;
> > 		return;
> > 	}
> > @@ -490,6 +543,41 @@ static void init_io(struct submitter *s, unsigned index)
> > 			f->cur_off = 0;
> > 	}
> > 
> > +	if (pt) {
> > +		struct nvme_uring_cmd *cmd;
> > +		unsigned long long slba;
> > +		unsigned long long nlb;
> > +
> > +		sqe = &s->sqes[index<<1];
> 
> this is enough.
> 
> > +		if (register_files) {
> > +			sqe->fd = f->fixed_fd;
> > +			sqe->flags = IOSQE_FIXED_FILE;
> > +		} else {
> > +			sqe->fd = f->real_fd;
> > +			sqe->flags = 0;
> > +		}
> > +		sqe->opcode = IORING_OP_URING_CMD;
> > +		sqe->user_data = (unsigned long) f->fileno;
> > +		if (stats)
> > +			sqe->user_data |= ((unsigned long)s->clock_index << 32);
> > +		sqe->cmd_op = NVME_URING_CMD_IO;
> > +		slba = offset / lbs;
> > +		nlb = bs/lbs - 1;
> > +		cmd = (struct nvme_uring_cmd *)&sqe->cmd;
> > +		memset(cmd, 0, sizeof(struct nvme_uring_cmd));
> > +		/* cdw10 and cdw11 represent starting slba*/
> > +		cmd->cdw10 = slba & 0xffffffff;
> > +		cmd->cdw11 = slba >> 32;
> > +		/* cdw12 represent number of lba to be read*/
> > +		cmd->cdw12 = nlb;
> > +		cmd->addr = (unsigned long) s->iovecs[index].iov_base;
> > +		cmd->data_len = bs;
> > +		cmd->nsid = nsid;
> > +		cmd->opcode = 2;
> > +		return;
> > +	}
> > +
> > +	sqe = &s->sqes[index];
> > 	if (register_files) {
> > 		sqe->flags = IOSQE_FIXED_FILE;
> > 		sqe->fd = f->fixed_fd;
> > @@ -549,7 +637,22 @@ static int get_file_size(struct file *f)
> > 
> > 	if (fstat(f->real_fd, &st) < 0)
> > 		return -1;
> > -	if (S_ISBLK(st.st_mode)) {
> > +	if (pt) {
> > +		__u64 nlba;
> > +		int ret;
> > +
> > +		if (!S_ISCHR(st.st_mode)) {
> > +			fprintf(stderr, "passthrough works with only nvme-ns "
> > +					"generic devices (/dev/ngXnY)\n");
> > +			return -1;
> > +		}
> > +		ret = nvme_get_info(f->real_fd, &nsid, &lbs, &nlba);
> > +		if (ret)
> > +			return -1;
> > +		f->max_blocks = nlba / bs;
> > +		f->max_size = nlba;
> > +		return 0;
> > +	} else if (S_ISBLK(st.st_mode)) {
> > 		unsigned long long bytes;
> > 
> > 		if (ioctl(f->real_fd, BLKGETSIZE64, &bytes) != 0)
> > @@ -587,11 +690,14 @@ static int reap_events_uring(struct submitter *s)
> > 
> > 			f = &s->files[fileno];
> > 			f->pending_ios--;
> > -			if (cqe->res != bs) {
> > +			if (!pt && cqe->res != bs) {
> > 				printf("io: unexpected ret=%d\n", cqe->res);
> > 				if (polled && cqe->res == -EOPNOTSUPP)
> > 					printf("Your filesystem/driver/kernel doesn't support polled IO\n");
> > 				return -1;
> > +			} else if (pt && cqe->res != 0) {
> > +				printf("io: unexpected ret=%d\n", cqe->res);
> > +				return -1;
> 
> Missing inner check (polled one) for pt case?

right, adding a check for polling makes sense.

> How about something like this fragement -
> 
>                if (!do_nop) {
>                        int fileno = cqe->user_data & 0xffffffff;
> +                       bool error;
> 
>                        f = &s->files[fileno];
>                        f->pending_ios--;
> -                       if (!pt && cqe->res != bs) {
> +                       if (!pt)
> +                               error = (cqe->res != bs);
> +                       else
> +                               error = (cqe->res != 0);
> +                       if (error) {
>                                printf("io: unexpected ret=%d\n", cqe->res);
>                                if (polled && cqe->res == -EOPNOTSUPP)
>                                        printf("Your filesystem/driver/kernel doesn't support polled IO\n");
>                                return -1;
> 
> [...]
>

I think, the result is -EINVAL rather than -EOPNOTSUPP, if the user enables polling for passthru i/o.
With that we can check for polled-io for passthru

> > @@ -1512,6 +1638,10 @@ int main(int argc, char *argv[])
> > 			printf("failed getting size of device/file\n");
> > 			return 1;
> > 		}
> > +		if (pt && bs < lbs) {
> > +			printf("error: bs:%d is less than logical_block_size:%d\n", bs, lbs);
> > +			return 1;
> > +		}
> 
> would it be better to replace the check/print with "bs % lbs" instead?
> since io size needs to be multiple of lbs to go through.
> 

right, will modify the check accordingly

--
Anuj Gupta

[-- Attachment #2: Type: text/plain, Size: 0 bytes --]