Re: [PATCH V12 2/3] nvmet: add ZBD over ZNS backend support

From: Chaitanya Kulkarni <Chaitanya.Kulkarni@wdc.com>
To: Damien Le Moal <Damien.LeMoal@wdc.com>,
	"linux-nvme@lists.infradead.org" <linux-nvme@lists.infradead.org>
Cc: "hch@lst.de" <hch@lst.de>,
	"kbusch@kernel.org" <kbusch@kernel.org>,
	"sagi@grimberg.me" <sagi@grimberg.me>
Subject: Re: [PATCH V12 2/3] nvmet: add ZBD over ZNS backend support
Date: Fri, 12 Mar 2021 06:29:49 +0000	[thread overview]
Message-ID: <BYAPR04MB49657E04A5017F5C31D4EFE1866F9@BYAPR04MB4965.namprd04.prod.outlook.com> (raw)
In-Reply-To: BL0PR04MB6514B749F68EB412FEC032DDE76F9@BL0PR04MB6514.namprd04.prod.outlook.com

>> -
>> -void nvmet_bdev_ns_disable(struct nvmet_ns *ns)
>> -{
>> -	if (ns->bdev) {
>> -		blkdev_put(ns->bdev, FMODE_WRITE | FMODE_READ);
>> -		ns->bdev = NULL;
>> +	/* bdev_is_zoned() is stubbed out of CONFIG_BLK_DEV_ZONED */
> I do not really understand this comment and I do not think it is useful.
> bdev_is_zoned() is always defined, regardless of CONFIG_BLK_DEV_ZONED. If
> CONFIG_BLK_DEV_ZONED is not set, you will always get false.

Okay, will remove the comment.

>> +
>>  static inline struct nvme_ctrl *
>>  nvmet_req_passthru_ctrl(struct nvmet_req *req)
>>  {
>> diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
>> new file mode 100644
>> index 000000000000..e12629b02320
>> --- /dev/null
>> +++ b/drivers/nvme/target/zns.c
>> @@ -0,0 +1,332 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +/*
>> + * NVMe ZNS-ZBD command implementation.
> s/ZBD/zoned block device
>
> "ZBD" is not necessarilly an obvious acronym to everybody.
>
>> + * Copyright (c) 2020-2021 HGST, a Western Digital Company.
> This should be:
>
> * Copyright (C) 2021 Western Digital Corporation or its affiliates.

Will fix the header.

>> +static inline u8 nvmet_zasl(unsigned int zone_append_sects)
>> +{
>> +	/*
>> +	 * Zone Append Size Limit is the value experessed in the units
>> +	 * of minimum memory page size (i.e. 12) and is reported power of 2.
>> +	 */
>> +	return ilog2((zone_append_sects << 9) >> NVMET_MPSMIN_SHIFT);
> s/9/SECTOR_SHIFT
>
> And you could rewrite this as:
>
> return ilog2(zone_append_sects >> (NVMET_MPSMIN_SHIFT - SECTOR_SHIFT));

SECTOR_SHIFT patches have been NACKed, the reason being that it doesn't make
the code any clearer. How about the following?

return ilog2(zone_append_sects >> (NVMET_MPSMIN_SHIFT - 9));

>
>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
>> +{
>> +	if (nvmet_bdev_has_conv_zones(ns->bdev))
>> +		return false;
>> +
>> +	ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev));
>> +
>> +	if (!nvmet_zns_update_zasl(ns))
>> +		return false;
>> +
>> +	return !(get_capacity(ns->bdev->bd_disk) &
>> +			(bdev_zone_sectors(ns->bdev) - 1));
> It may be good to add a comment above this one as it is not necessarilly
> obvious. Something like:
>
> /*
>  * Generic zoned block devices may have a smaller last zone which is
>  * not supported by ZNS. Excludes zoned drives that have such smaller
>  * last zone.
>  */

Will add above comment.

>
>> +}
>> +
>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>> +{
>> +	u8 zasl = req->sq->ctrl->subsys->zasl;
>> +	struct nvmet_ctrl *ctrl = req->sq->ctrl;
>> +	struct nvme_id_ctrl_zns *id;
>> +	u16 status;
>> +
>> +	if (req->cmd->identify.csi != NVME_CSI_ZNS) {
>> +		req->error_loc = offsetof(struct nvme_common_command, opcode);
>> +		status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
>> +		goto out;
>> +	}
>> +
>> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
>> +	if (!id) {
>> +		status = NVME_SC_INTERNAL;
>> +		goto out;
>> +	}
>> +
>> +	if (ctrl->ops->get_mdts)
>> +		id->zasl = min_t(u8, ctrl->ops->get_mdts(ctrl), zasl);
>> +	else
>> +		id->zasl = zasl;
>> +
>> +	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
>> +
>> +	kfree(id);
>> +out:
>> +	nvmet_req_complete(req, status);
>> +}
>> +
>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>> +{
>> +	struct nvme_id_ns_zns *id_zns;
>> +	u64 zsze;
>> +	u16 status;
>> +
>> +	if (req->cmd->identify.csi != NVME_CSI_ZNS) {
>> +		req->error_loc = offsetof(struct nvme_common_command, opcode);
>> +		status = NVME_SC_INVALID_OPCODE | NVME_SC_DNR;
>> +		goto out;
>> +	}
>> +
>> +	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>> +		goto out;
>> +	}
>> +
>> +	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
>> +	if (!id_zns) {
>> +		status = NVME_SC_INTERNAL;
>> +		goto out;
>> +	}
>> +
>> +	status = nvmet_req_find_ns(req);
>> +	if (status) {
>> +		status = NVME_SC_INTERNAL;
>> +		goto done;
>> +	}
>> +
>> +	if (!bdev_is_zoned(req->ns->bdev)) {
>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>> +		goto done;
>> +	}
>> +
>> +	nvmet_ns_revalidate(req->ns);
>> +	zsze = (bdev_zone_sectors(req->ns->bdev) << 9) >>
>> +					req->ns->blksize_shift;
> s/9/SECTOR_SHIFT

See my earlier reply to the same comment.

>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>> +{
>> +	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zmr.slba);
>> +	u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
>> +	struct nvmet_report_zone_data data = { .ns = req->ns };
>> +	unsigned int nr_zones;
>> +	int reported_zones;
>> +	u16 status;
>> +
>> +	status = nvmet_bdev_zns_checks(req);
>> +	if (status)
>> +		goto out;
>> +
>> +	data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY | __GFP_ZERO);
>> +	if (!data.rz) {
>> +		status = NVME_SC_INTERNAL;
>> +		goto out;
>> +	}
>> +
>> +	nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
>> +			sizeof(struct nvme_zone_descriptor);
>> +	if (!nr_zones) {
>> +		status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
>> +		goto out_free_report_zones;
>> +	}
>> +
>> +	reported_zones = blkdev_report_zones(req->ns->bdev, sect, nr_zones,
>> +					     nvmet_bdev_report_zone_cb, &data);
>> +	if (reported_zones < 0) {
>> +		status = NVME_SC_INTERNAL;
>> +		goto out_free_report_zones;
>> +	}
> There is a problem here: the code as is ignores the request reporting option
> field which can lead to an invalid zone report being returned. I think you need
> to modify nvmet_bdev_report_zone_cb() to look at the reporting option field
> passed by the initiator and filter the zone report since blkdev_report_zones()
> does not handle that argument.

The reporting options are set statically by the host in
nvme_ns_report_zones().
From nvme_ns_report_zones() :-
         c.zmr.zra = NVME_ZRA_ZONE_REPORT;
         c.zmr.zrasf = NVME_ZRASF_ZONE_REPORT_ALL;
         c.zmr.pr = NVME_REPORT_ZONE_PARTIAL;

All the above values are validated in the nvmet_bdev_zns_checks() helper
called from nvmet_bdev_execute_zone_mgmt_recv() before we allocate the
report zone buffer.

1. c.zmr.zra indicates the action which Reports zone descriptor entries
   through the Report Zones data structure.

   We validate that this value has been set to NVME_ZRA_ZONE_REPORT in
   nvmet_bdev_zns_checks(). We call report zones only after checking that
   the zone receive action is NVME_ZRA_ZONE_REPORT, so no filtering is needed
   in nvmet_bdev_report_zone_cb().

2. c.zmr.zrasf indicates the action specific field which is set to
   NVME_ZRASF_ZONE_REPORT_ALL.

   We validate that this value has been set to NVME_ZRASF_ZONE_REPORT_ALL in
   nvmet_bdev_zns_checks(). Since the host wants all the zones, we don't need
   to filter any zone states in nvmet_bdev_report_zone_cb().

3. c.zmr.pr is set to NVME_REPORT_ZONE_PARTIAL, whose value is 1, i.e. the
   Number of Zones field in the Report Zones data structure indicates the
   number of fully transferred zone descriptors in the data buffer, which we
   set from the return value of blkdev_report_zones() :-

   reported_zones = blkdev_report_zones(req->ns->bdev, sect, nr_zones,
					     nvmet_bdev_report_zone_cb, &data);
<snip>   data.rz->nr_zones = cpu_to_le64(reported_zones);

   So no filtering is needed in nvmet_bdev_report_zone_cb() for c.zmr.pr.

Can you please explain what filtering is missing in the current code ?

Maybe I'm looking into an old spec.

>> +
>> +	data.rz->nr_zones = cpu_to_le64(reported_zones);
>> +
>> +	status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
>> +
>> +out_free_report_zones:
>> +	kvfree(data.rz);
>> +out:
>> +	nvmet_req_complete(req, status);
>> +}
>> +
>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>> +{
>> +	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba);
>> +	sector_t nr_sect = bdev_zone_sectors(req->ns->bdev);
>> +	u16 status = NVME_SC_SUCCESS;
>> +	u8 zsa = req->cmd->zms.zsa;
>> +	enum req_opf op;
>> +	int ret;
>> +	const unsigned int zsa_to_op[] = {
>> +		[NVME_ZONE_OPEN]	= REQ_OP_ZONE_OPEN,
>> +		[NVME_ZONE_CLOSE]	= REQ_OP_ZONE_CLOSE,
>> +		[NVME_ZONE_FINISH]	= REQ_OP_ZONE_FINISH,
>> +		[NVME_ZONE_RESET]	= REQ_OP_ZONE_RESET,
>> +	};
>> +
>> +	if (zsa > ARRAY_SIZE(zsa_to_op) || !zsa_to_op[zsa]) {
> What is the point of the "!zsa_to_op[zsa]" here ? All the REQ_OP_ZONE_XXX are
> non 0, always...

Well, this just makes sure that we received a valid action: the sparse
array yields 0 for any value other than the ones listed above, so with the
!zsa_to_op[zsa] check we can return an error.

See nvme_sysfs_show_state() if you want.

>> +		status = NVME_SC_INVALID_FIELD;
>> +		goto out;
>> +	}
>> +
>> +	op = zsa_to_op[zsa];
>> +
>> +	if (req->cmd->zms.select_all)
>> +		nr_sect = get_capacity(req->ns->bdev->bd_disk);
> If select_all is set, sect must be ignored, so you need to have something like this:
>
> 	if (req->cmd->zms.select_all) {
> 		sect = 0;
> 		nr_sect = get_capacity(req->ns->bdev->bd_disk);
> 	} else {
> 		sect = nvmet_lba_to_sect(req->ns, req->cmd->zms.slba);
> 		nr_sect = bdev_zone_sectors(req->ns->bdev);
> 	}
>
> Easier to read. Also may be rename nr_sect to nr_sects (plural).

Okay, will make this change.

>> +
>> +	ret = blkdev_zone_mgmt(req->ns->bdev, op, sect, nr_sect, GFP_KERNEL);
>> +	if (ret)
>> +		status = NVME_SC_INTERNAL;
>> +out:
>> +	nvmet_req_complete(req, status);
>> +}
>> +
>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>> +{
>> +	sector_t sect = nvmet_lba_to_sect(req->ns, req->cmd->rw.slba);
>> +	u16 status = NVME_SC_SUCCESS;
>> +	unsigned int total_len = 0;
>> +	struct scatterlist *sg;
>> +	int ret = 0, sg_cnt;
>> +	struct bio *bio;
>> +
>> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
>> +		return;
> No nvmet_req_complete() call ? Is that done in nvmet_check_transfer_len() ?

Yes, it does. You had the same comment on an earlier version; it can be
confusing, which is why I proposed a helper for the transfer length check and
the !req->sg_cnt check, but we don't want that helper.

>> +
>> +	if (!req->sg_cnt) {
>> +		nvmet_req_complete(req, 0);
>> +		return;
>> +	}
>> +
>> +	if (req->transfer_len <= NVMET_MAX_INLINE_DATA_LEN) {
>> +		bio = &req->b.inline_bio;
>> +		bio_init(bio, req->inline_bvec, ARRAY_SIZE(req->inline_bvec));
>> +	} else {
>> +		bio = bio_alloc(GFP_KERNEL, req->sg_cnt);
>> +	}
>> +
>> +	bio_set_dev(bio, req->ns->bdev);
>> +	bio->bi_iter.bi_sector = sect;
>> +	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>> +	if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA))
>> +		bio->bi_opf |= REQ_FUA;
>> +
>> +	for_each_sg(req->sg, sg, req->sg_cnt, sg_cnt) {
>> +		struct page *p = sg_page(sg);
>> +		unsigned int l = sg->length;
>> +		unsigned int o = sg->offset;
>> +
>> +		ret = bio_add_zone_append_page(bio, p, l, o);
>> +		if (ret != sg->length) {
>> +			status = NVME_SC_INTERNAL;
>> +			goto out_bio_put;
>> +		}
>> +
>> +		total_len += sg->length;
>> +	}
>> +
>> +	if (total_len != nvmet_rw_data_len(req)) {
>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>> +		goto out_bio_put;
>> +	}
>> +
>> +	ret = submit_bio_wait(bio);
> submit_bio_wait() ? Why blocking here ? That would be bad for performance. Is it
> mandatory to block here ? The result handling could be done in the bio_end
> callback no ?

I did initially, but zonefs uses sync I/O. I'm not sure about btrfs; if it
does, please let me know and I'll make it async.

If there is no async caller in the kernel for REQ_OP_ZONE_APPEND, should we
make this async?

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme