From: Damien Le Moal <Damien.LeMoal@wdc.com>
To: Chaitanya Kulkarni <Chaitanya.Kulkarni@wdc.com>,
	"linux-block@vger.kernel.org" <linux-block@vger.kernel.org>,
	"linux-nvme@lists.infradead.org" <linux-nvme@lists.infradead.org>
Cc: "hch@lst.de" <hch@lst.de>, "sagi@grimberg.me" <sagi@grimberg.me>
Subject: Re: [PATCH V2 2/9] nvmet: add ZNS support for bdev-ns
Date: Tue, 1 Dec 2020 05:44:30 +0000	[thread overview]
Message-ID: <CH2PR04MB6522A475CD3F0F4D123E913BE7F40@CH2PR04MB6522.namprd04.prod.outlook.com> (raw)
In-Reply-To: BYAPR04MB49654A4ECB46346E8445D8AA86F40@BYAPR04MB4965.namprd04.prod.outlook.com

On 2020/12/01 12:38, Chaitanya Kulkarni wrote:
[...]
>>> +	 * namespaces.
>>> +	 */
>>> +	min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
>>> +
>>> +	xa_for_each(&(ns->subsys->namespaces), idx, ins) {
>>> +		struct request_queue *iq = ins->bdev->bd_disk->queue;
>>> +		unsigned int imax_za_sects = queue_max_zone_append_sectors(iq);
>>> +		u8 izasl = nvmet_zasl(imax_za_sects);
>>> +
>>> +		if (!bdev_is_zoned(ins->bdev))
>>> +			continue;
>>> +
>>> +		min_zasl = min_zasl > izasl ? izasl : min_zasl;
>>> +	}
>>> +
>>> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
>> I do not understand what bio_max_zasl is used for here, as it does not represent
>> anything related to the zoned namespaces backends.
> If we don't consider the bio max zasl then we will get requests larger than
> what one bio can handle for zone append, which will lead to a bio split that
> needs to be avoided.

Why ? zasl comes from the target backend max zone append sectors, which we
already know is OK and does not lead to BIO splitting for zone append. So why do
you need an extra verification against that bio_max_zasl ?
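
In other words, the zasl update could be reduced to something like the
following (a rough sketch only, reusing the helpers and variables already
present in this hunk; not a tested change):

	u8 zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
	struct nvmet_ns *ins;
	unsigned long idx;

	ns->zasl = zasl;

	xa_for_each(&ns->subsys->namespaces, idx, ins) {
		struct request_queue *iq = ins->bdev->bd_disk->queue;

		if (!bdev_is_zoned(ins->bdev))
			continue;

		/* min over all zoned namespaces, no bio_max_zasl clamp */
		zasl = min_t(u8, zasl,
			     nvmet_zasl(queue_max_zone_append_sectors(iq)));
	}

	ns->subsys->id_ctrl_zns.zasl = zasl;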

>>> +}> +
>>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
>>> +{
>>> +	if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
>>> +		pr_err("block devices with conventional zones are not supported.");
>>> +		return false;
>>> +	}
>> It may be better to move this test in nvmet_bdev_validate_zns_zones(), so that
>> all backend zone configuration checks are together.
> Okay.
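
Something along these lines would keep all of the backend zone configuration
checks in one place (a hypothetical sketch only; the real body of
nvmet_bdev_validate_zns_zones() is not visible in this hunk):

	static bool nvmet_bdev_validate_zns_zones(struct nvmet_ns *ns)
	{
		if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
			pr_err("block devices with conventional zones are not supported.\n");
			return false;
		}

		/* ... existing per-zone validation of the backend device ... */
		return true;
	}
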
>>> +
>>> +	if (!nvmet_bdev_validate_zns_zones(ns))
>>> +		return false;
>>> +
>>> +	/*
>>> +	 * For ZBC and ZAC devices, writes into sequential zones must be aligned
>>> +	 * to the device physical block size. So use this value as the logical
>>> +	 * block size to avoid errors.
>>> +	 */
>>> +	ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
>>> +
>>> +	nvmet_zns_update_zasl(ns);
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +/*
>>> + * ZNS related Admin and I/O command handlers.
>>> + */
>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>> +{
>>> +	struct nvmet_ctrl *ctrl = req->sq->ctrl;
>>> +	struct nvme_id_ctrl_zns *id;
>>> +	u16 status = 0;
>>> +	u8 mdts;
>>> +
>>> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
>>> +	if (!id) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	/*
>>> +	 * Even though this function sets Zone Append Size Limit to 0,
>>> +	 * the 0 value here indicates that the maximum data transfer size for
>>> +	 * the Zone Append command is indicated by the ctrl
>>> +	 * Maximum Data Transfer Size (MDTS).
>>> +	 */
>> I do not understand this comment. It does not really exactly match what the code
>> below is doing.
> Let me fix the code and the comment in next version.
>>> +
>>> +	mdts = ctrl->ops->get_mdts ? ctrl->ops->get_mdts(ctrl) : 0;
>>> +
>>> +	id->zasl = min_t(u8, mdts, req->sq->ctrl->subsys->id_ctrl_zns.zasl);
>>> +
>>> +	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
>>> +
>>> +	kfree(id);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>> +{
>>> +	struct nvme_id_ns_zns *id_zns;
>>> +	u16 status = 0;
>>> +	u64 zsze;
>>> +
>>> +	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
>>> +	if (!id_zns) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
>>> +	if (!req->ns) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto done;
>>> +	}
>>> +
>>> +	if (!bdev_is_zoned(nvmet_bdev(req))) {
>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>> +		goto done;
>>> +	}
>>> +
>>> +	nvmet_ns_revalidate(req->ns);
>>> +	zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
>>> +					req->ns->blksize_shift;
>>> +	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
>>> +	id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
>>> +	id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
>>> +
>>> +done:
>>> +	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
>>> +	kfree(id_zns);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +struct nvmet_report_zone_data {
>>> +	struct nvmet_ns *ns;
>>> +	struct nvme_zone_report *rz;
>>> +};
>>> +
>>> +static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx,
>>> +				     void *data)
>>> +{
>>> +	struct nvmet_report_zone_data *report_zone_data = data;
>>> +	struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
>>> +	struct nvmet_ns *ns = report_zone_data->ns;
>>> +
>>> +	entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
>>> +	entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
>>> +	entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
>>> +	entries[idx].za = z->reset ? 1 << 2 : 0;
>>> +	entries[idx].zt = z->type;
>>> +	entries[idx].zs = z->cond << 4;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>> +{
>>> +	u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
>> size_t ?
> Yes.
>>> +	struct nvmet_report_zone_data data = { .ns = req->ns };
>>> +	struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(zmr->slba));
>>> +	unsigned int nr_zones = bufsize / nvmet_zones_to_desc_size(1);
>> I think this is wrong since nvmet_zones_to_desc_size includes sizeof(struct
>> nvme_zone_report). I think what you want here is:
>>
>> nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
>> 	sizeof(struct nvme_zone_descriptor);
>>
>> And then you can get rid of nvmet_zones_to_desc_size();
> Yes, nvmet_zones_to_desc_size() from v1 is not needed anymore.
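
Putting the two comments (size_t and the nr_zones computation) together, the
declarations would then look roughly like this (a sketch, not the final patch):

	size_t bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
	unsigned int nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
				sizeof(struct nvme_zone_descriptor);
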
>>> +	int reported_zones;
>>> +	u16 status;
>>> +
>>> +	status = nvmet_bdev_zns_checks(req);
>>> +	if (status)
>>> +		goto out;
>>> +
>>> +	data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>>> +	if (!data.rz) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nr_zones,
>>> +					     nvmet_bdev_report_zone_cb,
>>> +					     &data);
>>> +	if (reported_zones < 0) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out_free_report_zones;
>>> +	}
>>> +
>>> +	data.rz->nr_zones = cpu_to_le64(reported_zones);
>>> +
>>> +	status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
>>> +
>>> +out_free_report_zones:
>>> +	kvfree(data.rz);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>> +{
>>> +	sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
>>> +	struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
>>> +	enum req_opf op = REQ_OP_LAST;
>>> +	u16 status = NVME_SC_SUCCESS;
>>> +	sector_t sect;
>>> +	int ret;
>>> +
>>> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
>>> +
>>> +	switch (c->zsa) {
>>> +	case NVME_ZONE_OPEN:
>>> +		op = REQ_OP_ZONE_OPEN;
>>> +		break;
>>> +	case NVME_ZONE_CLOSE:
>>> +		op = REQ_OP_ZONE_CLOSE;
>>> +		break;
>>> +	case NVME_ZONE_FINISH:
>>> +		op = REQ_OP_ZONE_FINISH;
>>> +		break;
>>> +	case NVME_ZONE_RESET:
>>> +		if (c->select_all)
>>> +			nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
>>> +		op = REQ_OP_ZONE_RESET;
>>> +		break;
>>> +	default:
>>> +		status = NVME_SC_INVALID_FIELD;
>>> +		goto out;
>>> +	}
>>> +
>>> +	ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
>>> +	if (ret)
>>> +		status = NVME_SC_INTERNAL;
>>> +
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>> +{
>>> +	unsigned long bv_cnt = req->sg_cnt;
>>> +	int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>>> +	u64 slba = le64_to_cpu(req->cmd->rw.slba);
>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, slba);
>>> +	u16 status = NVME_SC_SUCCESS;
>>> +	size_t mapped_data_len = 0;
>>> +	int sg_cnt = req->sg_cnt;
>>> +	struct scatterlist *sg;
>>> +	struct iov_iter from;
>>> +	struct bio_vec *bvec;
>>> +	size_t mapped_cnt;
>>> +	struct bio *bio;
>>> +	int ret;
>>> +
>>> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
>>> +		return;
>>> +
>>> +	/*
>>> +	 * When setting the ctrl->zasl we consider the BIO_MAX_PAGES so that we
>>> +	 * don't have to split the bio, i.e. we shouldn't get
>>> +	 * sg_cnt > BIO_MAX_PAGES since zasl on the host will limit the I/Os
>>> +	 * with the size that considers the BIO_MAX_PAGES.
>>> +	 */
>> What ? Unclear... You advertise max zone append as ctrl->zasl = min(all ns max
>> zone append sectors). What does BIO_MAX_PAGES have to do with anything ?
> That is not true, ctrl->zasl is advertised as
> min(min(all ns max zone append sectors), bio_max_zasl) to avoid
> splitting of the bio on the target side:-

We already know that the zasl for each NS, given by the max zone append
sectors of the backends, is already OK, regardless of BIO_MAX_PAGES (see below).

> 
> See nvmet_zns_update_zasl()
> 
> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
> 
> Without considering the bio_max_pages we may have to set the ctrl->zasl
> value to something > bio_max_pages_zasl; then we'll get a request that is
> greater than what one bio can handle, which will lead to splitting the
> bios, something we want to avoid as per the comment on V1.
> 
> Can you please explain what is wrong with this approach which regulates the
> zasl with all the possible namespaces zasl and bio_zasl?

For starters, with multi-page bvecs now, I am not sure BIO_MAX_PAGES makes any
sense anymore. Next, on the target side, the max zone append sectors limit of
each NS guarantees that a single BIO can be used without any splitting needed.
Taking the min of all of them will NOT remove that guarantee. Hence I think
that BIO_MAX_PAGES has nothing to do in the middle of all this.

> 
> Maybe there is a better way?
>>> +	if (!req->sg_cnt)
>>> +		goto out;
>>> +
>>> +	if (WARN_ON(req->sg_cnt > BIO_MAX_PAGES)) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
>>> +	if (!bvec) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
>>> +		nvmet_file_init_bvec(bvec, sg);
>>> +		mapped_data_len += bvec[mapped_cnt].bv_len;
>>> +		sg_cnt--;
>>> +		if (mapped_cnt == bv_cnt)
>>> +			break;
>>> +	}
>>> +
>>> +	if (WARN_ON(sg_cnt)) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
>>> +
>>> +	bio = bio_alloc(GFP_KERNEL, bv_cnt);
>>> +	bio_set_dev(bio, nvmet_bdev(req));
>>> +	bio->bi_iter.bi_sector = sect;
>>> +	bio->bi_opf = op;
>> The op variable is used only here. It is not necessary.
> Yes.
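
I.e. the flags can simply be assigned in place, without the intermediate
variable (sketch):

	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
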
>>> +
>>> +	ret =  __bio_iov_append_get_pages(bio, &from);
>> I still do not see why bio_iov_iter_get_pages() does not work here. Would you
>> care to explain please ?
>>
> The same reason pointed out by Johannes. Instead of calling the wrapper,
> call the underlying core API, since we don't care about the rest of the
> generic code. This also avoids an extra function call for the zone-append
> fast path.

I am still not convinced that __bio_iov_append_get_pages() will do the right
thing as it lacks the loop that bio_iov_iter_get_pages() has.
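
For reference, the wrapper drives the per-pass helper in a loop along these
lines (a simplified sketch of the upstream code at the time, not a verbatim
copy):

	do {
		/* one pass: add as many pages as fit in the bio */
		ret = __bio_iov_append_get_pages(bio, &from);
	} while (!ret && iov_iter_count(&from) && !bio_full(bio, 0));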

>>> +	if (unlikely(ret)) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		bio_io_error(bio);
>>> +		goto bvec_free;
>>> +	}
>>> +
>>> +	ret = submit_bio_wait(bio);
>>> +	status = ret < 0 ? NVME_SC_INTERNAL : status;
>>> +	bio_put(bio);
>>> +
>>> +	sect += (mapped_data_len >> 9);
>>> +	req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
>>> +
>>> +bvec_free:
>>> +	kfree(bvec);
>>> +
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +#else  /* CONFIG_BLK_DEV_ZONED */
>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>> +{
>>> +}
>>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
>>> +{
>>> +	return 0;
>>> +}
>>> +bool nvmet_bdev_zns_config(struct nvmet_ns *ns)
>>> +{
>>> +	return false;
>>> +}
>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
>>> +{
>>> +}
>>> +#endif /* CONFIG_BLK_DEV_ZONED */
>>>
>>
> 
> 


-- 
Damien Le Moal
Western Digital Research

Thread overview: 44+ messages
2020-11-30  3:29 [PATCH 0/9] nvmet: add genblk ZBD backend Chaitanya Kulkarni
2020-11-30  3:29 ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 1/9] block: export __bio_iov_append_get_pages() Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 2/9] nvmet: add ZNS support for bdev-ns Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30 11:57   ` Damien Le Moal
2020-11-30 11:57     ` Damien Le Moal
2020-12-01  3:38     ` Chaitanya Kulkarni
2020-12-01  3:38       ` Chaitanya Kulkarni
2020-12-01  5:44       ` Damien Le Moal [this message]
2020-12-01  5:44         ` Damien Le Moal
2020-12-01 22:37         ` Chaitanya Kulkarni
2020-12-01 22:37           ` Chaitanya Kulkarni
2020-11-30 12:29   ` Johannes Thumshirn
2020-11-30 12:29     ` Johannes Thumshirn
2020-12-01  3:49     ` Chaitanya Kulkarni
2020-12-01  3:49       ` Chaitanya Kulkarni
2020-12-01  7:51       ` Johannes Thumshirn
2020-12-01  7:51         ` Johannes Thumshirn
2020-12-01 22:36         ` Chaitanya Kulkarni
2020-12-01 22:36           ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 3/9] nvmet: trim down id-desclist to use req->ns Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 4/9] nvmet: add NVME_CSI_ZNS in ns-desc for zbdev Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 5/9] nvmet: add cns-cs-ctrl in id-ctrl for ZNS bdev Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 6/9] nvmet: add cns-cs-ns " Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 7/9] nvmet: add zns cmd effects to support zbdev Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 8/9] nvmet: add zns bdev config support Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30 12:02   ` Damien Le Moal
2020-11-30 12:02     ` Damien Le Moal
2020-12-01  3:40     ` Chaitanya Kulkarni
2020-12-01  3:40       ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 9/9] nvmet: add ZNS based I/O cmds handlers Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  6:51 ` [PATCH 0/9] nvmet: add genblk ZBD backend Damien Le Moal
2020-11-30  6:51   ` Damien Le Moal
2020-12-01  3:42   ` Chaitanya Kulkarni
2020-12-01  3:42     ` Chaitanya Kulkarni
