From: Damien Le Moal <Damien.LeMoal@wdc.com>
To: Chaitanya Kulkarni <Chaitanya.Kulkarni@wdc.com>,
	"linux-block@vger.kernel.org" <linux-block@vger.kernel.org>,
	"linux-nvme@lists.infradead.org" <linux-nvme@lists.infradead.org>
Cc: "hch@lst.de" <hch@lst.de>, "sagi@grimberg.me" <sagi@grimberg.me>
Subject: Re: [PATCH V2 2/9] nvmet: add ZNS support for bdev-ns
Date: Tue, 1 Dec 2020 05:44:30 +0000	[thread overview]
Message-ID: <CH2PR04MB6522A475CD3F0F4D123E913BE7F40@CH2PR04MB6522.namprd04.prod.outlook.com> (raw)
In-Reply-To: BYAPR04MB49654A4ECB46346E8445D8AA86F40@BYAPR04MB4965.namprd04.prod.outlook.com

On 2020/12/01 12:38, Chaitanya Kulkarni wrote:
[...]
>>> +	 * namespaces.
>>> +	 */
>>> +	min_zasl = ns->zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
>>> +
>>> +	xa_for_each(&(ns->subsys->namespaces), idx, ins) {
>>> +		struct request_queue *iq = ins->bdev->bd_disk->queue;
>>> +		unsigned int imax_za_sects = queue_max_zone_append_sectors(iq);
>>> +		u8 izasl = nvmet_zasl(imax_za_sects);
>>> +
>>> +		if (!bdev_is_zoned(ins->bdev))
>>> +			continue;
>>> +
>>> +		min_zasl = min_zasl > izasl ? izasl : min_zasl;
>>> +	}
>>> +
>>> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
>> I do not understand what bio_max_zasl is used for here, as it does not represent
>> anything related to the zoned namespaces backends.
> If we don't consider the bio max zasl then we will get requests larger than
> what one bio can handle for zone append, which will lead to a bio split that
> needs to be avoided.

Why ? zasl comes from the target backend max zone append sectors, which we
already know is OK and does not lead to BIO splitting for zone append. So why do
you need an extra verification against that bio_max_zasl ?
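
In other words, the zasl update could be reduced to something like the
following (a rough sketch only, reusing the helpers and variables already
present in this hunk; not a tested change):

	u8 zasl = nvmet_zasl(queue_max_zone_append_sectors(q));
	struct nvmet_ns *ins;
	unsigned long idx;

	ns->zasl = zasl;

	xa_for_each(&ns->subsys->namespaces, idx, ins) {
		struct request_queue *iq = ins->bdev->bd_disk->queue;

		if (!bdev_is_zoned(ins->bdev))
			continue;

		/* min over all zoned namespaces, no bio_max_zasl clamp */
		zasl = min_t(u8, zasl,
			     nvmet_zasl(queue_max_zone_append_sectors(iq)));
	}

	ns->subsys->id_ctrl_zns.zasl = zasl;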

>>> +}> +
>>> +bool nvmet_bdev_zns_enable(struct nvmet_ns *ns)
>>> +{
>>> +	if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
>>> +		pr_err("block devices with conventional zones are not supported.");
>>> +		return false;
>>> +	}
>> It may be better to move this test in nvmet_bdev_validate_zns_zones(), so that
>> all backend zone configuration checks are together.
> Okay.
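
Something along these lines would keep all of the backend zone configuration
checks in one place (a hypothetical sketch only; the real body of
nvmet_bdev_validate_zns_zones() is not visible in this hunk):

	static bool nvmet_bdev_validate_zns_zones(struct nvmet_ns *ns)
	{
		if (ns->bdev->bd_disk->queue->conv_zones_bitmap) {
			pr_err("block devices with conventional zones are not supported.\n");
			return false;
		}

		/* ... existing per-zone validation of the backend device ... */
		return true;
	}
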
>>> +
>>> +	if (!nvmet_bdev_validate_zns_zones(ns))
>>> +		return false;
>>> +
>>> +	/*
>>> +	 * For ZBC and ZAC devices, writes into sequential zones must be aligned
>>> +	 * to the device physical block size. So use this value as the logical
>>> +	 * block size to avoid errors.
>>> +	 */
>>> +	ns->blksize_shift = blksize_bits(bdev_physical_block_size(ns->bdev));
>>> +
>>> +	nvmet_zns_update_zasl(ns);
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +/*
>>> + * ZNS related Admin and I/O command handlers.
>>> + */
>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>> +{
>>> +	struct nvmet_ctrl *ctrl = req->sq->ctrl;
>>> +	struct nvme_id_ctrl_zns *id;
>>> +	u16 status = 0;
>>> +	u8 mdts;
>>> +
>>> +	id = kzalloc(sizeof(*id), GFP_KERNEL);
>>> +	if (!id) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	/*
>>> +	 * Even though this function sets Zone Append Size Limit to 0,
>>> +	 * the 0 value here indicates that the maximum data transfer size for
>>> +	 * the Zone Append command is indicated by the ctrl
>>> +	 * Maximum Data Transfer Size (MDTS).
>>> +	 */
>> I do not understand this comment. It does not really exactly match what the code
>> below is doing.
> Let me fix the code and the comment in next version.
>>> +
>>> +	mdts = ctrl->ops->get_mdts ? ctrl->ops->get_mdts(ctrl) : 0;
>>> +
>>> +	id->zasl = min_t(u8, mdts, req->sq->ctrl->subsys->id_ctrl_zns.zasl);
>>> +
>>> +	status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
>>> +
>>> +	kfree(id);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>> +{
>>> +	struct nvme_id_ns_zns *id_zns;
>>> +	u16 status = 0;
>>> +	u64 zsze;
>>> +
>>> +	if (le32_to_cpu(req->cmd->identify.nsid) == NVME_NSID_ALL) {
>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	id_zns = kzalloc(sizeof(*id_zns), GFP_KERNEL);
>>> +	if (!id_zns) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	req->ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid);
>>> +	if (!req->ns) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto done;
>>> +	}
>>> +
>>> +	if (!bdev_is_zoned(nvmet_bdev(req))) {
>>> +		req->error_loc = offsetof(struct nvme_identify, nsid);
>>> +		status = NVME_SC_INVALID_NS | NVME_SC_DNR;
>>> +		goto done;
>>> +	}
>>> +
>>> +	nvmet_ns_revalidate(req->ns);
>>> +	zsze = (bdev_zone_sectors(nvmet_bdev(req)) << 9) >>
>>> +					req->ns->blksize_shift;
>>> +	id_zns->lbafe[0].zsze = cpu_to_le64(zsze);
>>> +	id_zns->mor = cpu_to_le32(bdev_max_open_zones(nvmet_bdev(req)));
>>> +	id_zns->mar = cpu_to_le32(bdev_max_active_zones(nvmet_bdev(req)));
>>> +
>>> +done:
>>> +	status = nvmet_copy_to_sgl(req, 0, id_zns, sizeof(*id_zns));
>>> +	kfree(id_zns);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +struct nvmet_report_zone_data {
>>> +	struct nvmet_ns *ns;
>>> +	struct nvme_zone_report *rz;
>>> +};
>>> +
>>> +static int nvmet_bdev_report_zone_cb(struct blk_zone *z, unsigned int idx,
>>> +				     void *data)
>>> +{
>>> +	struct nvmet_report_zone_data *report_zone_data = data;
>>> +	struct nvme_zone_descriptor *entries = report_zone_data->rz->entries;
>>> +	struct nvmet_ns *ns = report_zone_data->ns;
>>> +
>>> +	entries[idx].zcap = cpu_to_le64(nvmet_sect_to_lba(ns, z->capacity));
>>> +	entries[idx].zslba = cpu_to_le64(nvmet_sect_to_lba(ns, z->start));
>>> +	entries[idx].wp = cpu_to_le64(nvmet_sect_to_lba(ns, z->wp));
>>> +	entries[idx].za = z->reset ? 1 << 2 : 0;
>>> +	entries[idx].zt = z->type;
>>> +	entries[idx].zs = z->cond << 4;
>>> +
>>> +	return 0;
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>> +{
>>> +	u32 bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
>> size_t ?
> Yes.
>>> +	struct nvmet_report_zone_data data = { .ns = req->ns };
>>> +	struct nvme_zone_mgmt_recv_cmd *zmr = &req->cmd->zmr;
>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(zmr->slba));
>>> +	unsigned int nr_zones = bufsize / nvmet_zones_to_desc_size(1);
>> I think this is wrong since nvmet_zones_to_desc_size includes sizeof(struct
>> nvme_zone_report). I think what you want here is:
>>
>> nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
>> 	sizeof(struct nvme_zone_descriptor);
>>
>> And then you can get rid of nvmet_zones_to_desc_size();
> Yes, nvmet_zones_to_desc_size() from v1 is not needed anymore.
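
Putting the two comments (size_t and the nr_zones computation) together, the
declarations would then look roughly like this (a sketch, not the final patch):

	size_t bufsize = (le32_to_cpu(req->cmd->zmr.numd) + 1) << 2;
	unsigned int nr_zones = (bufsize - sizeof(struct nvme_zone_report)) /
				sizeof(struct nvme_zone_descriptor);
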
>>> +	int reported_zones;
>>> +	u16 status;
>>> +
>>> +	status = nvmet_bdev_zns_checks(req);
>>> +	if (status)
>>> +		goto out;
>>> +
>>> +	data.rz = __vmalloc(bufsize, GFP_KERNEL | __GFP_NORETRY);
>>> +	if (!data.rz) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out;
>>> +	}
>>> +
>>> +	reported_zones = blkdev_report_zones(nvmet_bdev(req), sect, nr_zones,
>>> +					     nvmet_bdev_report_zone_cb,
>>> +					     &data);
>>> +	if (reported_zones < 0) {
>>> +		status = NVME_SC_INTERNAL;
>>> +		goto out_free_report_zones;
>>> +	}
>>> +
>>> +	data.rz->nr_zones = cpu_to_le64(reported_zones);
>>> +
>>> +	status = nvmet_copy_to_sgl(req, 0, data.rz, bufsize);
>>> +
>>> +out_free_report_zones:
>>> +	kvfree(data.rz);
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>> +{
>>> +	sector_t nr_sect = bdev_zone_sectors(nvmet_bdev(req));
>>> +	struct nvme_zone_mgmt_send_cmd *c = &req->cmd->zms;
>>> +	enum req_opf op = REQ_OP_LAST;
>>> +	u16 status = NVME_SC_SUCCESS;
>>> +	sector_t sect;
>>> +	int ret;
>>> +
>>> +	sect = nvmet_lba_to_sect(req->ns, le64_to_cpu(req->cmd->zms.slba));
>>> +
>>> +	switch (c->zsa) {
>>> +	case NVME_ZONE_OPEN:
>>> +		op = REQ_OP_ZONE_OPEN;
>>> +		break;
>>> +	case NVME_ZONE_CLOSE:
>>> +		op = REQ_OP_ZONE_CLOSE;
>>> +		break;
>>> +	case NVME_ZONE_FINISH:
>>> +		op = REQ_OP_ZONE_FINISH;
>>> +		break;
>>> +	case NVME_ZONE_RESET:
>>> +		if (c->select_all)
>>> +			nr_sect = get_capacity(nvmet_bdev(req)->bd_disk);
>>> +		op = REQ_OP_ZONE_RESET;
>>> +		break;
>>> +	default:
>>> +		status = NVME_SC_INVALID_FIELD;
>>> +		goto out;
>>> +	}
>>> +
>>> +	ret = blkdev_zone_mgmt(nvmet_bdev(req), op, sect, nr_sect, GFP_KERNEL);
>>> +	if (ret)
>>> +		status = NVME_SC_INTERNAL;
>>> +
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>> +{
>>> +	unsigned long bv_cnt = req->sg_cnt;
>>> +	int op = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
>>> +	u64 slba = le64_to_cpu(req->cmd->rw.slba);
>>> +	sector_t sect = nvmet_lba_to_sect(req->ns, slba);
>>> +	u16 status = NVME_SC_SUCCESS;
>>> +	size_t mapped_data_len = 0;
>>> +	int sg_cnt = req->sg_cnt;
>>> +	struct scatterlist *sg;
>>> +	struct iov_iter from;
>>> +	struct bio_vec *bvec;
>>> +	size_t mapped_cnt;
>>> +	struct bio *bio;
>>> +	int ret;
>>> +
>>> +	if (!nvmet_check_transfer_len(req, nvmet_rw_data_len(req)))
>>> +		return;
>>> +
>>> +	/*
>>> +	 * When setting the ctrl->zasl we consider the BIO_MAX_PAGES so that we
>>> +	 * don't have to split the bio, i.e. we shouldn't get
>>> +	 * sg_cnt > BIO_MAX_PAGES since zasl on the host will limit the I/Os
>>> +	 * with the size that considers the BIO_MAX_PAGES.
>>> +	 */
>> What ? Unclear... You advertise max zone append as ctrl->zasl = min(all ns max
>> zone append sectors). What does BIO_MAX_PAGES have to do with anything ?
> That is not true, ctrl->zasl is advertised as
> min(min(all ns max zone append sectors), bio_max_zasl) to avoid
> splitting of the bio on the target side:-

We already know that the zasl for each NS, given by the max zone append
sectors of the backends, is already OK, regardless of BIO_MAX_PAGES (see below).

> 
> See nvmet_zns_update_zasl()
> 
> +	ns->subsys->id_ctrl_zns.zasl = min_t(u8, min_zasl, bio_max_zasl);
> 
> Without considering the bio_max_pages we may have to set the ctrl->zasl
> value to something > bio_max_pages_zasl; then we'll get a request that is
> greater than what one bio can handle, which will lead to splitting the
> bios, something we want to avoid as per the comment on V1.
> 
> Can you please explain what is wrong with this approach which regulates the
> zasl with all the possible namespaces zasl and bio_zasl?

For starters, with multi-page bvecs now, I am not sure BIO_MAX_PAGES makes any
sense anymore. Next, on the target side, the max zone append sectors limit of
each NS guarantees that a single BIO can be used without any splitting needed.
Taking the min of all of them will NOT remove that guarantee. Hence I think
that BIO_MAX_PAGES has nothing to do in the middle of all this.

> 
> Maybe there is a better way?
>>> +	if (!req->sg_cnt)
>>> +		goto out;
>>> +
>>> +	if (WARN_ON(req->sg_cnt > BIO_MAX_PAGES)) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	bvec = kmalloc_array(bv_cnt, sizeof(*bvec), GFP_KERNEL);
>>> +	if (!bvec) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	for_each_sg(req->sg, sg, req->sg_cnt, mapped_cnt) {
>>> +		nvmet_file_init_bvec(bvec, sg);
>>> +		mapped_data_len += bvec[mapped_cnt].bv_len;
>>> +		sg_cnt--;
>>> +		if (mapped_cnt == bv_cnt)
>>> +			break;
>>> +	}
>>> +
>>> +	if (WARN_ON(sg_cnt)) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		goto out;
>>> +	}
>>> +
>>> +	iov_iter_bvec(&from, WRITE, bvec, mapped_cnt, mapped_data_len);
>>> +
>>> +	bio = bio_alloc(GFP_KERNEL, bv_cnt);
>>> +	bio_set_dev(bio, nvmet_bdev(req));
>>> +	bio->bi_iter.bi_sector = sect;
>>> +	bio->bi_opf = op;
>> The op variable is used only here. It is not necessary.
> Yes.
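
I.e. the flags can simply be assigned in place, without the intermediate
variable (sketch):

	bio->bi_opf = REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE;
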
>>> +
>>> +	ret =  __bio_iov_append_get_pages(bio, &from);
>> I still do not see why bio_iov_iter_get_pages() does not work here. Would you
>> care to explain please ?
>>
> The same reason pointed out by Johannes. Instead of calling the wrapper,
> call the underlying core API, since we don't care about the rest of the
> generic code. This also avoids an extra function call for the zone-append
> fast path.

I am still not convinced that __bio_iov_append_get_pages() will do the right
thing as it lacks the loop that bio_iov_iter_get_pages() has.
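
For reference, the wrapper drives the per-pass helper in a loop along these
lines (a simplified sketch of the upstream code at the time, not a verbatim
copy):

	do {
		/* one pass: add as many pages as fit in the bio */
		ret = __bio_iov_append_get_pages(bio, &from);
	} while (!ret && iov_iter_count(&from) && !bio_full(bio, 0));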

>>> +	if (unlikely(ret)) {
>>> +		status = NVME_SC_INTERNAL | NVME_SC_DNR;
>>> +		bio_io_error(bio);
>>> +		goto bvec_free;
>>> +	}
>>> +
>>> +	ret = submit_bio_wait(bio);
>>> +	status = ret < 0 ? NVME_SC_INTERNAL : status;
>>> +	bio_put(bio);
>>> +
>>> +	sect += (mapped_data_len >> 9);
>>> +	req->cqe->result.u64 = le64_to_cpu(nvmet_sect_to_lba(req->ns, sect));
>>> +
>>> +bvec_free:
>>> +	kfree(bvec);
>>> +
>>> +out:
>>> +	nvmet_req_complete(req, status);
>>> +}
>>> +
>>> +#else  /* CONFIG_BLK_DEV_ZONED */
>>> +void nvmet_execute_identify_cns_cs_ctrl(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_execute_identify_cns_cs_ns(struct nvmet_req *req)
>>> +{
>>> +}
>>> +u16 nvmet_process_zns_cis(struct nvmet_req *req, off_t *off)
>>> +{
>>> +	return 0;
>>> +}
>>> +bool nvmet_bdev_zns_config(struct nvmet_ns *ns)
>>> +{
>>> +	return false;
>>> +}
>>> +void nvmet_bdev_execute_zone_mgmt_recv(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_bdev_execute_zone_mgmt_send(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_bdev_execute_zone_append(struct nvmet_req *req)
>>> +{
>>> +}
>>> +void nvmet_zns_add_cmd_effects(struct nvme_effects_log *log)
>>> +{
>>> +}
>>> +#endif /* CONFIG_BLK_DEV_ZONED */
>>>
>>
> 
> 


-- 
Damien Le Moal
Western Digital Research

Thread overview: 44+ messages
2020-11-30  3:29 [PATCH 0/9] nvmet: add genblk ZBD backend Chaitanya Kulkarni
2020-11-30  3:29 ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 1/9] block: export __bio_iov_append_get_pages() Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 2/9] nvmet: add ZNS support for bdev-ns Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30 11:57   ` Damien Le Moal
2020-11-30 11:57     ` Damien Le Moal
2020-12-01  3:38     ` Chaitanya Kulkarni
2020-12-01  3:38       ` Chaitanya Kulkarni
2020-12-01  5:44       ` Damien Le Moal [this message]
2020-12-01  5:44         ` Damien Le Moal
2020-12-01 22:37         ` Chaitanya Kulkarni
2020-12-01 22:37           ` Chaitanya Kulkarni
2020-11-30 12:29   ` Johannes Thumshirn
2020-11-30 12:29     ` Johannes Thumshirn
2020-12-01  3:49     ` Chaitanya Kulkarni
2020-12-01  3:49       ` Chaitanya Kulkarni
2020-12-01  7:51       ` Johannes Thumshirn
2020-12-01  7:51         ` Johannes Thumshirn
2020-12-01 22:36         ` Chaitanya Kulkarni
2020-12-01 22:36           ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 3/9] nvmet: trim down id-desclist to use req->ns Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 4/9] nvmet: add NVME_CSI_ZNS in ns-desc for zbdev Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 5/9] nvmet: add cns-cs-ctrl in id-ctrl for ZNS bdev Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 6/9] nvmet: add cns-cs-ns " Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 7/9] nvmet: add zns cmd effects to support zbdev Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 8/9] nvmet: add zns bdev config support Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30 12:02   ` Damien Le Moal
2020-11-30 12:02     ` Damien Le Moal
2020-12-01  3:40     ` Chaitanya Kulkarni
2020-12-01  3:40       ` Chaitanya Kulkarni
2020-11-30  3:29 ` [PATCH V2 9/9] nvmet: add ZNS based I/O cmds handlers Chaitanya Kulkarni
2020-11-30  3:29   ` Chaitanya Kulkarni
2020-11-30  6:51 ` [PATCH 0/9] nvmet: add genblk ZBD backend Damien Le Moal
2020-11-30  6:51   ` Damien Le Moal
2020-12-01  3:42   ` Chaitanya Kulkarni
2020-12-01  3:42     ` Chaitanya Kulkarni
