From: Max Gurtovoy <maxg@mellanox.com>
To: linux-nvme@lists.infradead.org, kbusch@kernel.org, hch@lst.de,
sagi@grimberg.me, martin.petersen@oracle.com
Cc: axboe@kernel.dk, vladimirk@mellanox.com, shlomin@mellanox.com,
israelr@mellanox.com, idanb@mellanox.com, oren@mellanox.com,
maxg@mellanox.com
Subject: [PATCH 15/15] nvmet-rdma: Add metadata/T10-PI support
Date: Mon, 6 Jan 2020 15:37:36 +0200
Message-ID: <20200106133736.123038-17-maxg@mellanox.com>
In-Reply-To: <20200106133736.123038-1-maxg@mellanox.com>
From: Israel Rukshin <israelr@mellanox.com>
For capable HCAs (e.g. ConnectX-4/ConnectX-5) this allows end-to-end
protection information passthrough and validation for the NVMe over RDMA
transport. Metadata support is implemented on top of the new RDMA signature
verbs API.
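As a rough illustration of the verbs usage (not part of this patch), the
sketch below shows how a Type-1 T10-DIF layout could be described through
struct ib_sig_attrs, mirroring what nvmet_rdma_set_diff_domain() and
nvmet_rdma_set_sig_attrs() do further down; the 512-byte protection
interval, the constant reference tag and the unconditional check mask are
illustrative assumptions only:
#include <rdma/ib_verbs.h>
#include <linux/string.h>
/*
 * Illustrative sketch only, not part of this patch: describe a Type-1
 * T10-DIF layout to the signature verbs API. The 512-byte protection
 * interval and the unconditional check mask are assumptions; the real
 * code derives them from the namespace format and the command PRINFO bits.
 */
static void example_fill_sig_attrs(struct ib_sig_attrs *sig_attrs,
				   u32 ref_tag, bool pract)
{
	struct ib_sig_domain *mem = &sig_attrs->mem;

	memset(sig_attrs, 0, sizeof(*sig_attrs));

	/* Memory domain: PI interleaved with the data in host memory. */
	mem->sig_type = IB_SIG_TYPE_T10_DIF;
	mem->sig.dif.bg_type = IB_T10DIF_CRC;
	mem->sig.dif.pi_interval = 512;		/* assumed interval */
	mem->sig.dif.ref_tag = ref_tag;
	mem->sig.dif.apptag_check_mask = 0xffff;
	mem->sig.dif.app_escape = true;
	mem->sig.dif.ref_escape = true;
	mem->sig.dif.ref_remap = true;		/* Type 1/2: reftag increments */

	if (pract) {
		/* WRITE_INSERT/READ_STRIP: no PI travels on the wire. */
		sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
	} else {
		/* WRITE_PASS/READ_PASS: wire and memory domains match. */
		sig_attrs->wire = *mem;
	}

	/* Let the HCA validate guard, reference and application tags. */
	sig_attrs->check_mask = IB_SIG_CHECK_GUARD | IB_SIG_CHECK_REFTAG |
				IB_SIG_CHECK_APPTAG;
}
The attributes are then consumed by rdma_rw_ctx_signature_init(), which is
what lets the HCA generate or verify the PI as the data moves, rather than
doing it in software on the target CPU.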
Signed-off-by: Israel Rukshin <israelr@mellanox.com>
Reviewed-by: Max Gurtovoy <maxg@mellanox.com>
---
drivers/nvme/target/rdma.c | 235 +++++++++++++++++++++++++++++++++++++++++----
1 file changed, 217 insertions(+), 18 deletions(-)
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 37d262a..bdd3299 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -54,6 +54,7 @@ struct nvmet_rdma_rsp {
struct nvmet_rdma_queue *queue;
struct ib_cqe read_cqe;
+ struct ib_cqe write_cqe;
struct rdma_rw_ctx rw;
struct nvmet_req req;
@@ -129,6 +130,9 @@ struct nvmet_rdma_device {
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc);
+#endif
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
@@ -386,6 +390,10 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
/* Data In / RDMA READ */
r->read_cqe.done = nvmet_rdma_read_data_done;
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ /* Data Out / RDMA WRITE */
+ r->write_cqe.done = nvmet_rdma_write_data_done;
+#endif
return 0;
out_free_rsp:
@@ -495,6 +503,138 @@ static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
spin_unlock(&queue->rsp_wr_wait_lock);
}
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static u16 nvmet_rdma_check_pi_status(struct ib_mr *sig_mr)
+{
+ struct ib_mr_status mr_status;
+ int ret;
+ u16 status = 0;
+
+ ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
+ if (ret) {
+ pr_err("ib_check_mr_status failed, ret %d\n", ret);
+ return NVME_SC_INVALID_PI;
+ }
+
+ if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
+ switch (mr_status.sig_err.err_type) {
+ case IB_SIG_BAD_GUARD:
+ status = NVME_SC_GUARD_CHECK;
+ break;
+ case IB_SIG_BAD_REFTAG:
+ status = NVME_SC_REFTAG_CHECK;
+ break;
+ case IB_SIG_BAD_APPTAG:
+ status = NVME_SC_APPTAG_CHECK;
+ break;
+ }
+ pr_err("PI error found type %d expected 0x%x vs actual 0x%x\n",
+ mr_status.sig_err.err_type,
+ mr_status.sig_err.expected,
+ mr_status.sig_err.actual);
+ }
+
+ return status;
+}
+
+static void nvmet_rdma_set_diff_domain(struct nvme_command *cmd,
+ struct ib_sig_domain *domain, struct nvmet_ns *ns)
+{
+ struct blk_integrity *bi = bdev_get_integrity(ns->bdev);
+
+ WARN_ON(bi == NULL);
+
+ domain->sig_type = IB_SIG_TYPE_T10_DIF;
+ domain->sig.dif.bg_type = IB_T10DIF_CRC;
+ domain->sig.dif.pi_interval = 1 << bi->interval_exp;
+ domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
+
+ /*
+ * At the moment we hard code those, but in the future
+ * we will take them from cmd.
+ */
+ domain->sig.dif.apptag_check_mask = 0xffff;
+ domain->sig.dif.app_escape = true;
+ domain->sig.dif.ref_escape = true;
+ if (ns->prot_type != NVME_NS_DPS_PI_TYPE3)
+ domain->sig.dif.ref_remap = true;
+}
+
+static void nvmet_rdma_set_sig_attrs(struct nvmet_req *req,
+ struct ib_sig_attrs *sig_attrs)
+{
+ struct nvme_command *cmd = req->cmd;
+ u16 control = le16_to_cpu(cmd->rw.control);
+
+ memset(sig_attrs, 0, sizeof(*sig_attrs));
+
+ if (control & NVME_RW_PRINFO_PRACT) {
+ /* for WRITE_INSERT/READ_STRIP no wire domain */
+ sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
+ nvmet_rdma_set_diff_domain(cmd, &sig_attrs->mem, req->ns);
+ /* Clear the PRACT bit since HCA will generate/verify the PI */
+ control &= ~NVME_RW_PRINFO_PRACT;
+ cmd->rw.control = cpu_to_le16(control);
+ /* PI is added by the HW */
+ req->transfer_len += req->prot_len;
+ } else {
+ /* for WRITE_PASS/READ_PASS both wire/memory domains exist */
+ nvmet_rdma_set_diff_domain(cmd, &sig_attrs->wire, req->ns);
+ nvmet_rdma_set_diff_domain(cmd, &sig_attrs->mem, req->ns);
+ }
+
+ if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF)
+ sig_attrs->check_mask |= IB_SIG_CHECK_REFTAG;
+ if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD)
+ sig_attrs->check_mask |= IB_SIG_CHECK_GUARD;
+ if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_APP)
+ sig_attrs->check_mask |= IB_SIG_CHECK_APPTAG;
+}
+#else
+static u16 nvmet_rdma_check_pi_status(struct ib_mr *sig_mr)
+{
+ return 0;
+}
+
+static void nvmet_rdma_set_sig_attrs(struct nvmet_req *req,
+ struct ib_sig_attrs *sig_attrs)
+{
+}
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+
+static int nvmet_rdma_rw_ctx_init(struct nvmet_rdma_rsp *rsp, u64 addr, u32 key,
+ struct ib_sig_attrs *sig_attrs)
+{
+ struct rdma_cm_id *cm_id = rsp->queue->cm_id;
+ struct nvmet_req *req = &rsp->req;
+ int ret;
+
+ if (req->use_pi)
+ ret = rdma_rw_ctx_signature_init(&rsp->rw, cm_id->qp,
+ cm_id->port_num, req->sg, req->sg_cnt, req->prot_sg,
+ req->prot_sg_cnt, sig_attrs, addr, key,
+ nvmet_data_dir(req));
+ else
+ ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
+ req->sg, req->sg_cnt, 0, addr, key,
+ nvmet_data_dir(req));
+
+ return ret;
+}
+
+static void nvmet_rdma_rw_ctx_destroy(struct nvmet_rdma_rsp *rsp)
+{
+ struct rdma_cm_id *cm_id = rsp->queue->cm_id;
+ struct nvmet_req *req = &rsp->req;
+
+ if (req->use_pi)
+ rdma_rw_ctx_destroy_signature(&rsp->rw, cm_id->qp,
+ cm_id->port_num, req->sg, req->sg_cnt, req->prot_sg,
+ req->prot_sg_cnt, nvmet_data_dir(req));
+ else
+ rdma_rw_ctx_destroy(&rsp->rw, cm_id->qp, cm_id->port_num,
+ req->sg, req->sg_cnt, nvmet_data_dir(req));
+}
static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
@@ -502,11 +642,8 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
- if (rsp->n_rdma) {
- rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
- queue->cm_id->port_num, rsp->req.sg,
- rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
- }
+ if (rsp->n_rdma)
+ nvmet_rdma_rw_ctx_destroy(rsp);
if (rsp->req.sg != rsp->cmd->inline_sg)
nvmet_req_free_sgl(&rsp->req);
@@ -561,11 +698,16 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req)
rsp->send_wr.opcode = IB_WR_SEND;
}
- if (nvmet_rdma_need_data_out(rsp))
- first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
- cm_id->port_num, NULL, &rsp->send_wr);
- else
+ if (nvmet_rdma_need_data_out(rsp)) {
+ if (rsp->req.use_pi)
+ first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
+ cm_id->port_num, &rsp->write_cqe, NULL);
+ else
+ first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
+ cm_id->port_num, NULL, &rsp->send_wr);
+ } else {
first_wr = &rsp->send_wr;
+ }
nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
@@ -584,15 +726,14 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
struct nvmet_rdma_rsp *rsp =
container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
struct nvmet_rdma_queue *queue = cq->cq_context;
+ u16 status = 0;
WARN_ON(rsp->n_rdma <= 0);
atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
- rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
- queue->cm_id->port_num, rsp->req.sg,
- rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
rsp->n_rdma = 0;
if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ nvmet_rdma_rw_ctx_destroy(rsp);
nvmet_req_uninit(&rsp->req);
nvmet_rdma_release_rsp(rsp);
if (wc->status != IB_WC_WR_FLUSH_ERR) {
@@ -603,9 +744,59 @@ static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
return;
}
- rsp->req.execute(&rsp->req);
+ if (rsp->req.use_pi)
+ status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr);
+ nvmet_rdma_rw_ctx_destroy(rsp);
+
+ if (unlikely(status))
+ nvmet_req_complete(&rsp->req, status);
+ else
+ rsp->req.execute(&rsp->req);
}
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+static void nvmet_rdma_write_data_done(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct nvmet_rdma_rsp *rsp =
+ container_of(wc->wr_cqe, struct nvmet_rdma_rsp, write_cqe);
+ struct nvmet_rdma_queue *queue = cq->cq_context;
+ struct rdma_cm_id *cm_id = rsp->queue->cm_id;
+ u16 status;
+
+ WARN_ON(rsp->n_rdma <= 0);
+ atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
+ rsp->n_rdma = 0;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ nvmet_rdma_rw_ctx_destroy(rsp);
+ nvmet_req_uninit(&rsp->req);
+ nvmet_rdma_release_rsp(rsp);
+ if (wc->status != IB_WC_WR_FLUSH_ERR) {
+ pr_info("RDMA WRITE for CQE 0x%p failed with status %s (%d).\n",
+ wc->wr_cqe, ib_wc_status_msg(wc->status),
+ wc->status);
+ nvmet_rdma_error_comp(queue);
+ }
+ return;
+ }
+
+ /*
+ * Upon RDMA completion check the signature status
+ * - if succeeded send good NVMe response
+ * - if failed send bad NVMe response with appropriate error
+ */
+ status = nvmet_rdma_check_pi_status(rsp->rw.reg->mr);
+ if (unlikely(status))
+ rsp->req.cqe->status = cpu_to_le16(status << 1);
+ nvmet_rdma_rw_ctx_destroy(rsp);
+
+ if (unlikely(ib_post_send(cm_id->qp, &rsp->send_wr, NULL))) {
+ pr_err("sending cmd response failed\n");
+ nvmet_rdma_release_rsp(rsp);
+ }
+}
+#endif /* CONFIG_BLK_DEV_INTEGRITY */
+
static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
u64 off)
{
@@ -660,9 +851,9 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
- struct rdma_cm_id *cm_id = rsp->queue->cm_id;
u64 addr = le64_to_cpu(sgl->addr);
u32 key = get_unaligned_le32(sgl->key);
+ struct ib_sig_attrs sig_attrs;
int ret;
rsp->req.transfer_len = get_unaligned_le24(sgl->length);
@@ -671,13 +862,14 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
if (!rsp->req.transfer_len)
return 0;
+ if (rsp->req.use_pi)
+ nvmet_rdma_set_sig_attrs(&rsp->req, &sig_attrs);
+
ret = nvmet_req_alloc_sgl(&rsp->req);
if (unlikely(ret < 0))
goto error_out;
- ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
- rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
- nvmet_data_dir(&rsp->req));
+ ret = nvmet_rdma_rw_ctx_init(rsp, addr, key, &sig_attrs);
if (unlikely(ret < 0))
goto error_out;
rsp->n_rdma += ret;
@@ -956,6 +1148,9 @@ static void nvmet_rdma_free_dev(struct kref *ref)
goto out_free_pd;
}
+ port->pi_capable = ndev->device->attrs.device_cap_flags &
+ IB_DEVICE_INTEGRITY_HANDOVER ? true : false;
+
list_add(&ndev->entry, &device_list);
out_unlock:
mutex_unlock(&device_list_mutex);
@@ -1020,6 +1215,10 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
}
+ if (queue->port->pi_capable && queue->port->pi_enable &&
+ queue->host_qid)
+ qp_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
+
ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
if (ret) {
pr_err("failed to create_qp ret= %d\n", ret);
@@ -1164,6 +1363,7 @@ static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
queue->dev = ndev;
queue->cm_id = cm_id;
+ queue->port = cm_id->context;
spin_lock_init(&queue->state_lock);
queue->state = NVMET_RDMA_Q_CONNECTING;
@@ -1282,7 +1482,6 @@ static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
ret = -ENOMEM;
goto put_device;
}
- queue->port = cm_id->context;
if (queue->host_qid == 0) {
/* Let inflight controller teardown complete */
--
1.8.3.1