Re: [PATCH 05/15] nvme-rdma: Add metadata/T10-PI support

From: Sagi Grimberg <sagi@grimberg.me>
To: Max Gurtovoy <maxg@mellanox.com>,
	linux-nvme@lists.infradead.org, kbusch@kernel.org, hch@lst.de
Cc: shlomin@mellanox.com, israelr@mellanox.com, oren@mellanox.com,
	vladimirk@mellanox.com, idanb@mellanox.com
Subject: Re: [PATCH 05/15] nvme-rdma: Add metadata/T10-PI support
Date: Thu, 14 Nov 2019 15:57:24 -0800	[thread overview]
Message-ID: <2233ab61-0715-b0fe-c4ac-c0d06b25ff49@grimberg.me> (raw)
In-Reply-To: <2205eff8-3fdc-ba0f-8578-994a8ed835f6@mellanox.com>



On 11/13/19 6:35 AM, Max Gurtovoy wrote:
> 
> On 11/12/2019 8:22 PM, Sagi Grimberg wrote:
>>
>>
>> On 11/5/19 8:20 AM, Max Gurtovoy wrote:
>>> For capable HCAs (e.g. ConnectX-4/ConnectX-5) this will allow end-to-end
>>> protection information passthrough and validation for NVMe over RDMA
>>> transport. Metadata offload support was implemented over the new RDMA
>>> signature verbs API and it is enabled per controller by using nvme-cli.
>>>
>>> usage example:
>>> nvme connect --pi_enable --transport=rdma --traddr=10.0.1.1 
>>> --nqn=test-nvme
>>>
>>> Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
>>> Signed-off-by: Israel Rukshin <israelr@mellanox.com>
>>> ---
>>>   drivers/nvme/host/rdma.c | 346 
>>> ++++++++++++++++++++++++++++++++++++++++-------
>>>   1 file changed, 298 insertions(+), 48 deletions(-)
>>>
>>> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
>>> index 05f2dfa..16263b8 100644
>>> --- a/drivers/nvme/host/rdma.c
>>> +++ b/drivers/nvme/host/rdma.c
>>> @@ -48,6 +48,12 @@ struct nvme_rdma_qe {
>>>       u64            dma;
>>>   };
>>>   +struct nvme_rdma_sgl {
>>> +    int            nents;
>>> +    struct sg_table        sg_table;
>>> +    struct scatterlist    first_sgl[SG_CHUNK_SIZE];
>>> +};
>>
>> How about we dynamically allocate this in nvme_rdma_init_request (with
>> a prep patch for it)? both for pi and data, this is getting quite
>> large...
>>
>>> +
>>>   struct nvme_rdma_queue;
>>>   struct nvme_rdma_request {
>>>       struct nvme_request    req;
>>> @@ -58,12 +64,14 @@ struct nvme_rdma_request {
>>>       refcount_t        ref;
>>>       struct ib_sge        sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
>>>       u32            num_sge;
>>> -    int            nents;
>>>       struct ib_reg_wr    reg_wr;
>>>       struct ib_cqe        reg_cqe;
>>>       struct nvme_rdma_queue  *queue;
>>> -    struct sg_table        sg_table;
>>> -    struct scatterlist    first_sgl[];
>>> +    /* T10-PI support */
>>> +    bool            is_protected;
>>> +
>>> +    struct nvme_rdma_sgl    data_sgl;
>>> +    struct nvme_rdma_sgl    pi_sgl[];
>>>   };
>>>     enum nvme_rdma_queue_flags {
>>> @@ -85,6 +93,7 @@ struct nvme_rdma_queue {
>>>       struct rdma_cm_id    *cm_id;
>>>       int            cm_error;
>>>       struct completion    cm_done;
>>> +    bool            pi_support;
>>
>> Any specific reason for the queue local pi_support?
>>
>>>   };
>>>     struct nvme_rdma_ctrl {
>>> @@ -112,6 +121,7 @@ struct nvme_rdma_ctrl {
>>>       struct nvme_ctrl    ctrl;
>>>       bool            use_inline_data;
>>>       u32            io_queues[HCTX_MAX_TYPES];
>>> +    bool            pi_support;
>>>   };
>>>     static inline struct nvme_rdma_ctrl *to_rdma_ctrl(struct 
>>> nvme_ctrl *ctrl)
>>> @@ -269,6 +279,8 @@ static int nvme_rdma_create_qp(struct 
>>> nvme_rdma_queue *queue, const int factor)
>>>       init_attr.qp_type = IB_QPT_RC;
>>>       init_attr.send_cq = queue->ib_cq;
>>>       init_attr.recv_cq = queue->ib_cq;
>>> +    if (queue->pi_support)
>>> +        init_attr.create_flags |= IB_QP_CREATE_INTEGRITY_EN;
>>>         ret = rdma_create_qp(queue->cm_id, dev->pd, &init_attr);
>>>   @@ -408,6 +420,8 @@ static void nvme_rdma_destroy_queue_ib(struct 
>>> nvme_rdma_queue *queue)
>>>       dev = queue->device;
>>>       ibdev = dev->dev;
>>>   +    if (queue->pi_support)
>>> +        ib_mr_pool_destroy(queue->qp, &queue->qp->sig_mrs);
>>>       ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
>>>         /*
>>> @@ -424,10 +438,14 @@ static void nvme_rdma_destroy_queue_ib(struct 
>>> nvme_rdma_queue *queue)
>>>       nvme_rdma_dev_put(dev);
>>>   }
>>>   -static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev)
>>> +static int nvme_rdma_get_max_fr_pages(struct ib_device *ibdev, bool 
>>> pi_support)
>>>   {
>>> -    return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
>>> -             ibdev->attrs.max_fast_reg_page_list_len - 1);
>>> +    if (pi_support)
>>> +        return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
>>> +                 ibdev->attrs.max_pi_fast_reg_page_list_len - 1);
>>> +    else
>>> +        return min_t(u32, NVME_RDMA_MAX_SEGMENTS,
>>> +                 ibdev->attrs.max_fast_reg_page_list_len - 1);
>>>   }
>>
>> Maybe just take the max_page_list_len on the condition and take the 
>> min once.
>>
>>>     static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
>>> @@ -484,7 +502,7 @@ static int nvme_rdma_create_queue_ib(struct 
>>> nvme_rdma_queue *queue)
>>>        * misaligned we'll end up using two entries for a single data 
>>> page,
>>>        * so one additional entry is required.
>>>        */
>>> -    pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev) + 1;
>>> +    pages_per_mr = nvme_rdma_get_max_fr_pages(ibdev, 
>>> queue->pi_support) + 1;
>>>       ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
>>>                     queue->queue_size,
>>>                     IB_MR_TYPE_MEM_REG,
>>> @@ -496,10 +514,24 @@ static int nvme_rdma_create_queue_ib(struct 
>>> nvme_rdma_queue *queue)
>>>           goto out_destroy_ring;
>>>       }
>>>   +    if (queue->pi_support) {
>>> +        ret = ib_mr_pool_init(queue->qp, &queue->qp->sig_mrs,
>>> +                      queue->queue_size, IB_MR_TYPE_INTEGRITY,
>>> +                      pages_per_mr, pages_per_mr);
>>> +        if (ret) {
>>> +            dev_err(queue->ctrl->ctrl.device,
>>> +                "failed to initialize PI MR pool sized %d for QID 
>>> %d\n",
>>> +                queue->queue_size, idx);
>>> +            goto out_destroy_mr_pool;
>>> +        }
>>> +    }
>>> +
>>>       set_bit(NVME_RDMA_Q_TR_READY, &queue->flags);
>>>         return 0;
>>>   +out_destroy_mr_pool:
>>> +    ib_mr_pool_destroy(queue->qp, &queue->qp->rdma_mrs);
>>>   out_destroy_ring:
>>>       nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
>>>                   sizeof(struct nvme_completion), DMA_FROM_DEVICE);
>>> @@ -521,6 +553,7 @@ static int nvme_rdma_alloc_queue(struct 
>>> nvme_rdma_ctrl *ctrl,
>>>         queue = &ctrl->queues[idx];
>>>       queue->ctrl = ctrl;
>>> +    queue->pi_support = idx && ctrl->pi_support;
>>
>> Thats why..
>>
>>> init_completion(&queue->cm_done);
>>>         if (idx > 0)
>>> @@ -730,8 +763,7 @@ static struct blk_mq_tag_set 
>>> *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
>>>           set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
>>>           set->reserved_tags = 2; /* connect + keep-alive */
>>>           set->numa_node = nctrl->numa_node;
>>> -        set->cmd_size = sizeof(struct nvme_rdma_request) +
>>> -            SG_CHUNK_SIZE * sizeof(struct scatterlist);
>>> +        set->cmd_size = sizeof(struct nvme_rdma_request);
>>>           set->driver_data = ctrl;
>>>           set->nr_hw_queues = 1;
>>>           set->timeout = ADMIN_TIMEOUT;
>>> @@ -745,7 +777,7 @@ static struct blk_mq_tag_set 
>>> *nvme_rdma_alloc_tagset(struct nvme_ctrl *nctrl,
>>>           set->numa_node = nctrl->numa_node;
>>>           set->flags = BLK_MQ_F_SHOULD_MERGE;
>>>           set->cmd_size = sizeof(struct nvme_rdma_request) +
>>> -            SG_CHUNK_SIZE * sizeof(struct scatterlist);
>>> +            (ctrl->pi_support * sizeof(struct nvme_rdma_sgl));
>>>           set->driver_data = ctrl;
>>>           set->nr_hw_queues = nctrl->queue_count - 1;
>>>           set->timeout = NVME_IO_TIMEOUT;
>>> @@ -787,7 +819,22 @@ static int 
>>> nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
>>>       ctrl->device = ctrl->queues[0].device;
>>>       ctrl->ctrl.numa_node = dev_to_node(ctrl->device->dev->dma_device);
>>>   -    ctrl->max_fr_pages = 
>>> nvme_rdma_get_max_fr_pages(ctrl->device->dev);
>>> +    /* T10-PI support */
>>> +    if (ctrl->ctrl.opts->pi_enable) {
>>> +        if (!(ctrl->device->dev->attrs.device_cap_flags &
>>> +              IB_DEVICE_INTEGRITY_HANDOVER)) {
>>> +            dev_warn(ctrl->ctrl.device,
>>> +                 "T10-PI requested but not supported on %s, continue 
>>> without T10-PI\n",
>>> +                 ctrl->device->dev->name);
>>> +            ctrl->pi_support = false;
>>> +            ctrl->ctrl.opts->pi_enable = false;
>>> +        } else {
>>> +            ctrl->pi_support = true;
>>> +        }
>>> +    }
>>> +
>>> +    ctrl->max_fr_pages = nvme_rdma_get_max_fr_pages(ctrl->device->dev,
>>> +                            ctrl->pi_support);
>>>         /*
>>>        * Bind the async event SQE DMA mapping to the admin queue 
>>> lifetime.
>>> @@ -829,6 +876,8 @@ static int nvme_rdma_configure_admin_queue(struct 
>>> nvme_rdma_ctrl *ctrl,
>>>         ctrl->ctrl.max_segments = ctrl->max_fr_pages;
>>>       ctrl->ctrl.max_hw_sectors = ctrl->max_fr_pages << (ilog2(SZ_4K) 
>>> - 9);
>>> +    if (ctrl->pi_support)
>>> +        ctrl->ctrl.max_integrity_segments = ctrl->max_fr_pages;
>>>         blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
>>>   @@ -1154,13 +1203,24 @@ static void nvme_rdma_unmap_data(struct 
>>> nvme_rdma_queue *queue,
>>>       if (!blk_rq_nr_phys_segments(rq))
>>>           return;
>>>   +    if (blk_integrity_rq(rq)) {
>>> +        ib_dma_unmap_sg(ibdev, req->pi_sgl->sg_table.sgl,
>>> +            req->pi_sgl->nents, rq_dma_dir(rq));
>>> +        sg_free_table_chained(&req->pi_sgl->sg_table, SG_CHUNK_SIZE);
>>> +    }
>>> +
>>>       if (req->mr) {
>>> -        ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
>>> +        if (req->is_protected)
>>> +            ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr);
>>> +        else
>>> +            ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs,
>>> +                       req->mr);
>>>           req->mr = NULL;
>>>       }
>>>   -    ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, 
>>> rq_dma_dir(rq));
>>> -    sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
>>> +    ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl,
>>> +            req->data_sgl.nents, rq_dma_dir(rq));
>>> +    sg_free_table_chained(&req->data_sgl.sg_table, SG_CHUNK_SIZE);
>>>   }
>>>     static int nvme_rdma_set_sg_null(struct nvme_command *c)
>>> @@ -1179,7 +1239,7 @@ static int nvme_rdma_map_sg_inline(struct 
>>> nvme_rdma_queue *queue,
>>>           int count)
>>>   {
>>>       struct nvme_sgl_desc *sg = &c->common.dptr.sgl;
>>> -    struct scatterlist *sgl = req->sg_table.sgl;
>>> +    struct scatterlist *sgl = req->data_sgl.sg_table.sgl;
>>>       struct ib_sge *sge = &req->sge[1];
>>>       u32 len = 0;
>>>       int i;
>>> @@ -1204,40 +1264,115 @@ static int nvme_rdma_map_sg_single(struct 
>>> nvme_rdma_queue *queue,
>>>   {
>>>       struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
>>>   -    sg->addr = cpu_to_le64(sg_dma_address(req->sg_table.sgl));
>>> -    put_unaligned_le24(sg_dma_len(req->sg_table.sgl), sg->length);
>>> +    sg->addr = cpu_to_le64(sg_dma_address(req->data_sgl.sg_table.sgl));
>>> + put_unaligned_le24(sg_dma_len(req->data_sgl.sg_table.sgl), 
>>> sg->length);
>>> put_unaligned_le32(queue->device->pd->unsafe_global_rkey, sg->key);
>>>       sg->type = NVME_KEY_SGL_FMT_DATA_DESC << 4;
>>>       return 0;
>>>   }
>>>   -static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
>>> -        struct nvme_rdma_request *req, struct nvme_command *c,
>>> -        int count)
>>> +static void nvme_rdma_set_diff_domain(struct nvme_command *cmd, 
>>> struct bio *bio,
>>> +        struct ib_sig_domain *domain, struct request *rq)
>>>   {
>>> -    struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
>>> -    int nr;
>>> +    struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
>>> +    struct nvme_ns *ns = rq->rq_disk->private_data;
>>>   -    req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
>>> -    if (WARN_ON_ONCE(!req->mr))
>>> -        return -EAGAIN;
>>> +    WARN_ON(bi == NULL);
>>> +
>>> +    domain->sig_type = IB_SIG_TYPE_T10_DIF;
>>> +    domain->sig.dif.bg_type = IB_T10DIF_CRC;
>>> +    domain->sig.dif.pi_interval = 1 << bi->interval_exp;
>>> +    domain->sig.dif.ref_tag = le32_to_cpu(cmd->rw.reftag);
>>>         /*
>>> -     * Align the MR to a 4K page size to match the ctrl page size and
>>> -     * the block virtual boundary.
>>> +     * At the moment we hard code those, but in the future
>>> +     * we will take them from cmd.
>>>        */
>>> -    nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
>>> -    if (unlikely(nr < count)) {
>>> -        ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
>>> -        req->mr = NULL;
>>> -        if (nr < 0)
>>> -            return nr;
>>> -        return -EINVAL;
>>> +    domain->sig.dif.apptag_check_mask = 0xffff;
>>> +    domain->sig.dif.app_escape = true;
>>> +    domain->sig.dif.ref_escape = true;
>>> +    if (ns->pi_type != NVME_NS_DPS_PI_TYPE3)
>>> +        domain->sig.dif.ref_remap = true;
>>> +}
>>> +
>>> +static void nvme_rdma_set_sig_attrs(struct bio *bio,
>>> +        struct ib_sig_attrs *sig_attrs, struct nvme_command *c,
>>> +        struct request *rq)
>>> +{
>>> +    u16 control = le16_to_cpu(c->rw.control);
>>> +
>>> +    memset(sig_attrs, 0, sizeof(*sig_attrs));
>>> +
>>> +    if (control & NVME_RW_PRINFO_PRACT) {
>>> +        /* for WRITE_INSERT/READ_STRIP no memory domain */
>>> +        sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
>>> +        nvme_rdma_set_diff_domain(c, bio, &sig_attrs->wire, rq);
>>> +        /* Clear the PRACT bit since HCA will generate/verify the PI */
>>> +        control &= ~NVME_RW_PRINFO_PRACT;
>>> +        c->rw.control = cpu_to_le16(control);
>>> +    } else {
>>> +        /* for WRITE_PASS/READ_PASS both wire/memory domains exist */
>>> +        nvme_rdma_set_diff_domain(c, bio, &sig_attrs->wire, rq);
>>> +        nvme_rdma_set_diff_domain(c, bio, &sig_attrs->mem, rq);
>>>       }
>>> +}
>>> +
>>> +static void nvme_rdma_set_prot_checks(struct nvme_command *cmd, u8 
>>> *mask)
>>> +{
>>> +    *mask = 0;
>>> +    if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_REF)
>>> +        *mask |= IB_SIG_CHECK_REFTAG;
>>> +    if (le16_to_cpu(cmd->rw.control) & NVME_RW_PRINFO_PRCHK_GUARD)
>>> +        *mask |= IB_SIG_CHECK_GUARD;
>>> +}
>>> +
>>> +static void nvme_rdma_sig_done(struct ib_cq *cq, struct ib_wc *wc)
>>> +{
>>> +    if (unlikely(wc->status != IB_WC_SUCCESS))
>>> +        nvme_rdma_wr_error(cq, wc, "SIG");
>>> +}
>>> +
>>> +static void nvme_rdma_set_pi_wr(struct nvme_rdma_request *req,
>>> +                struct nvme_command *c)
>>> +{
>>> +    struct ib_sig_attrs *sig_attrs = req->mr->sig_attrs;
>>> +    struct ib_reg_wr *wr = &req->reg_wr;
>>> +    struct request *rq = blk_mq_rq_from_pdu(req);
>>> +    struct bio *bio = rq->bio;
>>> +    struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
>>> +
>>> +    nvme_rdma_set_sig_attrs(bio, sig_attrs, c, rq);
>>> +
>>> +    nvme_rdma_set_prot_checks(c, &sig_attrs->check_mask);
>>>         ib_update_fast_reg_key(req->mr, ib_inc_rkey(req->mr->rkey));
>>>   +    req->reg_cqe.done = nvme_rdma_sig_done;
>>> +
>>> +    memset(wr, 0, sizeof(*wr));
>>> +    wr->wr.opcode = IB_WR_REG_MR_INTEGRITY;
>>> +    wr->wr.wr_cqe = &req->reg_cqe;
>>> +    wr->wr.num_sge = 0;
>>> +    wr->wr.send_flags = 0;
>>> +    wr->mr = req->mr;
>>> +    wr->key = req->mr->rkey;
>>> +    wr->access = IB_ACCESS_LOCAL_WRITE |
>>> +             IB_ACCESS_REMOTE_READ |
>>> +             IB_ACCESS_REMOTE_WRITE;
>>> +
>>> +    sg->addr = cpu_to_le64(req->mr->iova);
>>> +    put_unaligned_le24(req->mr->length, sg->length);
>>> +    put_unaligned_le32(req->mr->rkey, sg->key);
>>> +    sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) | 
>>> NVME_SGL_FMT_INVALIDATE;
>>> +}
>>> +
>>> +static void nvme_rdma_set_reg_wr(struct nvme_rdma_request *req,
>>> +                 struct nvme_command *c)
>>> +{
>>> +    struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
>>> +
>>>       req->reg_cqe.done = nvme_rdma_memreg_done;
>>> +
>>>       memset(&req->reg_wr, 0, sizeof(req->reg_wr));
>>>       req->reg_wr.wr.opcode = IB_WR_REG_MR;
>>>       req->reg_wr.wr.wr_cqe = &req->reg_cqe;
>>> @@ -1253,8 +1388,52 @@ static int nvme_rdma_map_sg_fr(struct 
>>> nvme_rdma_queue *queue,
>>>       put_unaligned_le32(req->mr->rkey, sg->key);
>>>       sg->type = (NVME_KEY_SGL_FMT_DATA_DESC << 4) |
>>>               NVME_SGL_FMT_INVALIDATE;
>>> +}
>>> +
>>> +static int nvme_rdma_map_sg_fr(struct nvme_rdma_queue *queue,
>>> +        struct nvme_rdma_request *req, struct nvme_command *c,
>>> +        int count, int pi_count)
>>> +{
>>> +    struct nvme_rdma_sgl *sgl = &req->data_sgl;
>>> +    int nr;
>>> +
>>> +    if (req->is_protected) {
>>> +        req->mr = ib_mr_pool_get(queue->qp, &queue->qp->sig_mrs);
>>> +        if (WARN_ON_ONCE(!req->mr))
>>> +            return -EAGAIN;
>>> +
>>> +        nr = ib_map_mr_sg_pi(req->mr, sgl->sg_table.sgl, count, 0,
>>> +                     req->pi_sgl->sg_table.sgl, pi_count, 0,
>>> +                     SZ_4K);
>>> +        if (unlikely(nr))
>>> +            goto mr_put;
>>> +
>>> +        nvme_rdma_set_pi_wr(req, c);
>>> +    } else {
>>> +        req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
>>> +        if (WARN_ON_ONCE(!req->mr))
>>> +            return -EAGAIN;
>>> +        /*
>>> +         * Align the MR to a 4K page size to match the ctrl page size
>>> +         * and the block virtual boundary.
>>> +         */
>>> +        nr = ib_map_mr_sg(req->mr, sgl->sg_table.sgl, count, 0, SZ_4K);
>>> +        if (unlikely(nr < count))
>>> +            goto mr_put;
>>> +
>>> +        nvme_rdma_set_reg_wr(req, c);
>>> +    }
>>>         return 0;
>>> +mr_put:
>>> +    if (req->is_protected)
>>> +        ib_mr_pool_put(queue->qp, &queue->qp->sig_mrs, req->mr);
>>> +    else
>>> +        ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
>>> +    req->mr = NULL;
>>> +    if (nr < 0)
>>> +        return nr;
>>> +    return -EINVAL;
>>>   }
>>>     static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
>>> @@ -1263,6 +1442,7 @@ static int nvme_rdma_map_data(struct 
>>> nvme_rdma_queue *queue,
>>>       struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq);
>>>       struct nvme_rdma_device *dev = queue->device;
>>>       struct ib_device *ibdev = dev->dev;
>>> +    int pi_count = 0;
>>>       int count, ret;
>>>         req->num_sge = 1;
>>> @@ -1273,23 +1453,44 @@ static int nvme_rdma_map_data(struct 
>>> nvme_rdma_queue *queue,
>>>       if (!blk_rq_nr_phys_segments(rq))
>>>           return nvme_rdma_set_sg_null(c);
>>>   -    req->sg_table.sgl = req->first_sgl;
>>> -    ret = sg_alloc_table_chained(&req->sg_table,
>>> -            blk_rq_nr_phys_segments(rq), req->sg_table.sgl,
>>> -            SG_CHUNK_SIZE);
>>> +    req->data_sgl.sg_table.sgl = req->data_sgl.first_sgl;
>>> +    ret = sg_alloc_table_chained(&req->data_sgl.sg_table,
>>> +            blk_rq_nr_phys_segments(rq),
>>> +            req->data_sgl.sg_table.sgl, SG_CHUNK_SIZE);
>>>       if (ret)
>>>           return -ENOMEM;
>>>   -    req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl);
>>> +    req->data_sgl.nents = blk_rq_map_sg(rq->q, rq,
>>> +                        req->data_sgl.sg_table.sgl);
>>>   -    count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents,
>>> -                  rq_dma_dir(rq));
>>> +    count = ib_dma_map_sg(ibdev, req->data_sgl.sg_table.sgl,
>>> +                  req->data_sgl.nents, rq_dma_dir(rq));
>>>       if (unlikely(count <= 0)) {
>>>           ret = -EIO;
>>>           goto out_free_table;
>>>       }
>>>   -    if (count <= dev->num_inline_segments) {
>>> +    if (blk_integrity_rq(rq)) {
>>> +        req->pi_sgl->sg_table.sgl = req->pi_sgl->first_sgl;
>>> +        ret = sg_alloc_table_chained(&req->pi_sgl->sg_table,
>>> +                blk_rq_count_integrity_sg(rq->q, rq->bio),
>>> +                req->pi_sgl->sg_table.sgl, SG_CHUNK_SIZE);
>>> +        if (unlikely(ret)) {
>>> +            ret = -ENOMEM;
>>> +            goto out_unmap_sg;
>>> +        }
>>> +
>>> +        req->pi_sgl->nents = blk_rq_map_integrity_sg(rq->q, rq->bio,
>>> +                    req->pi_sgl->sg_table.sgl);
>>> +        pi_count = ib_dma_map_sg(ibdev, req->pi_sgl->sg_table.sgl,
>>> +                req->pi_sgl->nents, rq_dma_dir(rq));
>>> +        if (unlikely(pi_count <= 0)) {
>>> +            ret = -EIO;
>>> +            goto out_free_pi_table;
>>> +        }
>>> +    }
>>> +
>>> +    if (count <= dev->num_inline_segments && !req->is_protected) {
>>>           if (rq_data_dir(rq) == WRITE && nvme_rdma_queue_idx(queue) &&
>>>               queue->ctrl->use_inline_data &&
>>>               blk_rq_payload_bytes(rq) <=
>>> @@ -1304,17 +1505,25 @@ static int nvme_rdma_map_data(struct 
>>> nvme_rdma_queue *queue,
>>>           }
>>>       }
>>>   -    ret = nvme_rdma_map_sg_fr(queue, req, c, count);
>>> +    ret = nvme_rdma_map_sg_fr(queue, req, c, count, pi_count);
>>>   out:
>>>       if (unlikely(ret))
>>> -        goto out_unmap_sg;
>>> +        goto out_unmap_pi_sg;
>>>         return 0;
>>>   +out_unmap_pi_sg:
>>> +    if (blk_integrity_rq(rq))
>>> +        ib_dma_unmap_sg(ibdev, req->pi_sgl->sg_table.sgl,
>>> +                req->pi_sgl->nents, rq_dma_dir(rq));
>>> +out_free_pi_table:
>>> +    if (blk_integrity_rq(rq))
>>> +        sg_free_table_chained(&req->pi_sgl->sg_table, SG_CHUNK_SIZE);
>>>   out_unmap_sg:
>>> -    ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, 
>>> rq_dma_dir(rq));
>>> +    ib_dma_unmap_sg(ibdev, req->data_sgl.sg_table.sgl,
>>> +            req->data_sgl.nents, rq_dma_dir(rq));
>>>   out_free_table:
>>> -    sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE);
>>> +    sg_free_table_chained(&req->data_sgl.sg_table, SG_CHUNK_SIZE);
>>>       return ret;
>>>   }
>>>   @@ -1754,6 +1963,13 @@ static blk_status_t 
>>> nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
>>>         blk_mq_start_request(rq);
>>>   +    req->is_protected = false;
>>> +    if (nvme_rdma_queue_idx(queue) && queue->pi_support) {
>>> +        if (c->common.opcode == nvme_cmd_write ||
>>> +            c->common.opcode == nvme_cmd_read)
>>> +            req->is_protected = nvme_ns_has_pi(ns);
>>> +    }
>>> +
>>
>> This check belongs to nvme-core in nature. Why not test
>> blk_integrity_rq()?
> 
> In case that write_generate and read_verify are 0, the call 
> blk_integrity_rq() will return 0 as well. So it's not good enough for us.
> 
> is_protected bool is a helper for RDMA transport to distinguish between 
> pi_req and non_pi request.

This is a hack, You need to take this info from the command.
How about flagging it when the PRINFO is set..

Something like:
--

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 4be64703aa47..12ef260296e1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -434,6 +434,7 @@ static inline void nvme_clear_nvme_request(struct 
request *req)
         if (!(req->rq_flags & RQF_DONTPREP)) {
                 nvme_req(req)->retries = 0;
                 nvme_req(req)->flags = 0;
+               nvme_req(req)->has_pi = false;
                 req->rq_flags |= RQF_DONTPREP;
         }
  }
@@ -684,6 +685,7 @@ static inline blk_status_t nvme_setup_rw(struct 
nvme_ns *ns,
                         cmnd->rw.reftag = cpu_to_le32(t10_pi_ref_tag(req));
                         break;
                 }
+               nvme_req(req)->has_pi = true;
         }

         cmnd->rw.control = cpu_to_le16(control);
--

_______________________________________________
Linux-nvme mailing list
Linux-nvme@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-nvme