From: Max Gurtovoy <maxg@mellanox.com>
To: Laurence Oberman <loberman@redhat.com>,
<linux-nvme@lists.infradead.org>, <kbusch@kernel.org>,
<hch@lst.de>, <sagi@grimberg.me>
Cc: shlomin@mellanox.com, jgg@mellanox.com, bvanassche@acm.org
Subject: Re: [PATCH v4 1/1] nvmet-rdma: use SRQ per completion vector
Date: Fri, 27 Mar 2020 00:14:21 +0200
Message-ID: <a176a10a-2915-76a8-e672-61e764ca582a@mellanox.com>
In-Reply-To: <62fefed0e5d459563dfd3e0e027ceef529dce803.camel@redhat.com>
On 3/26/2020 8:29 PM, Laurence Oberman wrote:
> On Thu, 2020-03-26 at 20:15 +0200, Max Gurtovoy wrote:
>> In order to save resource allocation and utilize the completion
>> locality in a better way (compared to the SRQ per device that exists
>> today), allocate Shared Receive Queues (SRQs) per completion vector.
>> Associate each created QP/CQ with an appropriate SRQ according to the
>> queue index. This association will reduce the lock contention in the
>> fast path (compared to the SRQ per device solution) and increase the
>> locality in memory buffers. Add a new module parameter for the SRQ
>> size so it can be adjusted according to the expected load. The user
>> should make sure the size is >= 256 to avoid lack of resources. Also
>> reduce the debug level of the "last WQE reached" event, which is
>> raised when a QP that uses an SRQ is destroyed, to reduce log noise.
>>
>> Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
>> ---
>>
>> changes from V3:
>> - remove redundant NULL assignment (Sagi)
>> - reduce log level for the last WQE reached event
>> changes from V1 + V2:
>> - removed SRQ pool
>> - removed SRP target implementation
>>
>> ---
>> drivers/nvme/target/rdma.c | 176 ++++++++++++++++++++++++++++++++++----------
>> 1 file changed, 134 insertions(+), 42 deletions(-)
>>
>> diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
>> index 9e1b8c6..dc8a969 100644
>> --- a/drivers/nvme/target/rdma.c
>> +++ b/drivers/nvme/target/rdma.c
>> @@ -34,6 +34,8 @@
>> /* Assume mpsmin == device_page_size == 4KB */
>> #define NVMET_RDMA_MAX_MDTS 8
>>
>> +struct nvmet_rdma_srq;
>> +
>> struct nvmet_rdma_cmd {
>> struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
>> struct ib_cqe cqe;
>> @@ -41,6 +43,7 @@ struct nvmet_rdma_cmd {
>> struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
>> struct nvme_command *nvme_cmd;
>> struct nvmet_rdma_queue *queue;
>> + struct nvmet_rdma_srq *nsrq;
>> };
>>
>> enum {
>> @@ -82,6 +85,7 @@ struct nvmet_rdma_queue {
>> struct ib_cq *cq;
>> atomic_t sq_wr_avail;
>> struct nvmet_rdma_device *dev;
>> + struct nvmet_rdma_srq *nsrq;
>> spinlock_t state_lock;
>> enum nvmet_rdma_queue_state state;
>> struct nvmet_cq nvme_cq;
>> @@ -99,17 +103,24 @@ struct nvmet_rdma_queue {
>>
>> int idx;
>> int host_qid;
>> + int comp_vector;
>> int recv_queue_size;
>> int send_queue_size;
>>
>> struct list_head queue_list;
>> };
>>
>> +struct nvmet_rdma_srq {
>> + struct ib_srq *srq;
>> + struct nvmet_rdma_cmd *cmds;
>> + struct nvmet_rdma_device *ndev;
>> +};
>> +
>> struct nvmet_rdma_device {
>> struct ib_device *device;
>> struct ib_pd *pd;
>> - struct ib_srq *srq;
>> - struct nvmet_rdma_cmd *srq_cmds;
>> + struct nvmet_rdma_srq **srqs;
>> + int srq_count;
>> size_t srq_size;
>> struct kref ref;
>> struct list_head entry;
>> @@ -121,6 +132,16 @@ struct nvmet_rdma_device {
>> module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
>> MODULE_PARM_DESC(use_srq, "Use shared receive queue.");
>>
>> +static int srq_size_set(const char *val, const struct kernel_param *kp);
>> +static const struct kernel_param_ops srq_size_ops = {
>> + .set = srq_size_set,
>> + .get = param_get_int,
>> +};
>> +
>> +static int nvmet_rdma_srq_size = 1024;
>> +module_param_cb(srq_size, &srq_size_ops, &nvmet_rdma_srq_size, 0644);
>> +MODULE_PARM_DESC(srq_size, "set Shared Receive Queue (SRQ) size, should >= 256 (default: 1024)");
>> +
>> static DEFINE_IDA(nvmet_rdma_queue_ida);
>> static LIST_HEAD(nvmet_rdma_queue_list);
>> static DEFINE_MUTEX(nvmet_rdma_queue_mutex);
>> @@ -141,6 +162,17 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
>>
>> static const struct nvmet_fabrics_ops nvmet_rdma_ops;
>>
>> +static int srq_size_set(const char *val, const struct kernel_param *kp)
>> +{
>> + int n = 0, ret;
>> +
>> + ret = kstrtoint(val, 10, &n);
>> + if (ret != 0 || n < 256)
>> + return -EINVAL;
>> +
>> + return param_set_int(val, kp);
>> +}
>> +
>> static int num_pages(int len)
>> {
>> return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
>> @@ -464,8 +496,8 @@ static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
>> cmd->sge[0].addr, cmd->sge[0].length,
>> DMA_FROM_DEVICE);
>>
>> - if (ndev->srq)
>> - ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
>> + if (cmd->nsrq)
>> + ret = ib_post_srq_recv(cmd->nsrq->srq, &cmd->wr, NULL);
>> else
>> ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);
>>
>> @@ -843,23 +875,40 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
>> nvmet_rdma_handle_command(queue, rsp);
>> }
>>
>> -static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
>> +static void nvmet_rdma_destroy_srq(struct nvmet_rdma_srq *nsrq)
>> +{
>> + nvmet_rdma_free_cmds(nsrq->ndev, nsrq->cmds, nsrq->ndev->srq_size,
>> + false);
>> + ib_destroy_srq(nsrq->srq);
>> +
>> + kfree(nsrq);
>> +}
>> +
>> +static void nvmet_rdma_destroy_srqs(struct nvmet_rdma_device *ndev)
>> {
>> - if (!ndev->srq)
>> + int i;
>> +
>> + if (!ndev->srqs)
>> return;
>>
>> - nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
>> - ib_destroy_srq(ndev->srq);
>> + for (i = 0; i < ndev->srq_count; i++)
>> + nvmet_rdma_destroy_srq(ndev->srqs[i]);
>> +
>> + kfree(ndev->srqs);
>> }
>>
>> -static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
>> +static struct nvmet_rdma_srq *
>> +nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
>> {
>> struct ib_srq_init_attr srq_attr = { NULL, };
>> + size_t srq_size = ndev->srq_size;
>> + struct nvmet_rdma_srq *nsrq;
>> struct ib_srq *srq;
>> - size_t srq_size;
>> int ret, i;
>>
>> - srq_size = 4095; /* XXX: tune */
>> + nsrq = kzalloc(sizeof(*nsrq), GFP_KERNEL);
>> + if (!nsrq)
>> + return ERR_PTR(-ENOMEM);
>>
>> srq_attr.attr.max_wr = srq_size;
>> srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
>> @@ -867,35 +916,71 @@ static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
>> srq_attr.srq_type = IB_SRQT_BASIC;
>> srq = ib_create_srq(ndev->pd, &srq_attr);
>> if (IS_ERR(srq)) {
>> - /*
>> - * If SRQs aren't supported we just go ahead and use normal
>> - * non-shared receive queues.
>> - */
>> - pr_info("SRQ requested but not supported.\n");
>> - return 0;
>> + ret = PTR_ERR(srq);
>> + goto out_free;
>> }
>>
>> - ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
>> - if (IS_ERR(ndev->srq_cmds)) {
>> - ret = PTR_ERR(ndev->srq_cmds);
>> + nsrq->cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
>> + if (IS_ERR(nsrq->cmds)) {
>> + ret = PTR_ERR(nsrq->cmds);
>> goto out_destroy_srq;
>> }
>>
>> - ndev->srq = srq;
>> - ndev->srq_size = srq_size;
>> + nsrq->srq = srq;
>> + nsrq->ndev = ndev;
>>
>> for (i = 0; i < srq_size; i++) {
>> - ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
>> + nsrq->cmds[i].nsrq = nsrq;
>> + ret = nvmet_rdma_post_recv(ndev, &nsrq->cmds[i]);
>> if (ret)
>> goto out_free_cmds;
>> }
>>
>> - return 0;
>> + return nsrq;
>>
>> out_free_cmds:
>> - nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
>> + nvmet_rdma_free_cmds(ndev, nsrq->cmds, srq_size, false);
>> out_destroy_srq:
>> ib_destroy_srq(srq);
>> +out_free:
>> + kfree(nsrq);
>> + return ERR_PTR(ret);
>> +}
>> +
>> +static int nvmet_rdma_init_srqs(struct nvmet_rdma_device *ndev)
>> +{
>> + int i, ret;
>> +
>> + if (!ndev->device->attrs.max_srq_wr || !ndev->device->attrs.max_srq) {
>> + /*
>> + * If SRQs aren't supported we just go ahead and use normal
>> + * non-shared receive queues.
>> + */
>> + pr_info("SRQ requested but not supported.\n");
>> + return 0;
>> + }
>> +
>> + ndev->srq_size = min(ndev->device->attrs.max_srq_wr,
>> + nvmet_rdma_srq_size);
>> + ndev->srq_count = min(ndev->device->num_comp_vectors,
>> + ndev->device->attrs.max_srq);
>> +
>> + ndev->srqs = kcalloc(ndev->srq_count, sizeof(*ndev->srqs), GFP_KERNEL);
>> + if (!ndev->srqs)
>> + return -ENOMEM;
>> +
>> + for (i = 0; i < ndev->srq_count; i++) {
>> + ndev->srqs[i] = nvmet_rdma_init_srq(ndev);
>> + if (IS_ERR(ndev->srqs[i])) {
>> + ret = PTR_ERR(ndev->srqs[i]);
>> + goto err_srq;
>> + }
>> + }
>> +
>> + return 0;
>> +
>> +err_srq:
>> + while (--i >= 0)
>> + nvmet_rdma_destroy_srq(ndev->srqs[i]);
>> + kfree(ndev->srqs);
>> return ret;
>> }
>>
>> @@ -908,7 +993,7 @@ static void nvmet_rdma_free_dev(struct kref *ref)
>> list_del(&ndev->entry);
>> mutex_unlock(&device_list_mutex);
>>
>> - nvmet_rdma_destroy_srq(ndev);
>> + nvmet_rdma_destroy_srqs(ndev);
>> ib_dealloc_pd(ndev->pd);
>>
>> kfree(ndev);
>> @@ -954,7 +1039,7 @@ static void nvmet_rdma_free_dev(struct kref *ref)
>> goto out_free_dev;
>>
>> if (nvmet_rdma_use_srq) {
>> - ret = nvmet_rdma_init_srq(ndev);
>> + ret = nvmet_rdma_init_srqs(ndev);
>> if (ret)
>> goto out_free_pd;
>> }
>> @@ -978,14 +1063,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
>> {
>> struct ib_qp_init_attr qp_attr;
>> struct nvmet_rdma_device *ndev = queue->dev;
>> - int comp_vector, nr_cqe, ret, i, factor;
>> -
>> - /*
>> - * Spread the io queues across completion vectors,
>> - * but still keep all admin queues on vector 0.
>> - */
>> - comp_vector = !queue->host_qid ? 0 :
>> - queue->idx % ndev->device->num_comp_vectors;
>> + int nr_cqe, ret, i, factor;
>>
>> /*
>> * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
>> @@ -993,7 +1071,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
>> nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;
>>
>> queue->cq = ib_alloc_cq(ndev->device, queue,
>> - nr_cqe + 1, comp_vector,
>> + nr_cqe + 1, queue->comp_vector,
>> IB_POLL_WORKQUEUE);
>> if (IS_ERR(queue->cq)) {
>> ret = PTR_ERR(queue->cq);
>> @@ -1017,8 +1095,8 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
>> qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
>> ndev->device->attrs.max_send_sge);
>>
>> - if (ndev->srq) {
>> - qp_attr.srq = ndev->srq;
>> + if (queue->nsrq) {
>> + qp_attr.srq = queue->nsrq->srq;
>> } else {
>> /* +1 for drain */
>> qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
>> @@ -1037,7 +1115,7 @@ static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
>> __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
>> qp_attr.cap.max_send_wr, queue->cm_id);
>>
>> - if (!ndev->srq) {
>> + if (!queue->nsrq) {
>> for (i = 0; i < queue->recv_queue_size; i++) {
>> queue->cmds[i].queue = queue;
>> ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
>> @@ -1073,7 +1151,7 @@ static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
>> nvmet_sq_destroy(&queue->nvme_sq);
>>
>> nvmet_rdma_destroy_queue_ib(queue);
>> - if (!queue->dev->srq) {
>> + if (!queue->nsrq) {
>> nvmet_rdma_free_cmds(queue->dev, queue->cmds,
>> queue->recv_queue_size,
>> !queue->host_qid);
>> @@ -1185,13 +1263,23 @@ static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
>> goto out_destroy_sq;
>> }
>>
>> + /*
>> + * Spread the io queues across completion vectors,
>> + * but still keep all admin queues on vector 0.
>> + */
>> + queue->comp_vector = !queue->host_qid ? 0 :
>> + queue->idx % ndev->device->num_comp_vectors;
>> +
>> +
>> ret = nvmet_rdma_alloc_rsps(queue);
>> if (ret) {
>> ret = NVME_RDMA_CM_NO_RSC;
>> goto out_ida_remove;
>> }
>>
>> - if (!ndev->srq) {
>> + if (ndev->srqs) {
>> + queue->nsrq = ndev->srqs[queue->comp_vector % ndev->srq_count];
>> + } else {
>> queue->cmds = nvmet_rdma_alloc_cmds(ndev,
>> queue->recv_queue_size,
>> !queue->host_qid);
>> @@ -1212,7 +1300,7 @@ static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
>> return queue;
>>
>> out_free_cmds:
>> - if (!ndev->srq) {
>> + if (!queue->nsrq) {
>> nvmet_rdma_free_cmds(queue->dev, queue->cmds,
>> queue->recv_queue_size,
>> !queue->host_qid);
>> @@ -1238,6 +1326,10 @@ static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
>> case IB_EVENT_COMM_EST:
>> rdma_notify(queue->cm_id, event->event);
>> break;
>> + case IB_EVENT_QP_LAST_WQE_REACHED:
>> + pr_debug("received last WQE reached event for
>> queue=0x%p\n",
>> + queue);
>> + break;
>> default:
>> pr_err("received IB QP event: %s (%d)\n",
>> ib_event_msg(event->event), event->event);
> Max,
> I will test this.
> Is this against the latest 5.6?
Laurence,
It's rebased against nvme-5.7-rc1 (git://git.infradead.org/nvme.git).
In order to use it, please load nvmet_rdma with use_srq=Y and
srq_size >= 256 (default: 1024).
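
For example (values here are illustrative; srq_size is registered with
0644 permissions, so the configured value can also be read back at
runtime through sysfs):

  modprobe nvmet_rdma use_srq=Y srq_size=1024
  cat /sys/module/nvmet_rdma/parameters/srq_size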
Thanks for your efforts!
Max.