linux-nfs.vger.kernel.org archive mirror
* [PATCH v1] rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors
@ 2019-07-23 19:13 Chuck Lever
  2019-07-24  5:47 ` Leon Romanovsky
  0 siblings, 1 reply; 6+ messages in thread
From: Chuck Lever @ 2019-07-23 19:13 UTC (permalink / raw)
  To: linux-rdma, linux-nfs

Send and Receive completion is handled on a single CPU selected at
the time each Completion Queue is allocated. Typically this is when
an initiator instantiates an RDMA transport, or when a target
accepts an RDMA connection.

Some ULPs cannot open a connection per CPU to spread completion
workload across available CPUs. For these ULPs, allow the RDMA core
to select a completion vector based on the device's complement of
available comp_vecs.

When a ULP elects to use RDMA_CORE_ANY_COMPVEC, if multiple CPUs are
available, a different CPU will be selected for each Completion
Queue. For the moment, a simple round-robin mechanism is used.

Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
---
 drivers/infiniband/core/cq.c             |   20 +++++++++++++++++++-
 include/rdma/ib_verbs.h                  |    3 +++
 net/sunrpc/xprtrdma/svc_rdma_transport.c |    6 ++++--
 net/sunrpc/xprtrdma/verbs.c              |    5 ++---
 4 files changed, 28 insertions(+), 6 deletions(-)

Jason-

If this patch is acceptable to all, then I would expect you to take
it through the RDMA tree.


diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
index 7c599878ccf7..a89d549490c4 100644
--- a/drivers/infiniband/core/cq.c
+++ b/drivers/infiniband/core/cq.c
@@ -165,12 +165,27 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
 	queue_work(cq->comp_wq, &cq->work);
 }
 
+/*
+ * Attempt to spread ULP completion queues over a device's completion
+ * vectors so that all available CPU cores can help service the device's
+ * interrupt workload. This mechanism may be improved at a later point
+ * to dynamically take into account the system's actual workload.
+ */
+static int ib_get_comp_vector(struct ib_device *dev)
+{
+	static atomic_t cv;
+
+	if (dev->num_comp_vectors > 1)
+		return atomic_inc_return(&cv) % dev->num_comp_vectors;
+	return 0;
+}
+
 /**
  * __ib_alloc_cq_user - allocate a completion queue
  * @dev:		device to allocate the CQ for
  * @private:		driver private data, accessible from cq->cq_context
  * @nr_cqe:		number of CQEs to allocate
- * @comp_vector:	HCA completion vectors for this CQ
+ * @comp_vector:	HCA completion vector for this CQ
  * @poll_ctx:		context to poll the CQ from.
  * @caller:		module owner name.
  * @udata:		Valid user data or NULL for kernel object
@@ -208,6 +223,9 @@ struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 	cq->res.type = RDMA_RESTRACK_CQ;
 	rdma_restrack_set_task(&cq->res, caller);
 
+	if (comp_vector == RDMA_CORE_ANY_COMPVEC)
+		cq_attr.comp_vector = ib_get_comp_vector(dev);
+
 	ret = dev->ops.create_cq(cq, &cq_attr, NULL);
 	if (ret)
 		goto out_free_wc;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index c5f8a9f17063..547d36bcef7e 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -3669,6 +3669,9 @@ static inline int ib_post_recv(struct ib_qp *qp,
 	return qp->device->ops.post_recv(qp, recv_wr, bad_recv_wr ? : &dummy);
 }
 
+/* Tell the RDMA core to select an appropriate comp_vector */
+#define RDMA_CORE_ANY_COMPVEC	((int)(-1))
+
 struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
 				 int nr_cqe, int comp_vector,
 				 enum ib_poll_context poll_ctx,
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 3fe665152d95..7df6de6e9162 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -455,13 +455,15 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 		goto errout;
 	}
 	newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
-					0, IB_POLL_WORKQUEUE);
+					RDMA_CORE_ANY_COMPVEC,
+					IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_sq_cq)) {
 		dprintk("svcrdma: error creating SQ CQ for connect request\n");
 		goto errout;
 	}
 	newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, rq_depth,
-					0, IB_POLL_WORKQUEUE);
+					RDMA_CORE_ANY_COMPVEC,
+					IB_POLL_WORKQUEUE);
 	if (IS_ERR(newxprt->sc_rq_cq)) {
 		dprintk("svcrdma: error creating RQ CQ for connect request\n");
 		goto errout;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 805b1f35e1ca..6e5989e2b8ed 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -523,8 +523,7 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 
 	sendcq = ib_alloc_cq(ia->ri_id->device, NULL,
 			     ep->rep_attr.cap.max_send_wr + 1,
-			     ia->ri_id->device->num_comp_vectors > 1 ? 1 : 0,
-			     IB_POLL_WORKQUEUE);
+			     RDMA_CORE_ANY_COMPVEC, IB_POLL_WORKQUEUE);
 	if (IS_ERR(sendcq)) {
 		rc = PTR_ERR(sendcq);
 		goto out1;
@@ -532,7 +531,7 @@ int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
 
 	recvcq = ib_alloc_cq(ia->ri_id->device, NULL,
 			     ep->rep_attr.cap.max_recv_wr + 1,
-			     0, IB_POLL_WORKQUEUE);
+			     RDMA_CORE_ANY_COMPVEC, IB_POLL_WORKQUEUE);
 	if (IS_ERR(recvcq)) {
 		rc = PTR_ERR(recvcq);
 		goto out2;


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v1] rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors
  2019-07-23 19:13 [PATCH v1] rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors Chuck Lever
@ 2019-07-24  5:47 ` Leon Romanovsky
  2019-07-24 14:01   ` Chuck Lever
  0 siblings, 1 reply; 6+ messages in thread
From: Leon Romanovsky @ 2019-07-24  5:47 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-rdma, linux-nfs

On Tue, Jul 23, 2019 at 03:13:37PM -0400, Chuck Lever wrote:
> Send and Receive completion is handled on a single CPU selected at
> the time each Completion Queue is allocated. Typically this is when
> an initiator instantiates an RDMA transport, or when a target
> accepts an RDMA connection.
>
> Some ULPs cannot open a connection per CPU to spread completion
> workload across available CPUs. For these ULPs, allow the RDMA core
> to select a completion vector based on the device's complement of
> available comp_vecs.
>
> When a ULP elects to use RDMA_CORE_ANY_COMPVEC, if multiple CPUs are
> available, a different CPU will be selected for each Completion
> Queue. For the moment, a simple round-robin mechanism is used.
>
> Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>

It makes me wonder why we need comp_vector as an argument to
ib_alloc_cq at all. From what I see, callers either internally
implement logic similar to the one proposed here, or they don't
care (and set 0).

Can we enable this comp_vector for everyone and simplify our API?

> ---
>  drivers/infiniband/core/cq.c             |   20 +++++++++++++++++++-
>  include/rdma/ib_verbs.h                  |    3 +++
>  net/sunrpc/xprtrdma/svc_rdma_transport.c |    6 ++++--
>  net/sunrpc/xprtrdma/verbs.c              |    5 ++---
>  4 files changed, 28 insertions(+), 6 deletions(-)
>
> Jason-
>
> If this patch is acceptable to all, then I would expect you to take
> it through the RDMA tree.
>
>
> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
> index 7c599878ccf7..a89d549490c4 100644
> --- a/drivers/infiniband/core/cq.c
> +++ b/drivers/infiniband/core/cq.c
> @@ -165,12 +165,27 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
>  	queue_work(cq->comp_wq, &cq->work);
>  }
>
> +/*
> + * Attempt to spread ULP completion queues over a device's completion
> + * vectors so that all available CPU cores can help service the device's
> + * interrupt workload. This mechanism may be improved at a later point
> + * to dynamically take into account the system's actual workload.
> + */
> +static int ib_get_comp_vector(struct ib_device *dev)
> +{
> +	static atomic_t cv;
> +
> +	if (dev->num_comp_vectors > 1)
> +		return atomic_inc_return(&cv) % dev->num_comp_vectors;

It is worth taking num_online_cpus() into account.

Thanks


* Re: [PATCH v1] rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors
  2019-07-24  5:47 ` Leon Romanovsky
@ 2019-07-24 14:01   ` Chuck Lever
  2019-07-25 13:17     ` Leon Romanovsky
  0 siblings, 1 reply; 6+ messages in thread
From: Chuck Lever @ 2019-07-24 14:01 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: linux-rdma, Linux NFS Mailing List

Hi Leon, thanks for taking a look. Responses below.


> On Jul 24, 2019, at 1:47 AM, Leon Romanovsky <leon@kernel.org> wrote:
> 
> On Tue, Jul 23, 2019 at 03:13:37PM -0400, Chuck Lever wrote:
>> Send and Receive completion is handled on a single CPU selected at
>> the time each Completion Queue is allocated. Typically this is when
>> an initiator instantiates an RDMA transport, or when a target
>> accepts an RDMA connection.
>> 
>> Some ULPs cannot open a connection per CPU to spread completion
>> workload across available CPUs. For these ULPs, allow the RDMA core
>> to select a completion vector based on the device's complement of
>> available comp_vecs.
>> 
>> When a ULP elects to use RDMA_CORE_ANY_COMPVEC, if multiple CPUs are
>> available, a different CPU will be selected for each Completion
>> Queue. For the moment, a simple round-robin mechanism is used.
>> 
>> Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> 
> It makes me wonder why we need comp_vector as an argument to
> ib_alloc_cq at all. From what I see, callers either internally
> implement logic similar to the one proposed here, or they don't
> care (and set 0).

The goal of this patch is to deduplicate that "similar logic".
Callers that implement this logic already can use
RDMA_CORE_ANY_COMPVEC and get rid of their own copy.


> Can we enable this comp_vector for everyone and simplify our API?

We could create a new CQ allocation API that does not take a
comp vector. That might be cleaner than passing in a -1.

But I think some ULPs still want to use the existing API to
allocate one CQ for each of a device's comp vectors.


>> ---
>> drivers/infiniband/core/cq.c             |   20 +++++++++++++++++++-
>> include/rdma/ib_verbs.h                  |    3 +++
>> net/sunrpc/xprtrdma/svc_rdma_transport.c |    6 ++++--
>> net/sunrpc/xprtrdma/verbs.c              |    5 ++---
>> 4 files changed, 28 insertions(+), 6 deletions(-)
>> 
>> Jason-
>> 
>> If this patch is acceptable to all, then I would expect you to take
>> it through the RDMA tree.
>> 
>> 
>> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
>> index 7c599878ccf7..a89d549490c4 100644
>> --- a/drivers/infiniband/core/cq.c
>> +++ b/drivers/infiniband/core/cq.c
>> @@ -165,12 +165,27 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
>> 	queue_work(cq->comp_wq, &cq->work);
>> }
>> 
>> +/*
>> + * Attempt to spread ULP completion queues over a device's completion
>> + * vectors so that all available CPU cores can help service the device's
>> + * interrupt workload. This mechanism may be improved at a later point
>> + * to dynamically take into account the system's actual workload.
>> + */
>> +static int ib_get_comp_vector(struct ib_device *dev)
>> +{
>> +	static atomic_t cv;
>> +
>> +	if (dev->num_comp_vectors > 1)
>> +		return atomic_inc_return(&cv) % dev->num_comp_vectors;
> 
> It is worth taking num_online_cpus() into account.

I don't believe it is.

Håkon has convinced me that assigning interrupt vectors to
CPUs is in the domain of user space (ie, driven by policy).
In addition, one assumes that taking a CPU offline properly
will also involve re-assigning interrupt vectors that point
to that core.

In any event, this code can be modified after it is merged
if it is necessary to accommodate such requirements.

--
Chuck Lever





* Re: [PATCH v1] rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors
  2019-07-24 14:01   ` Chuck Lever
@ 2019-07-25 13:17     ` Leon Romanovsky
  2019-07-25 14:03       ` Chuck Lever
  0 siblings, 1 reply; 6+ messages in thread
From: Leon Romanovsky @ 2019-07-25 13:17 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-rdma, Linux NFS Mailing List

On Wed, Jul 24, 2019 at 10:01:36AM -0400, Chuck Lever wrote:
> Hi Leon, thanks for taking a look. Responses below.
>
>
> > On Jul 24, 2019, at 1:47 AM, Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Tue, Jul 23, 2019 at 03:13:37PM -0400, Chuck Lever wrote:
> >> Send and Receive completion is handled on a single CPU selected at
> >> the time each Completion Queue is allocated. Typically this is when
> >> an initiator instantiates an RDMA transport, or when a target
> >> accepts an RDMA connection.
> >>
> >> Some ULPs cannot open a connection per CPU to spread completion
> >> workload across available CPUs. For these ULPs, allow the RDMA core
> >> to select a completion vector based on the device's complement of
> >> available comp_vecs.
> >>
> >> When a ULP elects to use RDMA_CORE_ANY_COMPVEC, if multiple CPUs are
> >> available, a different CPU will be selected for each Completion
> >> Queue. For the moment, a simple round-robin mechanism is used.
> >>
> >> Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
> >> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >
> > It makes me wonder why we need comp_vector as an argument to
> > ib_alloc_cq at all. From what I see, callers either internally
> > implement logic similar to the one proposed here, or they don't
> > care (and set 0).
>
> The goal of this patch is to deduplicate that "similar logic".
> Callers that implement this logic already can use
> RDMA_CORE_ANY_COMPVEC and get rid of their own copy.

Can you please send the removal patches together with this API
proposal? That will ensure that ULP authors notice such changes,
and we won't end up with a special function used by only one ULP.

>
>
> > Can we enable this comp_vector for everyone and simplify our API?
>
> We could create a new CQ allocation API that does not take a
> comp vector. That might be cleaner than passing in a -1.

+1

>
> But I think some ULPs still want to use the existing API to
> allocate one CQ for each of a device's comp vectors.

It can be kept as a "legacy implementation" that is not really
needed, but I don't know enough about those users to say.

>
>
> >> ---
> >> drivers/infiniband/core/cq.c             |   20 +++++++++++++++++++-
> >> include/rdma/ib_verbs.h                  |    3 +++
> >> net/sunrpc/xprtrdma/svc_rdma_transport.c |    6 ++++--
> >> net/sunrpc/xprtrdma/verbs.c              |    5 ++---
> >> 4 files changed, 28 insertions(+), 6 deletions(-)
> >>
> >> Jason-
> >>
> >> If this patch is acceptable to all, then I would expect you to take
> >> it through the RDMA tree.
> >>
> >>
> >> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
> >> index 7c599878ccf7..a89d549490c4 100644
> >> --- a/drivers/infiniband/core/cq.c
> >> +++ b/drivers/infiniband/core/cq.c
> >> @@ -165,12 +165,27 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
> >> 	queue_work(cq->comp_wq, &cq->work);
> >> }
> >>
> >> +/*
> >> + * Attempt to spread ULP completion queues over a device's completion
> >> + * vectors so that all available CPU cores can help service the device's
> >> + * interrupt workload. This mechanism may be improved at a later point
> >> + * to dynamically take into account the system's actual workload.
> >> + */
> >> +static int ib_get_comp_vector(struct ib_device *dev)
> >> +{
> >> +	static atomic_t cv;
> >> +
> >> +	if (dev->num_comp_vectors > 1)
> >> +		return atomic_inc_return(&cv) % dev->num_comp_vectors;
> >
> > It is worth taking num_online_cpus() into account.
>
> I don't believe it is.
>
> Håkon has convinced me that assigning interrupt vectors to
> CPUs is in the domain of user space (ie, driven by policy).
> In addition, one assumes that taking a CPU offline properly
> will also involve re-assigning interrupt vectors that point
> to that core.
>
> In any event, this code can be modified after it is merged
> if it is necessary to accommodate such requirements.

It is a simple change, and it is worth doing now, while we have
interested parties involved.

>
> --
> Chuck Lever
>
>
>


* Re: [PATCH v1] rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors
  2019-07-25 13:17     ` Leon Romanovsky
@ 2019-07-25 14:03       ` Chuck Lever
  2019-07-28 15:05         ` Leon Romanovsky
  0 siblings, 1 reply; 6+ messages in thread
From: Chuck Lever @ 2019-07-25 14:03 UTC (permalink / raw)
  To: Leon Romanovsky; +Cc: linux-rdma, Linux NFS Mailing List



> On Jul 25, 2019, at 9:17 AM, Leon Romanovsky <leon@kernel.org> wrote:
> 
> On Wed, Jul 24, 2019 at 10:01:36AM -0400, Chuck Lever wrote:
>> Hi Leon, thanks for taking a look. Responses below.
>> 
>> 
>>> On Jul 24, 2019, at 1:47 AM, Leon Romanovsky <leon@kernel.org> wrote:
>>> 
>>> On Tue, Jul 23, 2019 at 03:13:37PM -0400, Chuck Lever wrote:
>>>> Send and Receive completion is handled on a single CPU selected at
>>>> the time each Completion Queue is allocated. Typically this is when
>>>> an initiator instantiates an RDMA transport, or when a target
>>>> accepts an RDMA connection.
>>>> 
>>>> Some ULPs cannot open a connection per CPU to spread completion
>>>> workload across available CPUs. For these ULPs, allow the RDMA core
>>>> to select a completion vector based on the device's complement of
>>>> available comp_vecs.
>>>> 
>>>> When a ULP elects to use RDMA_CORE_ANY_COMPVEC, if multiple CPUs are
>>>> available, a different CPU will be selected for each Completion
>>>> Queue. For the moment, a simple round-robin mechanism is used.
>>>> 
>>>> Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
>>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
>>> 
>>> It makes me wonder why we need comp_vector as an argument to
>>> ib_alloc_cq at all. From what I see, callers either internally
>>> implement logic similar to the one proposed here, or they don't
>>> care (and set 0).
>> 
>> The goal of this patch is to deduplicate that "similar logic".
>> Callers that implement this logic already can use
>> RDMA_CORE_ANY_COMPVEC and get rid of their own copy.
> 
> Can you please send the removal patches together with this API
> proposal? That will ensure that ULP authors notice such changes,
> and we won't end up with a special function used by only one ULP.

I prefer that the maintainers of those ULPs make those changes.
It would require testing that I am not in a position to do myself.

I can convert a couple of other ULPs, like cifs and 9p, where the
changes look straightforward; but my understanding was that only
one user of a new API was required for adoption.


>>> Can we enable this comp_vector for everyone and simplify our API?
>> 
>> We could create a new CQ allocation API that does not take a
>> comp vector. That might be cleaner than passing in a -1.
> 
> +1

I'll send a v2 with this suggestion.


>> But I think some ULPs still want to use the existing API to
>> allocate one CQ for each of a device's comp vectors.
> 
> It can be kept as a "legacy implementation" that is not really
> needed, but I don't know enough about those users to say.

Have a look at the iSER initiator. There are legitimate use cases
in the kernel for the current ib_alloc_cq() API.

And don't forget the many users of ib_create_cq that remain in
the kernel.


>>>> ---
>>>> drivers/infiniband/core/cq.c             |   20 +++++++++++++++++++-
>>>> include/rdma/ib_verbs.h                  |    3 +++
>>>> net/sunrpc/xprtrdma/svc_rdma_transport.c |    6 ++++--
>>>> net/sunrpc/xprtrdma/verbs.c              |    5 ++---
>>>> 4 files changed, 28 insertions(+), 6 deletions(-)
>>>> 
>>>> Jason-
>>>> 
>>>> If this patch is acceptable to all, then I would expect you to take
>>>> it through the RDMA tree.
>>>> 
>>>> 
>>>> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
>>>> index 7c599878ccf7..a89d549490c4 100644
>>>> --- a/drivers/infiniband/core/cq.c
>>>> +++ b/drivers/infiniband/core/cq.c
>>>> @@ -165,12 +165,27 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
>>>> 	queue_work(cq->comp_wq, &cq->work);
>>>> }
>>>> 
>>>> +/*
>>>> + * Attempt to spread ULP completion queues over a device's completion
>>>> + * vectors so that all available CPU cores can help service the device's
>>>> + * interrupt workload. This mechanism may be improved at a later point
>>>> + * to dynamically take into account the system's actual workload.
>>>> + */
>>>> +static int ib_get_comp_vector(struct ib_device *dev)
>>>> +{
>>>> +	static atomic_t cv;
>>>> +
>>>> +	if (dev->num_comp_vectors > 1)
>>>> +		return atomic_inc_return(&cv) % dev->num_comp_vectors;
>>> 
>>> It is worth taking num_online_cpus() into account.
>> 
>> I don't believe it is.
>> 
>> Håkon has convinced me that assigning interrupt vectors to
>> CPUs is in the domain of user space (ie, driven by policy).
>> In addition, one assumes that taking a CPU offline properly
>> will also involve re-assigning interrupt vectors that point
>> to that core.
>> 
>> In any event, this code can be modified after it is merged
>> if it is necessary to accommodate such requirements.
> 
> It is a simple change, and it is worth doing now, while we have
> interested parties involved.

Can you propose some code, or point out an example of how you
would prefer it to work?


--
Chuck Lever





* Re: [PATCH v1] rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors
  2019-07-25 14:03       ` Chuck Lever
@ 2019-07-28 15:05         ` Leon Romanovsky
  0 siblings, 0 replies; 6+ messages in thread
From: Leon Romanovsky @ 2019-07-28 15:05 UTC (permalink / raw)
  To: Chuck Lever; +Cc: linux-rdma, Linux NFS Mailing List

On Thu, Jul 25, 2019 at 10:03:13AM -0400, Chuck Lever wrote:
>
>
> > On Jul 25, 2019, at 9:17 AM, Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Wed, Jul 24, 2019 at 10:01:36AM -0400, Chuck Lever wrote:
> >> Hi Leon, thanks for taking a look. Responses below.
> >>
> >>
> >>> On Jul 24, 2019, at 1:47 AM, Leon Romanovsky <leon@kernel.org> wrote:
> >>>
> >>> On Tue, Jul 23, 2019 at 03:13:37PM -0400, Chuck Lever wrote:
> >>>> Send and Receive completion is handled on a single CPU selected at
> >>>> the time each Completion Queue is allocated. Typically this is when
> >>>> an initiator instantiates an RDMA transport, or when a target
> >>>> accepts an RDMA connection.
> >>>>
> >>>> Some ULPs cannot open a connection per CPU to spread completion
> >>>> workload across available CPUs. For these ULPs, allow the RDMA core
> >>>> to select a completion vector based on the device's complement of
> >>>> available comp_vecs.
> >>>>
> >>>> When a ULP elects to use RDMA_CORE_ANY_COMPVEC, if multiple CPUs are
> >>>> available, a different CPU will be selected for each Completion
> >>>> Queue. For the moment, a simple round-robin mechanism is used.
> >>>>
> >>>> Suggested-by: Håkon Bugge <haakon.bugge@oracle.com>
> >>>> Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
> >>>
> >>> It makes me wonder why we need comp_vector as an argument to
> >>> ib_alloc_cq at all. From what I see, callers either internally
> >>> implement logic similar to the one proposed here, or they don't
> >>> care (and set 0).
> >>
> >> The goal of this patch is to deduplicate that "similar logic".
> >> Callers that implement this logic already can use
> >> RDMA_CORE_ANY_COMPVEC and get rid of their own copy.
> >
> > Can you please send the removal patches together with this API
> > proposal? That will ensure that ULP authors notice such changes,
> > and we won't end up with a special function used by only one ULP.
>
> I prefer that the maintainers of those ULPs make those changes.
> It would require testing that I am not in a position to do myself.
>
> I can convert a couple of other ULPs, like cifs and 9p, where the
> changes look straightforward; but my understanding was that only
> one user of a new API was required for adoption.
>
>
> >>> Can we enable this comp_vector for everyone and simplify our API?
> >>
> >> We could create a new CQ allocation API that does not take a
> >> comp vector. That might be cleaner than passing in a -1.
> >
> > +1
>
> I'll send a v2 with this suggestion.
>
>
> >> But I think some ULPs still want to use the existing API to
> >> allocate one CQ for each of a device's comp vectors.
> >
> > It can be kept as a "legacy implementation" that is not really
> > needed, but I don't know enough about those users to say.
>
> Have a look at the iSER initiator. There are legitimate use cases
> in the kernel for the current ib_alloc_cq() API.
>
> And don't forget the many users of ib_create_cq that remain in
> the kernel.
>
>
> >>>> ---
> >>>> drivers/infiniband/core/cq.c             |   20 +++++++++++++++++++-
> >>>> include/rdma/ib_verbs.h                  |    3 +++
> >>>> net/sunrpc/xprtrdma/svc_rdma_transport.c |    6 ++++--
> >>>> net/sunrpc/xprtrdma/verbs.c              |    5 ++---
> >>>> 4 files changed, 28 insertions(+), 6 deletions(-)
> >>>>
> >>>> Jason-
> >>>>
> >>>> If this patch is acceptable to all, then I would expect you to take
> >>>> it through the RDMA tree.
> >>>>
> >>>>
> >>>> diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c
> >>>> index 7c599878ccf7..a89d549490c4 100644
> >>>> --- a/drivers/infiniband/core/cq.c
> >>>> +++ b/drivers/infiniband/core/cq.c
> >>>> @@ -165,12 +165,27 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
> >>>> 	queue_work(cq->comp_wq, &cq->work);
> >>>> }
> >>>>
> >>>> +/*
> >>>> + * Attempt to spread ULP completion queues over a device's completion
> >>>> + * vectors so that all available CPU cores can help service the device's
> >>>> + * interrupt workload. This mechanism may be improved at a later point
> >>>> + * to dynamically take into account the system's actual workload.
> >>>> + */
> >>>> +static int ib_get_comp_vector(struct ib_device *dev)
> >>>> +{
> >>>> +	static atomic_t cv;
> >>>> +
> >>>> +	if (dev->num_comp_vectors > 1)
> >>>> +		return atomic_inc_return(&cv) % dev->num_comp_vectors;
> >>>
> >>> It is worth taking num_online_cpus() into account.
> >>
> >> I don't believe it is.
> >>
> >> Håkon has convinced me that assigning interrupt vectors to
> >> CPUs is in the domain of user space (ie, driven by policy).
> >> In addition, one assumes that taking a CPU offline properly
> >> will also involve re-assigning interrupt vectors that point
> >> to that core.
> >>
> >> In any event, this code can be modified after it is merged
> >> if it is necessary to accommodate such requirements.
> >
> > It is a simple change, and it is worth doing now, while we have
> > interested parties involved.
>
> Can you propose some code, or point out an example of how you
> would prefer it to work?
>

I had in mind drivers/infiniband/ulp/iser/iser_verbs.c

 77         device->comps_used = min_t(int, num_online_cpus(),
 78                                  ib_dev->num_comp_vectors);

>
> --
> Chuck Lever
>
>
>


Thread overview: 6+ messages
2019-07-23 19:13 [PATCH v1] rdma: Enable ib_alloc_cq to spread work over a device's comp_vectors Chuck Lever
2019-07-24  5:47 ` Leon Romanovsky
2019-07-24 14:01   ` Chuck Lever
2019-07-25 13:17     ` Leon Romanovsky
2019-07-25 14:03       ` Chuck Lever
2019-07-28 15:05         ` Leon Romanovsky
