Linux-RDMA Archive on lore.kernel.org
* [PATCH for-rc 0/5] Add CM packets missing and harden the proxying
@ 2020-07-20 12:22 Håkon Bugge
  2020-07-20 12:22 ` [PATCH for-rc 1/5] IB/mlx4: Add and improve logging Håkon Bugge
                   ` (4 more replies)
  0 siblings, 5 replies; 7+ messages in thread
From: Håkon Bugge @ 2020-07-20 12:22 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma, Yishai Hadas, Jack Morgenstein

A high number of MAD packet drops has been observed in the mlx4 MAD
proxy system. These drops are fixed by separating the buffer
parameters for the tunnel vs. the wire QPs and by introducing a
separate worker thread for the wire QPs.

Support is also added for MRA packets and for REJ packets whose
reason is timeout.

Dynamic debug prints are adjusted and amended.

Håkon Bugge (5):
  IB/mlx4: Add and improve logging
  IB/mlx4: Add support for MRA
  IB/mlx4: Separate tunnel and wire bufs parameters
  IB/mlx4: Fix starvation in paravirt mux/demux
  IB/mlx4: Add support for REJ due to timeout

 drivers/infiniband/hw/mlx4/cm.c      | 143 ++++++++++++++++++++++++++-
 drivers/infiniband/hw/mlx4/mad.c     | 121 +++++++++++++----------
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   6 ++
 3 files changed, 211 insertions(+), 59 deletions(-)

--
2.20.1



* [PATCH for-rc 1/5] IB/mlx4: Add and improve logging
  2020-07-20 12:22 [PATCH for-rc 0/5] Add CM packets missing and harden the proxying Håkon Bugge
@ 2020-07-20 12:22 ` Håkon Bugge
  2020-07-20 12:22 ` [PATCH for-rc 2/5] IB/mlx4: Add support for MRA Håkon Bugge
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Håkon Bugge @ 2020-07-20 12:22 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma, Yishai Hadas, Jack Morgenstein

Add a missing check of the return value from the call to
mlx4_ib_send_to_wire() in mlx4_ib_multiplex_mad().

Amend the existing pr_debug() in mlx4_ib_multiplex_cm_handler() and
mlx4_ib_demux_cm_handler() to include the attr_id on a lookup failure.

Remove two noisy pr_debug() calls in mad.c.

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
---
 drivers/infiniband/hw/mlx4/cm.c  |  7 +++---
 drivers/infiniband/hw/mlx4/mad.c | 43 +++++++++-----------------------
 2 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
index b591861934b3..302ea7ec2008 100644
--- a/drivers/infiniband/hw/mlx4/cm.c
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -314,8 +314,8 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id
 	}
 
 	if (!id) {
-		pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n",
-			 slave_id, sl_cm_id);
+		pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL! attr_id: 0x%x\n",
+			 slave_id, sl_cm_id, be16_to_cpu(mad->mad_hdr.attr_id));
 		return -EINVAL;
 	}
 
@@ -354,7 +354,8 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
 	id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1);
 
 	if (!id) {
-		pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id);
+		pr_debug("Couldn't find an entry for pv_cm_id 0x%x, attr_id 0x%x\n",
+			 pv_cm_id, be16_to_cpu(mad->mad_hdr.attr_id));
 		return -ENOENT;
 	}
 
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index abe68708d6d6..04316ba55a8c 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -807,27 +807,6 @@ static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
 	int err;
 	struct ib_port_attr pattr;
 
-	if (in_wc && in_wc->qp) {
-		pr_debug("received MAD: port:%d slid:%d sqpn:%d "
-			 "dlid_bits:%d dqpn:%d wc_flags:0x%x tid:%016llx cls:%x mtd:%x atr:%x\n",
-			 port_num,
-			 in_wc->slid, in_wc->src_qp,
-			 in_wc->dlid_path_bits,
-			 in_wc->qp->qp_num,
-			 in_wc->wc_flags,
-			 be64_to_cpu(in_mad->mad_hdr.tid),
-			 in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
-			 be16_to_cpu(in_mad->mad_hdr.attr_id));
-		if (in_wc->wc_flags & IB_WC_GRH) {
-			pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n",
-				 be64_to_cpu(in_grh->sgid.global.subnet_prefix),
-				 be64_to_cpu(in_grh->sgid.global.interface_id));
-			pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n",
-				 be64_to_cpu(in_grh->dgid.global.subnet_prefix),
-				 be64_to_cpu(in_grh->dgid.global.interface_id));
-		}
-	}
-
 	slid = in_wc ? ib_lid_cpu16(in_wc->slid) : be16_to_cpu(IB_LID_PERMISSIVE);
 
 	if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
@@ -1484,6 +1463,7 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc
 	u16 vlan_id;
 	u8 qos;
 	u8 *dmac;
+	int sts;
 
 	/* Get slave that sent this packet */
 	if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn ||
@@ -1580,13 +1560,17 @@ static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc
 					&vlan_id, &qos))
 		rdma_ah_set_sl(&ah_attr, qos);
 
-	mlx4_ib_send_to_wire(dev, slave, ctx->port,
-			     is_proxy_qp0(dev, wc->src_qp, slave) ?
-			     IB_QPT_SMI : IB_QPT_GSI,
-			     be16_to_cpu(tunnel->hdr.pkey_index),
-			     be32_to_cpu(tunnel->hdr.remote_qpn),
-			     be32_to_cpu(tunnel->hdr.qkey),
-			     &ah_attr, wc->smac, vlan_id, &tunnel->mad);
+	sts = mlx4_ib_send_to_wire(dev, slave, ctx->port,
+				   is_proxy_qp0(dev, wc->src_qp, slave) ?
+				   IB_QPT_SMI : IB_QPT_GSI,
+				   be16_to_cpu(tunnel->hdr.pkey_index),
+				   be32_to_cpu(tunnel->hdr.remote_qpn),
+				   be32_to_cpu(tunnel->hdr.qkey),
+				   &ah_attr, wc->smac, vlan_id, &tunnel->mad);
+	if (sts)
+		pr_debug("failed sending %s to wire on behalf of slave %d (%d)\n",
+			 is_proxy_qp0(dev, wc->src_qp, slave) ? "SMI" : "GSI",
+			 slave, sts);
 }
 
 static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
@@ -1744,9 +1728,6 @@ static void mlx4_ib_tunnel_comp_worker(struct work_struct *work)
 					       "buf:%lld\n", wc.wr_id);
 				break;
 			case IB_WC_SEND:
-				pr_debug("received tunnel send completion:"
-					 "wrid=0x%llx, status=0x%x\n",
-					 wc.wr_id, wc.status);
 				rdma_destroy_ah(tun_qp->tx_ring[wc.wr_id &
 					      (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
 				tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
-- 
2.20.1



* [PATCH for-rc 2/5] IB/mlx4: Add support for MRA
  2020-07-20 12:22 [PATCH for-rc 0/5] Add CM packets missing and harden the proxying Håkon Bugge
  2020-07-20 12:22 ` [PATCH for-rc 1/5] IB/mlx4: Add and improve logging Håkon Bugge
@ 2020-07-20 12:22 ` Håkon Bugge
  2020-07-20 12:22 ` [PATCH for-rc 3/5] IB/mlx4: Separate tunnel and wire bufs parameters Håkon Bugge
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 7+ messages in thread
From: Håkon Bugge @ 2020-07-20 12:22 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma, Yishai Hadas, Jack Morgenstein

Using CX-3 in virtualized mode, MAD packets are proxied through the PF
driver. However, the handling lacks support for the MRA (Message
Receipt Acknowledgment) packet. With dynamic debug enabled, we see a
flood of:

mlx4_ib_multiplex_cm_handler: id{slave: 7, sl_cm_id: 0x8fcb45a0} is NULL! attr_id: 0x11

(attr_id 0x11 is the IB CM attribute ID of the MRA message.)

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
---
 drivers/infiniband/hw/mlx4/cm.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
index 302ea7ec2008..6f0ffd0906e6 100644
--- a/drivers/infiniband/hw/mlx4/cm.c
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -293,8 +293,9 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id
 	int pv_cm_id = -1;
 
 	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
-			mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
-			mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+	    mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
+	    mad->mad_hdr.attr_id == CM_MRA_ATTR_ID ||
+	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
 		sl_cm_id = get_local_comm_id(mad);
 		id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
 		if (id)
-- 
2.20.1



* [PATCH for-rc 3/5] IB/mlx4: Separate tunnel and wire bufs parameters
  2020-07-20 12:22 [PATCH for-rc 0/5] Add CM packets missing and harden the proxying Håkon Bugge
  2020-07-20 12:22 ` [PATCH for-rc 1/5] IB/mlx4: Add and improve logging Håkon Bugge
  2020-07-20 12:22 ` [PATCH for-rc 2/5] IB/mlx4: Add support for MRA Håkon Bugge
@ 2020-07-20 12:22 ` Håkon Bugge
  2020-07-20 12:22 ` [PATCH for-rc 4/5] IB/mlx4: Fix starvation in paravirt mux/demux Håkon Bugge
  2020-07-20 12:22 ` [PATCH for-rc 5/5] IB/mlx4: Add support for REJ due to timeout Håkon Bugge
  4 siblings, 0 replies; 7+ messages in thread
From: Håkon Bugge @ 2020-07-20 12:22 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma, Yishai Hadas, Jack Morgenstein

Using CX-3 in virtualized mode, MAD packets are proxied through the PF
driver. The feed is N tunnel QPs, and what is received from the VFs is
multiplexed out on the wire QP. Since this is many-to-one, it is
better to have separate initialization parameters for the two usages.

Increase the number of wire buffers to 2K (MLX4_NUM_WIRE_BUFS).
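
The buffer counts must stay powers of two, because the tx-ring
indices wrap by masking rather than by modulo. A minimal sketch of
the indexing scheme, with illustrative names (the driver does the
equivalent in mlx4_ib_send_to_wire() under sqp->tx_lock):

    /* num_bufs must be a power of two for the mask to wrap correctly */
    static int ring_claim_slot(u32 *head, u32 tail, u32 num_bufs)
    {
            if (*head - tail >= num_bufs - 1)
                    return -EAGAIN;                 /* ring full */
            return ++*head & (num_bufs - 1);        /* next free slot */
    }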

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
---
 drivers/infiniband/hw/mlx4/mad.c     | 44 +++++++++++++++-------------
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  1 +
 2 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 04316ba55a8c..336b894de7cf 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -1380,10 +1380,10 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
 
 	spin_lock(&sqp->tx_lock);
 	if (sqp->tx_ix_head - sqp->tx_ix_tail >=
-	    (MLX4_NUM_TUNNEL_BUFS - 1))
+	    (MLX4_NUM_WIRE_BUFS - 1))
 		ret = -EAGAIN;
 	else
-		wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+		wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_WIRE_BUFS - 1);
 	spin_unlock(&sqp->tx_lock);
 	if (ret)
 		goto out;
@@ -1579,19 +1579,20 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
 	int i;
 	struct mlx4_ib_demux_pv_qp *tun_qp;
 	int rx_buf_size, tx_buf_size;
+	const int nmbr_bufs = is_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
 
 	if (qp_type > IB_QPT_GSI)
 		return -EINVAL;
 
 	tun_qp = &ctx->qp[qp_type];
 
-	tun_qp->ring = kcalloc(MLX4_NUM_TUNNEL_BUFS,
+	tun_qp->ring = kcalloc(nmbr_bufs,
 			       sizeof(struct mlx4_ib_buf),
 			       GFP_KERNEL);
 	if (!tun_qp->ring)
 		return -ENOMEM;
 
-	tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS,
+	tun_qp->tx_ring = kcalloc(nmbr_bufs,
 				  sizeof (struct mlx4_ib_tun_tx_buf),
 				  GFP_KERNEL);
 	if (!tun_qp->tx_ring) {
@@ -1608,7 +1609,7 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
 		tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
 	}
 
-	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+	for (i = 0; i < nmbr_bufs; i++) {
 		tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL);
 		if (!tun_qp->ring[i].addr)
 			goto err;
@@ -1622,7 +1623,7 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
 		}
 	}
 
-	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+	for (i = 0; i < nmbr_bufs; i++) {
 		tun_qp->tx_ring[i].buf.addr =
 			kmalloc(tx_buf_size, GFP_KERNEL);
 		if (!tun_qp->tx_ring[i].buf.addr)
@@ -1653,7 +1654,7 @@ static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
 				    tx_buf_size, DMA_TO_DEVICE);
 		kfree(tun_qp->tx_ring[i].buf.addr);
 	}
-	i = MLX4_NUM_TUNNEL_BUFS;
+	i = nmbr_bufs;
 err:
 	while (i > 0) {
 		--i;
@@ -1674,6 +1675,7 @@ static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
 	int i;
 	struct mlx4_ib_demux_pv_qp *tun_qp;
 	int rx_buf_size, tx_buf_size;
+	const int nmbr_bufs = is_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
 
 	if (qp_type > IB_QPT_GSI)
 		return;
@@ -1688,13 +1690,13 @@ static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
 	}
 
 
-	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+	for (i = 0; i < nmbr_bufs; i++) {
 		ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
 				    rx_buf_size, DMA_FROM_DEVICE);
 		kfree(tun_qp->ring[i].addr);
 	}
 
-	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+	for (i = 0; i < nmbr_bufs; i++) {
 		ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
 				    tx_buf_size, DMA_TO_DEVICE);
 		kfree(tun_qp->tx_ring[i].buf.addr);
@@ -1774,6 +1776,7 @@ static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx,
 	struct mlx4_ib_qp_tunnel_init_attr qp_init_attr;
 	struct ib_qp_attr attr;
 	int qp_attr_mask_INIT;
+	const int nmbr_bufs = create_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
 
 	if (qp_type > IB_QPT_GSI)
 		return -EINVAL;
@@ -1784,8 +1787,8 @@ static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx,
 	qp_init_attr.init_attr.send_cq = ctx->cq;
 	qp_init_attr.init_attr.recv_cq = ctx->cq;
 	qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
-	qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS;
-	qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS;
+	qp_init_attr.init_attr.cap.max_send_wr = nmbr_bufs;
+	qp_init_attr.init_attr.cap.max_recv_wr = nmbr_bufs;
 	qp_init_attr.init_attr.cap.max_send_sge = 1;
 	qp_init_attr.init_attr.cap.max_recv_sge = 1;
 	if (create_tun) {
@@ -1847,7 +1850,7 @@ static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx,
 		goto err_qp;
 	}
 
-	for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+	for (i = 0; i < nmbr_bufs; i++) {
 		ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i);
 		if (ret) {
 			pr_err(" mlx4_ib_post_pv_buf error"
@@ -1883,8 +1886,8 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
 			switch (wc.opcode) {
 			case IB_WC_SEND:
 				kfree(sqp->tx_ring[wc.wr_id &
-				      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
-				sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+				      (MLX4_NUM_WIRE_BUFS - 1)].ah);
+				sqp->tx_ring[wc.wr_id & (MLX4_NUM_WIRE_BUFS - 1)].ah
 					= NULL;
 				spin_lock(&sqp->tx_lock);
 				sqp->tx_ix_tail++;
@@ -1893,13 +1896,13 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
 			case IB_WC_RECV:
 				mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *)
 						(sqp->ring[wc.wr_id &
-						(MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload);
+						(MLX4_NUM_WIRE_BUFS - 1)].addr))->payload);
 				grh = &(((struct mlx4_mad_rcv_buf *)
 						(sqp->ring[wc.wr_id &
-						(MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh);
+						(MLX4_NUM_WIRE_BUFS - 1)].addr))->grh);
 				mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad);
 				if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id &
-							   (MLX4_NUM_TUNNEL_BUFS - 1)))
+							   (MLX4_NUM_WIRE_BUFS - 1)))
 					pr_err("Failed reposting SQP "
 					       "buf:%lld\n", wc.wr_id);
 				break;
@@ -1912,8 +1915,8 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
 				 ctx->slave, wc.status, wc.wr_id);
 			if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
 				kfree(sqp->tx_ring[wc.wr_id &
-				      (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
-				sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+				      (MLX4_NUM_WIRE_BUFS - 1)].ah);
+				sqp->tx_ring[wc.wr_id & (MLX4_NUM_WIRE_BUFS - 1)].ah
 					= NULL;
 				spin_lock(&sqp->tx_lock);
 				sqp->tx_ix_tail++;
@@ -1953,6 +1956,7 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
 {
 	int ret, cq_size;
 	struct ib_cq_init_attr cq_attr = {};
+	const int nmbr_bufs = create_tun ? MLX4_NUM_TUNNEL_BUFS : MLX4_NUM_WIRE_BUFS;
 
 	if (ctx->state != DEMUX_PV_STATE_DOWN)
 		return -EEXIST;
@@ -1977,7 +1981,7 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
 		goto err_out_qp0;
 	}
 
-	cq_size = 2 * MLX4_NUM_TUNNEL_BUFS;
+	cq_size = 2 * nmbr_bufs;
 	if (ctx->has_smi)
 		cq_size *= 2;
 
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 6f4ea1067095..8999fecb045b 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -234,6 +234,7 @@ enum mlx4_ib_mad_ifc_flags {
 
 enum {
 	MLX4_NUM_TUNNEL_BUFS		= 256,
+	MLX4_NUM_WIRE_BUFS		= 2048,
 };
 
 struct mlx4_ib_tunnel_header {
-- 
2.20.1



* [PATCH for-rc 4/5] IB/mlx4: Fix starvation in paravirt mux/demux
  2020-07-20 12:22 [PATCH for-rc 0/5] Add CM packets missing and harden the proxying Håkon Bugge
                   ` (2 preceding siblings ...)
  2020-07-20 12:22 ` [PATCH for-rc 3/5] IB/mlx4: Separate tunnel and wire bufs parameters Håkon Bugge
@ 2020-07-20 12:22 ` Håkon Bugge
  2020-07-20 12:22 ` [PATCH for-rc 5/5] IB/mlx4: Add support for REJ due to timeout Håkon Bugge
  4 siblings, 0 replies; 7+ messages in thread
From: Håkon Bugge @ 2020-07-20 12:22 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma, Yishai Hadas, Jack Morgenstein

The mlx4 driver will proxy MAD packets through the PF driver. A VM or
an instantiated VF will send its MAD packets to the PF driver using
loop-back. The PF driver will be informed by an interrupt, but defer
the handling and polling of CQEs to a worker thread running on an
ordered work-queue.
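
In outline, the deferral works like this (a simplified sketch of the
pattern, not the driver's exact code):

    static void comp_handler(struct ib_cq *cq, void *arg)
    {
            struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;

            /* interrupt context: just kick the worker */
            queue_work(ctx->wq, &ctx->work);
    }

    static void comp_worker(struct work_struct *work)
    {
            struct mlx4_ib_demux_pv_ctx *ctx =
                    container_of(work, struct mlx4_ib_demux_pv_ctx, work);
            struct ib_wc wc;

            /* re-arm the CQ, then drain it */
            ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
            while (ib_poll_cq(ctx->cq, 1, &wc) == 1) {
                    /* mux/demux the MAD described by wc */
            }
    }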

Consider the following scenario: the VMs, in close proximity in time,
for example due to a network event, send many MAD packets to the PF
driver. Let's say there are K VMs, each sending N packets.

The interrupt from the first VM will start the worker thread, which
will poll N CQEs. A common case here is where the PF driver will
multiplex the packets received from the VMs out on the wire QP.

But before the wire QP has returned a send CQE and associated
interrupt, the other K - 1 VMs have sent their N packets.

The PF driver will have to multiplex K * N packets out on the wire
QP. But the send-queue on the wire QP has a finite capacity.

So, in this scenario, if K * N is larger than the send-queue capacity
of the wire QP, we will get MAD packets dropped on the floor with this
dynamic debug message:

mlx4_ib_multiplex_mad: failed sending GSI to wire on behalf of slave 2 (-11)

and this despite the fact that the wire send-queue could have
capacity, but the PF driver isn't aware, because the wire send CQEs
have not yet been polled.
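
As an illustrative example (numbers assumed, not taken from a
measurement): with K = 16 VMs each sending N = 200 MADs, 3200 sends
contend for a wire send-queue of MLX4_NUM_WIRE_BUFS = 2048 entries,
so more than a thousand of them hit the -EAGAIN path in
mlx4_ib_send_to_wire() before the first send completions have been
polled.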

We can also have a similar scenario inbound, with a wire recv-queue
larger than the tunnel QP's recv-queue. If many remote peers send MAD
packets to the very same VM, the tunnel send-queue destined for that
VM could overflow.

This starvation is fixed by introducing separate work queues for the
wire QPs vs. the tunnel QPs.

With this fix, using a dual-ported HCA with 8 VFs instantiated, we
could run cmtime on each of the 18 interfaces towards a similarly
configured peer, each cmtime instance with 800 QPs (14400 QPs in
all), without a single CM packet getting lost.

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
---
 drivers/infiniband/hw/mlx4/mad.c     | 34 +++++++++++++++++++++++++---
 drivers/infiniband/hw/mlx4/mlx4_ib.h |  2 ++
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
index 336b894de7cf..28bf8ddb019e 100644
--- a/drivers/infiniband/hw/mlx4/mad.c
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -1278,6 +1278,18 @@ static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg)
 	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
 }
 
+static void mlx4_ib_wire_comp_handler(struct ib_cq *cq, void *arg)
+{
+	unsigned long flags;
+	struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;
+	struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+
+	spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+	if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE)
+		queue_work(ctx->wi_wq, &ctx->work);
+	spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
 static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
 				  struct mlx4_ib_demux_pv_qp *tun_qp,
 				  int index)
@@ -1986,7 +1998,8 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
 		cq_size *= 2;
 
 	cq_attr.cqe = cq_size;
-	ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler,
+	ctx->cq = ib_create_cq(ctx->ib_dev,
+			       create_tun ? mlx4_ib_tunnel_comp_handler : mlx4_ib_wire_comp_handler,
 			       NULL, ctx, &cq_attr);
 	if (IS_ERR(ctx->cq)) {
 		ret = PTR_ERR(ctx->cq);
@@ -2023,6 +2036,7 @@ static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
 		INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker);
 
 	ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq;
+	ctx->wi_wq = to_mdev(ibdev)->sriov.demux[port - 1].wi_wq;
 
 	ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
 	if (ret) {
@@ -2166,7 +2180,7 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
 		goto err_mcg;
 	}
 
-	snprintf(name, sizeof name, "mlx4_ibt%d", port);
+	snprintf(name, sizeof(name), "mlx4_ibt%d", port);
 	ctx->wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
 	if (!ctx->wq) {
 		pr_err("Failed to create tunnelling WQ for port %d\n", port);
@@ -2174,7 +2188,15 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
 		goto err_wq;
 	}
 
-	snprintf(name, sizeof name, "mlx4_ibud%d", port);
+	snprintf(name, sizeof(name), "mlx4_ibwi%d", port);
+	ctx->wi_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
+	if (!ctx->wi_wq) {
+		pr_err("Failed to create wire WQ for port %d\n", port);
+		ret = -ENOMEM;
+		goto err_wiwq;
+	}
+
+	snprintf(name, sizeof(name), "mlx4_ibud%d", port);
 	ctx->ud_wq = alloc_ordered_workqueue(name, WQ_MEM_RECLAIM);
 	if (!ctx->ud_wq) {
 		pr_err("Failed to create up/down WQ for port %d\n", port);
@@ -2185,6 +2207,10 @@ static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
 	return 0;
 
 err_udwq:
+	destroy_workqueue(ctx->wi_wq);
+	ctx->wi_wq = NULL;
+
+err_wiwq:
 	destroy_workqueue(ctx->wq);
 	ctx->wq = NULL;
 
@@ -2232,12 +2258,14 @@ static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx)
 				ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING;
 		}
 		flush_workqueue(ctx->wq);
+		flush_workqueue(ctx->wi_wq);
 		for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
 			destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0);
 			free_pv_object(dev, i, ctx->port);
 		}
 		kfree(ctx->tun);
 		destroy_workqueue(ctx->ud_wq);
+		destroy_workqueue(ctx->wi_wq);
 		destroy_workqueue(ctx->wq);
 	}
 }
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 8999fecb045b..20cfa69d0aed 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -455,6 +455,7 @@ struct mlx4_ib_demux_pv_ctx {
 	struct ib_pd *pd;
 	struct work_struct work;
 	struct workqueue_struct *wq;
+	struct workqueue_struct *wi_wq;
 	struct mlx4_ib_demux_pv_qp qp[2];
 };
 
@@ -462,6 +463,7 @@ struct mlx4_ib_demux_ctx {
 	struct ib_device *ib_dev;
 	int port;
 	struct workqueue_struct *wq;
+	struct workqueue_struct *wi_wq;
 	struct workqueue_struct *ud_wq;
 	spinlock_t ud_lock;
 	atomic64_t subnet_prefix;
-- 
2.20.1



* [PATCH for-rc 5/5] IB/mlx4: Add support for REJ due to timeout
  2020-07-20 12:22 [PATCH for-rc 0/5] Add CM packets missing and harden the proxying Håkon Bugge
                   ` (3 preceding siblings ...)
  2020-07-20 12:22 ` [PATCH for-rc 4/5] IB/mlx4: Fix starvation in paravirt mux/demux Håkon Bugge
@ 2020-07-20 12:22 ` Håkon Bugge
  2020-07-23 15:32   ` Håkon Bugge
  4 siblings, 1 reply; 7+ messages in thread
From: Håkon Bugge @ 2020-07-20 12:22 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe; +Cc: linux-rdma, Yishai Hadas, Jack Morgenstein

A CM REJ packet with its reason equal to timeout is a special beast in
the sense that it doesn't have a Remote Communication ID nor does it
have a Remote Port GID.

Using CX-3 virtual functions, either from a bare-metal machine or
pass-through from a VM, MAD packets are proxied through the PF driver.

Since the VF drivers have separate name spaces for MAD Transaction Ids
(TIDs), the PF driver has to re-map the TIDs and keep the bookkeeping
in a cache.

This proxying does not handle said REJ packets.

If the active side abandons its connection attempt after having sent a
REQ, it will send a REJ with the reason being timeout. This can be
provoked by a simple user-verbs program, which ends up doing:

    rdma_connect(cm_id, &conn_param);
    rdma_destroy_id(cm_id);

using the async librdmacm API.
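
A fuller sketch of such a reproducer, with error handling and address
resolution elided and all names illustrative (an assumption of what
the program looks like, not a verbatim test case):

    struct rdma_event_channel *ch = rdma_create_event_channel();
    struct rdma_conn_param conn_param = {};
    struct rdma_cm_id *cm_id;

    rdma_create_id(ch, &cm_id, NULL, RDMA_PS_TCP);
    rdma_resolve_addr(cm_id, NULL, dst_addr, 2000);
    /* wait for RDMA_CM_EVENT_ADDR_RESOLVED on ch */
    rdma_resolve_route(cm_id, 2000);
    /* wait for RDMA_CM_EVENT_ROUTE_RESOLVED, create a QP */
    rdma_connect(cm_id, &conn_param);  /* REQ goes out on the wire */
    rdma_destroy_id(cm_id);            /* abandon => REJ (timeout) */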

With dynamic debug prints enabled in the mlx4_ib driver, we will
then see:

mlx4_ib_demux_cm_handler: Couldn't find an entry for pv_cm_id 0x0, attr_id 0x12

The solution is to introduce a radix-tree. When a REQ packet is
received and handled in mlx4_ib_demux_cm_handler(), we know the
connecting peer's para-virtual cm_id and the destination slave. We
then insert an entry into the tree with said information. We also
schedule work to remove this entry from the tree and free it, in
order to avoid a memory leak.

When a REJ packet with reason timeout is received, we can look up the
slave in the tree, and deliver the packet to the correct slave.

When cleaning up, we simply traverse the tree and modify any delayed
work to use a zero delay. A subsequent flush of the system_wq ensures
that all entries are wiped out.

Signed-off-by: Håkon Bugge <haakon.bugge@oracle.com>
---
 drivers/infiniband/hw/mlx4/cm.c      | 133 ++++++++++++++++++++++++++-
 drivers/infiniband/hw/mlx4/mlx4_ib.h |   3 +
 2 files changed, 135 insertions(+), 1 deletion(-)

diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
index 6f0ffd0906e6..883436548901 100644
--- a/drivers/infiniband/hw/mlx4/cm.c
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -54,11 +54,22 @@ struct id_map_entry {
 	struct delayed_work timeout;
 };
 
+struct rej_tmout_entry {
+	int slave;
+	u32 rem_pv_cm_id;
+	struct delayed_work timeout;
+	struct radix_tree_root *rej_tmout_root;
+	/* Points to the mutex protecting this radix-tree */
+	struct mutex *lock;
+};
+
 struct cm_generic_msg {
 	struct ib_mad_hdr hdr;
 
 	__be32 local_comm_id;
 	__be32 remote_comm_id;
+	unsigned char unused[2];
+	__be16 rej_reason;
 };
 
 struct cm_sidr_generic_msg {
@@ -285,6 +296,7 @@ static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id)
 	spin_unlock(&sriov->id_map_lock);
 }
 
+#define REJ_REASON(m) be16_to_cpu(((struct cm_generic_msg *)(m))->rej_reason)
 int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
 		struct ib_mad *mad)
 {
@@ -295,7 +307,8 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id
 	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
 	    mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
 	    mad->mad_hdr.attr_id == CM_MRA_ATTR_ID ||
-	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID ||
+	    (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID && REJ_REASON(mad) == IB_CM_REJ_TIMEOUT)) {
 		sl_cm_id = get_local_comm_id(mad);
 		id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
 		if (id)
@@ -328,11 +341,87 @@ int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id
 	return 0;
 }
 
+static void rej_tmout_timeout(struct work_struct *work)
+{
+	struct delayed_work *delay = to_delayed_work(work);
+	struct rej_tmout_entry *item = container_of(delay, struct rej_tmout_entry, timeout);
+	struct rej_tmout_entry *deleted;
+
+	mutex_lock(item->lock);
+	deleted = radix_tree_delete_item(item->rej_tmout_root, item->rem_pv_cm_id, NULL);
+	mutex_unlock(item->lock);
+
+	if (deleted != item)
+		pr_debug("deleted(%p) != item(%p)\n", deleted, item);
+
+	pr_debug("rej_tmout entry, rem_pv_cm_id 0x%x, slave %d deleted\n",
+		 item->rem_pv_cm_id, item->slave);
+	kfree(item);
+}
+
+static int alloc_rej_tmout(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id, int slave)
+{
+	struct rej_tmout_entry *item;
+	int sts;
+
+	mutex_lock(&sriov->rej_tmout_lock);
+	item = radix_tree_lookup(&sriov->rej_tmout_root, (unsigned long)rem_pv_cm_id);
+	mutex_unlock(&sriov->rej_tmout_lock);
+	if (item)
+		return PTR_ERR(item);
+
+	item = kmalloc(sizeof(*item), GFP_KERNEL);
+	if (!item)
+		return -ENOMEM;
+
+	INIT_DELAYED_WORK(&item->timeout, rej_tmout_timeout);
+	item->slave = slave;
+	item->rem_pv_cm_id = rem_pv_cm_id;
+	item->rej_tmout_root = &sriov->rej_tmout_root;
+	item->lock = &sriov->rej_tmout_lock;
+
+	mutex_lock(&sriov->rej_tmout_lock);
+	sts = radix_tree_insert(&sriov->rej_tmout_root, (unsigned long)rem_pv_cm_id, item);
+	mutex_unlock(&sriov->rej_tmout_lock);
+	if (sts)
+		goto err_insert;
+
+	pr_debug("Inserted rem_pv_cm_id 0x%x slave %d\n", rem_pv_cm_id, slave);
+	schedule_delayed_work(&item->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+
+	return 0;
+
+err_insert:
+	kfree(item);
+	return sts;
+}
+
+static int lookup_rej_tmout_slave(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id)
+{
+	struct rej_tmout_entry *item;
+
+	mutex_lock(&sriov->rej_tmout_lock);
+	item = radix_tree_lookup(&sriov->rej_tmout_root, (unsigned long)rem_pv_cm_id);
+	mutex_unlock(&sriov->rej_tmout_lock);
+
+	if (!item || IS_ERR(item)) {
+		pr_debug("Could not find rem_pv_cm_id 0x%x error: %d\n",
+			 rem_pv_cm_id, (int)PTR_ERR(item));
+		return !item ? -ENOENT : PTR_ERR(item);
+	}
+	pr_debug("Found rem_pv_cm_id 0x%x slave: %d\n", rem_pv_cm_id, item->slave);
+
+	return item->slave;
+}
+
 int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
 			     struct ib_mad *mad)
 {
+	struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+	u32 rem_pv_cm_id = get_local_comm_id(mad);
 	u32 pv_cm_id;
 	struct id_map_entry *id;
+	int sts;
 
 	if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
 	    mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
@@ -348,7 +437,18 @@ int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
 				     be64_to_cpu(gid.global.interface_id));
 			return -ENOENT;
 		}
+
+		sts = alloc_rej_tmout(sriov, rem_pv_cm_id, *slave);
+		if (sts)
+			/* Even if this fails, we pass on the REQ to the slave */
+			pr_debug("Could not allocate rej_tmout entry. rem_pv_cm_id 0x%x slave %d status %d\n",
+				 rem_pv_cm_id, *slave, sts);
+
 		return 0;
+	} else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID && REJ_REASON(mad) == IB_CM_REJ_TIMEOUT) {
+		*slave = lookup_rej_tmout_slave(sriov, rem_pv_cm_id);
+
+		return *slave < 0 ? *slave : 0;
 	}
 
 	pv_cm_id = get_remote_comm_id(mad);
@@ -377,6 +477,35 @@ void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev)
 	INIT_LIST_HEAD(&dev->sriov.cm_list);
 	dev->sriov.sl_id_map = RB_ROOT;
 	xa_init_flags(&dev->sriov.pv_id_table, XA_FLAGS_ALLOC);
+	mutex_init(&dev->sriov.rej_tmout_lock);
+	INIT_RADIX_TREE(&dev->sriov.rej_tmout_root, GFP_KERNEL);
+}
+
+static void rej_tmout_tree_cleanup(struct mlx4_ib_sriov *sriov, int slave)
+{
+	struct radix_tree_iter iter;
+	bool flush_needed = false;
+	void **slot;
+	int cnt = 0;
+
+	mutex_lock(&sriov->rej_tmout_lock);
+	radix_tree_for_each_slot(slot, &sriov->rej_tmout_root, &iter, 0) {
+		struct rej_tmout_entry *item = *slot;
+
+		if (slave < 0 || slave == item->slave) {
+			mod_delayed_work(system_wq, &item->timeout, 0);
+			flush_needed = true;
+			++cnt;
+		}
+	}
+	mutex_unlock(&sriov->rej_tmout_lock);
+
+	if (flush_needed) {
+		flush_scheduled_work();
+
+		pr_debug("%d entries in radix_tree for slave %d during cleanup\n",
+			 cnt, slave);
+	}
 }
 
 /* slave = -1 ==> all slaves */
@@ -446,4 +575,6 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
 		list_del(&map->list);
 		kfree(map);
 	}
+
+	rej_tmout_tree_cleanup(sriov, slave);
 }
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 20cfa69d0aed..92cb686bdc49 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -495,6 +495,9 @@ struct mlx4_ib_sriov {
 	spinlock_t id_map_lock;
 	struct rb_root sl_id_map;
 	struct list_head cm_list;
+	/* Protects the radix-tree */
+	struct mutex rej_tmout_lock;
+	struct radix_tree_root rej_tmout_root;
 };
 
 struct gid_cache_context {
-- 
2.20.1



* Re: [PATCH for-rc 5/5] IB/mlx4: Add support for REJ due to timeout
  2020-07-20 12:22 ` [PATCH for-rc 5/5] IB/mlx4: Add support for REJ due to timeout Håkon Bugge
@ 2020-07-23 15:32   ` Håkon Bugge
  0 siblings, 0 replies; 7+ messages in thread
From: Håkon Bugge @ 2020-07-23 15:32 UTC (permalink / raw)
  To: Doug Ledford, Jason Gunthorpe
  Cc: OFED mailing list, Yishai Hadas, Jack Morgenstein



> On 20 Jul 2020, at 14:22, Håkon Bugge <haakon.bugge@oracle.com> wrote:
> 
> [...]
> 
> +static int alloc_rej_tmout(struct mlx4_ib_sriov *sriov, u32 rem_pv_cm_id, int slave)
> +{
> +	struct rej_tmout_entry *item;
> +	int sts;
> +
> +	mutex_lock(&sriov->rej_tmout_lock);
> +	item = radix_tree_lookup(&sriov->rej_tmout_root, (unsigned long)rem_pv_cm_id);
> +	mutex_unlock(&sriov->rej_tmout_lock);
> +	if (item)
> +		return PTR_ERR(item);

Hmm, this should read:
		return IS_ERR(item) ? PTR_ERR(item) : 0;

I'll also remove the noisy pr_debug()s in this commit.

Will wait before sending out a v2.


Thxs, Håkon

