All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH for-next 0/6] mlx4 and more IPoIB changes for 4.1
@ 2015-04-02 10:38 Or Gerlitz
       [not found] ` <1427971145-26943-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Or Gerlitz @ 2015-04-02 10:38 UTC (permalink / raw)
  To: Roland Dreier, Doug Ledford
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Tal Alon,
	Amir Vadai, Or Gerlitz

This series has one mlx4 bug fix and a few fixes / enhancements 
on top of Doug's major cleanup in the multicast area.

Next week both of us will be OOO due to a holiday, so Erez will 
be able to address comments only in the week that follows.

Erez and Or.

Erez Shitrit (6):
  IB/ipoib: Use one linear skb in RX flow
  IB/ipoib: Update broadcast record values after each successful join request
  IB/ipoib: Handle QP in SQE state
  IB/ipoib: Save only IPOIB_MAX_PATH_REC_QUEUE skb's
  IB/ipoib: Remove IPOIB_MCAST_RUN bit
  IB/mlx4: Fix WQE LSO segment calculation

 drivers/infiniband/hw/mlx4/qp.c                |    3 +-
 drivers/infiniband/ulp/ipoib/ipoib.h           |   11 +-
 drivers/infiniband/ulp/ipoib/ipoib_ib.c        |  126 +++++++++++++-----------
 drivers/infiniband/ulp/ipoib/ipoib_main.c      |   13 ++-
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   24 ++++-
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c     |   13 +--
 6 files changed, 106 insertions(+), 84 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH for-next 1/6] IB/ipoib: Use one linear skb in RX flow
       [not found] ` <1427971145-26943-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-04-02 10:39   ` Or Gerlitz
  2015-04-02 10:39   ` [PATCH for-next 2/6] IB/ipoib: Update broadcast record values after each successful join request Or Gerlitz
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 15+ messages in thread
From: Or Gerlitz @ 2015-04-02 10:39 UTC (permalink / raw)
  To: Roland Dreier, Doug Ledford
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Tal Alon,
	Amir Vadai, Or Gerlitz

From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

The current code in the RX flow uses two sg entries for each incoming
packet: the first one was for the IB headers and the second for the rest
of the data. That caused two DMA map/unmap operations, two allocations, and a few
more actions that were done in the data path.

Use only one linear skb on each incoming packet, for the data (IB
headers and payload), that reduces the packet processing in the
data-path (only one skb, no frags, the first frag was not used anyway,
less memory allocations) and the dma handling (only one dma map/unmap
over each incoming packet instead of two map/unmap per each incoming packet).

After commit 73d3fe6d1c6d ("gro: fix aggregation for skb using frag_list") from
Eric Dumazet, we will get full aggregation for large packets.

When running bandwidth tests before and after the change (over the card's NUMA node),
using "netperf -H 1.1.1.3 -T -t TCP_STREAM", the results are ~12Gbs before
and ~16Gbs after on my setup (Mellanox's ConnectX3).

Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/ulp/ipoib/ipoib.h       |    5 --
 drivers/infiniband/ulp/ipoib/ipoib_ib.c    |   67 ++++------------------------
 drivers/infiniband/ulp/ipoib/ipoib_verbs.c |   13 ++----
 3 files changed, 13 insertions(+), 72 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index c79dcd5..769044c 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -434,11 +434,6 @@ struct ipoib_neigh {
 #define IPOIB_UD_MTU(ib_mtu)		(ib_mtu - IPOIB_ENCAP_LEN)
 #define IPOIB_UD_BUF_SIZE(ib_mtu)	(ib_mtu + IB_GRH_BYTES)
 
-static inline int ipoib_ud_need_sg(unsigned int ib_mtu)
-{
-	return IPOIB_UD_BUF_SIZE(ib_mtu) > PAGE_SIZE;
-}
-
 void ipoib_neigh_dtor(struct ipoib_neigh *neigh);
 static inline void ipoib_neigh_put(struct ipoib_neigh *neigh)
 {
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index e144d07..29b376d 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -94,39 +94,9 @@ void ipoib_free_ah(struct kref *kref)
 static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
 				  u64 mapping[IPOIB_UD_RX_SG])
 {
-	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
-		ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_UD_HEAD_SIZE,
-				    DMA_FROM_DEVICE);
-		ib_dma_unmap_page(priv->ca, mapping[1], PAGE_SIZE,
-				  DMA_FROM_DEVICE);
-	} else
-		ib_dma_unmap_single(priv->ca, mapping[0],
-				    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
-				    DMA_FROM_DEVICE);
-}
-
-static void ipoib_ud_skb_put_frags(struct ipoib_dev_priv *priv,
-				   struct sk_buff *skb,
-				   unsigned int length)
-{
-	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
-		skb_frag_t *frag = &skb_shinfo(skb)->frags[0];
-		unsigned int size;
-		/*
-		 * There is only two buffers needed for max_payload = 4K,
-		 * first buf size is IPOIB_UD_HEAD_SIZE
-		 */
-		skb->tail += IPOIB_UD_HEAD_SIZE;
-		skb->len  += length;
-
-		size = length - IPOIB_UD_HEAD_SIZE;
-
-		skb_frag_size_set(frag, size);
-		skb->data_len += size;
-		skb->truesize += PAGE_SIZE;
-	} else
-		skb_put(skb, length);
-
+	ib_dma_unmap_single(priv->ca, mapping[0],
+			    IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+			    DMA_FROM_DEVICE);
 }
 
 static int ipoib_ib_post_receive(struct net_device *dev, int id)
@@ -156,18 +126,11 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
 	struct sk_buff *skb;
 	int buf_size;
-	int tailroom;
 	u64 *mapping;
 
-	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
-		buf_size = IPOIB_UD_HEAD_SIZE;
-		tailroom = 128; /* reserve some tailroom for IP/TCP headers */
-	} else {
-		buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
-		tailroom = 0;
-	}
+	buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
 
-	skb = dev_alloc_skb(buf_size + tailroom + 4);
+	skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
 	if (unlikely(!skb))
 		return NULL;
 
@@ -184,23 +147,8 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
 	if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
 		goto error;
 
-	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
-		struct page *page = alloc_page(GFP_ATOMIC);
-		if (!page)
-			goto partial_error;
-		skb_fill_page_desc(skb, 0, page, 0, PAGE_SIZE);
-		mapping[1] =
-			ib_dma_map_page(priv->ca, page,
-					0, PAGE_SIZE, DMA_FROM_DEVICE);
-		if (unlikely(ib_dma_mapping_error(priv->ca, mapping[1])))
-			goto partial_error;
-	}
-
 	priv->rx_ring[id].skb = skb;
 	return skb;
-
-partial_error:
-	ib_dma_unmap_single(priv->ca, mapping[0], buf_size, DMA_FROM_DEVICE);
 error:
 	dev_kfree_skb_any(skb);
 	return NULL;
@@ -278,7 +226,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 		       wc->byte_len, wc->slid);
 
 	ipoib_ud_dma_unmap_rx(priv, mapping);
-	ipoib_ud_skb_put_frags(priv, skb, wc->byte_len);
+
+	skb_put(skb, wc->byte_len);
 
 	/* First byte of dgid signals multicast when 0xff */
 	dgid = &((struct ib_grh *)skb->data)->dgid;
@@ -296,6 +245,8 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
 	skb_reset_mac_header(skb);
 	skb_pull(skb, IPOIB_ENCAP_LEN);
 
+	skb->truesize = SKB_TRUESIZE(skb->len);
+
 	++dev->stats.rx_packets;
 	dev->stats.rx_bytes += skb->len;
 
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
index 3462840..e5cc430 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -227,15 +227,10 @@ int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
 	priv->tx_wr.send_flags	= IB_SEND_SIGNALED;
 
 	priv->rx_sge[0].lkey = priv->mr->lkey;
-	if (ipoib_ud_need_sg(priv->max_ib_mtu)) {
-		priv->rx_sge[0].length = IPOIB_UD_HEAD_SIZE;
-		priv->rx_sge[1].length = PAGE_SIZE;
-		priv->rx_sge[1].lkey = priv->mr->lkey;
-		priv->rx_wr.num_sge = IPOIB_UD_RX_SG;
-	} else {
-		priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
-		priv->rx_wr.num_sge = 1;
-	}
+
+	priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+	priv->rx_wr.num_sge = 1;
+
 	priv->rx_wr.next = NULL;
 	priv->rx_wr.sg_list = priv->rx_sge;
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH for-next 2/6] IB/ipoib: Update broadcast record values after each successful join request
       [not found] ` <1427971145-26943-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2015-04-02 10:39   ` [PATCH for-next 1/6] IB/ipoib: Use one linear skb in RX flow Or Gerlitz
@ 2015-04-02 10:39   ` Or Gerlitz
  2015-04-02 10:39   ` [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state Or Gerlitz
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 15+ messages in thread
From: Or Gerlitz @ 2015-04-02 10:39 UTC (permalink / raw)
  To: Roland Dreier, Doug Ledford
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Tal Alon,
	Amir Vadai, Or Gerlitz

From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Update the cached broadcast record in the priv object after every new
join of this broadcast domain group.

These values are needed for the port configuration (MTU size) and for
the initial parameters of all new multicast (non-broadcast) join requests.

For example, SM starts with 2K MTU for all the fabric, and after that it
restarts (or handover to new SM) with new port configuration of 4K MTU.
Without using the new values, the driver will keep its old configuration
of 2K and will not apply the new configuration of 4K.

Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |   18 +++++++++++++++++-
 1 files changed, 17 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index 3203ebe..c83c958 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -226,7 +226,23 @@ static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
 			spin_unlock_irq(&priv->lock);
 			return -EAGAIN;
 		}
-		priv->mcast_mtu = IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+		/*update priv member according to the new mcast*/
+		priv->broadcast->mcmember.qkey = mcmember->qkey;
+		priv->broadcast->mcmember.mtu = mcmember->mtu;
+		priv->broadcast->mcmember.traffic_class = mcmember->traffic_class;
+		priv->broadcast->mcmember.rate = mcmember->rate;
+		priv->broadcast->mcmember.sl = mcmember->sl;
+		priv->broadcast->mcmember.flow_label = mcmember->flow_label;
+		priv->broadcast->mcmember.hop_limit = mcmember->hop_limit;
+		/* assume if the admin and the mcast are the same both can be changed */
+		if (priv->mcast_mtu == priv->admin_mtu)
+			priv->admin_mtu =
+			priv->mcast_mtu =
+			IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+		else
+			priv->mcast_mtu =
+			IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+
 		priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
 		spin_unlock_irq(&priv->lock);
 		priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
       [not found] ` <1427971145-26943-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2015-04-02 10:39   ` [PATCH for-next 1/6] IB/ipoib: Use one linear skb in RX flow Or Gerlitz
  2015-04-02 10:39   ` [PATCH for-next 2/6] IB/ipoib: Update broadcast record values after each successful join request Or Gerlitz
@ 2015-04-02 10:39   ` Or Gerlitz
       [not found]     ` <1427971145-26943-4-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2015-04-02 10:39   ` [PATCH for-next 4/6] IB/ipoib: Save only IPOIB_MAX_PATH_REC_QUEUE skb's Or Gerlitz
                     ` (2 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: Or Gerlitz @ 2015-04-02 10:39 UTC (permalink / raw)
  To: Roland Dreier, Doug Ledford
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Tal Alon,
	Amir Vadai, Or Gerlitz

From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

As the result of a completion error the QP can be moved to SQE state by
the hardware. Since it's not the Error state, there are no flushes
and hence the driver doesn't know about that.

The fix creates a task that after completion with error which is not a
flush tracks the QP state and if it is in SQE state moves it back to RTS.

Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/ulp/ipoib/ipoib.h    |    5 +++
 drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59 ++++++++++++++++++++++++++++++-
 2 files changed, 63 insertions(+), 1 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 769044c..2703d9a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -299,6 +299,11 @@ struct ipoib_neigh_table {
 	struct completion		deleted;
 };
 
+struct ipoib_qp_state_validate {
+	struct work_struct work;
+	struct ipoib_dev_priv   *priv;
+};
+
 /*
  * Device private locking: network stack tx_lock protects members used
  * in TX fast path, lock protects everything else.  lock nests inside
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
index 29b376d..63b92cb 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
 	}
 }
 
+/*
+ * As the result of a completion error the QP Can be transferred to SQE states.
+ * The function checks if the (send)QP is in SQE state and
+ * moves it back to RTS state, that in order to have it functional again.
+ */
+static void ipoib_qp_state_validate_work(struct work_struct *work)
+{
+	struct ipoib_qp_state_validate *qp_work =
+		container_of(work, struct ipoib_qp_state_validate, work);
+
+	struct ipoib_dev_priv *priv = qp_work->priv;
+	struct ib_qp_attr qp_attr;
+	struct ib_qp_init_attr query_init_attr;
+	int ret;
+
+	ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
+	if (ret) {
+		ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
+			   __func__, ret);
+		goto free_res;
+	}
+	pr_info("%s: QP: 0x%x is in state: %d\n",
+		__func__, priv->qp->qp_num, qp_attr.qp_state);
+
+	/* currently support only in SQE->RTS transition*/
+	if (qp_attr.qp_state == IB_QPS_SQE) {
+		qp_attr.qp_state = IB_QPS_RTS;
+
+		ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
+		if (ret) {
+			pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
+				ret, priv->qp->qp_num);
+			goto free_res;
+		}
+		pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
+			__func__, priv->qp->qp_num);
+	} else {
+		pr_warn("QP (%d) will stay in state: %d\n",
+			priv->qp->qp_num, qp_attr.qp_state);
+	}
+
+free_res:
+	kfree(qp_work);
+}
+
 static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 {
 	struct ipoib_dev_priv *priv = netdev_priv(dev);
@@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
 		netif_wake_queue(dev);
 
 	if (wc->status != IB_WC_SUCCESS &&
-	    wc->status != IB_WC_WR_FLUSH_ERR)
+	    wc->status != IB_WC_WR_FLUSH_ERR) {
+		struct ipoib_qp_state_validate *qp_work;
 		ipoib_warn(priv, "failed send event "
 			   "(status=%d, wrid=%d vend_err %x)\n",
 			   wc->status, wr_id, wc->vendor_err);
+		qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
+		if (!qp_work) {
+			ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n",
+				   __func__, priv->qp->qp_num);
+			return;
+		}
+
+		INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
+		qp_work->priv = priv;
+		queue_work(priv->wq, &qp_work->work);
+	}
 }
 
 static int poll_tx(struct ipoib_dev_priv *priv)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH for-next 4/6] IB/ipoib: Save only IPOIB_MAX_PATH_REC_QUEUE skb's
       [not found] ` <1427971145-26943-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
                     ` (2 preceding siblings ...)
  2015-04-02 10:39   ` [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state Or Gerlitz
@ 2015-04-02 10:39   ` Or Gerlitz
  2015-04-02 10:39   ` [PATCH for-next 5/6] IB/ipoib: Remove IPOIB_MCAST_RUN bit Or Gerlitz
  2015-04-02 10:39   ` [PATCH for-next 6/6] IB/mlx4: Fix WQE LSO segment calculation Or Gerlitz
  5 siblings, 0 replies; 15+ messages in thread
From: Or Gerlitz @ 2015-04-02 10:39 UTC (permalink / raw)
  To: Roland Dreier, Doug Ledford
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Tal Alon,
	Amir Vadai, Or Gerlitz

From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

Whenever there is no path->ah to the destination, keep only a defined
number of skb's. Otherwise there are cases where the driver can keep an
infinite list of skb's.

For example, when one device wants to send a unicast ARP to the destination,
and for some reason the SM doesn't respond, the driver currently keeps
all the skb's. If that unicast ARP traffic stopped, all these skb's
are kept by the path object till the interface is down.

Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/ulp/ipoib/ipoib_main.c |   13 ++++++++++---
 1 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
index 26e0eed..0d46100 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -640,8 +640,10 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
 
 		if (!path->query && path_rec_start(dev, path))
 			goto err_path;
-
-		__skb_queue_tail(&neigh->queue, skb);
+		if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+			__skb_queue_tail(&neigh->queue, skb);
+		else
+			goto err_drop;
 	}
 
 	spin_unlock_irqrestore(&priv->lock, flags);
@@ -676,7 +678,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
 			new_path = 1;
 		}
 		if (path) {
-			__skb_queue_tail(&path->queue, skb);
+			if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+				__skb_queue_tail(&path->queue, skb);
+			} else {
+				++dev->stats.tx_dropped;
+				dev_kfree_skb_any(skb);
+			}
 
 			if (!path->query && path_rec_start(dev, path)) {
 				spin_unlock_irqrestore(&priv->lock, flags);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH for-next 5/6] IB/ipoib: Remove IPOIB_MCAST_RUN bit
       [not found] ` <1427971145-26943-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
                     ` (3 preceding siblings ...)
  2015-04-02 10:39   ` [PATCH for-next 4/6] IB/ipoib: Save only IPOIB_MAX_PATH_REC_QUEUE skb's Or Gerlitz
@ 2015-04-02 10:39   ` Or Gerlitz
  2015-04-02 10:39   ` [PATCH for-next 6/6] IB/mlx4: Fix WQE LSO segment calculation Or Gerlitz
  5 siblings, 0 replies; 15+ messages in thread
From: Or Gerlitz @ 2015-04-02 10:39 UTC (permalink / raw)
  To: Roland Dreier, Doug Ledford
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Tal Alon,
	Amir Vadai, Or Gerlitz

From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

After Doug Ledford's changes there is no need for that bit; its
semantics become a subset of the IPOIB_FLAG_OPER_UP bit.

Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/ulp/ipoib/ipoib.h           |    1 -
 drivers/infiniband/ulp/ipoib/ipoib_multicast.c |    6 ++----
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
index 2703d9a..bd94b0a 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib.h
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -87,7 +87,6 @@ enum {
 	IPOIB_FLAG_ADMIN_UP	  = 2,
 	IPOIB_PKEY_ASSIGNED	  = 3,
 	IPOIB_FLAG_SUBINTERFACE	  = 5,
-	IPOIB_MCAST_RUN		  = 6,
 	IPOIB_STOP_REAPER	  = 7,
 	IPOIB_FLAG_ADMIN_CM	  = 9,
 	IPOIB_FLAG_UMCAST	  = 10,
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
index c83c958..0d23e05 100644
--- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -71,7 +71,7 @@ static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv,
 					       struct ipoib_mcast *mcast,
 					       bool delay)
 {
-	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
 		return;
 
 	/*
@@ -519,7 +519,7 @@ void ipoib_mcast_join_task(struct work_struct *work)
 	struct ipoib_mcast *mcast = NULL;
 	int create = 1;
 
-	if (!test_bit(IPOIB_MCAST_RUN, &priv->flags))
+	if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
 		return;
 
 	if (ib_query_port(priv->ca, priv->port, &port_attr) ||
@@ -629,7 +629,6 @@ int ipoib_mcast_start_thread(struct net_device *dev)
 	ipoib_dbg_mcast(priv, "starting multicast thread\n");
 
 	spin_lock_irqsave(&priv->lock, flags);
-	set_bit(IPOIB_MCAST_RUN, &priv->flags);
 	__ipoib_mcast_schedule_join_thread(priv, NULL, 0);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
@@ -644,7 +643,6 @@ int ipoib_mcast_stop_thread(struct net_device *dev)
 	ipoib_dbg_mcast(priv, "stopping multicast thread\n");
 
 	spin_lock_irqsave(&priv->lock, flags);
-	clear_bit(IPOIB_MCAST_RUN, &priv->flags);
 	cancel_delayed_work(&priv->mcast_task);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [PATCH for-next 6/6] IB/mlx4: Fix WQE LSO segment calculation
       [not found] ` <1427971145-26943-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
                     ` (4 preceding siblings ...)
  2015-04-02 10:39   ` [PATCH for-next 5/6] IB/ipoib: Remove IPOIB_MCAST_RUN bit Or Gerlitz
@ 2015-04-02 10:39   ` Or Gerlitz
  5 siblings, 0 replies; 15+ messages in thread
From: Or Gerlitz @ 2015-04-02 10:39 UTC (permalink / raw)
  To: Roland Dreier, Doug Ledford
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Tal Alon,
	Amir Vadai, Or Gerlitz

From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>

The current code decreases from the mss size (which is the gso_size
from the kernel skb) the size of the packet headers.

It shouldn't do that because the mss that comes from the stack
(e.g. IPoIB) includes only the TCP payload without the headers.

The result is indication to the HW that each packet that the HW sends
is smaller than what it could be, and too many packets will be sent
for big messages.

An easy way to demonstrate one more aspect of the problem is by
configuring the ipoib mtu to be less than 2*hlen (2*56) and then
run app sending big TCP messages. This will tell the HW to send packets
with giant (negative value which under unsigned arithmetics becomes
a huge positive one) length and the QP moves to SQE state.

Fixes: b832be1e4007 ('IB/mlx4: Add IPoIB LSO support')
Reported-by: Matthew Finlay <matt-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
---
 drivers/infiniband/hw/mlx4/qp.c |    3 +--
 1 files changed, 1 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index ed2bd67..fbde33a 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -2605,8 +2605,7 @@ static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
 
 	memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
 
-	*lso_hdr_sz  = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
-				   wr->wr.ud.hlen);
+	*lso_hdr_sz  = cpu_to_be32(wr->wr.ud.mss << 16 | wr->wr.ud.hlen);
 	*lso_seg_len = halign;
 	return 0;
 }
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
       [not found]     ` <1427971145-26943-4-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
@ 2015-04-15 12:08       ` Doug Ledford
       [not found]         ` <13F95634-36B6-4759-A300-E140CB7FF70D-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  2015-04-15 17:20       ` Devesh Sharma
  1 sibling, 1 reply; 15+ messages in thread
From: Doug Ledford @ 2015-04-15 12:08 UTC (permalink / raw)
  Cc: Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit,
	Tal Alon, Amir Vadai, Or Gerlitz

[-- Attachment #1: Type: text/plain, Size: 5394 bytes --]


> On Apr 2, 2015, at 6:39 AM, Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
> 
> From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> 
> As the result of a completion error the QP can moved to SQE state by
> the hardware. Since it's not the Error state, there are no flushes
> and hence the driver doesn't know about that.
> 
> The fix creates a task that after completion with error which is not a
> flush tracks the QP state and if it is in SQE state moves it back to RTS.

I assume you found this particular issue while tracking down the cause of the problem that you fixed in patch #6?  Otherwise, this is a “never should happen” situation, yes?

> Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> ---
> drivers/infiniband/ulp/ipoib/ipoib.h    |    5 +++
> drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59 ++++++++++++++++++++++++++++++-
> 2 files changed, 63 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
> index 769044c..2703d9a 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> @@ -299,6 +299,11 @@ struct ipoib_neigh_table {
> 	struct completion		deleted;
> };
> 
> +struct ipoib_qp_state_validate {
> +	struct work_struct work;
> +	struct ipoib_dev_priv   *priv;
> +};

Why did you choose to do an allocated work struct?  All of the other tasks we have use a static work struct that’s part of the ipoib private dev, so this is a bit out of the norm.  In its current form, you only use this on the single UD queue pair that the dev uses, and not ever on any connected mode queue pairs, so it could be static.  Or were you looking at the possibility of making the task work on the CM queue pairs and so left it allocated?  And if that, why not include a reference to the broken queue pair in the work struct instead of always using priv->qp?

> /*
>  * Device private locking: network stack tx_lock protects members used
>  * in TX fast path, lock protects everything else.  lock nests inside
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index 29b376d..63b92cb 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
> 	}
> }
> 
> +/*
> + * As the result of a completion error the QP Can be transferred to SQE states.
> + * The function checks if the (send)QP is in SQE state and
> + * moves it back to RTS state, that in order to have it functional again.
> + */
> +static void ipoib_qp_state_validate_work(struct work_struct *work)
> +{
> +	struct ipoib_qp_state_validate *qp_work =
> +		container_of(work, struct ipoib_qp_state_validate, work);
> +
> +	struct ipoib_dev_priv *priv = qp_work->priv;
> +	struct ib_qp_attr qp_attr;
> +	struct ib_qp_init_attr query_init_attr;
> +	int ret;
> +
> +	ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
> +	if (ret) {
> +		ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
> +			   __func__, ret);
> +		goto free_res;
> +	}
> +	pr_info("%s: QP: 0x%x is in state: %d\n",
> +		__func__, priv->qp->qp_num, qp_attr.qp_state);
> +
> +	/* currently support only in SQE->RTS transition*/
> +	if (qp_attr.qp_state == IB_QPS_SQE) {
> +		qp_attr.qp_state = IB_QPS_RTS;
> +
> +		ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
> +		if (ret) {
> +			pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
> +				ret, priv->qp->qp_num);
> +			goto free_res;
> +		}
> +		pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
> +			__func__, priv->qp->qp_num);
> +	} else {
> +		pr_warn("QP (%d) will stay in state: %d\n",
> +			priv->qp->qp_num, qp_attr.qp_state);
> +	}
> +
> +free_res:
> +	kfree(qp_work);
> +}
> +
> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
> {
> 	struct ipoib_dev_priv *priv = netdev_priv(dev);
> @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
> 		netif_wake_queue(dev);
> 
> 	if (wc->status != IB_WC_SUCCESS &&
> -	    wc->status != IB_WC_WR_FLUSH_ERR)
> +	    wc->status != IB_WC_WR_FLUSH_ERR) {
> +		struct ipoib_qp_state_validate *qp_work;
> 		ipoib_warn(priv, "failed send event "
> 			   "(status=%d, wrid=%d vend_err %x)\n",
> 			   wc->status, wr_id, wc->vendor_err);
> +		qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
> +		if (!qp_work) {
> +			ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n",
> +				   __func__, priv->qp->qp_num);
> +			return;
> +		}
> +
> +		INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
> +		qp_work->priv = priv;
> +		queue_work(priv->wq, &qp_work->work);
> +	}
> }
> 
> static int poll_tx(struct ipoib_dev_priv *priv)
> --
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

—
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
	GPG Key ID: 0E572FDD






[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 842 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
       [not found]         ` <13F95634-36B6-4759-A300-E140CB7FF70D-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-04-15 13:29           ` Erez Shitrit
       [not found]             ` <CAAk-MO8CX7zrbSnVc9NJt5pYTO4Apj_j4PSosXRKwe0BKv+1Aw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Erez Shitrit @ 2015-04-15 13:29 UTC (permalink / raw)
  To: Doug Ledford
  Cc: Erez Shitrit, Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Tal Alon, Amir Vadai, Or Gerlitz

Hi Doug

(now with "reply to all" :) )

On Wed, Apr 15, 2015 at 3:08 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>
>> On Apr 2, 2015, at 6:39 AM, Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
>>
>> From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>
>> As the result of a completion error the QP can moved to SQE state by
>> the hardware. Since it's not the Error state, there are no flushes
>> and hence the driver doesn't know about that.
>>
>> The fix creates a task that after completion with error which is not a
>> flush tracks the QP state and if it is in SQE state moves it back to RTS.
>
> I assume you found this particular issue while tracking down the cause of the problem that you fixed in patch #6?  Otherwise, this is a “never should happen” situation, yes?
>

Not only. It happened in the case described in patch #6, and also with
some other errors that caused the FW/HW to decide to move the UD QP to
the SQE state (in the category of "bad thing happened.."  like a damaged
WQE, a FW problem, etc.)
The patch was originally written to solve such cases, which we really
saw in testing, at customers, etc., and not just for patch #6.

>> Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>> ---
>> drivers/infiniband/ulp/ipoib/ipoib.h    |    5 +++
>> drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59 ++++++++++++++++++++++++++++++-
>> 2 files changed, 63 insertions(+), 1 deletions(-)
>>
>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
>> index 769044c..2703d9a 100644
>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
>> @@ -299,6 +299,11 @@ struct ipoib_neigh_table {
>>       struct completion               deleted;
>> };
>>
>> +struct ipoib_qp_state_validate {
>> +     struct work_struct work;
>> +     struct ipoib_dev_priv   *priv;
>> +};
>
> Why did you choose to do an allocated work struct?  All of the other tasks we have use a static work struct that’s part of the ipoib private dev, so this is a bit out of the norm.  In its current form, you only use this on the single UD queue pair that the dev uses, and not ever on any connected mode queue pairs, so it could be static.  Or were you looking at the possibility of making the task work on the CM queue pairs and so left it allocated?  And if that, why not include a reference to the broken queue pair in the work struct instead of always using priv->qp?
>

It looks (at least for me) more elegant to create "on-the-fly" work
create/queue and delete at the end (BTW, I saw it many times, other
than ipoib).
There is no need for that in the CM mode, there the connection and the
QP are destroyed, and new one will be created, only in UD mode the QP
is forever.

>> /*
>>  * Device private locking: network stack tx_lock protects members used
>>  * in TX fast path, lock protects everything else.  lock nests inside
>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>> index 29b376d..63b92cb 100644
>> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
>>       }
>> }
>>
>> +/*
>> + * As the result of a completion error the QP Can be transferred to SQE states.
>> + * The function checks if the (send)QP is in SQE state and
>> + * moves it back to RTS state, that in order to have it functional again.
>> + */
>> +static void ipoib_qp_state_validate_work(struct work_struct *work)
>> +{
>> +     struct ipoib_qp_state_validate *qp_work =
>> +             container_of(work, struct ipoib_qp_state_validate, work);
>> +
>> +     struct ipoib_dev_priv *priv = qp_work->priv;
>> +     struct ib_qp_attr qp_attr;
>> +     struct ib_qp_init_attr query_init_attr;
>> +     int ret;
>> +
>> +     ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
>> +     if (ret) {
>> +             ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
>> +                        __func__, ret);
>> +             goto free_res;
>> +     }
>> +     pr_info("%s: QP: 0x%x is in state: %d\n",
>> +             __func__, priv->qp->qp_num, qp_attr.qp_state);
>> +
>> +     /* currently support only in SQE->RTS transition*/
>> +     if (qp_attr.qp_state == IB_QPS_SQE) {
>> +             qp_attr.qp_state = IB_QPS_RTS;
>> +
>> +             ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
>> +             if (ret) {
>> +                     pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
>> +                             ret, priv->qp->qp_num);
>> +                     goto free_res;
>> +             }
>> +             pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
>> +                     __func__, priv->qp->qp_num);
>> +     } else {
>> +             pr_warn("QP (%d) will stay in state: %d\n",
>> +                     priv->qp->qp_num, qp_attr.qp_state);
>> +     }
>> +
>> +free_res:
>> +     kfree(qp_work);
>> +}
>> +
>> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>> {
>>       struct ipoib_dev_priv *priv = netdev_priv(dev);
>> @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>>               netif_wake_queue(dev);
>>
>>       if (wc->status != IB_WC_SUCCESS &&
>> -         wc->status != IB_WC_WR_FLUSH_ERR)
>> +         wc->status != IB_WC_WR_FLUSH_ERR) {
>> +             struct ipoib_qp_state_validate *qp_work;
>>               ipoib_warn(priv, "failed send event "
>>                          "(status=%d, wrid=%d vend_err %x)\n",
>>                          wc->status, wr_id, wc->vendor_err);
>> +             qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
>> +             if (!qp_work) {
>> +                     ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n",
>> +                                __func__, priv->qp->qp_num);
>> +                     return;
>> +             }
>> +
>> +             INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
>> +             qp_work->priv = priv;
>> +             queue_work(priv->wq, &qp_work->work);
>> +     }
>> }
>>
>> static int poll_tx(struct ipoib_dev_priv *priv)
>> --
>> 1.7.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> —
> Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>         GPG Key ID: 0E572FDD
>
>
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
       [not found]     ` <1427971145-26943-4-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
  2015-04-15 12:08       ` Doug Ledford
@ 2015-04-15 17:20       ` Devesh Sharma
       [not found]         ` <EE7902D3F51F404C82415C4803930ACD5DC42DB7-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
  1 sibling, 1 reply; 15+ messages in thread
From: Devesh Sharma @ 2015-04-15 17:20 UTC (permalink / raw)
  To: Or Gerlitz, Roland Dreier, Doug Ledford
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Tal Alon, Amir Vadai

> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Or Gerlitz
> Sent: Thursday, April 02, 2015 4:09 PM
> To: Roland Dreier; Doug Ledford
> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Erez Shitrit; Tal Alon; Amir Vadai; Or Gerlitz
> Subject: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
> 
> From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> 
> As the result of a completion error the QP can moved to SQE state by the
> hardware. Since it's not the Error state, there are no flushes and hence the
> driver doesn't know about that.

As per the spec, a QP transition to SQE causes flush completions for subsequent WQEs; the description is telling it the other way. Am I missing something?

> 
> The fix creates a task that after completion with error which is not a flush
> tracks the QP state and if it is in SQE state moves it back to RTS.
> 
> Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> ---
>  drivers/infiniband/ulp/ipoib/ipoib.h    |    5 +++
>  drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59
> ++++++++++++++++++++++++++++++-
>  2 files changed, 63 insertions(+), 1 deletions(-)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h
> b/drivers/infiniband/ulp/ipoib/ipoib.h
> index 769044c..2703d9a 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> @@ -299,6 +299,11 @@ struct ipoib_neigh_table {
>  	struct completion		deleted;
>  };
> 
> +struct ipoib_qp_state_validate {
> +	struct work_struct work;
> +	struct ipoib_dev_priv   *priv;
> +};
> +
>  /*
>   * Device private locking: network stack tx_lock protects members used
>   * in TX fast path, lock protects everything else.  lock nests inside diff --git
> a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> index 29b376d..63b92cb 100644
> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
>  	}
>  }
> 
> +/*
> + * As the result of a completion error the QP Can be transferred to SQE states.
> + * The function checks if the (send)QP is in SQE state and
> + * moves it back to RTS state, that in order to have it functional again.
> + */
> +static void ipoib_qp_state_validate_work(struct work_struct *work) {
> +	struct ipoib_qp_state_validate *qp_work =
> +		container_of(work, struct ipoib_qp_state_validate, work);
> +
> +	struct ipoib_dev_priv *priv = qp_work->priv;
> +	struct ib_qp_attr qp_attr;
> +	struct ib_qp_init_attr query_init_attr;
> +	int ret;
> +
> +	ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
> +	if (ret) {
> +		ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
> +			   __func__, ret);
> +		goto free_res;
> +	}
> +	pr_info("%s: QP: 0x%x is in state: %d\n",
> +		__func__, priv->qp->qp_num, qp_attr.qp_state);
> +
> +	/* currently support only in SQE->RTS transition*/
> +	if (qp_attr.qp_state == IB_QPS_SQE) {
> +		qp_attr.qp_state = IB_QPS_RTS;
> +
> +		ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
> +		if (ret) {
> +			pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
> +				ret, priv->qp->qp_num);
> +			goto free_res;
> +		}
> +		pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to
> IB_QPS_RTS\n",
> +			__func__, priv->qp->qp_num);
> +	} else {
> +		pr_warn("QP (%d) will stay in state: %d\n",
> +			priv->qp->qp_num, qp_attr.qp_state);
> +	}
> +
> +free_res:
> +	kfree(qp_work);
> +}
> +
>  static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)  {
>  	struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -358,10 +403,22
> @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc
> *wc)
>  		netif_wake_queue(dev);
> 
>  	if (wc->status != IB_WC_SUCCESS &&
> -	    wc->status != IB_WC_WR_FLUSH_ERR)
> +	    wc->status != IB_WC_WR_FLUSH_ERR) {
> +		struct ipoib_qp_state_validate *qp_work;
>  		ipoib_warn(priv, "failed send event "
>  			   "(status=%d, wrid=%d vend_err %x)\n",
>  			   wc->status, wr_id, wc->vendor_err);
> +		qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
> +		if (!qp_work) {
> +			ipoib_warn(priv, "%s Failed alloc
> ipoib_qp_state_validate for qp: 0x%x\n",
> +				   __func__, priv->qp->qp_num);
> +			return;
> +		}
> +
> +		INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
> +		qp_work->priv = priv;
> +		queue_work(priv->wq, &qp_work->work);
> +	}
>  }
> 
>  static int poll_tx(struct ipoib_dev_priv *priv)
> --
> 1.7.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body
> of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
       [not found]         ` <EE7902D3F51F404C82415C4803930ACD5DC42DB7-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
@ 2015-04-16  6:44           ` Erez Shitrit
       [not found]             ` <552F5A37.2070609-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Erez Shitrit @ 2015-04-16  6:44 UTC (permalink / raw)
  To: Devesh Sharma, Or Gerlitz, Roland Dreier, Doug Ledford
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Tal Alon, Amir Vadai

On 4/15/2015 8:20 PM, Devesh Sharma wrote:
>> -----Original Message-----
>> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
>> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Or Gerlitz
>> Sent: Thursday, April 02, 2015 4:09 PM
>> To: Roland Dreier; Doug Ledford
>> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Erez Shitrit; Tal Alon; Amir Vadai; Or Gerlitz
>> Subject: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
>>
>> From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>
>> As the result of a completion error the QP can moved to SQE state by the
>> hardware. Since it's not the Error state, there are no flushes and hence the
>> driver doesn't know about that.
> As per spec, QP transition to SQE causes flush completion for subsequent WQEs, the description is telling other way. Am I missing something?

No, you are not :) . The description tries to say the following: the
driver cannot distinguish between an IB_WC_WR_FLUSH_ERR that is treated
like IB_WC_SUCCESS and an IB_WC_WR_FLUSH_ERR that comes after other
errors that take the QP to the SQE state.
The driver must recognize the first error that is not IB_WC_WR_FLUSH_ERR
and handle it accordingly.
For example, the driver can take the QP to the ERROR state as a part of
its life cycle (draining it, driver down, etc.), and in these situations
many IB_WC_WR_FLUSH_ERR completions return and there is no need to
change the state of the QP; it is already under the handling of the
driver, which is not the case when another error comes. This is the
intention of the patch: to return the QP back to life after those
(un-handled) cases.

>> The fix creates a task that after completion with error which is not a flush
>> tracks the QP state and if it is in SQE state moves it back to RTS.
>>
>> Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>> ---
>>   drivers/infiniband/ulp/ipoib/ipoib.h    |    5 +++
>>   drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59
>> ++++++++++++++++++++++++++++++-
>>   2 files changed, 63 insertions(+), 1 deletions(-)
>>
>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h
>> b/drivers/infiniband/ulp/ipoib/ipoib.h
>> index 769044c..2703d9a 100644
>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
>> @@ -299,6 +299,11 @@ struct ipoib_neigh_table {
>>   	struct completion		deleted;
>>   };
>>
>> +struct ipoib_qp_state_validate {
>> +	struct work_struct work;
>> +	struct ipoib_dev_priv   *priv;
>> +};
>> +
>>   /*
>>    * Device private locking: network stack tx_lock protects members used
>>    * in TX fast path, lock protects everything else.  lock nests inside diff --git
>> a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>> b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>> index 29b376d..63b92cb 100644
>> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
>>   	}
>>   }
>>
>> +/*
>> + * As the result of a completion error the QP Can be transferred to SQE states.
>> + * The function checks if the (send)QP is in SQE state and
>> + * moves it back to RTS state, that in order to have it functional again.
>> + */
>> +static void ipoib_qp_state_validate_work(struct work_struct *work) {
>> +	struct ipoib_qp_state_validate *qp_work =
>> +		container_of(work, struct ipoib_qp_state_validate, work);
>> +
>> +	struct ipoib_dev_priv *priv = qp_work->priv;
>> +	struct ib_qp_attr qp_attr;
>> +	struct ib_qp_init_attr query_init_attr;
>> +	int ret;
>> +
>> +	ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
>> +	if (ret) {
>> +		ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
>> +			   __func__, ret);
>> +		goto free_res;
>> +	}
>> +	pr_info("%s: QP: 0x%x is in state: %d\n",
>> +		__func__, priv->qp->qp_num, qp_attr.qp_state);
>> +
>> +	/* currently support only in SQE->RTS transition*/
>> +	if (qp_attr.qp_state == IB_QPS_SQE) {
>> +		qp_attr.qp_state = IB_QPS_RTS;
>> +
>> +		ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
>> +		if (ret) {
>> +			pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
>> +				ret, priv->qp->qp_num);
>> +			goto free_res;
>> +		}
>> +		pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to
>> IB_QPS_RTS\n",
>> +			__func__, priv->qp->qp_num);
>> +	} else {
>> +		pr_warn("QP (%d) will stay in state: %d\n",
>> +			priv->qp->qp_num, qp_attr.qp_state);
>> +	}
>> +
>> +free_res:
>> +	kfree(qp_work);
>> +}
>> +
>>   static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)  {
>>   	struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -358,10 +403,22
>> @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc
>> *wc)
>>   		netif_wake_queue(dev);
>>
>>   	if (wc->status != IB_WC_SUCCESS &&
>> -	    wc->status != IB_WC_WR_FLUSH_ERR)
>> +	    wc->status != IB_WC_WR_FLUSH_ERR) {
>> +		struct ipoib_qp_state_validate *qp_work;
>>   		ipoib_warn(priv, "failed send event "
>>   			   "(status=%d, wrid=%d vend_err %x)\n",
>>   			   wc->status, wr_id, wc->vendor_err);
>> +		qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
>> +		if (!qp_work) {
>> +			ipoib_warn(priv, "%s Failed alloc
>> ipoib_qp_state_validate for qp: 0x%x\n",
>> +				   __func__, priv->qp->qp_num);
>> +			return;
>> +		}
>> +
>> +		INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
>> +		qp_work->priv = priv;
>> +		queue_work(priv->wq, &qp_work->work);
>> +	}
>>   }
>>
>>   static int poll_tx(struct ipoib_dev_priv *priv)
>> --
>> 1.7.1
>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body
>> of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
>> http://vger.kernel.org/majordomo-info.html
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 15+ messages in thread

* RE: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
       [not found]             ` <552F5A37.2070609-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
@ 2015-04-16  7:29               ` Devesh Sharma
  0 siblings, 0 replies; 15+ messages in thread
From: Devesh Sharma @ 2015-04-16  7:29 UTC (permalink / raw)
  To: Erez Shitrit, Or Gerlitz, Roland Dreier, Doug Ledford
  Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Erez Shitrit, Tal Alon, Amir Vadai

> -----Original Message-----
> From: Erez Shitrit [mailto:erezsh-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org]
> Sent: Thursday, April 16, 2015 12:14 PM
> To: Devesh Sharma; Or Gerlitz; Roland Dreier; Doug Ledford
> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Erez Shitrit; Tal Alon; Amir Vadai
> Subject: Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
> 
> On 4/15/2015 8:20 PM, Devesh Sharma wrote:
> >> -----Original Message-----
> >> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> >> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Or Gerlitz
> >> Sent: Thursday, April 02, 2015 4:09 PM
> >> To: Roland Dreier; Doug Ledford
> >> Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org; Erez Shitrit; Tal Alon; Amir Vadai;
> >> Or Gerlitz
> >> Subject: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
> >>
> >> From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> >>
> >> As the result of a completion error the QP can moved to SQE state by
> >> the hardware. Since it's not the Error state, there are no flushes
> >> and hence the driver doesn't know about that.
> > As per spec, QP transition to SQE causes flush completion for subsequent
> WQEs, the description is telling other way. Am I missing something?
> 
> No you are not :) . the description tries to say the following: the driver cannot
> distinguish between IB_WC_WR_FLUSH_ERR that threated as IB_WC_SUCCESS
> to IB_WC_WR_FLUSH_ERR that comes after other errors that take the QP to
> SQE, The driver must recognize the first error that is not
> IB_WC_WR_FLUSH_ERR and handle accordingly.
> For example, the driver can take the QP to ERROR state as a part of its life
> cycle (drain it, driver down, etc.) and at these situations many
> IB_WC_WR_FLUSH_ERR return and no need to change the state of the QP, it is
> already under the handling of the driver, which is not when other error comes.
> this is the intention of that patch, to return the QP back to life after that (un-
> handed) cases.

Okay, makes sense to me.

> 
> >> The fix creates a task that after completion with error which is not
> >> a flush tracks the QP state and if it is in SQE state moves it back to RTS.
> >>
> >> Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> >> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
> >> ---
> >>   drivers/infiniband/ulp/ipoib/ipoib.h    |    5 +++
> >>   drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59
> >> ++++++++++++++++++++++++++++++-
> >>   2 files changed, 63 insertions(+), 1 deletions(-)
> >>
> >> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h
> >> b/drivers/infiniband/ulp/ipoib/ipoib.h
> >> index 769044c..2703d9a 100644
> >> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
> >> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
> >> @@ -299,6 +299,11 @@ struct ipoib_neigh_table {
> >>   	struct completion		deleted;
> >>   };
> >>
> >> +struct ipoib_qp_state_validate {
> >> +	struct work_struct work;
> >> +	struct ipoib_dev_priv   *priv;
> >> +};
> >> +
> >>   /*
> >>    * Device private locking: network stack tx_lock protects members used
> >>    * in TX fast path, lock protects everything else.  lock nests
> >> inside diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> >> b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> >> index 29b376d..63b92cb 100644
> >> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> >> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
> >> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device
> *ca,
> >>   	}
> >>   }
> >>
> >> +/*
> >> + * As the result of a completion error the QP Can be transferred to SQE
> states.
> >> + * The function checks if the (send)QP is in SQE state and
> >> + * moves it back to RTS state, that in order to have it functional again.
> >> + */
> >> +static void ipoib_qp_state_validate_work(struct work_struct *work) {
> >> +	struct ipoib_qp_state_validate *qp_work =
> >> +		container_of(work, struct ipoib_qp_state_validate, work);
> >> +
> >> +	struct ipoib_dev_priv *priv = qp_work->priv;
> >> +	struct ib_qp_attr qp_attr;
> >> +	struct ib_qp_init_attr query_init_attr;
> >> +	int ret;
> >> +
> >> +	ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
> >> +	if (ret) {
> >> +		ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
> >> +			   __func__, ret);
> >> +		goto free_res;
> >> +	}
> >> +	pr_info("%s: QP: 0x%x is in state: %d\n",
> >> +		__func__, priv->qp->qp_num, qp_attr.qp_state);
> >> +
> >> +	/* currently support only in SQE->RTS transition*/
> >> +	if (qp_attr.qp_state == IB_QPS_SQE) {
> >> +		qp_attr.qp_state = IB_QPS_RTS;
> >> +
> >> +		ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
> >> +		if (ret) {
> >> +			pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
> >> +				ret, priv->qp->qp_num);
> >> +			goto free_res;
> >> +		}
> >> +		pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to
> >> IB_QPS_RTS\n",
> >> +			__func__, priv->qp->qp_num);
> >> +	} else {
> >> +		pr_warn("QP (%d) will stay in state: %d\n",
> >> +			priv->qp->qp_num, qp_attr.qp_state);
> >> +	}
> >> +
> >> +free_res:
> >> +	kfree(qp_work);
> >> +}
> >> +
> >>   static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc
> *wc)  {
> >>   	struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -358,10 +403,22
> >> @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct
> >> ib_wc
> >> *wc)
> >>   		netif_wake_queue(dev);
> >>
> >>   	if (wc->status != IB_WC_SUCCESS &&
> >> -	    wc->status != IB_WC_WR_FLUSH_ERR)
> >> +	    wc->status != IB_WC_WR_FLUSH_ERR) {
> >> +		struct ipoib_qp_state_validate *qp_work;
> >>   		ipoib_warn(priv, "failed send event "
> >>   			   "(status=%d, wrid=%d vend_err %x)\n",
> >>   			   wc->status, wr_id, wc->vendor_err);
> >> +		qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
> >> +		if (!qp_work) {
> >> +			ipoib_warn(priv, "%s Failed alloc
> >> ipoib_qp_state_validate for qp: 0x%x\n",
> >> +				   __func__, priv->qp->qp_num);
> >> +			return;
> >> +		}
> >> +
> >> +		INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
> >> +		qp_work->priv = priv;
> >> +		queue_work(priv->wq, &qp_work->work);
> >> +	}
> >>   }
> >>
> >>   static int poll_tx(struct ipoib_dev_priv *priv)
> >> --
> >> 1.7.1
> >>
> >> --
> >> To unsubscribe from this list: send the line "unsubscribe linux-rdma"
> >> in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo
> >> info at http://vger.kernel.org/majordomo-info.html
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-rdma"
> > in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo
> > info at  http://vger.kernel.org/majordomo-info.html
> >

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
       [not found]             ` <CAAk-MO8CX7zrbSnVc9NJt5pYTO4Apj_j4PSosXRKwe0BKv+1Aw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-04-16 13:29               ` Doug Ledford
       [not found]                 ` <26D88BAE-0450-4191-AE16-088F3714DC0E-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Doug Ledford @ 2015-04-16 13:29 UTC (permalink / raw)
  To: Erez Shitrit
  Cc: Erez Shitrit, Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Tal Alon, Amir Vadai, Or Gerlitz

[-- Attachment #1: Type: text/plain, Size: 8686 bytes --]


> On Apr 15, 2015, at 9:29 AM, Erez Shitrit <erezsh-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org> wrote:
> 
> Hi Doug
> 
> (now with "reply to all" :) )
> 
> On Wed, Apr 15, 2015 at 3:08 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>> 
>>> On Apr 2, 2015, at 6:39 AM, Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
>>> 
>>> From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>> 
>>> As the result of a completion error the QP can moved to SQE state by
>>> the hardware. Since it's not the Error state, there are no flushes
>>> and hence the driver doesn't know about that.
>>> 
>>> The fix creates a task that after completion with error which is not a
>>> flush tracks the QP state and if it is in SQE state moves it back to RTS.
>> 
>> I assume you found this particular issue while tracking down the cause of the problem that you fixed in patch #6?  Otherwise, this is a “never should happen” situation, yes?
>> 
> 
> Not only. it happened in the case that described in patch#6 and also
> in some other errors that caused the FW/HW to decide move the UD QO to
> SQE state (in the category of "bad thing happened.."  like damaged
> WQE, FW problem, etc.)
> The patch originally was written to solve such cases which we really
> saw, at testing, customers etc. and not for patch#6
> 
>>> Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>> ---
>>> drivers/infiniband/ulp/ipoib/ipoib.h    |    5 +++
>>> drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59 ++++++++++++++++++++++++++++++-
>>> 2 files changed, 63 insertions(+), 1 deletions(-)
>>> 
>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
>>> index 769044c..2703d9a 100644
>>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
>>> @@ -299,6 +299,11 @@ struct ipoib_neigh_table {
>>>      struct completion               deleted;
>>> };
>>> 
>>> +struct ipoib_qp_state_validate {
>>> +     struct work_struct work;
>>> +     struct ipoib_dev_priv   *priv;
>>> +};
>> 
>> Why did you choose to do an allocated work struct?  All of the other tasks we have use a static work struct that’s part of the ipoib private dev, so this is a bit out of the norm.  In its current form, you only use this on the single UD queue pair that the dev uses, and not ever on any connected mode queue pairs, so it could be static.  Or were you looking at the possibility of making the task work on the CM queue pairs and so left it allocated?  And if that, why not include a reference to the broken queue pair in the work struct instead of always using priv->qp?
>> 
> 
> It looks (at least for me) more elegant to create "on-the-fly" work
> create/queue and delete at the end (BTW, I saw it many times, other
> than ipoib).
> There is no need for that in the CM mode, there the connection and the
> QP are destroyed, and new one will be created, only in UD mode the QP
> is forever.

I knew it wasn’t needed in CM mode currently, but I didn’t know if there was a plan in your mind to change the way CM worked so that it tried to rescue a queue pair instead of destroying and starting over.

However, I really think we need to be consistent in this driver.  If we allocate our work struct in one and only one place, then we run into the problem where someone working on the driver in the future has to try and remember “Gee, is this the one function that allocates our work struct and we must then free it to avoid leaking memory, or was that a different function?”  I would overlook that if we had a plan that involved this function ever operating on anything other than the UD QP, in which case you would have to allocate a struct to designate the QP in question, and you would have to allocate a struct per instance because it would technically be possible to have more than one QP in an SQE state at once.  But if that’s not the plan, then please rework this patch to add the work struct to the ipoib private dev like we do with all the other work structs.  I’d like to swap this one out of my tree for your new one ASAP.

> 
>>> /*
>>> * Device private locking: network stack tx_lock protects members used
>>> * in TX fast path, lock protects everything else.  lock nests inside
>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>>> index 29b376d..63b92cb 100644
>>> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>>> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
>>>      }
>>> }
>>> 
>>> +/*
>>> + * As the result of a completion error the QP Can be transferred to SQE states.
>>> + * The function checks if the (send)QP is in SQE state and
>>> + * moves it back to RTS state, that in order to have it functional again.
>>> + */
>>> +static void ipoib_qp_state_validate_work(struct work_struct *work)
>>> +{
>>> +     struct ipoib_qp_state_validate *qp_work =
>>> +             container_of(work, struct ipoib_qp_state_validate, work);
>>> +
>>> +     struct ipoib_dev_priv *priv = qp_work->priv;
>>> +     struct ib_qp_attr qp_attr;
>>> +     struct ib_qp_init_attr query_init_attr;
>>> +     int ret;
>>> +
>>> +     ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
>>> +     if (ret) {
>>> +             ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
>>> +                        __func__, ret);
>>> +             goto free_res;
>>> +     }
>>> +     pr_info("%s: QP: 0x%x is in state: %d\n",
>>> +             __func__, priv->qp->qp_num, qp_attr.qp_state);
>>> +
>>> +     /* currently support only in SQE->RTS transition*/
>>> +     if (qp_attr.qp_state == IB_QPS_SQE) {
>>> +             qp_attr.qp_state = IB_QPS_RTS;
>>> +
>>> +             ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
>>> +             if (ret) {
>>> +                     pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
>>> +                             ret, priv->qp->qp_num);
>>> +                     goto free_res;
>>> +             }
>>> +             pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
>>> +                     __func__, priv->qp->qp_num);
>>> +     } else {
>>> +             pr_warn("QP (%d) will stay in state: %d\n",
>>> +                     priv->qp->qp_num, qp_attr.qp_state);
>>> +     }
>>> +
>>> +free_res:
>>> +     kfree(qp_work);
>>> +}
>>> +
>>> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>>> {
>>>      struct ipoib_dev_priv *priv = netdev_priv(dev);
>>> @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>>>              netif_wake_queue(dev);
>>> 
>>>      if (wc->status != IB_WC_SUCCESS &&
>>> -         wc->status != IB_WC_WR_FLUSH_ERR)
>>> +         wc->status != IB_WC_WR_FLUSH_ERR) {
>>> +             struct ipoib_qp_state_validate *qp_work;
>>>              ipoib_warn(priv, "failed send event "
>>>                         "(status=%d, wrid=%d vend_err %x)\n",
>>>                         wc->status, wr_id, wc->vendor_err);
>>> +             qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
>>> +             if (!qp_work) {
>>> +                     ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n",
>>> +                                __func__, priv->qp->qp_num);
>>> +                     return;
>>> +             }
>>> +
>>> +             INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
>>> +             qp_work->priv = priv;
>>> +             queue_work(priv->wq, &qp_work->work);
>>> +     }
>>> }
>>> 
>>> static int poll_tx(struct ipoib_dev_priv *priv)
>>> --
>>> 1.7.1
>>> 
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> 
>> —
>> Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>>        GPG Key ID: 0E572FDD
>> 
>> 
>> 
>> 
>> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

—
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
	GPG Key ID: 0E572FDD






[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 842 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
       [not found]                 ` <26D88BAE-0450-4191-AE16-088F3714DC0E-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
@ 2015-04-16 14:32                   ` Erez Shitrit
       [not found]                     ` <CAAk-MO_nojMF-xTOCSAwE6xO8-2Ad9Hb4GUqR3z5zZB8z7B+9Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 15+ messages in thread
From: Erez Shitrit @ 2015-04-16 14:32 UTC (permalink / raw)
  To: Doug Ledford
  Cc: Erez Shitrit, Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Tal Alon, Amir Vadai, Or Gerlitz

On Thu, Apr 16, 2015 at 4:29 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>
>> On Apr 15, 2015, at 9:29 AM, Erez Shitrit <erezsh-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org> wrote:
>>
>> Hi Doug
>>
>> (now with "reply to all" :) )
>>
>> On Wed, Apr 15, 2015 at 3:08 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>>>
>>>> On Apr 2, 2015, at 6:39 AM, Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
>>>>
>>>> From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>
>>>> As the result of a completion error the QP can moved to SQE state by
>>>> the hardware. Since it's not the Error state, there are no flushes
>>>> and hence the driver doesn't know about that.
>>>>
>>>> The fix creates a task that after completion with error which is not a
>>>> flush tracks the QP state and if it is in SQE state moves it back to RTS.
>>>
>>> I assume you found this particular issue while tracking down the cause of the problem that you fixed in patch #6?  Otherwise, this is a “never should happen” situation, yes?
>>>
>>
>> Not only. it happened in the case that described in patch#6 and also
>> in some other errors that caused the FW/HW to decide move the UD QO to
>> SQE state (in the category of "bad thing happened.."  like damaged
>> WQE, FW problem, etc.)
>> The patch originally was written to solve such cases which we really
>> saw, at testing, customers etc. and not for patch#6
>>
>>>> Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>> ---
>>>> drivers/infiniband/ulp/ipoib/ipoib.h    |    5 +++
>>>> drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59 ++++++++++++++++++++++++++++++-
>>>> 2 files changed, 63 insertions(+), 1 deletions(-)
>>>>
>>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
>>>> index 769044c..2703d9a 100644
>>>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
>>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
>>>> @@ -299,6 +299,11 @@ struct ipoib_neigh_table {
>>>>      struct completion               deleted;
>>>> };
>>>>
>>>> +struct ipoib_qp_state_validate {
>>>> +     struct work_struct work;
>>>> +     struct ipoib_dev_priv   *priv;
>>>> +};
>>>
>>> Why did you choose to do an allocated work struct?  All of the other tasks we have use a static work struct that’s part of the ipoib private dev, so this is a bit out of the norm.  In its current form, you only use this on the single UD queue pair that the dev uses, and not ever on any connected mode queue pairs, so it could be static.  Or were you looking at the possibility of making the task work on the CM queue pairs and so left it allocated?  And if that, why not include a reference to the broken queue pair in the work struct instead of always using priv->qp?
>>>
>>
>> It looks (at least for me) more elegant to create "on-the-fly" work
>> create/queue and delete at the end (BTW, I saw it many times, other
>> than ipoib).
>> There is no need for that in the CM mode, there the connection and the
>> QP are destroyed, and new one will be created, only in UD mode the QP
>> is forever.
>
> I knew it wasn’t needed in CM mode currently, but I didn’t know if there was a plan in your mind to change the way CM worked so that it tried to rescue a queue pair instead of destroying and starting over.
>
> However, I really think we need to be consistent in this driver.  If we allocate our work struct in one and only one place, then we run into the problem where someone working on the driver in the future has to try and remember “Gee, is this the one function that allocates our work struct and we must then free it to avoid leaking memory, or was that a different function?”  I would overlook that if we had a plan that involved this function ever operating on anything other than the UD QP, in which case you would have to allocate a struct to designate the QP in question, and you would have to allocate a struct per instance because it would technically be possible to have more than one QP in an SQE state at once.  But if that’s not the plan, then please rework this patch to add the work struct to the ipoib private dev like we do with all the other work structs.  I’d like to swap this one out of my tree for your new one ASAP.
>

There is a plan to have more than one UD QP per device, in order to
have multi-queue interfaces (not following the CM QP model, but with the
same idea that you described: many objects, each of which can use a
different work item).
So, I think that this way of allocating the work object is still reasonable.

>>
>>>> /*
>>>> * Device private locking: network stack tx_lock protects members used
>>>> * in TX fast path, lock protects everything else.  lock nests inside
>>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>>>> index 29b376d..63b92cb 100644
>>>> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>>>> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
>>>>      }
>>>> }
>>>>
>>>> +/*
>>>> + * As the result of a completion error the QP Can be transferred to SQE states.
>>>> + * The function checks if the (send)QP is in SQE state and
>>>> + * moves it back to RTS state, that in order to have it functional again.
>>>> + */
>>>> +static void ipoib_qp_state_validate_work(struct work_struct *work)
>>>> +{
>>>> +     struct ipoib_qp_state_validate *qp_work =
>>>> +             container_of(work, struct ipoib_qp_state_validate, work);
>>>> +
>>>> +     struct ipoib_dev_priv *priv = qp_work->priv;
>>>> +     struct ib_qp_attr qp_attr;
>>>> +     struct ib_qp_init_attr query_init_attr;
>>>> +     int ret;
>>>> +
>>>> +     ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
>>>> +     if (ret) {
>>>> +             ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
>>>> +                        __func__, ret);
>>>> +             goto free_res;
>>>> +     }
>>>> +     pr_info("%s: QP: 0x%x is in state: %d\n",
>>>> +             __func__, priv->qp->qp_num, qp_attr.qp_state);
>>>> +
>>>> +     /* currently support only in SQE->RTS transition*/
>>>> +     if (qp_attr.qp_state == IB_QPS_SQE) {
>>>> +             qp_attr.qp_state = IB_QPS_RTS;
>>>> +
>>>> +             ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
>>>> +             if (ret) {
>>>> +                     pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
>>>> +                             ret, priv->qp->qp_num);
>>>> +                     goto free_res;
>>>> +             }
>>>> +             pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
>>>> +                     __func__, priv->qp->qp_num);
>>>> +     } else {
>>>> +             pr_warn("QP (%d) will stay in state: %d\n",
>>>> +                     priv->qp->qp_num, qp_attr.qp_state);
>>>> +     }
>>>> +
>>>> +free_res:
>>>> +     kfree(qp_work);
>>>> +}
>>>> +
>>>> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>>>> {
>>>>      struct ipoib_dev_priv *priv = netdev_priv(dev);
>>>> @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>>>>              netif_wake_queue(dev);
>>>>
>>>>      if (wc->status != IB_WC_SUCCESS &&
>>>> -         wc->status != IB_WC_WR_FLUSH_ERR)
>>>> +         wc->status != IB_WC_WR_FLUSH_ERR) {
>>>> +             struct ipoib_qp_state_validate *qp_work;
>>>>              ipoib_warn(priv, "failed send event "
>>>>                         "(status=%d, wrid=%d vend_err %x)\n",
>>>>                         wc->status, wr_id, wc->vendor_err);
>>>> +             qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
>>>> +             if (!qp_work) {
>>>> +                     ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n",
>>>> +                                __func__, priv->qp->qp_num);
>>>> +                     return;
>>>> +             }
>>>> +
>>>> +             INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
>>>> +             qp_work->priv = priv;
>>>> +             queue_work(priv->wq, &qp_work->work);
>>>> +     }
>>>> }
>>>>
>>>> static int poll_tx(struct ipoib_dev_priv *priv)
>>>> --
>>>> 1.7.1
>>>>
>>>> --
>>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>>>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>
>>> —
>>> Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>>>        GPG Key ID: 0E572FDD
>>>
>>>
>>>
>>>
>>>
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
> —
> Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>         GPG Key ID: 0E572FDD
>
>
>
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state
       [not found]                     ` <CAAk-MO_nojMF-xTOCSAwE6xO8-2Ad9Hb4GUqR3z5zZB8z7B+9Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2015-04-16 14:41                       ` Doug Ledford
  0 siblings, 0 replies; 15+ messages in thread
From: Doug Ledford @ 2015-04-16 14:41 UTC (permalink / raw)
  To: Erez Shitrit
  Cc: Erez Shitrit, Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA,
	Tal Alon, Amir Vadai, Or Gerlitz

[-- Attachment #1: Type: text/plain, Size: 9737 bytes --]


> On Apr 16, 2015, at 10:32 AM, Erez Shitrit <erezsh-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org> wrote:
> 
> On Thu, Apr 16, 2015 at 4:29 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>> 
>>> On Apr 15, 2015, at 9:29 AM, Erez Shitrit <erezsh-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org> wrote:
>>> 
>>> Hi Doug
>>> 
>>> (now with "reply to all" :) )
>>> 
>>> On Wed, Apr 15, 2015 at 3:08 PM, Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org> wrote:
>>>> 
>>>>> On Apr 2, 2015, at 6:39 AM, Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org> wrote:
>>>>> 
>>>>> From: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>> 
>>>>> As the result of a completion error the QP can moved to SQE state by
>>>>> the hardware. Since it's not the Error state, there are no flushes
>>>>> and hence the driver doesn't know about that.
>>>>> 
>>>>> The fix creates a task that after completion with error which is not a
>>>>> flush tracks the QP state and if it is in SQE state moves it back to RTS.
>>>> 
>>>> I assume you found this particular issue while tracking down the cause of the problem that you fixed in patch #6?  Otherwise, this is a “never should happen” situation, yes?
>>>> 
>>> 
>>> Not only. it happened in the case that described in patch#6 and also
>>> in some other errors that caused the FW/HW to decide move the UD QO to
>>> SQE state (in the category of "bad thing happened.."  like damaged
>>> WQE, FW problem, etc.)
>>> The patch originally was written to solve such cases which we really
>>> saw, at testing, customers etc. and not for patch#6
>>> 
>>>>> Signed-off-by: Erez Shitrit <erezsh-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>> Signed-off-by: Or Gerlitz <ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
>>>>> ---
>>>>> drivers/infiniband/ulp/ipoib/ipoib.h    |    5 +++
>>>>> drivers/infiniband/ulp/ipoib/ipoib_ib.c |   59 ++++++++++++++++++++++++++++++-
>>>>> 2 files changed, 63 insertions(+), 1 deletions(-)
>>>>> 
>>>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
>>>>> index 769044c..2703d9a 100644
>>>>> --- a/drivers/infiniband/ulp/ipoib/ipoib.h
>>>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib.h
>>>>> @@ -299,6 +299,11 @@ struct ipoib_neigh_table {
>>>>>     struct completion               deleted;
>>>>> };
>>>>> 
>>>>> +struct ipoib_qp_state_validate {
>>>>> +     struct work_struct work;
>>>>> +     struct ipoib_dev_priv   *priv;
>>>>> +};
>>>> 
>>>> Why did you choose to do an allocated work struct?  All of the other tasks we have use a static work struct that’s part of the ipoib private dev, so this is a bit out of the norm. In its current form, you only use this on the single UD queue pair that the dev uses, and not ever on any connected mode queue pairs, so it could be static.  Or were you looking at the possibility of making the task work on the CM queue pairs and so left it allocated? And if that, why not include a reference to the broken queue pair in the work struct instead of always using priv->qp?
>>>> 
>>> 
>>> It looks (at least for me) more elegant to create "on-the-fly" work
>>> create/queue and delete at the end (BTW, I saw it many times, other
>>> than ipoib).
>>> There is no need for that in the CM mode, there the connection and the
>>> QP are destroyed, and new one will be created, only in UD mode the QP
>>> is forever.
>> 
>> I knew it wasn’t needed in CM mode currently, but I didn’t know if there was a plan in your mind to change the way CM worked so that it tried to rescue a queue pair instead of destroying and starting over.
>> 
>> However, I really think we need to be consistent in this driver.  If we allocate our work struct in one and only one place, then we run into the problem where someone working on the driver in the future has to try and remember “Gee, is this the one function that allocates our work struct and we must then free it to avoid leaking memory, or was that a different function?”  I would overlook that if we had a plan that involved this function ever operating on anything other than the UD QP, in which case you would have to allocate a struct to designate the QP in question, and you would have to allocate a struct per instance because it would technically be possible to have more than one QP in an SQE state at once.  But if that’s not the plan, then please rework this patch to add the work struct to the ipoib private dev like we do with all the other work structs.  I’d like to swap this one out of my tree for your new one ASAP.
>> 
> 
> There is a plan to have more than one UD QP per device, in order to
> have multi-queue interfaces (not according to the CM QP but with the
> same idea that you said, many objects that can use different work
> each).
> So, i think that this way to allocate work object is still reasonable.

OK, then yes, that changes things.  The patch as it stands is good then.

> 
>>> 
>>>>> /*
>>>>> * Device private locking: network stack tx_lock protects members used
>>>>> * in TX fast path, lock protects everything else.  lock nests inside
>>>>> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>>>>> index 29b376d..63b92cb 100644
>>>>> --- a/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>>>>> +++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
>>>>> @@ -327,6 +327,51 @@ static void ipoib_dma_unmap_tx(struct ib_device *ca,
>>>>>     }
>>>>> }
>>>>> 
>>>>> +/*
>>>>> + * As the result of a completion error the QP Can be transferred to SQE states.
>>>>> + * The function checks if the (send)QP is in SQE state and
>>>>> + * moves it back to RTS state, that in order to have it functional again.
>>>>> + */
>>>>> +static void ipoib_qp_state_validate_work(struct work_struct *work)
>>>>> +{
>>>>> +     struct ipoib_qp_state_validate *qp_work =
>>>>> +             container_of(work, struct ipoib_qp_state_validate, work);
>>>>> +
>>>>> +     struct ipoib_dev_priv *priv = qp_work->priv;
>>>>> +     struct ib_qp_attr qp_attr;
>>>>> +     struct ib_qp_init_attr query_init_attr;
>>>>> +     int ret;
>>>>> +
>>>>> +     ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
>>>>> +     if (ret) {
>>>>> +             ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
>>>>> +                        __func__, ret);
>>>>> +             goto free_res;
>>>>> +     }
>>>>> +     pr_info("%s: QP: 0x%x is in state: %d\n",
>>>>> +             __func__, priv->qp->qp_num, qp_attr.qp_state);
>>>>> +
>>>>> +     /* currently support only in SQE->RTS transition*/
>>>>> +     if (qp_attr.qp_state == IB_QPS_SQE) {
>>>>> +             qp_attr.qp_state = IB_QPS_RTS;
>>>>> +
>>>>> +             ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
>>>>> +             if (ret) {
>>>>> +                     pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
>>>>> +                             ret, priv->qp->qp_num);
>>>>> +                     goto free_res;
>>>>> +             }
>>>>> +             pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
>>>>> +                     __func__, priv->qp->qp_num);
>>>>> +     } else {
>>>>> +             pr_warn("QP (%d) will stay in state: %d\n",
>>>>> +                     priv->qp->qp_num, qp_attr.qp_state);
>>>>> +     }
>>>>> +
>>>>> +free_res:
>>>>> +     kfree(qp_work);
>>>>> +}
>>>>> +
>>>>> static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>>>>> {
>>>>>     struct ipoib_dev_priv *priv = netdev_priv(dev);
>>>>> @@ -358,10 +403,22 @@ static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
>>>>>             netif_wake_queue(dev);
>>>>> 
>>>>>     if (wc->status != IB_WC_SUCCESS &&
>>>>> -         wc->status != IB_WC_WR_FLUSH_ERR)
>>>>> +         wc->status != IB_WC_WR_FLUSH_ERR) {
>>>>> +             struct ipoib_qp_state_validate *qp_work;
>>>>>             ipoib_warn(priv, "failed send event "
>>>>>                        "(status=%d, wrid=%d vend_err %x)\n",
>>>>>                        wc->status, wr_id, wc->vendor_err);
>>>>> +             qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
>>>>> +             if (!qp_work) {
>>>>> +                     ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n",
>>>>> +                                __func__, priv->qp->qp_num);
>>>>> +                     return;
>>>>> +             }
>>>>> +
>>>>> +             INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
>>>>> +             qp_work->priv = priv;
>>>>> +             queue_work(priv->wq, &qp_work->work);
>>>>> +     }
>>>>> }
>>>>> 
>>>>> static int poll_tx(struct ipoib_dev_priv *priv)
>>>>> --
>>>>> 1.7.1
>>>>> 
>>>>> --
>>>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>>>>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>>>> 
>>>> —
>>>> Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>>>>       GPG Key ID: 0E572FDD
>>>> 
>>>> 
>>>> 
>>>> 
>>>> 
>>> --
>>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>>> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
>>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> 
>> —
>> Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
>>        GPG Key ID: 0E572FDD

—
Doug Ledford <dledford-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
	GPG Key ID: 0E572FDD






[-- Attachment #2: Message signed with OpenPGP using GPGMail --]
[-- Type: application/pgp-signature, Size: 842 bytes --]

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2015-04-16 14:41 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-02 10:38 [PATCH for-next 0/6] mlx4 and more IPoIB changes for 4.1 Or Gerlitz
     [not found] ` <1427971145-26943-1-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-04-02 10:39   ` [PATCH for-next 1/6] IB/ipoib: Use one linear skb in RX flow Or Gerlitz
2015-04-02 10:39   ` [PATCH for-next 2/6] IB/ipoib: Update broadcast record values after each successful join request Or Gerlitz
2015-04-02 10:39   ` [PATCH for-next 3/6] IB/ipoib: Handle QP in SQE state Or Gerlitz
     [not found]     ` <1427971145-26943-4-git-send-email-ogerlitz-VPRAkNaXOzVWk0Htik3J/w@public.gmane.org>
2015-04-15 12:08       ` Doug Ledford
     [not found]         ` <13F95634-36B6-4759-A300-E140CB7FF70D-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-04-15 13:29           ` Erez Shitrit
     [not found]             ` <CAAk-MO8CX7zrbSnVc9NJt5pYTO4Apj_j4PSosXRKwe0BKv+1Aw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-04-16 13:29               ` Doug Ledford
     [not found]                 ` <26D88BAE-0450-4191-AE16-088F3714DC0E-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org>
2015-04-16 14:32                   ` Erez Shitrit
     [not found]                     ` <CAAk-MO_nojMF-xTOCSAwE6xO8-2Ad9Hb4GUqR3z5zZB8z7B+9Q-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2015-04-16 14:41                       ` Doug Ledford
2015-04-15 17:20       ` Devesh Sharma
     [not found]         ` <EE7902D3F51F404C82415C4803930ACD5DC42DB7-DWYeeINJQrxExQ8dmkPuX0M9+F4ksjoh@public.gmane.org>
2015-04-16  6:44           ` Erez Shitrit
     [not found]             ` <552F5A37.2070609-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2015-04-16  7:29               ` Devesh Sharma
2015-04-02 10:39   ` [PATCH for-next 4/6] IB/ipoib: Save only IPOIB_MAX_PATH_REC_QUEUE skb's Or Gerlitz
2015-04-02 10:39   ` [PATCH for-next 5/6] IB/ipoib: Remove IPOIB_MCAST_RUN bit Or Gerlitz
2015-04-02 10:39   ` [PATCH for-next 6/6] IB/mlx4: Fix WQE LSO segment calculation Or Gerlitz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.