linux-rdma.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH for-rc v2 0/4] RDMA/rxe: Cleanup retransmit and rnr retry
@ 2022-05-14  3:04 Bob Pearson
  2022-05-14  3:04 ` [PATCH for-rc v2 1/4] RDMA/rxe: Rename qp->qp_timeout_jiffies Bob Pearson
                   ` (3 more replies)
  0 siblings, 4 replies; 6+ messages in thread
From: Bob Pearson @ 2022-05-14  3:04 UTC (permalink / raw)
  To: jgg, zyjzyj2000, tom, linux-rdma, jhack; +Cc: Bob Pearson

This series of patches makes some minor cleanups of retry related
code in rdma_rxe. It fixes incorrect behavior in rnr retry and
reduces the number of spurious retransmit retry events.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
v2
  The 3/4 patch was previously submitted alone and is now modified to
  respond to suggestions made by Tom Talpey.

  The other 3 patches are newly added.

Bob Pearson (4):
  RDMA/rxe: Rename qp->qp_timeout_jiffies
  RDMA/rxe: Add a pkt->mask bit for ack_req
  RDMA/rxe: Fix rnr retry behavior
  RDMA/rxe: Reduce spurious retransmit timeouts

 drivers/infiniband/sw/rxe/rxe_comp.c   | 58 ++++++++++++++++++--------
 drivers/infiniband/sw/rxe/rxe_opcode.h |  1 +
 drivers/infiniband/sw/rxe/rxe_qp.c     |  9 ++--
 drivers/infiniband/sw/rxe/rxe_req.c    | 51 ++++++++++++++++++----
 drivers/infiniband/sw/rxe/rxe_verbs.h  |  8 ++--
 5 files changed, 94 insertions(+), 33 deletions(-)


base-commit: c5eb0a61238dd6faf37f58c9ce61c9980aaffd7a
-- 
2.34.1


^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH for-rc v2 1/4] RDMA/rxe: Rename qp->qp_timeout_jiffies
  2022-05-14  3:04 [PATCH for-rc v2 0/4] RDMA/rxe: Cleanup retransmit and rnr retry Bob Pearson
@ 2022-05-14  3:04 ` Bob Pearson
  2022-05-14  3:04 ` [PATCH for-rc v2 2/4] RDMA/rxe: Add a pkt->mask bit for ack_req Bob Pearson
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 6+ messages in thread
From: Bob Pearson @ 2022-05-14  3:04 UTC (permalink / raw)
  To: jgg, zyjzyj2000, tom, linux-rdma, jhack; +Cc: Bob Pearson

Replace qp->qp_timeout_jiffies by the shorter name
qp->timeout_jiffies which is a little less redundant.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_comp.c  | 4 ++--
 drivers/infiniband/sw/rxe/rxe_qp.c    | 8 ++++----
 drivers/infiniband/sw/rxe/rxe_req.c   | 5 +++--
 drivers/infiniband/sw/rxe/rxe_verbs.h | 6 +++---
 4 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index 138b3e7d3a5f..badd423966dc 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -672,9 +672,9 @@ int rxe_completer(void *arg)
 			if ((qp_type(qp) == IB_QPT_RC) &&
 			    (qp->req.state == QP_STATE_READY) &&
 			    (psn_compare(qp->req.psn, qp->comp.psn) > 0) &&
-			    qp->qp_timeout_jiffies)
+			    qp->timeout_jiffies)
 				mod_timer(&qp->retrans_timer,
-					  jiffies + qp->qp_timeout_jiffies);
+					  jiffies + qp->timeout_jiffies);
 			ret = -EAGAIN;
 			goto done;
 
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index 62acf890af6c..fc22ff36fdea 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -253,7 +253,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
 	rxe_init_task(rxe, &qp->comp.task, qp,
 		      rxe_completer, "comp");
 
-	qp->qp_timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */
+	qp->timeout_jiffies = 0; /* Can't be set for UD/UC in modify_qp */
 	if (init->qp_type == IB_QPT_RC) {
 		timer_setup(&qp->rnr_nak_timer, rnr_nak_timer, 0);
 		timer_setup(&qp->retrans_timer, retransmit_timer, 0);
@@ -633,12 +633,12 @@ int rxe_qp_from_attr(struct rxe_qp *qp, struct ib_qp_attr *attr, int mask,
 	if (mask & IB_QP_TIMEOUT) {
 		qp->attr.timeout = attr->timeout;
 		if (attr->timeout == 0) {
-			qp->qp_timeout_jiffies = 0;
+			qp->timeout_jiffies = 0;
 		} else {
 			/* According to the spec, timeout = 4.096 * 2 ^ attr->timeout [us] */
 			int j = nsecs_to_jiffies(4096ULL << attr->timeout);
 
-			qp->qp_timeout_jiffies = j ? j : 1;
+			qp->timeout_jiffies = j ? j : 1;
 		}
 	}
 
@@ -781,7 +781,7 @@ int rxe_qp_chk_destroy(struct rxe_qp *qp)
 void rxe_qp_destroy(struct rxe_qp *qp)
 {
 	qp->valid = 0;
-	qp->qp_timeout_jiffies = 0;
+	qp->timeout_jiffies = 0;
 	rxe_cleanup_task(&qp->resp.task);
 
 	if (qp_type(qp) == IB_QPT_RC) {
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index ae5fbc79dd5c..1884e3a64310 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -157,6 +157,7 @@ static struct rxe_send_wqe *req_next_wqe(struct rxe_qp *qp)
 		return NULL;
 
 	wqe = queue_addr_from_index(q, index);
+	WARN_ON((long)wqe & (L1_CACHE_BYTES - 1));
 
 	if (unlikely((qp->req.state == QP_STATE_DRAIN ||
 		      qp->req.state == QP_STATE_DRAINED) &&
@@ -538,9 +539,9 @@ static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
 
 	qp->need_req_skb = 0;
 
-	if (qp->qp_timeout_jiffies && !timer_pending(&qp->retrans_timer))
+	if (qp->timeout_jiffies && !timer_pending(&qp->retrans_timer))
 		mod_timer(&qp->retrans_timer,
-			  jiffies + qp->qp_timeout_jiffies);
+			  jiffies + qp->timeout_jiffies);
 }
 
 static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index e7eff1ca75e9..83b6f80440d8 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -249,11 +249,11 @@ struct rxe_qp {
 	 * started. The responder resets it whenever an ack is
 	 * received.
 	 */
-	struct timer_list retrans_timer;
-	u64 qp_timeout_jiffies;
+	struct timer_list	retrans_timer;
+	u64			timeout_jiffies;
 
 	/* Timer for handling RNR NAKS. */
-	struct timer_list rnr_nak_timer;
+	struct timer_list	rnr_nak_timer;
 
 	spinlock_t		state_lock; /* guard requester and completer */
 
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH for-rc v2 2/4] RDMA/rxe: Add a pkt->mask bit for ack_req
  2022-05-14  3:04 [PATCH for-rc v2 0/4] RDMA/rxe: Cleanup retransmit and rnr retry Bob Pearson
  2022-05-14  3:04 ` [PATCH for-rc v2 1/4] RDMA/rxe: Rename qp->qp_timeout_jiffies Bob Pearson
@ 2022-05-14  3:04 ` Bob Pearson
  2022-05-14  3:04 ` [PATCH for-rc v2 3/4] RDMA/rxe: Fix rnr retry behavior Bob Pearson
  2022-05-14  3:04 ` [PATCH for-rc v2 4/4] RDMA/rxe: Reduce spurious retransmit timeouts Bob Pearson
  3 siblings, 0 replies; 6+ messages in thread
From: Bob Pearson @ 2022-05-14  3:04 UTC (permalink / raw)
  To: jgg, zyjzyj2000, tom, linux-rdma, jhack; +Cc: Bob Pearson

Add a bit to pkt->mask indicating that the ackreq bit has been
set in the current packet. Use this bit to condition setting
the retransmit timer since a packet without the ackreq bit set
will not generate a response.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_opcode.h |  1 +
 drivers/infiniband/sw/rxe/rxe_req.c    | 11 +++++++----
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_opcode.h b/drivers/infiniband/sw/rxe/rxe_opcode.h
index 8f9aaaf260f2..629504245e98 100644
--- a/drivers/infiniband/sw/rxe/rxe_opcode.h
+++ b/drivers/infiniband/sw/rxe/rxe_opcode.h
@@ -80,6 +80,7 @@ enum rxe_hdr_mask {
 	RXE_END_MASK		= BIT(NUM_HDR_TYPES + 10),
 
 	RXE_LOOPBACK_MASK	= BIT(NUM_HDR_TYPES + 12),
+	RXE_ACK_REQ_MASK	= BIT(NUM_HDR_TYPES + 13),
 
 	RXE_READ_OR_ATOMIC_MASK	= (RXE_READ_MASK | RXE_ATOMIC_MASK),
 	RXE_WRITE_OR_SEND_MASK	= (RXE_WRITE_MASK | RXE_SEND_MASK),
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index 1884e3a64310..d15165e9319c 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -394,11 +394,13 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp,
 
 	ack_req = ((pkt->mask & RXE_END_MASK) ||
 		(qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
-	if (ack_req)
+	if (ack_req) {
 		qp->req.noack_pkts = 0;
+		pkt->mask |= RXE_ACK_REQ_MASK;
+	}
 
-	bth_init(pkt, pkt->opcode, solicited, 0, pad, IB_DEFAULT_PKEY_FULL, qp_num,
-		 ack_req, pkt->psn);
+	bth_init(pkt, pkt->opcode, solicited, 0, pad, IB_DEFAULT_PKEY_FULL,
+		 qp_num, ack_req, pkt->psn);
 
 	/* init optional headers */
 	if (pkt->mask & RXE_RETH_MASK) {
@@ -539,7 +541,8 @@ static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
 
 	qp->need_req_skb = 0;
 
-	if (qp->timeout_jiffies && !timer_pending(&qp->retrans_timer))
+	if ((pkt->mask & RXE_ACK_REQ_MASK) && qp->timeout_jiffies &&
+	    !timer_pending(&qp->retrans_timer))
 		mod_timer(&qp->retrans_timer,
 			  jiffies + qp->timeout_jiffies);
 }
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH for-rc v2 3/4] RDMA/rxe: Fix rnr retry behavior
  2022-05-14  3:04 [PATCH for-rc v2 0/4] RDMA/rxe: Cleanup retransmit and rnr retry Bob Pearson
  2022-05-14  3:04 ` [PATCH for-rc v2 1/4] RDMA/rxe: Rename qp->qp_timeout_jiffies Bob Pearson
  2022-05-14  3:04 ` [PATCH for-rc v2 2/4] RDMA/rxe: Add a pkt->mask bit for ack_req Bob Pearson
@ 2022-05-14  3:04 ` Bob Pearson
  2022-05-14  3:04 ` [PATCH for-rc v2 4/4] RDMA/rxe: Reduce spurious retransmit timeouts Bob Pearson
  3 siblings, 0 replies; 6+ messages in thread
From: Bob Pearson @ 2022-05-14  3:04 UTC (permalink / raw)
  To: jgg, zyjzyj2000, tom, linux-rdma, jhack; +Cc: Bob Pearson

Currently the completer tasklet when it sets the retransmit timer or the
rnr timer sets the same flag (qp->req.need_retry) so that if either
timer fires it will attempt to perform a retry flow on the send queue.
This has the effect of responding to an RNR NAK at the first retransmit
timer event which might not allow for the requested rnr timeout.

This patch adds a new flag (qp->req.wait_for_rnr_timer) which, if set,
prevents a retry flow until the rnr nak timer fires.

This patch fixes rnr retry errors which can be observed by running the
pyverbs test_rdmacm_async_traffic_external_qp multiple times. With this
patch applied they do not occur.

Link: https://lore.kernel.org/linux-rdma/a8287823-1408-4273-bc22-99a0678db640@gmail.com/
Link: https://lore.kernel.org/linux-rdma/2bafda9e-2bb6-186d-12a1-179e8f6a2678@talpey.com/
Fixes: 8700e3e7c485 ("Soft RoCE (RXE) - The software RoCE driver")
Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
v2
  Added comments and changed the name of the new flag to make things
  more understandable per an email exchange with Tom Talpey.
---
 drivers/infiniband/sw/rxe/rxe_comp.c  |  8 +++++++-
 drivers/infiniband/sw/rxe/rxe_qp.c    |  1 +
 drivers/infiniband/sw/rxe/rxe_req.c   | 15 +++++++++++++--
 drivers/infiniband/sw/rxe/rxe_verbs.h |  1 +
 4 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index badd423966dc..3c77201c01d1 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -114,6 +114,8 @@ void retransmit_timer(struct timer_list *t)
 {
 	struct rxe_qp *qp = from_timer(qp, t, retrans_timer);
 
+	pr_debug("%s: fired for qp#%d\n", __func__, qp->elem.index);
+
 	if (qp->valid) {
 		qp->comp.timeout = 1;
 		rxe_run_task(&qp->comp.task, 1);
@@ -729,11 +731,15 @@ int rxe_completer(void *arg)
 			break;
 
 		case COMPST_RNR_RETRY:
+			/* we come here if we received an RNR NAK */
 			if (qp->comp.rnr_retry > 0) {
 				if (qp->comp.rnr_retry != 7)
 					qp->comp.rnr_retry--;
 
-				qp->req.need_retry = 1;
+				/* don't start a retry flow until the
+				 * rnr timer has fired
+				 */
+				qp->req.wait_for_rnr_timer = 1;
 				pr_debug("qp#%d set rnr nak timer\n",
 					 qp_num(qp));
 				mod_timer(&qp->rnr_nak_timer,
diff --git a/drivers/infiniband/sw/rxe/rxe_qp.c b/drivers/infiniband/sw/rxe/rxe_qp.c
index fc22ff36fdea..2f6f378a232d 100644
--- a/drivers/infiniband/sw/rxe/rxe_qp.c
+++ b/drivers/infiniband/sw/rxe/rxe_qp.c
@@ -513,6 +513,7 @@ static void rxe_qp_reset(struct rxe_qp *qp)
 	atomic_set(&qp->ssn, 0);
 	qp->req.opcode = -1;
 	qp->req.need_retry = 0;
+	qp->req.wait_for_rnr_timer = 0;
 	qp->req.noack_pkts = 0;
 	qp->resp.msn = 0;
 	qp->resp.opcode = -1;
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index d15165e9319c..aa9066ff5257 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -103,7 +103,11 @@ void rnr_nak_timer(struct timer_list *t)
 {
 	struct rxe_qp *qp = from_timer(qp, t, rnr_nak_timer);
 
-	pr_debug("qp#%d rnr nak timer fired\n", qp_num(qp));
+	pr_debug("%s: fired for qp#%d\n", __func__, qp_num(qp));
+
+	/* request a send queue retry */
+	qp->req.need_retry = 1;
+	qp->req.wait_for_rnr_timer = 0;
 	rxe_run_task(&qp->req.task, 1);
 }
 
@@ -628,10 +632,17 @@ int rxe_requester(void *arg)
 		qp->req.need_rd_atomic = 0;
 		qp->req.wait_psn = 0;
 		qp->req.need_retry = 0;
+		qp->req.wait_for_rnr_timer = 0;
 		goto exit;
 	}
 
-	if (unlikely(qp->req.need_retry)) {
+	/* we come here if the retransmot timer has fired
+	 * or if the rnr timer has fired. If the retransmit
+	 * timer fires while we are processing an RNR NAK wait
+	 * until the rnr timer has fired before starting the
+	 * retry flow
+	 */
+	if (unlikely(qp->req.need_retry && !qp->req.wait_for_rnr_timer)) {
 		req_retry(qp);
 		qp->req.need_retry = 0;
 	}
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index 83b6f80440d8..a6c6f0d786c7 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -123,6 +123,7 @@ struct rxe_req_info {
 	int			need_rd_atomic;
 	int			wait_psn;
 	int			need_retry;
+	int			wait_for_rnr_timer;
 	int			noack_pkts;
 	struct rxe_task		task;
 };
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH for-rc v2 4/4] RDMA/rxe: Reduce spurious retransmit timeouts
  2022-05-14  3:04 [PATCH for-rc v2 0/4] RDMA/rxe: Cleanup retransmit and rnr retry Bob Pearson
                   ` (2 preceding siblings ...)
  2022-05-14  3:04 ` [PATCH for-rc v2 3/4] RDMA/rxe: Fix rnr retry behavior Bob Pearson
@ 2022-05-14  3:04 ` Bob Pearson
  2022-05-14 16:41   ` Bob Pearson
  3 siblings, 1 reply; 6+ messages in thread
From: Bob Pearson @ 2022-05-14  3:04 UTC (permalink / raw)
  To: jgg, zyjzyj2000, tom, linux-rdma, jhack; +Cc: Bob Pearson

Currently the rxe driver sets a timer to handle response timeouts
from lost packets or other failures. However this timer is never
deleted so it will always fire in the period it was set for.
Under heavy load this means that there will be retry flows attempted
every few msec depending on the setting for retry timeout.

This patch modifies the driver in several ways to reduce the
number of spurious timeouts:

 - The psn of the current packet is saved when the timer is set
   for non-read requests and for read requests a psn in the
   expected response is saved. When a response packet is received
   by the completer tasklet that has a psn >= to the saved psn
   the timer is deleted.
 - As soon as there is no pending timer any new request will
   reset the timer. For long read replies the timer is reset
   as blocks of data arrive so that the response is covered by
   a timer.

For typical heavy loads (e.g. ib_send_bw etc.) the number of
retries is reduced from ~40 retries per second to one every
several seconds. These are legitimate timeouts where the responder
does not reply in the given time.

Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
---
 drivers/infiniband/sw/rxe/rxe_comp.c  | 50 ++++++++++++++++++---------
 drivers/infiniband/sw/rxe/rxe_req.c   | 26 ++++++++++++--
 drivers/infiniband/sw/rxe/rxe_verbs.h |  1 +
 3 files changed, 57 insertions(+), 20 deletions(-)

diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
index 3c77201c01d1..a8ef1f781a24 100644
--- a/drivers/infiniband/sw/rxe/rxe_comp.c
+++ b/drivers/infiniband/sw/rxe/rxe_comp.c
@@ -170,12 +170,44 @@ static inline void reset_retry_counters(struct rxe_qp *qp)
 	qp->comp.started_retry = 0;
 }
 
+static void update_retransmit_timer(struct rxe_qp *qp,
+				    struct rxe_pkt_info *pkt,
+				    struct rxe_send_wqe *wqe)
+{
+	/* reset the retry timer for long read responses
+	 * if there is more data expected
+	 */
+	if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) {
+		int psn = (pkt->psn + RXE_MAX_PKT_PER_ACK) & 0xffffff;
+
+		if (psn_compare(psn, wqe->last_psn) > 0)
+			psn = wqe->last_psn;
+
+		if (psn_compare(psn, pkt->psn) > 0) {
+			atomic_set(&qp->timeout_psn, psn);
+			mod_timer(&qp->retrans_timer,
+				  jiffies + qp->timeout_jiffies);
+			return;
+		}
+	}
+
+	/* else just delete the timer */
+	del_timer_sync(&qp->retrans_timer);
+}
+
 static inline enum comp_state check_psn(struct rxe_qp *qp,
 					struct rxe_pkt_info *pkt,
 					struct rxe_send_wqe *wqe)
 {
 	s32 diff;
 
+	/* if this packet psn matches or exceeds the request that set the
+	 * retry timer update the timer.
+	 */
+	if ((psn_compare(pkt->psn, atomic_read(&qp->timeout_psn)) >= 0) &&
+	    timer_pending(&qp->retrans_timer))
+		update_retransmit_timer(qp, pkt, wqe);
+
 	/* check to see if response is past the oldest WQE. if it is, complete
 	 * send/write or error read/atomic
 	 */
@@ -663,20 +695,6 @@ int rxe_completer(void *arg)
 				break;
 			}
 
-			/* re reset the timeout counter if
-			 * (1) QP is type RC
-			 * (2) the QP is alive
-			 * (3) there is a packet sent by the requester that
-			 *     might be acked (we still might get spurious
-			 *     timeouts but try to keep them as few as possible)
-			 * (4) the timeout parameter is set
-			 */
-			if ((qp_type(qp) == IB_QPT_RC) &&
-			    (qp->req.state == QP_STATE_READY) &&
-			    (psn_compare(qp->req.psn, qp->comp.psn) > 0) &&
-			    qp->timeout_jiffies)
-				mod_timer(&qp->retrans_timer,
-					  jiffies + qp->timeout_jiffies);
 			ret = -EAGAIN;
 			goto done;
 
@@ -684,9 +702,7 @@ int rxe_completer(void *arg)
 			/* we come here if the retry timer fired and we did
 			 * not receive a response packet. try to retry the send
 			 * queue if that makes sense and the limits have not
-			 * been exceeded. remember that some timeouts are
-			 * spurious since we do not reset the timer but kick
-			 * it down the road or let it expire
+			 * been exceeded.
 			 */
 
 			/* there is nothing to retry in this case */
diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
index aa9066ff5257..5e031661bc49 100644
--- a/drivers/infiniband/sw/rxe/rxe_req.c
+++ b/drivers/infiniband/sw/rxe/rxe_req.c
@@ -397,7 +397,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp,
 					 qp->attr.dest_qp_num;
 
 	ack_req = ((pkt->mask & RXE_END_MASK) ||
-		(qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
+		   (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK) ||
+		   (qp->timeout_jiffies &&
+				!timer_pending(&qp->retrans_timer)));
 	if (ack_req) {
 		qp->req.noack_pkts = 0;
 		pkt->mask |= RXE_ACK_REQ_MASK;
@@ -545,10 +547,28 @@ static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
 
 	qp->need_req_skb = 0;
 
-	if ((pkt->mask & RXE_ACK_REQ_MASK) && qp->timeout_jiffies &&
-	    !timer_pending(&qp->retrans_timer))
+	/* reset the retry timer if the packet has the ackreq bit set,
+	 * we are using the retry timer and there is no timer set
+	 * currently
+	 */
+	if (pkt->mask & RXE_ACK_REQ_MASK && qp->timeout_jiffies &&
+	    !timer_pending(&qp->retrans_timer)) {
+		int psn = pkt->psn;
+
+		/* for read ops delay matching psn by RXE_MAX_PKT_PER_ACK
+		 * up to the last psn. The completer will also reset the
+		 * retry timer as required
+		 */
+		if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) {
+			psn = (psn + RXE_MAX_PKT_PER_ACK) & 0xffffff;
+			if (psn_compare(psn, wqe->last_psn) > 0)
+				psn = wqe->last_psn;
+		}
+
+		atomic_set(&qp->timeout_psn, psn);
 		mod_timer(&qp->retrans_timer,
 			  jiffies + qp->timeout_jiffies);
+	}
 }
 
 static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
index a6c6f0d786c7..c41092d790f0 100644
--- a/drivers/infiniband/sw/rxe/rxe_verbs.h
+++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
@@ -252,6 +252,7 @@ struct rxe_qp {
 	 */
 	struct timer_list	retrans_timer;
 	u64			timeout_jiffies;
+	atomic_t		timeout_psn;
 
 	/* Timer for handling RNR NAKS. */
 	struct timer_list	rnr_nak_timer;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH for-rc v2 4/4] RDMA/rxe: Reduce spurious retransmit timeouts
  2022-05-14  3:04 ` [PATCH for-rc v2 4/4] RDMA/rxe: Reduce spurious retransmit timeouts Bob Pearson
@ 2022-05-14 16:41   ` Bob Pearson
  0 siblings, 0 replies; 6+ messages in thread
From: Bob Pearson @ 2022-05-14 16:41 UTC (permalink / raw)
  To: jgg, zyjzyj2000, tom, linux-rdma, jhack

On 5/13/22 22:04, Bob Pearson wrote:
> Currently the rxe driver sets a timer to handle response timeouts
> from lost packets or other failures. However this timer is never
> deleted so it will always fire in the period it was set for.
> Under heavy load this means that there will be retry flows attempted
> every few msec depending on the setting for retry timeout.
> 
> This patch modifies the driver in several ways to reduce the
> number of spurious timeouts:
> 
>  - The psn of the current packet is saved when the timer is set
>    for non-read requests and for read requests a psn in the
>    expected response is saved. When a response packet is received
>    by the completer tasklet that has a psn >= to the saved psn
>    the timer is deleted.
>  - As soon as there is no pending timer any new request will
>    reset the timer. For long read replies the timer is reset
>    as blocks of data arrive so that the response is covered by
>    a timer.
> 
> For typical heavy loads (e.g. ib_send_bw etc.) the number of
> retries is reduced from ~40 retries per second to one every
> several seconds. These are legitimate timeouts where the responder
> does not reply in the given time.
> 
> Signed-off-by: Bob Pearson <rpearsonhpe@gmail.com>
> ---
>  drivers/infiniband/sw/rxe/rxe_comp.c  | 50 ++++++++++++++++++---------
>  drivers/infiniband/sw/rxe/rxe_req.c   | 26 ++++++++++++--
>  drivers/infiniband/sw/rxe/rxe_verbs.h |  1 +
>  3 files changed, 57 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/infiniband/sw/rxe/rxe_comp.c b/drivers/infiniband/sw/rxe/rxe_comp.c
> index 3c77201c01d1..a8ef1f781a24 100644
> --- a/drivers/infiniband/sw/rxe/rxe_comp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_comp.c
> @@ -170,12 +170,44 @@ static inline void reset_retry_counters(struct rxe_qp *qp)
>  	qp->comp.started_retry = 0;
>  }
>  
> +static void update_retransmit_timer(struct rxe_qp *qp,
> +				    struct rxe_pkt_info *pkt,
> +				    struct rxe_send_wqe *wqe)
> +{
> +	/* reset the retry timer for long read responses
> +	 * if there is more data expected
> +	 */
> +	if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) {

Oops. This is the wrong opcode. Should be one of the read replies.
Bob

> +		int psn = (pkt->psn + RXE_MAX_PKT_PER_ACK) & 0xffffff;
> +
> +		if (psn_compare(psn, wqe->last_psn) > 0)
> +			psn = wqe->last_psn;
> +
> +		if (psn_compare(psn, pkt->psn) > 0) {
> +			atomic_set(&qp->timeout_psn, psn);
> +			mod_timer(&qp->retrans_timer,
> +				  jiffies + qp->timeout_jiffies);
> +			return;
> +		}
> +	}
> +
> +	/* else just delete the timer */
> +	del_timer_sync(&qp->retrans_timer);
> +}
> +
>  static inline enum comp_state check_psn(struct rxe_qp *qp,
>  					struct rxe_pkt_info *pkt,
>  					struct rxe_send_wqe *wqe)
>  {
>  	s32 diff;
>  
> +	/* if this packet psn matches or exceeds the request that set the
> +	 * retry timer update the timer.
> +	 */
> +	if ((psn_compare(pkt->psn, atomic_read(&qp->timeout_psn)) >= 0) &&
> +	    timer_pending(&qp->retrans_timer))
> +		update_retransmit_timer(qp, pkt, wqe);
> +
>  	/* check to see if response is past the oldest WQE. if it is, complete
>  	 * send/write or error read/atomic
>  	 */
> @@ -663,20 +695,6 @@ int rxe_completer(void *arg)
>  				break;
>  			}
>  
> -			/* re reset the timeout counter if
> -			 * (1) QP is type RC
> -			 * (2) the QP is alive
> -			 * (3) there is a packet sent by the requester that
> -			 *     might be acked (we still might get spurious
> -			 *     timeouts but try to keep them as few as possible)
> -			 * (4) the timeout parameter is set
> -			 */
> -			if ((qp_type(qp) == IB_QPT_RC) &&
> -			    (qp->req.state == QP_STATE_READY) &&
> -			    (psn_compare(qp->req.psn, qp->comp.psn) > 0) &&
> -			    qp->timeout_jiffies)
> -				mod_timer(&qp->retrans_timer,
> -					  jiffies + qp->timeout_jiffies);
>  			ret = -EAGAIN;
>  			goto done;
>  
> @@ -684,9 +702,7 @@ int rxe_completer(void *arg)
>  			/* we come here if the retry timer fired and we did
>  			 * not receive a response packet. try to retry the send
>  			 * queue if that makes sense and the limits have not
> -			 * been exceeded. remember that some timeouts are
> -			 * spurious since we do not reset the timer but kick
> -			 * it down the road or let it expire
> +			 * been exceeded.
>  			 */
>  
>  			/* there is nothing to retry in this case */
> diff --git a/drivers/infiniband/sw/rxe/rxe_req.c b/drivers/infiniband/sw/rxe/rxe_req.c
> index aa9066ff5257..5e031661bc49 100644
> --- a/drivers/infiniband/sw/rxe/rxe_req.c
> +++ b/drivers/infiniband/sw/rxe/rxe_req.c
> @@ -397,7 +397,9 @@ static struct sk_buff *init_req_packet(struct rxe_qp *qp,
>  					 qp->attr.dest_qp_num;
>  
>  	ack_req = ((pkt->mask & RXE_END_MASK) ||
> -		(qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK));
> +		   (qp->req.noack_pkts++ > RXE_MAX_PKT_PER_ACK) ||
> +		   (qp->timeout_jiffies &&
> +				!timer_pending(&qp->retrans_timer)));
>  	if (ack_req) {
>  		qp->req.noack_pkts = 0;
>  		pkt->mask |= RXE_ACK_REQ_MASK;
> @@ -545,10 +547,28 @@ static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
>  
>  	qp->need_req_skb = 0;
>  
> -	if ((pkt->mask & RXE_ACK_REQ_MASK) && qp->timeout_jiffies &&
> -	    !timer_pending(&qp->retrans_timer))
> +	/* reset the retry timer if the packet has the ackreq bit set,
> +	 * we are using the retry timer and there is no timer set
> +	 * currently
> +	 */
> +	if (pkt->mask & RXE_ACK_REQ_MASK && qp->timeout_jiffies &&
> +	    !timer_pending(&qp->retrans_timer)) {
> +		int psn = pkt->psn;
> +
> +		/* for read ops delay matching psn by RXE_MAX_PKT_PER_ACK
> +		 * up to the last psn. The completer will also reset the
> +		 * retry timer as required
> +		 */
> +		if ((pkt->opcode & 0x1f) == IB_OPCODE_RDMA_READ_REQUEST) {
> +			psn = (psn + RXE_MAX_PKT_PER_ACK) & 0xffffff;
> +			if (psn_compare(psn, wqe->last_psn) > 0)
> +				psn = wqe->last_psn;
> +		}
> +
> +		atomic_set(&qp->timeout_psn, psn);
>  		mod_timer(&qp->retrans_timer,
>  			  jiffies + qp->timeout_jiffies);
> +	}
>  }
>  
>  static int rxe_do_local_ops(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
> diff --git a/drivers/infiniband/sw/rxe/rxe_verbs.h b/drivers/infiniband/sw/rxe/rxe_verbs.h
> index a6c6f0d786c7..c41092d790f0 100644
> --- a/drivers/infiniband/sw/rxe/rxe_verbs.h
> +++ b/drivers/infiniband/sw/rxe/rxe_verbs.h
> @@ -252,6 +252,7 @@ struct rxe_qp {
>  	 */
>  	struct timer_list	retrans_timer;
>  	u64			timeout_jiffies;
> +	atomic_t		timeout_psn;
>  
>  	/* Timer for handling RNR NAKS. */
>  	struct timer_list	rnr_nak_timer;


^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2022-05-14 16:42 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-14  3:04 [PATCH for-rc v2 0/4] RDMA/rxe: Cleanup retransmit and rnr retry Bob Pearson
2022-05-14  3:04 ` [PATCH for-rc v2 1/4] RDMA/rxe: Rename qp->qp_timeout_jiffies Bob Pearson
2022-05-14  3:04 ` [PATCH for-rc v2 2/4] RDMA/rxe: Add a pkt->mask bit for ack_req Bob Pearson
2022-05-14  3:04 ` [PATCH for-rc v2 3/4] RDMA/rxe: Fix rnr retry behavior Bob Pearson
2022-05-14  3:04 ` [PATCH for-rc v2 4/4] RDMA/rxe: Reduce spurious retransmit timeouts Bob Pearson
2022-05-14 16:41   ` Bob Pearson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).