linux-rdma.vger.kernel.org archive mirror
From: Frank Huang <tigerinxm@gmail.com>
To: Leon Romanovsky <leon@kernel.org>
Cc: linux-rdma@vger.kernel.org
Subject: Re: rxe panic
Date: Wed, 25 Dec 2019 15:43:13 +0800	[thread overview]
Message-ID: <CAKC_zStPUCpjaF3mJb01A9sPjB0t0GxfBqB=8zsto96dsaD5qQ@mail.gmail.com> (raw)
In-Reply-To: <CAKC_zSts5zdbM4LhUaPBWk8=uKGAKWX6vgd85cdKjOrZViiEJg@mail.gmail.com>

Here is the patch that I used. :)


rdma_rxe (4.14.97) has problems handling out-of-order messages.
This patch backports the rdma_rxe module from linux-5.2.9 to fix these
problems. The fix touches only linux-4.14.97/drivers/infiniband/sw/rxe; at
present, no impact on other modules has been found.
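
For context: the PSNs involved below are 24-bit sequence numbers that wrap
around, so every ordering check in these hunks is a circular comparison
under BTH_PSN_MASK. A minimal userspace sketch of that comparison (modeled
on rxe's psn_compare; a simplified illustration, not the kernel code):

#include <stdint.h>
#include <stdio.h>

#define BTH_PSN_MASK 0xffffff /* PSNs are 24 bits wide */

/* Circular comparison of two 24-bit PSNs: shifting the difference into
 * the top byte of a signed 32-bit value makes wraparound fall out of
 * two's-complement arithmetic. >0: a after b, 0: equal, <0: a before b.
 */
static int psn_compare(uint32_t psn_a, uint32_t psn_b)
{
	return (int32_t)((psn_a - psn_b) << 8);
}

int main(void)
{
	/* 0x000002 is "after" 0xfffffe even though it is numerically smaller */
	printf("%d\n", psn_compare(0x000002, 0xfffffe) > 0);	/* prints 1 */
	/* advancing the expected PSN wraps under the mask */
	printf("0x%06x\n", (0xffffffu + 1) & BTH_PSN_MASK);	/* 0x000000 */
	return 0;
}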

diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_comp.c linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_comp.c
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_comp.c 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_comp.c 2019-09-17 16:00:39.168896560 +0800
@@ -191,6 +191,7 @@
 {
  qp->comp.retry_cnt = qp->attr.retry_cnt;
  qp->comp.rnr_retry = qp->attr.rnr_retry;
+ qp->comp.started_retry = 0;
 }

 static inline enum comp_state check_psn(struct rxe_qp *qp,
@@ -253,6 +254,17 @@
  case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
  if (pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE &&
      pkt->opcode != IB_OPCODE_RC_RDMA_READ_RESPONSE_LAST) {
+ /* read retries of partial data may restart from
+ * read response first or response only.
+ */
+ if ((pkt->psn == wqe->first_psn &&
+      pkt->opcode ==
+      IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) ||
+     (wqe->first_psn == wqe->last_psn &&
+      pkt->opcode ==
+      IB_OPCODE_RC_RDMA_READ_RESPONSE_ONLY))
+ break;
+
  return COMPST_ERROR;
  }
  break;
@@ -270,8 +282,8 @@
  if ((syn & AETH_TYPE_MASK) != AETH_ACK)
  return COMPST_ERROR;

- /* Fall through (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE
- * doesn't have an AETH)
+ /* fall through */
+ /* (IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE doesn't have an AETH)
  */
  case IB_OPCODE_RC_RDMA_READ_RESPONSE_MIDDLE:
  if (wqe->wr.opcode != IB_WR_RDMA_READ &&
@@ -501,11 +513,11 @@
     struct rxe_pkt_info *pkt,
     struct rxe_send_wqe *wqe)
 {
- qp->comp.opcode = -1;
-
- if (pkt) {
- if (psn_compare(pkt->psn, qp->comp.psn) >= 0)
- qp->comp.psn = (pkt->psn + 1) & BTH_PSN_MASK;
+ if (pkt && wqe->state == wqe_state_pending) {
+ if (psn_compare(wqe->last_psn, qp->comp.psn) >= 0) {
+ qp->comp.psn = (wqe->last_psn + 1) & BTH_PSN_MASK;
+ qp->comp.opcode = -1;
+ }

  if (qp->req.wait_psn) {
  qp->req.wait_psn = 0;
@@ -662,7 +674,6 @@
      qp->qp_timeout_jiffies)
  mod_timer(&qp->retrans_timer,
    jiffies + qp->qp_timeout_jiffies);
- WARN_ON_ONCE(skb);
  goto exit;

  case COMPST_ERROR_RETRY:
@@ -676,10 +687,23 @@

  /* there is nothing to retry in this case */
  if (!wqe || (wqe->state == wqe_state_posted)) {
- WARN_ON_ONCE(skb);
  goto exit;
  }

+ /* if we've started a retry, don't start another
+ * retry sequence, unless this is a timeout.
+ */
+ if (qp->comp.started_retry &&
+     !qp->comp.timeout_retry) {
+ if (pkt) {
+ rxe_drop_ref(pkt->qp);
+ kfree_skb(skb);
+ skb = NULL;
+ }
+
+ goto done;
+ }
+
  if (qp->comp.retry_cnt > 0) {
  if (qp->comp.retry_cnt != 7)
  qp->comp.retry_cnt--;
@@ -696,6 +720,7 @@
  rxe_counter_inc(rxe,
  RXE_CNT_COMP_RETRY);
  qp->req.need_retry = 1;
+ qp->comp.started_retry = 1;
  rxe_run_task(&qp->req.task, 1);
  }

@@ -705,8 +730,7 @@
  skb = NULL;
  }

- WARN_ON_ONCE(skb);
- goto exit;
+ goto done;

  } else {
  rxe_counter_inc(rxe, RXE_CNT_RETRY_EXCEEDED);
@@ -749,7 +773,6 @@
  skb = NULL;
  }

- WARN_ON_ONCE(skb);
  goto exit;
  }
  }
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe.h linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe.h
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe.h 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe.h 2019-09-17 16:00:39.169896565 +0800
@@ -74,7 +74,6 @@
  SHASH_DESC_ON_STACK(shash, rxe->tfm);

  shash->tfm = rxe->tfm;
- shash->flags = 0;
  *(u32 *)shash_desc_ctx(shash) = crc;
  err = crypto_shash_update(shash, next, len);
  if (unlikely(err)) {
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_hdr.h linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_hdr.h
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_hdr.h 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_hdr.h 2019-09-17 16:00:39.169896565 +0800
@@ -643,7 +643,7 @@
  __be32 rkey;
  __be64 swap_add;
  __be64 comp;
-} __attribute__((__packed__));
+} __packed;

 static inline u64 __atmeth_va(void *arg)
 {
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_hw_counters.c linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_hw_counters.c
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_hw_counters.c 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_hw_counters.c 2019-09-17 16:00:39.169896565 +0800
@@ -37,11 +37,11 @@
  [RXE_CNT_SENT_PKTS]           =  "sent_pkts",
  [RXE_CNT_RCVD_PKTS]           =  "rcvd_pkts",
  [RXE_CNT_DUP_REQ]             =  "duplicate_request",
- [RXE_CNT_OUT_OF_SEQ_REQ]      =  "out_of_sequence",
+ [RXE_CNT_OUT_OF_SEQ_REQ]      =  "out_of_seq_request",
  [RXE_CNT_RCV_RNR]             =  "rcvd_rnr_err",
  [RXE_CNT_SND_RNR]             =  "send_rnr_err",
  [RXE_CNT_RCV_SEQ_ERR]         =  "rcvd_seq_err",
- [RXE_CNT_COMPLETER_SCHED]     =  "ack_deffered",
+ [RXE_CNT_COMPLETER_SCHED]     =  "ack_deferred",
  [RXE_CNT_RETRY_EXCEEDED]      =  "retry_exceeded_err",
  [RXE_CNT_RNR_RETRY_EXCEEDED]  =  "retry_rnr_exceeded_err",
  [RXE_CNT_COMP_RETRY]          =  "completer_retry_err",
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_loc.h linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_loc.h
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_loc.h 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_loc.h 2019-09-17 16:00:39.170896570 +0800
@@ -268,7 +268,8 @@

  if (pkt->mask & RXE_LOOPBACK_MASK) {
  memcpy(SKB_TO_PKT(skb), pkt, sizeof(*pkt));
- err = rxe_loopback(skb);
+ rxe_loopback(skb);
+ err = 0;
  } else {
  err = rxe_send(rxe, pkt, skb);
  }
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_mmap.c linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_mmap.c
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_mmap.c 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_mmap.c 2019-09-17 16:00:39.170896570 +0800
@@ -146,6 +146,8 @@
     void *obj)
 {
  struct rxe_mmap_info *ip;
+    if (!context)
+     return ERR_PTR(-EINVAL);

  ip = kmalloc(sizeof(*ip), GFP_KERNEL);
  if (!ip)
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_pool.c linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_pool.c
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_pool.c 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_pool.c 2019-09-17 16:00:39.171896575 +0800
@@ -112,6 +112,20 @@
  return rxe_type_info[pool->type].cache;
 }

+static void rxe_cache_clean(size_t cnt)
+{
+ int i;
+ struct rxe_type_info *type;
+
+ for (i = 0; i < cnt; i++) {
+ type = &rxe_type_info[i];
+ if (!(type->flags & RXE_POOL_NO_ALLOC)) {
+ kmem_cache_destroy(type->cache);
+ type->cache = NULL;
+ }
+ }
+}
+
 int rxe_cache_init(void)
 {
  int err;
@@ -136,24 +150,14 @@
  return 0;

 err1:
- while (--i >= 0) {
- kmem_cache_destroy(type->cache);
- type->cache = NULL;
- }
+ rxe_cache_clean(i);

  return err;
 }

 void rxe_cache_exit(void)
 {
- int i;
- struct rxe_type_info *type;
-
- for (i = 0; i < RXE_NUM_TYPES; i++) {
- type = &rxe_type_info[i];
- kmem_cache_destroy(type->cache);
- type->cache = NULL;
- }
+ rxe_cache_clean(RXE_NUM_TYPES);
 }

 static int rxe_pool_init_index(struct rxe_pool *pool, u32 max, u32 min)
@@ -207,7 +211,7 @@

  kref_init(&pool->ref_cnt);

- spin_lock_init(&pool->pool_lock);
+ rwlock_init(&pool->pool_lock);

  if (rxe_type_info[type].flags & RXE_POOL_INDEX) {
  err = rxe_pool_init_index(pool,
@@ -222,7 +226,7 @@
  pool->key_size = rxe_type_info[type].key_size;
  }

- pool->state = rxe_pool_valid;
+ pool->state = RXE_POOL_STATE_VALID;

 out:
  return err;
@@ -232,7 +236,7 @@
 {
  struct rxe_pool *pool = container_of(kref, struct rxe_pool, ref_cnt);

- pool->state = rxe_pool_invalid;
+ pool->state = RXE_POOL_STATE_INVALID;
  kfree(pool->table);
 }

@@ -245,12 +249,12 @@
 {
  unsigned long flags;

- spin_lock_irqsave(&pool->pool_lock, flags);
- pool->state = rxe_pool_invalid;
+ write_lock_irqsave(&pool->pool_lock, flags);
+ pool->state = RXE_POOL_STATE_INVALID;
  if (atomic_read(&pool->num_elem) > 0)
  pr_warn("%s pool destroyed with unfree'd elem\n",
  pool_name(pool));
- spin_unlock_irqrestore(&pool->pool_lock, flags);
+ write_unlock_irqrestore(&pool->pool_lock, flags);

  rxe_pool_put(pool);

@@ -336,10 +340,10 @@
  struct rxe_pool *pool = elem->pool;
  unsigned long flags;

- spin_lock_irqsave(&pool->pool_lock, flags);
+ write_lock_irqsave(&pool->pool_lock, flags);
  memcpy((u8 *)elem + pool->key_offset, key, pool->key_size);
  insert_key(pool, elem);
- spin_unlock_irqrestore(&pool->pool_lock, flags);
+ write_unlock_irqrestore(&pool->pool_lock, flags);
 }

 void rxe_drop_key(void *arg)
@@ -348,9 +352,9 @@
  struct rxe_pool *pool = elem->pool;
  unsigned long flags;

- spin_lock_irqsave(&pool->pool_lock, flags);
+ write_lock_irqsave(&pool->pool_lock, flags);
  rb_erase(&elem->node, &pool->tree);
- spin_unlock_irqrestore(&pool->pool_lock, flags);
+ write_unlock_irqrestore(&pool->pool_lock, flags);
 }

 void rxe_add_index(void *arg)
@@ -359,10 +363,10 @@
  struct rxe_pool *pool = elem->pool;
  unsigned long flags;

- spin_lock_irqsave(&pool->pool_lock, flags);
+ write_lock_irqsave(&pool->pool_lock, flags);
  elem->index = alloc_index(pool);
  insert_index(pool, elem);
- spin_unlock_irqrestore(&pool->pool_lock, flags);
+ write_unlock_irqrestore(&pool->pool_lock, flags);
 }

 void rxe_drop_index(void *arg)
@@ -371,10 +375,10 @@
  struct rxe_pool *pool = elem->pool;
  unsigned long flags;

- spin_lock_irqsave(&pool->pool_lock, flags);
+ write_lock_irqsave(&pool->pool_lock, flags);
  clear_bit(elem->index - pool->min_index, pool->table);
  rb_erase(&elem->node, &pool->tree);
- spin_unlock_irqrestore(&pool->pool_lock, flags);
+ write_unlock_irqrestore(&pool->pool_lock, flags);
 }

 void *rxe_alloc(struct rxe_pool *pool)
@@ -384,13 +388,13 @@

  might_sleep_if(!(pool->flags & RXE_POOL_ATOMIC));

- spin_lock_irqsave(&pool->pool_lock, flags);
- if (pool->state != rxe_pool_valid) {
- spin_unlock_irqrestore(&pool->pool_lock, flags);
+ read_lock_irqsave(&pool->pool_lock, flags);
+ if (pool->state != RXE_POOL_STATE_VALID) {
+ read_unlock_irqrestore(&pool->pool_lock, flags);
  return NULL;
  }
  kref_get(&pool->ref_cnt);
- spin_unlock_irqrestore(&pool->pool_lock, flags);
+ read_unlock_irqrestore(&pool->pool_lock, flags);

  kref_get(&pool->rxe->ref_cnt);

@@ -436,9 +440,9 @@
  struct rxe_pool_entry *elem = NULL;
  unsigned long flags;

- spin_lock_irqsave(&pool->pool_lock, flags);
+ read_lock_irqsave(&pool->pool_lock, flags);

- if (pool->state != rxe_pool_valid)
+ if (pool->state != RXE_POOL_STATE_VALID)
  goto out;

  node = pool->tree.rb_node;
@@ -450,15 +454,14 @@
  node = node->rb_left;
  else if (elem->index < index)
  node = node->rb_right;
- else
+ else {
+ kref_get(&elem->ref_cnt);
  break;
+ }
  }

- if (node)
- kref_get(&elem->ref_cnt);
-
 out:
- spin_unlock_irqrestore(&pool->pool_lock, flags);
+ read_unlock_irqrestore(&pool->pool_lock, flags);
  return node ? elem : NULL;
 }

@@ -469,9 +472,9 @@
  int cmp;
  unsigned long flags;

- spin_lock_irqsave(&pool->pool_lock, flags);
+ read_lock_irqsave(&pool->pool_lock, flags);

- if (pool->state != rxe_pool_valid)
+ if (pool->state != RXE_POOL_STATE_VALID)
  goto out;

  node = pool->tree.rb_node;
@@ -494,6 +497,6 @@
  kref_get(&elem->ref_cnt);

 out:
- spin_unlock_irqrestore(&pool->pool_lock, flags);
+ read_unlock_irqrestore(&pool->pool_lock, flags);
  return node ? elem : NULL;
 }
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_pool.h linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_pool.h
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_pool.h 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_pool.h 2019-09-17 16:00:39.171896575 +0800
@@ -41,6 +41,7 @@
  RXE_POOL_ATOMIC = BIT(0),
  RXE_POOL_INDEX = BIT(1),
  RXE_POOL_KEY = BIT(2),
+ RXE_POOL_NO_ALLOC = BIT(4),
 };

 enum rxe_elem_type {
@@ -74,8 +75,8 @@
 extern struct rxe_type_info rxe_type_info[];

 enum rxe_pool_state {
- rxe_pool_invalid,
- rxe_pool_valid,
+ RXE_POOL_STATE_INVALID,
+ RXE_POOL_STATE_VALID,
 };

 struct rxe_pool_entry {
@@ -90,7 +91,7 @@

 struct rxe_pool {
  struct rxe_dev *rxe;
- spinlock_t              pool_lock; /* pool spinlock */
+ rwlock_t pool_lock; /* protects pool add/del/search */
  size_t elem_size;
  struct kref ref_cnt;
  void (*cleanup)(struct rxe_pool_entry *obj);
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_qp.c linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_qp.c
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_qp.c 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_qp.c 2019-09-17 16:00:39.172896580 +0800
@@ -235,6 +235,16 @@
  return err;
  qp->sk->sk->sk_user_data = qp;

+ /* pick a source UDP port number for this QP based on
+ * the source QPN. this spreads traffic for different QPs
+ * across different NIC RX queues (while using a single
+ * flow for a given QP to maintain packet order).
+ * the port number must be in the Dynamic Ports range
+ * (0xc000 - 0xffff).
+ */
+ qp->src_port = RXE_ROCE_V2_SPORT +
+ (hash_32_generic(qp_num(qp), 14) & 0x3fff);
+
  qp->sq.max_wr = init->cap.max_send_wr;
  qp->sq.max_sge = init->cap.max_send_sge;
  qp->sq.max_inline = init->cap.max_inline_data;
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_req.c linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_req.c
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_req.c 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_req.c 2019-09-17 16:00:39.172896580 +0800
@@ -73,9 +73,6 @@
  int npsn;
  int first = 1;

- wqe = queue_head(qp->sq.queue);
- npsn = (qp->comp.psn - wqe->first_psn) & BTH_PSN_MASK;
-
  qp->req.wqe_index = consumer_index(qp->sq.queue);
  qp->req.psn = qp->comp.psn;
  qp->req.opcode = -1;
@@ -107,11 +104,17 @@
  if (first) {
  first = 0;

- if (mask & WR_WRITE_OR_SEND_MASK)
+ if (mask & WR_WRITE_OR_SEND_MASK) {
+ npsn = (qp->comp.psn - wqe->first_psn) &
+ BTH_PSN_MASK;
  retry_first_write_send(qp, wqe, mask, npsn);
+ }

- if (mask & WR_READ_MASK)
+ if (mask & WR_READ_MASK) {
+ npsn = (wqe->dma.length - wqe->dma.resid) /
+ qp->mtu;
  wqe->iova += npsn * qp->mtu;
+ }
  }

  wqe->state = wqe_state_posted;
@@ -435,7 +438,7 @@
  if (pkt->mask & RXE_RETH_MASK) {
  reth_set_rkey(pkt, ibwr->wr.rdma.rkey);
  reth_set_va(pkt, wqe->iova);
- reth_set_len(pkt, wqe->dma.length);
+ reth_set_len(pkt, wqe->dma.resid);
  }

  if (pkt->mask & RXE_IMMDT_MASK)
@@ -713,6 +716,7 @@

  if (fill_packet(qp, wqe, &pkt, skb, payload)) {
  pr_debug("qp#%d Error during fill packet\n", qp_num(qp));
+ kfree_skb(skb);
  goto err;
  }

@@ -744,7 +748,6 @@
  goto next_wqe;

 err:
- kfree_skb(skb);
  wqe->status = IB_WC_LOC_PROT_ERR;
  wqe->state = wqe_state_error;
  __rxe_do_task(&qp->comp.task);
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_resp.c linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_resp.c
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_resp.c 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_resp.c 2019-09-17 16:00:39.173896585 +0800
@@ -124,12 +124,9 @@
  struct sk_buff *skb;

  if (qp->resp.state == QP_STATE_ERROR) {
- skb = skb_dequeue(&qp->req_pkts);
- if (skb) {
- /* drain request packet queue */
+ while ((skb = skb_dequeue(&qp->req_pkts))) {
  rxe_drop_ref(qp);
  kfree_skb(skb);
- return RESPST_GET_REQ;
  }

  /* go drain recv wr queue */
@@ -435,6 +432,7 @@
  qp->resp.va = reth_va(pkt);
  qp->resp.rkey = reth_rkey(pkt);
  qp->resp.resid = reth_len(pkt);
+ qp->resp.length = reth_len(pkt);
  }
  access = (pkt->mask & RXE_READ_MASK) ? IB_ACCESS_REMOTE_READ
       : IB_ACCESS_REMOTE_WRITE;
@@ -860,7 +858,9 @@
  pkt->mask & RXE_WRITE_MASK) ?
  IB_WC_RECV_RDMA_WITH_IMM : IB_WC_RECV;
  wc->vendor_err = 0;
- wc->byte_len = wqe->dma.length - wqe->dma.resid;
+ wc->byte_len = (pkt->mask & RXE_IMMDT_MASK &&
+ pkt->mask & RXE_WRITE_MASK) ?
+ qp->resp.length : wqe->dma.length - wqe->dma.resid;

  /* fields after byte_len are different between kernel and user
  * space
diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_verbs.c linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_verbs.c
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_verbs.c 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_verbs.c 2019-09-17 16:00:39.174896590 +0800
@@ -644,6 +644,7 @@
  switch (wr->opcode) {
  case IB_WR_RDMA_WRITE_WITH_IMM:
  wr->ex.imm_data = ibwr->ex.imm_data;
+ /* fall through */
  case IB_WR_RDMA_READ:
  case IB_WR_RDMA_WRITE:
  wr->wr.rdma.remote_addr = rdma_wr(ibwr)->remote_addr;
@@ -774,7 +775,6 @@
  unsigned int mask;
  unsigned int length = 0;
  int i;
- int must_sched;

  while (wr) {
  mask = wr_opcode_mask(wr->opcode, qp);
@@ -804,14 +804,7 @@
  wr = wr->next;
  }

- /*
- * Must sched in case of GSI QP because ib_send_mad() hold irq lock,
- * and the requester call ip_local_out_sk() that takes spin_lock_bh.
- */
- must_sched = (qp_type(qp) == IB_QPT_GSI) ||
- (queue_count(qp->sq.queue) > 1);
-
- rxe_run_task(&qp->req.task, must_sched);
+ rxe_run_task(&qp->req.task, 1);
  if (unlikely(qp->req.state == QP_STATE_ERROR))
  rxe_run_task(&qp->comp.task, 1);

diff -ur linux-4.14.97/drivers/infiniband/sw/rxe/rxe_verbs.h linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_verbs.h
--- linux-4.14.97/drivers/infiniband/sw/rxe/rxe_verbs.h 2019-01-31 15:13:48.000000000 +0800
+++ linux-4.14.97-rxe/drivers/infiniband/sw/rxe/rxe_verbs.h 2019-09-17 16:00:39.174896590 +0800
@@ -160,6 +160,7 @@
  int opcode;
  int timeout;
  int timeout_retry;
+ int started_retry;
  u32 retry_cnt;
  u32 rnr_retry;
  struct rxe_task task;
@@ -214,6 +215,7 @@
  struct rxe_mem *mr;
  u32 resid;
  u32 rkey;
+ u32 length;
  u64 atomic_orig;

  /* SRQ only */
@@ -252,6 +254,7 @@

  struct socket *sk;
  u32 dst_cookie;
+ u16 src_port;

  struct rxe_av pri_av;
  struct rxe_av alt_av;
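
The rxe_qp.c hunk above spreads QPs across NIC RX queues by deriving the
UDP source port from the QPN. A minimal userspace sketch of that mapping
(assuming the kernel's multiplicative hash_32() and RXE_ROCE_V2_SPORT ==
0xc000; both constants are assumptions taken from upstream, not defined in
this patch):

#include <stdint.h>
#include <stdio.h>

#define GOLDEN_RATIO_32   0x61C88647u /* multiplier used by hash_32() */
#define RXE_ROCE_V2_SPORT 0xc000u     /* start of the dynamic port range */

/* Multiplicative hash down to 'bits' bits, as in include/linux/hash.h */
static uint32_t hash_32(uint32_t val, unsigned int bits)
{
	return (val * GOLDEN_RATIO_32) >> (32 - bits);
}

/* Map a QPN to a UDP source port in 0xc000..0xffff: one flow per QP
 * (preserving packet order), different QPs on different ports.
 */
static uint16_t rxe_src_port(uint32_t qpn)
{
	return RXE_ROCE_V2_SPORT + (hash_32(qpn, 14) & 0x3fff);
}

int main(void)
{
	for (uint32_t qpn = 17; qpn < 21; qpn++)
		printf("qpn %u -> sport 0x%04x\n", qpn, rxe_src_port(qpn));
	return 0;
}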

On Wed, Dec 25, 2019 at 3:23 PM Frank Huang <tigerinxm@gmail.com> wrote:
>
> Hi Leon,
>
> I don't quite get what you mean; are you saying the rxe_add_ref(qp) is not needed?
> My kernel is old, and I found some bugs in rxe on 4.14.97, especially
> the RNR errors.
> I cannot upgrade the whole kernel because there are many dependencies.
> So I backported the fixes from the newest kernel version to 4.14.97.
>
> When I compared my rxe_resp.c with kernel 5.2.9, I found that the
> duplicate_request snippet had changed, and that rxe_xmit_packet calls
> rxe_send and hits the log "rdma_rxe: Unknown layer 3 protocol: 0":
>
>   1137         } else {
>   1138                 struct resp_res *res;
>   1139
>   1140                 /* Find the operation in our list of responder resources. */
>   1141                 res = find_resource(qp, pkt->psn);
>   1142                 if (res) {
>   1143                         struct sk_buff *skb_copy;
>   1144
>   1145                         skb_copy = skb_clone(res->atomic.skb, GFP_ATOMIC);
>   1146                         if (skb_copy) {
>   1147                                 rxe_add_ref(qp); /* for the new SKB */
>   1148                         } else {
>   1149                                 pr_warn("Couldn't clone atomic resp\n");
>   1150                                 rc = RESPST_CLEANUP;
>   1151                                 goto out;
>   1152                         }
>   1153
>   1154                         /* Resend the result. */
>   1155                         rc = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp,
>   1156                                              pkt, skb_copy);
>   1157                         if (rc) {
>   1158                                 pr_err("Failed resending result. This flow is not handled - skb ignored\n");
>   1159                                 rxe_drop_ref(qp);
>   1160                                 rc = RESPST_CLEANUP;
>   1161                                 goto out;
>   1162                         }
>   1163                 }
>   1164
>   1165                 /* Resource not found. Class D error. Drop the request. */
>   1166                 rc = RESPST_CLEANUP;
>   1167                 goto out;
>   1168         }
>   1169 out:
>   1170         return rc;
>   1171 }
>
> On Wed, Dec 25, 2019 at 2:33 PM Leon Romanovsky <leon@kernel.org> wrote:
> >
> > On Wed, Dec 25, 2019 at 12:55:35PM +0800, Frank Huang wrote:
> > > Hi, there is a panic in the rdma_rxe module when restarting
> > > network.service or shutting down the switch.
> > >
> > > it looks like a use-after-free error.
> > >
> > > Every time it happens, there is the log "rdma_rxe: Unknown layer 3 protocol: 0"
> >
> > The error print itself is harmless.
> > >
> > > is it a known error?
> > >
> > > my kernel version is 4.14.97
> >
> > Your kernel is old and doesn't include refcount,
> > so I can't say for sure that this is the case, but the
> > following code is not correct, and with refcount debugging
> > it would be caught immediately.
> >
> > 1213 int rxe_responder(void *arg)
> > 1214 {
> > 1215         struct rxe_qp *qp = (struct rxe_qp *)arg;
> > 1216         struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
> > 1217         enum resp_states state;
> > 1218         struct rxe_pkt_info *pkt = NULL;
> > 1219         int ret = 0;
> > 1220
> > 1221         rxe_add_ref(qp); <------ USE-AFTER-FREE
> > 1222
> > 1223         qp->resp.aeth_syndrome = AETH_ACK_UNLIMITED;
> > 1224
> > 1225         if (!qp->valid) {
> > 1226                 ret = -EINVAL;
> > 1227                 goto done;
> > 1228         }
> >
> > Thanks
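
To illustrate the ordering bug Leon is pointing at: rxe_responder() takes a
reference before checking that the QP is still valid, so if the last
reference was already dropped on another path, that increment lands in
freed memory. A minimal standalone sketch of the pattern (hypothetical
simplified types, deliberately broken to show the bug; not the rxe code):

#include <stdio.h>
#include <stdlib.h>

struct obj {
	int refcnt;
};

static void obj_put(struct obj *o)
{
	if (--o->refcnt == 0)
		free(o); /* last reference gone: object destroyed */
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));
	o->refcnt = 1;

	obj_put(o); /* e.g. the QP is torn down on another path */

	/* rxe_responder()-style bug: take a reference *before* checking
	 * validity -- this write touches freed memory, which is exactly
	 * what refcount debugging or KASAN would flag immediately.
	 */
	o->refcnt++; /* use-after-free */
	return 0;
}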

Thread overview: 12+ messages
2019-12-25  4:55 rxe panic Frank Huang
2019-12-25  5:27 ` Zhu Yanjun
2019-12-25  6:01   ` Frank Huang
2019-12-25  6:34     ` Zhu Yanjun
2019-12-25  7:10       ` Frank Huang
2019-12-25  6:32 ` Leon Romanovsky
2019-12-25  7:23   ` Frank Huang
2019-12-25  7:43     ` Frank Huang [this message]
2019-12-25  9:23     ` Leon Romanovsky
2019-12-26  1:08       ` Zhu Yanjun
2019-12-26  1:39         ` Frank Huang
2019-12-26  2:35           ` Zhu Yanjun
