All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support
@ 2021-10-14 16:47 Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 01/11] net/smc: improved fix wait on already cleared link Karsten Graul
                   ` (11 more replies)
  0 siblings, 12 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

Please apply the following patch series for smc to netdev's net-next tree.

SMC-Rv2 support (see https://www.ibm.com/support/pages/node/6326337)
provides routable RoCE support for SMC-R, eliminating the current
same-subnet restriction, by exploiting the UDP encapsulation feature
of the RoCE adapter hardware.

Patch 1 ("net/smc: improved fix wait on already cleared link") is
already applied on netdevs net tree but its changes are needed for
this series on net-next. The patch is unchanged compared to the
version on the net tree.

v2: resend of the v1 patch series, and CC linux-rdma this time

Karsten Graul (11):
  net/smc: improved fix wait on already cleared link
  net/smc: save stack space and allocate smc_init_info
  net/smc: prepare for SMC-Rv2 connection
  net/smc: add SMC-Rv2 connection establishment
  net/smc: add listen processing for SMC-Rv2
  net/smc: add v2 format of CLC decline message
  net/smc: retrieve v2 gid from IB device
  net/smc: add v2 support to the work request layer
  net/smc: extend LLC layer for SMC-Rv2
  net/smc: add netlink support for SMC-Rv2
  net/smc: stop links when their GID is removed

 include/uapi/linux/smc.h |  17 +-
 net/smc/af_smc.c         | 431 +++++++++++++++++-------
 net/smc/smc.h            |  20 +-
 net/smc/smc_cdc.c        |   7 +-
 net/smc/smc_clc.c        | 147 +++++++--
 net/smc/smc_clc.h        |  55 +++-
 net/smc/smc_core.c       | 193 +++++++----
 net/smc/smc_core.h       |  50 ++-
 net/smc/smc_ib.c         | 160 ++++++++-
 net/smc/smc_ib.h         |  16 +-
 net/smc/smc_llc.c        | 684 ++++++++++++++++++++++++++++++---------
 net/smc/smc_llc.h        |  12 +-
 net/smc/smc_pnet.c       |  41 ++-
 net/smc/smc_tx.c         |  22 +-
 net/smc/smc_wr.c         | 237 ++++++++++++--
 net/smc/smc_wr.h         |  22 ++
 16 files changed, 1691 insertions(+), 423 deletions(-)

-- 
2.25.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 01/11] net/smc: improved fix wait on already cleared link
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 02/11] net/smc: save stack space and allocate smc_init_info Karsten Graul
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

Commit 8f3d65c16679 ("net/smc: fix wait on already cleared link")
introduced link refcounting to avoid waits on already cleared links.
This patch extents and improves the refcounting to cover all
remaining possible cases for this kind of error situation.

Fixes: 15e1b99aadfb ("net/smc: no WR buffer wait for terminating link group")
Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 net/smc/smc_cdc.c  |  7 +++++-
 net/smc/smc_core.c | 20 ++++++++-------
 net/smc/smc_llc.c  | 63 +++++++++++++++++++++++++++++++++++-----------
 net/smc/smc_tx.c   | 22 ++++------------
 net/smc/smc_wr.h   | 14 +++++++++++
 5 files changed, 85 insertions(+), 41 deletions(-)

diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
index f23f558054a7..99acd337ba90 100644
--- a/net/smc/smc_cdc.c
+++ b/net/smc/smc_cdc.c
@@ -150,9 +150,11 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
 
 again:
 	link = conn->lnk;
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
 	rc = smc_cdc_get_free_slot(conn, link, &wr_buf, NULL, &pend);
 	if (rc)
-		return rc;
+		goto put_out;
 
 	spin_lock_bh(&conn->send_lock);
 	if (link != conn->lnk) {
@@ -160,6 +162,7 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
 		spin_unlock_bh(&conn->send_lock);
 		smc_wr_tx_put_slot(link,
 				   (struct smc_wr_tx_pend_priv *)pend);
+		smc_wr_tx_link_put(link);
 		if (again)
 			return -ENOLINK;
 		again = true;
@@ -167,6 +170,8 @@ static int smcr_cdc_get_slot_and_msg_send(struct smc_connection *conn)
 	}
 	rc = smc_cdc_msg_send(conn, wr_buf, pend);
 	spin_unlock_bh(&conn->send_lock);
+put_out:
+	smc_wr_tx_link_put(link);
 	return rc;
 }
 
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index f57449089a16..4d463701a759 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -943,7 +943,7 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
 		to_lnk = &lgr->lnk[i];
 		break;
 	}
-	if (!to_lnk) {
+	if (!to_lnk || !smc_wr_tx_link_hold(to_lnk)) {
 		smc_lgr_terminate_sched(lgr);
 		return NULL;
 	}
@@ -975,24 +975,26 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr,
 		read_unlock_bh(&lgr->conns_lock);
 		/* pre-fetch buffer outside of send_lock, might sleep */
 		rc = smc_cdc_get_free_slot(conn, to_lnk, &wr_buf, NULL, &pend);
-		if (rc) {
-			smcr_link_down_cond_sched(to_lnk);
-			return NULL;
-		}
+		if (rc)
+			goto err_out;
 		/* avoid race with smcr_tx_sndbuf_nonempty() */
 		spin_lock_bh(&conn->send_lock);
 		smc_switch_link_and_count(conn, to_lnk);
 		rc = smc_switch_cursor(smc, pend, wr_buf);
 		spin_unlock_bh(&conn->send_lock);
 		sock_put(&smc->sk);
-		if (rc) {
-			smcr_link_down_cond_sched(to_lnk);
-			return NULL;
-		}
+		if (rc)
+			goto err_out;
 		goto again;
 	}
 	read_unlock_bh(&lgr->conns_lock);
+	smc_wr_tx_link_put(to_lnk);
 	return to_lnk;
+
+err_out:
+	smcr_link_down_cond_sched(to_lnk);
+	smc_wr_tx_link_put(to_lnk);
+	return NULL;
 }
 
 static void smcr_buf_unuse(struct smc_buf_desc *rmb_desc,
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 2e7560eba981..72f4b72eb175 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -383,9 +383,11 @@ int smc_llc_send_confirm_link(struct smc_link *link,
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
 	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
 	if (rc)
-		return rc;
+		goto put_out;
 	confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
 	memset(confllc, 0, sizeof(*confllc));
 	confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
@@ -402,6 +404,8 @@ int smc_llc_send_confirm_link(struct smc_link *link,
 	confllc->max_links = SMC_LLC_ADD_LNK_MAX_LINKS;
 	/* send llc message */
 	rc = smc_wr_tx_send(link, pend);
+put_out:
+	smc_wr_tx_link_put(link);
 	return rc;
 }
 
@@ -415,9 +419,11 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link,
 	struct smc_link *link;
 	int i, rc, rtok_ix;
 
+	if (!smc_wr_tx_link_hold(send_link))
+		return -ENOLINK;
 	rc = smc_llc_add_pending_send(send_link, &wr_buf, &pend);
 	if (rc)
-		return rc;
+		goto put_out;
 	rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf;
 	memset(rkeyllc, 0, sizeof(*rkeyllc));
 	rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY;
@@ -444,6 +450,8 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link,
 		(u64)sg_dma_address(rmb_desc->sgt[send_link->link_idx].sgl));
 	/* send llc message */
 	rc = smc_wr_tx_send(send_link, pend);
+put_out:
+	smc_wr_tx_link_put(send_link);
 	return rc;
 }
 
@@ -456,9 +464,11 @@ static int smc_llc_send_delete_rkey(struct smc_link *link,
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
 	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
 	if (rc)
-		return rc;
+		goto put_out;
 	rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf;
 	memset(rkeyllc, 0, sizeof(*rkeyllc));
 	rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY;
@@ -467,6 +477,8 @@ static int smc_llc_send_delete_rkey(struct smc_link *link,
 	rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey);
 	/* send llc message */
 	rc = smc_wr_tx_send(link, pend);
+put_out:
+	smc_wr_tx_link_put(link);
 	return rc;
 }
 
@@ -480,9 +492,11 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
 	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
 	if (rc)
-		return rc;
+		goto put_out;
 	addllc = (struct smc_llc_msg_add_link *)wr_buf;
 
 	memset(addllc, 0, sizeof(*addllc));
@@ -504,6 +518,8 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
 	}
 	/* send llc message */
 	rc = smc_wr_tx_send(link, pend);
+put_out:
+	smc_wr_tx_link_put(link);
 	return rc;
 }
 
@@ -517,9 +533,11 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
 	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
 	if (rc)
-		return rc;
+		goto put_out;
 	delllc = (struct smc_llc_msg_del_link *)wr_buf;
 
 	memset(delllc, 0, sizeof(*delllc));
@@ -536,6 +554,8 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
 	delllc->reason = htonl(reason);
 	/* send llc message */
 	rc = smc_wr_tx_send(link, pend);
+put_out:
+	smc_wr_tx_link_put(link);
 	return rc;
 }
 
@@ -547,9 +567,11 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
 	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
 	if (rc)
-		return rc;
+		goto put_out;
 	testllc = (struct smc_llc_msg_test_link *)wr_buf;
 	memset(testllc, 0, sizeof(*testllc));
 	testllc->hd.common.type = SMC_LLC_TEST_LINK;
@@ -557,6 +579,8 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
 	memcpy(testllc->user_data, user_data, sizeof(testllc->user_data));
 	/* send llc message */
 	rc = smc_wr_tx_send(link, pend);
+put_out:
+	smc_wr_tx_link_put(link);
 	return rc;
 }
 
@@ -567,13 +591,16 @@ static int smc_llc_send_message(struct smc_link *link, void *llcbuf)
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
-	if (!smc_link_usable(link))
+	if (!smc_wr_tx_link_hold(link))
 		return -ENOLINK;
 	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
 	if (rc)
-		return rc;
+		goto put_out;
 	memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg));
-	return smc_wr_tx_send(link, pend);
+	rc = smc_wr_tx_send(link, pend);
+put_out:
+	smc_wr_tx_link_put(link);
+	return rc;
 }
 
 /* schedule an llc send on link, may wait for buffers,
@@ -586,13 +613,16 @@ static int smc_llc_send_message_wait(struct smc_link *link, void *llcbuf)
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
-	if (!smc_link_usable(link))
+	if (!smc_wr_tx_link_hold(link))
 		return -ENOLINK;
 	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
 	if (rc)
-		return rc;
+		goto put_out;
 	memcpy(wr_buf, llcbuf, sizeof(union smc_llc_msg));
-	return smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME);
+	rc = smc_wr_tx_send_wait(link, pend, SMC_LLC_WAIT_TIME);
+put_out:
+	smc_wr_tx_link_put(link);
+	return rc;
 }
 
 /********************************* receive ***********************************/
@@ -672,9 +702,11 @@ static int smc_llc_add_link_cont(struct smc_link *link,
 	struct smc_buf_desc *rmb;
 	u8 n;
 
+	if (!smc_wr_tx_link_hold(link))
+		return -ENOLINK;
 	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
 	if (rc)
-		return rc;
+		goto put_out;
 	addc_llc = (struct smc_llc_msg_add_link_cont *)wr_buf;
 	memset(addc_llc, 0, sizeof(*addc_llc));
 
@@ -706,7 +738,10 @@ static int smc_llc_add_link_cont(struct smc_link *link,
 	addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont);
 	if (lgr->role == SMC_CLNT)
 		addc_llc->hd.flags |= SMC_LLC_FLAG_RESP;
-	return smc_wr_tx_send(link, pend);
+	rc = smc_wr_tx_send(link, pend);
+put_out:
+	smc_wr_tx_link_put(link);
+	return rc;
 }
 
 static int smc_llc_cli_rkey_exchange(struct smc_link *link,
diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c
index c79361dfcdfb..738a4a99c827 100644
--- a/net/smc/smc_tx.c
+++ b/net/smc/smc_tx.c
@@ -496,7 +496,7 @@ static int smc_tx_rdma_writes(struct smc_connection *conn,
 /* Wakeup sndbuf consumers from any context (IRQ or process)
  * since there is more data to transmit; usable snd_wnd as max transmit
  */
-static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
+static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
 {
 	struct smc_cdc_producer_flags *pflags = &conn->local_tx_ctrl.prod_flags;
 	struct smc_link *link = conn->lnk;
@@ -505,8 +505,11 @@ static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
 	struct smc_wr_buf *wr_buf;
 	int rc;
 
+	if (!link || !smc_wr_tx_link_hold(link))
+		return -ENOLINK;
 	rc = smc_cdc_get_free_slot(conn, link, &wr_buf, &wr_rdma_buf, &pend);
 	if (rc < 0) {
+		smc_wr_tx_link_put(link);
 		if (rc == -EBUSY) {
 			struct smc_sock *smc =
 				container_of(conn, struct smc_sock, conn);
@@ -547,22 +550,7 @@ static int _smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
 
 out_unlock:
 	spin_unlock_bh(&conn->send_lock);
-	return rc;
-}
-
-static int smcr_tx_sndbuf_nonempty(struct smc_connection *conn)
-{
-	struct smc_link *link = conn->lnk;
-	int rc = -ENOLINK;
-
-	if (!link)
-		return rc;
-
-	atomic_inc(&link->wr_tx_refcnt);
-	if (smc_link_usable(link))
-		rc = _smcr_tx_sndbuf_nonempty(conn);
-	if (atomic_dec_and_test(&link->wr_tx_refcnt))
-		wake_up_all(&link->wr_tx_wait);
+	smc_wr_tx_link_put(link);
 	return rc;
 }
 
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 423b8709f1c9..2bc626f230a5 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -60,6 +60,20 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
 	atomic_long_set(wr_tx_id, val);
 }
 
+static inline bool smc_wr_tx_link_hold(struct smc_link *link)
+{
+	if (!smc_link_usable(link))
+		return false;
+	atomic_inc(&link->wr_tx_refcnt);
+	return true;
+}
+
+static inline void smc_wr_tx_link_put(struct smc_link *link)
+{
+	if (atomic_dec_and_test(&link->wr_tx_refcnt))
+		wake_up_all(&link->wr_tx_wait);
+}
+
 static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk)
 {
 	wake_up_all(&lnk->wr_tx_wait);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 02/11] net/smc: save stack space and allocate smc_init_info
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 01/11] net/smc: improved fix wait on already cleared link Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 03/11] net/smc: prepare for SMC-Rv2 connection Karsten Graul
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

The struct smc_init_info grew over time, its time to save space on stack
and allocate this struct dynamically.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 net/smc/smc_llc.c | 83 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 53 insertions(+), 30 deletions(-)

diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 72f4b72eb175..760f5ed7c8f0 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -870,31 +870,37 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
 	struct smc_llc_msg_add_link *llc = &qentry->msg.add_link;
 	enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC;
 	struct smc_link_group *lgr = smc_get_lgr(link);
+	struct smc_init_info *ini = NULL;
 	struct smc_link *lnk_new = NULL;
-	struct smc_init_info ini;
 	int lnk_idx, rc = 0;
 
 	if (!llc->qp_mtu)
 		goto out_reject;
 
-	ini.vlan_id = lgr->vlan_id;
-	smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
+	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+	if (!ini) {
+		rc = -ENOMEM;
+		goto out_reject;
+	}
+
+	ini->vlan_id = lgr->vlan_id;
+	smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
 	if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
 	    !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN)) {
-		if (!ini.ib_dev)
+		if (!ini->ib_dev)
 			goto out_reject;
 		lgr_new_t = SMC_LGR_ASYMMETRIC_PEER;
 	}
-	if (!ini.ib_dev) {
+	if (!ini->ib_dev) {
 		lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
-		ini.ib_dev = link->smcibdev;
-		ini.ib_port = link->ibport;
+		ini->ib_dev = link->smcibdev;
+		ini->ib_port = link->ibport;
 	}
 	lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t);
 	if (lnk_idx < 0)
 		goto out_reject;
 	lnk_new = &lgr->lnk[lnk_idx];
-	rc = smcr_link_init(lgr, lnk_new, lnk_idx, &ini);
+	rc = smcr_link_init(lgr, lnk_new, lnk_idx, ini);
 	if (rc)
 		goto out_reject;
 	smc_llc_save_add_link_info(lnk_new, llc);
@@ -910,7 +916,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
 		goto out_clear_lnk;
 
 	rc = smc_llc_send_add_link(link,
-				   lnk_new->smcibdev->mac[ini.ib_port - 1],
+				   lnk_new->smcibdev->mac[ini->ib_port - 1],
 				   lnk_new->gid, lnk_new, SMC_LLC_RESP);
 	if (rc)
 		goto out_clear_lnk;
@@ -919,7 +925,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
 		rc = 0;
 		goto out_clear_lnk;
 	}
-	rc = smc_llc_cli_conf_link(link, &ini, lnk_new, lgr_new_t);
+	rc = smc_llc_cli_conf_link(link, ini, lnk_new, lgr_new_t);
 	if (!rc)
 		goto out;
 out_clear_lnk:
@@ -928,6 +934,7 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
 out_reject:
 	smc_llc_cli_add_link_reject(qentry);
 out:
+	kfree(ini);
 	kfree(qentry);
 	return rc;
 }
@@ -937,20 +944,25 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link,
 					struct smc_llc_qentry *qentry)
 {
 	struct smc_link_group *lgr = smc_get_lgr(link);
-	struct smc_init_info ini;
+	struct smc_init_info *ini = NULL;
 
 	if (lgr->type == SMC_LGR_SYMMETRIC ||
 	    lgr->type == SMC_LGR_ASYMMETRIC_PEER)
 		goto out;
 
-	ini.vlan_id = lgr->vlan_id;
-	smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
-	if (!ini.ib_dev)
+	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+	if (!ini)
+		goto out;
+
+	ini->vlan_id = lgr->vlan_id;
+	smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
+	if (!ini->ib_dev)
 		goto out;
 
-	smc_llc_send_add_link(link, ini.ib_dev->mac[ini.ib_port - 1],
-			      ini.ib_gid, NULL, SMC_LLC_REQ);
+	smc_llc_send_add_link(link, ini->ib_dev->mac[ini->ib_port - 1],
+			      ini->ib_gid, NULL, SMC_LLC_REQ);
 out:
+	kfree(ini);
 	kfree(qentry);
 }
 
@@ -1158,28 +1170,34 @@ int smc_llc_srv_add_link(struct smc_link *link)
 	struct smc_link_group *lgr = link->lgr;
 	struct smc_llc_msg_add_link *add_llc;
 	struct smc_llc_qentry *qentry = NULL;
-	struct smc_link *link_new;
-	struct smc_init_info ini;
+	struct smc_link *link_new = NULL;
+	struct smc_init_info *ini;
 	int lnk_idx, rc = 0;
 
+	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
+	if (!ini)
+		return -ENOMEM;
+
 	/* ignore client add link recommendation, start new flow */
-	ini.vlan_id = lgr->vlan_id;
-	smc_pnet_find_alt_roce(lgr, &ini, link->smcibdev);
-	if (!ini.ib_dev) {
+	ini->vlan_id = lgr->vlan_id;
+	smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
+	if (!ini->ib_dev) {
 		lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
-		ini.ib_dev = link->smcibdev;
-		ini.ib_port = link->ibport;
+		ini->ib_dev = link->smcibdev;
+		ini->ib_port = link->ibport;
 	}
 	lnk_idx = smc_llc_alloc_alt_link(lgr, lgr_new_t);
-	if (lnk_idx < 0)
-		return 0;
+	if (lnk_idx < 0) {
+		rc = 0;
+		goto out;
+	}
 
-	rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, &ini);
+	rc = smcr_link_init(lgr, &lgr->lnk[lnk_idx], lnk_idx, ini);
 	if (rc)
-		return rc;
+		goto out;
 	link_new = &lgr->lnk[lnk_idx];
 	rc = smc_llc_send_add_link(link,
-				   link_new->smcibdev->mac[ini.ib_port - 1],
+				   link_new->smcibdev->mac[ini->ib_port - 1],
 				   link_new->gid, link_new, SMC_LLC_REQ);
 	if (rc)
 		goto out_err;
@@ -1218,10 +1236,15 @@ int smc_llc_srv_add_link(struct smc_link *link)
 	rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t);
 	if (rc)
 		goto out_err;
+	kfree(ini);
 	return 0;
 out_err:
-	link_new->state = SMC_LNK_INACTIVE;
-	smcr_link_clear(link_new, false);
+	if (link_new) {
+		link_new->state = SMC_LNK_INACTIVE;
+		smcr_link_clear(link_new, false);
+	}
+out:
+	kfree(ini);
 	return rc;
 }
 
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 03/11] net/smc: prepare for SMC-Rv2 connection
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 01/11] net/smc: improved fix wait on already cleared link Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 02/11] net/smc: save stack space and allocate smc_init_info Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 04/11] net/smc: add SMC-Rv2 connection establishment Karsten Graul
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

Prepare the connection establishment with SMC-Rv2. Detect eligible
RoCE cards and indicate all supported SMC modes for the connection.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 net/smc/af_smc.c   | 97 +++++++++++++++++++++++++++++-----------------
 net/smc/smc_clc.c  | 11 ++++++
 net/smc/smc_clc.h  | 12 ++++++
 net/smc/smc_core.h | 28 +++++++++++++
 4 files changed, 113 insertions(+), 35 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index f69ef3f2019f..f21e74537f53 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -608,7 +608,9 @@ static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
 	 * used for the internal TCP socket
 	 */
 	smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
-	if (!ini->ib_dev)
+	if (!ini->check_smcrv2 && !ini->ib_dev)
+		return SMC_CLC_DECL_NOSMCRDEV;
+	if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2)
 		return SMC_CLC_DECL_NOSMCRDEV;
 	return 0;
 }
@@ -692,27 +694,42 @@ static int smc_find_proposal_devices(struct smc_sock *smc,
 	int rc = 0;
 
 	/* check if there is an ism device available */
-	if (ini->smcd_version & SMC_V1) {
-		if (smc_find_ism_device(smc, ini) ||
-		    smc_connect_ism_vlan_setup(smc, ini)) {
-			if (ini->smc_type_v1 == SMC_TYPE_B)
-				ini->smc_type_v1 = SMC_TYPE_R;
-			else
-				ini->smc_type_v1 = SMC_TYPE_N;
-		} /* else ISM V1 is supported for this connection */
-		if (smc_find_rdma_device(smc, ini)) {
-			if (ini->smc_type_v1 == SMC_TYPE_B)
-				ini->smc_type_v1 = SMC_TYPE_D;
-			else
-				ini->smc_type_v1 = SMC_TYPE_N;
-		} /* else RDMA is supported for this connection */
-	}
-	if (smc_ism_is_v2_capable() && smc_find_ism_v2_device_clnt(smc, ini))
-		ini->smc_type_v2 = SMC_TYPE_N;
+	if (!(ini->smcd_version & SMC_V1) ||
+	    smc_find_ism_device(smc, ini) ||
+	    smc_connect_ism_vlan_setup(smc, ini))
+		ini->smcd_version &= ~SMC_V1;
+	/* else ISM V1 is supported for this connection */
+
+	/* check if there is an rdma device available */
+	if (!(ini->smcr_version & SMC_V1) ||
+	    smc_find_rdma_device(smc, ini))
+		ini->smcr_version &= ~SMC_V1;
+	/* else RDMA is supported for this connection */
+
+	ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1,
+					      ini->smcr_version & SMC_V1);
+
+	/* check if there is an ism v2 device available */
+	if (!(ini->smcd_version & SMC_V2) ||
+	    !smc_ism_is_v2_capable() ||
+	    smc_find_ism_v2_device_clnt(smc, ini))
+		ini->smcd_version &= ~SMC_V2;
+
+	/* check if there is an rdma v2 device available */
+	ini->check_smcrv2 = true;
+	ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr;
+	if (!(ini->smcr_version & SMC_V2) ||
+	    smc->clcsock->sk->sk_family != AF_INET ||
+	    !smc_clc_ueid_count() ||
+	    smc_find_rdma_device(smc, ini))
+		ini->smcr_version &= ~SMC_V2;
+	ini->check_smcrv2 = false;
+
+	ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2,
+					      ini->smcr_version & SMC_V2);
 
 	/* if neither ISM nor RDMA are supported, fallback */
-	if (!smcr_indicated(ini->smc_type_v1) &&
-	    ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
+	if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
 		rc = SMC_CLC_DECL_NOSMCDEV;
 
 	return rc;
@@ -950,17 +967,24 @@ static int smc_connect_ism(struct smc_sock *smc,
 static int smc_connect_check_aclc(struct smc_init_info *ini,
 				  struct smc_clc_msg_accept_confirm *aclc)
 {
-	if ((aclc->hdr.typev1 == SMC_TYPE_R &&
-	     !smcr_indicated(ini->smc_type_v1)) ||
-	    (aclc->hdr.typev1 == SMC_TYPE_D &&
-	     ((!smcd_indicated(ini->smc_type_v1) &&
-	       !smcd_indicated(ini->smc_type_v2)) ||
-	      (aclc->hdr.version == SMC_V1 &&
-	       !smcd_indicated(ini->smc_type_v1)) ||
-	      (aclc->hdr.version == SMC_V2 &&
-	       !smcd_indicated(ini->smc_type_v2)))))
+	if (aclc->hdr.typev1 != SMC_TYPE_R &&
+	    aclc->hdr.typev1 != SMC_TYPE_D)
 		return SMC_CLC_DECL_MODEUNSUPP;
 
+	if (aclc->hdr.version >= SMC_V2) {
+		if ((aclc->hdr.typev1 == SMC_TYPE_R &&
+		     !smcr_indicated(ini->smc_type_v2)) ||
+		    (aclc->hdr.typev1 == SMC_TYPE_D &&
+		     !smcd_indicated(ini->smc_type_v2)))
+			return SMC_CLC_DECL_MODEUNSUPP;
+	} else {
+		if ((aclc->hdr.typev1 == SMC_TYPE_R &&
+		     !smcr_indicated(ini->smc_type_v1)) ||
+		    (aclc->hdr.typev1 == SMC_TYPE_D &&
+		     !smcd_indicated(ini->smc_type_v1)))
+			return SMC_CLC_DECL_MODEUNSUPP;
+	}
+
 	return 0;
 }
 
@@ -991,14 +1015,15 @@ static int __smc_connect(struct smc_sock *smc)
 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
 						    version);
 
-	ini->smcd_version = SMC_V1;
-	ini->smcd_version |= smc_ism_is_v2_capable() ? SMC_V2 : 0;
+	ini->smcd_version = SMC_V1 | SMC_V2;
+	ini->smcr_version = SMC_V1 | SMC_V2;
 	ini->smc_type_v1 = SMC_TYPE_B;
-	ini->smc_type_v2 = smc_ism_is_v2_capable() ? SMC_TYPE_D : SMC_TYPE_N;
+	ini->smc_type_v2 = SMC_TYPE_B;
 
 	/* get vlan id from IP device */
 	if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
 		ini->smcd_version &= ~SMC_V1;
+		ini->smcr_version = 0;
 		ini->smc_type_v1 = SMC_TYPE_N;
 		if (!ini->smcd_version) {
 			rc = SMC_CLC_DECL_GETVLANERR;
@@ -1026,15 +1051,17 @@ static int __smc_connect(struct smc_sock *smc)
 	/* check if smc modes and versions of CLC proposal and accept match */
 	rc = smc_connect_check_aclc(ini, aclc);
 	version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
-	ini->smcd_version = version;
 	if (rc)
 		goto vlan_cleanup;
 
 	/* depending on previous steps, connect using rdma or ism */
-	if (aclc->hdr.typev1 == SMC_TYPE_R)
+	if (aclc->hdr.typev1 == SMC_TYPE_R) {
+		ini->smcr_version = version;
 		rc = smc_connect_rdma(smc, aclc, ini);
-	else if (aclc->hdr.typev1 == SMC_TYPE_D)
+	} else if (aclc->hdr.typev1 == SMC_TYPE_D) {
+		ini->smcd_version = version;
 		rc = smc_connect_ism(smc, aclc, ini);
+	}
 	if (rc)
 		goto vlan_cleanup;
 
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 1cc8a76b39f9..8d44f06cf401 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -114,6 +114,17 @@ static int smc_clc_ueid_add(char *ueid)
 	return rc;
 }
 
+int smc_clc_ueid_count(void)
+{
+	int count;
+
+	read_lock(&smc_clc_eid_table.lock);
+	count = smc_clc_eid_table.ueid_cnt;
+	read_unlock(&smc_clc_eid_table.lock);
+
+	return count;
+}
+
 int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info)
 {
 	struct nlattr *nla_ueid = info->attrs[SMC_NLA_EID_TABLE_ENTRY];
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 974d01d16bb5..37ce97f7fdb0 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -282,6 +282,17 @@ static inline bool smcd_indicated(int smc_type)
 	return smc_type == SMC_TYPE_D || smc_type == SMC_TYPE_B;
 }
 
+static inline u8 smc_indicated_type(int is_smcd, int is_smcr)
+{
+	if (is_smcd && is_smcr)
+		return SMC_TYPE_B;
+	if (is_smcd)
+		return SMC_TYPE_D;
+	if (is_smcr)
+		return SMC_TYPE_R;
+	return SMC_TYPE_N;
+}
+
 /* get SMC-D info from proposal message */
 static inline struct smc_clc_msg_smcd *
 smc_get_clc_msg_smcd(struct smc_clc_msg_proposal *prop)
@@ -343,6 +354,7 @@ void smc_clc_get_hostname(u8 **host);
 bool smc_clc_match_eid(u8 *negotiated_eid,
 		       struct smc_clc_v2_extension *smc_v2_ext,
 		       u8 *peer_eid, u8 *local_eid);
+int smc_clc_ueid_count(void);
 int smc_nl_dump_ueid(struct sk_buff *skb, struct netlink_callback *cb);
 int smc_nl_add_ueid(struct sk_buff *skb, struct genl_info *info);
 int smc_nl_remove_ueid(struct sk_buff *skb, struct genl_info *info);
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 83d30b06016f..4a1778a7e3a5 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -302,6 +302,31 @@ struct smc_link_group {
 
 struct smc_clc_msg_local;
 
+#define GID_LIST_SIZE	2
+
+struct smc_gidlist {
+	u8			len;
+	u8			list[GID_LIST_SIZE][SMC_GID_SIZE];
+};
+
+struct smc_init_info_smcrv2 {
+	/* Input fields */
+	__be32			saddr;
+	struct sock		*clc_sk;
+	__be32			daddr;
+
+	/* Output fields when saddr is set */
+	struct smc_ib_device	*ib_dev_v2;
+	u8			ib_port_v2;
+	u8			ib_gid_v2[SMC_GID_SIZE];
+
+	/* Additional output fields when clc_sk and daddr is set as well */
+	u8			uses_gateway;
+	u8			nexthop_mac[ETH_ALEN];
+
+	struct smc_gidlist	gidlist;
+};
+
 struct smc_init_info {
 	u8			is_smcd;
 	u8			smc_type_v1;
@@ -313,10 +338,13 @@ struct smc_init_info {
 	u8			negotiated_eid[SMC_MAX_EID_LEN];
 	/* SMC-R */
 	struct smc_clc_msg_local *ib_lcl;
+	u8			smcr_version;
+	u8			check_smcrv2;
 	struct smc_ib_device	*ib_dev;
 	u8			ib_gid[SMC_GID_SIZE];
 	u8			ib_port;
 	u32			ib_clcqpn;
+	struct smc_init_info_smcrv2 smcrv2;
 	/* SMC-D */
 	u64			ism_peer_gid[SMC_MAX_ISM_DEVS + 1];
 	struct smcd_dev		*ism_dev[SMC_MAX_ISM_DEVS + 1];
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 04/11] net/smc: add SMC-Rv2 connection establishment
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
                   ` (2 preceding siblings ...)
  2021-10-14 16:47 ` [PATCH net-next v2 03/11] net/smc: prepare for SMC-Rv2 connection Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 05/11] net/smc: add listen processing for SMC-Rv2 Karsten Graul
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

Send a CLC proposal message, and the remote side process this type of
message and determine the target GID. Check for a valid route to this
GID, and complete the connection establishment.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 net/smc/af_smc.c   | 169 +++++++++++++++++++++++++++++++++++----------
 net/smc/smc_clc.c  |  89 +++++++++++++++++++-----
 net/smc/smc_clc.h  |  24 +++++--
 net/smc/smc_core.h |   6 ++
 net/smc/smc_ib.c   |  27 ++++++++
 net/smc/smc_ib.h   |  13 ++++
 6 files changed, 267 insertions(+), 61 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index f21e74537f53..d4280262b829 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -439,6 +439,47 @@ static int smcr_clnt_conf_first_link(struct smc_sock *smc)
 	return 0;
 }
 
+static bool smc_isascii(char *hostname)
+{
+	int i;
+
+	for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
+		if (!isascii(hostname[i]))
+			return false;
+	return true;
+}
+
+static void smc_conn_save_peer_info_fce(struct smc_sock *smc,
+					struct smc_clc_msg_accept_confirm *clc)
+{
+	struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+		(struct smc_clc_msg_accept_confirm_v2 *)clc;
+	struct smc_clc_first_contact_ext *fce;
+	int clc_v2_len;
+
+	if (clc->hdr.version == SMC_V1 ||
+	    !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
+		return;
+
+	if (smc->conn.lgr->is_smcd) {
+		memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid,
+		       SMC_MAX_EID_LEN);
+		clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
+					 d1);
+	} else {
+		memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid,
+		       SMC_MAX_EID_LEN);
+		clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
+					 r1);
+	}
+	fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len);
+	smc->conn.lgr->peer_os = fce->os_type;
+	smc->conn.lgr->peer_smc_release = fce->release;
+	if (smc_isascii(fce->hostname))
+		memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
+		       SMC_MAX_HOSTNAME_LEN);
+}
+
 static void smcr_conn_save_peer_info(struct smc_sock *smc,
 				     struct smc_clc_msg_accept_confirm *clc)
 {
@@ -451,16 +492,6 @@ static void smcr_conn_save_peer_info(struct smc_sock *smc,
 	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
 }
 
-static bool smc_isascii(char *hostname)
-{
-	int i;
-
-	for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
-		if (!isascii(hostname[i]))
-			return false;
-	return true;
-}
-
 static void smcd_conn_save_peer_info(struct smc_sock *smc,
 				     struct smc_clc_msg_accept_confirm *clc)
 {
@@ -472,22 +503,6 @@ static void smcd_conn_save_peer_info(struct smc_sock *smc,
 	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
 	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
-	if (clc->hdr.version > SMC_V1 &&
-	    (clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK)) {
-		struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
-			(struct smc_clc_msg_accept_confirm_v2 *)clc;
-		struct smc_clc_first_contact_ext *fce =
-			(struct smc_clc_first_contact_ext *)
-				(((u8 *)clc_v2) + sizeof(*clc_v2));
-
-		memcpy(smc->conn.lgr->negotiated_eid, clc_v2->eid,
-		       SMC_MAX_EID_LEN);
-		smc->conn.lgr->peer_os = fce->os_type;
-		smc->conn.lgr->peer_smc_release = fce->release;
-		if (smc_isascii(fce->hostname))
-			memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
-			       SMC_MAX_HOSTNAME_LEN);
-	}
 }
 
 static void smc_conn_save_peer_info(struct smc_sock *smc,
@@ -497,14 +512,16 @@ static void smc_conn_save_peer_info(struct smc_sock *smc,
 		smcd_conn_save_peer_info(smc, clc);
 	else
 		smcr_conn_save_peer_info(smc, clc);
+	smc_conn_save_peer_info_fce(smc, clc);
 }
 
 static void smc_link_save_peer_info(struct smc_link *link,
-				    struct smc_clc_msg_accept_confirm *clc)
+				    struct smc_clc_msg_accept_confirm *clc,
+				    struct smc_init_info *ini)
 {
 	link->peer_qpn = ntoh24(clc->r0.qpn);
-	memcpy(link->peer_gid, clc->r0.lcl.gid, SMC_GID_SIZE);
-	memcpy(link->peer_mac, clc->r0.lcl.mac, sizeof(link->peer_mac));
+	memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE);
+	memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac));
 	link->peer_psn = ntoh24(clc->r0.psn);
 	link->peer_mtu = clc->r0.qp_mtu;
 }
@@ -769,6 +786,64 @@ static int smc_connect_clc(struct smc_sock *smc,
 				SMC_CLC_ACCEPT, CLC_WAIT_TIME);
 }
 
+static void smc_fill_gid_list(struct smc_link_group *lgr,
+			      struct smc_gidlist *gidlist,
+			      struct smc_ib_device *known_dev, u8 *known_gid)
+{
+	struct smc_init_info *alt_ini = NULL;
+
+	memset(gidlist, 0, sizeof(*gidlist));
+	memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE);
+
+	alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL);
+	if (!alt_ini)
+		goto out;
+
+	alt_ini->vlan_id = lgr->vlan_id;
+	alt_ini->check_smcrv2 = true;
+	alt_ini->smcrv2.saddr = lgr->saddr;
+	smc_pnet_find_alt_roce(lgr, alt_ini, known_dev);
+
+	if (!alt_ini->smcrv2.ib_dev_v2)
+		goto out;
+
+	memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2,
+	       SMC_GID_SIZE);
+
+out:
+	kfree(alt_ini);
+}
+
+static int smc_connect_rdma_v2_prepare(struct smc_sock *smc,
+				       struct smc_clc_msg_accept_confirm *aclc,
+				       struct smc_init_info *ini)
+{
+	struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+		(struct smc_clc_msg_accept_confirm_v2 *)aclc;
+	struct smc_clc_first_contact_ext *fce =
+		(struct smc_clc_first_contact_ext *)
+			(((u8 *)clc_v2) + sizeof(*clc_v2));
+
+	if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1)
+		return 0;
+
+	if (fce->v2_direct) {
+		memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN);
+		ini->smcrv2.uses_gateway = false;
+	} else {
+		if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr,
+				      smc_ib_gid_to_ipv4(aclc->r0.lcl.gid),
+				      ini->smcrv2.nexthop_mac,
+				      &ini->smcrv2.uses_gateway))
+			return SMC_CLC_DECL_NOROUTE;
+		if (!ini->smcrv2.uses_gateway) {
+			/* mismatch: peer claims indirect, but its direct */
+			return SMC_CLC_DECL_NOINDIRECT;
+		}
+	}
+	return 0;
+}
+
 /* setup for RDMA connection of client */
 static int smc_connect_rdma(struct smc_sock *smc,
 			    struct smc_clc_msg_accept_confirm *aclc,
@@ -776,11 +851,18 @@ static int smc_connect_rdma(struct smc_sock *smc,
 {
 	int i, reason_code = 0;
 	struct smc_link *link;
+	u8 *eid = NULL;
 
 	ini->is_smcd = false;
-	ini->ib_lcl = &aclc->r0.lcl;
 	ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
 	ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
+	memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN);
+	memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE);
+	memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN);
+
+	reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini);
+	if (reason_code)
+		return reason_code;
 
 	mutex_lock(&smc_client_lgr_pending);
 	reason_code = smc_conn_create(smc, ini);
@@ -802,8 +884,9 @@ static int smc_connect_rdma(struct smc_sock *smc,
 			if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
 			    !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
 				    SMC_GID_SIZE) &&
-			    !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
-				    sizeof(l->peer_mac))) {
+			    (aclc->hdr.version > SMC_V1 ||
+			     !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
+				     sizeof(l->peer_mac)))) {
 				link = l;
 				break;
 			}
@@ -822,7 +905,7 @@ static int smc_connect_rdma(struct smc_sock *smc,
 	}
 
 	if (ini->first_contact_local)
-		smc_link_save_peer_info(link, aclc);
+		smc_link_save_peer_info(link, aclc, ini);
 
 	if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
 		reason_code = SMC_CLC_DECL_ERR_RTOK;
@@ -845,8 +928,18 @@ static int smc_connect_rdma(struct smc_sock *smc,
 	}
 	smc_rmb_sync_sg_for_device(&smc->conn);
 
+	if (aclc->hdr.version > SMC_V1) {
+		struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
+			(struct smc_clc_msg_accept_confirm_v2 *)aclc;
+
+		eid = clc_v2->r1.eid;
+		if (ini->first_contact_local)
+			smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist,
+					  link->smcibdev, link->gid);
+	}
+
 	reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
-					   SMC_V1, NULL);
+					   aclc->hdr.version, eid, ini);
 	if (reason_code)
 		goto connect_abort;
 
@@ -886,7 +979,7 @@ smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
 	int i;
 
 	for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
-		if (ini->ism_chid[i] == ntohs(aclc->chid)) {
+		if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) {
 			ini->ism_selected = i;
 			return 0;
 		}
@@ -940,11 +1033,11 @@ static int smc_connect_ism(struct smc_sock *smc,
 		struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
 			(struct smc_clc_msg_accept_confirm_v2 *)aclc;
 
-		eid = clc_v2->eid;
+		eid = clc_v2->d1.eid;
 	}
 
 	rc = smc_clc_send_confirm(smc, ini->first_contact_local,
-				  aclc->hdr.version, eid);
+				  aclc->hdr.version, eid, NULL);
 	if (rc)
 		goto connect_abort;
 	mutex_unlock(&smc_server_lgr_pending);
@@ -1735,7 +1828,7 @@ static int smc_listen_rdma_finish(struct smc_sock *new_smc,
 	int reason_code = 0;
 
 	if (local_first)
-		smc_link_save_peer_info(link, cclc);
+		smc_link_save_peer_info(link, cclc, NULL);
 
 	if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
 		return SMC_CLC_DECL_ERR_RTOK;
diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 8d44f06cf401..5cc2e2dc7417 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -31,6 +31,7 @@
 #define SMCR_CLC_ACCEPT_CONFIRM_LEN 68
 #define SMCD_CLC_ACCEPT_CONFIRM_LEN 48
 #define SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 78
+#define SMCR_CLC_ACCEPT_CONFIRM_LEN_V2 108
 #define SMC_CLC_RECV_BUF_LEN	100
 
 /* eye catcher "SMCR" EBCDIC for CLC messages */
@@ -309,7 +310,8 @@ bool smc_clc_match_eid(u8 *negotiated_eid,
 
 	negotiated_eid[0] = 0;
 	read_lock(&smc_clc_eid_table.lock);
-	if (smc_clc_eid_table.seid_enabled &&
+	if (peer_eid && local_eid &&
+	    smc_clc_eid_table.seid_enabled &&
 	    smc_v2_ext->hdr.flag.seid &&
 	    !memcmp(peer_eid, local_eid, SMC_MAX_EID_LEN)) {
 		memcpy(negotiated_eid, peer_eid, SMC_MAX_EID_LEN);
@@ -391,6 +393,9 @@ smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2)
 		    (ntohs(hdr->length) != SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 +
 				sizeof(struct smc_clc_first_contact_ext)))
 			return false;
+		if (hdr->typev1 == SMC_TYPE_R &&
+		    ntohs(hdr->length) < SMCR_CLC_ACCEPT_CONFIRM_LEN_V2)
+			return false;
 	}
 	return true;
 }
@@ -844,8 +849,8 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
 	} else {
 		struct smc_clc_eid_entry *ueident;
 		u16 v2_ext_offset;
-		u8 *eid = NULL;
 
+		v2_ext->hdr.flag.release = SMC_RELEASE;
 		v2_ext_offset = sizeof(*pclc_smcd) -
 			offsetofend(struct smc_clc_msg_smcd, v2_ext_offset);
 		if (ini->smc_type_v1 != SMC_TYPE_N)
@@ -853,6 +858,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
 						pclc_prfx->ipv6_prefixes_cnt *
 						sizeof(ipv6_prfx[0]);
 		pclc_smcd->v2_ext_offset = htons(v2_ext_offset);
+		plen += sizeof(*v2_ext);
 
 		read_lock(&smc_clc_eid_table.lock);
 		v2_ext->hdr.eid_cnt = smc_clc_eid_table.ueid_cnt;
@@ -862,10 +868,13 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
 			memcpy(v2_ext->user_eids[i++], ueident->eid,
 			       sizeof(ueident->eid));
 		}
-		v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled;
 		read_unlock(&smc_clc_eid_table.lock);
+	}
+	if (smcd_indicated(ini->smc_type_v2)) {
+		u8 *eid = NULL;
+
+		v2_ext->hdr.flag.seid = smc_clc_eid_table.seid_enabled;
 		v2_ext->hdr.ism_gid_cnt = ini->ism_offered_cnt;
-		v2_ext->hdr.flag.release = SMC_RELEASE;
 		v2_ext->hdr.smcd_v2_ext_offset = htons(sizeof(*v2_ext) -
 				offsetofend(struct smc_clnt_opts_area_hdr,
 					    smcd_v2_ext_offset) +
@@ -873,7 +882,7 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
 		smc_ism_get_system_eid(&eid);
 		if (eid && v2_ext->hdr.flag.seid)
 			memcpy(smcd_v2_ext->system_eid, eid, SMC_MAX_EID_LEN);
-		plen += sizeof(*v2_ext) + sizeof(*smcd_v2_ext);
+		plen += sizeof(*smcd_v2_ext);
 		if (ini->ism_offered_cnt) {
 			for (i = 1; i <= ini->ism_offered_cnt; i++) {
 				gidchids[i - 1].gid =
@@ -885,6 +894,9 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
 				sizeof(struct smc_clc_smcd_gid_chid);
 		}
 	}
+	if (smcr_indicated(ini->smc_type_v2))
+		memcpy(v2_ext->roce, ini->smcrv2.ib_gid_v2, SMC_GID_SIZE);
+
 	pclc_base->hdr.length = htons(plen);
 	memcpy(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 
@@ -908,12 +920,14 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
 		vec[i].iov_base = v2_ext;
 		vec[i++].iov_len = sizeof(*v2_ext) +
 				   (v2_ext->hdr.eid_cnt * SMC_MAX_EID_LEN);
-		vec[i].iov_base = smcd_v2_ext;
-		vec[i++].iov_len = sizeof(*smcd_v2_ext);
-		if (ini->ism_offered_cnt) {
-			vec[i].iov_base = gidchids;
-			vec[i++].iov_len = ini->ism_offered_cnt *
+		if (smcd_indicated(ini->smc_type_v2)) {
+			vec[i].iov_base = smcd_v2_ext;
+			vec[i++].iov_len = sizeof(*smcd_v2_ext);
+			if (ini->ism_offered_cnt) {
+				vec[i].iov_base = gidchids;
+				vec[i++].iov_len = ini->ism_offered_cnt *
 					sizeof(struct smc_clc_smcd_gid_chid);
+			}
 		}
 	}
 	vec[i].iov_base = trl;
@@ -936,13 +950,14 @@ int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini)
 static int smc_clc_send_confirm_accept(struct smc_sock *smc,
 				       struct smc_clc_msg_accept_confirm_v2 *clc_v2,
 				       int first_contact, u8 version,
-				       u8 *eid)
+				       u8 *eid, struct smc_init_info *ini)
 {
 	struct smc_connection *conn = &smc->conn;
 	struct smc_clc_msg_accept_confirm *clc;
 	struct smc_clc_first_contact_ext fce;
+	struct smc_clc_fce_gid_ext gle;
 	struct smc_clc_msg_trail trl;
-	struct kvec vec[3];
+	struct kvec vec[5];
 	struct msghdr msg;
 	int i, len;
 
@@ -964,9 +979,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
 		if (version == SMC_V1) {
 			clc->hdr.length = htons(SMCD_CLC_ACCEPT_CONFIRM_LEN);
 		} else {
-			clc_v2->chid = htons(smc_ism_get_chid(conn->lgr->smcd));
-			if (eid[0])
-				memcpy(clc_v2->eid, eid, SMC_MAX_EID_LEN);
+			clc_v2->d1.chid =
+				htons(smc_ism_get_chid(conn->lgr->smcd));
+			if (eid && eid[0])
+				memcpy(clc_v2->d1.eid, eid, SMC_MAX_EID_LEN);
 			len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2;
 			if (first_contact)
 				smc_clc_fill_fce(&fce, &len);
@@ -1005,6 +1021,26 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
 		clc->r0.rmb_dma_addr = cpu_to_be64((u64)sg_dma_address
 				(conn->rmb_desc->sgt[link->link_idx].sgl));
 		hton24(clc->r0.psn, link->psn_initial);
+		if (version == SMC_V1) {
+			clc->hdr.length = htons(SMCR_CLC_ACCEPT_CONFIRM_LEN);
+		} else {
+			if (eid && eid[0])
+				memcpy(clc_v2->r1.eid, eid, SMC_MAX_EID_LEN);
+			len = SMCR_CLC_ACCEPT_CONFIRM_LEN_V2;
+			if (first_contact) {
+				smc_clc_fill_fce(&fce, &len);
+				fce.v2_direct = !link->lgr->uses_gateway;
+				memset(&gle, 0, sizeof(gle));
+				if (ini && clc->hdr.type == SMC_CLC_CONFIRM) {
+					gle.gid_cnt = ini->smcrv2.gidlist.len;
+					len += sizeof(gle);
+					len += gle.gid_cnt * sizeof(gle.gid[0]);
+				} else {
+					len += sizeof(gle.reserved);
+				}
+			}
+			clc_v2->hdr.length = htons(len);
+		}
 		memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	}
 
@@ -1012,7 +1048,10 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
 	i = 0;
 	vec[i].iov_base = clc_v2;
 	if (version > SMC_V1)
-		vec[i++].iov_len = SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 - sizeof(trl);
+		vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ?
+					SMCD_CLC_ACCEPT_CONFIRM_LEN_V2 :
+					SMCR_CLC_ACCEPT_CONFIRM_LEN_V2) -
+				   sizeof(trl);
 	else
 		vec[i++].iov_len = (clc->hdr.typev1 == SMC_TYPE_D ?
 						SMCD_CLC_ACCEPT_CONFIRM_LEN :
@@ -1021,6 +1060,18 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
 	if (version > SMC_V1 && first_contact) {
 		vec[i].iov_base = &fce;
 		vec[i++].iov_len = sizeof(fce);
+		if (!conn->lgr->is_smcd) {
+			if (clc->hdr.type == SMC_CLC_CONFIRM) {
+				vec[i].iov_base = &gle;
+				vec[i++].iov_len = sizeof(gle);
+				vec[i].iov_base = &ini->smcrv2.gidlist.list;
+				vec[i++].iov_len = gle.gid_cnt *
+						   sizeof(gle.gid[0]);
+			} else {
+				vec[i].iov_base = &gle.reserved;
+				vec[i++].iov_len = sizeof(gle.reserved);
+			}
+		}
 	}
 	vec[i].iov_base = &trl;
 	vec[i++].iov_len = sizeof(trl);
@@ -1030,7 +1081,7 @@ static int smc_clc_send_confirm_accept(struct smc_sock *smc,
 
 /* send CLC CONFIRM message across internal TCP socket */
 int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
-			 u8 version, u8 *eid)
+			 u8 version, u8 *eid, struct smc_init_info *ini)
 {
 	struct smc_clc_msg_accept_confirm_v2 cclc_v2;
 	int reason_code = 0;
@@ -1040,7 +1091,7 @@ int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
 	memset(&cclc_v2, 0, sizeof(cclc_v2));
 	cclc_v2.hdr.type = SMC_CLC_CONFIRM;
 	len = smc_clc_send_confirm_accept(smc, &cclc_v2, clnt_first_contact,
-					  version, eid);
+					  version, eid, ini);
 	if (len < ntohs(cclc_v2.hdr.length)) {
 		if (len >= 0) {
 			reason_code = -ENETUNREACH;
@@ -1063,7 +1114,7 @@ int smc_clc_send_accept(struct smc_sock *new_smc, bool srv_first_contact,
 	memset(&aclc_v2, 0, sizeof(aclc_v2));
 	aclc_v2.hdr.type = SMC_CLC_ACCEPT;
 	len = smc_clc_send_confirm_accept(new_smc, &aclc_v2, srv_first_contact,
-					  version, negotiated_eid);
+					  version, negotiated_eid, NULL);
 	if (len < ntohs(aclc_v2.hdr.length))
 		len = len >= 0 ? -EPROTO : -new_smc->clcsock->sk->sk_err;
 
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 37ce97f7fdb0..4f2fe278f404 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -54,6 +54,8 @@
 #define SMC_CLC_DECL_NOSRVLINK	0x030b0000  /* SMC-R link from srv not found  */
 #define SMC_CLC_DECL_VERSMISMAT	0x030c0000  /* SMC version mismatch	      */
 #define SMC_CLC_DECL_MAX_DMB	0x030d0000  /* SMC-D DMB limit exceeded       */
+#define SMC_CLC_DECL_NOROUTE	0x030e0000  /* SMC-Rv2 conn. no route to peer */
+#define SMC_CLC_DECL_NOINDIRECT	0x030f0000  /* SMC-Rv2 conn. indirect mismatch*/
 #define SMC_CLC_DECL_SYNCERR	0x04000000  /* synchronization error          */
 #define SMC_CLC_DECL_PEERDECL	0x05000000  /* peer declined during handshake */
 #define SMC_CLC_DECL_INTERR	0x09990000  /* internal error		      */
@@ -213,11 +215,14 @@ struct smcd_clc_msg_accept_confirm_common {	/* SMCD accept/confirm */
 #define SMC_CLC_OS_AIX		3
 
 struct smc_clc_first_contact_ext {
-	u8 reserved1;
 #if defined(__BIG_ENDIAN_BITFIELD)
+	u8 v2_direct : 1,
+	   reserved  : 7;
 	u8 os_type : 4,
 	   release : 4;
 #elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 reserved  : 7,
+	   v2_direct : 1;
 	u8 release : 4,
 	   os_type : 4;
 #endif
@@ -225,6 +230,13 @@ struct smc_clc_first_contact_ext {
 	u8 hostname[SMC_MAX_HOSTNAME_LEN];
 };
 
+struct smc_clc_fce_gid_ext {
+	u8 reserved[16];
+	u8 gid_cnt;
+	u8 reserved2[3];
+	u8 gid[][SMC_GID_SIZE];
+};
+
 struct smc_clc_msg_accept_confirm {	/* clc accept / confirm message */
 	struct smc_clc_msg_hdr hdr;
 	union {
@@ -239,13 +251,17 @@ struct smc_clc_msg_accept_confirm {	/* clc accept / confirm message */
 struct smc_clc_msg_accept_confirm_v2 {	/* clc accept / confirm message */
 	struct smc_clc_msg_hdr hdr;
 	union {
-		struct smcr_clc_msg_accept_confirm r0; /* SMC-R */
+		struct { /* SMC-R */
+			struct smcr_clc_msg_accept_confirm r0;
+			u8 eid[SMC_MAX_EID_LEN];
+			u8 reserved6[8];
+		} r1;
 		struct { /* SMC-D */
 			struct smcd_clc_msg_accept_confirm_common d0;
 			__be16 chid;
 			u8 eid[SMC_MAX_EID_LEN];
 			u8 reserved5[8];
-		};
+		} d1;
 	};
 };
 
@@ -345,7 +361,7 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version);
 int smc_clc_send_proposal(struct smc_sock *smc, struct smc_init_info *ini);
 int smc_clc_send_confirm(struct smc_sock *smc, bool clnt_first_contact,
-			 u8 version, u8 *eid);
+			 u8 version, u8 *eid, struct smc_init_info *ini);
 int smc_clc_send_accept(struct smc_sock *smc, bool srv_first_contact,
 			u8 version, u8 *negotiated_eid);
 void smc_clc_init(void) __init;
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 4a1778a7e3a5..128474f33b8f 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -288,6 +288,9 @@ struct smc_link_group {
 						/* link keep alive time */
 			u32			llc_termination_rsn;
 						/* rsn code for termination */
+			u8			nexthop_mac[ETH_ALEN];
+			u8			uses_gateway;
+			__be32			saddr;
 		};
 		struct { /* SMC-D */
 			u64			peer_gid;
@@ -340,6 +343,9 @@ struct smc_init_info {
 	struct smc_clc_msg_local *ib_lcl;
 	u8			smcr_version;
 	u8			check_smcrv2;
+	u8			peer_gid[SMC_GID_SIZE];
+	u8			peer_mac[ETH_ALEN];
+	u8			peer_systemid[SMC_SYSTEMID_LEN];
 	struct smc_ib_device	*ib_dev;
 	u8			ib_gid[SMC_GID_SIZE];
 	u8			ib_port;
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index a8845343d183..9f72910af1d0 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -183,6 +183,33 @@ bool smc_ib_port_active(struct smc_ib_device *smcibdev, u8 ibport)
 	return smcibdev->pattr[ibport - 1].state == IB_PORT_ACTIVE;
 }
 
+int smc_ib_find_route(__be32 saddr, __be32 daddr,
+		      u8 nexthop_mac[], u8 *uses_gateway)
+{
+	struct neighbour *neigh = NULL;
+	struct rtable *rt = NULL;
+	struct flowi4 fl4 = {
+		.saddr = saddr,
+		.daddr = daddr
+	};
+
+	if (daddr == cpu_to_be32(INADDR_NONE))
+		goto out;
+	rt = ip_route_output_flow(&init_net, &fl4, NULL);
+	if (IS_ERR(rt))
+		goto out;
+	if (rt->rt_uses_gateway && rt->rt_gw_family != AF_INET)
+		goto out;
+	neigh = rt->dst.ops->neigh_lookup(&rt->dst, NULL, &fl4.daddr);
+	if (neigh) {
+		memcpy(nexthop_mac, neigh->ha, ETH_ALEN);
+		*uses_gateway = rt->rt_uses_gateway;
+		return 0;
+	}
+out:
+	return -ENOENT;
+}
+
 /* determine the gid for an ib-device port and vlan id */
 int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
 			 unsigned short vlan_id, u8 gid[], u8 *sgid_index)
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index 3085f5180da7..c55cbd7be67a 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -59,6 +59,17 @@ struct smc_ib_device {				/* ib-device infos for smc */
 	int			ndev_ifidx[SMC_MAX_PORTS]; /* ndev if indexes */
 };
 
+static inline __be32 smc_ib_gid_to_ipv4(u8 gid[SMC_GID_SIZE])
+{
+	struct in6_addr *addr6 = (struct in6_addr *)gid;
+
+	if (ipv6_addr_v4mapped(addr6) ||
+	    !(addr6->s6_addr32[0] | addr6->s6_addr32[1] | addr6->s6_addr32[2]))
+		return addr6->s6_addr32[3];
+	return cpu_to_be32(INADDR_NONE);
+}
+
+struct smc_init_info_smcrv2;
 struct smc_buf_desc;
 struct smc_link;
 
@@ -91,6 +102,8 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk,
 			       enum dma_data_direction data_direction);
 int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
 			 unsigned short vlan_id, u8 gid[], u8 *sgid_index);
+int smc_ib_find_route(__be32 saddr, __be32 daddr,
+		      u8 nexthop_mac[], u8 *uses_gateway);
 bool smc_ib_is_valid_local_systemid(void);
 int smcr_nl_get_device(struct sk_buff *skb, struct netlink_callback *cb);
 #endif
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 05/11] net/smc: add listen processing for SMC-Rv2
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
                   ` (3 preceding siblings ...)
  2021-10-14 16:47 ` [PATCH net-next v2 04/11] net/smc: add SMC-Rv2 connection establishment Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 06/11] net/smc: add v2 format of CLC decline message Karsten Graul
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

Implement the server side of the SMC-Rv2 processing. Process incoming
CLC messages, find eligible devices and check for a valid route to the
remote peer.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 net/smc/af_smc.c   | 165 ++++++++++++++++++++++++++++++++-------------
 net/smc/smc_clc.h  |   1 +
 net/smc/smc_core.c |  64 +++++++++++++-----
 net/smc/smc_core.h |   1 -
 4 files changed, 165 insertions(+), 66 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index d4280262b829..ce5feb68382f 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -1515,33 +1515,48 @@ static int smc_listen_v2_check(struct smc_sock *new_smc,
 
 	ini->smc_type_v1 = pclc->hdr.typev1;
 	ini->smc_type_v2 = pclc->hdr.typev2;
-	ini->smcd_version = ini->smc_type_v1 != SMC_TYPE_N ? SMC_V1 : 0;
-	if (pclc->hdr.version > SMC_V1)
-		ini->smcd_version |=
-				ini->smc_type_v2 != SMC_TYPE_N ? SMC_V2 : 0;
-	if (!(ini->smcd_version & SMC_V2)) {
+	ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
+	ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
+	if (pclc->hdr.version > SMC_V1) {
+		if (smcd_indicated(ini->smc_type_v2))
+			ini->smcd_version |= SMC_V2;
+		if (smcr_indicated(ini->smc_type_v2))
+			ini->smcr_version |= SMC_V2;
+	}
+	if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) {
 		rc = SMC_CLC_DECL_PEERNOSMC;
 		goto out;
 	}
-	if (!smc_ism_is_v2_capable()) {
-		ini->smcd_version &= ~SMC_V2;
-		rc = SMC_CLC_DECL_NOISM2SUPP;
-		goto out;
-	}
 	pclc_v2_ext = smc_get_clc_v2_ext(pclc);
 	if (!pclc_v2_ext) {
 		ini->smcd_version &= ~SMC_V2;
+		ini->smcr_version &= ~SMC_V2;
 		rc = SMC_CLC_DECL_NOV2EXT;
 		goto out;
 	}
 	pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
-	if (!pclc_smcd_v2_ext) {
-		ini->smcd_version &= ~SMC_V2;
-		rc = SMC_CLC_DECL_NOV2DEXT;
+	if (ini->smcd_version & SMC_V2) {
+		if (!smc_ism_is_v2_capable()) {
+			ini->smcd_version &= ~SMC_V2;
+			rc = SMC_CLC_DECL_NOISM2SUPP;
+		} else if (!pclc_smcd_v2_ext) {
+			ini->smcd_version &= ~SMC_V2;
+			rc = SMC_CLC_DECL_NOV2DEXT;
+		} else if (!pclc_v2_ext->hdr.eid_cnt &&
+			   !pclc_v2_ext->hdr.flag.seid) {
+			ini->smcd_version &= ~SMC_V2;
+			rc = SMC_CLC_DECL_NOUEID;
+		}
+	}
+	if (ini->smcr_version & SMC_V2) {
+		if (!pclc_v2_ext->hdr.eid_cnt) {
+			ini->smcr_version &= ~SMC_V2;
+			rc = SMC_CLC_DECL_NOUEID;
+		}
 	}
 
 out:
-	if (!ini->smcd_version)
+	if (!ini->smcd_version && !ini->smcr_version)
 		return rc;
 
 	return 0;
@@ -1661,10 +1676,6 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
 	pclc_smcd = smc_get_clc_msg_smcd(pclc);
 	smc_v2_ext = smc_get_clc_v2_ext(pclc);
 	smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
-	if (!smcd_v2_ext) {
-		smc_find_ism_store_rc(SMC_CLC_DECL_NOV2DEXT, ini);
-		goto not_found;
-	}
 
 	mutex_lock(&smcd_dev_list.mutex);
 	if (pclc_smcd->ism.chid)
@@ -1682,8 +1693,10 @@ static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
 	}
 	mutex_unlock(&smcd_dev_list.mutex);
 
-	if (!ini->ism_dev[0])
+	if (!ini->ism_dev[0]) {
+		smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini);
 		goto not_found;
+	}
 
 	smc_ism_get_system_eid(&eid);
 	if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext,
@@ -1736,6 +1749,7 @@ static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
 
 not_found:
 	smc_find_ism_store_rc(rc, ini);
+	ini->smcd_version &= ~SMC_V1;
 	ini->ism_dev[0] = NULL;
 	ini->is_smcd = false;
 }
@@ -1754,24 +1768,69 @@ static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
 	return 0;
 }
 
+static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc,
+					 struct smc_clc_msg_proposal *pclc,
+					 struct smc_init_info *ini)
+{
+	struct smc_clc_v2_extension *smc_v2_ext;
+	u8 smcr_version;
+	int rc;
+
+	if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2))
+		goto not_found;
+
+	smc_v2_ext = smc_get_clc_v2_ext(pclc);
+	if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL))
+		goto not_found;
+
+	/* prepare RDMA check */
+	memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
+	memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE);
+	memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
+	ini->check_smcrv2 = true;
+	ini->smcrv2.clc_sk = new_smc->clcsock->sk;
+	ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr;
+	ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce);
+	rc = smc_find_rdma_device(new_smc, ini);
+	if (rc) {
+		smc_find_ism_store_rc(rc, ini);
+		goto not_found;
+	}
+	if (!ini->smcrv2.uses_gateway)
+		memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN);
+
+	smcr_version = ini->smcr_version;
+	ini->smcr_version = SMC_V2;
+	rc = smc_listen_rdma_init(new_smc, ini);
+	if (!rc)
+		rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local);
+	if (!rc)
+		return;
+	ini->smcr_version = smcr_version;
+	smc_find_ism_store_rc(rc, ini);
+
+not_found:
+	ini->smcr_version &= ~SMC_V2;
+	ini->check_smcrv2 = false;
+}
+
 static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
 					struct smc_clc_msg_proposal *pclc,
 					struct smc_init_info *ini)
 {
 	int rc;
 
-	if (!smcr_indicated(ini->smc_type_v1))
+	if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1))
 		return SMC_CLC_DECL_NOSMCDEV;
 
 	/* prepare RDMA check */
-	ini->ib_lcl = &pclc->lcl;
+	memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
+	memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE);
+	memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
 	rc = smc_find_rdma_device(new_smc, ini);
 	if (rc) {
 		/* no RDMA device found */
-		if (ini->smc_type_v1 == SMC_TYPE_B)
-			/* neither ISM nor RDMA device found */
-			rc = SMC_CLC_DECL_NOSMCDEV;
-		return rc;
+		return SMC_CLC_DECL_NOSMCDEV;
 	}
 	rc = smc_listen_rdma_init(new_smc, ini);
 	if (rc)
@@ -1784,51 +1843,60 @@ static int smc_listen_find_device(struct smc_sock *new_smc,
 				  struct smc_clc_msg_proposal *pclc,
 				  struct smc_init_info *ini)
 {
-	int rc;
+	int prfx_rc;
 
 	/* check for ISM device matching V2 proposed device */
 	smc_find_ism_v2_device_serv(new_smc, pclc, ini);
 	if (ini->ism_dev[0])
 		return 0;
 
-	if (!(ini->smcd_version & SMC_V1))
-		return ini->rc ?: SMC_CLC_DECL_NOSMCD2DEV;
-
-	/* check for matching IP prefix and subnet length */
-	rc = smc_listen_prfx_check(new_smc, pclc);
-	if (rc)
-		return ini->rc ?: rc;
+	/* check for matching IP prefix and subnet length (V1) */
+	prfx_rc = smc_listen_prfx_check(new_smc, pclc);
+	if (prfx_rc)
+		smc_find_ism_store_rc(prfx_rc, ini);
 
 	/* get vlan id from IP device */
 	if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
 		return ini->rc ?: SMC_CLC_DECL_GETVLANERR;
 
 	/* check for ISM device matching V1 proposed device */
-	smc_find_ism_v1_device_serv(new_smc, pclc, ini);
+	if (!prfx_rc)
+		smc_find_ism_v1_device_serv(new_smc, pclc, ini);
 	if (ini->ism_dev[0])
 		return 0;
 
-	if (pclc->hdr.typev1 == SMC_TYPE_D)
+	if (!smcr_indicated(pclc->hdr.typev1) &&
+	    !smcr_indicated(pclc->hdr.typev2))
 		/* skip RDMA and decline */
 		return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV;
 
-	/* check if RDMA is available */
-	rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
-	smc_find_ism_store_rc(rc, ini);
+	/* check if RDMA V2 is available */
+	smc_find_rdma_v2_device_serv(new_smc, pclc, ini);
+	if (ini->smcrv2.ib_dev_v2)
+		return 0;
 
-	return (!rc) ? 0 : ini->rc;
+	/* check if RDMA V1 is available */
+	if (!prfx_rc) {
+		int rc;
+
+		rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
+		smc_find_ism_store_rc(rc, ini);
+		return (!rc) ? 0 : ini->rc;
+	}
+	return SMC_CLC_DECL_NOSMCDEV;
 }
 
 /* listen worker: finish RDMA setup */
 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
 				  struct smc_clc_msg_accept_confirm *cclc,
-				  bool local_first)
+				  bool local_first,
+				  struct smc_init_info *ini)
 {
 	struct smc_link *link = new_smc->conn.lnk;
 	int reason_code = 0;
 
 	if (local_first)
-		smc_link_save_peer_info(link, cclc, NULL);
+		smc_link_save_peer_info(link, cclc, ini);
 
 	if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
 		return SMC_CLC_DECL_ERR_RTOK;
@@ -1849,12 +1917,13 @@ static void smc_listen_work(struct work_struct *work)
 {
 	struct smc_sock *new_smc = container_of(work, struct smc_sock,
 						smc_listen_work);
-	u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1;
 	struct socket *newclcsock = new_smc->clcsock;
 	struct smc_clc_msg_accept_confirm *cclc;
 	struct smc_clc_msg_proposal_area *buf;
 	struct smc_clc_msg_proposal *pclc;
 	struct smc_init_info *ini = NULL;
+	u8 proposal_version = SMC_V1;
+	u8 accept_version;
 	int rc = 0;
 
 	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
@@ -1885,7 +1954,9 @@ static void smc_listen_work(struct work_struct *work)
 			      SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
 	if (rc)
 		goto out_decl;
-	version = pclc->hdr.version == SMC_V1 ? SMC_V1 : version;
+
+	if (pclc->hdr.version > SMC_V1)
+		proposal_version = SMC_V2;
 
 	/* IPSec connections opt out of SMC optimizations */
 	if (using_ipsec(new_smc)) {
@@ -1915,9 +1986,9 @@ static void smc_listen_work(struct work_struct *work)
 		goto out_unlock;
 
 	/* send SMC Accept CLC message */
+	accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version;
 	rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
-				 ini->smcd_version == SMC_V2 ? SMC_V2 : SMC_V1,
-				 ini->negotiated_eid);
+				 accept_version, ini->negotiated_eid);
 	if (rc)
 		goto out_unlock;
 
@@ -1939,7 +2010,7 @@ static void smc_listen_work(struct work_struct *work)
 	/* finish worker */
 	if (!ini->is_smcd) {
 		rc = smc_listen_rdma_finish(new_smc, cclc,
-					    ini->first_contact_local);
+					    ini->first_contact_local, ini);
 		if (rc)
 			goto out_unlock;
 		mutex_unlock(&smc_server_lgr_pending);
@@ -1953,7 +2024,7 @@ static void smc_listen_work(struct work_struct *work)
 	mutex_unlock(&smc_server_lgr_pending);
 out_decl:
 	smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
-			   version);
+			   proposal_version);
 out_free:
 	kfree(ini);
 	kfree(buf);
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 4f2fe278f404..22c079ed77a9 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -44,6 +44,7 @@
 #define SMC_CLC_DECL_NOV2DEXT	0x03030005  /* peer sent no clc SMC-Dv2 ext.  */
 #define SMC_CLC_DECL_NOSEID	0x03030006  /* peer sent no SEID	      */
 #define SMC_CLC_DECL_NOSMCD2DEV	0x03030007  /* no SMC-Dv2 device found	      */
+#define SMC_CLC_DECL_NOUEID	0x03030008  /* peer sent no UEID	      */
 #define SMC_CLC_DECL_MODEUNSUPP	0x03040000  /* smc modes do not match (R or D)*/
 #define SMC_CLC_DECL_RMBE_EC	0x03050000  /* peer has eyecatcher in RMBE    */
 #define SMC_CLC_DECL_OPTUNSUPP	0x03060000  /* fastopen sockopt not supported */
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 4d463701a759..a081582e5669 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -684,24 +684,30 @@ static void smcr_copy_dev_info_to_link(struct smc_link *link)
 int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 		   u8 link_idx, struct smc_init_info *ini)
 {
+	struct smc_ib_device *smcibdev;
 	u8 rndvec[3];
 	int rc;
 
-	get_device(&ini->ib_dev->ibdev->dev);
-	atomic_inc(&ini->ib_dev->lnk_cnt);
+	if (lgr->smc_version == SMC_V2) {
+		lnk->smcibdev = ini->smcrv2.ib_dev_v2;
+		lnk->ibport = ini->smcrv2.ib_port_v2;
+	} else {
+		lnk->smcibdev = ini->ib_dev;
+		lnk->ibport = ini->ib_port;
+	}
+	get_device(&lnk->smcibdev->ibdev->dev);
+	atomic_inc(&lnk->smcibdev->lnk_cnt);
+	lnk->path_mtu = lnk->smcibdev->pattr[lnk->ibport - 1].active_mtu;
 	lnk->link_id = smcr_next_link_id(lgr);
 	lnk->lgr = lgr;
 	lnk->link_idx = link_idx;
-	lnk->smcibdev = ini->ib_dev;
-	lnk->ibport = ini->ib_port;
 	smc_ibdev_cnt_inc(lnk);
 	smcr_copy_dev_info_to_link(lnk);
-	lnk->path_mtu = ini->ib_dev->pattr[ini->ib_port - 1].active_mtu;
 	atomic_set(&lnk->conn_cnt, 0);
 	smc_llc_link_set_uid(lnk);
 	INIT_WORK(&lnk->link_down_wrk, smc_link_down_work);
-	if (!ini->ib_dev->initialized) {
-		rc = (int)smc_ib_setup_per_ibdev(ini->ib_dev);
+	if (!lnk->smcibdev->initialized) {
+		rc = (int)smc_ib_setup_per_ibdev(lnk->smcibdev);
 		if (rc)
 			goto out;
 	}
@@ -740,11 +746,12 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 	smc_llc_link_clear(lnk, false);
 out:
 	smc_ibdev_cnt_dec(lnk);
-	put_device(&ini->ib_dev->ibdev->dev);
+	put_device(&lnk->smcibdev->ibdev->dev);
+	smcibdev = lnk->smcibdev;
 	memset(lnk, 0, sizeof(struct smc_link));
 	lnk->state = SMC_LNK_UNUSED;
-	if (!atomic_dec_return(&ini->ib_dev->lnk_cnt))
-		wake_up(&ini->ib_dev->lnks_deleted);
+	if (!atomic_dec_return(&smcibdev->lnk_cnt))
+		wake_up(&smcibdev->lnks_deleted);
 	return rc;
 }
 
@@ -808,10 +815,25 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 		atomic_inc(&ini->ism_dev[ini->ism_selected]->lgr_cnt);
 	} else {
 		/* SMC-R specific settings */
+		struct smc_ib_device *ibdev;
+		int ibport;
+
 		lgr->role = smc->listen_smc ? SMC_SERV : SMC_CLNT;
-		memcpy(lgr->peer_systemid, ini->ib_lcl->id_for_peer,
+		lgr->smc_version = ini->smcr_version;
+		memcpy(lgr->peer_systemid, ini->peer_systemid,
 		       SMC_SYSTEMID_LEN);
-		memcpy(lgr->pnet_id, ini->ib_dev->pnetid[ini->ib_port - 1],
+		if (lgr->smc_version == SMC_V2) {
+			ibdev = ini->smcrv2.ib_dev_v2;
+			ibport = ini->smcrv2.ib_port_v2;
+			lgr->saddr = ini->smcrv2.saddr;
+			lgr->uses_gateway = ini->smcrv2.uses_gateway;
+			memcpy(lgr->nexthop_mac, ini->smcrv2.nexthop_mac,
+			       ETH_ALEN);
+		} else {
+			ibdev = ini->ib_dev;
+			ibport = ini->ib_port;
+		}
+		memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1],
 		       SMC_MAX_PNETID_LEN);
 		smc_llc_lgr_init(lgr, smc);
 
@@ -1636,13 +1658,15 @@ int smc_vlan_by_tcpsk(struct socket *clcsock, struct smc_init_info *ini)
 	return rc;
 }
 
-static bool smcr_lgr_match(struct smc_link_group *lgr,
-			   struct smc_clc_msg_local *lcl,
+static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version,
+			   u8 peer_systemid[],
+			   u8 peer_gid[],
+			   u8 peer_mac_v1[],
 			   enum smc_lgr_role role, u32 clcqpn)
 {
 	int i;
 
-	if (memcmp(lgr->peer_systemid, lcl->id_for_peer, SMC_SYSTEMID_LEN) ||
+	if (memcmp(lgr->peer_systemid, peer_systemid, SMC_SYSTEMID_LEN) ||
 	    lgr->role != role)
 		return false;
 
@@ -1650,8 +1674,9 @@ static bool smcr_lgr_match(struct smc_link_group *lgr,
 		if (!smc_link_active(&lgr->lnk[i]))
 			continue;
 		if ((lgr->role == SMC_SERV || lgr->lnk[i].peer_qpn == clcqpn) &&
-		    !memcmp(lgr->lnk[i].peer_gid, &lcl->gid, SMC_GID_SIZE) &&
-		    !memcmp(lgr->lnk[i].peer_mac, lcl->mac, sizeof(lcl->mac)))
+		    !memcmp(lgr->lnk[i].peer_gid, peer_gid, SMC_GID_SIZE) &&
+		    (smcr_version == SMC_V2 ||
+		     !memcmp(lgr->lnk[i].peer_mac, peer_mac_v1, ETH_ALEN)))
 			return true;
 	}
 	return false;
@@ -1690,7 +1715,10 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
 		if ((ini->is_smcd ?
 		     smcd_lgr_match(lgr, ini->ism_dev[ini->ism_selected],
 				    ini->ism_peer_gid[ini->ism_selected]) :
-		     smcr_lgr_match(lgr, ini->ib_lcl, role, ini->ib_clcqpn)) &&
+		     smcr_lgr_match(lgr, ini->smcr_version,
+				    ini->peer_systemid,
+				    ini->peer_gid, ini->peer_mac, role,
+				    ini->ib_clcqpn)) &&
 		    !lgr->sync_err &&
 		    (ini->smcd_version == SMC_V2 ||
 		     lgr->vlan_id == ini->vlan_id) &&
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 128474f33b8f..dd3198bb6cb9 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -340,7 +340,6 @@ struct smc_init_info {
 	u32			rc;
 	u8			negotiated_eid[SMC_MAX_EID_LEN];
 	/* SMC-R */
-	struct smc_clc_msg_local *ib_lcl;
 	u8			smcr_version;
 	u8			check_smcrv2;
 	u8			peer_gid[SMC_GID_SIZE];
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 06/11] net/smc: add v2 format of CLC decline message
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
                   ` (4 preceding siblings ...)
  2021-10-14 16:47 ` [PATCH net-next v2 05/11] net/smc: add listen processing for SMC-Rv2 Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 07/11] net/smc: retrieve v2 gid from IB device Karsten Graul
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

The CLC decline message changed with SMC-Rv2 and supports up to
4 additional diagnosis codes.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 net/smc/smc_clc.c | 47 +++++++++++++++++++++++++++++++++++++----------
 net/smc/smc_clc.h | 18 ++++++++++++++++++
 2 files changed, 55 insertions(+), 10 deletions(-)

diff --git a/net/smc/smc_clc.c b/net/smc/smc_clc.c
index 5cc2e2dc7417..8409ab71a5e4 100644
--- a/net/smc/smc_clc.c
+++ b/net/smc/smc_clc.c
@@ -400,6 +400,24 @@ smc_clc_msg_acc_conf_valid(struct smc_clc_msg_accept_confirm_v2 *clc_v2)
 	return true;
 }
 
+/* check arriving CLC decline */
+static bool
+smc_clc_msg_decl_valid(struct smc_clc_msg_decline *dclc)
+{
+	struct smc_clc_msg_hdr *hdr = &dclc->hdr;
+
+	if (hdr->typev1 != SMC_TYPE_R && hdr->typev1 != SMC_TYPE_D)
+		return false;
+	if (hdr->version == SMC_V1) {
+		if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline))
+			return false;
+	} else {
+		if (ntohs(hdr->length) != sizeof(struct smc_clc_msg_decline_v2))
+			return false;
+	}
+	return true;
+}
+
 static void smc_clc_fill_fce(struct smc_clc_first_contact_ext *fce, int *len)
 {
 	memset(fce, 0, sizeof(*fce));
@@ -441,9 +459,9 @@ static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm, bool check_trl)
 		break;
 	case SMC_CLC_DECLINE:
 		dclc = (struct smc_clc_msg_decline *)clcm;
-		if (ntohs(dclc->hdr.length) != sizeof(*dclc))
+		if (!smc_clc_msg_decl_valid(dclc))
 			return false;
-		trl = &dclc->trl;
+		check_trl = false;
 		break;
 	default:
 		return false;
@@ -742,15 +760,16 @@ int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
 /* send CLC DECLINE message across internal TCP socket */
 int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version)
 {
-	struct smc_clc_msg_decline dclc;
+	struct smc_clc_msg_decline *dclc_v1;
+	struct smc_clc_msg_decline_v2 dclc;
 	struct msghdr msg;
+	int len, send_len;
 	struct kvec vec;
-	int len;
 
+	dclc_v1 = (struct smc_clc_msg_decline *)&dclc;
 	memset(&dclc, 0, sizeof(dclc));
 	memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
 	dclc.hdr.type = SMC_CLC_DECLINE;
-	dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
 	dclc.hdr.version = version;
 	dclc.os_type = version == SMC_V1 ? 0 : SMC_CLC_OS_LINUX;
 	dclc.hdr.typev2 = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ?
@@ -760,14 +779,22 @@ int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info, u8 version)
 		memcpy(dclc.id_for_peer, local_systemid,
 		       sizeof(local_systemid));
 	dclc.peer_diagnosis = htonl(peer_diag_info);
-	memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
+	if (version == SMC_V1) {
+		memcpy(dclc_v1->trl.eyecatcher, SMC_EYECATCHER,
+		       sizeof(SMC_EYECATCHER));
+		send_len = sizeof(*dclc_v1);
+	} else {
+		memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER,
+		       sizeof(SMC_EYECATCHER));
+		send_len = sizeof(dclc);
+	}
+	dclc.hdr.length = htons(send_len);
 
 	memset(&msg, 0, sizeof(msg));
 	vec.iov_base = &dclc;
-	vec.iov_len = sizeof(struct smc_clc_msg_decline);
-	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
-			     sizeof(struct smc_clc_msg_decline));
-	if (len < 0 || len < sizeof(struct smc_clc_msg_decline))
+	vec.iov_len = send_len;
+	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, send_len);
+	if (len < 0 || len < send_len)
 		len = -EPROTO;
 	return len > 0 ? 0 : len;
 }
diff --git a/net/smc/smc_clc.h b/net/smc/smc_clc.h
index 22c079ed77a9..83f02f131fc0 100644
--- a/net/smc/smc_clc.h
+++ b/net/smc/smc_clc.h
@@ -281,6 +281,24 @@ struct smc_clc_msg_decline {	/* clc decline message */
 	struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
 } __aligned(4);
 
+#define SMC_DECL_DIAG_COUNT_V2	4 /* no. of additional peer diagnosis codes */
+
+struct smc_clc_msg_decline_v2 {	/* clc decline message */
+	struct smc_clc_msg_hdr hdr;
+	u8 id_for_peer[SMC_SYSTEMID_LEN]; /* sender peer_id */
+	__be32 peer_diagnosis;	/* diagnosis information */
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8 os_type  : 4,
+	   reserved : 4;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 reserved : 4,
+	   os_type  : 4;
+#endif
+	u8 reserved2[3];
+	__be32 peer_diagnosis_v2[SMC_DECL_DIAG_COUNT_V2];
+	struct smc_clc_msg_trail trl; /* eye catcher "SMCD" or "SMCR" EBCDIC */
+} __aligned(4);
+
 /* determine start of the prefix area within the proposal message */
 static inline struct smc_clc_msg_proposal_prefix *
 smc_clc_proposal_get_prefix(struct smc_clc_msg_proposal *pclc)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 07/11] net/smc: retrieve v2 gid from IB device
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
                   ` (5 preceding siblings ...)
  2021-10-14 16:47 ` [PATCH net-next v2 06/11] net/smc: add v2 format of CLC decline message Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 08/11] net/smc: add v2 support to the work request layer Karsten Graul
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

In smc_ib.c, scan for RoCE devices that support UDP encapsulation.
Find an eligible device and check that there is a route to the
remote peer.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 net/smc/smc_core.c |  4 ++-
 net/smc/smc_ib.c   | 77 ++++++++++++++++++++++++++++++++++++++--------
 net/smc/smc_ib.h   |  3 +-
 net/smc/smc_pnet.c | 41 +++++++++++++++---------
 4 files changed, 95 insertions(+), 30 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index a081582e5669..6bbd71de6bc0 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -715,7 +715,9 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk,
 	lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) +
 		(rndvec[2] << 16);
 	rc = smc_ib_determine_gid(lnk->smcibdev, lnk->ibport,
-				  ini->vlan_id, lnk->gid, &lnk->sgid_index);
+				  ini->vlan_id, lnk->gid, &lnk->sgid_index,
+				  lgr->smc_version == SMC_V2 ?
+						  &ini->smcrv2 : NULL);
 	if (rc)
 		goto out;
 	rc = smc_llc_link_init(lnk);
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 9f72910af1d0..d15bacbd73e0 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -17,6 +17,7 @@
 #include <linux/scatterlist.h>
 #include <linux/wait.h>
 #include <linux/mutex.h>
+#include <linux/inetdevice.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_cache.h>
 
@@ -62,16 +63,23 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk)
 		IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU | IB_QP_DEST_QPN |
 		IB_QP_RQ_PSN | IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER;
 	struct ib_qp_attr qp_attr;
+	u8 hop_lim = 1;
 
 	memset(&qp_attr, 0, sizeof(qp_attr));
 	qp_attr.qp_state = IB_QPS_RTR;
 	qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu);
 	qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE;
 	rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport);
-	rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, 1, 0);
+	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
+		hop_lim = IPV6_DEFAULT_HOPLIMIT;
+	rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, lnk->sgid_index, hop_lim, 0);
 	rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid);
-	memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
-	       sizeof(lnk->peer_mac));
+	if (lnk->lgr->smc_version == SMC_V2 && lnk->lgr->uses_gateway)
+		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->lgr->nexthop_mac,
+		       sizeof(lnk->lgr->nexthop_mac));
+	else
+		memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac,
+		       sizeof(lnk->peer_mac));
 	qp_attr.dest_qp_num = lnk->peer_qpn;
 	qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */
 	qp_attr.max_dest_rd_atomic = 1; /* max # of resources for incoming
@@ -210,9 +218,54 @@ int smc_ib_find_route(__be32 saddr, __be32 daddr,
 	return -ENOENT;
 }
 
+static int smc_ib_determine_gid_rcu(const struct net_device *ndev,
+				    const struct ib_gid_attr *attr,
+				    u8 gid[], u8 *sgid_index,
+				    struct smc_init_info_smcrv2 *smcrv2)
+{
+	if (!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) {
+		if (gid)
+			memcpy(gid, &attr->gid, SMC_GID_SIZE);
+		if (sgid_index)
+			*sgid_index = attr->index;
+		return 0;
+	}
+	if (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
+	    smc_ib_gid_to_ipv4((u8 *)&attr->gid) != cpu_to_be32(INADDR_NONE)) {
+		struct in_device *in_dev = __in_dev_get_rcu(ndev);
+		const struct in_ifaddr *ifa;
+		bool subnet_match = false;
+
+		if (!in_dev)
+			goto out;
+		in_dev_for_each_ifa_rcu(ifa, in_dev) {
+			if (!inet_ifa_match(smcrv2->saddr, ifa))
+				continue;
+			subnet_match = true;
+			break;
+		}
+		if (!subnet_match)
+			goto out;
+		if (smcrv2->daddr && smc_ib_find_route(smcrv2->saddr,
+						       smcrv2->daddr,
+						       smcrv2->nexthop_mac,
+						       &smcrv2->uses_gateway))
+			goto out;
+
+		if (gid)
+			memcpy(gid, &attr->gid, SMC_GID_SIZE);
+		if (sgid_index)
+			*sgid_index = attr->index;
+		return 0;
+	}
+out:
+	return -ENODEV;
+}
+
 /* determine the gid for an ib-device port and vlan id */
 int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
-			 unsigned short vlan_id, u8 gid[], u8 *sgid_index)
+			 unsigned short vlan_id, u8 gid[], u8 *sgid_index,
+			 struct smc_init_info_smcrv2 *smcrv2)
 {
 	const struct ib_gid_attr *attr;
 	const struct net_device *ndev;
@@ -228,15 +281,13 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
 		if (!IS_ERR(ndev) &&
 		    ((!vlan_id && !is_vlan_dev(ndev)) ||
 		     (vlan_id && is_vlan_dev(ndev) &&
-		      vlan_dev_vlan_id(ndev) == vlan_id)) &&
-		    attr->gid_type == IB_GID_TYPE_ROCE) {
-			rcu_read_unlock();
-			if (gid)
-				memcpy(gid, &attr->gid, SMC_GID_SIZE);
-			if (sgid_index)
-				*sgid_index = attr->index;
-			rdma_put_gid_attr(attr);
-			return 0;
+		      vlan_dev_vlan_id(ndev) == vlan_id))) {
+			if (!smc_ib_determine_gid_rcu(ndev, attr, gid,
+						      sgid_index, smcrv2)) {
+				rcu_read_unlock();
+				rdma_put_gid_attr(attr);
+				return 0;
+			}
 		}
 		rcu_read_unlock();
 		rdma_put_gid_attr(attr);
diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h
index c55cbd7be67a..07585937370e 100644
--- a/net/smc/smc_ib.h
+++ b/net/smc/smc_ib.h
@@ -101,7 +101,8 @@ void smc_ib_sync_sg_for_device(struct smc_link *lnk,
 			       struct smc_buf_desc *buf_slot,
 			       enum dma_data_direction data_direction);
 int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
-			 unsigned short vlan_id, u8 gid[], u8 *sgid_index);
+			 unsigned short vlan_id, u8 gid[], u8 *sgid_index,
+			 struct smc_init_info_smcrv2 *smcrv2);
 int smc_ib_find_route(__be32 saddr, __be32 daddr,
 		      u8 nexthop_mac[], u8 *uses_gateway);
 bool smc_ib_is_valid_local_systemid(void);
diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c
index 4a964e9190b0..67e9d9fde085 100644
--- a/net/smc/smc_pnet.c
+++ b/net/smc/smc_pnet.c
@@ -953,6 +953,26 @@ static int smc_pnet_find_ndev_pnetid_by_table(struct net_device *ndev,
 	return rc;
 }
 
+static int smc_pnet_determine_gid(struct smc_ib_device *ibdev, int i,
+				  struct smc_init_info *ini)
+{
+	if (!ini->check_smcrv2 &&
+	    !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->ib_gid, NULL,
+				  NULL)) {
+		ini->ib_dev = ibdev;
+		ini->ib_port = i;
+		return 0;
+	}
+	if (ini->check_smcrv2 &&
+	    !smc_ib_determine_gid(ibdev, i, ini->vlan_id, ini->smcrv2.ib_gid_v2,
+				  NULL, &ini->smcrv2)) {
+		ini->smcrv2.ib_dev_v2 = ibdev;
+		ini->smcrv2.ib_port_v2 = i;
+		return 0;
+	}
+	return -ENODEV;
+}
+
 /* find a roce device for the given pnetid */
 static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id,
 					  struct smc_init_info *ini,
@@ -961,7 +981,6 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id,
 	struct smc_ib_device *ibdev;
 	int i;
 
-	ini->ib_dev = NULL;
 	mutex_lock(&smc_ib_devices.mutex);
 	list_for_each_entry(ibdev, &smc_ib_devices.list, list) {
 		if (ibdev == known_dev)
@@ -971,12 +990,9 @@ static void _smc_pnet_find_roce_by_pnetid(u8 *pnet_id,
 				continue;
 			if (smc_pnet_match(ibdev->pnetid[i - 1], pnet_id) &&
 			    smc_ib_port_active(ibdev, i) &&
-			    !test_bit(i - 1, ibdev->ports_going_away) &&
-			    !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
-						  ini->ib_gid, NULL)) {
-				ini->ib_dev = ibdev;
-				ini->ib_port = i;
-				goto out;
+			    !test_bit(i - 1, ibdev->ports_going_away)) {
+				if (!smc_pnet_determine_gid(ibdev, i, ini))
+					goto out;
 			}
 		}
 	}
@@ -1016,12 +1032,9 @@ static void smc_pnet_find_rdma_dev(struct net_device *netdev,
 			dev_put(ndev);
 			if (netdev == ndev &&
 			    smc_ib_port_active(ibdev, i) &&
-			    !test_bit(i - 1, ibdev->ports_going_away) &&
-			    !smc_ib_determine_gid(ibdev, i, ini->vlan_id,
-						  ini->ib_gid, NULL)) {
-				ini->ib_dev = ibdev;
-				ini->ib_port = i;
-				break;
+			    !test_bit(i - 1, ibdev->ports_going_away)) {
+				if (!smc_pnet_determine_gid(ibdev, i, ini))
+					break;
 			}
 		}
 	}
@@ -1083,8 +1096,6 @@ void smc_pnet_find_roce_resource(struct sock *sk, struct smc_init_info *ini)
 {
 	struct dst_entry *dst = sk_dst_get(sk);
 
-	ini->ib_dev = NULL;
-	ini->ib_port = 0;
 	if (!dst)
 		goto out;
 	if (!dst->dev)
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 08/11] net/smc: add v2 support to the work request layer
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
                   ` (6 preceding siblings ...)
  2021-10-14 16:47 ` [PATCH net-next v2 07/11] net/smc: retrieve v2 gid from IB device Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 09/11] net/smc: extend LLC layer for SMC-Rv2 Karsten Graul
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

In the work request layer define one large v2 buffer for each link group
that is used to transmit and receive large LLC control messages.
Add the completion queue handling for this buffer.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 net/smc/smc_core.c |   7 +-
 net/smc/smc_core.h |  14 ++++
 net/smc/smc_ib.c   |   3 +-
 net/smc/smc_wr.c   | 194 ++++++++++++++++++++++++++++++++++++++++-----
 net/smc/smc_wr.h   |   2 +
 5 files changed, 199 insertions(+), 21 deletions(-)

diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 6bbd71de6bc0..1ccab993683d 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -837,13 +837,17 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini)
 		}
 		memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1],
 		       SMC_MAX_PNETID_LEN);
+		if (smc_wr_alloc_lgr_mem(lgr))
+			goto free_wq;
 		smc_llc_lgr_init(lgr, smc);
 
 		link_idx = SMC_SINGLE_LINK;
 		lnk = &lgr->lnk[link_idx];
 		rc = smcr_link_init(lgr, lnk, link_idx, ini);
-		if (rc)
+		if (rc) {
+			smc_wr_free_lgr_mem(lgr);
 			goto free_wq;
+		}
 		lgr_list = &smc_lgr_list.list;
 		lgr_lock = &smc_lgr_list.lock;
 		atomic_inc(&lgr_cnt);
@@ -1250,6 +1254,7 @@ static void smc_lgr_free(struct smc_link_group *lgr)
 		if (!atomic_dec_return(&lgr->smcd->lgr_cnt))
 			wake_up(&lgr->smcd->lgrs_deleted);
 	} else {
+		smc_wr_free_lgr_mem(lgr);
 		if (!atomic_dec_return(&lgr_cnt))
 			wake_up(&lgrs_deleted);
 	}
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index dd3198bb6cb9..5997657ee9cc 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -42,11 +42,16 @@ enum smc_link_state {			/* possible states of a link */
 };
 
 #define SMC_WR_BUF_SIZE		48	/* size of work request buffer */
+#define SMC_WR_BUF_V2_SIZE	8192	/* size of v2 work request buffer */
 
 struct smc_wr_buf {
 	u8	raw[SMC_WR_BUF_SIZE];
 };
 
+struct smc_wr_v2_buf {
+	u8	raw[SMC_WR_BUF_V2_SIZE];
+};
+
 #define SMC_WR_REG_MR_WAIT_TIME	(5 * HZ)/* wait time for ib_wr_reg_mr result */
 
 enum smc_wr_reg_state {
@@ -92,7 +97,11 @@ struct smc_link {
 	struct smc_wr_tx_pend	*wr_tx_pends;	/* WR send waiting for CQE */
 	struct completion	*wr_tx_compl;	/* WR send CQE completion */
 	/* above four vectors have wr_tx_cnt elements and use the same index */
+	struct ib_send_wr	*wr_tx_v2_ib;	/* WR send v2 meta data */
+	struct ib_sge		*wr_tx_v2_sge;	/* WR send v2 gather meta data*/
+	struct smc_wr_tx_pend	*wr_tx_v2_pend;	/* WR send v2 waiting for CQE */
 	dma_addr_t		wr_tx_dma_addr;	/* DMA address of wr_tx_bufs */
+	dma_addr_t		wr_tx_v2_dma_addr; /* DMA address of v2 tx buf*/
 	atomic_long_t		wr_tx_id;	/* seq # of last sent WR */
 	unsigned long		*wr_tx_mask;	/* bit mask of used indexes */
 	u32			wr_tx_cnt;	/* number of WR send buffers */
@@ -104,6 +113,7 @@ struct smc_link {
 	struct ib_sge		*wr_rx_sges;	/* WR recv scatter meta data */
 	/* above three vectors have wr_rx_cnt elements and use the same index */
 	dma_addr_t		wr_rx_dma_addr;	/* DMA address of wr_rx_bufs */
+	dma_addr_t		wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/
 	u64			wr_rx_id;	/* seq # of last recv WR */
 	u32			wr_rx_cnt;	/* number of WR recv buffers */
 	unsigned long		wr_rx_tstamp;	/* jiffies when last buf rx */
@@ -250,6 +260,10 @@ struct smc_link_group {
 						/* client or server */
 			struct smc_link		lnk[SMC_LINKS_PER_LGR_MAX];
 						/* smc link */
+			struct smc_wr_v2_buf	*wr_rx_buf_v2;
+						/* WR v2 recv payload buffer */
+			struct smc_wr_v2_buf	*wr_tx_buf_v2;
+						/* WR v2 send payload buffer */
 			char			peer_systemid[SMC_SYSTEMID_LEN];
 						/* unique system_id of peer */
 			struct smc_rtoken	rtokens[SMC_RMBS_PER_LGR_MAX]
diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index d15bacbd73e0..32f9d2fdc474 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -601,6 +601,7 @@ void smc_ib_destroy_queue_pair(struct smc_link *lnk)
 /* create a queue pair within the protection domain for a link */
 int smc_ib_create_queue_pair(struct smc_link *lnk)
 {
+	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
 	struct ib_qp_init_attr qp_attr = {
 		.event_handler = smc_ib_qp_event_handler,
 		.qp_context = lnk,
@@ -614,7 +615,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk)
 			.max_send_wr = SMC_WR_BUF_CNT * 3,
 			.max_recv_wr = SMC_WR_BUF_CNT * 3,
 			.max_send_sge = SMC_IB_MAX_SEND_SGE,
-			.max_recv_sge = 1,
+			.max_recv_sge = sges_per_buf,
 		},
 		.sq_sig_type = IB_SIGNAL_REQ_WR,
 		.qp_type = IB_QPT_RC,
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index a419e9af36b9..22d7324969cd 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -101,19 +101,32 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
 	}
 
 	pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
-	if (pnd_snd_idx == link->wr_tx_cnt)
-		return;
-	link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
-	if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
-		complete(&link->wr_tx_compl[pnd_snd_idx]);
-	memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], sizeof(pnd_snd));
-	/* clear the full struct smc_wr_tx_pend including .priv */
-	memset(&link->wr_tx_pends[pnd_snd_idx], 0,
-	       sizeof(link->wr_tx_pends[pnd_snd_idx]));
-	memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
-	       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
-	if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
-		return;
+	if (pnd_snd_idx == link->wr_tx_cnt) {
+		if (link->lgr->smc_version != SMC_V2 ||
+		    link->wr_tx_v2_pend->wr_id != wc->wr_id)
+			return;
+		link->wr_tx_v2_pend->wc_status = wc->status;
+		memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
+		/* clear the full struct smc_wr_tx_pend including .priv */
+		memset(link->wr_tx_v2_pend, 0,
+		       sizeof(*link->wr_tx_v2_pend));
+		memset(link->lgr->wr_tx_buf_v2, 0,
+		       sizeof(*link->lgr->wr_tx_buf_v2));
+	} else {
+		link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
+		if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
+			complete(&link->wr_tx_compl[pnd_snd_idx]);
+		memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
+		       sizeof(pnd_snd));
+		/* clear the full struct smc_wr_tx_pend including .priv */
+		memset(&link->wr_tx_pends[pnd_snd_idx], 0,
+		       sizeof(link->wr_tx_pends[pnd_snd_idx]));
+		memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
+		       sizeof(link->wr_tx_bufs[pnd_snd_idx]));
+		if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
+			return;
+	}
+
 	if (wc->status) {
 		for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
 			/* clear full struct smc_wr_tx_pend including .priv */
@@ -123,6 +136,12 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
 			       sizeof(link->wr_tx_bufs[i]));
 			clear_bit(i, link->wr_tx_mask);
 		}
+		if (link->lgr->smc_version == SMC_V2) {
+			memset(link->wr_tx_v2_pend, 0,
+			       sizeof(*link->wr_tx_v2_pend));
+			memset(link->lgr->wr_tx_buf_v2, 0,
+			       sizeof(*link->lgr->wr_tx_buf_v2));
+		}
 		/* terminate link */
 		smcr_link_down_cond_sched(link);
 	}
@@ -256,6 +275,14 @@ int smc_wr_tx_put_slot(struct smc_link *link,
 		test_and_clear_bit(idx, link->wr_tx_mask);
 		wake_up(&link->wr_tx_wait);
 		return 1;
+	} else if (link->lgr->smc_version == SMC_V2 &&
+		   pend->idx == link->wr_tx_cnt) {
+		/* Large v2 buffer */
+		memset(&link->wr_tx_v2_pend, 0,
+		       sizeof(link->wr_tx_v2_pend));
+		memset(&link->lgr->wr_tx_buf_v2, 0,
+		       sizeof(link->lgr->wr_tx_buf_v2));
+		return 1;
 	}
 
 	return 0;
@@ -517,6 +544,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
 
 static void smc_wr_init_sge(struct smc_link *lnk)
 {
+	int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1;
 	u32 i;
 
 	for (i = 0; i < lnk->wr_tx_cnt; i++) {
@@ -545,14 +573,44 @@ static void smc_wr_init_sge(struct smc_link *lnk)
 		lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list =
 			lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge;
 	}
+
+	if (lnk->lgr->smc_version == SMC_V2) {
+		lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr;
+		lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE;
+		lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey;
+
+		lnk->wr_tx_v2_ib->next = NULL;
+		lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge;
+		lnk->wr_tx_v2_ib->num_sge = 1;
+		lnk->wr_tx_v2_ib->opcode = IB_WR_SEND;
+		lnk->wr_tx_v2_ib->send_flags =
+			IB_SEND_SIGNALED | IB_SEND_SOLICITED;
+	}
+
+	/* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE.
+	 * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer
+	 * and the same buffer for all sges. When a larger message arrived then
+	 * the content of the first small sge is copied to the beginning of
+	 * the larger spillover buffer, allowing easy data mapping.
+	 */
 	for (i = 0; i < lnk->wr_rx_cnt; i++) {
-		lnk->wr_rx_sges[i].addr =
+		int x = i * sges_per_buf;
+
+		lnk->wr_rx_sges[x].addr =
 			lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE;
-		lnk->wr_rx_sges[i].length = SMC_WR_BUF_SIZE;
-		lnk->wr_rx_sges[i].lkey = lnk->roce_pd->local_dma_lkey;
+		lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE;
+		lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey;
+		if (lnk->lgr->smc_version == SMC_V2) {
+			lnk->wr_rx_sges[x + 1].addr =
+					lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE;
+			lnk->wr_rx_sges[x + 1].length =
+					SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE;
+			lnk->wr_rx_sges[x + 1].lkey =
+					lnk->roce_pd->local_dma_lkey;
+		}
 		lnk->wr_rx_ibs[i].next = NULL;
-		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[i];
-		lnk->wr_rx_ibs[i].num_sge = 1;
+		lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x];
+		lnk->wr_rx_ibs[i].num_sge = sges_per_buf;
 	}
 	lnk->wr_reg.wr.next = NULL;
 	lnk->wr_reg.wr.num_sge = 0;
@@ -585,16 +643,45 @@ void smc_wr_free_link(struct smc_link *lnk)
 				    DMA_FROM_DEVICE);
 		lnk->wr_rx_dma_addr = 0;
 	}
+	if (lnk->wr_rx_v2_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
+				    SMC_WR_BUF_V2_SIZE,
+				    DMA_FROM_DEVICE);
+		lnk->wr_rx_v2_dma_addr = 0;
+	}
 	if (lnk->wr_tx_dma_addr) {
 		ib_dma_unmap_single(ibdev, lnk->wr_tx_dma_addr,
 				    SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
 				    DMA_TO_DEVICE);
 		lnk->wr_tx_dma_addr = 0;
 	}
+	if (lnk->wr_tx_v2_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
+				    SMC_WR_BUF_V2_SIZE,
+				    DMA_TO_DEVICE);
+		lnk->wr_tx_v2_dma_addr = 0;
+	}
+}
+
+void smc_wr_free_lgr_mem(struct smc_link_group *lgr)
+{
+	if (lgr->smc_version < SMC_V2)
+		return;
+
+	kfree(lgr->wr_rx_buf_v2);
+	lgr->wr_rx_buf_v2 = NULL;
+	kfree(lgr->wr_tx_buf_v2);
+	lgr->wr_tx_buf_v2 = NULL;
 }
 
 void smc_wr_free_link_mem(struct smc_link *lnk)
 {
+	kfree(lnk->wr_tx_v2_ib);
+	lnk->wr_tx_v2_ib = NULL;
+	kfree(lnk->wr_tx_v2_sge);
+	lnk->wr_tx_v2_sge = NULL;
+	kfree(lnk->wr_tx_v2_pend);
+	lnk->wr_tx_v2_pend = NULL;
 	kfree(lnk->wr_tx_compl);
 	lnk->wr_tx_compl = NULL;
 	kfree(lnk->wr_tx_pends);
@@ -619,8 +706,26 @@ void smc_wr_free_link_mem(struct smc_link *lnk)
 	lnk->wr_rx_bufs = NULL;
 }
 
+int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr)
+{
+	if (lgr->smc_version < SMC_V2)
+		return 0;
+
+	lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
+	if (!lgr->wr_rx_buf_v2)
+		return -ENOMEM;
+	lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL);
+	if (!lgr->wr_tx_buf_v2) {
+		kfree(lgr->wr_rx_buf_v2);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
 int smc_wr_alloc_link_mem(struct smc_link *link)
 {
+	int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1;
+
 	/* allocate link related memory */
 	link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL);
 	if (!link->wr_tx_bufs)
@@ -653,7 +758,7 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
 	if (!link->wr_tx_sges)
 		goto no_mem_wr_tx_rdma_sges;
 	link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3,
-				   sizeof(link->wr_rx_sges[0]),
+				   sizeof(link->wr_rx_sges[0]) * sges_per_buf,
 				   GFP_KERNEL);
 	if (!link->wr_rx_sges)
 		goto no_mem_wr_tx_sges;
@@ -672,8 +777,29 @@ int smc_wr_alloc_link_mem(struct smc_link *link)
 				    GFP_KERNEL);
 	if (!link->wr_tx_compl)
 		goto no_mem_wr_tx_pends;
+
+	if (link->lgr->smc_version == SMC_V2) {
+		link->wr_tx_v2_ib = kzalloc(sizeof(*link->wr_tx_v2_ib),
+					    GFP_KERNEL);
+		if (!link->wr_tx_v2_ib)
+			goto no_mem_tx_compl;
+		link->wr_tx_v2_sge = kzalloc(sizeof(*link->wr_tx_v2_sge),
+					     GFP_KERNEL);
+		if (!link->wr_tx_v2_sge)
+			goto no_mem_v2_ib;
+		link->wr_tx_v2_pend = kzalloc(sizeof(*link->wr_tx_v2_pend),
+					      GFP_KERNEL);
+		if (!link->wr_tx_v2_pend)
+			goto no_mem_v2_sge;
+	}
 	return 0;
 
+no_mem_v2_sge:
+	kfree(link->wr_tx_v2_sge);
+no_mem_v2_ib:
+	kfree(link->wr_tx_v2_ib);
+no_mem_tx_compl:
+	kfree(link->wr_tx_compl);
 no_mem_wr_tx_pends:
 	kfree(link->wr_tx_pends);
 no_mem_wr_tx_mask:
@@ -725,6 +851,24 @@ int smc_wr_create_link(struct smc_link *lnk)
 		rc = -EIO;
 		goto out;
 	}
+	if (lnk->lgr->smc_version == SMC_V2) {
+		lnk->wr_rx_v2_dma_addr = ib_dma_map_single(ibdev,
+			lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE,
+			DMA_FROM_DEVICE);
+		if (ib_dma_mapping_error(ibdev, lnk->wr_rx_v2_dma_addr)) {
+			lnk->wr_rx_v2_dma_addr = 0;
+			rc = -EIO;
+			goto dma_unmap;
+		}
+		lnk->wr_tx_v2_dma_addr = ib_dma_map_single(ibdev,
+			lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE,
+			DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(ibdev, lnk->wr_tx_v2_dma_addr)) {
+			lnk->wr_tx_v2_dma_addr = 0;
+			rc = -EIO;
+			goto dma_unmap;
+		}
+	}
 	lnk->wr_tx_dma_addr = ib_dma_map_single(
 		ibdev, lnk->wr_tx_bufs,	SMC_WR_BUF_SIZE * lnk->wr_tx_cnt,
 		DMA_TO_DEVICE);
@@ -742,6 +886,18 @@ int smc_wr_create_link(struct smc_link *lnk)
 	return rc;
 
 dma_unmap:
+	if (lnk->wr_rx_v2_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_rx_v2_dma_addr,
+				    SMC_WR_BUF_V2_SIZE,
+				    DMA_FROM_DEVICE);
+		lnk->wr_rx_v2_dma_addr = 0;
+	}
+	if (lnk->wr_tx_v2_dma_addr) {
+		ib_dma_unmap_single(ibdev, lnk->wr_tx_v2_dma_addr,
+				    SMC_WR_BUF_V2_SIZE,
+				    DMA_TO_DEVICE);
+		lnk->wr_tx_v2_dma_addr = 0;
+	}
 	ib_dma_unmap_single(ibdev, lnk->wr_rx_dma_addr,
 			    SMC_WR_BUF_SIZE * lnk->wr_rx_cnt,
 			    DMA_FROM_DEVICE);
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 2bc626f230a5..6d2cdb231859 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -101,8 +101,10 @@ static inline int smc_wr_rx_post(struct smc_link *link)
 
 int smc_wr_create_link(struct smc_link *lnk);
 int smc_wr_alloc_link_mem(struct smc_link *lnk);
+int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr);
 void smc_wr_free_link(struct smc_link *lnk);
 void smc_wr_free_link_mem(struct smc_link *lnk);
+void smc_wr_free_lgr_mem(struct smc_link_group *lgr);
 void smc_wr_remember_qp_attr(struct smc_link *lnk);
 void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
 void smc_wr_add_dev(struct smc_ib_device *smcibdev);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 09/11] net/smc: extend LLC layer for SMC-Rv2
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
                   ` (7 preceding siblings ...)
  2021-10-14 16:47 ` [PATCH net-next v2 08/11] net/smc: add v2 support to the work request layer Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 10/11] net/smc: add netlink support " Karsten Graul
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

Add support for large v2 LLC control messages in smc_llc.c.
The new large work request buffer allows to combine control
messages into one packet that had to be spread over several
packets before.
Add handling of the new v2 LLC messages.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 net/smc/af_smc.c   |   8 +-
 net/smc/smc.h      |  20 +-
 net/smc/smc_core.h |   1 +
 net/smc/smc_llc.c  | 556 ++++++++++++++++++++++++++++++++++++---------
 net/smc/smc_llc.h  |  12 +-
 net/smc/smc_wr.c   |  43 ++++
 net/smc/smc_wr.h   |   6 +
 7 files changed, 531 insertions(+), 115 deletions(-)

diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index ce5feb68382f..5e50e007a7da 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -786,9 +786,9 @@ static int smc_connect_clc(struct smc_sock *smc,
 				SMC_CLC_ACCEPT, CLC_WAIT_TIME);
 }
 
-static void smc_fill_gid_list(struct smc_link_group *lgr,
-			      struct smc_gidlist *gidlist,
-			      struct smc_ib_device *known_dev, u8 *known_gid)
+void smc_fill_gid_list(struct smc_link_group *lgr,
+		       struct smc_gidlist *gidlist,
+		       struct smc_ib_device *known_dev, u8 *known_gid)
 {
 	struct smc_init_info *alt_ini = NULL;
 
@@ -1435,7 +1435,7 @@ static int smcr_serv_conf_first_link(struct smc_sock *smc)
 	smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
 
 	/* initial contact - try to establish second link */
-	smc_llc_srv_add_link(link);
+	smc_llc_srv_add_link(link, NULL);
 	return 0;
 }
 
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 5e7def3ab730..f4286ca1f228 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -56,7 +56,20 @@ enum smc_state {		/* possible states of an SMC socket */
 struct smc_link_group;
 
 struct smc_wr_rx_hdr {	/* common prefix part of LLC and CDC to demultiplex */
-	u8			type;
+	union {
+		u8 type;
+#if defined(__BIG_ENDIAN_BITFIELD)
+		struct {
+			u8 llc_version:4,
+			   llc_type:4;
+		};
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+		struct {
+			u8 llc_type:4,
+			   llc_version:4;
+		};
+#endif
+	};
 } __aligned(1);
 
 struct smc_cdc_conn_state_flags {
@@ -286,7 +299,12 @@ static inline bool using_ipsec(struct smc_sock *smc)
 }
 #endif
 
+struct smc_gidlist;
+
 struct sock *smc_accept_dequeue(struct sock *parent, struct socket *new_sock);
 void smc_close_non_accepted(struct sock *sk);
+void smc_fill_gid_list(struct smc_link_group *lgr,
+		       struct smc_gidlist *gidlist,
+		       struct smc_ib_device *known_dev, u8 *known_gid);
 
 #endif	/* __SMC_H */
diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h
index 5997657ee9cc..59cef3b830d8 100644
--- a/net/smc/smc_core.h
+++ b/net/smc/smc_core.h
@@ -218,6 +218,7 @@ enum smc_llc_flowtype {
 	SMC_LLC_FLOW_NONE	= 0,
 	SMC_LLC_FLOW_ADD_LINK	= 2,
 	SMC_LLC_FLOW_DEL_LINK	= 4,
+	SMC_LLC_FLOW_REQ_ADD_LINK = 5,
 	SMC_LLC_FLOW_RKEY	= 6,
 };
 
diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c
index 760f5ed7c8f0..a9623c952007 100644
--- a/net/smc/smc_llc.c
+++ b/net/smc/smc_llc.c
@@ -23,16 +23,24 @@
 
 struct smc_llc_hdr {
 	struct smc_wr_rx_hdr common;
-	u8 length;	/* 44 */
-#if defined(__BIG_ENDIAN_BITFIELD)
-	u8 reserved:4,
-	   add_link_rej_rsn:4;
+	union {
+		struct {
+			u8 length;	/* 44 */
+	#if defined(__BIG_ENDIAN_BITFIELD)
+			u8 reserved:4,
+			   add_link_rej_rsn:4;
 #elif defined(__LITTLE_ENDIAN_BITFIELD)
-	u8 add_link_rej_rsn:4,
-	   reserved:4;
+			u8 add_link_rej_rsn:4,
+			   reserved:4;
 #endif
+		};
+		u16 length_v2;	/* 44 - 8192*/
+	};
 	u8 flags;
-};
+} __packed;		/* format defined in
+			 * IBM Shared Memory Communications Version 2
+			 * (https://www.ibm.com/support/pages/node/6326337)
+			 */
 
 #define SMC_LLC_FLAG_NO_RMBE_EYEC	0x03
 
@@ -76,6 +84,32 @@ struct smc_llc_msg_add_link_cont_rt {
 	__be64 rmb_vaddr_new;
 };
 
+struct smc_llc_msg_add_link_v2_ext {
+#if defined(__BIG_ENDIAN_BITFIELD)
+	u8 v2_direct : 1,
+	   reserved  : 7;
+#elif defined(__LITTLE_ENDIAN_BITFIELD)
+	u8 reserved  : 7,
+	   v2_direct : 1;
+#endif
+	u8 reserved2;
+	u8 client_target_gid[SMC_GID_SIZE];
+	u8 reserved3[8];
+	u16 num_rkeys;
+	struct smc_llc_msg_add_link_cont_rt rt[];
+} __packed;		/* format defined in
+			 * IBM Shared Memory Communications Version 2
+			 * (https://www.ibm.com/support/pages/node/6326337)
+			 */
+
+struct smc_llc_msg_req_add_link_v2 {
+	struct smc_llc_hdr hd;
+	u8 reserved[20];
+	u8 gid_cnt;
+	u8 reserved2[3];
+	u8 gid[][SMC_GID_SIZE];
+};
+
 #define SMC_LLC_RKEYS_PER_CONT_MSG	2
 
 struct smc_llc_msg_add_link_cont {	/* type 0x03 */
@@ -114,7 +148,8 @@ struct smc_rmb_rtoken {
 	__be64 rmb_vaddr;
 } __packed;			/* format defined in RFC7609 */
 
-#define SMC_LLC_RKEYS_PER_MSG	3
+#define SMC_LLC_RKEYS_PER_MSG		3
+#define SMC_LLC_RKEYS_PER_MSG_V2	255
 
 struct smc_llc_msg_confirm_rkey {	/* type 0x06 */
 	struct smc_llc_hdr hd;
@@ -135,9 +170,18 @@ struct smc_llc_msg_delete_rkey {	/* type 0x09 */
 	u8 reserved2[4];
 };
 
+struct smc_llc_msg_delete_rkey_v2 {	/* type 0x29 */
+	struct smc_llc_hdr hd;
+	u8 num_rkeys;
+	u8 num_inval_rkeys;
+	u8 reserved[2];
+	__be32 rkey[];
+};
+
 union smc_llc_msg {
 	struct smc_llc_msg_confirm_link confirm_link;
 	struct smc_llc_msg_add_link add_link;
+	struct smc_llc_msg_req_add_link_v2 req_add_link;
 	struct smc_llc_msg_add_link_cont add_link_cont;
 	struct smc_llc_msg_del_link delete_link;
 
@@ -189,7 +233,7 @@ static inline void smc_llc_flow_qentry_set(struct smc_llc_flow *flow,
 static void smc_llc_flow_parallel(struct smc_link_group *lgr, u8 flow_type,
 				  struct smc_llc_qentry *qentry)
 {
-	u8 msg_type = qentry->msg.raw.hdr.common.type;
+	u8 msg_type = qentry->msg.raw.hdr.common.llc_type;
 
 	if ((msg_type == SMC_LLC_ADD_LINK || msg_type == SMC_LLC_DELETE_LINK) &&
 	    flow_type != msg_type && !lgr->delayed_event) {
@@ -219,7 +263,7 @@ static bool smc_llc_flow_start(struct smc_llc_flow *flow,
 		spin_unlock_bh(&lgr->llc_flow_lock);
 		return false;
 	}
-	switch (qentry->msg.raw.hdr.common.type) {
+	switch (qentry->msg.raw.hdr.common.llc_type) {
 	case SMC_LLC_ADD_LINK:
 		flow->type = SMC_LLC_FLOW_ADD_LINK;
 		break;
@@ -306,7 +350,7 @@ struct smc_llc_qentry *smc_llc_wait(struct smc_link_group *lgr,
 		smc_llc_flow_qentry_del(flow);
 		goto out;
 	}
-	rcv_msg = flow->qentry->msg.raw.hdr.common.type;
+	rcv_msg = flow->qentry->msg.raw.hdr.common.llc_type;
 	if (exp_msg && rcv_msg != exp_msg) {
 		if (exp_msg == SMC_LLC_ADD_LINK &&
 		    rcv_msg == SMC_LLC_DELETE_LINK) {
@@ -374,6 +418,30 @@ static int smc_llc_add_pending_send(struct smc_link *link,
 	return 0;
 }
 
+static int smc_llc_add_pending_send_v2(struct smc_link *link,
+				       struct smc_wr_v2_buf **wr_buf,
+				       struct smc_wr_tx_pend_priv **pend)
+{
+	int rc;
+
+	rc = smc_wr_tx_get_v2_slot(link, smc_llc_tx_handler, wr_buf, pend);
+	if (rc < 0)
+		return rc;
+	return 0;
+}
+
+static void smc_llc_init_msg_hdr(struct smc_llc_hdr *hdr,
+				 struct smc_link_group *lgr, size_t len)
+{
+	if (lgr->smc_version == SMC_V2) {
+		hdr->common.llc_version = SMC_V2;
+		hdr->length_v2 = len;
+	} else {
+		hdr->common.llc_version = 0;
+		hdr->length = len;
+	}
+}
+
 /* high-level API to send LLC confirm link */
 int smc_llc_send_confirm_link(struct smc_link *link,
 			      enum smc_llc_reqresp reqresp)
@@ -390,8 +458,8 @@ int smc_llc_send_confirm_link(struct smc_link *link,
 		goto put_out;
 	confllc = (struct smc_llc_msg_confirm_link *)wr_buf;
 	memset(confllc, 0, sizeof(*confllc));
-	confllc->hd.common.type = SMC_LLC_CONFIRM_LINK;
-	confllc->hd.length = sizeof(struct smc_llc_msg_confirm_link);
+	confllc->hd.common.llc_type = SMC_LLC_CONFIRM_LINK;
+	smc_llc_init_msg_hdr(&confllc->hd, link->lgr, sizeof(*confllc));
 	confllc->hd.flags |= SMC_LLC_FLAG_NO_RMBE_EYEC;
 	if (reqresp == SMC_LLC_RESP)
 		confllc->hd.flags |= SMC_LLC_FLAG_RESP;
@@ -426,8 +494,8 @@ static int smc_llc_send_confirm_rkey(struct smc_link *send_link,
 		goto put_out;
 	rkeyllc = (struct smc_llc_msg_confirm_rkey *)wr_buf;
 	memset(rkeyllc, 0, sizeof(*rkeyllc));
-	rkeyllc->hd.common.type = SMC_LLC_CONFIRM_RKEY;
-	rkeyllc->hd.length = sizeof(struct smc_llc_msg_confirm_rkey);
+	rkeyllc->hd.common.llc_type = SMC_LLC_CONFIRM_RKEY;
+	smc_llc_init_msg_hdr(&rkeyllc->hd, send_link->lgr, sizeof(*rkeyllc));
 
 	rtok_ix = 1;
 	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
@@ -471,8 +539,8 @@ static int smc_llc_send_delete_rkey(struct smc_link *link,
 		goto put_out;
 	rkeyllc = (struct smc_llc_msg_delete_rkey *)wr_buf;
 	memset(rkeyllc, 0, sizeof(*rkeyllc));
-	rkeyllc->hd.common.type = SMC_LLC_DELETE_RKEY;
-	rkeyllc->hd.length = sizeof(struct smc_llc_msg_delete_rkey);
+	rkeyllc->hd.common.llc_type = SMC_LLC_DELETE_RKEY;
+	smc_llc_init_msg_hdr(&rkeyllc->hd, link->lgr, sizeof(*rkeyllc));
 	rkeyllc->num_rkeys = 1;
 	rkeyllc->rkey[0] = htonl(rmb_desc->mr_rx[link->link_idx]->rkey);
 	/* send llc message */
@@ -482,26 +550,116 @@ static int smc_llc_send_delete_rkey(struct smc_link *link,
 	return rc;
 }
 
+/* return first buffer from any of the next buf lists */
+static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr,
+						  int *buf_lst)
+{
+	struct smc_buf_desc *buf_pos;
+
+	while (*buf_lst < SMC_RMBE_SIZES) {
+		buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst],
+						   struct smc_buf_desc, list);
+		if (buf_pos)
+			return buf_pos;
+		(*buf_lst)++;
+	}
+	return NULL;
+}
+
+/* return next rmb from buffer lists */
+static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr,
+						 int *buf_lst,
+						 struct smc_buf_desc *buf_pos)
+{
+	struct smc_buf_desc *buf_next;
+
+	if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) {
+		(*buf_lst)++;
+		return _smc_llc_get_next_rmb(lgr, buf_lst);
+	}
+	buf_next = list_next_entry(buf_pos, list);
+	return buf_next;
+}
+
+static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr,
+						  int *buf_lst)
+{
+	*buf_lst = 0;
+	return smc_llc_get_next_rmb(lgr, buf_lst, NULL);
+}
+
+static int smc_llc_fill_ext_v2(struct smc_llc_msg_add_link_v2_ext *ext,
+			       struct smc_link *link, struct smc_link *link_new)
+{
+	struct smc_link_group *lgr = link->lgr;
+	struct smc_buf_desc *buf_pos;
+	int prim_lnk_idx, lnk_idx, i;
+	struct smc_buf_desc *rmb;
+	int len = sizeof(*ext);
+	int buf_lst;
+
+	ext->v2_direct = !lgr->uses_gateway;
+	memcpy(ext->client_target_gid, link_new->gid, SMC_GID_SIZE);
+
+	prim_lnk_idx = link->link_idx;
+	lnk_idx = link_new->link_idx;
+	mutex_lock(&lgr->rmbs_lock);
+	ext->num_rkeys = lgr->conns_num;
+	if (!ext->num_rkeys)
+		goto out;
+	buf_pos = smc_llc_get_first_rmb(lgr, &buf_lst);
+	for (i = 0; i < ext->num_rkeys; i++) {
+		if (!buf_pos)
+			break;
+		rmb = buf_pos;
+		ext->rt[i].rmb_key = htonl(rmb->mr_rx[prim_lnk_idx]->rkey);
+		ext->rt[i].rmb_key_new = htonl(rmb->mr_rx[lnk_idx]->rkey);
+		ext->rt[i].rmb_vaddr_new =
+			cpu_to_be64((u64)sg_dma_address(rmb->sgt[lnk_idx].sgl));
+		buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos);
+		while (buf_pos && !(buf_pos)->used)
+			buf_pos = smc_llc_get_next_rmb(lgr, &buf_lst, buf_pos);
+	}
+	len += i * sizeof(ext->rt[0]);
+out:
+	mutex_unlock(&lgr->rmbs_lock);
+	return len;
+}
+
 /* send ADD LINK request or response */
 int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
 			  struct smc_link *link_new,
 			  enum smc_llc_reqresp reqresp)
 {
+	struct smc_llc_msg_add_link_v2_ext *ext = NULL;
 	struct smc_llc_msg_add_link *addllc;
 	struct smc_wr_tx_pend_priv *pend;
-	struct smc_wr_buf *wr_buf;
+	int len = sizeof(*addllc);
 	int rc;
 
 	if (!smc_wr_tx_link_hold(link))
 		return -ENOLINK;
-	rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
-	if (rc)
-		goto put_out;
-	addllc = (struct smc_llc_msg_add_link *)wr_buf;
+	if (link->lgr->smc_version == SMC_V2) {
+		struct smc_wr_v2_buf *wr_buf;
+
+		rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend);
+		if (rc)
+			goto put_out;
+		addllc = (struct smc_llc_msg_add_link *)wr_buf;
+		ext = (struct smc_llc_msg_add_link_v2_ext *)
+						&wr_buf->raw[sizeof(*addllc)];
+		memset(ext, 0, SMC_WR_TX_SIZE);
+	} else {
+		struct smc_wr_buf *wr_buf;
+
+		rc = smc_llc_add_pending_send(link, &wr_buf, &pend);
+		if (rc)
+			goto put_out;
+		addllc = (struct smc_llc_msg_add_link *)wr_buf;
+	}
 
 	memset(addllc, 0, sizeof(*addllc));
-	addllc->hd.common.type = SMC_LLC_ADD_LINK;
-	addllc->hd.length = sizeof(struct smc_llc_msg_add_link);
+	addllc->hd.common.llc_type = SMC_LLC_ADD_LINK;
 	if (reqresp == SMC_LLC_RESP)
 		addllc->hd.flags |= SMC_LLC_FLAG_RESP;
 	memcpy(addllc->sender_mac, mac, ETH_ALEN);
@@ -516,8 +674,14 @@ int smc_llc_send_add_link(struct smc_link *link, u8 mac[], u8 gid[],
 			addllc->qp_mtu = min(link_new->path_mtu,
 					     link_new->peer_mtu);
 	}
+	if (ext && link_new)
+		len += smc_llc_fill_ext_v2(ext, link, link_new);
+	smc_llc_init_msg_hdr(&addllc->hd, link->lgr, len);
 	/* send llc message */
-	rc = smc_wr_tx_send(link, pend);
+	if (link->lgr->smc_version == SMC_V2)
+		rc = smc_wr_tx_v2_send(link, pend, len);
+	else
+		rc = smc_wr_tx_send(link, pend);
 put_out:
 	smc_wr_tx_link_put(link);
 	return rc;
@@ -541,8 +705,8 @@ int smc_llc_send_delete_link(struct smc_link *link, u8 link_del_id,
 	delllc = (struct smc_llc_msg_del_link *)wr_buf;
 
 	memset(delllc, 0, sizeof(*delllc));
-	delllc->hd.common.type = SMC_LLC_DELETE_LINK;
-	delllc->hd.length = sizeof(struct smc_llc_msg_del_link);
+	delllc->hd.common.llc_type = SMC_LLC_DELETE_LINK;
+	smc_llc_init_msg_hdr(&delllc->hd, link->lgr, sizeof(*delllc));
 	if (reqresp == SMC_LLC_RESP)
 		delllc->hd.flags |= SMC_LLC_FLAG_RESP;
 	if (orderly)
@@ -574,8 +738,8 @@ static int smc_llc_send_test_link(struct smc_link *link, u8 user_data[16])
 		goto put_out;
 	testllc = (struct smc_llc_msg_test_link *)wr_buf;
 	memset(testllc, 0, sizeof(*testllc));
-	testllc->hd.common.type = SMC_LLC_TEST_LINK;
-	testllc->hd.length = sizeof(struct smc_llc_msg_test_link);
+	testllc->hd.common.llc_type = SMC_LLC_TEST_LINK;
+	smc_llc_init_msg_hdr(&testllc->hd, link->lgr, sizeof(*testllc));
 	memcpy(testllc->user_data, user_data, sizeof(testllc->user_data));
 	/* send llc message */
 	rc = smc_wr_tx_send(link, pend);
@@ -651,44 +815,6 @@ static int smc_llc_alloc_alt_link(struct smc_link_group *lgr,
 	return -EMLINK;
 }
 
-/* return first buffer from any of the next buf lists */
-static struct smc_buf_desc *_smc_llc_get_next_rmb(struct smc_link_group *lgr,
-						  int *buf_lst)
-{
-	struct smc_buf_desc *buf_pos;
-
-	while (*buf_lst < SMC_RMBE_SIZES) {
-		buf_pos = list_first_entry_or_null(&lgr->rmbs[*buf_lst],
-						   struct smc_buf_desc, list);
-		if (buf_pos)
-			return buf_pos;
-		(*buf_lst)++;
-	}
-	return NULL;
-}
-
-/* return next rmb from buffer lists */
-static struct smc_buf_desc *smc_llc_get_next_rmb(struct smc_link_group *lgr,
-						 int *buf_lst,
-						 struct smc_buf_desc *buf_pos)
-{
-	struct smc_buf_desc *buf_next;
-
-	if (!buf_pos || list_is_last(&buf_pos->list, &lgr->rmbs[*buf_lst])) {
-		(*buf_lst)++;
-		return _smc_llc_get_next_rmb(lgr, buf_lst);
-	}
-	buf_next = list_next_entry(buf_pos, list);
-	return buf_next;
-}
-
-static struct smc_buf_desc *smc_llc_get_first_rmb(struct smc_link_group *lgr,
-						  int *buf_lst)
-{
-	*buf_lst = 0;
-	return smc_llc_get_next_rmb(lgr, buf_lst, NULL);
-}
-
 /* send one add_link_continue msg */
 static int smc_llc_add_link_cont(struct smc_link *link,
 				 struct smc_link *link_new, u8 *num_rkeys_todo,
@@ -734,7 +860,7 @@ static int smc_llc_add_link_cont(struct smc_link *link,
 		while (*buf_pos && !(*buf_pos)->used)
 			*buf_pos = smc_llc_get_next_rmb(lgr, buf_lst, *buf_pos);
 	}
-	addc_llc->hd.common.type = SMC_LLC_ADD_LINK_CONT;
+	addc_llc->hd.common.llc_type = SMC_LLC_ADD_LINK_CONT;
 	addc_llc->hd.length = sizeof(struct smc_llc_msg_add_link_cont);
 	if (lgr->role == SMC_CLNT)
 		addc_llc->hd.flags |= SMC_LLC_FLAG_RESP;
@@ -793,6 +919,8 @@ static int smc_llc_cli_add_link_reject(struct smc_llc_qentry *qentry)
 	qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
 	qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_ADD_LNK_REJ;
 	qentry->msg.raw.hdr.add_link_rej_rsn = SMC_LLC_REJ_RSN_NO_ALT_PATH;
+	smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr,
+			     sizeof(qentry->msg));
 	return smc_llc_send_message(qentry->link, &qentry->msg);
 }
 
@@ -813,7 +941,7 @@ static int smc_llc_cli_conf_link(struct smc_link *link,
 					      SMC_LLC_DEL_LOST_PATH);
 		return -ENOLINK;
 	}
-	if (qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) {
+	if (qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) {
 		/* received DELETE_LINK instead */
 		qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
 		smc_llc_send_message(link, &qentry->msg);
@@ -854,6 +982,26 @@ static int smc_llc_cli_conf_link(struct smc_link *link,
 	return 0;
 }
 
+static void smc_llc_save_add_link_rkeys(struct smc_link *link,
+					struct smc_link *link_new)
+{
+	struct smc_llc_msg_add_link_v2_ext *ext;
+	struct smc_link_group *lgr = link->lgr;
+	int max, i;
+
+	ext = (struct smc_llc_msg_add_link_v2_ext *)((u8 *)lgr->wr_rx_buf_v2 +
+						     SMC_WR_TX_SIZE);
+	max = min_t(u8, ext->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);
+	mutex_lock(&lgr->rmbs_lock);
+	for (i = 0; i < max; i++) {
+		smc_rtoken_set(lgr, link->link_idx, link_new->link_idx,
+			       ext->rt[i].rmb_key,
+			       ext->rt[i].rmb_vaddr_new,
+			       ext->rt[i].rmb_key_new);
+	}
+	mutex_unlock(&lgr->rmbs_lock);
+}
+
 static void smc_llc_save_add_link_info(struct smc_link *link,
 				       struct smc_llc_msg_add_link *add_llc)
 {
@@ -884,14 +1032,24 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
 	}
 
 	ini->vlan_id = lgr->vlan_id;
+	if (lgr->smc_version == SMC_V2) {
+		ini->check_smcrv2 = true;
+		ini->smcrv2.saddr = lgr->saddr;
+		ini->smcrv2.daddr = smc_ib_gid_to_ipv4(llc->sender_gid);
+	}
 	smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
 	if (!memcmp(llc->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
-	    !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN)) {
-		if (!ini->ib_dev)
+	    (lgr->smc_version == SMC_V2 ||
+	     !memcmp(llc->sender_mac, link->peer_mac, ETH_ALEN))) {
+		if (!ini->ib_dev && !ini->smcrv2.ib_dev_v2)
 			goto out_reject;
 		lgr_new_t = SMC_LGR_ASYMMETRIC_PEER;
 	}
-	if (!ini->ib_dev) {
+	if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) {
+		lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
+		ini->smcrv2.ib_dev_v2 = link->smcibdev;
+		ini->smcrv2.ib_port_v2 = link->ibport;
+	} else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) {
 		lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
 		ini->ib_dev = link->smcibdev;
 		ini->ib_port = link->ibport;
@@ -916,14 +1074,18 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
 		goto out_clear_lnk;
 
 	rc = smc_llc_send_add_link(link,
-				   lnk_new->smcibdev->mac[ini->ib_port - 1],
+				   lnk_new->smcibdev->mac[lnk_new->ibport - 1],
 				   lnk_new->gid, lnk_new, SMC_LLC_RESP);
 	if (rc)
 		goto out_clear_lnk;
-	rc = smc_llc_cli_rkey_exchange(link, lnk_new);
-	if (rc) {
-		rc = 0;
-		goto out_clear_lnk;
+	if (lgr->smc_version == SMC_V2) {
+		smc_llc_save_add_link_rkeys(link, lnk_new);
+	} else {
+		rc = smc_llc_cli_rkey_exchange(link, lnk_new);
+		if (rc) {
+			rc = 0;
+			goto out_clear_lnk;
+		}
 	}
 	rc = smc_llc_cli_conf_link(link, ini, lnk_new, lgr_new_t);
 	if (!rc)
@@ -939,6 +1101,44 @@ int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry)
 	return rc;
 }
 
+static void smc_llc_send_request_add_link(struct smc_link *link)
+{
+	struct smc_llc_msg_req_add_link_v2 *llc;
+	struct smc_wr_tx_pend_priv *pend;
+	struct smc_wr_v2_buf *wr_buf;
+	struct smc_gidlist gidlist;
+	int rc, len, i;
+
+	if (!smc_wr_tx_link_hold(link))
+		return;
+	if (link->lgr->type == SMC_LGR_SYMMETRIC ||
+	    link->lgr->type == SMC_LGR_ASYMMETRIC_PEER)
+		goto put_out;
+
+	smc_fill_gid_list(link->lgr, &gidlist, link->smcibdev, link->gid);
+	if (gidlist.len <= 1)
+		goto put_out;
+
+	rc = smc_llc_add_pending_send_v2(link, &wr_buf, &pend);
+	if (rc)
+		goto put_out;
+	llc = (struct smc_llc_msg_req_add_link_v2 *)wr_buf;
+	memset(llc, 0, SMC_WR_TX_SIZE);
+
+	llc->hd.common.llc_type = SMC_LLC_REQ_ADD_LINK;
+	for (i = 0; i < gidlist.len; i++)
+		memcpy(llc->gid[i], gidlist.list[i], sizeof(gidlist.list[0]));
+	llc->gid_cnt = gidlist.len;
+	len = sizeof(*llc) + (gidlist.len * sizeof(gidlist.list[0]));
+	smc_llc_init_msg_hdr(&llc->hd, link->lgr, len);
+	rc = smc_wr_tx_v2_send(link, pend, len);
+	if (!rc)
+		/* set REQ_ADD_LINK flow and wait for response from peer */
+		link->lgr->llc_flow_lcl.type = SMC_LLC_FLOW_REQ_ADD_LINK;
+put_out:
+	smc_wr_tx_link_put(link);
+}
+
 /* as an SMC client, invite server to start the add_link processing */
 static void smc_llc_cli_add_link_invite(struct smc_link *link,
 					struct smc_llc_qentry *qentry)
@@ -946,6 +1146,11 @@ static void smc_llc_cli_add_link_invite(struct smc_link *link,
 	struct smc_link_group *lgr = smc_get_lgr(link);
 	struct smc_init_info *ini = NULL;
 
+	if (lgr->smc_version == SMC_V2) {
+		smc_llc_send_request_add_link(link);
+		goto out;
+	}
+
 	if (lgr->type == SMC_LGR_SYMMETRIC ||
 	    lgr->type == SMC_LGR_ASYMMETRIC_PEER)
 		goto out;
@@ -978,7 +1183,7 @@ static bool smc_llc_is_empty_llc_message(union smc_llc_msg *llc)
 
 static bool smc_llc_is_local_add_link(union smc_llc_msg *llc)
 {
-	if (llc->raw.hdr.common.type == SMC_LLC_ADD_LINK &&
+	if (llc->raw.hdr.common.llc_type == SMC_LLC_ADD_LINK &&
 	    smc_llc_is_empty_llc_message(llc))
 		return true;
 	return false;
@@ -1145,7 +1350,7 @@ static int smc_llc_srv_conf_link(struct smc_link *link,
 	/* receive CONFIRM LINK response over the RoCE fabric */
 	qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_FIRST_TIME, 0);
 	if (!qentry ||
-	    qentry->msg.raw.hdr.common.type != SMC_LLC_CONFIRM_LINK) {
+	    qentry->msg.raw.hdr.common.llc_type != SMC_LLC_CONFIRM_LINK) {
 		/* send DELETE LINK */
 		smc_llc_send_delete_link(link, link_new->link_id, SMC_LLC_REQ,
 					 false, SMC_LLC_DEL_LOST_PATH);
@@ -1164,24 +1369,55 @@ static int smc_llc_srv_conf_link(struct smc_link *link,
 	return 0;
 }
 
-int smc_llc_srv_add_link(struct smc_link *link)
+static void smc_llc_send_req_add_link_response(struct smc_llc_qentry *qentry)
+{
+	qentry->msg.raw.hdr.flags |= SMC_LLC_FLAG_RESP;
+	smc_llc_init_msg_hdr(&qentry->msg.raw.hdr, qentry->link->lgr,
+			     sizeof(qentry->msg));
+	memset(&qentry->msg.raw.data, 0, sizeof(qentry->msg.raw.data));
+	smc_llc_send_message(qentry->link, &qentry->msg);
+}
+
+int smc_llc_srv_add_link(struct smc_link *link,
+			 struct smc_llc_qentry *req_qentry)
 {
 	enum smc_lgr_type lgr_new_t = SMC_LGR_SYMMETRIC;
 	struct smc_link_group *lgr = link->lgr;
 	struct smc_llc_msg_add_link *add_llc;
 	struct smc_llc_qentry *qentry = NULL;
+	bool send_req_add_link_resp = false;
 	struct smc_link *link_new = NULL;
-	struct smc_init_info *ini;
+	struct smc_init_info *ini = NULL;
 	int lnk_idx, rc = 0;
 
+	if (req_qentry &&
+	    req_qentry->msg.raw.hdr.common.llc_type == SMC_LLC_REQ_ADD_LINK)
+		send_req_add_link_resp = true;
+
 	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
-	if (!ini)
-		return -ENOMEM;
+	if (!ini) {
+		rc = -ENOMEM;
+		goto out;
+	}
 
 	/* ignore client add link recommendation, start new flow */
 	ini->vlan_id = lgr->vlan_id;
+	if (lgr->smc_version == SMC_V2) {
+		ini->check_smcrv2 = true;
+		ini->smcrv2.saddr = lgr->saddr;
+		if (send_req_add_link_resp) {
+			struct smc_llc_msg_req_add_link_v2 *req_add =
+				&req_qentry->msg.req_add_link;
+
+			ini->smcrv2.daddr = smc_ib_gid_to_ipv4(req_add->gid[0]);
+		}
+	}
 	smc_pnet_find_alt_roce(lgr, ini, link->smcibdev);
-	if (!ini->ib_dev) {
+	if (lgr->smc_version == SMC_V2 && !ini->smcrv2.ib_dev_v2) {
+		lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
+		ini->smcrv2.ib_dev_v2 = link->smcibdev;
+		ini->smcrv2.ib_port_v2 = link->ibport;
+	} else if (lgr->smc_version < SMC_V2 && !ini->ib_dev) {
 		lgr_new_t = SMC_LGR_ASYMMETRIC_LOCAL;
 		ini->ib_dev = link->smcibdev;
 		ini->ib_port = link->ibport;
@@ -1196,11 +1432,17 @@ int smc_llc_srv_add_link(struct smc_link *link)
 	if (rc)
 		goto out;
 	link_new = &lgr->lnk[lnk_idx];
+
+	rc = smcr_buf_map_lgr(link_new);
+	if (rc)
+		goto out_err;
+
 	rc = smc_llc_send_add_link(link,
-				   link_new->smcibdev->mac[ini->ib_port - 1],
+				   link_new->smcibdev->mac[link_new->ibport-1],
 				   link_new->gid, link_new, SMC_LLC_REQ);
 	if (rc)
 		goto out_err;
+	send_req_add_link_resp = false;
 	/* receive ADD LINK response over the RoCE fabric */
 	qentry = smc_llc_wait(lgr, link, SMC_LLC_WAIT_TIME, SMC_LLC_ADD_LINK);
 	if (!qentry) {
@@ -1215,24 +1457,26 @@ int smc_llc_srv_add_link(struct smc_link *link)
 	}
 	if (lgr->type == SMC_LGR_SINGLE &&
 	    (!memcmp(add_llc->sender_gid, link->peer_gid, SMC_GID_SIZE) &&
-	     !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN))) {
+	     (lgr->smc_version == SMC_V2 ||
+	      !memcmp(add_llc->sender_mac, link->peer_mac, ETH_ALEN)))) {
 		lgr_new_t = SMC_LGR_ASYMMETRIC_PEER;
 	}
 	smc_llc_save_add_link_info(link_new, add_llc);
 	smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
 
 	rc = smc_ib_ready_link(link_new);
-	if (rc)
-		goto out_err;
-	rc = smcr_buf_map_lgr(link_new);
 	if (rc)
 		goto out_err;
 	rc = smcr_buf_reg_lgr(link_new);
 	if (rc)
 		goto out_err;
-	rc = smc_llc_srv_rkey_exchange(link, link_new);
-	if (rc)
-		goto out_err;
+	if (lgr->smc_version == SMC_V2) {
+		smc_llc_save_add_link_rkeys(link, link_new);
+	} else {
+		rc = smc_llc_srv_rkey_exchange(link, link_new);
+		if (rc)
+			goto out_err;
+	}
 	rc = smc_llc_srv_conf_link(link, link_new, lgr_new_t);
 	if (rc)
 		goto out_err;
@@ -1245,23 +1489,27 @@ int smc_llc_srv_add_link(struct smc_link *link)
 	}
 out:
 	kfree(ini);
+	if (send_req_add_link_resp)
+		smc_llc_send_req_add_link_response(req_qentry);
 	return rc;
 }
 
 static void smc_llc_process_srv_add_link(struct smc_link_group *lgr)
 {
 	struct smc_link *link = lgr->llc_flow_lcl.qentry->link;
+	struct smc_llc_qentry *qentry;
 	int rc;
 
-	smc_llc_flow_qentry_del(&lgr->llc_flow_lcl);
+	qentry = smc_llc_flow_qentry_clr(&lgr->llc_flow_lcl);
 
 	mutex_lock(&lgr->llc_conf_mutex);
-	rc = smc_llc_srv_add_link(link);
+	rc = smc_llc_srv_add_link(link, qentry);
 	if (!rc && lgr->type == SMC_LGR_SYMMETRIC) {
 		/* delete any asymmetric link */
 		smc_llc_delete_asym_link(lgr);
 	}
 	mutex_unlock(&lgr->llc_conf_mutex);
+	kfree(qentry);
 }
 
 /* enqueue a local add_link req to trigger a new add_link flow */
@@ -1269,8 +1517,8 @@ void smc_llc_add_link_local(struct smc_link *link)
 {
 	struct smc_llc_msg_add_link add_llc = {};
 
-	add_llc.hd.length = sizeof(add_llc);
-	add_llc.hd.common.type = SMC_LLC_ADD_LINK;
+	add_llc.hd.common.llc_type = SMC_LLC_ADD_LINK;
+	smc_llc_init_msg_hdr(&add_llc.hd, link->lgr, sizeof(add_llc));
 	/* no dev and port needed */
 	smc_llc_enqueue(link, (union smc_llc_msg *)&add_llc);
 }
@@ -1292,7 +1540,8 @@ static void smc_llc_add_link_work(struct work_struct *work)
 	else
 		smc_llc_process_srv_add_link(lgr);
 out:
-	smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
+	if (lgr->llc_flow_lcl.type != SMC_LLC_FLOW_REQ_ADD_LINK)
+		smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
 }
 
 /* enqueue a local del_link msg to trigger a new del_link flow,
@@ -1302,8 +1551,8 @@ void smc_llc_srv_delete_link_local(struct smc_link *link, u8 del_link_id)
 {
 	struct smc_llc_msg_del_link del_llc = {};
 
-	del_llc.hd.length = sizeof(del_llc);
-	del_llc.hd.common.type = SMC_LLC_DELETE_LINK;
+	del_llc.hd.common.llc_type = SMC_LLC_DELETE_LINK;
+	smc_llc_init_msg_hdr(&del_llc.hd, link->lgr, sizeof(del_llc));
 	del_llc.link_num = del_link_id;
 	del_llc.reason = htonl(SMC_LLC_DEL_LOST_PATH);
 	del_llc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
@@ -1373,8 +1622,8 @@ void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn)
 	struct smc_llc_msg_del_link delllc = {};
 	int i;
 
-	delllc.hd.common.type = SMC_LLC_DELETE_LINK;
-	delllc.hd.length = sizeof(delllc);
+	delllc.hd.common.llc_type = SMC_LLC_DELETE_LINK;
+	smc_llc_init_msg_hdr(&delllc.hd, lgr, sizeof(delllc));
 	if (ord)
 		delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ORDERLY;
 	delllc.hd.flags |= SMC_LLC_FLAG_DEL_LINK_ALL;
@@ -1490,6 +1739,8 @@ static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr)
 	link = qentry->link;
 
 	num_entries = llc->rtoken[0].num_rkeys;
+	if (num_entries > SMC_LLC_RKEYS_PER_MSG)
+		goto out_err;
 	/* first rkey entry is for receiving link */
 	rk_idx = smc_rtoken_add(link,
 				llc->rtoken[0].rmb_vaddr,
@@ -1508,6 +1759,7 @@ static void smc_llc_rmt_conf_rkey(struct smc_link_group *lgr)
 	llc->hd.flags |= SMC_LLC_FLAG_RKEY_RETRY;
 out:
 	llc->hd.flags |= SMC_LLC_FLAG_RESP;
+	smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc));
 	smc_llc_send_message(link, &qentry->msg);
 	smc_llc_flow_qentry_del(&lgr->llc_flow_rmt);
 }
@@ -1525,6 +1777,28 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr)
 	llc = &qentry->msg.delete_rkey;
 	link = qentry->link;
 
+	if (lgr->smc_version == SMC_V2) {
+		struct smc_llc_msg_delete_rkey_v2 *llcv2;
+
+		memcpy(lgr->wr_rx_buf_v2, llc, sizeof(*llc));
+		llcv2 = (struct smc_llc_msg_delete_rkey_v2 *)lgr->wr_rx_buf_v2;
+		llcv2->num_inval_rkeys = 0;
+
+		max = min_t(u8, llcv2->num_rkeys, SMC_LLC_RKEYS_PER_MSG_V2);
+		for (i = 0; i < max; i++) {
+			if (smc_rtoken_delete(link, llcv2->rkey[i]))
+				llcv2->num_inval_rkeys++;
+		}
+		memset(&llc->rkey[0], 0, sizeof(llc->rkey));
+		memset(&llc->reserved2, 0, sizeof(llc->reserved2));
+		smc_llc_init_msg_hdr(&llc->hd, link->lgr, sizeof(*llc));
+		if (llcv2->num_inval_rkeys) {
+			llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
+			llc->err_mask = llcv2->num_inval_rkeys;
+		}
+		goto finish;
+	}
+
 	max = min_t(u8, llc->num_rkeys, SMC_LLC_DEL_RKEY_MAX);
 	for (i = 0; i < max; i++) {
 		if (smc_rtoken_delete(link, llc->rkey[i]))
@@ -1534,6 +1808,7 @@ static void smc_llc_rmt_delete_rkey(struct smc_link_group *lgr)
 		llc->hd.flags |= SMC_LLC_FLAG_RKEY_NEG;
 		llc->err_mask = err_mask;
 	}
+finish:
 	llc->hd.flags |= SMC_LLC_FLAG_RESP;
 	smc_llc_send_message(link, &qentry->msg);
 	smc_llc_flow_qentry_del(&lgr->llc_flow_rmt);
@@ -1569,7 +1844,7 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry)
 	if (!smc_link_usable(link))
 		goto out;
 
-	switch (llc->raw.hdr.common.type) {
+	switch (llc->raw.hdr.common.llc_type) {
 	case SMC_LLC_TEST_LINK:
 		llc->test_link.hd.flags |= SMC_LLC_FLAG_RESP;
 		smc_llc_send_message(link, llc);
@@ -1594,8 +1869,18 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry)
 				smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
 							qentry);
 				wake_up(&lgr->llc_msg_waiter);
-			} else if (smc_llc_flow_start(&lgr->llc_flow_lcl,
-						      qentry)) {
+				return;
+			}
+			if (lgr->llc_flow_lcl.type ==
+					SMC_LLC_FLOW_REQ_ADD_LINK) {
+				/* server started add_link processing */
+				lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK;
+				smc_llc_flow_qentry_set(&lgr->llc_flow_lcl,
+							qentry);
+				schedule_work(&lgr->llc_add_link_work);
+				return;
+			}
+			if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
 				schedule_work(&lgr->llc_add_link_work);
 			}
 		} else if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
@@ -1643,6 +1928,23 @@ static void smc_llc_event_handler(struct smc_llc_qentry *qentry)
 			smc_llc_flow_stop(lgr, &lgr->llc_flow_rmt);
 		}
 		return;
+	case SMC_LLC_REQ_ADD_LINK:
+		/* handle response here, smc_llc_flow_stop() cannot be called
+		 * in tasklet context
+		 */
+		if (lgr->role == SMC_CLNT &&
+		    lgr->llc_flow_lcl.type == SMC_LLC_FLOW_REQ_ADD_LINK &&
+		    (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP)) {
+			smc_llc_flow_stop(link->lgr, &lgr->llc_flow_lcl);
+		} else if (lgr->role == SMC_SERV) {
+			if (smc_llc_flow_start(&lgr->llc_flow_lcl, qentry)) {
+				/* as smc server, handle client suggestion */
+				lgr->llc_flow_lcl.type = SMC_LLC_FLOW_ADD_LINK;
+				schedule_work(&lgr->llc_add_link_work);
+			}
+			return;
+		}
+		break;
 	default:
 		smc_llc_protocol_violation(lgr, llc->raw.hdr.common.type);
 		break;
@@ -1686,7 +1988,7 @@ static void smc_llc_rx_response(struct smc_link *link,
 {
 	enum smc_llc_flowtype flowtype = link->lgr->llc_flow_lcl.type;
 	struct smc_llc_flow *flow = &link->lgr->llc_flow_lcl;
-	u8 llc_type = qentry->msg.raw.hdr.common.type;
+	u8 llc_type = qentry->msg.raw.hdr.common.llc_type;
 
 	switch (llc_type) {
 	case SMC_LLC_TEST_LINK:
@@ -1712,7 +2014,8 @@ static void smc_llc_rx_response(struct smc_link *link,
 		/* not used because max links is 3 */
 		break;
 	default:
-		smc_llc_protocol_violation(link->lgr, llc_type);
+		smc_llc_protocol_violation(link->lgr,
+					   qentry->msg.raw.hdr.common.type);
 		break;
 	}
 	kfree(qentry);
@@ -1737,7 +2040,8 @@ static void smc_llc_enqueue(struct smc_link *link, union smc_llc_msg *llc)
 	memcpy(&qentry->msg, llc, sizeof(union smc_llc_msg));
 
 	/* process responses immediately */
-	if (llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) {
+	if ((llc->raw.hdr.flags & SMC_LLC_FLAG_RESP) &&
+	    llc->raw.hdr.common.llc_type != SMC_LLC_REQ_ADD_LINK) {
 		smc_llc_rx_response(link, qentry);
 		return;
 	}
@@ -1757,8 +2061,13 @@ static void smc_llc_rx_handler(struct ib_wc *wc, void *buf)
 
 	if (wc->byte_len < sizeof(*llc))
 		return; /* short message */
-	if (llc->raw.hdr.length != sizeof(*llc))
-		return; /* invalid message */
+	if (!llc->raw.hdr.common.llc_version) {
+		if (llc->raw.hdr.length != sizeof(*llc))
+			return; /* invalid message */
+	} else {
+		if (llc->raw.hdr.length_v2 < sizeof(*llc))
+			return; /* invalid message */
+	}
 
 	smc_llc_enqueue(link, llc);
 }
@@ -1977,6 +2286,35 @@ static struct smc_wr_rx_handler smc_llc_rx_handlers[] = {
 		.handler	= smc_llc_rx_handler,
 		.type		= SMC_LLC_DELETE_RKEY
 	},
+	/* V2 types */
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_CONFIRM_LINK_V2
+	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_TEST_LINK_V2
+	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_ADD_LINK_V2
+	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_DELETE_LINK_V2
+	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_REQ_ADD_LINK_V2
+	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_CONFIRM_RKEY_V2
+	},
+	{
+		.handler	= smc_llc_rx_handler,
+		.type		= SMC_LLC_DELETE_RKEY_V2
+	},
 	{
 		.handler	= NULL,
 	}
diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h
index cc00a2ec4e92..4404e52b3346 100644
--- a/net/smc/smc_llc.h
+++ b/net/smc/smc_llc.h
@@ -30,10 +30,19 @@ enum smc_llc_msg_type {
 	SMC_LLC_ADD_LINK		= 0x02,
 	SMC_LLC_ADD_LINK_CONT		= 0x03,
 	SMC_LLC_DELETE_LINK		= 0x04,
+	SMC_LLC_REQ_ADD_LINK		= 0x05,
 	SMC_LLC_CONFIRM_RKEY		= 0x06,
 	SMC_LLC_TEST_LINK		= 0x07,
 	SMC_LLC_CONFIRM_RKEY_CONT	= 0x08,
 	SMC_LLC_DELETE_RKEY		= 0x09,
+	/* V2 types */
+	SMC_LLC_CONFIRM_LINK_V2		= 0x21,
+	SMC_LLC_ADD_LINK_V2		= 0x22,
+	SMC_LLC_DELETE_LINK_V2		= 0x24,
+	SMC_LLC_REQ_ADD_LINK_V2		= 0x25,
+	SMC_LLC_CONFIRM_RKEY_V2		= 0x26,
+	SMC_LLC_TEST_LINK_V2		= 0x27,
+	SMC_LLC_DELETE_RKEY_V2		= 0x29,
 };
 
 #define smc_link_downing(state) \
@@ -102,7 +111,8 @@ void smc_llc_flow_qentry_del(struct smc_llc_flow *flow);
 void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord,
 				  u32 rsn);
 int smc_llc_cli_add_link(struct smc_link *link, struct smc_llc_qentry *qentry);
-int smc_llc_srv_add_link(struct smc_link *link);
+int smc_llc_srv_add_link(struct smc_link *link,
+			 struct smc_llc_qentry *req_qentry);
 void smc_llc_add_link_local(struct smc_link *link);
 int smc_llc_init(void) __init;
 
diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
index 22d7324969cd..600ab5889227 100644
--- a/net/smc/smc_wr.c
+++ b/net/smc/smc_wr.c
@@ -258,6 +258,33 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
 	return 0;
 }
 
+int smc_wr_tx_get_v2_slot(struct smc_link *link,
+			  smc_wr_tx_handler handler,
+			  struct smc_wr_v2_buf **wr_buf,
+			  struct smc_wr_tx_pend_priv **wr_pend_priv)
+{
+	struct smc_wr_tx_pend *wr_pend;
+	struct ib_send_wr *wr_ib;
+	u64 wr_id;
+
+	if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt)
+		return -EBUSY;
+
+	*wr_buf = NULL;
+	*wr_pend_priv = NULL;
+	wr_id = smc_wr_tx_get_next_wr_id(link);
+	wr_pend = link->wr_tx_v2_pend;
+	wr_pend->wr_id = wr_id;
+	wr_pend->handler = handler;
+	wr_pend->link = link;
+	wr_pend->idx = link->wr_tx_cnt;
+	wr_ib = link->wr_tx_v2_ib;
+	wr_ib->wr_id = wr_id;
+	*wr_buf = link->lgr->wr_tx_buf_v2;
+	*wr_pend_priv = &wr_pend->priv;
+	return 0;
+}
+
 int smc_wr_tx_put_slot(struct smc_link *link,
 		       struct smc_wr_tx_pend_priv *wr_pend_priv)
 {
@@ -307,6 +334,22 @@ int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv)
 	return rc;
 }
 
+int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
+		      int len)
+{
+	int rc;
+
+	link->wr_tx_v2_ib->sg_list[0].length = len;
+	ib_req_notify_cq(link->smcibdev->roce_cq_send,
+			 IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+	rc = ib_post_send(link->roce_qp, link->wr_tx_v2_ib, NULL);
+	if (rc) {
+		smc_wr_tx_put_slot(link, priv);
+		smcr_link_down_cond_sched(link);
+	}
+	return rc;
+}
+
 /* Send prepared WR slot via ib_post_send and wait for send completion
  * notification.
  * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer
diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h
index 6d2cdb231859..f353311e6f84 100644
--- a/net/smc/smc_wr.h
+++ b/net/smc/smc_wr.h
@@ -113,10 +113,16 @@ int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
 			    struct smc_wr_buf **wr_buf,
 			    struct smc_rdma_wr **wrs,
 			    struct smc_wr_tx_pend_priv **wr_pend_priv);
+int smc_wr_tx_get_v2_slot(struct smc_link *link,
+			  smc_wr_tx_handler handler,
+			  struct smc_wr_v2_buf **wr_buf,
+			  struct smc_wr_tx_pend_priv **wr_pend_priv);
 int smc_wr_tx_put_slot(struct smc_link *link,
 		       struct smc_wr_tx_pend_priv *wr_pend_priv);
 int smc_wr_tx_send(struct smc_link *link,
 		   struct smc_wr_tx_pend_priv *wr_pend_priv);
+int smc_wr_tx_v2_send(struct smc_link *link,
+		      struct smc_wr_tx_pend_priv *priv, int len);
 int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv,
 			unsigned long timeout);
 void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 10/11] net/smc: add netlink support for SMC-Rv2
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
                   ` (8 preceding siblings ...)
  2021-10-14 16:47 ` [PATCH net-next v2 09/11] net/smc: extend LLC layer for SMC-Rv2 Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-14 16:47 ` [PATCH net-next v2 11/11] net/smc: stop links when their GID is removed Karsten Graul
  2021-10-16  8:03 ` [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support David Miller
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

Implement the netlink support for SMC-Rv2 related attributes that are
provided to user space.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 include/uapi/linux/smc.h | 17 ++++++-
 net/smc/smc_core.c       | 98 ++++++++++++++++++++++++++++++----------
 2 files changed, 88 insertions(+), 27 deletions(-)

diff --git a/include/uapi/linux/smc.h b/include/uapi/linux/smc.h
index b175bd0165a1..20f33b27787f 100644
--- a/include/uapi/linux/smc.h
+++ b/include/uapi/linux/smc.h
@@ -84,17 +84,28 @@ enum {
 	SMC_NLA_SYS_IS_ISM_V2,		/* u8 */
 	SMC_NLA_SYS_LOCAL_HOST,		/* string */
 	SMC_NLA_SYS_SEID,		/* string */
+	SMC_NLA_SYS_IS_SMCR_V2,		/* u8 */
 	__SMC_NLA_SYS_MAX,
 	SMC_NLA_SYS_MAX = __SMC_NLA_SYS_MAX - 1
 };
 
-/* SMC_NLA_LGR_V2 nested attributes */
+/* SMC_NLA_LGR_D_V2_COMMON and SMC_NLA_LGR_R_V2_COMMON nested attributes */
 enum {
 	SMC_NLA_LGR_V2_VER,		/* u8 */
 	SMC_NLA_LGR_V2_REL,		/* u8 */
 	SMC_NLA_LGR_V2_OS,		/* u8 */
 	SMC_NLA_LGR_V2_NEG_EID,		/* string */
 	SMC_NLA_LGR_V2_PEER_HOST,	/* string */
+	__SMC_NLA_LGR_V2_MAX,
+	SMC_NLA_LGR_V2_MAX = __SMC_NLA_LGR_V2_MAX - 1
+};
+
+/* SMC_NLA_LGR_R_V2 nested attributes */
+enum {
+	SMC_NLA_LGR_R_V2_UNSPEC,
+	SMC_NLA_LGR_R_V2_DIRECT,	/* u8 */
+	__SMC_NLA_LGR_R_V2_MAX,
+	SMC_NLA_LGR_R_V2_MAX = __SMC_NLA_LGR_R_V2_MAX - 1
 };
 
 /* SMC_GEN_LGR_SMCR attributes */
@@ -106,6 +117,8 @@ enum {
 	SMC_NLA_LGR_R_PNETID,		/* string */
 	SMC_NLA_LGR_R_VLAN_ID,		/* u8 */
 	SMC_NLA_LGR_R_CONNS_NUM,	/* u32 */
+	SMC_NLA_LGR_R_V2_COMMON,	/* nest */
+	SMC_NLA_LGR_R_V2,		/* nest */
 	__SMC_NLA_LGR_R_MAX,
 	SMC_NLA_LGR_R_MAX = __SMC_NLA_LGR_R_MAX - 1
 };
@@ -138,7 +151,7 @@ enum {
 	SMC_NLA_LGR_D_PNETID,		/* string */
 	SMC_NLA_LGR_D_CHID,		/* u16 */
 	SMC_NLA_LGR_D_PAD,		/* flag */
-	SMC_NLA_LGR_V2,			/* nest */
+	SMC_NLA_LGR_D_V2_COMMON,	/* nest */
 	__SMC_NLA_LGR_D_MAX,
 	SMC_NLA_LGR_D_MAX = __SMC_NLA_LGR_D_MAX - 1
 };
diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c
index 1ccab993683d..8e642f8f334f 100644
--- a/net/smc/smc_core.c
+++ b/net/smc/smc_core.c
@@ -244,6 +244,8 @@ int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb)
 		goto errattr;
 	if (nla_put_u8(skb, SMC_NLA_SYS_IS_ISM_V2, smc_ism_is_v2_capable()))
 		goto errattr;
+	if (nla_put_u8(skb, SMC_NLA_SYS_IS_SMCR_V2, true))
+		goto errattr;
 	smc_clc_get_hostname(&host);
 	if (host) {
 		memcpy(hostname, host, SMC_MAX_HOSTNAME_LEN);
@@ -271,12 +273,65 @@ int smc_nl_get_sys_info(struct sk_buff *skb, struct netlink_callback *cb)
 	return skb->len;
 }
 
+/* Fill SMC_NLA_LGR_D_V2_COMMON/SMC_NLA_LGR_R_V2_COMMON nested attributes */
+static int smc_nl_fill_lgr_v2_common(struct smc_link_group *lgr,
+				     struct sk_buff *skb,
+				     struct netlink_callback *cb,
+				     struct nlattr *v2_attrs)
+{
+	char smc_host[SMC_MAX_HOSTNAME_LEN + 1];
+	char smc_eid[SMC_MAX_EID_LEN + 1];
+
+	if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version))
+		goto errv2attr;
+	if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release))
+		goto errv2attr;
+	if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os))
+		goto errv2attr;
+	memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN);
+	smc_host[SMC_MAX_HOSTNAME_LEN] = 0;
+	if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host))
+		goto errv2attr;
+	memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN);
+	smc_eid[SMC_MAX_EID_LEN] = 0;
+	if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid))
+		goto errv2attr;
+
+	nla_nest_end(skb, v2_attrs);
+	return 0;
+
+errv2attr:
+	nla_nest_cancel(skb, v2_attrs);
+	return -EMSGSIZE;
+}
+
+static int smc_nl_fill_smcr_lgr_v2(struct smc_link_group *lgr,
+				   struct sk_buff *skb,
+				   struct netlink_callback *cb)
+{
+	struct nlattr *v2_attrs;
+
+	v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2);
+	if (!v2_attrs)
+		goto errattr;
+	if (nla_put_u8(skb, SMC_NLA_LGR_R_V2_DIRECT, !lgr->uses_gateway))
+		goto errv2attr;
+
+	nla_nest_end(skb, v2_attrs);
+	return 0;
+
+errv2attr:
+	nla_nest_cancel(skb, v2_attrs);
+errattr:
+	return -EMSGSIZE;
+}
+
 static int smc_nl_fill_lgr(struct smc_link_group *lgr,
 			   struct sk_buff *skb,
 			   struct netlink_callback *cb)
 {
 	char smc_target[SMC_MAX_PNETID_LEN + 1];
-	struct nlattr *attrs;
+	struct nlattr *attrs, *v2_attrs;
 
 	attrs = nla_nest_start(skb, SMC_GEN_LGR_SMCR);
 	if (!attrs)
@@ -296,6 +351,15 @@ static int smc_nl_fill_lgr(struct smc_link_group *lgr,
 	smc_target[SMC_MAX_PNETID_LEN] = 0;
 	if (nla_put_string(skb, SMC_NLA_LGR_R_PNETID, smc_target))
 		goto errattr;
+	if (lgr->smc_version > SMC_V1) {
+		v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_R_V2_COMMON);
+		if (!v2_attrs)
+			goto errattr;
+		if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs))
+			goto errattr;
+		if (smc_nl_fill_smcr_lgr_v2(lgr, skb, cb))
+			goto errattr;
+	}
 
 	nla_nest_end(skb, attrs);
 	return 0;
@@ -428,10 +492,7 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr,
 				struct sk_buff *skb,
 				struct netlink_callback *cb)
 {
-	char smc_host[SMC_MAX_HOSTNAME_LEN + 1];
 	char smc_pnet[SMC_MAX_PNETID_LEN + 1];
-	char smc_eid[SMC_MAX_EID_LEN + 1];
-	struct nlattr *v2_attrs;
 	struct nlattr *attrs;
 	void *nlh;
 
@@ -463,32 +524,19 @@ static int smc_nl_fill_smcd_lgr(struct smc_link_group *lgr,
 	smc_pnet[SMC_MAX_PNETID_LEN] = 0;
 	if (nla_put_string(skb, SMC_NLA_LGR_D_PNETID, smc_pnet))
 		goto errattr;
+	if (lgr->smc_version > SMC_V1) {
+		struct nlattr *v2_attrs;
 
-	v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_V2);
-	if (!v2_attrs)
-		goto errattr;
-	if (nla_put_u8(skb, SMC_NLA_LGR_V2_VER, lgr->smc_version))
-		goto errv2attr;
-	if (nla_put_u8(skb, SMC_NLA_LGR_V2_REL, lgr->peer_smc_release))
-		goto errv2attr;
-	if (nla_put_u8(skb, SMC_NLA_LGR_V2_OS, lgr->peer_os))
-		goto errv2attr;
-	memcpy(smc_host, lgr->peer_hostname, SMC_MAX_HOSTNAME_LEN);
-	smc_host[SMC_MAX_HOSTNAME_LEN] = 0;
-	if (nla_put_string(skb, SMC_NLA_LGR_V2_PEER_HOST, smc_host))
-		goto errv2attr;
-	memcpy(smc_eid, lgr->negotiated_eid, SMC_MAX_EID_LEN);
-	smc_eid[SMC_MAX_EID_LEN] = 0;
-	if (nla_put_string(skb, SMC_NLA_LGR_V2_NEG_EID, smc_eid))
-		goto errv2attr;
-
-	nla_nest_end(skb, v2_attrs);
+		v2_attrs = nla_nest_start(skb, SMC_NLA_LGR_D_V2_COMMON);
+		if (!v2_attrs)
+			goto errattr;
+		if (smc_nl_fill_lgr_v2_common(lgr, skb, cb, v2_attrs))
+			goto errattr;
+	}
 	nla_nest_end(skb, attrs);
 	genlmsg_end(skb, nlh);
 	return 0;
 
-errv2attr:
-	nla_nest_cancel(skb, v2_attrs);
 errattr:
 	nla_nest_cancel(skb, attrs);
 errout:
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 11/11] net/smc: stop links when their GID is removed
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
                   ` (9 preceding siblings ...)
  2021-10-14 16:47 ` [PATCH net-next v2 10/11] net/smc: add netlink support " Karsten Graul
@ 2021-10-14 16:47 ` Karsten Graul
  2021-10-16  8:03 ` [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support David Miller
  11 siblings, 0 replies; 13+ messages in thread
From: Karsten Graul @ 2021-10-14 16:47 UTC (permalink / raw)
  To: David Miller, Jakub Kicinski
  Cc: netdev, linux-s390, Heiko Carstens, linux-rdma

With SMC-Rv2 the GID is an IP address that can be deleted from the
device. When an IB_EVENT_GID_CHANGE event is provided then iterate over
all active links and check if their GID is still defined. Otherwise
stop the affected link.

Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
---
 net/smc/smc_ib.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c
index 32f9d2fdc474..d93055ec17ae 100644
--- a/net/smc/smc_ib.c
+++ b/net/smc/smc_ib.c
@@ -295,6 +295,58 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
 	return -ENODEV;
 }
 
+/* check if gid is still defined on smcibdev */
+static bool smc_ib_check_link_gid(u8 gid[SMC_GID_SIZE], bool smcrv2,
+				  struct smc_ib_device *smcibdev, u8 ibport)
+{
+	const struct ib_gid_attr *attr;
+	bool rc = false;
+	int i;
+
+	for (i = 0; !rc && i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
+		attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, i);
+		if (IS_ERR(attr))
+			continue;
+
+		rcu_read_lock();
+		if ((!smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE) ||
+		    (smcrv2 && attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP &&
+		     !(ipv6_addr_type((const struct in6_addr *)&attr->gid)
+				     & IPV6_ADDR_LINKLOCAL)))
+			if (!memcmp(gid, &attr->gid, SMC_GID_SIZE))
+				rc = true;
+		rcu_read_unlock();
+		rdma_put_gid_attr(attr);
+	}
+	return rc;
+}
+
+/* check all links if the gid is still defined on smcibdev */
+static void smc_ib_gid_check(struct smc_ib_device *smcibdev, u8 ibport)
+{
+	struct smc_link_group *lgr;
+	int i;
+
+	spin_lock_bh(&smc_lgr_list.lock);
+	list_for_each_entry(lgr, &smc_lgr_list.list, list) {
+		if (strncmp(smcibdev->pnetid[ibport - 1], lgr->pnet_id,
+			    SMC_MAX_PNETID_LEN))
+			continue; /* lgr is not affected */
+		if (list_empty(&lgr->list))
+			continue;
+		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
+			if (lgr->lnk[i].state == SMC_LNK_UNUSED ||
+			    lgr->lnk[i].smcibdev != smcibdev)
+				continue;
+			if (!smc_ib_check_link_gid(lgr->lnk[i].gid,
+						   lgr->smc_version == SMC_V2,
+						   smcibdev, ibport))
+				smcr_port_err(smcibdev, ibport);
+		}
+	}
+	spin_unlock_bh(&smc_lgr_list.lock);
+}
+
 static int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport)
 {
 	int rc;
@@ -333,6 +385,7 @@ static void smc_ib_port_event_work(struct work_struct *work)
 		} else {
 			clear_bit(port_idx, smcibdev->ports_going_away);
 			smcr_port_add(smcibdev, port_idx + 1);
+			smc_ib_gid_check(smcibdev, port_idx + 1);
 		}
 	}
 }
-- 
2.25.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support
  2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
                   ` (10 preceding siblings ...)
  2021-10-14 16:47 ` [PATCH net-next v2 11/11] net/smc: stop links when their GID is removed Karsten Graul
@ 2021-10-16  8:03 ` David Miller
  11 siblings, 0 replies; 13+ messages in thread
From: David Miller @ 2021-10-16  8:03 UTC (permalink / raw)
  To: kgraul; +Cc: kuba, netdev, linux-s390, hca, linux-rdma

From: Karsten Graul <kgraul@linux.ibm.com>
Date: Thu, 14 Oct 2021 18:47:41 +0200

> Please apply the following patch series for smc to netdev's net-next tree.
> 
> SMC-Rv2 support (see https://www.ibm.com/support/pages/node/6326337)
> provides routable RoCE support for SMC-R, eliminating the current
> same-subnet restriction, by exploiting the UDP encapsulation feature
> of the RoCE adapter hardware.
> 
> Patch 1 ("net/smc: improved fix wait on already cleared link") is
> already applied on netdevs net tree but its changes are needed for
> this series on net-next. The patch is unchanged compared to the
> version on the net tree.
> 
> v2: resend of the v1 patch series, and CC linux-rdma this time

This does not apply cleanly to net-next, please respin.

Thank you.


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2021-10-16  8:03 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-10-14 16:47 [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 01/11] net/smc: improved fix wait on already cleared link Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 02/11] net/smc: save stack space and allocate smc_init_info Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 03/11] net/smc: prepare for SMC-Rv2 connection Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 04/11] net/smc: add SMC-Rv2 connection establishment Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 05/11] net/smc: add listen processing for SMC-Rv2 Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 06/11] net/smc: add v2 format of CLC decline message Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 07/11] net/smc: retrieve v2 gid from IB device Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 08/11] net/smc: add v2 support to the work request layer Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 09/11] net/smc: extend LLC layer for SMC-Rv2 Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 10/11] net/smc: add netlink support " Karsten Graul
2021-10-14 16:47 ` [PATCH net-next v2 11/11] net/smc: stop links when their GID is removed Karsten Graul
2021-10-16  8:03 ` [PATCH net-next v2 00/11] net/smc: introduce SMC-Rv2 support David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.