From: "D.Wythe" <alibuda@linux.alibaba.com>
To: kgraul@linux.ibm.com, wenjia@linux.ibm.com, jaka@linux.ibm.com
Cc: kuba@kernel.org, davem@davemloft.net, netdev@vger.kernel.org, linux-s390@vger.kernel.org, linux-rdma@vger.kernel.org
Subject: [PATCH net-next v6 1/7] net/smc: remove locks smc_client_lgr_pending and smc_server_lgr_pending
Date: Sat, 26 Nov 2022 17:03:36 +0800
Message-Id: <1669453422-38152-2-git-send-email-alibuda@linux.alibaba.com>
X-Mailer: git-send-email 1.8.3.1
In-Reply-To: <1669453422-38152-1-git-send-email-alibuda@linux.alibaba.com>
References: <1669453422-38152-1-git-send-email-alibuda@linux.alibaba.com>
X-Mailing-List: linux-rdma@vger.kernel.org

From: "D. Wythe" <alibuda@linux.alibaba.com>

This patch removes the locks smc_client_lgr_pending and smc_server_lgr_pending, whose purpose is to serialize the creation of link groups. Once a link group already exists, however, these locks are meaningless; worse still, they force incoming connections to be queued one after another.

The creation of link groups is no longer decided by lock competition but by the following admission strategy (illustrative sketches follow the patch below):

1. Try to find a suitable link group. If one is found, the current connection is treated as a non-first-contact connection. Done.

2. Check the number of connections currently waiting for a suitable link group to be created. If it is not less than the number of link groups still to be created multiplied by (SMC_RMBS_PER_LGR_MAX - 1), increase the number of link groups to be created and treat the current connection as a first-contact connection. Done.

3. Increase the number of connections currently waiting and wait to be woken up.

4. Decrease the number of connections currently waiting, then go back to step 1.

Connections put to sleep in step 3 are woken up through the SMC link state change event: once a link moves out of the SMC_LNK_ACTIVATING state, the number of link groups to be created is decreased and at most (SMC_RMBS_PER_LGR_MAX - 1) connections are woken up.

In the implementation we introduce the concept of a lnk cluster, a collection of links sharing the same characteristics (see smcr_lgr_decision_maker_cmpfn() for details), which makes it possible to wake up waiters efficiently in the N-to-1 scenario.

Signed-off-by: D. 
Wythe --- net/smc/af_smc.c | 50 +++--- net/smc/smc_core.c | 450 ++++++++++++++++++++++++++++++++++++++++++++++++++++- net/smc/smc_core.h | 46 ++++++ 3 files changed, 508 insertions(+), 38 deletions(-) diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index e12d4fa..52287ee 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -53,12 +53,7 @@ #include "smc_tracepoint.h" #include "smc_sysctl.h" -static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group - * creation on server - */ -static DEFINE_MUTEX(smc_client_lgr_pending); /* serialize link group - * creation on client - */ +static DEFINE_MUTEX(smcd_buf_pending); /* serialize SMC-D buf creation */ static struct workqueue_struct *smc_tcp_ls_wq; /* wq for tcp listen work */ struct workqueue_struct *smc_hs_wq; /* wq for handshake work */ @@ -1197,10 +1192,8 @@ static int smc_connect_rdma(struct smc_sock *smc, if (reason_code) return reason_code; - mutex_lock(&smc_client_lgr_pending); reason_code = smc_conn_create(smc, ini); if (reason_code) { - mutex_unlock(&smc_client_lgr_pending); return reason_code; } @@ -1292,7 +1285,6 @@ static int smc_connect_rdma(struct smc_sock *smc, if (reason_code) goto connect_abort; } - mutex_unlock(&smc_client_lgr_pending); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; @@ -1302,7 +1294,6 @@ static int smc_connect_rdma(struct smc_sock *smc, return 0; connect_abort: smc_conn_abort(smc, ini->first_contact_local); - mutex_unlock(&smc_client_lgr_pending); smc->connect_nonblock = 0; return reason_code; @@ -1348,16 +1339,15 @@ static int smc_connect_ism(struct smc_sock *smc, } ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid; - /* there is only one lgr role for SMC-D; use server lock */ - mutex_lock(&smc_server_lgr_pending); rc = smc_conn_create(smc, ini); if (rc) { - mutex_unlock(&smc_server_lgr_pending); return rc; } + mutex_lock(&smcd_buf_pending); /* Create send and receive buffers */ rc = smc_buf_create(smc, true); + mutex_unlock(&smcd_buf_pending); if (rc) { rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM; goto connect_abort; @@ -1379,7 +1369,6 @@ static int smc_connect_ism(struct smc_sock *smc, aclc->hdr.version, eid, NULL); if (rc) goto connect_abort; - mutex_unlock(&smc_server_lgr_pending); smc_copy_sock_settings_to_clc(smc); smc->connect_nonblock = 0; @@ -1389,7 +1378,6 @@ static int smc_connect_ism(struct smc_sock *smc, return 0; connect_abort: smc_conn_abort(smc, ini->first_contact_local); - mutex_unlock(&smc_server_lgr_pending); smc->connect_nonblock = 0; return rc; @@ -1505,6 +1493,9 @@ static int __smc_connect(struct smc_sock *smc) SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc); smc_connect_ism_vlan_cleanup(smc, ini); + if (ini->first_contact_local) + smc_lgr_decision_maker_on_first_contact_success(smc, ini); + kfree(buf); kfree(ini); return 0; @@ -1513,6 +1504,8 @@ static int __smc_connect(struct smc_sock *smc) smc_connect_ism_vlan_cleanup(smc, ini); kfree(buf); fallback: + if (ini->first_contact_local) + smc_lgr_decision_maker_on_first_contact_fail(ini); kfree(ini); return smc_connect_decline_fallback(smc, rc, version); } @@ -2001,8 +1994,10 @@ static int smc_listen_ism_init(struct smc_sock *new_smc, if (rc) return rc; + mutex_lock(&smcd_buf_pending); /* Create send and receive buffers */ rc = smc_buf_create(new_smc, true); + mutex_unlock(&smcd_buf_pending); if (rc) { smc_conn_abort(new_smc, ini->first_contact_local); return (rc == -ENOSPC) ? 
SMC_CLC_DECL_MAX_DMB : @@ -2379,7 +2374,6 @@ static void smc_listen_work(struct work_struct *work) if (rc) goto out_decl; - mutex_lock(&smc_server_lgr_pending); smc_close_init(new_smc); smc_rx_init(new_smc); smc_tx_init(new_smc); @@ -2387,46 +2381,42 @@ static void smc_listen_work(struct work_struct *work) /* determine ISM or RoCE device used for connection */ rc = smc_listen_find_device(new_smc, pclc, ini); if (rc) - goto out_unlock; + goto out_decl; /* send SMC Accept CLC message */ accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version; rc = smc_clc_send_accept(new_smc, ini->first_contact_local, accept_version, ini->negotiated_eid); if (rc) - goto out_unlock; - - /* SMC-D does not need this lock any more */ - if (ini->is_smcd) - mutex_unlock(&smc_server_lgr_pending); + goto out_decl; /* receive SMC Confirm CLC message */ memset(buf, 0, sizeof(*buf)); cclc = (struct smc_clc_msg_accept_confirm *)buf; rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf), SMC_CLC_CONFIRM, CLC_WAIT_TIME); - if (rc) { - if (!ini->is_smcd) - goto out_unlock; + if (rc) goto out_decl; - } /* finish worker */ if (!ini->is_smcd) { rc = smc_listen_rdma_finish(new_smc, cclc, ini->first_contact_local, ini); if (rc) - goto out_unlock; - mutex_unlock(&smc_server_lgr_pending); + goto out_decl; } + smc_conn_leave_rtoken_pending(new_smc, ini); smc_conn_save_peer_info(new_smc, cclc); smc_listen_out_connected(new_smc); SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini); + if (ini->first_contact_local) + smc_lgr_decision_maker_on_first_contact_success(new_smc, ini); goto out_free; -out_unlock: - mutex_unlock(&smc_server_lgr_pending); out_decl: + smc_conn_leave_rtoken_pending(new_smc, ini); + if (ini && ini->first_contact_local) + smc_lgr_decision_maker_on_first_contact_fail(ini); smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0, proposal_version); out_free: diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index c305d8d..1c4d669 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -40,12 +40,289 @@ #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) +#define LDM_TBD 0 +#define LDM_RETRY 1 +#define LDM_FIRST_CONTACT 2 + struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), .list = LIST_HEAD_INIT(smc_lgr_list.list), .num = 0, }; +struct smc_lgr_decision_maker { + struct rhash_head rnode; /* node for rhashtable */ + struct wait_queue_head wq; /* queue for connection that have been + * decided to wait + */ + spinlock_t lock; /* protection for decision maker */ + refcount_t ref; /* refcount for decision maker */ + int type; /* smc type */ + int role; /* smc role */ + unsigned long pending_capability; + /* maximum pending number of connections that + * need to wait. 
+ */ + unsigned long conns_pending; + /* the number of pending connections */ +}; + +struct smcr_lgr_decision_maker { + struct smc_lgr_decision_maker ldm; + u8 peer_systemid[SMC_SYSTEMID_LEN]; + u8 peer_mac[ETH_ALEN]; /* = gid[8:10||13:15] */ + u8 peer_gid[SMC_GID_SIZE]; /* gid of peer*/ + int clcqpn; +}; + +struct smcd_lgr_decision_maker { + struct smc_lgr_decision_maker ldm; + u64 peer_gid; + struct smcd_dev *dev; +}; + +static int smcr_lgr_decision_maker_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct smcr_lgr_decision_maker *rldm = obj; + const struct smc_init_info *ini = arg->key; + + if (ini->role != rldm->ldm.role) + return 1; + + if (memcmp(ini->peer_systemid, rldm->peer_systemid, SMC_SYSTEMID_LEN)) + return 1; + + if (memcmp(ini->peer_gid, rldm->peer_gid, SMC_GID_SIZE)) + return 1; + + if ((ini->role == SMC_SERV || ini->ib_clcqpn == rldm->clcqpn) && + (ini->smcr_version == SMC_V2 || + !memcmp(ini->peer_mac, rldm->peer_mac, ETH_ALEN))) + return 0; + + return 1; +} + +static u32 smcr_lgr_decision_maker_hashfn(const void *data, u32 len, u32 seed) +{ + const struct smcr_lgr_decision_maker *rldm = data; + + return jhash2((u32 *)rldm->peer_systemid, SMC_SYSTEMID_LEN / sizeof(u32), seed) + + ((rldm->ldm.role == SMC_SERV) ? 0 : rldm->clcqpn); +} + +static u32 smcr_lgr_decision_maker_arg_hashfn(const void *data, u32 len, u32 seed) +{ + const struct smc_init_info *ini = data; + + return jhash2((u32 *)ini->peer_systemid, SMC_SYSTEMID_LEN / sizeof(u32), seed) + + ((ini->role == SMC_SERV) ? 0 : ini->ib_clcqpn); +} + +static void smcr_lgr_decision_maker_init(struct smc_lgr_decision_maker *ldm, + struct smc_init_info *ini) +{ + struct smcr_lgr_decision_maker *rldm = (struct smcr_lgr_decision_maker *)ldm; + + memcpy(rldm->peer_systemid, ini->peer_systemid, SMC_SYSTEMID_LEN); + memcpy(rldm->peer_gid, ini->peer_gid, SMC_GID_SIZE); + memcpy(rldm->peer_mac, ini->peer_mac, ETH_ALEN); + rldm->clcqpn = ini->ib_clcqpn; +} + +static int smcd_lgr_decision_maker_cmpfn(struct rhashtable_compare_arg *arg, const void *obj) +{ + const struct smcd_lgr_decision_maker *dldm = obj; + const struct smc_init_info *ini = arg->key; + + if (ini->role != dldm->ldm.role) + return 1; + + if (ini->ism_peer_gid[ini->ism_selected] != dldm->peer_gid) + return 1; + + if (ini->ism_dev[ini->ism_selected] != dldm->dev) + return 1; + + return 0; +} + +static u32 smcd_lgr_decision_maker_hashfn(const void *data, u32 len, u32 seed) +{ + const struct smcd_lgr_decision_maker *dlcm = data; + + return jhash2((u32 *)&dlcm->peer_gid, sizeof(dlcm->peer_gid) / sizeof(u32), seed); +} + +static u32 smcd_lgr_decision_maker_arg_hashfn(const void *data, u32 len, u32 seed) +{ + const struct smc_init_info *ini = data; + u64 select_gid; + + select_gid = ini->ism_peer_gid[ini->ism_selected]; + return jhash2((u32 *)&select_gid, sizeof(select_gid) / sizeof(u32), seed); +} + +static void smcd_lgr_decision_maker_init(struct smc_lgr_decision_maker *ldm, + struct smc_init_info *ini) +{ + struct smcd_lgr_decision_maker *dldm = (struct smcd_lgr_decision_maker *)ldm; + + dldm->peer_gid = ini->ism_peer_gid[ini->ism_selected]; + dldm->dev = ini->ism_dev[ini->ism_selected]; +} + +struct smc_lgr_decision_builder { + struct rhashtable map; + spinlock_t map_lock; /* protect map */ + struct rhashtable_params default_params; + /* how to serach and insert decision maker by ini */ + void (*init)(struct smc_lgr_decision_maker *ldm, struct smc_init_info *ini); + /* init maker by ini */ + u32 sz; /* size */ +}; + +static struct 
smc_lgr_decision_builder smc_lgr_decision_set[SMC_TYPE_D + 1] = { + /* SMC_TYPE_R = 0 */ + { + .sz = sizeof(struct smcr_lgr_decision_maker), + .init = smcr_lgr_decision_maker_init, + .map_lock = __SPIN_LOCK_UNLOCKED(smc_lgr_decision_set[SMC_TYPE_R].map_lock), + .default_params = { + .head_offset = offsetof(struct smc_lgr_decision_maker, rnode), + .key_len = sizeof(struct smc_init_info), + .obj_cmpfn = smcr_lgr_decision_maker_cmpfn, + .obj_hashfn = smcr_lgr_decision_maker_hashfn, + .hashfn = smcr_lgr_decision_maker_arg_hashfn, + .automatic_shrinking = true, + }, + }, + /* SMC_TYPE_D = 1 */ + { + .sz = sizeof(struct smcd_lgr_decision_maker), + .init = smcd_lgr_decision_maker_init, + .map_lock = __SPIN_LOCK_UNLOCKED(smc_lgr_decision_set[SMC_TYPE_D].map_lock), + .default_params = { + .head_offset = offsetof(struct smc_lgr_decision_maker, rnode), + .key_len = sizeof(struct smc_init_info), + .obj_cmpfn = smcd_lgr_decision_maker_cmpfn, + .obj_hashfn = smcd_lgr_decision_maker_hashfn, + .hashfn = smcd_lgr_decision_maker_arg_hashfn, + .automatic_shrinking = true, + }, + }, +}; + +/* hold a reference for smc_lgr_decision_maker */ +static inline void smc_lgr_decision_maker_hold(struct smc_lgr_decision_maker *ldm) +{ + if (likely(ldm)) + refcount_inc(&ldm->ref); +} + +/* release a reference for smc_lgr_decision_maker */ +static inline void smc_lgr_decision_maker_put(struct smc_lgr_decision_maker *ldm) +{ + bool do_free = false; + int type; + + if (unlikely(!ldm)) + return; + + if (refcount_dec_not_one(&ldm->ref)) + return; + + type = ldm->type; + + spin_lock_bh(&smc_lgr_decision_set[type].map_lock); + /* last ref */ + if (refcount_dec_and_test(&ldm->ref)) { + do_free = true; + rhashtable_remove_fast(&smc_lgr_decision_set[type].map, &ldm->rnode, + smc_lgr_decision_set[type].default_params); + } + spin_unlock_bh(&smc_lgr_decision_set[type].map_lock); + if (do_free) + kfree(ldm); +} + +static struct smc_lgr_decision_maker * +smc_get_or_create_lgr_decision_maker(struct smc_init_info *ini) +{ + struct smc_lgr_decision_maker *ldm; + int err, type; + + type = ini->is_smcd ? 
SMC_TYPE_D : SMC_TYPE_R; + + spin_lock_bh(&smc_lgr_decision_set[type].map_lock); + ldm = rhashtable_lookup_fast(&smc_lgr_decision_set[type].map, ini, + smc_lgr_decision_set[type].default_params); + if (!ldm) { + ldm = kzalloc(smc_lgr_decision_set[type].sz, GFP_ATOMIC); + if (unlikely(!ldm)) + goto fail; + + /* common init */ + spin_lock_init(&ldm->lock); + init_waitqueue_head(&ldm->wq); + refcount_set(&ldm->ref, 1); + ldm->type = type; + ldm->role = ini->role; + + /* init */ + if (smc_lgr_decision_set[type].init) + smc_lgr_decision_set[type].init(ldm, ini); + + err = rhashtable_insert_fast(&smc_lgr_decision_set[type].map, + &ldm->rnode, + smc_lgr_decision_set[type].default_params); + if (unlikely(err)) { + pr_warn_ratelimited("smc: rhashtable_insert_fast failed (%d)", err); + kfree(ldm); + ldm = NULL; + } + } else { + smc_lgr_decision_maker_hold(ldm); + } +fail: + spin_unlock_bh(&smc_lgr_decision_set[type].map_lock); + return ldm; +} + +void smc_lgr_decision_maker_on_first_contact_done(struct smc_init_info *ini, bool success) +{ + struct smc_lgr_decision_maker *ldm; + int nr; + + if (unlikely(!ini->first_contact_local)) + return; + + /* get lgr decision maker */ + ldm = ini->ldm; + + if (unlikely(!ldm)) + return; + + spin_lock_bh(&ldm->lock); + ldm->pending_capability -= (SMC_RMBS_PER_LGR_MAX - 1); + nr = SMC_RMBS_PER_LGR_MAX - 1; + if (unlikely(!success) && ldm->role == SMC_SERV) { + /* only to wake up one connection to perfrom + * first contact in server side, client MUST wake up + * all to decline. + */ + nr = min(1, nr); + } + if (nr) + __wake_up(&ldm->wq, TASK_NORMAL, nr, + success ? (void *)LDM_RETRY : (void *)LDM_FIRST_CONTACT); + + spin_unlock_bh(&ldm->lock); + + /* hold in smc_lgr_create */ + smc_lgr_decision_maker_put(ldm); +} + static atomic_t lgr_cnt = ATOMIC_INIT(0); /* number of existing link groups */ static DECLARE_WAIT_QUEUE_HEAD(lgrs_deleted); @@ -756,6 +1033,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->link_id = smcr_next_link_id(lgr); lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ + rwlock_init(&lnk->rtokens_lock); lnk->link_idx = link_idx; lnk->wr_rx_id_compl = 0; smc_ibdev_cnt_inc(lnk); @@ -915,6 +1193,11 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) atomic_inc(&lgr_cnt); } smc->conn.lgr = lgr; + + lgr->ldm = ini->ldm; + /* smc_lgr_decision_maker_put in __smc_lgr_free() */ + smc_lgr_decision_maker_hold(lgr->ldm); + spin_lock_bh(lgr_lock); list_add_tail(&lgr->list, lgr_list); spin_unlock_bh(lgr_lock); @@ -1364,6 +1647,9 @@ static void __smc_lgr_free(struct smc_link_group *lgr) if (!atomic_dec_return(&lgr_cnt)) wake_up(&lgrs_deleted); } + /* smc_lgr_decision_maker_hold in smc_lgr_create() */ + if (lgr->ldm) + smc_lgr_decision_maker_put(lgr->ldm); kfree(lgr); } @@ -1824,6 +2110,9 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, lgr->role != role) return false; + if (!READ_ONCE(lgr->first_contact_done)) + return false; + for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) { lnk = &lgr->lnk[i]; @@ -1844,16 +2133,39 @@ static bool smcr_lgr_match(struct smc_link_group *lgr, u8 smcr_version, static bool smcd_lgr_match(struct smc_link_group *lgr, struct smcd_dev *smcismdev, u64 peer_gid) { - return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev; + return lgr->peer_gid == peer_gid && lgr->smcd == smcismdev && + READ_ONCE(lgr->first_contact_done); +} + +static int smc_ldm_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, int sync, + void *key) +{ + 
struct smc_init_info *ini = wq_entry->private; + int ret; + + wq_entry->private = ini->private; + ini->advise = (u8)(uintptr_t)key; + + ret = woken_wake_function(wq_entry, mode, sync, NULL); + if (ret) { + /* only wake up once */ + list_del_init_careful(&wq_entry->entry); + if (likely(ini->ldm)) + ini->ldm->conns_pending--; + } + return ret; } /* create a new SMC connection (and a new link group if necessary) */ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) { + struct smc_lgr_decision_maker *ldm = NULL; struct smc_connection *conn = &smc->conn; struct net *net = sock_net(&smc->sk); + struct wait_queue_entry wait; struct list_head *lgr_list; struct smc_link_group *lgr; + int timeo = CLC_WAIT_TIME; enum smc_lgr_role role; spinlock_t *lgr_lock; int rc = 0; @@ -1864,12 +2176,40 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) &smc_lgr_list.lock; ini->first_contact_local = 1; role = smc->listen_smc ? SMC_SERV : SMC_CLNT; - if (role == SMC_CLNT && ini->first_contact_peer) + ini->role = role; + + ldm = smc_get_or_create_lgr_decision_maker(ini); + if (unlikely(!ldm)) + return SMC_CLC_DECL_INTERR; + + /* Considering a scenario, after find out the SMCDv2 device, a potential failures + * occur in smc_find_rdma_v2_device_serv, for example smc_buf_create failed. And then, + * it will continue to search for SMCDv1 devices or SMCR devices. Hence, + * smc_conn_create will called again with the same ini, if the ini performs the first + * contact logic, which means that the lgr that should have been created by it must be + * failed, and we need to actively trigger the failed logic here. + */ + if (unlikely(ini->ldm)) { + smc_lgr_decision_maker_on_first_contact_fail(ini); + ini->ldm = NULL; + } + + if (role == SMC_CLNT && ini->first_contact_peer) { + spin_lock_bh(&ldm->lock); + ldm->pending_capability += (SMC_RMBS_PER_LGR_MAX - 1); + spin_unlock_bh(&ldm->lock); /* create new link group as well */ goto create; + } /* determine if an existing link group can be reused */ spin_lock_bh(lgr_lock); + spin_lock(&ldm->lock); + +again: + /* init advise */ + ini->advise = LDM_TBD; + list_for_each_entry(lgr, lgr_list, list) { write_lock_bh(&lgr->conns_lock); if ((ini->is_smcd ? 
@@ -1884,21 +2224,82 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) lgr->vlan_id == ini->vlan_id) && (role == SMC_CLNT || ini->is_smcd || (lgr->conns_num < SMC_RMBS_PER_LGR_MAX && - !bitmap_full(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX)))) { + (SMC_RMBS_PER_LGR_MAX - + bitmap_weight(lgr->rtokens_used_mask, SMC_RMBS_PER_LGR_MAX) + > atomic_read(&lgr->rtoken_pendings))))) { /* link group found */ ini->first_contact_local = 0; conn->lgr = lgr; rc = smc_lgr_register_conn(conn, false); write_unlock_bh(&lgr->conns_lock); - if (!rc && delayed_work_pending(&lgr->free_work)) - cancel_delayed_work(&lgr->free_work); + if (!rc) { + smc_conn_enter_rtoken_pending(smc, ini); + if (delayed_work_pending(&lgr->free_work)) + cancel_delayed_work(&lgr->free_work); + } break; } write_unlock_bh(&lgr->conns_lock); } + + /* not found */ + if (!rc && ini->first_contact_local) { + if (timeo && ldm->pending_capability > ldm->conns_pending) { + /* record pending connection, release + * in smc_lgr_decision_maker_on_first_contact_done() or after timeout + */ + ldm->conns_pending++; + + /* used in ldm_wakeup, clear after remove from queue */ + ini->ldm = ldm; + + /* init wait entry */ + init_wait_entry(&wait, 0); + /* replace wait with new private & func */ + ini->private = wait.private; + wait.private = ini; + wait.func = smc_ldm_wake_function; + + /* add to wq */ + add_wait_queue_exclusive(&ldm->wq, &wait); + + spin_unlock(&ldm->lock); + spin_unlock_bh(lgr_lock); + + /* wait woken */ + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); + + spin_lock_bh(lgr_lock); + spin_lock(&ldm->lock); + + ini->ldm = NULL; + + /* remove from wq */ + remove_wait_queue(&ldm->wq, &wait); + + /* timeout */ + if (unlikely(!timeo || ini->advise == LDM_TBD)) { + if (ini->advise == LDM_TBD) { + ldm->conns_pending--; + ini->advise = LDM_RETRY; + } + } + } else { + ini->advise = LDM_FIRST_CONTACT; + } + + if (ini->advise == LDM_RETRY) + goto again; + + /* do first contact */ + ldm->pending_capability += (SMC_RMBS_PER_LGR_MAX - 1); + } + + spin_unlock(&ldm->lock); spin_unlock_bh(lgr_lock); + if (rc) - return rc; + goto out; if (role == SMC_CLNT && !ini->first_contact_peer && ini->first_contact_local) { @@ -1906,11 +2307,15 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) * a new one * send out_of_sync decline, reason synchr. 
error */ - return SMC_CLC_DECL_SYNCERR; + rc = SMC_CLC_DECL_SYNCERR; + goto out; } create: if (ini->first_contact_local) { + ini->ldm = ldm; + /* smc_lgr_decision_maker_put in first_contact_done() */ + smc_lgr_decision_maker_hold(ldm); rc = smc_lgr_create(smc, ini); if (rc) goto out; @@ -1943,6 +2348,8 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini) #endif out: + /* smc_lgr_decision_maker_hold in smc_get_or_create_lgr_decision_make() */ + smc_lgr_decision_maker_put(ldm); return rc; } @@ -2505,19 +2912,24 @@ int smc_rtoken_add(struct smc_link *lnk, __be64 nw_vaddr, __be32 nw_rkey) u32 rkey = ntohl(nw_rkey); int i; + write_lock_bh(&lnk->rtokens_lock); for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && lgr->rtokens[i][lnk->link_idx].dma_addr == dma_addr && test_bit(i, lgr->rtokens_used_mask)) { /* already in list */ + write_unlock_bh(&lnk->rtokens_lock); return i; } } i = smc_rmb_reserve_rtoken_idx(lgr); - if (i < 0) + if (i < 0) { + write_unlock_bh(&lnk->rtokens_lock); return i; + } lgr->rtokens[i][lnk->link_idx].rkey = rkey; lgr->rtokens[i][lnk->link_idx].dma_addr = dma_addr; + write_unlock_bh(&lnk->rtokens_lock); return i; } @@ -2528,6 +2940,7 @@ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) u32 rkey = ntohl(nw_rkey); int i, j; + write_lock_bh(&lnk->rtokens_lock); for (i = 0; i < SMC_RMBS_PER_LGR_MAX; i++) { if (lgr->rtokens[i][lnk->link_idx].rkey == rkey && test_bit(i, lgr->rtokens_used_mask)) { @@ -2536,9 +2949,11 @@ int smc_rtoken_delete(struct smc_link *lnk, __be32 nw_rkey) lgr->rtokens[i][j].dma_addr = 0; } clear_bit(i, lgr->rtokens_used_mask); + write_unlock_bh(&lnk->rtokens_lock); return 0; } } + write_unlock_bh(&lnk->rtokens_lock); return -ENOENT; } @@ -2604,12 +3019,31 @@ static int smc_core_reboot_event(struct notifier_block *this, int __init smc_core_init(void) { + int i; + + /* init smc lgr decision maker builder */ + for (i = 0; i <= SMC_TYPE_D; i++) + rhashtable_init(&smc_lgr_decision_set[i].map, + &smc_lgr_decision_set[i].default_params); + return register_reboot_notifier(&smc_reboot_notifier); } +static void smc_lgr_decision_maker_free_cb(void *ptr, void *arg) +{ + kfree(ptr); +} + /* Called (from smc_exit) when module is removed */ void smc_core_exit(void) { + int i; + unregister_reboot_notifier(&smc_reboot_notifier); smc_lgrs_shutdown(); + + /* destroy smc lgr decision maker builder */ + for (i = 0; i <= SMC_TYPE_D; i++) + rhashtable_free_and_destroy(&smc_lgr_decision_set[i].map, + smc_lgr_decision_maker_free_cb, NULL); } diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 285f9bd..e9adf4f 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -107,6 +108,7 @@ struct smc_link { u32 wr_tx_cnt; /* number of WR send buffers */ wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */ atomic_t wr_tx_refcnt; /* tx refs to link */ + rwlock_t rtokens_lock; struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */ struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */ @@ -244,17 +246,23 @@ struct smc_llc_flow { struct smc_llc_qentry *qentry; }; +struct smc_lgr_decision_maker; + struct smc_link_group { struct list_head list; struct rb_root conns_all; /* connection tree */ rwlock_t conns_lock; /* protects conns_all */ unsigned int conns_num; /* current # of connections */ + atomic_t rtoken_pendings;/* number of connection that + * lgr assigned but no rtoken got yet + */ unsigned short vlan_id; /* vlan id of 
link group */ struct list_head sndbufs[SMC_RMBE_SIZES];/* tx buffers */ struct mutex sndbufs_lock; /* protects tx buffers */ struct list_head rmbs[SMC_RMBE_SIZES]; /* rx buffers */ struct mutex rmbs_lock; /* protects rx buffers */ + u8 first_contact_done; /* if first contact succeed */ u8 id[SMC_LGR_ID_SIZE]; /* unique lgr id */ struct delayed_work free_work; /* delayed freeing of an lgr */ @@ -335,6 +343,8 @@ struct smc_link_group { /* peer triggered shutdownn */ }; }; + struct smc_lgr_decision_maker *ldm; + /* who decides to create this lgr */ }; struct smc_clc_msg_local; @@ -373,6 +383,9 @@ struct smc_init_info { unsigned short vlan_id; u32 rc; u8 negotiated_eid[SMC_MAX_EID_LEN]; + struct smc_lgr_decision_maker *ldm; + u8 advise; + void *private; /* SMC-R */ u8 smcr_version; u8 check_smcrv2; @@ -391,6 +404,7 @@ struct smc_init_info { u8 ism_offered_cnt; /* # of ISM devices offered */ u8 ism_selected; /* index of selected ISM dev*/ u8 smcd_version; + u8 role; }; /* Find the connection associated with the given alert token in the link group. @@ -559,6 +573,38 @@ struct smc_link *smc_switch_conns(struct smc_link_group *lgr, int smcr_nl_get_link(struct sk_buff *skb, struct netlink_callback *cb); int smcd_nl_get_lgr(struct sk_buff *skb, struct netlink_callback *cb); +void smc_lgr_decision_maker_on_first_contact_done(struct smc_init_info *ini, bool success); + +static inline void smc_lgr_decision_maker_on_first_contact_success(struct smc_sock *smc, + struct smc_init_info *ini) +{ + smc->conn.lgr->first_contact_done = 1; + /* make sure first_contact_done can be seen after wakeup */ + smp_mb(); + smc_lgr_decision_maker_on_first_contact_done(ini, 1 /* success */); +} + +static inline void smc_lgr_decision_maker_on_first_contact_fail(struct smc_init_info *ini) +{ + smc_lgr_decision_maker_on_first_contact_done(ini, 0 /* failed */); +} + +static inline void smc_conn_enter_rtoken_pending(struct smc_sock *smc, struct smc_init_info *ini) +{ + struct smc_link_group *lgr = smc->conn.lgr; + + if (lgr && !ini->first_contact_local) + atomic_inc(&lgr->rtoken_pendings); +} + +static inline void smc_conn_leave_rtoken_pending(struct smc_sock *smc, struct smc_init_info *ini) +{ + struct smc_link_group *lgr = smc->conn.lgr; + + if (lgr && !ini->first_contact_local) + atomic_dec(&lgr->rtoken_pendings); +} + static inline struct smc_link_group *smc_get_lgr(struct smc_link *link) { return link->lgr; -- 1.8.3.1
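For reviewers who want the admission strategy in one place: below is a hedged, simplified sketch of the per-decision-maker bookkeeping done in smc_conn_create(). The helper decide_admission(), the ADMIT_* values and struct ldm_state are illustrative names only, all locking is omitted, and the value of SMC_RMBS_PER_LGR_MAX is assumed.

/* Hedged sketch of the admission bookkeeping; decide_admission(),
 * ADMIT_* and struct ldm_state are illustrative, not from the patch.
 */
#include <linux/types.h>

#define SMC_RMBS_PER_LGR_MAX 255	/* assumed per-lgr connection limit */

struct ldm_state {
	unsigned long pending_capability; /* slots promised by in-flight first contacts */
	unsigned long conns_pending;	  /* connections currently sleeping and waiting */
};

enum admission { ADMIT_REUSE, ADMIT_FIRST_CONTACT, ADMIT_WAIT };

/* Step 1 is the lgr lookup (not shown); this covers steps 2 and 3. */
static enum admission decide_admission(struct ldm_state *ldm, bool lgr_found)
{
	if (lgr_found)
		return ADMIT_REUSE;		/* step 1: non-first-contact */

	if (ldm->pending_capability <= ldm->conns_pending) {
		/* No in-flight first contact has room left for us: become a
		 * first contact ourselves (step 2) and promise capacity for
		 * the followers we will later be able to admit.
		 */
		ldm->pending_capability += SMC_RMBS_PER_LGR_MAX - 1;
		return ADMIT_FIRST_CONTACT;
	}

	/* step 3: sleep until woken; the waker decrements conns_pending
	 * when it consumes this slot (not shown here).
	 */
	ldm->conns_pending++;
	return ADMIT_WAIT;
}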
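The decision makers themselves live in an rhashtable and are looked up or created under a spinlock, with plain refcounting for lifetime. Below is a condensed sketch of that get-or-create/put pattern under generic names (struct node, node_map, node_params are illustrative; the real objects are keyed by fields of struct smc_init_info and use custom hash/compare callbacks). It assumes rhashtable_init(&node_map, &node_params) has been called at init time, as the patch does for its tables in smc_core_init().

/* Hedged sketch of the refcounted get-or-create pattern used for the
 * decision makers. "node", "node_map" and "node_params" are illustrative.
 */
#include <linux/rhashtable.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct node {
	struct rhash_head rnode;
	refcount_t ref;
	u32 key;
};

static struct rhashtable node_map;
static DEFINE_SPINLOCK(node_lock);

static const struct rhashtable_params node_params = {
	.head_offset = offsetof(struct node, rnode),
	.key_offset  = offsetof(struct node, key),
	.key_len     = sizeof(u32),
	.automatic_shrinking = true,
};

static struct node *node_get_or_create(u32 key)
{
	struct node *n;

	spin_lock_bh(&node_lock);
	n = rhashtable_lookup_fast(&node_map, &key, node_params);
	if (n) {
		refcount_inc(&n->ref);		/* reuse existing entry */
	} else {
		n = kzalloc(sizeof(*n), GFP_ATOMIC);
		if (n) {
			n->key = key;
			refcount_set(&n->ref, 1);
			if (rhashtable_insert_fast(&node_map, &n->rnode,
						   node_params)) {
				kfree(n);
				n = NULL;
			}
		}
	}
	spin_unlock_bh(&node_lock);
	return n;
}

static void node_put(struct node *n)
{
	bool do_free = false;

	if (refcount_dec_not_one(&n->ref))	/* fast path: not the last ref */
		return;
	spin_lock_bh(&node_lock);
	if (refcount_dec_and_test(&n->ref)) {	/* last ref: unlink and free */
		rhashtable_remove_fast(&node_map, &n->rnode, node_params);
		do_free = true;
	}
	spin_unlock_bh(&node_lock);
	if (do_free)
		kfree(n);
}

Taking the map lock around the final refcount_dec_and_test() closes the race between a lookup that re-holds the object and a concurrent release, which is why the fast path only uses refcount_dec_not_one().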
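The wake-up path hands each sleeping connection a verdict (retry the lgr lookup, or perform first contact itself) through the wake key, using a custom wake function on top of wait_woken()/woken_wake_function(), as smc_ldm_wake_function() does. A minimal sketch of that pattern follows, with illustrative names (struct waiter_ctx, verdict_wake_function(), wait_for_verdict(), post_verdict()):

/* Hedged sketch of passing a per-wakeup verdict to exclusive waiters via
 * the wake key. All names below are illustrative.
 */
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/list.h>

struct waiter_ctx {
	void *saved_private;	/* original wait entry private (the task) */
	unsigned long verdict;	/* filled in by the waker via the wake key */
};

static int verdict_wake_function(struct wait_queue_entry *wq_entry,
				 unsigned int mode, int sync, void *key)
{
	struct waiter_ctx *ctx = wq_entry->private;
	int ret;

	/* restore the task pointer so woken_wake_function() can wake it */
	wq_entry->private = ctx->saved_private;
	ctx->verdict = (unsigned long)key;

	ret = woken_wake_function(wq_entry, mode, sync, NULL);
	if (ret)
		/* consume exactly one wakeup: drop out of the queue */
		list_del_init(&wq_entry->entry);
	return ret;
}

static unsigned long wait_for_verdict(struct wait_queue_head *wq, long timeo)
{
	struct wait_queue_entry wait;
	struct waiter_ctx ctx = { .verdict = 0 };

	init_wait_entry(&wait, 0);
	ctx.saved_private = wait.private;	/* current task */
	wait.private = &ctx;
	wait.func = verdict_wake_function;

	add_wait_queue_exclusive(wq, &wait);
	timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo);
	remove_wait_queue(wq, &wait);

	return timeo ? ctx.verdict : 0;		/* 0 means timeout in this sketch */
}

/* waker side: wake at most nr exclusive waiters, handing them "verdict" */
static void post_verdict(struct wait_queue_head *wq, int nr, unsigned long verdict)
{
	__wake_up(wq, TASK_NORMAL, nr, (void *)verdict);
}

Because the waiters are added with add_wait_queue_exclusive() and the wake function removes the entry once it has woken its task, __wake_up(..., nr, key) wakes at most nr waiters and each wakeup is consumed exactly once.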
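The server-side reuse check is also relaxed: rather than requiring the rtoken bitmap to be non-full, the patch counts free slots and reserves them against connections that already hold the lgr but have not registered an rtoken yet (lgr->rtoken_pendings). A hedged sketch of just that arithmetic; has_free_rtoken_slot() is an illustrative helper, not in the patch:

#include <linux/bitmap.h>
#include <linux/atomic.h>

#define SMC_RMBS_PER_LGR_MAX 255	/* assumed number of rtoken slots per lgr */

static bool has_free_rtoken_slot(const unsigned long *rtokens_used_mask,
				 const atomic_t *rtoken_pendings)
{
	unsigned int used = bitmap_weight(rtokens_used_mask, SMC_RMBS_PER_LGR_MAX);

	/* free slots must outnumber connections still waiting to add a token */
	return SMC_RMBS_PER_LGR_MAX - used > atomic_read(rtoken_pendings);
}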
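With the big pending mutexes gone, concurrent CLC confirms can add or delete rtokens on the same link in parallel, which is what the new per-link rtokens_lock serializes. The sketch below shows the locking shape on a simplified token table; struct rtoken_table, rtoken_add() and rtoken_delete() are illustrative names, and the rwlock is assumed to be initialized at link setup, as the patch does in smcr_link_init().

#include <linux/bitmap.h>
#include <linux/errno.h>
#include <linux/spinlock.h>
#include <linux/types.h>

#define NTOKENS 255			/* assumed, mirrors SMC_RMBS_PER_LGR_MAX */

struct rtoken_table {
	rwlock_t lock;			/* protects rkey[] and used[] */
	u32 rkey[NTOKENS];
	DECLARE_BITMAP(used, NTOKENS);
};

static int rtoken_add(struct rtoken_table *t, u32 rkey)
{
	int i;

	write_lock_bh(&t->lock);
	i = find_first_zero_bit(t->used, NTOKENS);
	if (i < NTOKENS) {
		t->rkey[i] = rkey;
		set_bit(i, t->used);
	} else {
		i = -ENOSPC;
	}
	write_unlock_bh(&t->lock);
	return i;
}

static int rtoken_delete(struct rtoken_table *t, u32 rkey)
{
	int i, rc = -ENOENT;

	write_lock_bh(&t->lock);
	for_each_set_bit(i, t->used, NTOKENS) {
		if (t->rkey[i] == rkey) {
			clear_bit(i, t->used);
			rc = 0;
			break;
		}
	}
	write_unlock_bh(&t->lock);
	return rc;
}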
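Finally, reuse of a link group is gated on first_contact_done: the flag is published with a full barrier before waiters are woken, and smcr_lgr_match()/smcd_lgr_match() read it with READ_ONCE(). A tiny sketch of that publication/consumption pair, with simplified names (struct lgr_stub, publish_first_contact(), lgr_usable() are illustrative):

#include <linux/compiler.h>
#include <linux/types.h>
#include <asm/barrier.h>

struct lgr_stub {
	u8 first_contact_done;
};

/* writer: first contact finished successfully, publish before waking */
static void publish_first_contact(struct lgr_stub *lgr)
{
	lgr->first_contact_done = 1;
	smp_mb();	/* make the flag visible before waiters are woken */
	/* ... __wake_up() the decision maker's waitqueue here ... */
}

/* reader: only reuse a link group whose first contact already completed */
static bool lgr_usable(struct lgr_stub *lgr)
{
	return READ_ONCE(lgr->first_contact_done);
}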