From: Jack Wang <jinpu.wang@cloud.ionos.com>
To: linux-rdma@vger.kernel.org
Cc: bvanassche@acm.org, leon@kernel.org, dledford@redhat.com,
jgg@ziepe.ca, danil.kipnis@cloud.ionos.com,
jinpu.wang@cloud.ionos.com,
Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Subject: [PATCHv2 for-next 05/12] RDMA/rtrs-srv: don't guard the whole __alloc_srv with srv_mutex
Date: Fri, 23 Oct 2020 09:43:46 +0200 [thread overview]
Message-ID: <20201023074353.21946-6-jinpu.wang@cloud.ionos.com> (raw)
In-Reply-To: <20201023074353.21946-1-jinpu.wang@cloud.ionos.com>
From: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
The purpose of srv_mutex is to protect srv_list as in put_srv, so no need
to hold it when allocate memory for srv since it could be time consuming.
Otherwise if one machine has limited memory, rsrv_close_work could be
blocked for a longer time due to the mutex is held by get_or_create_srv
since it can't get memory in time.
[13327733.807781] INFO: task kworker/1:1:27478 blocked for more than 120 seconds.
[13327733.854623] Tainted: G O 4.14.171-1-storage #4.14.171-1.3~deb9
[13327733.902461] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[13327733.953930] kworker/1:1 D 0 27478 2 0x80000000
[13327733.953938] Workqueue: rtrs_server_wq rtrs_srv_close_work [rtrs_server]
[13327733.953939] Call Trace:
[13327733.953945] ? __schedule+0x38c/0x7e0
[13327733.953946] schedule+0x32/0x80
[13327733.953948] schedule_preempt_disabled+0xa/0x10
[13327733.953949] __mutex_lock.isra.2+0x25e/0x4d0
[13327733.953954] ? put_srv+0x44/0x100 [rtrs_server]
[13327733.953958] put_srv+0x44/0x100 [rtrs_server]
[13327733.953961] rtrs_srv_close_work+0x16c/0x280 [rtrs_server]
[13327733.953966] process_one_work+0x1c5/0x3c0
[13327733.953969] worker_thread+0x47/0x3e0
[13327733.953970] kthread+0xfc/0x130
[13327733.953972] ? trace_event_raw_event_workqueue_execute_start+0xa0/0xa0
[13327733.953973] ? kthread_create_on_node+0x70/0x70
[13327733.953974] ret_from_fork+0x1f/0x30
Let's move all the logics from __find_srv_and_get and __alloc_srv to
get_or_create_srv, and remove the two functions. Then it should be safe
for multiple processes to access the same srv since it is protected
with srv_mutex.
And since we don't want to allocate chunks with srv_mutex held, let's
check the srv->refcount after get srv because the chunks could not be
allocated yet.
Fixes: 9cb837480424 ("RDMA/rtrs: server: main functionality")
Signed-off-by: Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
---
drivers/infiniband/ulp/rtrs/rtrs-srv.c | 86 +++++++++++---------------
1 file changed, 37 insertions(+), 49 deletions(-)
diff --git a/drivers/infiniband/ulp/rtrs/rtrs-srv.c b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
index d6f93601712e..1cb778aff3c5 100644
--- a/drivers/infiniband/ulp/rtrs/rtrs-srv.c
+++ b/drivers/infiniband/ulp/rtrs/rtrs-srv.c
@@ -1328,17 +1328,42 @@ static void rtrs_srv_dev_release(struct device *dev)
kfree(srv);
}
-static struct rtrs_srv *__alloc_srv(struct rtrs_srv_ctx *ctx,
- const uuid_t *paths_uuid)
+static void free_srv(struct rtrs_srv *srv)
+{
+ int i;
+
+ WARN_ON(refcount_read(&srv->refcount));
+ for (i = 0; i < srv->queue_depth; i++)
+ mempool_free(srv->chunks[i], chunk_pool);
+ kfree(srv->chunks);
+ mutex_destroy(&srv->paths_mutex);
+ mutex_destroy(&srv->paths_ev_mutex);
+ /* last put to release the srv structure */
+ put_device(&srv->dev);
+}
+
+static struct rtrs_srv *get_or_create_srv(struct rtrs_srv_ctx *ctx,
+ const uuid_t *paths_uuid)
{
struct rtrs_srv *srv;
int i;
+ mutex_lock(&ctx->srv_mutex);
+ list_for_each_entry(srv, &ctx->srv_list, ctx_list) {
+ if (uuid_equal(&srv->paths_uuid, paths_uuid) &&
+ refcount_inc_not_zero(&srv->refcount)) {
+ mutex_unlock(&ctx->srv_mutex);
+ return srv;
+ }
+ }
+
+ /* need to allocate a new srv */
srv = kzalloc(sizeof(*srv), GFP_KERNEL);
- if (!srv)
+ if (!srv) {
+ mutex_unlock(&ctx->srv_mutex);
return NULL;
+ }
- refcount_set(&srv->refcount, 1);
INIT_LIST_HEAD(&srv->paths_list);
mutex_init(&srv->paths_mutex);
mutex_init(&srv->paths_ev_mutex);
@@ -1347,6 +1372,8 @@ static struct rtrs_srv *__alloc_srv(struct rtrs_srv_ctx *ctx,
srv->ctx = ctx;
device_initialize(&srv->dev);
srv->dev.release = rtrs_srv_dev_release;
+ list_add(&srv->ctx_list, &ctx->srv_list);
+ mutex_unlock(&ctx->srv_mutex);
srv->chunks = kcalloc(srv->queue_depth, sizeof(*srv->chunks),
GFP_KERNEL);
@@ -1358,7 +1385,7 @@ static struct rtrs_srv *__alloc_srv(struct rtrs_srv_ctx *ctx,
if (!srv->chunks[i])
goto err_free_chunks;
}
- list_add(&srv->ctx_list, &ctx->srv_list);
+ refcount_set(&srv->refcount, 1);
return srv;
@@ -1369,52 +1396,9 @@ static struct rtrs_srv *__alloc_srv(struct rtrs_srv_ctx *ctx,
err_free_srv:
kfree(srv);
-
return NULL;
}
-static void free_srv(struct rtrs_srv *srv)
-{
- int i;
-
- WARN_ON(refcount_read(&srv->refcount));
- for (i = 0; i < srv->queue_depth; i++)
- mempool_free(srv->chunks[i], chunk_pool);
- kfree(srv->chunks);
- mutex_destroy(&srv->paths_mutex);
- mutex_destroy(&srv->paths_ev_mutex);
- /* last put to release the srv structure */
- put_device(&srv->dev);
-}
-
-static inline struct rtrs_srv *__find_srv_and_get(struct rtrs_srv_ctx *ctx,
- const uuid_t *paths_uuid)
-{
- struct rtrs_srv *srv;
-
- list_for_each_entry(srv, &ctx->srv_list, ctx_list) {
- if (uuid_equal(&srv->paths_uuid, paths_uuid) &&
- refcount_inc_not_zero(&srv->refcount))
- return srv;
- }
-
- return NULL;
-}
-
-static struct rtrs_srv *get_or_create_srv(struct rtrs_srv_ctx *ctx,
- const uuid_t *paths_uuid)
-{
- struct rtrs_srv *srv;
-
- mutex_lock(&ctx->srv_mutex);
- srv = __find_srv_and_get(ctx, paths_uuid);
- if (!srv)
- srv = __alloc_srv(ctx, paths_uuid);
- mutex_unlock(&ctx->srv_mutex);
-
- return srv;
-}
-
static void put_srv(struct rtrs_srv *srv)
{
if (refcount_dec_and_test(&srv->refcount)) {
@@ -1813,7 +1797,11 @@ static int rtrs_rdma_connect(struct rdma_cm_id *cm_id,
}
recon_cnt = le16_to_cpu(msg->recon_cnt);
srv = get_or_create_srv(ctx, &msg->paths_uuid);
- if (!srv) {
+ /*
+ * "refcount == 0" happens if a previous thread calls get_or_create_srv
+ * allocate srv, but chunks of srv are not allocated yet.
+ */
+ if (!srv || refcount_read(&srv->refcount) == 0) {
err = -ENOMEM;
goto reject_w_err;
}
--
2.25.1
next prev parent reply other threads:[~2020-10-23 7:44 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-10-23 7:43 [PATCHv2 for-next 00/12] rtrs: misc fix and cleanup Jack Wang
2020-10-23 7:43 ` [PATCHv2 for-next 01/12] RDMA/rtrs-clt: remove destroy_con_cq_qp in case route resolving failed Jack Wang
2020-10-23 7:43 ` [PATCHv2 for-next 02/12] RDMA/rtrs-clt: remove outdated comment in create_con_cq_qp Jack Wang
2020-10-23 7:43 ` [PATCHv2 for-next 03/12] RDMA/rtrs-clt: avoid run destroy_con_cq_qp/create_con_cq_qp in parallel Jack Wang
2020-10-23 7:43 ` [PATCHv2 for-next 04/12] RDMA/rtrs-clt: missing error from rtrs_rdma_conn_established Jack Wang
2020-10-23 7:43 ` Jack Wang [this message]
2020-10-23 7:43 ` [PATCHv2 for-next 06/12] RDMA/rtrs-srv: fix typo Jack Wang
2020-10-23 7:43 ` [PATCHv2 for-next 07/12] RDMA/rtrs: remove unnecessary argument dir of rtrs_iu_free Jack Wang
2020-10-23 7:43 ` [PATCHv2 for-next 08/12] RDMA/rtrs-clt: remove duplicated switch-case handling for CM error events Jack Wang
2020-10-23 7:43 ` [PATCHv2 for-next 09/12] RDMA/rtrs-clt: remove duplicated code Jack Wang
2020-10-23 7:43 ` [PATCHv2 for-next 10/12] RDMA/rtrs-srv: kill rtrs_srv_change_state_get_old Jack Wang
2020-10-23 7:43 ` [PATCHv2 for-next 11/12] RDMA/rtrs: introduce rtrs_post_send Jack Wang
2020-10-23 7:43 ` [PATCHv2 for-next 12/12] RDMA/rtrs-clt: remove 'addr' from rtrs_clt_add_path_to_arr Jack Wang
2020-10-28 16:28 ` [PATCHv2 for-next 00/12] rtrs: misc fix and cleanup Jason Gunthorpe
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20201023074353.21946-6-jinpu.wang@cloud.ionos.com \
--to=jinpu.wang@cloud.ionos.com \
--cc=bvanassche@acm.org \
--cc=danil.kipnis@cloud.ionos.com \
--cc=dledford@redhat.com \
--cc=guoqing.jiang@cloud.ionos.com \
--cc=jgg@ziepe.ca \
--cc=leon@kernel.org \
--cc=linux-rdma@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).