dm-devel.redhat.com archive mirror
 help / color / mirror / Atom feed
From: Martin Wilck <mwilck@suse.com>
To: lixiaokeng <lixiaokeng@huawei.com>,
	Benjamin Marzinski <bmarzins@redhat.com>,
	Christophe Varoqui <christophe.varoqui@opensvc.com>
Cc: linfeilong <linfeilong@huawei.com>, dm-devel@redhat.com
Subject: Re: [dm-devel] [PATCH] multipathd: avoid crash in uevent_cleanup()
Date: Tue, 02 Mar 2021 17:55:19 +0100	[thread overview]
Message-ID: <50905e363e77c3c4c0d25eb5a742839a1caa1082.camel@suse.com> (raw)
In-Reply-To: <41e79d67f568baf8de6b28e4924620240f0a2731.camel@suse.com>

[-- Attachment #1: Type: text/plain, Size: 1006 bytes --]

Hi lixiaokeng,

On Tue, 2021-03-02 at 16:29 +0100, Martin Wilck wrote:
> On Tue, 2021-03-02 at 20:44 +0800, lixiaokeng wrote:
> > 
> > 
> 
> The stacks you have shown indicate that the instruction pointers were
> broken. That would suggest something similar to what was discussed in the ML
> thread leading to 38ffd89 ("libmultipath: prevent DSO unloading with
> astray checker threads"). Your logs show "tur checker refcount 1", so
> the next call to checker_put would have unloaded the DSO. 
> 
> Please try commenting out the dlclose() call in free_checker_class(),
> and see if it makes a difference.

I have two TENTATIVE patches here that I'd like to ask you to try (with
the dlclose in place again). Also, please make sure you've got 38ffd89.

This is really tentative, I'm still pretty much in the dark. But my
theory is that the crash can happen if the thread is about to start. So
the most important part is the hunk that checks the return value of
checker_class_ref() in start_checker_thread().

Martin



[-- Attachment #2: 0001-libmultipath-protect-DSO-unloading-with-RCU.patch --]
[-- Type: text/x-patch, Size: 4795 bytes --]

From a4dd64808d49f5a0d2a94336e56401262ef99e55 Mon Sep 17 00:00:00 2001
From: Martin Wilck <mwilck@suse.com>
Date: Tue, 2 Mar 2021 17:03:15 +0100
Subject: [PATCH 1/2] libmultipath: protect DSO unloading with RCU

Some crashes possibly related to DSO unloading are still observed.
Try protecting the unloading with RCU.

Signed-off-by: Martin Wilck <mwilck@suse.com>
---
 libmultipath/checkers.c | 79 ++++++++++++++++++++++++++++++-----------
 libmultipath/propsel.c  |  4 +++
 2 files changed, 63 insertions(+), 20 deletions(-)

diff --git a/libmultipath/checkers.c b/libmultipath/checkers.c
index 2dd9915..25f07ce 100644
--- a/libmultipath/checkers.c
+++ b/libmultipath/checkers.c
@@ -3,6 +3,7 @@
 #include <stddef.h>
 #include <dlfcn.h>
 #include <sys/stat.h>
+#include <errno.h>
 #include <urcu.h>
 #include <urcu/uatomic.h>
 
@@ -25,6 +26,7 @@ struct checker_class {
 	void *(*thread)(void *);	     /* async thread entry point */
 	const char **msgtable;
 	short msgtable_size;
+	struct rcu_head rcu;
 };
 
 static const char *checker_state_names[PATH_MAX_STATE] = {
@@ -74,20 +76,16 @@ static int checker_class_unref(struct checker_class *cls)
 	return uatomic_sub_return(&cls->refcount, 1);
 }
 
-void free_checker_class(struct checker_class *c)
+static void free_checker_class_rcu(struct rcu_head *head)
 {
-	int cnt;
+	struct checker_class *c = container_of(head, struct checker_class, rcu);
 
-	if (!c)
-		return;
-	cnt = checker_class_unref(c);
-	if (cnt != 0) {
-		condlog(cnt < 0 ? 1 : 4, "%s checker refcount %d",
-			c->name, cnt);
+	if (uatomic_read(&c-refcount) > 0) {
+		condlog(1, "%s: RACE: refcount = %d, not freeing checker",
+			__func__, refcount);
 		return;
 	}
 	condlog(3, "unloading %s checker", c->name);
-	list_del(&c->node);
 	if (c->reset)
 		c->reset();
 	if (c->handle) {
@@ -99,6 +97,22 @@ void free_checker_class(struct checker_class *c)
 	FREE(c);
 }
 
+static void free_checker_class(struct checker_class *c)
+{
+	int cnt;
+
+	if (!c)
+		return;
+	cnt = checker_class_unref(c);
+	if (cnt != 0) {
+		condlog(cnt < 0 ? 1 : 4, "%s checker refcount %d",
+			c->name, cnt);
+		return;
+	}
+	list_del(&c->node);
+	call_rcu(&c->rcu, free_checker_class_rcu);
+}
+
 void cleanup_checkers (void)
 {
 	struct checker_class *checker_loop;
@@ -111,15 +125,32 @@ void cleanup_checkers (void)
 
 static struct checker_class *checker_class_lookup(const char *name)
 {
-	struct checker_class *c;
+	struct checker_class *c, *found = NULL;
+	int refcount = 0;
 
 	if (!name || !strlen(name))
 		return NULL;
+
+	rcu_read_lock();
 	list_for_each_entry(c, &checkers, node) {
-		if (!strncmp(name, c->name, CHECKER_NAME_LEN))
-			return c;
+		if (!strncmp(name, c->name, CHECKER_NAME_LEN)) {
+			found = c;
+			break;
+		}
 	}
-	return NULL;
+	if (found) {
+		refcount = checker_class_ref(found);
+		if (refcount == 1)
+			checker_class_unref(found);
+	}
+	rcu_read_unlock();
+
+	if (refcount <= 1) {
+		condlog(1, "%s: RACE: got refcount == %d", __func__, refcount);
+		found = NULL;
+	}
+
+	return found;
 }
 
 void reset_checker_classes(void)
@@ -387,11 +418,20 @@ static void *checker_thread_entry(void *arg)
 int start_checker_thread(pthread_t *thread, const pthread_attr_t *attr,
 			 struct checker_context *ctx)
 {
-	int rv;
+	int rv, refcount;
 
 	assert(ctx && ctx->cls && ctx->cls->thread);
+
 	/* Take a ref here, lest the class be freed before the thread starts */
-	(void)checker_class_ref(ctx->cls);
+	rcu_read_lock();
+	refcount = checker_class_ref(ctx->cls);
+	if (refcount <= 1)
+		checker_class_unref(ctx->cls);
+	rcu_read_unlock();
+	if (refcount <= 1)
+		condlog(1, "%s: RACE: got refcount == %d", __func_, refcount);
+		return EIO;
+	}
 	rv = pthread_create(thread, attr, checker_thread_entry, ctx);
 	if (rv != 0) {
 		condlog(1, "failed to start checker thread for %s: %m",
@@ -418,14 +458,13 @@ void checker_get(const char *multipath_dir, struct checker *dst,
 
 	if (name && strlen(name)) {
 		src = checker_class_lookup(name);
-		if (!src)
+		if (!src) {
 			src = add_checker_class(multipath_dir, name);
+			if (src && checker_class_ref(src) == 1)
+				src = NULL;
+		}
 	}
 	dst->cls = src;
-	if (!src)
-		return;
-
-	(void)checker_class_ref(dst->cls);
 }
 
 int init_checkers(const char *multipath_dir)
diff --git a/libmultipath/propsel.c b/libmultipath/propsel.c
index f771a83..4add95a 100644
--- a/libmultipath/propsel.c
+++ b/libmultipath/propsel.c
@@ -536,6 +536,10 @@ int select_checker(struct config *conf, struct path *pp)
 	do_default(ckr_name, DEFAULT_CHECKER);
 out:
 	checker_get(conf->multipath_dir, c, ckr_name);
+	if (!checker_selected(c)) {
+		condlog(1, "%s: failed to grab checker", __func__);
+		return 1;
+	}
 	condlog(3, "%s: path_checker = %s %s", pp->dev,
 		checker_name(c), origin);
 	if (conf->checker_timeout) {
-- 
2.29.2


[-- Attachment #3: 0002-libmultipath-tur_thread-use-pthread_exit.patch --]
[-- Type: text/x-patch, Size: 2236 bytes --]

From c44375bb5e218b1e54ca4d9069b2b1632df87f75 Mon Sep 17 00:00:00 2001
From: Martin Wilck <mwilck@suse.com>
Date: Tue, 2 Mar 2021 17:05:26 +0100
Subject: [PATCH 2/2] libmultipath: tur_thread: use pthread_exit()

Using "return" would jump into a different DSO (libmultipath);
avoid that by calling pthread_exit() instead.

Signed-off-by: Martin Wilck <mwilck@suse.com>
---
 libmultipath/checkers.c     | 11 ++++++-----
 libmultipath/checkers/tur.c |  2 ++
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/libmultipath/checkers.c b/libmultipath/checkers.c
index 25f07ce..99e48bc 100644
--- a/libmultipath/checkers.c
+++ b/libmultipath/checkers.c
@@ -79,8 +79,9 @@ static int checker_class_unref(struct checker_class *cls)
 static void free_checker_class_rcu(struct rcu_head *head)
 {
 	struct checker_class *c = container_of(head, struct checker_class, rcu);
+	int refcount;
 
-	if (uatomic_read(&c-refcount) > 0) {
+	if ((refcount = uatomic_read(&c->refcount)) > 0) {
 		condlog(1, "%s: RACE: refcount = %d, not freeing checker",
 			__func__, refcount);
 		return;
@@ -145,7 +146,7 @@ static struct checker_class *checker_class_lookup(const char *name)
 	}
 	rcu_read_unlock();
 
-	if (refcount <= 1) {
+	if (refcount == 1) {
 		condlog(1, "%s: RACE: got refcount == %d", __func__, refcount);
 		found = NULL;
 	}
@@ -425,11 +426,11 @@ int start_checker_thread(pthread_t *thread, const pthread_attr_t *attr,
 	/* Take a ref here, lest the class be freed before the thread starts */
 	rcu_read_lock();
 	refcount = checker_class_ref(ctx->cls);
-	if (refcount <= 1)
+	if (refcount == 1)
 		checker_class_unref(ctx->cls);
 	rcu_read_unlock();
-	if (refcount <= 1)
-		condlog(1, "%s: RACE: got refcount == %d", __func_, refcount);
+	if (refcount <= 1) {
+		condlog(1, "%s: RACE: got refcount == %d", __func__, refcount);
 		return EIO;
 	}
 	rv = pthread_create(thread, attr, checker_thread_entry, ctx);
diff --git a/libmultipath/checkers/tur.c b/libmultipath/checkers/tur.c
index a4b4a21..0db50ba 100644
--- a/libmultipath/checkers/tur.c
+++ b/libmultipath/checkers/tur.c
@@ -284,6 +284,8 @@ void *libcheck_thread(struct checker_context *ctx)
 
 	tur_thread_cleanup_pop(ct);
 
+	pthread_exit(NULL);
+	/* not reached */
 	return ((void *)0);
 }
 
-- 
2.29.2


[-- Attachment #4: Type: text/plain, Size: 97 bytes --]

--
dm-devel mailing list
dm-devel@redhat.com
https://listman.redhat.com/mailman/listinfo/dm-devel

  reply	other threads:[~2021-03-02 16:55 UTC|newest]

Thread overview: 34+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-01-28 21:08 [dm-devel] [PATCH] multipathd: avoid crash in uevent_cleanup() mwilck
2021-02-02 20:52 ` Martin Wilck
2021-02-03 10:48   ` lixiaokeng
2021-02-03 13:57     ` Martin Wilck
2021-02-04  1:40       ` lixiaokeng
2021-02-04 15:06         ` Martin Wilck
2021-02-05 11:08           ` Martin Wilck
2021-02-05 11:09             ` Martin Wilck
2021-02-07  7:05             ` lixiaokeng
2021-03-01 14:53       ` lixiaokeng
2021-03-02  8:41         ` lixiaokeng
2021-03-02 11:07           ` Martin Wilck
2021-03-02 15:49             ` lixiaokeng
2021-03-02  9:56         ` Martin Wilck
2021-03-02 12:44           ` lixiaokeng
2021-03-02 15:29             ` Martin Wilck
2021-03-02 16:55               ` Martin Wilck [this message]
2021-03-03 10:42               ` lixiaokeng
2021-03-08  9:40                 ` Martin Wilck
2021-03-15 13:00                   ` Martin Wilck
2021-03-16 11:12                     ` lixiaokeng
2021-03-17 16:59                       ` Martin Wilck
2021-03-19  1:49                         ` lixiaokeng
2021-02-08  7:41     ` lixiaokeng
2021-02-08  9:50       ` Martin Wilck
2021-02-08 10:49         ` lixiaokeng
2021-02-08 11:03           ` Martin Wilck
2021-02-09  1:36             ` lixiaokeng
2021-02-09 17:30               ` Martin Wilck
2021-02-10  2:02                 ` lixiaokeng
2021-02-10  2:29                   ` Hexiaowen (Hex, EulerOS)
2021-02-19 10:35                     ` Martin Wilck
2021-02-19  1:36                 ` lixiaokeng
2021-02-02 22:23 ` Benjamin Marzinski

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=50905e363e77c3c4c0d25eb5a742839a1caa1082.camel@suse.com \
    --to=mwilck@suse.com \
    --cc=bmarzins@redhat.com \
    --cc=christophe.varoqui@opensvc.com \
    --cc=dm-devel@redhat.com \
    --cc=linfeilong@huawei.com \
    --cc=lixiaokeng@huawei.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).