All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/5] multipath-tools series: coredump and memory leak bugfix
@ 2020-08-18 12:59 lixiaokeng
  2020-08-18 13:02 ` [PATCH 1/5 v4] libmultipath fix a memory leak in set_ble_device lixiaokeng
                   ` (5 more replies)
  0 siblings, 6 replies; 20+ messages in thread
From: lixiaokeng @ 2020-08-18 12:59 UTC (permalink / raw)
  To: Benjamin Marzinski, Martin Wilck, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

When I learn the multipath-tools source code and test it, I
find some bugs and fix them.

repo: openSUSE/multipath-tools
repo link: https://github.com/openSUSE/multipath-tools
branch: upstream-queue

lixiaokeng (5):
  libmultipath: fix a memory leak in set_ble_device
  libmultipath: fix NULL dereference in select_action
  multipathd: add reclear_pp_from_mpp in ev_remove_path
  multipathd: disable queueing for recreated map in uev_remove_map
  libmultipath: fix daemon memory leak in disassemble_map

 libmultipath/blacklist.c | 79 +++++++++++++++++++++-------------------
 libmultipath/blacklist.h |  4 +-
 libmultipath/configure.c | 30 +++++++++++----
 libmultipath/dmparser.c  | 14 ++++++-
 multipathd/main.c        | 28 ++++++++++++++
 tests/blacklist.c        | 31 +++++++---------
 6 files changed, 119 insertions(+), 67 deletions(-)

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 1/5 v4] libmultipath fix a memory leak in set_ble_device
  2020-08-18 12:59 [PATCH 0/5] multipath-tools series: coredump and memory leak bugfix lixiaokeng
@ 2020-08-18 13:02 ` lixiaokeng
  2020-08-18 15:57   ` Martin Wilck
  2020-08-18 13:06 ` [PATCH 2/5] libmultipath fix NULL dereference in select_action lixiaokeng
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 20+ messages in thread
From: lixiaokeng @ 2020-08-18 13:02 UTC (permalink / raw)
  To: Benjamin Marzinski, Martin Wilck, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

In set_ble_device func, if blist is NULL or ble is NULL,
the vendor and product isn't freed. We think it is not
reasonable that strdup(XXX) is used as set_ble_device
and store_ble functions' parameter.

Here we call strdup() in store_ble and set_ble_device
functions and the string will be free if functions fail.
Because constant string like "sdb" will be their parameter,
char * is changed to const char *. This is base on
upstream-queue branch in openSUSE/multipath-tools.

The type of ble->vendor_reg is regex_t struct but not a
pointer, so it can not be set NULL.

Signed-off-by: Lixiaokeng <lixiaokeng@huawei.com>
Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
---
 libmultipath/blacklist.c | 79 +++++++++++++++++++++-------------------
 libmultipath/blacklist.h |  4 +-
 tests/blacklist.c        | 31 +++++++---------
 3 files changed, 57 insertions(+), 57 deletions(-)

diff --git a/libmultipath/blacklist.c b/libmultipath/blacklist.c
index db58ccca..fd89a8e2 100644
--- a/libmultipath/blacklist.c
+++ b/libmultipath/blacklist.c
@@ -29,14 +29,19 @@ char *check_invert(char *str, bool *invert)
 	return str;
 }

-int store_ble(vector blist, char * str, int origin)
+int store_ble(vector blist, const char *str, int origin)
 {
 	struct blentry * ble;
 	char *regex_str;
+	char *strdup_str = NULL;

 	if (!str)
 		return 0;

+	strdup_str = strdup(str);
+	if (!strdup_str)
+		return 1;
+
 	if (!blist)
 		goto out;

@@ -45,21 +50,21 @@ int store_ble(vector blist, char * str, int origin)
 	if (!ble)
 		goto out;

-	regex_str = check_invert(str, &ble->invert);
+	regex_str = check_invert(strdup_str, &ble->invert);
 	if (regcomp(&ble->regex, regex_str, REG_EXTENDED|REG_NOSUB))
 		goto out1;

 	if (!vector_alloc_slot(blist))
 		goto out1;

-	ble->str = str;
+	ble->str = strdup_str;
 	ble->origin = origin;
 	vector_set_slot(blist, ble);
 	return 0;
 out1:
 	FREE(ble);
 out:
-	FREE(str);
+	FREE(strdup_str);
 	return 1;
 }

@@ -79,10 +84,12 @@ int alloc_ble_device(vector blist)
 	return 0;
 }

-int set_ble_device(vector blist, char * vendor, char * product, int origin)
+int set_ble_device(vector blist, const char *vendor, const char *product, int origin)
 {
 	struct blentry_device * ble;
 	char *regex_str;
+	char *vendor_str = NULL;
+	char *product_str = NULL;

 	if (!blist)
 		return 1;
@@ -93,31 +100,38 @@ int set_ble_device(vector blist, char * vendor, char * product, int origin)
 		return 1;

 	if (vendor) {
-		regex_str = check_invert(vendor, &ble->vendor_invert);
-		if (regcomp(&ble->vendor_reg, regex_str,
-			    REG_EXTENDED|REG_NOSUB)) {
-			FREE(vendor);
-			if (product)
-				FREE(product);
-			return 1;
-		}
-		ble->vendor = vendor;
+		vendor_str = STRDUP(vendor);
+		if (!vendor_str)
+			goto out;
+
+		regex_str = check_invert(vendor_str, &ble->vendor_invert);
+		if (regcomp(&ble->vendor_reg, regex_str, REG_EXTENDED|REG_NOSUB))
+			goto out;
+
+		ble->vendor = vendor_str;
 	}
 	if (product) {
-		regex_str = check_invert(product, &ble->product_invert);
-		if (regcomp(&ble->product_reg, regex_str,
-			    REG_EXTENDED|REG_NOSUB)) {
-			FREE(product);
-			if (vendor) {
-				ble->vendor = NULL;
-				FREE(vendor);
-			}
-			return 1;
-		}
-		ble->product = product;
+		product_str = STRDUP(product);
+		if (!product_str)
+			goto out1;
+
+		regex_str = check_invert(product_str, &ble->product_invert);
+		if (regcomp(&ble->product_reg, regex_str, REG_EXTENDED|REG_NOSUB))
+			goto out1;
+
+		ble->product = product_str;
 	}
 	ble->origin = origin;
 	return 0;
+out1:
+	if (vendor) {
+		regfree(&ble->vendor_reg);
+		ble->vendor = NULL;
+	}
+out:
+	free(vendor_str);
+	free(product_str);
+	return 1;
 }

 static int
@@ -178,19 +192,12 @@ setup_default_blist (struct config * conf)
 {
 	struct blentry * ble;
 	struct hwentry *hwe;
-	char * str;
 	int i;

-	str = STRDUP("!^(sd[a-z]|dasd[a-z]|nvme[0-9])");
-	if (!str)
-		return 1;
-	if (store_ble(conf->blist_devnode, str, ORIGIN_DEFAULT))
+	if (store_ble(conf->blist_devnode, "!^(sd[a-z]|dasd[a-z]|nvme[0-9])", ORIGIN_DEFAULT))
 		return 1;

-	str = STRDUP("(SCSI_IDENT_|ID_WWN)");
-	if (!str)
-		return 1;
-	if (store_ble(conf->elist_property, str, ORIGIN_DEFAULT))
+	if (store_ble(conf->elist_property, "(SCSI_IDENT_|ID_WWN)", ORIGIN_DEFAULT))
 		return 1;

 	vector_foreach_slot (conf->hwtable, hwe, i) {
@@ -202,9 +209,7 @@ setup_default_blist (struct config * conf)
 				return 1;
 			ble = VECTOR_SLOT(conf->blist_device,
 					  VECTOR_SIZE(conf->blist_device) - 1);
-			if (set_ble_device(conf->blist_device,
-					   STRDUP(hwe->vendor),
-					   STRDUP(hwe->bl_product),
+			if (set_ble_device(conf->blist_device, hwe->vendor, hwe->bl_product,
 					   ORIGIN_DEFAULT)) {
 				FREE(ble);
 				vector_del_slot(conf->blist_device, VECTOR_SIZE(conf->blist_device) - 1);
diff --git a/libmultipath/blacklist.h b/libmultipath/blacklist.h
index 4305857d..b08e0978 100644
--- a/libmultipath/blacklist.h
+++ b/libmultipath/blacklist.h
@@ -42,8 +42,8 @@ int filter_device (vector, vector, char *, char *, char *);
 int filter_path (struct config *, struct path *);
 int filter_property(struct config *, struct udev_device *, int, const char*);
 int filter_protocol(vector, vector, struct path *);
-int store_ble (vector, char *, int);
-int set_ble_device (vector, char *, char *, int);
+int store_ble (vector, const char *, int);
+int set_ble_device (vector, const char *, const char *, int);
 void free_blacklist (vector);
 void free_blacklist_device (vector);
 void merge_blacklist(vector);
diff --git a/tests/blacklist.c b/tests/blacklist.c
index d5c40898..84a3ba2f 100644
--- a/tests/blacklist.c
+++ b/tests/blacklist.c
@@ -94,67 +94,62 @@ static int setup(void **state)

 	blist_devnode_sdb = vector_alloc();
 	if (!blist_devnode_sdb ||
-	    store_ble(blist_devnode_sdb, strdup("sdb"), ORIGIN_CONFIG))
+	    store_ble(blist_devnode_sdb, "sdb", ORIGIN_CONFIG))
 		return -1;
 	blist_devnode_sdb_inv = vector_alloc();
 	if (!blist_devnode_sdb_inv ||
-	    store_ble(blist_devnode_sdb_inv, strdup("!sdb"), ORIGIN_CONFIG))
+	    store_ble(blist_devnode_sdb_inv, "!sdb", ORIGIN_CONFIG))
 		return -1;

 	blist_all = vector_alloc();
-	if (!blist_all || store_ble(blist_all, strdup(".*"), ORIGIN_CONFIG))
+	if (!blist_all || store_ble(blist_all, ".*", ORIGIN_CONFIG))
 		return -1;

 	blist_device_foo_bar = vector_alloc();
 	if (!blist_device_foo_bar || alloc_ble_device(blist_device_foo_bar) ||
-	    set_ble_device(blist_device_foo_bar, strdup("foo"), strdup("bar"),
-			   ORIGIN_CONFIG))
+	    set_ble_device(blist_device_foo_bar, "foo", "bar", ORIGIN_CONFIG))
 		return -1;
 	blist_device_foo_inv_bar = vector_alloc();
 	if (!blist_device_foo_inv_bar ||
 	    alloc_ble_device(blist_device_foo_inv_bar) ||
-	    set_ble_device(blist_device_foo_inv_bar, strdup("!foo"),
-			   strdup("bar"), ORIGIN_CONFIG))
+	    set_ble_device(blist_device_foo_inv_bar, "!foo", "bar", ORIGIN_CONFIG))
 		return -1;
 	blist_device_foo_bar_inv = vector_alloc();
 	if (!blist_device_foo_bar_inv ||
 	    alloc_ble_device(blist_device_foo_bar_inv) ||
-	    set_ble_device(blist_device_foo_bar_inv, strdup("foo"),
-			   strdup("!bar"), ORIGIN_CONFIG))
+	    set_ble_device(blist_device_foo_bar_inv, "foo", "!bar", ORIGIN_CONFIG))
 		return -1;

 	blist_device_all = vector_alloc();
 	if (!blist_device_all || alloc_ble_device(blist_device_all) ||
-	    set_ble_device(blist_device_all, strdup(".*"), strdup(".*"),
-			   ORIGIN_CONFIG))
+	    set_ble_device(blist_device_all, ".*", ".*", ORIGIN_CONFIG))
 		return -1;

 	blist_wwid_xyzzy = vector_alloc();
 	if (!blist_wwid_xyzzy ||
-	    store_ble(blist_wwid_xyzzy, strdup("xyzzy"), ORIGIN_CONFIG))
+	    store_ble(blist_wwid_xyzzy, "xyzzy", ORIGIN_CONFIG))
 		return -1;
 	blist_wwid_xyzzy_inv = vector_alloc();
 	if (!blist_wwid_xyzzy_inv ||
-	    store_ble(blist_wwid_xyzzy_inv, strdup("!xyzzy"), ORIGIN_CONFIG))
+	    store_ble(blist_wwid_xyzzy_inv, "!xyzzy", ORIGIN_CONFIG))
 		return -1;

 	blist_protocol_fcp = vector_alloc();
 	if (!blist_protocol_fcp ||
-	    store_ble(blist_protocol_fcp, strdup("scsi:fcp"), ORIGIN_CONFIG))
+	    store_ble(blist_protocol_fcp, "scsi:fcp", ORIGIN_CONFIG))
 		return -1;
 	blist_protocol_fcp_inv = vector_alloc();
 	if (!blist_protocol_fcp_inv ||
-	    store_ble(blist_protocol_fcp_inv, strdup("!scsi:fcp"),
-		      ORIGIN_CONFIG))
+	    store_ble(blist_protocol_fcp_inv, "!scsi:fcp", ORIGIN_CONFIG))
 		return -1;

 	blist_property_wwn = vector_alloc();
 	if (!blist_property_wwn ||
-	    store_ble(blist_property_wwn, strdup("ID_WWN"), ORIGIN_CONFIG))
+	    store_ble(blist_property_wwn, "ID_WWN", ORIGIN_CONFIG))
 		return -1;
 	blist_property_wwn_inv = vector_alloc();
 	if (!blist_property_wwn_inv ||
-	    store_ble(blist_property_wwn_inv, strdup("!ID_WWN"), ORIGIN_CONFIG))
+	    store_ble(blist_property_wwn_inv, "!ID_WWN", ORIGIN_CONFIG))
 		return -1;

 	return 0;
--

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 2/5] libmultipath fix NULL dereference in select_action
  2020-08-18 12:59 [PATCH 0/5] multipath-tools series: coredump and memory leak bugfix lixiaokeng
  2020-08-18 13:02 ` [PATCH 1/5 v4] libmultipath fix a memory leak in set_ble_device lixiaokeng
@ 2020-08-18 13:06 ` lixiaokeng
  2020-08-18 16:28   ` Martin Wilck
  2020-08-18 13:08 ` [PATCH 3/5] multipathd: add reclear_pp_from_mpp in ev_remove_path lixiaokeng
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 20+ messages in thread
From: lixiaokeng @ 2020-08-18 13:06 UTC (permalink / raw)
  To: Benjamin Marzinski, Martin Wilck, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

I got a multipath segfault while running iscsi login/logout and following scripts in parallel:

#!/bin/bash
interval=1
while true
do
              multipath -F &> /dev/null
              multipath -r &> /dev/null
              multipath -v2 &> /dev/null
              multipath -ll &> /dev/null
              sleep $interval
done

This is the debuginfo:
Core was generated by `multipath -v2'.
Program terminated with signal SIGSEGV, Segmentation fault.
#0  0x00007fc5c5c8dedf in select_action (mpp=0x55a94f1f8980, curmp=<optimized out>,
    force_reload=<optimized out>) at configure.c:709
709		if (!cmpp->pg || VECTOR_SIZE(cmpp->pg) != VECTOR_SIZE(mpp->pg)) { {

(gdb) bt
#0  0x00007fc5c5c8dedf in select_action (mpp=0x55a94f1f8980, curmp=<optimized out>,
    force_reload=<optimized out>) at configure.c:709
#1  0x00007fc5c5c8fd0f in coalesce_paths (vecs=0x7ffff1c1c220, newmp=0x0, refwwid=0x0,
    force_reload=0, cmd=CMD_CREATE) at configure.c:1090
#2  0x000055a94e9f524d in configure (devpath=<optimized out>, dev_type=DEV_NONE,
    cmd=CMD_CREATE, conf=0x55a94f1b92d0) at main.c:757
#3  main (argc=<optimized out>, argv=<optimized out>) at main.c:1100
(gdb) p *cmpp
$1 = {
  wwid = "3600140566411a6d4873434fba988066f\000\070\060\066\066f", '\000' <repeats 88 times>,
  ...
  paths = 0x0, pg = 0x0, dmi = 0x55a94f22c3f0, alias = 0x55a94f205fb0 "mpathc",
  alias_prefix = 0x0, selector = 0x0, features = 0x0, hwhandler = 0x0, mpe = 0x0,
  hwe = 0x0, waiter = 0, stat_switchgroup = 0, stat_path_failures = 0, stat_map_loads = 0,
  ...
  generic_mp = {ops = 0x7fc5c5cada40 <dm_gen_multipath_ops>}}

There are many NULL pointers in cmpp such as selector, features, hwhandler and so on.
Here these on mpp is not NULL  but we can not be sure that won't be in mpp, so we
add checking pointers of both mpp and cmpp before using them.

The changes like "mpp->features != cmpp->features" make sure that mpp->action is not
set to ACT_RELOAD when they both equal NULL. Other changes check pointers don't equal
NULL. When only one is NULL, we think there is some difference between mpp and cmpp,
so mpp->action should be set to ACT_RELOAD.

Signed-off-by: Lixiaokeng <lixiaokeng@huawei.com>
Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
---
 libmultipath/configure.c | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/libmultipath/configure.c b/libmultipath/configure.c
index 96c79610..e02e7ef4 100644
--- a/libmultipath/configure.c
+++ b/libmultipath/configure.c
@@ -726,16 +726,20 @@ select_action (struct multipath * mpp, vector curmp, int force_reload)
 	}

 	if (mpp->no_path_retry != NO_PATH_RETRY_UNDEF &&
+	    mpp->features != cmpp->features &&
+	    (!mpp->features || !cmpp->features ||
 	    !!strstr(mpp->features, "queue_if_no_path") !=
-	    !!strstr(cmpp->features, "queue_if_no_path")) {
+	    !!strstr(cmpp->features, "queue_if_no_path"))) {
 		mpp->action =  ACT_RELOAD;
 		condlog(3, "%s: set ACT_RELOAD (no_path_retry change)",
 			mpp->alias);
 		return;
 	}
-	if ((mpp->retain_hwhandler != RETAIN_HWHANDLER_ON ||
+	if ((mpp->retain_hwhandler != RETAIN_HWHANDLER_ON || !cmpp->hwhandler ||
 	     strcmp(cmpp->hwhandler, "0") == 0) &&
-	    (strlen(cmpp->hwhandler) != strlen(mpp->hwhandler) ||
+	     mpp->hwhandler != cmpp->hwhandler &&
+	     (!mpp->hwhandler || !cmpp->hwhandler||
+	     strlen(cmpp->hwhandler) != strlen(mpp->hwhandler) ||
 	     strncmp(cmpp->hwhandler, mpp->hwhandler,
 		    strlen(mpp->hwhandler)))) {
 		mpp->action = ACT_RELOAD;
@@ -745,8 +749,10 @@ select_action (struct multipath * mpp, vector curmp, int force_reload)
 	}

 	if (mpp->retain_hwhandler != RETAIN_HWHANDLER_UNDEF &&
+	    mpp->features != cmpp->features &&
+	    (!mpp->features || !cmpp->features ||
 	    !!strstr(mpp->features, "retain_attached_hw_handler") !=
-	    !!strstr(cmpp->features, "retain_attached_hw_handler") &&
+	    !!strstr(cmpp->features, "retain_attached_hw_handler")) &&
 	    get_linux_version_code() < KERNEL_VERSION(4, 3, 0)) {
 		mpp->action = ACT_RELOAD;
 		condlog(3, "%s: set ACT_RELOAD (retain_hwhandler change)",
@@ -754,8 +760,16 @@ select_action (struct multipath * mpp, vector curmp, int force_reload)
 		return;
 	}

-	cmpp_feat = STRDUP(cmpp->features);
-	mpp_feat = STRDUP(mpp->features);
+	if (!cmpp->features ) {
+		cmpp_feat = NULL;
+	} else {
+		cmpp_feat = STRDUP(cmpp->features);
+	}
+	if (!mpp->features ) {
+		mpp_feat = NULL;
+	} else {
+		mpp_feat = STRDUP(mpp->features);
+	}
 	if (cmpp_feat && mpp_feat) {
 		remove_feature(&mpp_feat, "queue_if_no_path");
 		remove_feature(&mpp_feat, "retain_attached_hw_handler");
@@ -770,8 +784,8 @@ select_action (struct multipath * mpp, vector curmp, int force_reload)
 	FREE(cmpp_feat);
 	FREE(mpp_feat);

-	if (!cmpp->selector || strncmp(cmpp->selector, mpp->selector,
-		    strlen(mpp->selector))) {
+	if (mpp->selector != cmpp->selector && (!cmpp->selector || !mpp->selector ||
+		    strncmp(cmpp->selector, mpp->selector,strlen(mpp->selector)))) {
 		mpp->action = ACT_RELOAD;
 		condlog(3, "%s: set ACT_RELOAD (selector change)",
 			mpp->alias);
-- 

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 3/5] multipathd: add reclear_pp_from_mpp in ev_remove_path
  2020-08-18 12:59 [PATCH 0/5] multipath-tools series: coredump and memory leak bugfix lixiaokeng
  2020-08-18 13:02 ` [PATCH 1/5 v4] libmultipath fix a memory leak in set_ble_device lixiaokeng
  2020-08-18 13:06 ` [PATCH 2/5] libmultipath fix NULL dereference in select_action lixiaokeng
@ 2020-08-18 13:08 ` lixiaokeng
  2020-08-18 16:36   ` Martin Wilck
  2020-08-18 13:09 ` [PATCH 4/5] multipathd: disable queueing for recreated map in uev_remove_map lixiaokeng
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 20+ messages in thread
From: lixiaokeng @ 2020-08-18 13:08 UTC (permalink / raw)
  To: Benjamin Marzinski, Martin Wilck, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

Add reclear_pp_from_mpp in ev_remove_path to make sure that pp is cleared in mpp.

When multipathd del path xxx, multipathd -v2, multipathd add path xxx and multipath -U
dm-x are executed simultaneously, multipath -U dm-x will case coredump.

The reason is that there are two paths with same dev_t in dm_table. The process
is as follows:

multipathd del path xxx(such as sde whose dev_t is 8:64):

cli_del_path
	->ev_remove_path
		->domap //dm_table in kernel will be reloaded and doesn't contain 8:64.
		        //Then multipath -v2 is executed, and the dm_table in kernel
		        //will be reloaded and contains 8:64.
		->setup_multipath
			->update_multipath_strings
				->update_multipath_table
					->dm_get_map //get params with 8:64
					->disassemble_map //pp1 will be saved mpp->pg
		->delete pp1 in pathvec
		->clear_ref_from_mpp //pp is cleared in mpp->paths but still saved in
		                     //mpp->pg
		->free_paths //pp1 is freed but still exist in mpp->pg

multipathd add path sde
cli_add_path
	->store_pathinfo //alloc pp2 (dev_t is 8:64), and store it to gvecs->pathvec
	->ev_add_path
		->adopt_paths
			->update_mpp_paths //pp1 is found in mpp->pg and its dev_t is
			                   //8:64 and dev is not sde (cased by free).
			                   //it will be stored in mpp->paths.
			->pp2 is stored to mpp->paths
		->setup_map //params with two 8:64
		->domap //dm_table is reloaded and contains two 8:64

multipath -U dm-x(sde is one path of dm-x)
main
	->check_usable_paths
		->dm_get_maps //get params with two 8:64
		->disassemble_map //alloc pp3, and pp3 is saved twice in mpp->pg
		->free_multipath(mpp, FREE_PATHS) //double free

Here, we add that pp1 in mpp->pg is cleared in clear_ref_from_mpp.

Reported-by: Tianxiong Lu <lutianxiong@huawei.com>
Signed-off-by: lixiaokeng <lixiaokeng@huawei.com>
Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
---
 multipathd/main.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/multipathd/main.c b/multipathd/main.c
index 9ec65856..a1db17a0 100644
--- a/multipathd/main.c
+++ b/multipathd/main.c
@@ -1059,6 +1059,32 @@ fail:
 	return 1;
 }

+static
+void reclear_pp_from_mpp(struct path * pp, struct vectors * vecs)
+{
+	struct multipath * mpp = NULL;
+	struct pathgroup * pgp;
+	int i = -1;
+	int j = 0;
+	int is_log = 0;
+
+	mpp = find_mp_by_wwid(vecs->mpvec, pp->wwid);
+	if(!!mpp) {
+		if ((i = find_slot(mpp->paths, (void *)pp)) != -1) {
+			vector_del_slot(mpp->paths, i);
+			is_log = 1;
+		}
+		vector_foreach_slot (mpp->pg, pgp, j) {
+			if ((i = find_slot(pgp->paths, (void *)pp)) != -1) {
+				vector_del_slot(pgp->paths, i);
+				is_log = 1;
+			}
+		}
+		if (is_log)
+			condlog(2, "%s: reclear path from mpp %s", pp->dev, mpp->alias);
+	}
+}
+
 static int
 uev_remove_path (struct uevent *uev, struct vectors * vecs, int need_do_map)
 {
@@ -1186,6 +1212,7 @@ out:
 	if ((i = find_slot(vecs->pathvec, (void *)pp)) != -1)
 		vector_del_slot(vecs->pathvec, i);

+	reclear_pp_from_mpp(pp, vecs);
 	free_path(pp);

 	return retval;
-- 

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* [PATCH 4/5] multipathd: disable queueing for recreated map in uev_remove_map
  2020-08-18 12:59 [PATCH 0/5] multipath-tools series: coredump and memory leak bugfix lixiaokeng
                   ` (2 preceding siblings ...)
  2020-08-18 13:08 ` [PATCH 3/5] multipathd: add reclear_pp_from_mpp in ev_remove_path lixiaokeng
@ 2020-08-18 13:09 ` lixiaokeng
  2020-08-18 19:23   ` Martin Wilck
  2020-08-19  9:27   ` Martin Wilck
  2020-08-18 13:11 ` [PATCH 5/5] libmultipath fix daemon memory leak in disassemble_map lixiaokeng
  2020-08-19  1:50 ` lixiaokeng
  5 siblings, 2 replies; 20+ messages in thread
From: lixiaokeng @ 2020-08-18 13:09 UTC (permalink / raw)
  To: Benjamin Marzinski, Martin Wilck, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

There may be a race window here:
1. all paths gone, causing map flushed both from multipathd and kernel
2. paths regenerated, causing multipathd creating the map again.

1 will generate a remove uevent which can be handled after 2, so we can
disable queueing for the map created by 2 here temporarily and let the
change uevent (generated by 2) calling uev_add_map->setup_multipath
to set queueing again. This can prevent the deadlock in this race window.

The possible deadlock is: all udevd workers hangs in devices because of
queue_if_no_path, so no udevd workers can handle new event and since
multipathd will remove the map, the checkerloop cannot check this map's
retry tick timeout and cancel the io hang which makes the udevd worker
hang forever. multipathd cannot receive any uevent from udevd because
all udevd workers hang there so the map cannot be recreated again which
makes a deadlock.

Signed-off-by: Lixiaokeng@huawei.com
---
 multipathd/main.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/multipathd/main.c b/multipathd/main.c
index baa18183..d7e20a10 100644
--- a/multipathd/main.c
+++ b/multipathd/main.c
@@ -798,6 +798,7 @@ uev_remove_map (struct uevent * uev, struct vectors * vecs)
 		goto out;
 	}

+	dm_queue_if_no_path(alias, 0);
 	remove_map_and_stop_waiter(mpp, vecs);
 out:
 	lock_cleanup_pop(vecs->lock);
-- 

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 5/5] libmultipath fix daemon memory leak in disassemble_map
  2020-08-18 12:59 [PATCH 0/5] multipath-tools series: coredump and memory leak bugfix lixiaokeng
                   ` (3 preceding siblings ...)
  2020-08-18 13:09 ` [PATCH 4/5] multipathd: disable queueing for recreated map in uev_remove_map lixiaokeng
@ 2020-08-18 13:11 ` lixiaokeng
  2020-08-19  1:42   ` lixiaokeng
  2020-08-19  1:50 ` lixiaokeng
  5 siblings, 1 reply; 20+ messages in thread
From: lixiaokeng @ 2020-08-18 13:11 UTC (permalink / raw)
  To: Benjamin Marzinski, Martin Wilck, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

When one iscsi device logs in and logs out with the "multipath -r"
executed at the same time, memory leak happens in multipathd
process.

The reason is following. When "multipath -r" is executed, the path
will be free in configure function. Before path_discovery executed,
iscsi device logs out. Then path_discovery will not find any path and
there is no path in the gvecs->pathvec. When map_discovery function
is executed, disassemble_map function will be called. Because
gvecs->pathvec->slot is empty and is_deamon is 1, a path will be
allocated and is not stored in gvecs->pathvec but store in
mpp->pg. But when the mpp is removed and freed by remove_map
function, the path will not be free and can't be find anymore.

The procedure details given as follows,
1."multipath -r" is executed
main
	->child
		->reconfigure
			->configure
				->path_discovery //after iscsi logout
				->map_discovery
					->update_multipath_table
						->disassemble_map
							->alloc_path
2.then "multipath -r" is executed again
main
main
	->child
		->reconfigure
			->remove_maps_and_stop_waiters
				->remove_maps

If checking is_deamon is deleted, there are many other things need be done like in
https://www.redhat.com/archives/dm-devel/2020-July/msg00245.html. And this
branch develops from 0.8.3 but upstream_queue is mainline which develops from
0.8.4.
Here, we skip alloc_path if pp isn't find in pathvec and process is daemon.  In
daemon, we should not store path with incomplete information to pathvec.  The
pathvec stores all paths in daemon, so it is reasonable keep same with pathvec.

Reported-by: Tianxiong Li <lutianxiong@huawei.com>
Signed-off-by: Lixiaokeng <lixiaokeng@huawei.com>
Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
---
 libmultipath/dmparser.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/libmultipath/dmparser.c b/libmultipath/dmparser.c
index f02c551..0f370e9 100644
--- a/libmultipath/dmparser.c
+++ b/libmultipath/dmparser.c
@@ -314,6 +314,16 @@ int disassemble_map(vector pathvec, char *params, struct multipath *mpp,
 			}

 			if (!pp) {
+				/* daemon should keep same with pathvec */
+				/* pp is not find in pathvec, skip it */
+				if (is_daemon) {
+					FREE(word);
+					for (k = 0; k < num_paths_args; k++) {
+						p += get_word(p, NULL);
+					}
+					continue;
+				}
+
 				pp_unfound = 1;
 				pp = alloc_path();

@@ -326,8 +336,8 @@ int disassemble_map(vector pathvec, char *params, struct multipath *mpp,
 					strncpy(pp->wwid, mpp->wwid,
 						WWID_SIZE - 1);
 				}
-				/* Only call this in multipath client mode */
-				if (!is_daemon && store_path(pathvec, pp)) {
+
+				if (store_path(pathvec, pp)) {
 					free_path(pp);
 					goto out1;
 				}
--

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 1/5 v4] libmultipath fix a memory leak in set_ble_device
  2020-08-18 13:02 ` [PATCH 1/5 v4] libmultipath fix a memory leak in set_ble_device lixiaokeng
@ 2020-08-18 15:57   ` Martin Wilck
  0 siblings, 0 replies; 20+ messages in thread
From: Martin Wilck @ 2020-08-18 15:57 UTC (permalink / raw)
  To: lixiaokeng, Benjamin Marzinski, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

On Tue, 2020-08-18 at 21:02 +0800, lixiaokeng wrote:
> In set_ble_device func, if blist is NULL or ble is NULL,
> the vendor and product isn't freed. We think it is not
> reasonable that strdup(XXX) is used as set_ble_device
> and store_ble functions' parameter.
> 
> Here we call strdup() in store_ble and set_ble_device
> functions and the string will be free if functions fail.
> Because constant string like "sdb" will be their parameter,
> char * is changed to const char *. This is base on
> upstream-queue branch in openSUSE/multipath-tools.
> 
> The type of ble->vendor_reg is regex_t struct but not a
> pointer, so it can not be set NULL.

Ups. Right. Thanks for correcting me.

> 
> Signed-off-by: Lixiaokeng <lixiaokeng@huawei.com>
> Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>

Reviewed-by: Martin Wilck <mwilck@suse.com>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/5] libmultipath fix NULL dereference in select_action
  2020-08-18 13:06 ` [PATCH 2/5] libmultipath fix NULL dereference in select_action lixiaokeng
@ 2020-08-18 16:28   ` Martin Wilck
  2020-08-21  4:41     ` lixiaokeng
  0 siblings, 1 reply; 20+ messages in thread
From: Martin Wilck @ 2020-08-18 16:28 UTC (permalink / raw)
  To: lixiaokeng, Benjamin Marzinski, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

On Tue, 2020-08-18 at 21:06 +0800, lixiaokeng wrote:
> I got a multipath segfault while running iscsi login/logout and
> following scripts in parallel:
> 
> #!/bin/bash
> interval=1
> while true
> do
>               multipath -F &> /dev/null
>               multipath -r &> /dev/null
>               multipath -v2 &> /dev/null
>               multipath -ll &> /dev/null
>               sleep $interval
> done
> 
> This is the debuginfo:
> Core was generated by `multipath -v2'.
> Program terminated with signal SIGSEGV, Segmentation fault.
> #0  0x00007fc5c5c8dedf in select_action (mpp=0x55a94f1f8980,
> curmp=<optimized out>,
>     force_reload=<optimized out>) at configure.c:709
> 709		if (!cmpp->pg || VECTOR_SIZE(cmpp->pg) !=
> VECTOR_SIZE(mpp->pg)) { {
> 
> (gdb) bt
> #0  0x00007fc5c5c8dedf in select_action (mpp=0x55a94f1f8980,
> curmp=<optimized out>,
>     force_reload=<optimized out>) at configure.c:709
> #1  0x00007fc5c5c8fd0f in coalesce_paths (vecs=0x7ffff1c1c220,
> newmp=0x0, refwwid=0x0,
>     force_reload=0, cmd=CMD_CREATE) at configure.c:1090
> #2  0x000055a94e9f524d in configure (devpath=<optimized out>,
> dev_type=DEV_NONE,
>     cmd=CMD_CREATE, conf=0x55a94f1b92d0) at main.c:757
> #3  main (argc=<optimized out>, argv=<optimized out>) at main.c:1100
> (gdb) p *cmpp
> $1 = {
>   wwid = "3600140566411a6d4873434fba988066f\000\070\060\066\066f",
> '\000' <repeats 88 times>,
>   ...
>   paths = 0x0, pg = 0x0, dmi = 0x55a94f22c3f0, alias = 0x55a94f205fb0
> "mpathc",
>   alias_prefix = 0x0, selector = 0x0, features = 0x0, hwhandler =
> 0x0, mpe = 0x0,
>   hwe = 0x0, waiter = 0, stat_switchgroup = 0, stat_path_failures =
> 0, stat_map_loads = 0,
>   ...
>   generic_mp = {ops = 0x7fc5c5cada40 <dm_gen_multipath_ops>}}
> 
> There are many NULL pointers in cmpp such as selector, features,
> hwhandler and so on.
> Here these on mpp is not NULL  but we can not be sure that won't be
> in mpp, so we
> add checking pointers of both mpp and cmpp before using them.
> 
> The changes like "mpp->features != cmpp->features" make sure that
> mpp->action is not
> set to ACT_RELOAD when they both equal NULL. Other changes check
> pointers don't equal
> NULL. When only one is NULL, we think there is some difference
> between mpp and cmpp,
> so mpp->action should be set to ACT_RELOAD.

Hm. We should be able to make some assumptions about validity of
structure fields.

I'd like to understand better what's going wrong here. cmpp is
taken from curmp, which has been populated in get_dm_mpvec().
get_dm_mpvec() calls disassemble_map(), which would always allocate
mpp->features, AFAICS. Well, always, except if the parameter string was
totally broken (or memory allocation failed):

int disassemble_map()
{
...
	p += get_word(p, &mpp->features);

	if (!mpp->features)
		return 1;

The same holds for mpp->hwhandler, mpp->selector, and mpp->pg (not sure
what you refer to with "and so on").

In this case the map also wouldn't have paths, and shouldn't have been
added to curmp in the first place. So IMO this is caused by
get_dm_mpvec() not checking the return value of disassemble_map().
map_discovery() in multipathd does this better - maps for which
update_multipath_table() fails are not added. Can you confirm that you
see this crash only in multipath, not in multipathd?

Regards,
Martin

> 
> Signed-off-by: Lixiaokeng <lixiaokeng@huawei.com>
> Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
> ---
>  libmultipath/configure.c | 30 ++++++++++++++++++++++--------
>  1 file changed, 22 insertions(+), 8 deletions(-)
> 
> diff --git a/libmultipath/configure.c b/libmultipath/configure.c
> index 96c79610..e02e7ef4 100644
> --- a/libmultipath/configure.c
> +++ b/libmultipath/configure.c
> @@ -726,16 +726,20 @@ select_action (struct multipath * mpp, vector
> curmp, int force_reload)
>  	}
> 
>  	if (mpp->no_path_retry != NO_PATH_RETRY_UNDEF &&
> +	    mpp->features != cmpp->features &&
> +	    (!mpp->features || !cmpp->features ||
>  	    !!strstr(mpp->features, "queue_if_no_path") !=
> -	    !!strstr(cmpp->features, "queue_if_no_path")) {
> +	    !!strstr(cmpp->features, "queue_if_no_path"))) {
>  		mpp->action =  ACT_RELOAD;
>  		condlog(3, "%s: set ACT_RELOAD (no_path_retry change)",
>  			mpp->alias);
>  		return;
>  	}
> -	if ((mpp->retain_hwhandler != RETAIN_HWHANDLER_ON ||
> +	if ((mpp->retain_hwhandler != RETAIN_HWHANDLER_ON || !cmpp-
> >hwhandler ||
>  	     strcmp(cmpp->hwhandler, "0") == 0) &&
> -	    (strlen(cmpp->hwhandler) != strlen(mpp->hwhandler) ||
> +	     mpp->hwhandler != cmpp->hwhandler &&
> +	     (!mpp->hwhandler || !cmpp->hwhandler||
> +	     strlen(cmpp->hwhandler) != strlen(mpp->hwhandler) ||
>  	     strncmp(cmpp->hwhandler, mpp->hwhandler,
>  		    strlen(mpp->hwhandler)))) {
>  		mpp->action = ACT_RELOAD;
> @@ -745,8 +749,10 @@ select_action (struct multipath * mpp, vector
> curmp, int force_reload)
>  	}
> 
>  	if (mpp->retain_hwhandler != RETAIN_HWHANDLER_UNDEF &&
> +	    mpp->features != cmpp->features &&
> +	    (!mpp->features || !cmpp->features ||
>  	    !!strstr(mpp->features, "retain_attached_hw_handler") !=
> -	    !!strstr(cmpp->features, "retain_attached_hw_handler") &&
> +	    !!strstr(cmpp->features, "retain_attached_hw_handler")) &&
>  	    get_linux_version_code() < KERNEL_VERSION(4, 3, 0)) {
>  		mpp->action = ACT_RELOAD;
>  		condlog(3, "%s: set ACT_RELOAD (retain_hwhandler
> change)",
> @@ -754,8 +760,16 @@ select_action (struct multipath * mpp, vector
> curmp, int force_reload)
>  		return;
>  	}
> 
> -	cmpp_feat = STRDUP(cmpp->features);
> -	mpp_feat = STRDUP(mpp->features);
> +	if (!cmpp->features ) {
> +		cmpp_feat = NULL;
> +	} else {
> +		cmpp_feat = STRDUP(cmpp->features);
> +	}
> +	if (!mpp->features ) {
> +		mpp_feat = NULL;
> +	} else {
> +		mpp_feat = STRDUP(mpp->features);
> +	}
>  	if (cmpp_feat && mpp_feat) {
>  		remove_feature(&mpp_feat, "queue_if_no_path");
>  		remove_feature(&mpp_feat,
> "retain_attached_hw_handler");
> @@ -770,8 +784,8 @@ select_action (struct multipath * mpp, vector
> curmp, int force_reload)
>  	FREE(cmpp_feat);
>  	FREE(mpp_feat);
> 
> -	if (!cmpp->selector || strncmp(cmpp->selector, mpp->selector,
> -		    strlen(mpp->selector))) {
> +	if (mpp->selector != cmpp->selector && (!cmpp->selector ||
> !mpp->selector ||
> +		    strncmp(cmpp->selector, mpp->selector,strlen(mpp-
> >selector)))) {
>  		mpp->action = ACT_RELOAD;
>  		condlog(3, "%s: set ACT_RELOAD (selector change)",
>  			mpp->alias);

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/5] multipathd: add reclear_pp_from_mpp in ev_remove_path
  2020-08-18 13:08 ` [PATCH 3/5] multipathd: add reclear_pp_from_mpp in ev_remove_path lixiaokeng
@ 2020-08-18 16:36   ` Martin Wilck
  2020-08-20 14:51     ` lixiaokeng
  0 siblings, 1 reply; 20+ messages in thread
From: Martin Wilck @ 2020-08-18 16:36 UTC (permalink / raw)
  To: lixiaokeng, Benjamin Marzinski, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

On Tue, 2020-08-18 at 21:08 +0800, lixiaokeng wrote:
> Add reclear_pp_from_mpp in ev_remove_path to make sure that pp is
> cleared in mpp.
> 
> When multipathd del path xxx, multipathd -v2, multipathd add path xxx
> and multipath -U
> dm-x are executed simultaneously, multipath -U dm-x will case
> coredump.
> 
> The reason is that there are two paths with same dev_t in dm_table.
> The process
> is as follows:

Thanks for the report.

With which code have you tested this? I have reason to believe that 
it would behave differently with my last patch series (in particular,
part V "removed path handling") applied. I'm not sure if my series
would fix the issue, but it would probably need a different fix.

Regards,
Martin

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 4/5] multipathd: disable queueing for recreated map in uev_remove_map
  2020-08-18 13:09 ` [PATCH 4/5] multipathd: disable queueing for recreated map in uev_remove_map lixiaokeng
@ 2020-08-18 19:23   ` Martin Wilck
  2020-08-19  8:48     ` lixiaokeng
  2020-08-19  9:27   ` Martin Wilck
  1 sibling, 1 reply; 20+ messages in thread
From: Martin Wilck @ 2020-08-18 19:23 UTC (permalink / raw)
  To: lixiaokeng, Benjamin Marzinski, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

Hi Lixiaokeng,

On Tue, 2020-08-18 at 21:09 +0800, lixiaokeng wrote:
> There may be a race window here:
> 1. all paths gone, causing map flushed both from multipathd and
> kernel
> 2. paths regenerated, causing multipathd creating the map again.
> 
> 1 will generate a remove uevent which can be handled after 2, so we
> can
> disable queueing for the map created by 2 here temporarily and let
> the
> change uevent (generated by 2) calling uev_add_map->setup_multipath
> to set queueing again. This can prevent the deadlock in this race
> window.
> 
> The possible deadlock is: all udevd workers hangs in devices because
> of
> queue_if_no_path, so no udevd workers can handle new event and since
> multipathd will remove the map, the checkerloop cannot check this
> map's
> retry tick timeout and cancel the io hang which makes the udevd
> worker
> hang forever. multipathd cannot receive any uevent from udevd because
> all udevd workers hang there so the map cannot be recreated again
> which
> makes a deadlock.
> 
> Signed-off-by: Lixiaokeng@huawei.com

A map which is removed and not yet re-added again (as far as udev is
concerned) doesn't need to queue because it can't possibly be in use.
So I think the patch can't hurt in other scenarios, at it makes sense
in the situation you describe. However I have a few questions.

Have you observed this, or is it theory? I'm wondering: After 2) there
should be some paths again, so why would the udev workers hang? 
I guess this could happen if the regenerated paths all in failed /
standby state, is that what you mean?

Note also that we set DM_NOSCAN in the udev rules when there are no
usable paths, so udev workers would only hang if the last path fails /
is removed after the "multipath -U" check.

You've certainly hit a weak spot here, and you've nicely described a
potential problem scenario. The delayed processing of uevents that
multipathd triggered itself is a recurring cause of headache.

Regards,
Martin



> ---
>  multipathd/main.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/multipathd/main.c b/multipathd/main.c
> index baa18183..d7e20a10 100644
> --- a/multipathd/main.c
> +++ b/multipathd/main.c
> @@ -798,6 +798,7 @@ uev_remove_map (struct uevent * uev, struct
> vectors * vecs)
>  		goto out;
>  	}
> 
> +	dm_queue_if_no_path(alias, 0);
>  	remove_map_and_stop_waiter(mpp, vecs);
>  out:
>  	lock_cleanup_pop(vecs->lock);

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 5/5] libmultipath fix daemon memory leak in disassemble_map
  2020-08-18 13:11 ` [PATCH 5/5] libmultipath fix daemon memory leak in disassemble_map lixiaokeng
@ 2020-08-19  1:42   ` lixiaokeng
  0 siblings, 0 replies; 20+ messages in thread
From: lixiaokeng @ 2020-08-19  1:42 UTC (permalink / raw)
  To: Benjamin Marzinski, Martin Wilck, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

Hi
  I'm sorry for subject with Re. I will send it  again.

On 2020/8/18 21:11, lixiaokeng wrote:
> When one iscsi device logs in and logs out with the "multipath -r"
> executed at the same time, memory leak happens in multipathd
> process.
> 
> The reason is following. When "multipath -r" is executed, the path
> will be free in configure function. Before path_discovery executed,
> iscsi device logs out. Then path_discovery will not find any path and
> there is no path in the gvecs->pathvec. When map_discovery function
> is executed, disassemble_map function will be called. Because
> gvecs->pathvec->slot is empty and is_deamon is 1, a path will be
> allocated and is not stored in gvecs->pathvec but store in
> mpp->pg. But when the mpp is removed and freed by remove_map
> function, the path will not be free and can't be find anymore.
> 
> The procedure details given as follows,
> 1."multipath -r" is executed
> main
> 	->child
> 		->reconfigure
> 			->configure
> 				->path_discovery //after iscsi logout
> 				->map_discovery
> 					->update_multipath_table
> 						->disassemble_map
> 							->alloc_path
> 2.then "multipath -r" is executed again
> main
> main
> 	->child
> 		->reconfigure
> 			->remove_maps_and_stop_waiters
> 				->remove_maps
> 
> If checking is_deamon is deleted, there are many other things need be done like in
> https://www.redhat.com/archives/dm-devel/2020-July/msg00245.html. And this
> branch develops from 0.8.3 but upstream_queue is mainline which develops from
> 0.8.4.
> Here, we skip alloc_path if pp isn't find in pathvec and process is daemon.  In
> daemon, we should not store path with incomplete information to pathvec.  The
> pathvec stores all paths in daemon, so it is reasonable keep same with pathvec.
> 
> Reported-by: Tianxiong Li <lutianxiong@huawei.com>
> Signed-off-by: Lixiaokeng <lixiaokeng@huawei.com>
> Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
> ---
>  libmultipath/dmparser.c | 14 ++++++++++++--
>  1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/libmultipath/dmparser.c b/libmultipath/dmparser.c
> index f02c551..0f370e9 100644
> --- a/libmultipath/dmparser.c
> +++ b/libmultipath/dmparser.c
> @@ -314,6 +314,16 @@ int disassemble_map(vector pathvec, char *params, struct multipath *mpp,
>  			}
> 
>  			if (!pp) {
> +				/* daemon should keep same with pathvec */
> +				/* pp is not find in pathvec, skip it */
> +				if (is_daemon) {
> +					FREE(word);
> +					for (k = 0; k < num_paths_args; k++) {
> +						p += get_word(p, NULL);
> +					}
> +					continue;
> +				}
> +
>  				pp_unfound = 1;
>  				pp = alloc_path();
> 
> @@ -326,8 +336,8 @@ int disassemble_map(vector pathvec, char *params, struct multipath *mpp,
>  					strncpy(pp->wwid, mpp->wwid,
>  						WWID_SIZE - 1);
>  				}
> -				/* Only call this in multipath client mode */
> -				if (!is_daemon && store_path(pathvec, pp)) {
> +
> +				if (store_path(pathvec, pp)) {
>  					free_path(pp);
>  					goto out1;
>  				}
> --
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH 5/5] libmultipath fix daemon memory leak in disassemble_map
  2020-08-18 12:59 [PATCH 0/5] multipath-tools series: coredump and memory leak bugfix lixiaokeng
                   ` (4 preceding siblings ...)
  2020-08-18 13:11 ` [PATCH 5/5] libmultipath fix daemon memory leak in disassemble_map lixiaokeng
@ 2020-08-19  1:50 ` lixiaokeng
  2020-08-19  8:26   ` Martin Wilck
  5 siblings, 1 reply; 20+ messages in thread
From: lixiaokeng @ 2020-08-19  1:50 UTC (permalink / raw)
  To: Benjamin Marzinski, Martin Wilck, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

When one iscsi device logs in and logs out with the "multipath -r"
executed at the same time, memory leak happens in multipathd
process.

The reason is following. When "multipath -r" is executed, the path
will be free in configure function. Before path_discovery executed,
iscsi device logs out. Then path_discovery will not find any path and
there is no path in the gvecs->pathvec. When map_discovery function
is executed, disassemble_map function will be called. Because
gvecs->pathvec->slot is empty and is_deamon is 1, a path will be
allocated and is not stored in gvecs->pathvec but store in
mpp->pg. But when the mpp is removed and freed by remove_map
function, the path will not be free and can't be find anymore.

The procedure details given as follows,
1."multipath -r" is executed
main
	->child
		->reconfigure
			->configure
				->path_discovery //after iscsi logout
				->map_discovery
					->update_multipath_table
						->disassemble_map
							->alloc_path
2.then "multipath -r" is executed again
main
main
	->child
		->reconfigure
			->remove_maps_and_stop_waiters
				->remove_maps

If checking is_deamon is deleted, there are many other things need be done like in
https://www.redhat.com/archives/dm-devel/2020-July/msg00245.html. And this
branch develops from 0.8.3 but upstream_queue is mainline which develops from
0.8.4.
Here, we skip alloc_path if pp isn't find in pathvec and process is daemon.  In
daemon, we should not store path with incomplete information to pathvec.  The
pathvec stores all paths in daemon, so it is reasonable keep same with pathvec.

Reported-by: Tianxiong Li <lutianxiong@huawei.com>
Signed-off-by: Lixiaokeng <lixiaokeng@huawei.com>
Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
---
 libmultipath/dmparser.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/libmultipath/dmparser.c b/libmultipath/dmparser.c
index f02c551..0f370e9 100644
--- a/libmultipath/dmparser.c
+++ b/libmultipath/dmparser.c
@@ -314,6 +314,16 @@ int disassemble_map(vector pathvec, char *params, struct multipath *mpp,
 			}

 			if (!pp) {
+				/* daemon should keep same with pathvec */
+				/* pp is not find in pathvec, skip it */
+				if (is_daemon) {
+					FREE(word);
+					for (k = 0; k < num_paths_args; k++) {
+						p += get_word(p, NULL);
+					}
+					continue;
+				}
+
 				pp_unfound = 1;
 				pp = alloc_path();

@@ -326,8 +336,8 @@ int disassemble_map(vector pathvec, char *params, struct multipath *mpp,
 					strncpy(pp->wwid, mpp->wwid,
 						WWID_SIZE - 1);
 				}
-				/* Only call this in multipath client mode */
-				if (!is_daemon && store_path(pathvec, pp)) {
+
+				if (store_path(pathvec, pp)) {
 					free_path(pp);
 					goto out1;
 				}
--

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH 5/5] libmultipath fix daemon memory leak in disassemble_map
  2020-08-19  1:50 ` lixiaokeng
@ 2020-08-19  8:26   ` Martin Wilck
  2020-08-19  8:45     ` lixiaokeng
  0 siblings, 1 reply; 20+ messages in thread
From: Martin Wilck @ 2020-08-19  8:26 UTC (permalink / raw)
  To: lixiaokeng, Benjamin Marzinski, Christophe Varoqui
  Cc: linfeilong, dm-devel mailing list, liuzhiqiang26

Hello lixiaokeng,

Thanks again for your contribution.

On Wed, 2020-08-19 at 09:50 +0800, lixiaokeng wrote:
> 
> If checking is_deamon is deleted, there are many other things need be
> done like in
> https://www.redhat.com/archives/dm-devel/2020-July/msg00245.html. And
> this
> branch develops from 0.8.3 but upstream_queue is mainline which
> develops from
> 0.8.4.

This is a misunderstanding. My upstream-queue branch is *not* mainline.
Mainline is http://git.opensvc.com/multipath-tools/.git. Also, my
entire patch series (link above) was based on upstream-queue, which in
turn was based on 0.8.4, not 0.8.3.

As the name says, "upstream-queue" represents the queue of pending
patches which have at least one positive review (and no negative ones).
The intention is 1. to provide an overview for myself and other
interested parties, and 2. to simplify matters for the actual
maintainer, Christophe, when he applies patches onto mainline.

My patch series changes the same code path as this one, and I think it
solves the same issue, albeit differently. Most of my series has
meanwhile been positively reviewed by Ben, and the remaining open
issues are in other parts of the series. I'll hopefully be able to push
it to upstream-queue soon. IMO it makes little sense to push changes to
upstream-queue which are going to be removed again when my series is
applied (I don't intend to start using merge operations in this
branch).

Christophe is on the recipients list of your patch. He may of course
decide to apply your patch before my series, in which case I'll have to
rebase mine. But he'll probably have other prior patches to look at
first before he gets down to this one.

Regards,
Martin

> Here, we skip alloc_path if pp isn't find in pathvec and process is
> daemon.  In
> daemon, we should not store path with incomplete information to
> pathvec.  The
> pathvec stores all paths in daemon, so it is reasonable keep same
> with pathvec.
> 
> Reported-by: Tianxiong Li <lutianxiong@huawei.com>
> Signed-off-by: Lixiaokeng <lixiaokeng@huawei.com>
> Signed-off-by: Zhiqiang Liu <liuzhiqiang26@huawei.com>
> ---
>  libmultipath/dmparser.c | 14 ++++++++++++--
>  1 file changed, 12 insertions(+), 2 deletions(-)
> 
> diff --git a/libmultipath/dmparser.c b/libmultipath/dmparser.c
> index f02c551..0f370e9 100644
> --- a/libmultipath/dmparser.c
> +++ b/libmultipath/dmparser.c
> @@ -314,6 +314,16 @@ int disassemble_map(vector pathvec, char
> *params, struct multipath *mpp,
>  			}
> 
>  			if (!pp) {
> +				/* daemon should keep same with pathvec
> */
> +				/* pp is not find in pathvec, skip it
> */
> +				if (is_daemon) {
> +					FREE(word);
> +					for (k = 0; k < num_paths_args;
> k++) {
> +						p += get_word(p, NULL);
> +					}
> +					continue;
> +				}
> +
>  				pp_unfound = 1;
>  				pp = alloc_path();
> 
> @@ -326,8 +336,8 @@ int disassemble_map(vector pathvec, char *params,
> struct multipath *mpp,
>  					strncpy(pp->wwid, mpp->wwid,
>  						WWID_SIZE - 1);
>  				}
> -				/* Only call this in multipath client
> mode */
> -				if (!is_daemon && store_path(pathvec,
> pp)) {
> +
> +				if (store_path(pathvec, pp)) {
>  					free_path(pp);
>  					goto out1;
>  				}
> --
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 5/5] libmultipath fix daemon memory leak in disassemble_map
  2020-08-19  8:26   ` Martin Wilck
@ 2020-08-19  8:45     ` lixiaokeng
  0 siblings, 0 replies; 20+ messages in thread
From: lixiaokeng @ 2020-08-19  8:45 UTC (permalink / raw)
  To: Martin Wilck, Benjamin Marzinski, Christophe Varoqui
  Cc: linfeilong, dm-devel mailing list, liuzhiqiang26

Hi Martin,
   Thanks for your answers. I understand the relationship between branches.

On 2020/8/19 16:26, Martin Wilck wrote:
> Hello lixiaokeng,
> 
> Thanks again for your contribution.
> 
> On Wed, 2020-08-19 at 09:50 +0800, lixiaokeng wrote:
>>
>> If checking is_deamon is deleted, there are many other things need be
>> done like in
>> https://www.redhat.com/archives/dm-devel/2020-July/msg00245.html. And
>> this
>> branch develops from 0.8.3 but upstream_queue is mainline which
>> develops from
>> 0.8.4.
> 
> This is a misunderstanding. My upstream-queue branch is *not* mainline.
> Mainline is http://git.opensvc.com/multipath-tools/.git. Also, my
> entire patch series (link above) was based on upstream-queue, which in
> turn was based on 0.8.4, not 0.8.3.
> 
> As the name says, "upstream-queue" represents the queue of pending
> patches which have at least one positive review (and no negative ones).
> The intention is 1. to provide an overview for myself and other
> interested parties, and 2. to simplify matters for the actual
> maintainer, Christophe, when he applies patches onto mainline.
> 
> My patch series changes the same code path as this one, and I think it
> solves the same issue, albeit differently. Most of my series has
> meanwhile been positively reviewed by Ben, and the remaining open
> issues are in other parts of the series. I'll hopefully be able to push
> it to upstream-queue soon. IMO it makes little sense to push changes to
> upstream-queue which are going to be removed again when my series is
> applied (I don't intend to start using merge operations in this
> branch).
> 
> Christophe is on the recipients list of your patch. He may of course
> decide to apply your patch before my series, in which case I'll have to
> rebase mine. But he'll probably have other prior patches to look at
> first before he gets down to this one.
> 
> Regards,
> Martin
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 4/5] multipathd: disable queueing for recreated map in uev_remove_map
  2020-08-18 19:23   ` Martin Wilck
@ 2020-08-19  8:48     ` lixiaokeng
  0 siblings, 0 replies; 20+ messages in thread
From: lixiaokeng @ 2020-08-19  8:48 UTC (permalink / raw)
  To: Martin Wilck, Benjamin Marzinski, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

Hi Martin:
    In history, the deadlock has happened. When I review patchs made by
huawei employees before me, I think this should be sent to upstream.
It is made in May 2018, so I don't know more details. I'm sorry for that.

On 2020/8/19 3:23, Martin Wilck wrote:
> Hi Lixiaokeng,
> 
> 
> A map which is removed and not yet re-added again (as far as udev is
> concerned) doesn't need to queue because it can't possibly be in use.
> So I think the patch can't hurt in other scenarios, at it makes sense
> in the situation you describe. However I have a few questions.
> 
> Have you observed this, or is it theory? I'm wondering: After 2) there
> should be some paths again, so why would the udev workers hang? 
> I guess this could happen if the regenerated paths all in failed /
> standby state, is that what you mean?
> 
> Note also that we set DM_NOSCAN in the udev rules when there are no
> usable paths, so udev workers would only hang if the last path fails /
> is removed after the "multipath -U" check.
> 
> You've certainly hit a weak spot here, and you've nicely described a
> potential problem scenario. The delayed processing of uevents that
> multipathd triggered itself is a recurring cause of headache.
> 
> Regards,
> Martin
> 
> 
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 4/5] multipathd: disable queueing for recreated map in uev_remove_map
  2020-08-18 13:09 ` [PATCH 4/5] multipathd: disable queueing for recreated map in uev_remove_map lixiaokeng
  2020-08-18 19:23   ` Martin Wilck
@ 2020-08-19  9:27   ` Martin Wilck
  1 sibling, 0 replies; 20+ messages in thread
From: Martin Wilck @ 2020-08-19  9:27 UTC (permalink / raw)
  To: lixiaokeng, Benjamin Marzinski, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

On Tue, 2020-08-18 at 21:09 +0800, lixiaokeng wrote:
> There may be a race window here:
> 1. all paths gone, causing map flushed both from multipathd and
> kernel
> 2. paths regenerated, causing multipathd creating the map again.
> 
> 1 will generate a remove uevent which can be handled after 2, so we
> can
> disable queueing for the map created by 2 here temporarily and let
> the
> change uevent (generated by 2) calling uev_add_map->setup_multipath
> to set queueing again. This can prevent the deadlock in this race
> window.
> 
> The possible deadlock is: all udevd workers hangs in devices because
> of
> queue_if_no_path, so no udevd workers can handle new event and since
> multipathd will remove the map, the checkerloop cannot check this
> map's
> retry tick timeout and cancel the io hang which makes the udevd
> worker
> hang forever. multipathd cannot receive any uevent from udevd because
> all udevd workers hang there so the map cannot be recreated again
> which
> makes a deadlock.
> 
> Signed-off-by: Lixiaokeng@huawei.com

As noted in my other reply, I don't fully understand how this deadlock
actually came to pass. But disabling queuing on a map which can't be
in use at the given point in time can't do no harm. So:

Reviewed-by: Martin Wilck <mwilck@suse.com>

> ---
>  multipathd/main.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/multipathd/main.c b/multipathd/main.c
> index baa18183..d7e20a10 100644
> --- a/multipathd/main.c
> +++ b/multipathd/main.c
> @@ -798,6 +798,7 @@ uev_remove_map (struct uevent * uev, struct
> vectors * vecs)
>  		goto out;
>  	}
> 
> +	dm_queue_if_no_path(alias, 0);
>  	remove_map_and_stop_waiter(mpp, vecs);
>  out:
>  	lock_cleanup_pop(vecs->lock);

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/5] multipathd: add reclear_pp_from_mpp in ev_remove_path
  2020-08-18 16:36   ` Martin Wilck
@ 2020-08-20 14:51     ` lixiaokeng
  2020-08-20 15:22       ` Martin Wilck
  0 siblings, 1 reply; 20+ messages in thread
From: lixiaokeng @ 2020-08-20 14:51 UTC (permalink / raw)
  To: Martin Wilck, Benjamin Marzinski, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26


Hi Martin:
   I test this in 0.8.4 without your patch series . I have review the
code with your patch series and I think this problem will be solved.
But I have another question.
ev_remove_path
	->__setup_multipath
		->update_multipath_strings
			->update_multipath_table
				->update_pathvec_from_dm
					->store_path
When multipathd del path xxx(such as sde) and multipath -v2 are
executed simultaneously, will the path(sde) deleted be stored to pathvec
again? In my opinion, sde is't delete in pathvec and in disassembel_map
sde will be stored to mpp->pg. When update_pathvec_from_dm, sde will be
stored again.

On 2020/8/19 0:36, Martin Wilck wrote:
> On Tue, 2020-08-18 at 21:08 +0800, lixiaokeng wrote:
>> Add reclear_pp_from_mpp in ev_remove_path to make sure that pp is
>> cleared in mpp.
>>
>> When multipathd del path xxx, multipathd -v2, multipathd add path xxx
>> and multipath -U
>> dm-x are executed simultaneously, multipath -U dm-x will case
>> coredump.
>>
>> The reason is that there are two paths with same dev_t in dm_table.
>> The process
>> is as follows:
> 
> Thanks for the report.
> 
> With which code have you tested this? I have reason to believe that 
> it would behave differently with my last patch series (in particular,
> part V "removed path handling") applied. I'm not sure if my series
> would fix the issue, but it would probably need a different fix.
> 
> Regards,
> Martin
> 
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/5] multipathd: add reclear_pp_from_mpp in ev_remove_path
  2020-08-20 14:51     ` lixiaokeng
@ 2020-08-20 15:22       ` Martin Wilck
  2020-08-24 13:07         ` lixiaokeng
  0 siblings, 1 reply; 20+ messages in thread
From: Martin Wilck @ 2020-08-20 15:22 UTC (permalink / raw)
  To: lixiaokeng, Benjamin Marzinski, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

On Thu, 2020-08-20 at 22:51 +0800, lixiaokeng wrote:
> Hi Martin:
>    I test this in 0.8.4 without your patch series . I have review the
> code with your patch series and I think this problem will be solved.
> But I have another question.
> ev_remove_path
> 	->__setup_multipath
> 		->update_multipath_strings
> 			->update_multipath_table
> 				->update_pathvec_from_dm
> 					->store_path
> When multipathd del path xxx(such as sde) and multipath -v2 are
> executed simultaneously, will the path(sde) deleted be stored to
> pathvec
> again? In my opinion, sde is't delete in pathvec and in
> disassembel_map
> sde will be stored to mpp->pg. When update_pathvec_from_dm, sde will
> be
> stored again.

in ev_remove_path(), we set INIT_REMOVED state. That means this path
won't be used in the RELOAD ioctl. If the ioctl is successful, it will
be gone from the kernel (for the time being). Later, sync_map_state()
will figure this out, and remove the path. If the parallel "multipath"
command reloads the map after multipathd's RELOAD operation, and before
update_multipath_table() called, re-adding the path, the path will be
read back from the kernel, and sync_map_state() will not delete the it.
From multipathd's point of view it will remain in INIT_REMOVED state,
though. multipathd will not try to reload the map again unless it's
asked to do so with another "add path" or "del path" command. This
results in an inconsistent internal state between multipathd and the
kernel (multipathd considers the path as REMOVED even though it's still
present in the map). I don't think this can be generally avoided if we
allow multipath to make changes to kernel maps while multipathd is
running. (*)

We could avoid this by delegating these commands from multipath to
multipathd, as we already do for some other commands. "multipath"
without any arguments would map to a "multipathd reconfigure" command,
while "multipath $DEVICE" would be translated to "multipathd add map"
or "multipathd add path".

Does this make sense?

Regards,
Martin

(*) Well: we *could* try to analyze incoming uevents, and distinguish
those that multipathd has initiated itself from other, externally-
triggered ones. Then if an external event arrives, adding a path which
we had previously removed, and this external event arrives after the
remove event we initiated, we could in theory infer that an external
program had reverted the change we just made. But this would result in
complex and pretty fragile logic. I'm not sure if it's worth it. The
"delegating" approach is safer and cleaner IMO.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 2/5] libmultipath fix NULL dereference in select_action
  2020-08-18 16:28   ` Martin Wilck
@ 2020-08-21  4:41     ` lixiaokeng
  0 siblings, 0 replies; 20+ messages in thread
From: lixiaokeng @ 2020-08-21  4:41 UTC (permalink / raw)
  To: Martin Wilck, Benjamin Marzinski, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

Hi Martin:
   Thanks for your reply.

On 2020/8/19 0:28, Martin Wilck wrote:
> On Tue, 2020-08-18 at 21:06 +0800, lixiaokeng wrote:
>>
>> There are many NULL pointers in cmpp such as selector, features,
>> hwhandler and so on.
>> Here these on mpp is not NULL  but we can not be sure that won't be
>> in mpp, so we
>> add checking pointers of both mpp and cmpp before using them.
>>
>> The changes like "mpp->features != cmpp->features" make sure that
>> mpp->action is not
>> set to ACT_RELOAD when they both equal NULL. Other changes check
>> pointers don't equal
>> NULL. When only one is NULL, we think there is some difference
>> between mpp and cmpp,
>> so mpp->action should be set to ACT_RELOAD.
> 
> Hm. We should be able to make some assumptions about validity of
> structure fields.
> 
> I'd like to understand better what's going wrong here. cmpp is
> taken from curmp, which has been populated in get_dm_mpvec().
> get_dm_mpvec() calls disassemble_map(), which would always allocate
> mpp->features, AFAICS. Well, always, except if the parameter string was
> totally broken (or memory allocation failed):
>
   I add some printing in get_dm_mpvec, but I did not get any useful
information in messages. But we can see that there are path in mpp->pg.
The following are mpp and cmpp when coredump cause:

(gdb) p *mpp
$9 = {wwid = "36001405ef6a7bc355084b3abcb236a4c", '\000' <repeats 94 times>,
  alias_old = '\000' <repeats 127 times>, pgpolicy = 1,
  pgpolicyfn = 0x7f5fc886e0b0 <one_path_per_group>, nextpg = 0, bestpg = 1,
  queuedio = 0, action = 0, wait_for_udev = 0, uev_wait_tick = 0,
  pgfailback = -1, failback_tick = 0, rr_weight = 1, nr_active = 1,
  no_path_retry = 0, retry_tick = 0, disable_queueing = 0, minio = 1,
  flush_on_last_del = 1, attribute_flags = 0, fast_io_fail = 5,
  retain_hwhandler = 2, deferred_remove = 1, delay_watch_checks = -1,
  delay_wait_checks = -1, marginal_path_err_sample_time = -1,
  marginal_path_err_rate_threshold = -1,
  marginal_path_err_recheck_gap_time = -1,
  marginal_path_double_failed_time = -1, skip_kpartx = 1, max_sectors_kb = 0,
  force_readonly = 0, force_udev_reload = 0, needs_paths_uevent = 0,
  ghost_delay = -1, ghost_delay_tick = 0, dev_loss = 0, uid = 0, gid = 0,
  mode = 0, size = 2097152, paths = 0x0, pg = 0x55a07fafbb90, dmi = 0x0,
  alias = 0x55a07fade4d0 "mpatha", alias_prefix = 0x7f5fc8886b4a "mpath",
  selector = 0x55a07fade4b0 "service-time 0", features = 0x55a07fac24b0 "0",
  hwhandler = 0x55a07fade510 "1 alua", mpe = 0x0, hwe = 0x55a07faec5a0,
  waiter = 0, stat_switchgroup = 0, stat_path_failures = 0, stat_map_loads = 0,
  stat_total_queueing_time = 0, stat_queueing_timeouts = 0,
  stat_map_failures = 0, mpcontext = 0x0, prkey_source = 0, reservation_key = {
    _v = 0}, sa_flags = 0 '\000', prflag = 0 '\000', all_tg_pt = 0,
  generic_mp = {ops = 0x7f5fc8897a40 <dm_gen_multipath_ops>}}
(gdb) p *cmpp
$10 = {
  wwid = "36001405ef6a7bc355084b3abcb236a4c\000\063\066a4c", '\000' <repeats 88 times>,
  alias_old = '\000' <repeats 127 times>, pgpolicy = 0, pgpolicyfn = 0x0,
  nextpg = 0, bestpg = 1, queuedio = 0, action = 0, wait_for_udev = 0,
  uev_wait_tick = 0, pgfailback = 0, failback_tick = 0, rr_weight = 0,
  nr_active = 0, no_path_retry = 0, retry_tick = 0, disable_queueing = 0,
  minio = 0, flush_on_last_del = 0, attribute_flags = 0, fast_io_fail = 0,
  retain_hwhandler = 0, deferred_remove = 0, delay_watch_checks = 0,
  delay_wait_checks = 0, marginal_path_err_sample_time = 0,
  marginal_path_err_rate_threshold = 0, marginal_path_err_recheck_gap_time = 0,
  marginal_path_double_failed_time = 0, skip_kpartx = 0, max_sectors_kb = 0,
  force_readonly = 0, force_udev_reload = 0, needs_paths_uevent = 0,
  ghost_delay = 0, ghost_delay_tick = 0, dev_loss = 0, uid = 0, gid = 0,
  mode = 0, size = 2097152, paths = 0x0, pg = 0x0, dmi = 0x55a07fb0d5f0,
  alias = 0x55a07fafc490 "mpatha", alias_prefix = 0x0, selector = 0x0,
  features = 0x0, hwhandler = 0x0, mpe = 0x0, hwe = 0x0, waiter = 0,
  stat_switchgroup = 0, stat_path_failures = 0, stat_map_loads = 0,
  stat_total_queueing_time = 0, stat_queueing_timeouts = 0,
  stat_map_failures = 0, mpcontext = 0x0, prkey_source = 0, reservation_key = {
    _v = 0}, sa_flags = 0 '\000', prflag = 0 '\000', all_tg_pt = 0,
  generic_mp = {ops = 0x7f5fc8897a40 <dm_gen_multipath_ops>}}

> int disassemble_map()
> {
> ...
> 	p += get_word(p, &mpp->features);
> 
> 	if (!mpp->features)
> 		return 1;
> 
> The same holds for mpp->hwhandler, mpp->selector, and mpp->pg (not sure
> what you refer to with "and so on").
> 
> In this case the map also wouldn't have paths, and shouldn't have been
> added to curmp in the first place. So IMO this is caused by
> get_dm_mpvec() not checking the return value of disassemble_map().
> map_discovery() in multipathd does this better - maps for which
> update_multipath_table() fails are not added. Can you confirm that you
> see this crash only in multipath, not in multipathd?

  I have been runing 48h, this crash did not happen in mutlipathd. But I
can't confirm that just in multipath, not in multipathd.

Regards
Lixiaokeng

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH 3/5] multipathd: add reclear_pp_from_mpp in ev_remove_path
  2020-08-20 15:22       ` Martin Wilck
@ 2020-08-24 13:07         ` lixiaokeng
  0 siblings, 0 replies; 20+ messages in thread
From: lixiaokeng @ 2020-08-24 13:07 UTC (permalink / raw)
  To: Martin Wilck, Benjamin Marzinski, Christophe Varoqui, dm-devel
  Cc: linfeilong, liuzhiqiang26

Hi Martin:
  Thanks for your detailed explanation. I understand it  well.

Regards
Lixiaokeng

On 2020/8/20 23:22, Martin Wilck wrote:
> On Thu, 2020-08-20 at 22:51 +0800, lixiaokeng wrote:
>> Hi Martin:
>>    I test this in 0.8.4 without your patch series . I have review the
>> code with your patch series and I think this problem will be solved.
>> But I have another question.
>> ev_remove_path
>> 	->__setup_multipath
>> 		->update_multipath_strings
>> 			->update_multipath_table
>> 				->update_pathvec_from_dm
>> 					->store_path
>> When multipathd del path xxx(such as sde) and multipath -v2 are
>> executed simultaneously, will the path(sde) deleted be stored to
>> pathvec
>> again? In my opinion, sde is't delete in pathvec and in
>> disassembel_map
>> sde will be stored to mpp->pg. When update_pathvec_from_dm, sde will
>> be
>> stored again.
> 
> in ev_remove_path(), we set INIT_REMOVED state. That means this path
> won't be used in the RELOAD ioctl. If the ioctl is successful, it will
> be gone from the kernel (for the time being). Later, sync_map_state()
> will figure this out, and remove the path. If the parallel "multipath"
> command reloads the map after multipathd's RELOAD operation, and before
> update_multipath_table() called, re-adding the path, the path will be
> read back from the kernel, and sync_map_state() will not delete the it.
>>From multipathd's point of view it will remain in INIT_REMOVED state,
> though. multipathd will not try to reload the map again unless it's
> asked to do so with another "add path" or "del path" command. This
> results in an inconsistent internal state between multipathd and the
> kernel (multipathd considers the path as REMOVED even though it's still
> present in the map). I don't think this can be generally avoided if we
> allow multipath to make changes to kernel maps while multipathd is
> running. (*)
> 
> We could avoid this by delegating these commands from multipath to
> multipathd, as we already do for some other commands. "multipath"
> without any arguments would map to a "multipathd reconfigure" command,
> while "multipath $DEVICE" would be translated to "multipathd add map"
> or "multipathd add path".
> 
> Does this make sense?
> 
> Regards,
> Martin
> 
> (*) Well: we *could* try to analyze incoming uevents, and distinguish
> those that multipathd has initiated itself from other, externally-
> triggered ones. Then if an external event arrives, adding a path which
> we had previously removed, and this external event arrives after the
> remove event we initiated, we could in theory infer that an external
> program had reverted the change we just made. But this would result in
> complex and pretty fragile logic. I'm not sure if it's worth it. The
> "delegating" approach is safer and cleaner IMO.
> 
> 
> 
> .
> 

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2020-08-24 13:07 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-18 12:59 [PATCH 0/5] multipath-tools series: coredump and memory leak bugfix lixiaokeng
2020-08-18 13:02 ` [PATCH 1/5 v4] libmultipath fix a memory leak in set_ble_device lixiaokeng
2020-08-18 15:57   ` Martin Wilck
2020-08-18 13:06 ` [PATCH 2/5] libmultipath fix NULL dereference in select_action lixiaokeng
2020-08-18 16:28   ` Martin Wilck
2020-08-21  4:41     ` lixiaokeng
2020-08-18 13:08 ` [PATCH 3/5] multipathd: add reclear_pp_from_mpp in ev_remove_path lixiaokeng
2020-08-18 16:36   ` Martin Wilck
2020-08-20 14:51     ` lixiaokeng
2020-08-20 15:22       ` Martin Wilck
2020-08-24 13:07         ` lixiaokeng
2020-08-18 13:09 ` [PATCH 4/5] multipathd: disable queueing for recreated map in uev_remove_map lixiaokeng
2020-08-18 19:23   ` Martin Wilck
2020-08-19  8:48     ` lixiaokeng
2020-08-19  9:27   ` Martin Wilck
2020-08-18 13:11 ` [PATCH 5/5] libmultipath fix daemon memory leak in disassemble_map lixiaokeng
2020-08-19  1:42   ` lixiaokeng
2020-08-19  1:50 ` lixiaokeng
2020-08-19  8:26   ` Martin Wilck
2020-08-19  8:45     ` lixiaokeng

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.