* [PATCHv2] dm-multipath: Accept failed paths for multipath maps
@ 2013-12-18  7:52 Hannes Reinecke
  2013-12-18 14:08 ` Mike Snitzer
  0 siblings, 1 reply; 26+ messages in thread
From: Hannes Reinecke @ 2013-12-18  7:52 UTC (permalink / raw)
  To: Alasdair Kergon; +Cc: dm-devel, Sean Stewart, Mike Snitzer

The multipath kernel module currently rejects any map with an invalid
device. However, since multipathd processes events serially, it will
try to push a map with invalid devices if more than one device fails
at the same time.
So we might as well accept those maps and make sure to mark the
invalid paths as down.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 drivers/md/dm-mpath.c | 76 +++++++++++++++++++++++++++++++++++++++++----------
 drivers/md/dm-mpath.h |  1 +
 drivers/md/dm-table.c |  7 +++--
 3 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6eb9dc9..fd246f7 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -166,7 +166,7 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
 
 	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
 		list_del(&pgpath->list);
-		if (m->hw_handler_name)
+		if (m->hw_handler_name && pgpath->path.dev)
 			scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
 		dm_put_device(ti, pgpath->path.dev);
 		free_pgpath(pgpath);
@@ -303,6 +303,11 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
 
 	m->current_pgpath = path_to_pgpath(path);
 
+	if (!m->current_pgpath->path.dev) {
+		m->current_pgpath = NULL;
+		return -ENODEV;
+	}
+
 	if (m->current_pg != pg)
 		__switch_pg(m, m->current_pgpath);
 
@@ -573,6 +578,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 {
 	int r;
 	struct pgpath *p;
+	const char *path;
 	struct multipath *m = ti->private;
 	struct request_queue *q = NULL;
 	const char *attached_handler_name;
@@ -587,17 +593,40 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 
-	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+	path = dm_shift_arg(as);
+	r = dm_get_device(ti, path, dm_table_get_mode(ti->table),
 			  &p->path.dev);
 	if (r) {
-		ti->error = "error getting device";
-		goto bad;
+		unsigned major, minor;
+
+		/* Try to add a failed device */
+		if (r == -ENXIO && sscanf(path, "%u:%u", &major, &minor) == 2) {
+			dev_t dev;
+
+			/* Extract the major/minor numbers */
+			dev = MKDEV(major, minor);
+			if (MAJOR(dev) != major || MINOR(dev) != minor) {
+				/* Nice try, didn't work */
+				DMWARN("Invalid device path %s", path);
+				ti->error = "error converting devnum";
+				goto bad;
+			}
+			DMWARN("adding disabled device %d:%d", major, minor);
+			p->path.dev = NULL;
+			format_dev_t(p->path.pdev, dev);
+			p->is_active = 0;
+		} else {
+			ti->error = "error getting device";
+			goto bad;
+		}
+	} else {
+		memcpy(p->path.pdev, p->path.dev->name, 16);
 	}
 
-	if (m->retain_attached_hw_handler || m->hw_handler_name)
+	if (p->path.dev)
 		q = bdev_get_queue(p->path.dev->bdev);
 
-	if (m->retain_attached_hw_handler) {
+	if (q && (m->retain_attached_hw_handler || m->hw_handler_name)) {
 		attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
 		if (attached_handler_name) {
 			/*
@@ -616,7 +645,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 		}
 	}
 
-	if (m->hw_handler_name) {
+	if (q && m->hw_handler_name) {
 		/*
 		 * Increments scsi_dh reference, even when using an
 		 * already-attached handler.
@@ -655,6 +684,11 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 		goto bad;
 	}
 
+	if (!p->is_active) {
+		ps->type->fail_path(ps, &p->path);
+		p->fail_count++;
+		m->nr_valid_paths--;
+	}
 	return p;
 
  bad:
@@ -1005,7 +1039,7 @@ static int fail_path(struct pgpath *pgpath)
 	if (!pgpath->is_active)
 		goto out;
 
-	DMWARN("Failing path %s.", pgpath->path.dev->name);
+	DMWARN("Failing path %s.", pgpath->path.pdev);
 
 	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
 	pgpath->is_active = 0;
@@ -1017,7 +1051,7 @@ static int fail_path(struct pgpath *pgpath)
 		m->current_pgpath = NULL;
 
 	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
-		      pgpath->path.dev->name, m->nr_valid_paths);
+		       pgpath->path.pdev, m->nr_valid_paths);
 
 	schedule_work(&m->trigger_event);
 
@@ -1041,6 +1075,12 @@ static int reinstate_path(struct pgpath *pgpath)
 	if (pgpath->is_active)
 		goto out;
 
+	if (!pgpath->path.dev) {
+		DMWARN("Cannot reinstate disabled path %s", pgpath->path.pdev);
+		r = -ENODEV;
+		goto out;
+	}
+
 	if (!pgpath->pg->ps.type->reinstate_path) {
 		DMWARN("Reinstate path not supported by path selector %s",
 		       pgpath->pg->ps.type->name);
@@ -1063,7 +1103,7 @@ static int reinstate_path(struct pgpath *pgpath)
 	}
 
 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
-		      pgpath->path.dev->name, m->nr_valid_paths);
+		       pgpath->path.pdev, m->nr_valid_paths);
 
 	schedule_work(&m->trigger_event);
 
@@ -1083,6 +1123,9 @@ static int action_dev(struct multipath *m, struct dm_dev *dev,
 	struct pgpath *pgpath;
 	struct priority_group *pg;
 
+	if (!dev)
+		return 0;
+
 	list_for_each_entry(pg, &m->priority_groups, list) {
 		list_for_each_entry(pgpath, &pg->pgpaths, list) {
 			if (pgpath->path.dev == dev)
@@ -1272,8 +1315,9 @@ static void activate_path(struct work_struct *work)
 	struct pgpath *pgpath =
 		container_of(work, struct pgpath, activate_path.work);
 
-	scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
-				pg_init_done, pgpath);
+	if (pgpath->path.dev && pgpath->is_active)
+		scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
+				 pg_init_done, pgpath);
 }
 
 static int noretry_error(int error)
@@ -1488,7 +1532,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
 			       pg->ps.type->info_args);
 
 			list_for_each_entry(p, &pg->pgpaths, list) {
-				DMEMIT("%s %s %u ", p->path.dev->name,
+				DMEMIT("%s %s %u ", p->path.pdev,
 				       p->is_active ? "A" : "F",
 				       p->fail_count);
 				if (pg->ps.type->status)
@@ -1514,7 +1558,7 @@ static void multipath_status(struct dm_target *ti, status_type_t type,
 			       pg->ps.type->table_args);
 
 			list_for_each_entry(p, &pg->pgpaths, list) {
-				DMEMIT("%s ", p->path.dev->name);
+				DMEMIT("%s ", p->path.pdev);
 				if (pg->ps.type->status)
 					sz += pg->ps.type->status(&pg->ps,
 					      &p->path, type, result + sz,
@@ -1611,7 +1655,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 
 	pgpath = m->current_pgpath;
 
-	if (pgpath) {
+	if (pgpath && pgpath->path.dev) {
 		bdev = pgpath->path.dev->bdev;
 		mode = pgpath->path.dev->mode;
 	}
@@ -1645,6 +1689,8 @@ static int multipath_iterate_devices(struct dm_target *ti,
 
 	list_for_each_entry(pg, &m->priority_groups, list) {
 		list_for_each_entry(p, &pg->pgpaths, list) {
+			if (!p->path.dev)
+				continue;
 			ret = fn(ti, p->path.dev, ti->begin, ti->len, data);
 			if (ret)
 				goto out;
diff --git a/drivers/md/dm-mpath.h b/drivers/md/dm-mpath.h
index e230f71..f97388d 100644
--- a/drivers/md/dm-mpath.h
+++ b/drivers/md/dm-mpath.h
@@ -12,6 +12,7 @@
 struct dm_dev;
 
 struct dm_path {
+	char pdev[16];		/* Requested physical device */
 	struct dm_dev *dev;	/* Read-only */
 	void *pscontext;	/* For path-selector use */
 };
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 465f08c..253c908 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -518,9 +518,12 @@ EXPORT_SYMBOL_GPL(dm_set_device_limits);
  */
 void dm_put_device(struct dm_target *ti, struct dm_dev *d)
 {
-	struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal,
-						  dm_dev);
+	struct dm_dev_internal *dd;
+
+	if (!d)
+		return;
 
+	dd = container_of(d, struct dm_dev_internal, dm_dev);
 	if (atomic_dec_and_test(&dd->count)) {
 		close_dev(dd, ti->table->md);
 		list_del(&dd->list);
-- 
1.7.12.4

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2013-12-18  7:52 [PATCHv2] dm-multipath: Accept failed paths for multipath maps Hannes Reinecke
@ 2013-12-18 14:08 ` Mike Snitzer
  2013-12-18 14:25   ` Hannes Reinecke
  0 siblings, 1 reply; 26+ messages in thread
From: Mike Snitzer @ 2013-12-18 14:08 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: dm-devel, Sean Stewart, Alasdair Kergon

On Wed, Dec 18 2013 at  2:52am -0500,
Hannes Reinecke <hare@suse.de> wrote:

> The multipath kernel module currently rejects any map with an invalid
> device. However, since multipathd processes events serially, it will
> try to push a map with invalid devices if more than one device fails
> at the same time.
> So we might as well accept those maps and make sure to mark the
> invalid paths as down.

Why is it so desirable to do this?  Reduced latency to restore at least
one valid path when a bunch of paths go down?

Why can't we just rely on userspace eventually figuring out which paths
are failed and pushing a valid map down?

Are there favorable reports that this new behavior actually helps?
Please quantify how.

(As an aside, really not a fan of parse_path doing what you've
suggested.. will look closer once I have more justification to do so)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2013-12-18 14:08 ` Mike Snitzer
@ 2013-12-18 14:25   ` Hannes Reinecke
  2013-12-18 15:28     ` Stewart, Sean
  2013-12-18 15:28     ` Merla, ShivaKrishna
  0 siblings, 2 replies; 26+ messages in thread
From: Hannes Reinecke @ 2013-12-18 14:25 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: dm-devel, Sean Stewart, Alasdair Kergon

On 12/18/2013 03:08 PM, Mike Snitzer wrote:
> On Wed, Dec 18 2013 at  2:52am -0500,
> Hannes Reinecke <hare@suse.de> wrote:
> 
>> The multipath kernel module currently rejects any map with an invalid
>> device. However, since multipathd processes events serially, it will
>> try to push a map with invalid devices if more than one device fails
>> at the same time.
>> So we might as well accept those maps and make sure to mark the
>> invalid paths as down.
> 
> Why is it so desirable to do this?  Reduced latency to restore at least
> one valid path when a bunch of paths go down?
> 
Without this patch multipathd cannot update the map as long as it
hasn't caught up with udev.
During that time any scheduling decisions by the kernel part are
necessarily wrong, as it has to rely on the old map.

> Why can't we just rely on userspace eventually figuring out which paths
> are failed and pushing a valid map down?
> 
Oh, you can. This is what we're doing now :-)

But it will lead to spurious errors during failover when multipathd
is trying to push down maps with invalid devices.

You are also running into a race window between checking the path in
multipathd and pushing down the map; if the device disappears during
that time you won't be able to push down the map.
If that happens during boot multipathd won't be able to create the
map at all, so you might not be able to boot here.
With that patch you at least have the device-mapper device, allowing
booting to continue.
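
To illustrate (all device numbers below are made up): multipathd may end
up pushing down a table like the one in this sketch while 8:32 has
already gone away; with this patch the load succeeds and 8:32 simply
starts out as a failed path instead of failing the whole reload:

  # illustration only -- 8:32 no longer exists at the time of the reload
  echo "0 2097152 multipath 1 queue_if_no_path 0 1 1 round-robin 0 2 1 8:16 1000 8:32 1000" \
      | dmsetup load mpatha
  dmsetup resume mpatha
  dmsetup status mpatha   # 8:32 is reported as a failed ("F") path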

> Are there favorable reports that this new behavior actually helps?
> Please quantify how.
> 
NetApp will have; they've been pushing me to forward this patch.
Sean?

BTW, SUSE / SLES is running happily with this patch for years now.
So it can't be at all bad ...

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2013-12-18 14:25   ` Hannes Reinecke
@ 2013-12-18 15:28     ` Stewart, Sean
  2013-12-19  8:21       ` Bart Van Assche
  2014-07-18  0:04       ` Mike Snitzer
  2013-12-18 15:28     ` Merla, ShivaKrishna
  1 sibling, 2 replies; 26+ messages in thread
From: Stewart, Sean @ 2013-12-18 15:28 UTC (permalink / raw)
  To: device-mapper development; +Cc: Alasdair Kergon, Stewart, Sean, Mike Snitzer

On Wed, 2013-12-18 at 15:25 +0100, Hannes Reinecke wrote:
> On 12/18/2013 03:08 PM, Mike Snitzer wrote:
> > On Wed, Dec 18 2013 at  2:52am -0500,
> > Hannes Reinecke <hare@suse.de> wrote:
> > 
> >> The multipath kernel module currently rejects any map with an invalid
> >> device. However, since multipathd processes events serially, it will
> >> try to push a map with invalid devices if more than one device fails
> >> at the same time.
> >> So we might as well accept those maps and make sure to mark the
> >> invalid paths as down.
> > 
> > Why is it so desirable to do this?  Reduced latency to restore at least
> > one valid path when a bunch of paths go down?
> > 
> Without this patch multipathd cannot update the map as long as it
> hasn't caught up with udev.
> During that time any scheduling decisions by the kernel part are
> necessarily wrong, as it has to rely on the old map.
> 
> > Why can't we just rely on userspace eventually figuring out which paths
> > are failed and pushing a valid map down?
> > 
> Oh, you can. This is what we're doing now :-)
> 
> But it will lead to spurious errors during failover when multipathd
> is trying to push down maps with invalid devices.
> 
> You are also running into a race window between checking the path in
> multipathd and pushing down the map; if the device disappears during
> that time you won't be able to push down the map.
> If that happens during boot multipathd won't be able to create the
> map at all, so you might not be able to boot here.
> With that patch you at least have the device-mapper device, allowing
> booting to continue.
> 
> > Are there favorable reports that this new behavior actually helps?
> > Please quantify how.
> > 
> NetApp will have; they've been pushing me to forward this patch.
> Sean?
> 
Agree.  Internally, we have run into numerous cases with Red Hat where
the "failed in domap" error will occur, due to user space being behind,
or device detaching taking too long.  The most severe case is with
InfiniBand, where the LLD may place a device offline, then every single
reload that is trying to add a good path in will fail.  I will qualify
this by saying that I realize it is a problem that the device gets
placed offline in the first place, but this patch would allow it a
chance to continue on. The user still has to take manual steps to fix
the problem in this case, but it seems less disruptive to applications.
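
For what it's worth, the manual fix in that InfiniBand case usually
amounts to flipping the SCSI device state back by hand, roughly as in
the sketch below (the device name is a placeholder, and the multipathd
command assumes a reasonably recent multipath-tools):

  # hypothetical recovery for a path the LLD has marked offline
  cat /sys/block/sdX/device/state            # reads "offline"
  echo running > /sys/block/sdX/device/state
  multipathd -k"reinstate path /dev/sdX"     # ask multipathd to pick it up again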

The device detaching case could be kind of disruptive to a user in the
scenario where they are upgrading the firmware on a NetApp E-Series box,
and with this patch, at least a good path can be added back in ASAP.

> BTW, SUSE / SLES is running happily with this patch for years now.
> So it can't be at all bad ...
> 
> Cheers,
> 
> Hannes

Also agreed.  We have seen this functionality in SLES for years, and
have not run into a problem with it.

Thanks,
Sean

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2013-12-18 14:25   ` Hannes Reinecke
  2013-12-18 15:28     ` Stewart, Sean
@ 2013-12-18 15:28     ` Merla, ShivaKrishna
  1 sibling, 0 replies; 26+ messages in thread
From: Merla, ShivaKrishna @ 2013-12-18 15:28 UTC (permalink / raw)
  To: device-mapper development, Mike Snitzer; +Cc: Stewart, Sean, Alasdair Kergon

> From: dm-devel-bounces@redhat.com [mailto:dm-devel-
> bounces@redhat.com] On Behalf Of Hannes Reinecke
> Sent: Wednesday, December 18, 2013 8:25 AM
> To: Mike Snitzer
> Cc: dm-devel@redhat.com; Stewart, Sean; Alasdair Kergon
> Subject: Re: [dm-devel] dm-multipath: Accept failed paths for multipath
> maps
> 
> On 12/18/2013 03:08 PM, Mike Snitzer wrote:
> > On Wed, Dec 18 2013 at  2:52am -0500,
> > Hannes Reinecke <hare@suse.de> wrote:
> >
> >> The multipath kernel module currently rejects any map with an invalid
> >> device. However, since multipathd processes events serially, it will
> >> try to push a map with invalid devices if more than one device fails
> >> at the same time.
> >> So we might as well accept those maps and make sure to mark the
> >> invalid paths as down.
> >
> > Why is it so desirable to do this?  Reduced latency to restore at least
> > one valid path when a bunch of paths go down?
> >
> Without this patch multipathd cannot update the map as long as it
> hasn't caught up with udev.
> During that time any scheduling decisions by the kernel part are
> necessarily wrong, as it has to rely on the old map.
> 
> > Why can't we just rely on userspace eventually figuring out which paths
> > are failed and pushing a valid map down?
> >
> Oh, you can. This is what we're doing now :-)
> 
> But it will lead to spurious errors during failover when multipathd
> is trying to push down maps with invalid devices.
> 
> You are also running into a race window between checking the path in
> multipathd and pushing down the map; if the device disappears during
> that time you won't be able to push down the map.
> If that happens during boot multipathd won't be able to create the
> map at all, so you might not be able to boot here.
> With that patch you at least have the device-mapper device, allowing
> booting to continue.
> 
> > Are there favorable reports that this new behavior actually helps?
> > Please quantify how.
> >
> NetApp will have; they've been pushing me to forward this patch.
> Sean?
Yes, we have been seeing these issues with RHEL, where a table load fails if
even one device in the map cannot be opened. This might be due to transient errors or
a device being in the offline state. Because of this, add-path events fail and new paths will not be added to the map.
When we reboot the alternate controllers we see all paths fail. We saw this with IB and SAS configs.
This is a much-needed fix for us during controller firmware upgrade tests.
> 
> BTW, SUSE / SLES is running happily with this patch for years now.
> So it can't be at all bad ...
> 
> Cheers,
> 
> Hannes
> --
> Dr. Hannes Reinecke		      zSeries & Storage
> hare@suse.de			      +49 911 74053 688
> SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
> GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)
> 
> --
> dm-devel mailing list
> dm-devel@redhat.com
> https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2013-12-18 15:28     ` Stewart, Sean
@ 2013-12-19  8:21       ` Bart Van Assche
  2014-02-19  1:14         ` Mike Snitzer
  2014-07-18  0:04       ` Mike Snitzer
  1 sibling, 1 reply; 26+ messages in thread
From: Bart Van Assche @ 2013-12-19  8:21 UTC (permalink / raw)
  To: device-mapper development; +Cc: Mike Snitzer, Stewart, Sean, Alasdair Kergon

On 12/18/13 16:28, Stewart, Sean wrote:
> The most severe case is with
> InfiniBand, where the LLD may place a device offline, then every single
> reload that is trying to add a good path in will fail.  I will qualify
> this by saying that I realize it is a problem that the device gets
> placed offline in the first place, but this patch would allow it a
> chance to continue on. The user still has to take manual steps to fix
> the problem in this case, but it seems less disruptive to applications.

Are you perhaps referring to the SRP initiator ? The above is correct
for old versions of the SRP initiator (< Linux kernel 3.12) but no
longer for upstream kernel versions >= 3.12. An effort is ongoing to
backport the latest SRP initiator changes into RHEL and SLES.

Bart.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2013-12-19  8:21       ` Bart Van Assche
@ 2014-02-19  1:14         ` Mike Snitzer
  2014-02-19  8:11           ` Bart Van Assche
  0 siblings, 1 reply; 26+ messages in thread
From: Mike Snitzer @ 2014-02-19  1:14 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: device-mapper development, Stewart, Sean, Alasdair Kergon

On Thu, Dec 19 2013 at  3:21am -0500,
Bart Van Assche <bvanassche@acm.org> wrote:

> On 12/18/13 16:28, Stewart, Sean wrote:
> > The most severe case is with
> > InfiniBand, where the LLD may place a device offline, then every single
> > reload that is trying to add a good path in will fail.  I will qualify
> > this by saying that I realize it is a problem that the device gets
> > placed offline in the first place, but this patch would allow it a
> > chance to continue on. The user still has to take manual steps to fix
> > the problem in this case, but it seems less disruptive to applications.
> 
> Are you perhaps referring to the SRP initiator ? The above is correct
> for old versions of the SRP initiator (< Linux kernel 3.12) but no
> longer for upstream kernel versions >= 3.12. An effort is ongoing to
> backport the latest SRP initiator changes into RHEL and SLES.

Hi Bart,

It has been a while.  Where do things stand for the RHEL backport of
these SRP initiator changes?  Are we talking RHEL6 and RHEL7?  Who at
Red Hat is your contact for this work?

Regardless of whether some version of Hannes's patch is applied to
upstream dm-mpath: I'd really like to see this offlining problem that
forced the need for this dm-mpath change addressed.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-02-19  1:14         ` Mike Snitzer
@ 2014-02-19  8:11           ` Bart Van Assche
  0 siblings, 0 replies; 26+ messages in thread
From: Bart Van Assche @ 2014-02-19  8:11 UTC (permalink / raw)
  To: Mike Snitzer; +Cc: device-mapper development, Stewart, Sean, Alasdair Kergon

On 02/19/14 02:14, Mike Snitzer wrote:
> On Thu, Dec 19 2013 at  3:21am -0500,
> Bart Van Assche <bvanassche@acm.org> wrote:
> 
>> On 12/18/13 16:28, Stewart, Sean wrote:
>>> The most severe case is with
>>> InfiniBand, where the LLD may place a device offline, then every single
>>> reload that is trying to add a good path in will fail.  I will qualify
>>> this by saying that I realize it is a problem that the device gets
>>> placed offline in the first place, but this patch would allow it a
>>> chance to continue on. The user still has to take manual steps to fix
>>> the problem in this case, but it seems less disruptive to applications.
>>
>> Are you perhaps referring to the SRP initiator ? The above is correct
>> for old versions of the SRP initiator (< Linux kernel 3.12) but no
>> longer for upstream kernel versions >= 3.12. An effort is ongoing to
>> backport the latest SRP initiator changes into RHEL and SLES.
> 
> It has been a while.  Where do things stand for the RHEL backport of
> these SRP initiator changes?  Are we talking RHEL6 and RHEL7?  Who at
> Red Hat is your contact for this work?
> 
> Regardless of whether some version of Hannes's patch is applied to
> upstream dm-mpath: I'd really like to see this offlining problem that
> forced the need for this dm-mpath change addressed.

Hello Mike,

At https://github.com/bvanassche/ib_srp-backport a version of the SRP
initiator driver can be found that works fine with at least RHEL 6.3,
6.4, 6.5 and SLES 11. I have not yet had a chance to test that driver
against RHEL 7. The SRP initiator driver backport includes all changes
that have been accepted upstream up to and including kernel 3.14-rc3.
That means that both versions of the SRP initiator driver include the
code that changes the SCSI device state from "offline" into "running"
after reconnecting succeeded after a cable pull. The relevant RHEL
bugzilla entries are as follows:
* https://bugzilla.redhat.com/show_bug.cgi?id=1012926
* https://bugzilla.redhat.com/show_bug.cgi?id=1055654

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2013-12-18 15:28     ` Stewart, Sean
  2013-12-19  8:21       ` Bart Van Assche
@ 2014-07-18  0:04       ` Mike Snitzer
  2014-07-18  0:23         ` Mike Snitzer
                           ` (2 more replies)
  1 sibling, 3 replies; 26+ messages in thread
From: Mike Snitzer @ 2014-07-18  0:04 UTC (permalink / raw)
  To: Stewart, Sean; +Cc: device-mapper development, Bryn Reeves, Alasdair Kergon

On Wed, Dec 18 2013 at 10:28am -0500,
Stewart, Sean <Sean.Stewart@netapp.com> wrote:

> On Wed, 2013-12-18 at 15:25 +0100, Hannes Reinecke wrote:
> > On 12/18/2013 03:08 PM, Mike Snitzer wrote:
> > > On Wed, Dec 18 2013 at  2:52am -0500,
> > > Hannes Reinecke <hare@suse.de> wrote:
> > > 
> > >> The multipath kernel module currently rejects any map with an invalid
> > >> device. However, since multipathd processes events serially, it will
> > >> try to push a map with invalid devices if more than one device fails
> > >> at the same time.
> > >> So we might as well accept those maps and make sure to mark the
> > >> invalid paths as down.
> > > 
> > > Why is it so desirable to do this?  Reduced latency to restore at least
> > > one valid path when a bunch of paths go down?
> > > 
> > Without this patch multipathd cannot update the map as long as it
> > hasn't caught up with udev.
> > During that time any scheduling decisions by the kernel part are
> > necessarily wrong, as it has to rely on the old map.
> > 
> > > Why can't we just rely on userspace eventually figuring out which paths
> > > are failed and pushing a valid map down?
> > > 
> > Oh, you can. This is what we're doing now :-)
> > 
> > But it will lead to spurious errors during failover when multipathd
> > is trying to push down maps with invalid devices.
> > 
> > You are also running into a race window between checking the path in
> > multipathd and pushing down the map; if the device disappears during
> > that time you won't be able to push down the map.
> > If that happens during boot multipathd won't be able to create the
> > map at all, so you might not be able to boot here.
> > With that patch you at least have the device-mapper device, allowing
> > booting to continue.
> > 
> > > Are there favorable reports that this new behavior actually helps?
> > > Please quantify how.
> > > 
> > NetApp will have; they've been pushing me to forward this patch.
> > Sean?
> > 
> Agree.  Internally, we have run into numerous cases with Red Hat where
> the "failed in domap" error will occur, due to user space being behind,
> or device detaching taking too long.  The most severe case is with
> InfiniBand, where the LLD may place a device offline, then every single
> reload that is trying to add a good path in will fail.  I will qualify
> this by saying that I realize it is a problem that the device gets
> placed offline in the first place, but this patch would allow it a
> chance to continue on. The user still has to take manual steps to fix
> the problem in this case, but it seems less disruptive to applications.
> 
> The device detaching case could be kind of disruptive to a user in the
> scenario where they are upgrading the firmware on a NetApp E-Series box,
> and with this patch, at least a good path can be added back in ASAP.
> 
> > BTW, SUSE / SLES is running happily with this patch for years now.
> > So it can't be at all bad ...
> > 
> > Cheers,
> > 
> > Hannes
> 
> Also agreed.  We have seen this functionality in SLES for years, and
> have not run into a problem with it.

Revisiting this can of worms...

As part of full due-diligence on the approach that SUSE and NetApp have
seemingly enjoyed "for years" I reviewed Hannes' v3 patch, fixed one
issue and did some cleanup.  I then converted over to using a slightly
different approach where-in the DM core becomes a more willing
co-conspirator in this hack by introducing the ability to have
place-holder devices (dm_dev without an opened bdev) referenced in a DM
table.  The work is here:
http://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=throwaway-dm-mpath-placeholder-devs

Here is the diffstat of all 3 patches rolled up:

 git diff d4bdac727f1e09412c762f177790a96432738264^..7681ae5ddb5d567800023477be7ddc68f9812a95 | diffstat
 dm-mpath.c |   51 +++++++++++++++++++++++++++++++++++----------------
 dm-table.c |   53 ++++++++++++++++++++++++++++++++++++++++-------------
 dm.c       |    5 ++---
 dm.h       |   12 ++++++++++++
 4 files changed, 89 insertions(+), 32 deletions(-)

But it was only compile tested, because doing more validation of this
work would only make sense if it had more than a snowball's chance in
hell of making it upstream.  Sadly it doesn't have a good chance; it
would require some compelling proof:
1) that mpath is bullet-proof no matter how crazy a user got with fake
   place-holder devices in their DM tables (coupled with reinstate_path
   messages, etc)
2) that the storage configs that experienced problems with the current
   DM mpath dm_get_device() failures weren't broken to start with (for
   instance ib srp is apparently fixed now.. but those fixes are still
   working their way into RHEL) -- or put differently: I need _details_
   on the NetApp or other legit storage configs that are still
   experiencing problems without a solution to this problem.

... and even with that proof I'm pretty sure Alasdair will hate this
place-holder approach and will push for some other solution.
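
To make point 1 above concrete, the kind of abuse I'd have to prove safe
is a user poking at a map that still carries a place-holder path -- the
names and numbers below are invented and none of this has been run:

  # hypothetical: mpatha was loaded while 8:32 did not exist, so 8:32 is
  # a place-holder path; none of the following should be able to oops it
  dmsetup message mpatha 0 "reinstate_path 8:32"   # must not crash; ideally fails cleanly
  dmsetup message mpatha 0 "fail_path 8:32"
  dmsetup suspend mpatha && dmsetup resume mpatha  # reload/resume must survive too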

I'm going away on paternity leave until Sept 8... my _hope_ is that
someone fixes multipath-tools to suck less or that a more clever
solution to this problem is developed locally in DM mpath.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18  0:04       ` Mike Snitzer
@ 2014-07-18  0:23         ` Mike Snitzer
  2014-07-18  6:00           ` Hannes Reinecke
  2014-07-18  0:29         ` John Utz
  2014-07-18  6:12         ` Hannes Reinecke
  2 siblings, 1 reply; 26+ messages in thread
From: Mike Snitzer @ 2014-07-18  0:23 UTC (permalink / raw)
  To: Stewart, Sean; +Cc: device-mapper development, Bryn Reeves, Alasdair Kergon

On Thu, Jul 17 2014 at  8:04pm -0400,
Mike Snitzer <snitzer@redhat.com> wrote:
 
> Revisiting this can of worms...
> 
> As part of full due-diligence on the approach that SUSE and NetApp have
> seemingly enjoyed "for years" I reviewed Hannes' v3 patch, fixed one
> issue and did some cleanup.  I then converted over to using a slightly
> different approach where-in the DM core becomes a more willing
> co-conspirator in this hack by introducing the ability to have
> place-holder devices (dm_dev without an opened bdev) referenced in a DM
> table.  The work is here:
> http://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=throwaway-dm-mpath-placeholder-devs

Here is the rolled up patch (the individual commits in the above branch
are rather noisy given the sequencing):

 drivers/md/dm-mpath.c | 51 +++++++++++++++++++++++++++++++++----------------
 drivers/md/dm-table.c | 53 ++++++++++++++++++++++++++++++++++++++-------------
 drivers/md/dm.c       |  5 ++---
 drivers/md/dm.h       | 12 ++++++++++++
 4 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 833d7e7..6c3ddd2 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -162,7 +162,7 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
 
 	list_for_each_entry_safe(pgpath, tmp, pgpaths, list) {
 		list_del(&pgpath->list);
-		if (m->hw_handler_name)
+		if (m->hw_handler_name && pgpath->path.dev->bdev)
 			scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
 		dm_put_device(ti, pgpath->path.dev);
 		free_pgpath(pgpath);
@@ -306,6 +306,11 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
 
 	m->current_pgpath = path_to_pgpath(path);
 
+	if (!m->current_pgpath->path.dev->bdev) {
+		m->current_pgpath = NULL;
+		return -ENODEV;
+	}
+
 	if (m->current_pg != pg)
 		__switch_pg(m, m->current_pgpath);
 
@@ -509,6 +514,9 @@ static int parse_path_selector(struct dm_arg_set *as, struct priority_group *pg,
 	return 0;
 }
 
+static void __fail_path(struct pgpath *pgpath, struct multipath *m,
+			struct path_selector *ps);
+
 static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps,
 			       struct dm_target *ti)
 {
@@ -528,17 +536,19 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 
-	r = dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
-			  &p->path.dev);
+	r = __dm_get_device(ti, dm_shift_arg(as), dm_table_get_mode(ti->table),
+			    &p->path.dev, true);
 	if (r) {
 		ti->error = "error getting device";
 		goto bad;
 	}
 
-	if (m->retain_attached_hw_handler || m->hw_handler_name)
+	if (p->path.dev->bdev)
 		q = bdev_get_queue(p->path.dev->bdev);
+	else
+		p->is_active = 0;
 
-	if (m->retain_attached_hw_handler) {
+	if (q && m->retain_attached_hw_handler) {
 		attached_handler_name = scsi_dh_attached_handler_name(q, GFP_KERNEL);
 		if (attached_handler_name) {
 			/*
@@ -557,7 +567,7 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 		}
 	}
 
-	if (m->hw_handler_name) {
+	if (q && m->hw_handler_name) {
 		/*
 		 * Increments scsi_dh reference, even when using an
 		 * already-attached handler.
@@ -596,6 +606,9 @@ static struct pgpath *parse_path(struct dm_arg_set *as, struct path_selector *ps
 		goto bad;
 	}
 
+	if (!p->is_active)
+		__fail_path(p, m, ps);
+
 	return p;
 
  bad:
@@ -912,6 +925,15 @@ static void multipath_dtr(struct dm_target *ti)
 	free_multipath(m);
 }
 
+static void __fail_path(struct pgpath *pgpath, struct multipath *m,
+			struct path_selector *ps)
+{
+	ps->type->fail_path(ps, &pgpath->path);
+	pgpath->fail_count++;
+
+	m->nr_valid_paths--;
+}
+
 /*
  * Take a path out of use.
  */
@@ -927,17 +949,14 @@ static int fail_path(struct pgpath *pgpath)
 
 	DMWARN("Failing path %s.", pgpath->path.dev->name);
 
-	pgpath->pg->ps.type->fail_path(&pgpath->pg->ps, &pgpath->path);
 	pgpath->is_active = 0;
-	pgpath->fail_count++;
-
-	m->nr_valid_paths--;
+	__fail_path(pgpath, m, &pgpath->pg->ps);
 
 	if (pgpath == m->current_pgpath)
 		m->current_pgpath = NULL;
 
 	dm_path_uevent(DM_UEVENT_PATH_FAILED, m->ti,
-		      pgpath->path.dev->name, m->nr_valid_paths);
+		       pgpath->path.dev->name, m->nr_valid_paths);
 
 	schedule_work(&m->trigger_event);
 
@@ -983,7 +1002,7 @@ static int reinstate_path(struct pgpath *pgpath)
 	}
 
 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
-		      pgpath->path.dev->name, m->nr_valid_paths);
+		       pgpath->path.dev->name, m->nr_valid_paths);
 
 	schedule_work(&m->trigger_event);
 
@@ -1195,7 +1214,7 @@ static void activate_path(struct work_struct *work)
 	struct pgpath *pgpath =
 		container_of(work, struct pgpath, activate_path.work);
 
-	if (pgpath->is_active)
+	if (pgpath->is_active && pgpath->path.dev->bdev)
 		scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev),
 				 pg_init_done, pgpath);
 	else
@@ -1528,7 +1547,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 
 	pgpath = m->current_pgpath;
 
-	if (pgpath) {
+	if (pgpath && pgpath->path.dev->bdev) {
 		bdev = pgpath->path.dev->bdev;
 		mode = pgpath->path.dev->mode;
 	}
@@ -1586,9 +1605,9 @@ out:
 
 static int __pgpath_busy(struct pgpath *pgpath)
 {
-	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
+	struct request_queue *q = dm_dev_get_queue(pgpath->path.dev);
 
-	return dm_underlying_device_busy(q);
+	return q && dm_underlying_device_busy(q);
 }
 
 /*
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3c72bf1..1ef1601 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -262,7 +262,7 @@ static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
 	struct dm_dev_internal *dd;
 
 	list_for_each_entry (dd, l, list)
-		if (dd->dm_dev.bdev->bd_dev == dev)
+		if (dd->dm_dev.bdev && dd->dm_dev.bdev->bd_dev == dev)
 			return dd;
 
 	return NULL;
@@ -317,12 +317,16 @@ static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
 	struct request_queue *q;
 	struct queue_limits *limits = data;
 	struct block_device *bdev = dev->bdev;
-	sector_t dev_size =
-		i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+	sector_t dev_size;
 	unsigned short logical_block_size_sectors =
 		limits->logical_block_size >> SECTOR_SHIFT;
 	char b[BDEVNAME_SIZE];
 
+	if (!bdev)
+		return 0;
+
+	dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+
 	/*
 	 * Some devices exist without request functions,
 	 * such as loop devices not yet bound to backing files.
@@ -393,6 +397,9 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
 	dd_new.dm_dev.mode |= new_mode;
 	dd_new.dm_dev.bdev = NULL;
 
+	if (!dd->dm_dev.bdev)
+		return 0;
+
 	r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md);
 	if (r)
 		return r;
@@ -407,8 +414,8 @@ static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
  * Add a device to the list, or just increment the usage count if
  * it's already present.
  */
-int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
-		  struct dm_dev **result)
+int __dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
+		    struct dm_dev **result, bool open_may_fail)
 {
 	int r;
 	dev_t uninitialized_var(dev);
@@ -443,7 +450,8 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 		dd->dm_dev.mode = mode;
 		dd->dm_dev.bdev = NULL;
 
-		if ((r = open_dev(dd, dev, t->md))) {
+		r = open_dev(dd, dev, t->md);
+		if (r && !open_may_fail) {
 			kfree(dd);
 			return r;
 		}
@@ -463,6 +471,13 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
 	*result = &dd->dm_dev;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(__dm_get_device);
+
+int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
+		  struct dm_dev **result)
+{
+	return __dm_get_device(ti, path, mode, result, false);
+}
 EXPORT_SYMBOL(dm_get_device);
 
 static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
@@ -470,9 +485,13 @@ static int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
 {
 	struct queue_limits *limits = data;
 	struct block_device *bdev = dev->bdev;
-	struct request_queue *q = bdev_get_queue(bdev);
+	struct request_queue *q;
 	char b[BDEVNAME_SIZE];
 
+	if (!bdev)
+		return 0;
+
+	q = bdev_get_queue(bdev);
 	if (unlikely(!q)) {
 		DMWARN("%s: Cannot set limits for nonexistent device %s",
 		       dm_device_name(ti->table->md), bdevname(bdev, b));
@@ -906,6 +925,8 @@ static int dm_table_set_type(struct dm_table *t)
 	/* Non-request-stackable devices can't be used for request-based dm */
 	devices = dm_table_get_devices(t);
 	list_for_each_entry(dd, devices, list) {
+		if (!dd->dm_dev.bdev)
+			continue;
 		if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) {
 			DMWARN("table load rejected: including"
 			       " non-request-stackable devices");
@@ -1043,6 +1064,8 @@ static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
 	struct gendisk *prev_disk = NULL, *template_disk = NULL;
 
 	list_for_each_entry(dd, devices, list) {
+		if (!dd->dm_dev.bdev)
+			continue;
 		template_disk = dd->dm_dev.bdev->bd_disk;
 		if (!blk_get_integrity(template_disk))
 			goto no_integrity;
@@ -1321,7 +1344,7 @@ static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
 				sector_t start, sector_t len, void *data)
 {
 	unsigned flush = (*(unsigned *)data);
-	struct request_queue *q = bdev_get_queue(dev->bdev);
+	struct request_queue *q = dm_dev_get_queue(dev);
 
 	return q && (q->flush_flags & flush);
 }
@@ -1373,7 +1396,7 @@ static bool dm_table_discard_zeroes_data(struct dm_table *t)
 static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
 			    sector_t start, sector_t len, void *data)
 {
-	struct request_queue *q = bdev_get_queue(dev->bdev);
+	struct request_queue *q = dm_dev_get_queue(dev);
 
 	return q && blk_queue_nonrot(q);
 }
@@ -1381,7 +1404,7 @@ static int device_is_nonrot(struct dm_target *ti, struct dm_dev *dev,
 static int device_is_not_random(struct dm_target *ti, struct dm_dev *dev,
 			     sector_t start, sector_t len, void *data)
 {
-	struct request_queue *q = bdev_get_queue(dev->bdev);
+	struct request_queue *q = dm_dev_get_queue(dev);
 
 	return q && !blk_queue_add_random(q);
 }
@@ -1406,7 +1429,7 @@ static bool dm_table_all_devices_attribute(struct dm_table *t,
 static int device_not_write_same_capable(struct dm_target *ti, struct dm_dev *dev,
 					 sector_t start, sector_t len, void *data)
 {
-	struct request_queue *q = bdev_get_queue(dev->bdev);
+	struct request_queue *q = dm_dev_get_queue(dev);
 
 	return q && !q->limits.max_write_same_sectors;
 }
@@ -1433,7 +1456,7 @@ static bool dm_table_supports_write_same(struct dm_table *t)
 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
 				  sector_t start, sector_t len, void *data)
 {
-	struct request_queue *q = bdev_get_queue(dev->bdev);
+	struct request_queue *q = dm_dev_get_queue(dev);
 
 	return q && blk_queue_discard(q);
 }
@@ -1616,9 +1639,13 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 	int r = 0;
 
 	list_for_each_entry(dd, devices, list) {
-		struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
+		struct request_queue *q;
 		char b[BDEVNAME_SIZE];
 
+		if (!dd->dm_dev.bdev)
+			continue;
+
+		q = bdev_get_queue(dd->dm_dev.bdev);
 		if (likely(q))
 			r |= bdi_congested(&q->backing_dev_info, bdi_bits);
 		else
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 32b958d..60989fa 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2148,10 +2148,9 @@ static int dm_device_merge_is_compulsory(struct dm_target *ti,
 					 struct dm_dev *dev, sector_t start,
 					 sector_t len, void *data)
 {
-	struct block_device *bdev = dev->bdev;
-	struct request_queue *q = bdev_get_queue(bdev);
+	struct request_queue *q = dm_dev_get_queue(dev);
 
-	return dm_queue_merge_is_compulsory(q);
+	return q && dm_queue_merge_is_compulsory(q);
 }
 
 /*
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index e81d215..aa30a05 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -85,6 +85,18 @@ struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
 
 int dm_setup_md_queue(struct mapped_device *md);
 
+static inline struct request_queue *dm_dev_get_queue(struct dm_dev *dev)
+{
+	return dev->bdev ? bdev_get_queue(dev->bdev) : NULL;
+}
+
+/*
+ * Variant of dm_get_device that allows place-holder devices to be referenced,
+ * but you _really_ shouldn't need to use this!
+ */
+int __dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
+		    struct dm_dev **result, bool open_may_fail);
+
 /*
  * To check the return value from dm_table_find_target().
  */

^ permalink raw reply related	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18  0:04       ` Mike Snitzer
  2014-07-18  0:23         ` Mike Snitzer
@ 2014-07-18  0:29         ` John Utz
  2014-07-18  2:18           ` Mike Snitzer
  2014-07-18  6:12         ` Hannes Reinecke
  2 siblings, 1 reply; 26+ messages in thread
From: John Utz @ 2014-07-18  0:29 UTC (permalink / raw)
  To: device-mapper development, Stewart, Sean; +Cc: Bryn Reeves, Alasdair Kergon

What are the release testing criteria for things like this? From a hand-wavy perspective I am sure that the meta-requirement is 'don't break anything', but how is a new release proven not to break anything?

fer instance, does anybody create dm devices, format them with XFS and then run xfstests against them?

or some other formalized regimen?

or is it all kinda ad-hoc?

tnx!

johnu
________________________________________
From: dm-devel-bounces@redhat.com [dm-devel-bounces@redhat.com] on behalf of Mike Snitzer [snitzer@redhat.com]
Sent: Thursday, July 17, 2014 5:04 PM
To: Stewart, Sean
Cc: device-mapper development; Bryn Reeves; Alasdair Kergon
Subject: Re: [dm-devel] dm-multipath: Accept failed paths for multipath maps

On Wed, Dec 18 2013 at 10:28am -0500,
Stewart, Sean <Sean.Stewart@netapp.com> wrote:

> On Wed, 2013-12-18 at 15:25 +0100, Hannes Reinecke wrote:
> > On 12/18/2013 03:08 PM, Mike Snitzer wrote:
> > > On Wed, Dec 18 2013 at  2:52am -0500,
> > > Hannes Reinecke <hare@suse.de> wrote:
> > >
> > >> The multipath kernel module currently rejects any map with an invalid
> > >> device. However, since multipathd processes events serially, it will
> > >> try to push a map with invalid devices if more than one device fails
> > >> at the same time.
> > >> So we might as well accept those maps and make sure to mark the
> > >> invalid paths as down.
> > >
> > > Why is it so desirable to do this?  Reduced latency to restore at least
> > > one valid path when a bunch of paths go down?
> > >
> > Without this patch multipathd cannot update the map as long as it
> > hasn't caught up with udev.
> > During that time any scheduling decisions by the kernel part are
> > necessarily wrong, as it has to rely on the old map.
> >
> > > Why can't we just rely on userspace eventually figuring out which paths
> > > are failed and pushing a valid map down?
> > >
> > Oh, you can. This is what we're doing now :-)
> >
> > But it will lead to spurious errors during failover when multipathd
> > is trying to push down maps with invalid devices.
> >
> > You are also running into a race window between checking the path in
> > multipathd and pushing down the map; if the device disappears during
> > that time you won't be able to push down the map.
> > If that happens during boot multipathd won't be able to create the
> > map at all, so you might not be able to boot here.
> > With that patch you at least have the device-mapper device, allowing
> > booting to continue.
> >
> > > Are there favorable reports that this new behavior actually helps?
> > > Please quantify how.
> > >
> > NetApp will have; they've been pushing me to forward this patch.
> > Sean?
> >
> Agree.  Internally, we have run into numerous cases with Red Hat where
> the "failed in domap" error will occur, due to user space being behind,
> or device detaching taking too long.  The most severe case is with
> InfiniBand, where the LLD may place a device offline, then every single
> reload that is trying to add a good path in will fail.  I will qualify
> this by saying that I realize it is a problem that the device gets
> placed offline in the first place, but this patch would allow it a
> chance to continue on. The user still has to take manual steps to fix
> the problem in this case, but it seems less disruptive to applications.
>
> The device detaching case could be kind of disruptive to a user in the
> scenario where they are upgrading the firmware on a NetApp E-Series box,
> and with this patch, at least a good path can be added back in ASAP.
>
> > BTW, SUSE / SLES is running happily with this patch for years now.
> > So it can't be at all bad ...
> >
> > Cheers,
> >
> > Hannes
>
> Also agreed.  We have seen this functionality in SLES for years, and
> have not run into a problem with it.

Revisiting this can of worms...

As part of full due-diligence on the approach that SUSE and NetApp have
seemingly enjoyed "for years" I reviewed Hannes' v3 patch, fixed one
issue and did some cleanup.  I then converted over to using a slightly
different approach where-in the DM core becomes a more willing
co-conspirator in this hack by introducing the ability to have
place-holder devices (dm_dev without an opened bdev) referenced in a DM
table.  The work is here:
http://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=throwaway-dm-mpath-placeholder-devs

Here is the diffstat of all 3 patches rolled up:

 git diff d4bdac727f1e09412c762f177790a96432738264^..7681ae5ddb5d567800023477be7ddc68f9812a95 | diffstat
 dm-mpath.c |   51 +++++++++++++++++++++++++++++++++++----------------
 dm-table.c |   53 ++++++++++++++++++++++++++++++++++++++++-------------
 dm.c       |    5 ++---
 dm.h       |   12 ++++++++++++
 4 files changed, 89 insertions(+), 32 deletions(-)

But it was only compile tested, because doing more validation of this
work would only make sense if it had more than a snowball's chance in
hell of making it upstream.  Sadly it doesn't have a good chance; it
would require some compelling proof:
1) that mpath is bullet-proof no matter how crazy a user got with fake
   place-holder devices in their DM tables (coupled with reinstate_path
   messages, etc)
2) that the storage configs that experienced problems with the current
   DM mpath dm_get_device() failures weren't broken to start with (for
   instance ib srp is apparently fixed now.. but those fixes are still
   working their way into RHEL) -- or put differently: I need _details_
   on the NetApp or other legit storage configs that are still
   experiencing problems without a solution to this problem.

... and even with that proof I'm pretty sure Alasdair will hate this
place-holder approach and will push for some other solution.

I'm going away on paternity leave until Sept 8... my _hope_ is that
someone fixes multipath-tools to suck less or that a more clever
solution to this problem is developed locally in DM mpath.

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18  0:29         ` John Utz
@ 2014-07-18  2:18           ` Mike Snitzer
  2014-07-18  3:31             ` John Utz
  0 siblings, 1 reply; 26+ messages in thread
From: Mike Snitzer @ 2014-07-18  2:18 UTC (permalink / raw)
  To: John Utz
  Cc: device-mapper development, Bryn Reeves, Stewart, Sean, Alasdair Kergon

On Thu, Jul 17 2014 at  8:29pm -0400,
John Utz <John.Utz@wdc.com> wrote:

> What are the release testing criteria for things like this? From a
> hand-wavy perspective I am sure that the meta-requirement is 'don't
> break anything', but how is a new release proven not to break
> anything?

Red Hat and other vendors have their own testbeds and harnesses for
testing dm-multipath.  But we really could benefit from more open
exchange and collaboration on a multipath test harness.

Had I actually tested the patches I pointed to, it'd have blown up
pretty solidly; I realized something like this was needed:
http://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/commit/?h=throwaway-dm-mpath-placeholder-devs&id=77d1130e602a399ed1451edd7e91044fe9225fb6

Anyway, I really don't like that 'place-holder device' approach.. way
too hackish (both my code and Hannes').  It is an elaborate workaround
rather than a proper solution.  I'm hopeful Bryn takes this multipath
problem under his wing and fixes it for real ;)

> fer instance, does anybody create dm devices, format them with XFS and
> then run xfstests against them?

We routinely use XFS and ext4 with git to generate load.  But not
xfstests... that may be useful.
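
Untested sketch of what that could look like against multipath devices
(device names and mount points below are invented):

  # rough sketch -- point xfstests at dm-mpath devices, run from an
  # xfstests checkout
  mkfs.xfs -f /dev/mapper/mpatha
  cat > local.config <<EOF
  export TEST_DEV=/dev/mapper/mpatha
  export TEST_DIR=/mnt/test
  export SCRATCH_DEV=/dev/mapper/mpathb
  export SCRATCH_MNT=/mnt/scratch
  EOF
  ./check -g auto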

> or some other formalized regimen?
> 
> or is it all kinda ad-hoc?

We have the device-mapper-test-suite (dmts) for testing various DM
targets (mainly dm-thinp and dm-cache), see:
https://github.com/jthornber/device-mapper-test-suite

I added basic support for dm-crypt to dmts; but it needs serious
expanding for it to be useful.. luckily the cryptsetup userspace code
has dm-crypt tests.

Also, the lvm2 testsuite is another major resource that is used for
testing the wide range of DM targets.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18  2:18           ` Mike Snitzer
@ 2014-07-18  3:31             ` John Utz
  2014-07-18  5:57               ` Hannes Reinecke
  0 siblings, 1 reply; 26+ messages in thread
From: John Utz @ 2014-07-18  3:31 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: device-mapper development, Bryn Reeves, Stewart, Sean, Alasdair Kergon

Thank you very much for the exhaustive answer! I forwarded it on to my project peers because I don't think any of us were aware of the existing infrastructure.

Of course, said infrastructure would have to be taught about ZAC, but it seems like it would be a nice place to start testing from....

tnx!

johnu
________________________________________
From: Mike Snitzer [snitzer@redhat.com]
Sent: Thursday, July 17, 2014 7:18 PM
To: John Utz
Cc: device-mapper development; Stewart, Sean; Bryn Reeves; Alasdair Kergon
Subject: Re: dm-multipath: Accept failed paths for multipath maps

On Thu, Jul 17 2014 at  8:29pm -0400,
John Utz <John.Utz@wdc.com> wrote:

> What are the release testing criteria for things like this? From a
> hand-wavy perspective I am sure that the meta-requirement is 'don't
> break anything', but how is a new release proven not to break
> anything?

Red Hat and other vendors have their own testbeds and harnesses for
testing dm-multipath.  But we really could benefit from more open
exchange and collaboration on a multipath test harness.

Had I actually tested the patches I pointed to, it'd have blown up
pretty solidly; I realized something like this was needed:
http://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/commit/?h=throwaway-dm-mpath-placeholder-devs&id=77d1130e602a399ed1451edd7e91044fe9225fb6

Anyway, I really don't like that 'place-holder device' approach.. way
too hackish (both my code and Hannes').  It is an elaborate workaround
rather than a proper solution.  I'm hopeful Bryn takes this multipath
problem under his wing and fixes it for real ;)

> fer instance, does anybody create dm devices, format them with XFS and
> then run xfstests against them?

We routinely use XFS and ext4 with git to generate load.  But not
xfstests... that may be useful.

> or some other formalized regimen?
>
> or is it all kinda ad-hoc?

We have the device-mapper-test-suite (dmts) for testing various DM
targets (mainly dm-thinp and dm-cache), see:
https://github.com/jthornber/device-mapper-test-suite

I added basic support for dm-crypt to dmts; but it needs serious
expanding for it to be useful.. luckily the cryptsetup userspace code
has dm-crypt tests.

Also, the lvm2 testsuite is another major resource that is used for
testing the wide range of DM targets.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18  3:31             ` John Utz
@ 2014-07-18  5:57               ` Hannes Reinecke
  2014-07-18 14:38                 ` Mike Snitzer
  2014-07-18 16:51                 ` dm-multipath: Accept failed paths for multipath maps John Utz
  0 siblings, 2 replies; 26+ messages in thread
From: Hannes Reinecke @ 2014-07-18  5:57 UTC (permalink / raw)
  To: John Utz; +Cc: dm-devel

On 07/18/2014 05:31 AM, John Utz wrote:
> Thank you very much for the exhaustive answer! I forwarded it on to my
 > project peers because I don't think any of us were aware of the
 > existing infrastructure.
>
> Of course, said infrastructure would have to be taught about ZAC,
 > but it seems like it would be a nice place to start testing from....
>
ZAC is a different beast altogether; I've posted an initial set of 
patches a while back on linux-scsi.
But I don't think multipath needs to be changed for that.
Other areas of device-mapper most certainly do.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18  0:23         ` Mike Snitzer
@ 2014-07-18  6:00           ` Hannes Reinecke
  2014-07-18 16:04             ` Mike Snitzer
  0 siblings, 1 reply; 26+ messages in thread
From: Hannes Reinecke @ 2014-07-18  6:00 UTC (permalink / raw)
  To: Mike Snitzer, Stewart, Sean
  Cc: device-mapper development, Bryn Reeves, Alasdair Kergon

On 07/18/2014 02:23 AM, Mike Snitzer wrote:
> On Thu, Jul 17 2014 at  8:04pm -0400,
> Mike Snitzer <snitzer@redhat.com> wrote:
>
>> Revisiting this can of worms...
>>
>> As part of full due-diligence on the approach that SUSE and NetApp have
>> seemingly enjoyed "for years" I reviewed Hannes' v3 patch, fixed one
>> issue and did some cleanup.  I then converted over to using a slightly
>> different approach where-in the DM core becomes a more willing
>> co-conspirator in this hack by introducing the ability to have
>> place-holder devices (dm_dev without an opened bdev) referenced in a DM
>> table.  The work is here:
>> http://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=throwaway-dm-mpath-placeholder-devs
>
> Here is the rolled up patch (the individual commits in the above branch
> are rather noisy given the sequencing):
>
>   drivers/md/dm-mpath.c | 51 +++++++++++++++++++++++++++++++++----------------
>   drivers/md/dm-table.c | 53 ++++++++++++++++++++++++++++++++++++++-------------
>   drivers/md/dm.c       |  5 ++---
>   drivers/md/dm.h       | 12 ++++++++++++
>   4 files changed, 89 insertions(+), 32 deletions(-)
>
These patches look quite okay; I'll be cross-checking with my 
version and do some testing there.

Will be sending some update once the testing is done.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18  0:04       ` Mike Snitzer
  2014-07-18  0:23         ` Mike Snitzer
  2014-07-18  0:29         ` John Utz
@ 2014-07-18  6:12         ` Hannes Reinecke
  2 siblings, 0 replies; 26+ messages in thread
From: Hannes Reinecke @ 2014-07-18  6:12 UTC (permalink / raw)
  To: Mike Snitzer, Stewart, Sean
  Cc: device-mapper development, Bryn Reeves, Alasdair Kergon

On 07/18/2014 02:04 AM, Mike Snitzer wrote:
> On Wed, Dec 18 2013 at 10:28am -0500,
> Stewart, Sean <Sean.Stewart@netapp.com> wrote:
>
>> On Wed, 2013-12-18 at 15:25 +0100, Hannes Reinecke wrote:
>>> On 12/18/2013 03:08 PM, Mike Snitzer wrote:
>>>> On Wed, Dec 18 2013 at  2:52am -0500,
>>>> Hannes Reinecke <hare@suse.de> wrote:
>>>>
>>>>> The multipath kernel module is rejecting any map with an invalid
>>>>> device. However, as the multipathd is processing the events serially
>>>>> it will try to push a map with invalid devices if more than one
>>>>> device failed at the same time.
>>>>> So we can as well accept those maps and make sure to mark the
>>>>> paths as down.
>>>>
>>>> Why is it so desirable to do this?  Reduced latency to restore at least
>>>> one valid path when a bunch of paths go down?
>>>>
>>> Without this patch multipathd cannot update the map as long as it
>>> hasn't caught up with udev.
>>> During that time any scheduling decisions by the kernel part are
>>> necessarily wrong, as it has to rely on the old map.
>>>
>>>> Why can't we just rely on userspace eventually figuring out which paths
>>>> are failed and pushing a valid map down?
>>>>
>>> Oh, you can. This is what we're doing now :-)
>>>
>>> But it will lead to spurious error during failover when multipathd
>>> is trying to push down maps with invalid devices.
>>>
>>> You are also running into a race window between checking the path in
>>> multipathd and pushing down the map; if the device disappears during
>>> that time you won't be able to push down the map.
>>> If that happens during boot multipathd won't be able to create the
>>> map at all, so you might not be able to boot here.
>>> With that patch you at least have the device-mapper device, allowing
>>> booting to continue.
>>>
>>>> Are there favorable reports that this new behavior actually helps?
>>>> Please quantify how.
>>>>
>>> NetApp will have; they've been pushing me to forward this patch.
>>> Sean?
>>>
>> Agree.  Internally, we have run into numerous cases with Red Hat where
>> the "failed in domap" error will occur, due to user space being behind,
>> or device detaching taking too long.  The most severe case is with
>> InfiniBand, where the LLD may place a device offline, then every single
>> reload that is trying to add a good path in will fail.  I will qualify
>> this by saying that I realize it is a problem that the device gets
>> placed offline in the first place, but this patch would allow it a
>> chance to continue on. The user still has to take manual steps to fix
>> the problem in this case, but it seems less disruptive to applications.
>>
>> The device detaching case could be kind of disruptive to a user in the
>> scenario they are upgrading the firmware on a NetApp E-Series box, and
>> with this patch, at least a good path is able to be added in ASAP.
>>
>>> BTW, SUSE / SLES is running happily with this patch for years now.
>>> So it can't be at all bad ...
>>>
>>> Cheers,
>>>
>>> Hannes
>>
>> Also agreed.  We have seen this functionality in SLES for years, and
>> have not run into a problem with it.
>
> Revisiting this can of worms...
>
> As part of full due-diligence on the approach that SUSE and NetApp have
> seemingly enjoyed "for years" I reviewed Hannes' v3 patch, fixed one
> issue and did some cleanup.  I then converted over to using a slightly
> different approach where-in the DM core becomes a more willing
> co-conspirator in this hack by introducing the ability to have
> place-holder devices (dm_dev without an opened bdev) referenced in a DM
> table.  The work is here:
> http://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=throwaway-dm-mpath-placeholder-devs
>
> Here is the diffstat of all 3 patches rolled up:
>
>   git diff d4bdac727f1e09412c762f177790a96432738264^..7681ae5ddb5d567800023477be7ddc68f9812a95 | diffstat
>   dm-mpath.c |   51 +++++++++++++++++++++++++++++++++++----------------
>   dm-table.c |   53 ++++++++++++++++++++++++++++++++++++++++-------------
>   dm.c       |    5 ++---
>   dm.h       |   12 ++++++++++++
>   4 files changed, 89 insertions(+), 32 deletions(-)
>
> But it was only compile tested, because doing more validation of this
> work would mean it has a snowballs chance in hell of seeing the light of
> upstream.  Sadly it doesn't have a good chance; it would require some
> compelling proof:
> 1) that mpath is bullet-proof no matter how crazy a user got with fake
>     place-holder devices in their DM tables (coupled with reinstate_path
>     messages, etc)
At worst the user ends up with tables packed with non-existent
devices, which would be equivalent to a table with all paths failed.
So I don't see a problem there.
We might only run into issues where userspace fails to do proper
garbage collection on those tables. But then again, that can happen
today, too, when all paths of a large configuration drop suddenly
and multipathd crashes.

> 2) that the storage configs that experienced problems with the current
>     DM mpath dm_get_device() failures weren't broken to start with (for
>     instance ib srp is apparently fixed now.. but those fixes are still
>     working their way into RHEL) -- or put differently: I need _details_
>     on the NetApp or other legit storage configs that are still
>     experiencing problems without a solution to this problem.
>
This has nothing to do with 'legit' storage configs; it is a direct
result of the udev event handling and the internal multipathd
path-checker logic.

udev events are issued sequentially, so multipathd has to process them
one at a time (it even has a udev-receiving thread which basically
ensures sequential processing).

multipathd then has to process the event and update the device table
_based on that event_. At that point it simply _cannot_ know whether
other events relating to the same table are queued.
If there are, multipathd has no choice but to push an invalid table
down into the kernel.
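
To make that concrete, here is the kind of reload multipathd ends up
issuing; this is only a sketch, with made-up device numbers, size and
selector values. Assume 8:16 is still present but 8:32 has just been
hot-removed and the remove event is still queued:

  dmsetup reload mpatha --table \
    "0 2097152 multipath 0 0 1 1 round-robin 0 2 1 8:16 1000 8:32 1000"

Without the patch the multipath constructor fails with "error getting
device" and the reload is rejected; with it, the 8:32 path is accepted
and simply marked as failed.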

> ... and even with that proof I'm pretty sure Alasdair will hate this
> place-holder approach and will push for some other solution.
>
> I'm going away on paternity leave until Sept 8... my _hope_ is that
> someone fixes multipath-tools to suck less or that a more clever
> solution to this problem is developed locally in DM mpath.
>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18  5:57               ` Hannes Reinecke
@ 2014-07-18 14:38                 ` Mike Snitzer
  2014-07-18 17:04                   ` John Utz
  2014-07-18 16:51                 ` dm-multipath: Accept failed paths for multipath maps John Utz
  1 sibling, 1 reply; 26+ messages in thread
From: Mike Snitzer @ 2014-07-18 14:38 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: dm-devel, John Utz

On Fri, Jul 18 2014 at  1:57am -0400,
Hannes Reinecke <hare@suse.de> wrote:

> On 07/18/2014 05:31 AM, John Utz wrote:
> >Thankyou very much for the exhaustive answer! I forwarded on to my
> > project peers because i don't think any of us where aware of the
> > existing infrastructure.
> >
> >Of course, said infrastructure would have to be taught about ZAC,
> > but it seems like it would be a nice place to start testing from....
> >
> ZAC is a different beast altogether; I've posted an initial set of
> patches a while back on linux-scsi.
> But I don't think multipath needs to be changed for that.
> Other areas of device-mapper most certainly do.

Pretty sure John is working on a new ZAC-oriented DM target.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18  6:00           ` Hannes Reinecke
@ 2014-07-18 16:04             ` Mike Snitzer
  2014-07-18 16:15               ` Mike Snitzer
  0 siblings, 1 reply; 26+ messages in thread
From: Mike Snitzer @ 2014-07-18 16:04 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: device-mapper development, Bryn Reeves, Stewart, Sean, Alasdair Kergon

On Fri, Jul 18 2014 at  2:00am -0400,
Hannes Reinecke <hare@suse.de> wrote:

> On 07/18/2014 02:23 AM, Mike Snitzer wrote:
> >On Thu, Jul 17 2014 at  8:04pm -0400,
> >Mike Snitzer <snitzer@redhat.com> wrote:
> >
> >>Revisiting this can of worms...
> >>
> >>As part of full due-diligence on the approach that SUSE and NetApp have
> >>seemingly enjoyed "for years" I reviewed Hannes' v3 patch, fixed one
> >>issue and did some cleanup.  I then converted over to using a slightly
> >>different approach where-in the DM core becomes a more willing
> >>co-conspirator in this hack by introducing the ability to have
> >>place-holder devices (dm_dev without an opened bdev) referenced in a DM
> >>table.  The work is here:
> >>http://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=throwaway-dm-mpath-placeholder-devs
> >
> >Here is the rolled up patch (the individual commits in the above branch
> >are rather noisy given the sequencing):
> >
> >  drivers/md/dm-mpath.c | 51 +++++++++++++++++++++++++++++++++----------------
> >  drivers/md/dm-table.c | 53 ++++++++++++++++++++++++++++++++++++++-------------
> >  drivers/md/dm.c       |  5 ++---
> >  drivers/md/dm.h       | 12 ++++++++++++
> >  4 files changed, 89 insertions(+), 32 deletions(-)
> >
> These patches look quite okay; I'll be cross-checking with my
> version and do some testing there.

Thanks, the intent was to make your approach more "official".  I did
have a couple of fixes to the DM core patch, so I rebased, see:
https://git.kernel.org/cgit/linux/kernel/git/snitzer/linux.git/log/?h=throwaway-dm-mpath-placeholder-devs

But as I said in earlier posts, I'm really not in love with using
place-holder devices (mainly because allowing references to dead devices
_seems_ to go against proper device stacking).  But I do think that DM
core _should_ acknowledge that everything isn't always going to be
perfect for all targets.  That is especially true for a target like
multipath, whose purpose is to cope with paths coming and going.

So there definitely is a case to be made for DM being more tolerant of
the potential for devices being inaccessible.

Now, this is the last time I'm going to channel Alasdair's argument for
him on this thread, but: Alasdair would prefer to see userspace multipathd
fixed (more intelligent batched processing of uevents, etc).  As a second
option, his preferred kernel change is a "handover"-style approach that
hands the path-group state over from the old table to the new.

I agree that userspace multipathd should be fixed to process events in
batches, but even then I think the kernel should accommodate the potential
for imperfect table loads for targets like multipath.  I happen to
disagree with handover (I flipped and have now flopped back on this point).
Handover would be serious churn (which agk somehow thinks more
worthwhile).  While I appreciate that, mechanically, handover is a
different means to the same end of having references to dm_devs that are
"removed failed devices", handover would be overly complicated and
brittle (and very disruptive to DM core, because the dm_devs aren't
hanging directly off of a multipath structure; they are associated with
the DM table!).

In that light I do actually prefer the blunt/simple approach taken in my
'throwaway-dm-mpath-placeholder-devs' branch.  But I'm concerned that
accepting a workaround like this would just mean multipathd never gets
the attention it needs for improved uevent processing.  If others
commit to actually making multipathd's uevent processing better, that'd
go a long way.

> Will be sending some update once the testing is done.

OK.  The stronger the case that can be made for the stability of this
change (no missing checks for dev->bdev, etc.), the better.

I did just do some basic testing of both dm-mpath and dm-thinp; seems
fine.  The DM core changes really are simple (just some extra negative
checks that won't ever apply for the normal case -- where "normal case"
means DM targets that use dm_get_device rather than __dm_get_device --
plus adding a dev_t to struct dm_dev).
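
To give a rough idea of the shape of that change, here is a minimal
sketch (the field placement and the helper below are illustrations
only, not code from the branch):

  /* sketch: struct dm_dev with the dev_t described above */
  struct dm_dev {
          struct block_device *bdev;  /* NULL for a place-holder device */
          dev_t dev;                  /* kept even without an opened bdev */
          fmode_t mode;
          char name[16];
  };

  /* hypothetical helper: anything touching ->bdev must tolerate this */
  static inline bool dm_dev_is_placeholder(struct dm_dev *d)
  {
          return !d->bdev;
  }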

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18 16:04             ` Mike Snitzer
@ 2014-07-18 16:15               ` Mike Snitzer
  2014-07-21  6:05                 ` Hannes Reinecke
  0 siblings, 1 reply; 26+ messages in thread
From: Mike Snitzer @ 2014-07-18 16:15 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: device-mapper development, Bryn Reeves, Stewart, Sean, Alasdair Kergon

On Fri, Jul 18 2014 at 12:04pm -0400,
Mike Snitzer <snitzer@redhat.com> wrote:

> On Fri, Jul 18 2014 at  2:00am -0400,
> Hannes Reinecke <hare@suse.de> wrote:
> 
> > Will be sending some update once the testing is done.
> 
> OK.  The stronger the case that can be made for the stability of this
> change (no missing checks for dev->bdev, etc.), the better.
> 
> I did just do some basic testing of both dm-mpath and dm-thinp; seems
> fine.  The DM core changes really are simple (just some extra negative
> checks that won't ever apply for the normal case -- where "normal case"
> means DM targets that use dm_get_device rather than __dm_get_device --
> plus adding a dev_t to struct dm_dev).

BTW, I noticed one thing that I didn't think of as logical fallout, but
it makes sense: if you configure multipathd to not use find_multipaths,
then multipathd will successfully push down an mpath table that covers
the system disk (sda) -- but mpatha isn't able to dm_get_device:

# lsblk /dev/sda /dev/mapper/mpatha
NAME                            MAJ:MIN RM   SIZE RO TYPE  MOUNTPOINT
sda                               8:0    0 930.5G  0 disk  
├─sda1                            8:1    0   500M  0 part  /boot
└─sda2                            8:2    0   930G  0 part  
  ├─rhel_rhel--storage--02-swap 253:0    0     4G  0 lvm   [SWAP]
  └─rhel_rhel--storage--02-root 253:1    0   350G  0 lvm   /
mpatha                          253:6    0 930.5G  0 mpath 

# multipath -ll mpatha
mpatha (36003005700ec189015d42b92237a76d1) dm-6 LSI     ,RAID 5/6 SAS 6G 
size=930G features='0' hwhandler='0' wp=rw
`-+- policy='service-time 0' prio=1 status=enabled
  `- 0:2:0:0 sda  8:0   failed ready running

And multipathd keeps failing to reinstate_path:
...
[ 1906.647322] device-mapper: multipath: message: error getting device 8:0
[ 1912.652652] device-mapper: multipath: message: error getting device 8:0
[ 1918.657949] device-mapper: multipath: message: error getting device 8:0
[ 1924.664118] device-mapper: multipath: message: error getting device 8:0
[ 1930.669635] device-mapper: multipath: message: error getting device 8:0

Again, this all makes sense... but certainly shows that blacklisting the
relevant devices is a must.
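
For reference, blacklisting it would look something like this in
/etc/multipath.conf (using the wwid from the output above; matching
the wwid is more robust than matching the non-persistent "sda" name):

  blacklist {
          wwid 36003005700ec189015d42b92237a76d1
  }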

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18  5:57               ` Hannes Reinecke
  2014-07-18 14:38                 ` Mike Snitzer
@ 2014-07-18 16:51                 ` John Utz
  1 sibling, 0 replies; 26+ messages in thread
From: John Utz @ 2014-07-18 16:51 UTC (permalink / raw)
  To: Hannes Reinecke; +Cc: dm-devel

Sorry Hannes;

My apologies.

I am guilty of hijacking the thread, though I plead that it was inadvertent. :-P

Indeed, ZAC/ZBC is orthogonal to multipath, no arguments there.

My response was directed to Mike's discourse on the available testing tools for device-mapper, and contained my ruminations about how to leverage those tools for the project that I am lucky enough to be working on.
________________________________________
From: Hannes Reinecke [hare@suse.de]
Sent: Thursday, July 17, 2014 10:57 PM
To: John Utz
Cc: dm-devel@redhat.com
Subject: Re: [dm-devel] dm-multipath: Accept failed paths for multipath maps

On 07/18/2014 05:31 AM, John Utz wrote:
> Thankyou very much for the exhaustive answer! I forwarded on to my
 > project peers because i don't think any of us where aware of the
 > existing infrastructure.
>
> Of course, said infrastructure would have to be taught about ZAC,
 > but it seems like it would be a nice place to start testing from....
>
ZAC is a different beast altogether; I've posted an initial set of
patches a while back on linux-scsi.
But I don't think multipath needs to be changed for that.
Other areas of device-mapper most certainly do.

Cheers,

Hannes
--
Dr. Hannes Reinecke                   zSeries & Storage
hare@suse.de                          +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18 14:38                 ` Mike Snitzer
@ 2014-07-18 17:04                   ` John Utz
  2014-07-21 14:23                     ` ZAC target (Was: Re: dm-multipath: Accept failed paths for multipath maps) Hannes Reinecke
  0 siblings, 1 reply; 26+ messages in thread
From: John Utz @ 2014-07-18 17:04 UTC (permalink / raw)
  To: Mike Snitzer, Hannes Reinecke; +Cc: dm-devel

Response below.....
________________________________________
From: Mike Snitzer [snitzer@redhat.com]
Sent: Friday, July 18, 2014 7:38 AM
To: Hannes Reinecke
Cc: John Utz; dm-devel@redhat.com
Subject: Re: dm-multipath: Accept failed paths for multipath maps

On Fri, Jul 18 2014 at  1:57am -0400,
Hannes Reinecke <hare@suse.de> wrote:

> On 07/18/2014 05:31 AM, John Utz wrote:
> >Thankyou very much for the exhaustive answer! I forwarded on to my
> > project peers because i don't think any of us where aware of the
> > existing infrastructure.
> >
> >Of course, said infrastructure would have to be taught about ZAC,
> > but it seems like it would be a nice place to start testing from....
> >
> ZAC is a different beast altogether; I've posted an initial set of
> patches a while back on linux-scsi.
> But I don't think multipath needs to be changed for that.
> Other areas of device-mapper most certainly do.

Pretty sure John is working on a new ZAC-oriented DM target.

YUP.

Per Ted T'so's suggestion several months ago, the goal is to create a new DM target that implements the ZAC/ZBC command set and the SMR write pointer architecture so that FSfolksen can try their hand at porting their stuff to it. 

It's in the very early stages so there is nothing to show yet, but development is ongoing. There are a few unknowns about how to surface some specific behaviors (new verbs and errors, particularly errors with sense codes that return a write pointer) but i have not gotten far enuf along in development to be able to construct succint and specific questions on the topic so that will have to wait for a bit.

Comments and questions welcome.

tnx!

johnu

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: dm-multipath: Accept failed paths for multipath maps
  2014-07-18 16:15               ` Mike Snitzer
@ 2014-07-21  6:05                 ` Hannes Reinecke
  0 siblings, 0 replies; 26+ messages in thread
From: Hannes Reinecke @ 2014-07-21  6:05 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: device-mapper development, Bryn Reeves, Stewart, Sean, Alasdair Kergon

On 07/18/2014 06:15 PM, Mike Snitzer wrote:
> On Fri, Jul 18 2014 at 12:04pm -0400,
> Mike Snitzer <snitzer@redhat.com> wrote:
>
>> On Fri, Jul 18 2014 at  2:00am -0400,
>> Hannes Reinecke <hare@suse.de> wrote:
>>
>>> Will be sending some update once the testing is done.
>>
>> OK.  The stronger the case that can be made for the stability of this
>> change (no missing checks for dev->bdev, etc.), the better.
>>
>> I did just do some basic testing of both dm-mpath and dm-thinp; seems
>> fine.  The DM core changes really are simple (just some extra negative
>> checks that won't ever apply for the normal case -- where "normal case"
>> means DM targets that use dm_get_device rather than __dm_get_device --
>> plus adding a dev_t to struct dm_dev).
>
> BTW, I noticed one thing that I didn't think of as logical fallout, but
> it makes sense: if you configure multipathd to not use find_multipaths,
> then multipathd will successfully push down an mpath table that covers
> the system disk (sda) -- but mpatha isn't able to dm_get_device:
>
> # lsblk /dev/sda /dev/mapper/mpatha
> NAME                            MAJ:MIN RM   SIZE RO TYPE  MOUNTPOINT
> sda                               8:0    0 930.5G  0 disk
> ├─sda1                            8:1    0   500M  0 part  /boot
> └─sda2                            8:2    0   930G  0 part
>    ├─rhel_rhel--storage--02-swap 253:0    0     4G  0 lvm   [SWAP]
>    └─rhel_rhel--storage--02-root 253:1    0   350G  0 lvm   /
> mpatha                          253:6    0 930.5G  0 mpath
>
> # multipath -ll mpatha
> mpatha (36003005700ec189015d42b92237a76d1) dm-6 LSI     ,RAID 5/6 SAS 6G
> size=930G features='0' hwhandler='0' wp=rw
> `-+- policy='service-time 0' prio=1 status=enabled
>    `- 0:2:0:0 sda  8:0   failed ready running
>
> And multipathd keeps failing to reinstate_path:
> ...
> [ 1906.647322] device-mapper: multipath: message: error getting device 8:0
> [ 1912.652652] device-mapper: multipath: message: error getting device 8:0
> [ 1918.657949] device-mapper: multipath: message: error getting device 8:0
> [ 1924.664118] device-mapper: multipath: message: error getting device 8:0
> [ 1930.669635] device-mapper: multipath: message: error getting device 8:0
>
> Again, this all makes sense... but certainly shows that blacklisting the
> relevant devices is a must.
>
Precisely.
We have been taking the approach that starting multipath will 
attempt to take over _any_ (eligible) device.
So this error would constitute a misconfiguration from our POV.
Either the device needs to be blacklisted or the process accessing 
the device needs to be reconfigured to use multipathing.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 26+ messages in thread

* ZAC target (Was: Re: dm-multipath: Accept failed paths for multipath maps)
  2014-07-18 17:04                   ` John Utz
@ 2014-07-21 14:23                     ` Hannes Reinecke
  2014-07-21 19:28                       ` Kent Overstreet
  0 siblings, 1 reply; 26+ messages in thread
From: Hannes Reinecke @ 2014-07-21 14:23 UTC (permalink / raw)
  To: John Utz, Mike Snitzer
  Cc: dm-devel, Linux Kernel, Jens Axboe, Kent Overstreet

On 07/18/2014 07:04 PM, John Utz wrote:
>> On 07/18/2014 05:31 AM, John Utz wrote:
>>> Thankyou very much for the exhaustive answer! I forwarded on to my
>>> project peers because i don't think any of us where aware of the
>>> existing infrastructure.
>>>
>>> Of course, said infrastructure would have to be taught about ZAC,
>>> but it seems like it would be a nice place to start testing from....
>>>
>> ZAC is a different beast altogether; I've posted an initial set of
>> patches a while back on linux-scsi.
>> But I don't think multipath needs to be changed for that.
>> Other areas of device-mapper most certainly do.
>
> Pretty sure John is working on a new ZAC-oriented DM target.
>
> YUP.
>
> Per Ted T'so's suggestion several months ago, the goal is to create
 > a new DM target that implements the ZAC/ZBC command set and the SMR
 > write pointer architecture so that FSfolksen can try their hand at
 > porting their stuff to it.
>
> It's in the very early stages so there is nothing to show yet, but
 > development is ongoing. There are a few unknowns about how to surface
 > some specific behaviors (new verbs and errors, particularly errors
 > with sense codes that return a write pointer) but i have not gotten
 > far enuf along in development to be able to construct succint and
 > specific questions on the topic so that will have to wait for a bit.
>
I was pondering the 'best' ZAC implementation, too, and found the
'report zones' command _very_ cumbersome to use.
Especially the fact that in theory each zone could have a different 
size _and_ plenty of zones could be present will be making zone 
lookup hellish.

However: it seems to me that we might benefit from a generic
'block boundaries' implementation.
Reasoning here is that several subsystems (RAID, ZAC/ZBC, and things 
like referrals) impose I/O scheduling boundaries which must not be 
crossed when assembling requests.

Seeing that we already have some block limitations I was wondering 
if we couldn't have some set of 'I/O scheduling boundaries' as part
of the request_queue structure.
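
As a straw man (the 'boundary_sectors' limit below does not exist; it
is only meant to illustrate a per-queue boundary that requests must
not cross):

  static bool bio_crosses_boundary(struct request_queue *q, struct bio *bio)
  {
          /* hypothetical power-of-two limit in 512-byte sectors, 0 = none */
          unsigned int boundary = q->limits.boundary_sectors;
          sector_t start, end;

          if (!boundary || !bio_sectors(bio))
                  return false;

          start = bio->bi_iter.bi_sector;
          end   = start + bio_sectors(bio) - 1;

          /* the whole bio must stay within one boundary-sized window */
          return (start & ~(sector_t)(boundary - 1)) !=
                 (end   & ~(sector_t)(boundary - 1));
  }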

Kent, Jens; comments here?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: ZAC target (Was: Re: dm-multipath: Accept failed paths for multipath maps)
  2014-07-21 14:23                     ` ZAC target (Was: Re: dm-multipath: Accept failed paths for multipath maps) Hannes Reinecke
@ 2014-07-21 19:28                       ` Kent Overstreet
  2014-07-22  5:46                         ` Hannes Reinecke
  0 siblings, 1 reply; 26+ messages in thread
From: Kent Overstreet @ 2014-07-21 19:28 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: John Utz, Mike Snitzer, dm-devel, Linux Kernel, Jens Axboe, tytso

On Mon, Jul 21, 2014 at 04:23:41PM +0200, Hannes Reinecke wrote:
> On 07/18/2014 07:04 PM, John Utz wrote:
> >>On 07/18/2014 05:31 AM, John Utz wrote:
> >>>Thankyou very much for the exhaustive answer! I forwarded on to my
> >>>project peers because i don't think any of us where aware of the
> >>>existing infrastructure.
> >>>
> >>>Of course, said infrastructure would have to be taught about ZAC,
> >>>but it seems like it would be a nice place to start testing from....
> >>>
> >>ZAC is a different beast altogether; I've posted an initial set of
> >>patches a while back on linux-scsi.
> >>But I don't think multipath needs to be changed for that.
> >>Other areas of device-mapper most certainly do.
> >
> >Pretty sure John is working on a new ZAC-oriented DM target.
> >
> >YUP.
> >
> >Per Ted T'so's suggestion several months ago, the goal is to create
> > a new DM target that implements the ZAC/ZBC command set and the SMR
> > write pointer architecture so that FSfolksen can try their hand at
> > porting their stuff to it.
> >
> >It's in the very early stages so there is nothing to show yet, but
> > development is ongoing. There are a few unknowns about how to surface
> > some specific behaviors (new verbs and errors, particularly errors
> > with sense codes that return a write pointer) but i have not gotten
> > far enuf along in development to be able to construct succint and
> > specific questions on the topic so that will have to wait for a bit.
> >
> I was pondering the 'best' ZAC implementation, too, and found the
> 'report zones' command _very_ cumbersome to use.
> Especially the fact that in theory each zone could have a different size
> _and_ plenty of zones could be present will be making zone lookup hellish.
> 
> However: it seems to me that we might benefit from a generic
> 'block boundaries' implementation.
> Reasoning here is that several subsystems (RAID, ZAC/ZBC, and things like
> referrals) impose I/O scheduling boundaries which must not be crossed when
> assembling requests.

Wasn't Ted working on such a thing?

> Seeing that we already have some block limitations I was wondering if we
> couldn't have some set of 'I/O scheduling boundaries' as part
> of the request_queue structure.

I'd prefer not to dump yet more crap in request_queue, but that's a fairly minor
quibble :)

I also tend to think having different size zones is crazy and I would avoid
making any effort to support that in practice, but OTOH there's good reason for
wanting one or two "normal" zones and the rest append only so the interface is
going to have to accommodate some differences between zones.

Also, depending on the approach supporting different size zones might not
actually be problematic. If you're starting with something that's pure COW and
you're just plugging in this "ZAC allocation" stuff (which I think is what I'm
going to do in bcache) then it might not actually be an issue.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: ZAC target (Was: Re: dm-multipath: Accept failed paths for multipath maps)
  2014-07-21 19:28                       ` Kent Overstreet
@ 2014-07-22  5:46                         ` Hannes Reinecke
  2014-07-22  8:08                           ` Matias Bjorling
  0 siblings, 1 reply; 26+ messages in thread
From: Hannes Reinecke @ 2014-07-22  5:46 UTC (permalink / raw)
  To: Kent Overstreet
  Cc: John Utz, Mike Snitzer, dm-devel, Linux Kernel, Jens Axboe, tytso

On 07/21/2014 09:28 PM, Kent Overstreet wrote:
> On Mon, Jul 21, 2014 at 04:23:41PM +0200, Hannes Reinecke wrote:
>> On 07/18/2014 07:04 PM, John Utz wrote:
>>>> On 07/18/2014 05:31 AM, John Utz wrote:
>>>>> Thankyou very much for the exhaustive answer! I forwarded on to my
>>>>> project peers because i don't think any of us where aware of the
>>>>> existing infrastructure.
>>>>>
>>>>> Of course, said infrastructure would have to be taught about ZAC,
>>>>> but it seems like it would be a nice place to start testing from....
>>>>>
>>>> ZAC is a different beast altogether; I've posted an initial set of
>>>> patches a while back on linux-scsi.
>>>> But I don't think multipath needs to be changed for that.
>>>> Other areas of device-mapper most certainly do.
>>>
>>> Pretty sure John is working on a new ZAC-oriented DM target.
>>>
>>> YUP.
>>>
>>> Per Ted T'so's suggestion several months ago, the goal is to create
>>> a new DM target that implements the ZAC/ZBC command set and the SMR
>>> write pointer architecture so that FSfolksen can try their hand at
>>> porting their stuff to it.
>>>
>>> It's in the very early stages so there is nothing to show yet, but
>>> development is ongoing. There are a few unknowns about how to surface
>>> some specific behaviors (new verbs and errors, particularly errors
>>> with sense codes that return a write pointer) but i have not gotten
>>> far enuf along in development to be able to construct succint and
>>> specific questions on the topic so that will have to wait for a bit.
>>>
>> I was pondering the 'best' ZAC implementation, too, and found the
>> 'report zones' command _very_ cumbersome to use.
>> Especially the fact that in theory each zone could have a different size
>> _and_ plenty of zones could be present will be making zone lookup hellish.
>>
>> However: it seems to me that we might benefit from a generic
>> 'block boundaries' implementation.
>> Reasoning here is that several subsystems (RAID, ZAC/ZBC, and things like
>> referrals) impose I/O scheduling boundaries which must not be crossed when
>> assembling requests.
>
> Wasn't Ted working on such a thing?
>
>> Seeing that we already have some block limitations I was wondering if we
>> couldn't have some set of 'I/O scheduling boundaries' as part
>> of the request_queue structure.
>
> I'd prefer not to dump yet more crap in request_queue, but that's a fairly minor
> quibble :)
>
> I also tend to think having different size zones is crazy and I would avoid
> making any effort to support that in practice, but OTOH there's good reason for
> wanting one or two "normal" zones and the rest append only so the interface is
> going to have to accommodate some differences between zones.
>
> Also, depending on the approach supporting different size zones might not
> actually be problematic. If you're starting with something that's pure COW and
> you're just plugging in this "ZAC allocation" stuff (which I think is what I'm
> going to do in bcache) then it might not actually be an issue.
>
No, what I was suggesting is to introduce 'I/O scheduling barriers'.
Some devices like RAID or indeed ZAC have internal boundaries which 
cannot be crossed by any I/O. So either the I/O has to be split up 
or the I/O scheduler has to be made aware of these boundaries.

I have had this issue several times now (once with implementing 
Referrals, now with ZAC) so I was wondering whether we can have some 
sort of generic implementation in the block layer.

And as we already have request queue limits, this might fall quite
naturally into it. Or so I thought.

Hmm. Guess I should start coding here.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		      zSeries & Storage
hare@suse.de			      +49 911 74053 688
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: J. Hawn, J. Guild, F. Imendörffer, HRB 16746 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: ZAC target (Was: Re: dm-multipath: Accept failed paths for multipath maps)
  2014-07-22  5:46                         ` Hannes Reinecke
@ 2014-07-22  8:08                           ` Matias Bjorling
  0 siblings, 0 replies; 26+ messages in thread
From: Matias Bjorling @ 2014-07-22  8:08 UTC (permalink / raw)
  To: Hannes Reinecke, Kent Overstreet
  Cc: John Utz, Mike Snitzer, dm-devel, Linux Kernel, Jens Axboe, tytso

On 07/22/2014 07:46 AM, Hannes Reinecke wrote:
> On 07/21/2014 09:28 PM, Kent Overstreet wrote:
>> On Mon, Jul 21, 2014 at 04:23:41PM +0200, Hannes Reinecke wrote:
>>> On 07/18/2014 07:04 PM, John Utz wrote:
>>>>> On 07/18/2014 05:31 AM, John Utz wrote:
>>>>>> Thankyou very much for the exhaustive answer! I forwarded on to my
>>>>>> project peers because i don't think any of us where aware of the
>>>>>> existing infrastructure.
>>>>>>
>>>>>> Of course, said infrastructure would have to be taught about ZAC,
>>>>>> but it seems like it would be a nice place to start testing from....
>>>>>>
>>>>> ZAC is a different beast altogether; I've posted an initial set of
>>>>> patches a while back on linux-scsi.
>>>>> But I don't think multipath needs to be changed for that.
>>>>> Other areas of device-mapper most certainly do.
>>>>
>>>> Pretty sure John is working on a new ZAC-oriented DM target.
>>>>
>>>> YUP.
>>>>
>>>> Per Ted T'so's suggestion several months ago, the goal is to create
>>>> a new DM target that implements the ZAC/ZBC command set and the SMR
>>>> write pointer architecture so that FSfolksen can try their hand at
>>>> porting their stuff to it.
>>>>
>>>> It's in the very early stages so there is nothing to show yet, but
>>>> development is ongoing. There are a few unknowns about how to surface
>>>> some specific behaviors (new verbs and errors, particularly errors
>>>> with sense codes that return a write pointer) but i have not gotten
>>>> far enuf along in development to be able to construct succint and
>>>> specific questions on the topic so that will have to wait for a bit.
>>>>
>>> I was pondering the 'best' ZAC implementation, too, and found the
>>> 'report zones' command _very_ cumbersome to use.
>>> Especially the fact that in theory each zone could have a different size
>>> _and_ plenty of zones could be present will be making zone lookup
>>> hellish.
>>>
>>> However: it seems to me that we might benefit from a generic
>>> 'block boundaries' implementation.
>>> Reasoning here is that several subsystems (RAID, ZAC/ZBC, and things
>>> like
>>> referrals) impose I/O scheduling boundaries which must not be crossed
>>> when
>>> assembling requests.
>>
>> Wasn't Ted working on such a thing?
>>
>>> Seeing that we already have some block limitations I was wondering if we
>>> couldn't have some set of 'I/O scheduling boundaries' as part
>>> of the request_queue structure.
>>
>> I'd prefer not to dump yet more crap in request_queue, but that's a
>> fairly minor
>> quibble :)
>>
>> I also tend to think having different size zones is crazy and I would
>> avoid
>> making any effort to support that in practice, but OTOH there's good
>> reason for
>> wanting one or two "normal" zones and the rest append only so the
>> interface is
>> going to have to accommodate some differences between zones.
>>
>> Also, depending on the approach supporting different size zones might not
>> actually be problematic. If you're starting with something that's pure
>> COW and
>> you're just plugging in this "ZAC allocation" stuff (which I think is
>> what I'm
>> going to do in bcache) then it might not actually be an issue.
>>
> No, what I was suggesting is to introduce 'I/O scheduling barriers'.
> Some devices like RAID or indeed ZAC have internal boundaries which
> cannot be crossed by any I/O. So either the I/O has to be split up or
> the I/O scheduler has to be made aware of these boundaries.
>
> I have had this issue several times now (once with implementing
> Referrals, now with ZAC) so I was wondering whether we can have some
> sort of generic implementation in the block layer.

Would it make any sense to put the whole block allocation strategy within
the block layer (or just on top of the device driver) and then let
MD/FS users hook into this for allocating new blocks?

That allows multiple implementations to use the same block address space.

>
> And as we already have request queue limits, this might fall quite
> naturally into it. Or so I thought.
>
> Hmm. Guess I should start coding here.
>
> Cheers,
>
> Hannes


^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2014-07-22  8:09 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-12-18  7:52 [PATCHv2] dm-multipath: Accept failed paths for multipath maps Hannes Reinecke
2013-12-18 14:08 ` Mike Snitzer
2013-12-18 14:25   ` Hannes Reinecke
2013-12-18 15:28     ` Stewart, Sean
2013-12-19  8:21       ` Bart Van Assche
2014-02-19  1:14         ` Mike Snitzer
2014-02-19  8:11           ` Bart Van Assche
2014-07-18  0:04       ` Mike Snitzer
2014-07-18  0:23         ` Mike Snitzer
2014-07-18  6:00           ` Hannes Reinecke
2014-07-18 16:04             ` Mike Snitzer
2014-07-18 16:15               ` Mike Snitzer
2014-07-21  6:05                 ` Hannes Reinecke
2014-07-18  0:29         ` John Utz
2014-07-18  2:18           ` Mike Snitzer
2014-07-18  3:31             ` John Utz
2014-07-18  5:57               ` Hannes Reinecke
2014-07-18 14:38                 ` Mike Snitzer
2014-07-18 17:04                   ` John Utz
2014-07-21 14:23                     ` ZAC target (Was: Re: dm-multipath: Accept failed paths for multipath maps) Hannes Reinecke
2014-07-21 19:28                       ` Kent Overstreet
2014-07-22  5:46                         ` Hannes Reinecke
2014-07-22  8:08                           ` Matias Bjorling
2014-07-18 16:51                 ` dm-multipath: Accept failed paths for multipath maps John Utz
2014-07-18  6:12         ` Hannes Reinecke
2013-12-18 15:28     ` Merla, ShivaKrishna
