All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] multipath-tools: add ANA support for NVMe device
@ 2018-11-08  6:09 lijie
  2018-11-12 16:23 ` Martin Wilck
  0 siblings, 1 reply; 63+ messages in thread
From: lijie @ 2018-11-08  6:09 UTC (permalink / raw)
  To: dm-devel, christophe.varoqui, hare, mwilck, bmarzins, xose.vazquez
  Cc: lijie34, wangzhoumengjian, shenhong09, chengjike.cheng

Add support for Asymmetric Namespace Access (ANA) as specified in NVMe 1.3
TP 4004. The states are updated through reading the ANA log page.

By default, the native nvme multipath takes over the nvme device.
We can pass a false to the parameter 'multipath' of the nvme-core.ko
module, when we want to use multipath-tools.
---
 libmultipath/prio.h                |   1 +
 libmultipath/prioritizers/Makefile |   1 +
 libmultipath/prioritizers/ana.c    | 292 +++++++++++++++++++++++++++++++++++++
 libmultipath/prioritizers/ana.h    | 221 ++++++++++++++++++++++++++++
 multipath/multipath.conf.5         |   8 +
 5 files changed, 523 insertions(+)
 create mode 100644 libmultipath/prioritizers/ana.c
 create mode 100644 libmultipath/prioritizers/ana.h

diff --git a/libmultipath/prio.h b/libmultipath/prio.h
index aa587cc..599d1d8 100644
--- a/libmultipath/prio.h
+++ b/libmultipath/prio.h
@@ -30,6 +30,7 @@ struct path;
 #define PRIO_WEIGHTED_PATH	"weightedpath"
 #define PRIO_SYSFS		"sysfs"
 #define PRIO_PATH_LATENCY	"path_latency"
+#define PRIO_ANA		"ana"
 
 /*
  * Value used to mark the fact prio was not defined
diff --git a/libmultipath/prioritizers/Makefile b/libmultipath/prioritizers/Makefile
index ab7bc07..15afaba 100644
--- a/libmultipath/prioritizers/Makefile
+++ b/libmultipath/prioritizers/Makefile
@@ -19,6 +19,7 @@ LIBS = \
 	libpriordac.so \
 	libprioweightedpath.so \
 	libpriopath_latency.so \
+	libprioana.so \
 	libpriosysfs.so
 
 all: $(LIBS)
diff --git a/libmultipath/prioritizers/ana.c b/libmultipath/prioritizers/ana.c
new file mode 100644
index 0000000..c5aaa5f
--- /dev/null
+++ b/libmultipath/prioritizers/ana.c
@@ -0,0 +1,292 @@
+/*
+ * (C) Copyright HUAWEI Technology Corp. 2017   All Rights Reserved.
+ *
+ * ana.c
+ * Version 1.00
+ *
+ * Tool to make use of a NVMe-feature called  Asymmetric Namespace Access.
+ * It determines the ANA state of a device and prints a priority value to stdout.
+ *
+ * Author(s): Cheng Jike <chengjike.cheng@huawei.com>
+ *            Li Jie <lijie34@huawei.com>
+ *
+ * This file is released under the GPL version 2, or any later version.
+ */
+#include <stdio.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <stdbool.h>
+
+#include "debug.h"
+#include "prio.h"
+#include "structs.h"
+#include "ana.h"
+
+enum {
+	ANA_PRIO_OPTIMIZED		= 50,
+	ANA_PRIO_NONOPTIMIZED		= 10,
+	ANA_PRIO_INACCESSIBLE		= 5,
+	ANA_PRIO_PERSISTENT_LOSS	= 1,
+	ANA_PRIO_CHANGE			= 0,
+	ANA_PRIO_RESERVED		= 0,
+	ANA_PRIO_GETCTRL_FAILED		= -1,
+	ANA_PRIO_NOT_SUPPORTED		= -2,
+	ANA_PRIO_GETANAS_FAILED		= -3,
+	ANA_PRIO_GETANALOG_FAILED	= -4,
+	ANA_PRIO_GETNSID_FAILED		= -5,
+	ANA_PRIO_GETNS_FAILED		= -6,
+	ANA_PRIO_NO_MEMORY		= -7,
+	ANA_PRIO_NO_INFORMATION		= -8,
+};
+
+static const char * anas_string[] = {
+	[NVME_ANA_OPTIMIZED]			= "ANA Optimized State",
+	[NVME_ANA_NONOPTIMIZED]			= "ANA Non-Optimized State",
+	[NVME_ANA_INACCESSIBLE]			= "ANA Inaccessible State",
+	[NVME_ANA_PERSISTENT_LOSS]		= "ANA Persistent Loss State",
+	[NVME_ANA_CHANGE]			= "ANA Change state",
+	[NVME_ANA_RESERVED]			= "Invalid namespace group state!",
+};
+
+static const char *aas_print_string(int rc)
+{
+	rc &= 0xff;
+
+	switch(rc) {
+	case NVME_ANA_OPTIMIZED:
+	case NVME_ANA_NONOPTIMIZED:
+	case NVME_ANA_INACCESSIBLE:
+	case NVME_ANA_PERSISTENT_LOSS:
+	case NVME_ANA_CHANGE:
+		return anas_string[rc];
+	default:
+		return anas_string[NVME_ANA_RESERVED];
+	}
+
+	return anas_string[NVME_ANA_RESERVED];
+}
+
+static int nvme_get_nsid(int fd, unsigned *nsid)
+{
+	static struct stat nvme_stat;
+	int err = fstat(fd, &nvme_stat);
+	if (err < 0)
+		return 1;
+
+	if (!S_ISBLK(nvme_stat.st_mode)) {
+		condlog(0, "Error: requesting namespace-id from non-block device\n");
+		return 1;
+	}
+
+	*nsid = ioctl(fd, NVME_IOCTL_ID);
+	return 0;
+}
+
+static int nvme_submit_admin_passthru(int fd, struct nvme_passthru_cmd *cmd)
+{
+	return ioctl(fd, NVME_IOCTL_ADMIN_CMD, cmd);
+}
+
+int nvme_get_log13(int fd, __u32 nsid, __u8 log_id, __u8 lsp, __u64 lpo,
+                 __u16 lsi, bool rae, __u32 data_len, void *data)
+{
+	struct nvme_admin_cmd cmd = {
+		.opcode		= nvme_admin_get_log_page,
+		.nsid		= nsid,
+		.addr		= (__u64)(uintptr_t) data,
+		.data_len	= data_len,
+	};
+	__u32 numd = (data_len >> 2) - 1;
+	__u16 numdu = numd >> 16, numdl = numd & 0xffff;
+
+	cmd.cdw10 = log_id | (numdl << 16) | (rae ? 1 << 15 : 0);
+	if (lsp)
+		cmd.cdw10 |= lsp << 8;
+
+	cmd.cdw11 = numdu | (lsi << 16);
+	cmd.cdw12 = lpo;
+	cmd.cdw13 = (lpo >> 32);
+
+	return nvme_submit_admin_passthru(fd, &cmd);
+
+}
+
+int nvme_identify13(int fd, __u32 nsid, __u32 cdw10, __u32 cdw11, void *data)
+{
+	struct nvme_admin_cmd cmd = {
+		.opcode		= nvme_admin_identify,
+		.nsid		= nsid,
+		.addr		= (__u64)(uintptr_t) data,
+		.data_len	= NVME_IDENTIFY_DATA_SIZE,
+		.cdw10		= cdw10,
+		.cdw11		= cdw11,
+	};
+
+	return nvme_submit_admin_passthru(fd, &cmd);
+}
+
+int nvme_identify(int fd, __u32 nsid, __u32 cdw10, void *data)
+{
+	return nvme_identify13(fd, nsid, cdw10, 0, data);
+}
+
+int nvme_identify_ctrl(int fd, void *data)
+{
+	return nvme_identify(fd, 0, NVME_ID_CNS_CTRL, data);
+}
+
+int nvme_identify_ns(int fd, __u32 nsid, void *data)
+{
+	return nvme_identify(fd, nsid, NVME_ID_CNS_NS, data);
+}
+
+int nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo)
+{
+	__u64 lpo = 0;
+
+	return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_ANA, rgo, lpo, 0,
+			true, ana_log_len, ana_log);
+}
+
+static int get_ana_state(__u32 nsid, __u32 anagrpid, void *ana_log)
+{
+	int	rc = ANA_PRIO_GETANAS_FAILED;
+	void *base = ana_log;
+	struct nvme_ana_rsp_hdr *hdr = base;
+	struct nvme_ana_group_desc *ana_desc;
+	int offset = sizeof(struct nvme_ana_rsp_hdr);
+	__u32 nr_nsids;
+	size_t nsid_buf_size;
+	int i, j;
+
+	for (i = 0; i < le16_to_cpu(hdr->ngrps); i++) {
+		ana_desc = base + offset;
+		nr_nsids = le32_to_cpu(ana_desc->nnsids);
+		nsid_buf_size = nr_nsids * sizeof(__le32);
+
+		offset += sizeof(*ana_desc);
+
+		for (j = 0; j < nr_nsids; j++) {
+			if (nsid == le32_to_cpu(ana_desc->nsids[j]))
+				return ana_desc->state;
+		}
+
+		if (anagrpid != 0 && anagrpid == le32_to_cpu(ana_desc->grpid))
+			rc = ana_desc->state;
+
+		offset += nsid_buf_size;
+	}
+
+	return rc;
+}
+
+int get_ana_info(struct path * pp, unsigned int timeout)
+{
+	int	rc;
+	__u32 nsid;
+	struct nvme_id_ctrl ctrl;
+	struct nvme_id_ns ns;
+	void *ana_log;
+	size_t ana_log_len;
+
+	rc = nvme_identify_ctrl(pp->fd, &ctrl);
+	if (rc)
+		return ANA_PRIO_GETCTRL_FAILED;
+
+	if(!(ctrl.cmic & (1 << 3)))
+		return ANA_PRIO_NOT_SUPPORTED;
+
+	rc = nvme_get_nsid(pp->fd, &nsid);
+	if (rc)
+		return ANA_PRIO_GETNSID_FAILED;
+
+	rc = nvme_identify_ns(pp->fd, nsid, &ns);
+	if (rc)
+		return ANA_PRIO_GETNS_FAILED;
+
+	ana_log_len = sizeof(struct nvme_ana_rsp_hdr) +
+		le32_to_cpu(ctrl.nanagrpid) * sizeof(struct nvme_ana_group_desc);
+	if (!(ctrl.anacap & (1 << 6)))
+		ana_log_len += le32_to_cpu(ctrl.mnan) * sizeof(__le32);
+
+	ana_log = malloc(ana_log_len);
+	if (!ana_log)
+		return ANA_PRIO_NO_MEMORY;
+
+	rc = nvme_ana_log(pp->fd, ana_log, ana_log_len,
+		(ctrl.anacap & (1 << 6)) ? NVME_ANA_LOG_RGO : 0);
+	if (rc) {
+		free(ana_log);
+		return ANA_PRIO_GETANALOG_FAILED;
+	}
+
+	rc = get_ana_state(nsid, le32_to_cpu(ns.anagrpid), ana_log);
+	if (rc < 0){
+		free(ana_log);
+		return ANA_PRIO_GETANAS_FAILED;
+	}
+
+	free(ana_log);
+	condlog(3, "%s: ana state = %02x [%s]", pp->dev, rc, aas_print_string(rc));
+
+	return rc;
+}
+
+int getprio(struct path * pp, char * args, unsigned int timeout)
+{
+	int rc;
+
+	if (pp->fd < 0)
+		return ANA_PRIO_NO_INFORMATION;
+
+	rc = get_ana_info(pp, timeout);
+	if (rc >= 0) {
+		rc &= 0x0f;
+		switch(rc) {
+		case NVME_ANA_OPTIMIZED:
+			rc = ANA_PRIO_OPTIMIZED;
+			break;
+		case NVME_ANA_NONOPTIMIZED:
+			rc = ANA_PRIO_NONOPTIMIZED;
+			break;
+		case NVME_ANA_INACCESSIBLE:
+			rc = ANA_PRIO_INACCESSIBLE;
+			break;
+		case NVME_ANA_PERSISTENT_LOSS:
+			rc = ANA_PRIO_PERSISTENT_LOSS;
+			break;
+		case NVME_ANA_CHANGE:
+			rc = ANA_PRIO_CHANGE;
+			break;
+		default:
+			rc = ANA_PRIO_RESERVED;
+		}
+	} else {
+		switch(rc) {
+		case ANA_PRIO_GETCTRL_FAILED:
+			condlog(0, "%s: couldn't get ctrl info", pp->dev);
+			break;
+		case ANA_PRIO_NOT_SUPPORTED:
+			condlog(0, "%s: ana not supported", pp->dev);
+			break;
+		case ANA_PRIO_GETANAS_FAILED:
+			condlog(0, "%s: couldn't get ana state", pp->dev);
+			break;
+		case ANA_PRIO_GETANALOG_FAILED:
+			condlog(0, "%s: couldn't get ana log", pp->dev);
+			break;
+		case ANA_PRIO_GETNS_FAILED:
+			condlog(0, "%s: couldn't get namespace", pp->dev);
+			break;
+		case ANA_PRIO_GETNSID_FAILED:
+			condlog(0, "%s: couldn't get namespace id", pp->dev);
+			break;
+		case ANA_PRIO_NO_MEMORY:
+			condlog(0, "%s: couldn't alloc memory", pp->dev);
+			break;
+		}
+	}
+	return rc;
+}
+
diff --git a/libmultipath/prioritizers/ana.h b/libmultipath/prioritizers/ana.h
new file mode 100644
index 0000000..92cfa9e
--- /dev/null
+++ b/libmultipath/prioritizers/ana.h
@@ -0,0 +1,221 @@
+#ifndef _ANA_H
+#define _ANA_H
+
+#include <linux/types.h>
+
+#define NVME_NSID_ALL			0xffffffff
+#define NVME_IDENTIFY_DATA_SIZE 	4096
+
+#define NVME_LOG_ANA			0x0c
+
+/* Admin commands */
+enum nvme_admin_opcode {
+	nvme_admin_get_log_page		= 0x02,
+	nvme_admin_identify		= 0x06,
+};
+
+enum {
+	NVME_ID_CNS_NS			= 0x00,
+	NVME_ID_CNS_CTRL		= 0x01,
+};
+
+/* nvme ioctl start */
+struct nvme_passthru_cmd {
+	__u8	opcode;
+	__u8	flags;
+	__u16	rsvd1;
+	__u32	nsid;
+	__u32	cdw2;
+	__u32	cdw3;
+	__u64	metadata;
+	__u64	addr;
+	__u32	metadata_len;
+	__u32	data_len;
+	__u32	cdw10;
+	__u32	cdw11;
+	__u32	cdw12;
+	__u32	cdw13;
+	__u32	cdw14;
+	__u32	cdw15;
+	__u32	timeout_ms;
+	__u32	result;
+};
+
+#define nvme_admin_cmd nvme_passthru_cmd
+
+#define NVME_IOCTL_ID		_IO('N', 0x40)
+#define NVME_IOCTL_ADMIN_CMD	_IOWR('N', 0x41, struct nvme_admin_cmd)
+/* nvme ioctl end */
+
+/* nvme id ctrl start */
+struct nvme_id_power_state {
+	__le16			max_power;	/* centiwatts */
+	__u8			rsvd2;
+	__u8			flags;
+	__le32			entry_lat;	/* microseconds */
+	__le32			exit_lat;	/* microseconds */
+	__u8			read_tput;
+	__u8			read_lat;
+	__u8			write_tput;
+	__u8			write_lat;
+	__le16			idle_power;
+	__u8			idle_scale;
+	__u8			rsvd19;
+	__le16			active_power;
+	__u8			active_work_scale;
+	__u8			rsvd23[9];
+};
+
+struct nvme_id_ctrl {
+	__le16			vid;
+	__le16			ssvid;
+	char			sn[20];
+	char			mn[40];
+	char			fr[8];
+	__u8			rab;
+	__u8			ieee[3];
+	__u8			cmic;
+	__u8			mdts;
+	__le16			cntlid;
+	__le32			ver;
+	__le32			rtd3r;
+	__le32			rtd3e;
+	__le32			oaes;
+	__le32			ctratt;
+	__u8			rsvd100[156];
+	__le16			oacs;
+	__u8			acl;
+	__u8			aerl;
+	__u8			frmw;
+	__u8			lpa;
+	__u8			elpe;
+	__u8			npss;
+	__u8			avscc;
+	__u8			apsta;
+	__le16			wctemp;
+	__le16			cctemp;
+	__le16			mtfa;
+	__le32			hmpre;
+	__le32			hmmin;
+	__u8			tnvmcap[16];
+	__u8			unvmcap[16];
+	__le32			rpmbs;
+	__le16			edstt;
+	__u8			dsto;
+	__u8			fwug;
+	__le16			kas;
+	__le16			hctma;
+	__le16			mntmt;
+	__le16			mxtmt;
+	__le32			sanicap;
+	__le32			hmminds;
+	__le16			hmmaxd;
+	__u8			rsvd338[4];
+	__u8			anatt;
+	__u8			anacap;
+	__le32			anagrpmax;
+	__le32			nanagrpid;
+	__u8			rsvd352[160];
+	__u8			sqes;
+	__u8			cqes;
+	__le16			maxcmd;
+	__le32			nn;
+	__le16			oncs;
+	__le16			fuses;
+	__u8			fna;
+	__u8			vwc;
+	__le16			awun;
+	__le16			awupf;
+	__u8			nvscc;
+	__u8			nwpc;
+	__le16			acwu;
+	__u8			rsvd534[2];
+	__le32			sgls;
+	__le32			mnan;
+	__u8			rsvd544[224];
+	char			subnqn[256];
+	__u8			rsvd1024[768];
+	__le32			ioccsz;
+	__le32			iorcsz;
+	__le16			icdoff;
+	__u8			ctrattr;
+	__u8			msdbd;
+	__u8			rsvd1804[244];
+	struct nvme_id_power_state	psd[32];
+	__u8			vs[1024];
+};
+/* nvme id ctrl end */
+
+/* nvme id ns start */
+struct nvme_lbaf {
+	__le16			ms;
+	__u8			ds;
+	__u8			rp;
+};
+
+struct nvme_id_ns {
+	__le64			nsze;
+	__le64			ncap;
+	__le64			nuse;
+	__u8			nsfeat;
+	__u8			nlbaf;
+	__u8			flbas;
+	__u8			mc;
+	__u8			dpc;
+	__u8			dps;
+	__u8			nmic;
+	__u8			rescap;
+	__u8			fpi;
+	__u8			rsvd33;
+	__le16			nawun;
+	__le16			nawupf;
+	__le16			nacwu;
+	__le16			nabsn;
+	__le16			nabo;
+	__le16			nabspf;
+	__le16			noiob;
+	__u8			nvmcap[16];
+	__u8			rsvd64[28];
+	__le32			anagrpid;
+	__u8			rsvd96[3];
+	__u8			nsattr;
+	__u8			rsvd100[4];
+	__u8			nguid[16];
+	__u8			eui64[8];
+	struct nvme_lbaf	lbaf[16];
+	__u8			rsvd192[192];
+	__u8			vs[3712];
+};
+/* nvme id ns end */
+
+/* nvme ana start */
+enum nvme_ana_state {
+	NVME_ANA_OPTIMIZED		= 0x01,
+	NVME_ANA_NONOPTIMIZED		= 0x02,
+	NVME_ANA_INACCESSIBLE		= 0x03,
+	NVME_ANA_PERSISTENT_LOSS	= 0x04,
+	NVME_ANA_CHANGE			= 0x0f,
+	NVME_ANA_RESERVED		= 0x05,
+};
+
+struct nvme_ana_rsp_hdr {
+	__le64	chgcnt;
+	__le16	ngrps;
+	__le16	rsvd10[3];
+};
+
+struct nvme_ana_group_desc {
+	__le32	grpid;
+	__le32	nnsids;
+	__le64	chgcnt;
+	__u8	state;
+	__u8	rsvd17[15];
+	__le32	nsids[];
+};
+
+/* flag for the log specific field of the ANA log */
+#define NVME_ANA_LOG_RGO	(1 << 0)
+
+/* nvme ana end */
+
+#endif
diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
index 6333366..c5b2fb6 100644
--- a/multipath/multipath.conf.5
+++ b/multipath/multipath.conf.5
@@ -334,6 +334,10 @@ priority provided as argument. Requires prio_args keyword.
 Generate the path priority based on a latency algorithm.
 Requires prio_args keyword.
 .TP
+.I ana
+(Hardware-dependent)
+Generate the path priority based on the NVMe ANA settings.
+.TP
 .I datacore
 (Hardware-dependent)
 Generate the path priority for some DataCore storage arrays. Requires prio_args
@@ -1391,6 +1395,10 @@ Active/Standby mode exclusively.
 .I 1 alua
 (Hardware-dependent)
 Hardware handler for SCSI-3 ALUA compatible arrays.
+.TP
+.I 1 ana
+(Hardware-dependent)
+Hardware handler for NVMe ANA compatible arrays.
 .PP
 The default is: \fB<unset>\fR
 .PP
-- 
2.6.4.windows.1

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] multipath-tools: add ANA support for NVMe device
  2018-11-08  6:09 [PATCH] multipath-tools: add ANA support for NVMe device lijie
@ 2018-11-12 16:23 ` Martin Wilck
  2018-11-12 21:53     ` Mike Snitzer
  0 siblings, 1 reply; 63+ messages in thread
From: Martin Wilck @ 2018-11-12 16:23 UTC (permalink / raw)
  To: lijie, xose.vazquez, christophe.varoqui, bmarzins, dm-devel, hare
  Cc: chengjike.cheng, wangzhoumengjian, shenhong09

Hello Lijie,

On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
> Add support for Asymmetric Namespace Access as specified in NVMe
> 1.3
> TP 4004. The states are updated through reading the ANA log page.
> 
> By default, the native nvme multipath takes over the nvme device.
> We can pass a false to the parameter 'multipath' of the nvme-core.ko
> module, when we want to use multipath-tools.

Thank you for the patch. It looks quite good to me. I've tested it with
a Linux target and found no problems so far.

I have a few questions and comments inline below.

I suggest you also have a look at detect_prio(); it seems to make sense
to use the ana prioritizer for NVMe paths automatically if ANA is
supported (with your patch, "detect_prio no" and "prio ana" have to be
configured explicitly). But that can be done in a later patch.

Martin

> ---
>  libmultipath/prio.h                |   1 +
>  libmultipath/prioritizers/Makefile |   1 +
>  libmultipath/prioritizers/ana.c    | 292
> +++++++++++++++++++++++++++++++++++++
>  libmultipath/prioritizers/ana.h    | 221
> ++++++++++++++++++++++++++++
>  multipath/multipath.conf.5         |   8 +
>  5 files changed, 523 insertions(+)
>  create mode 100644 libmultipath/prioritizers/ana.c
>  create mode 100644 libmultipath/prioritizers/ana.h
> 
> diff --git a/libmultipath/prio.h b/libmultipath/prio.h
> index aa587cc..599d1d8 100644
> --- a/libmultipath/prio.h
> +++ b/libmultipath/prio.h
> @@ -30,6 +30,7 @@ struct path;
>  #define PRIO_WEIGHTED_PATH	"weightedpath"
>  #define PRIO_SYSFS		"sysfs"
>  #define PRIO_PATH_LATENCY	"path_latency"
> +#define PRIO_ANA		"ana"
>  
>  /*
>   * Value used to mark the fact prio was not defined
> diff --git a/libmultipath/prioritizers/Makefile
> b/libmultipath/prioritizers/Makefile
> index ab7bc07..15afaba 100644
> --- a/libmultipath/prioritizers/Makefile
> +++ b/libmultipath/prioritizers/Makefile
> @@ -19,6 +19,7 @@ LIBS = \
>  	libpriordac.so \
>  	libprioweightedpath.so \
>  	libpriopath_latency.so \
> +	libprioana.so \
>  	libpriosysfs.so
>  
>  all: $(LIBS)
> diff --git a/libmultipath/prioritizers/ana.c
> b/libmultipath/prioritizers/ana.c
> new file mode 100644
> index 0000000..c5aaa5f
> --- /dev/null
> +++ b/libmultipath/prioritizers/ana.c
> @@ -0,0 +1,292 @@
> +/*
> + * (C) Copyright HUAWEI Technology Corp. 2017   All Rights Reserved.
> + *
> + * ana.c
> + * Version 1.00
> + *
> + * Tool to make use of a NVMe-feature called  Asymmetric Namespace
> Access.
> + * It determines the ANA state of a device and prints a priority
> value to stdout.
> + *
> + * Author(s): Cheng Jike <chengjike.cheng@huawei.com>
> + *            Li Jie <lijie34@huawei.com>
> + *
> + * This file is released under the GPL version 2, or any later
> version.
> + */
> +#include <stdio.h>
> +#include <sys/ioctl.h>
> +#include <sys/stat.h>
> +#include <sys/types.h>
> +#include <stdbool.h>
> +
> +#include "debug.h"
> +#include "prio.h"
> +#include "structs.h"
> +#include "ana.h"
> +
> +enum {
> +	ANA_PRIO_OPTIMIZED		= 50,
> +	ANA_PRIO_NONOPTIMIZED		= 10,
> +	ANA_PRIO_INACCESSIBLE		= 5,
> +	ANA_PRIO_PERSISTENT_LOSS	= 1,
> +	ANA_PRIO_CHANGE			= 0,
> +	ANA_PRIO_RESERVED		= 0,
> +	ANA_PRIO_GETCTRL_FAILED		= -1,
> +	ANA_PRIO_NOT_SUPPORTED		= -2,
> +	ANA_PRIO_GETANAS_FAILED		= -3,
> +	ANA_PRIO_GETANALOG_FAILED	= -4,
> +	ANA_PRIO_GETNSID_FAILED		= -5,
> +	ANA_PRIO_GETNS_FAILED		= -6,
> +	ANA_PRIO_NO_MEMORY		= -7,
> +	ANA_PRIO_NO_INFORMATION		= -8,
> +};
> +
> +static const char * anas_string[] = {
> +	[NVME_ANA_OPTIMIZED]			= "ANA Optimized
> State",
> +	[NVME_ANA_NONOPTIMIZED]			= "ANA Non-Optimized
> State",
> +	[NVME_ANA_INACCESSIBLE]			= "ANA Inaccessible
> State",
> +	[NVME_ANA_PERSISTENT_LOSS]		= "ANA Persistent
> Loss State",
> +	[NVME_ANA_CHANGE]			= "ANA Change state",
> +	[NVME_ANA_RESERVED]			= "Invalid namespace
> group state!",
> +};
> +
> +static const char *aas_print_string(int rc)
> +{
> +	rc &= 0xff;
> +
> +	switch(rc) {
> +	case NVME_ANA_OPTIMIZED:
> +	case NVME_ANA_NONOPTIMIZED:
> +	case NVME_ANA_INACCESSIBLE:
> +	case NVME_ANA_PERSISTENT_LOSS:
> +	case NVME_ANA_CHANGE:
> +		return anas_string[rc];
> +	default:
> +		return anas_string[NVME_ANA_RESERVED];
> +	}
> +
> +	return anas_string[NVME_ANA_RESERVED];
> +}

nit: the last statement is superfluous.

> +
> +static int nvme_get_nsid(int fd, unsigned *nsid)
> +{
> +	static struct stat nvme_stat;
> +	int err = fstat(fd, &nvme_stat);
> +	if (err < 0)
> +		return 1;
> +
> +	if (!S_ISBLK(nvme_stat.st_mode)) {
> +		condlog(0, "Error: requesting namespace-id from non-
> block device\n");
> +		return 1;
> +	}
> +
> +	*nsid = ioctl(fd, NVME_IOCTL_ID);
> +	return 0;
> +}
> +
> +static int nvme_submit_admin_passthru(int fd, struct
> nvme_passthru_cmd *cmd)
> +{
> +	return ioctl(fd, NVME_IOCTL_ADMIN_CMD, cmd);
> +}
> +
> +int nvme_get_log13(int fd, __u32 nsid, __u8 log_id, __u8 lsp, __u64
> lpo,
> +                 __u16 lsi, bool rae, __u32 data_len, void *data)

lpo and lsi are 0 in all callers.  Likewise, rae is always true. You
might consider simplifying your code by omitting these paramters.

I can see that you did it this way because you have copied the code
from nvme-cli/nvme-ioctl.c, which is fine. In this case, however, we
may want to copy the entire file, in order to be able to track more
easily if our code is up-to-date. I wish something like libnvmecli
existed.

> +{
> +	struct nvme_admin_cmd cmd = {
> +		.opcode		= nvme_admin_get_log_page,
> +		.nsid		= nsid,
> +		.addr		= (__u64)(uintptr_t) data,
> +		.data_len	= data_len,
> +	};
> +	__u32 numd = (data_len >> 2) - 1;
> +	__u16 numdu = numd >> 16, numdl = numd & 0xffff;
> +
> +	cmd.cdw10 = log_id | (numdl << 16) | (rae ? 1 << 15 : 0);
> +	if (lsp)
> +		cmd.cdw10 |= lsp << 8;
> +
> +	cmd.cdw11 = numdu | (lsi << 16);
> +	cmd.cdw12 = lpo;
> +	cmd.cdw13 = (lpo >> 32);
> +
> +	return nvme_submit_admin_passthru(fd, &cmd);
> +
> +}
> +
> +int nvme_identify13(int fd, __u32 nsid, __u32 cdw10, __u32 cdw11,
> void *data)
> +{
> +	struct nvme_admin_cmd cmd = {
> +		.opcode		= nvme_admin_identify,
> +		.nsid		= nsid,
> +		.addr		= (__u64)(uintptr_t) data,
> +		.data_len	= NVME_IDENTIFY_DATA_SIZE,
> +		.cdw10		= cdw10,
> +		.cdw11		= cdw11,

Along similar lines as above, this could be simplified, as cdw11 is
never used.

> +	};
> +
> +	return nvme_submit_admin_passthru(fd, &cmd);
> +}
> +
> +int nvme_identify(int fd, __u32 nsid, __u32 cdw10, void *data)
> +{
> +	return nvme_identify13(fd, nsid, cdw10, 0, data);
> +}
> +
> +int nvme_identify_ctrl(int fd, void *data)
> +{
> +	return nvme_identify(fd, 0, NVME_ID_CNS_CTRL, data);
> +}
> +
> +int nvme_identify_ns(int fd, __u32 nsid, void *data)
> +{
> +	return nvme_identify(fd, nsid, NVME_ID_CNS_NS, data);
> +}
> +
> +int nvme_ana_log(int fd, void *ana_log, size_t ana_log_len, int rgo)
> +{
> +	__u64 lpo = 0;
> +
> +	return nvme_get_log13(fd, NVME_NSID_ALL, NVME_LOG_ANA, rgo,
> lpo, 0,
> +			true, ana_log_len, ana_log);
> +}
> +
> +static int get_ana_state(__u32 nsid, __u32 anagrpid, void *ana_log)
> +{
> +	int	rc = ANA_PRIO_GETANAS_FAILED;
> +	void *base = ana_log;
> +	struct nvme_ana_rsp_hdr *hdr = base;
> +	struct nvme_ana_group_desc *ana_desc;
> +	int offset = sizeof(struct nvme_ana_rsp_hdr);
> +	__u32 nr_nsids;
> +	size_t nsid_buf_size;
> +	int i, j;
> +
> +	for (i = 0; i < le16_to_cpu(hdr->ngrps); i++) {
> +		ana_desc = base + offset;
> +		nr_nsids = le32_to_cpu(ana_desc->nnsids);
> +		nsid_buf_size = nr_nsids * sizeof(__le32);
> +
> +		offset += sizeof(*ana_desc);
> +
> +		for (j = 0; j < nr_nsids; j++) {
> +			if (nsid == le32_to_cpu(ana_desc->nsids[j]))
> +				return ana_desc->state;
> +		}
> +
> +		if (anagrpid != 0 && anagrpid == le32_to_cpu(ana_desc-
> >grpid))
> +			rc = ana_desc->state;


Shouldn't you check this condition before iterating over the nsids?
And if the anagrpid matches, shouldn't you return the state
immediately?

Some comments would be helpful in order to understand this logic.

> +
> +		offset += nsid_buf_size;
> +	}
> +
> +	return rc;
> +}
> +
> +int get_ana_info(struct path * pp, unsigned int timeout)

This should be a static function.

> +{
> +	int	rc;
> +	__u32 nsid;
> +	struct nvme_id_ctrl ctrl;
> +	struct nvme_id_ns ns;
> +	void *ana_log;
> +	size_t ana_log_len;
> +
> +	rc = nvme_identify_ctrl(pp->fd, &ctrl);
> +	if (rc)
> +		return ANA_PRIO_GETCTRL_FAILED;
> +
> +	if(!(ctrl.cmic & (1 << 3)))
> +		return ANA_PRIO_NOT_SUPPORTED;
> +
> +	rc = nvme_get_nsid(pp->fd, &nsid);
> +	if (rc)
> +		return ANA_PRIO_GETNSID_FAILED;
> +
> +	rc = nvme_identify_ns(pp->fd, nsid, &ns);
> +	if (rc)
> +		return ANA_PRIO_GETNS_FAILED;
> +
> +	ana_log_len = sizeof(struct nvme_ana_rsp_hdr) +
> +		le32_to_cpu(ctrl.nanagrpid) * sizeof(struct
> nvme_ana_group_desc);
> +	if (!(ctrl.anacap & (1 << 6)))
> +		ana_log_len += le32_to_cpu(ctrl.mnan) * sizeof(__le32);

I'd like to see a sanity check for ana_log_len here.

> +
> +	ana_log = malloc(ana_log_len);
> +	if (!ana_log)
> +		return ANA_PRIO_NO_MEMORY;
> +
> +	rc = nvme_ana_log(pp->fd, ana_log, ana_log_len,
> +		(ctrl.anacap & (1 << 6)) ? NVME_ANA_LOG_RGO : 0);
> +	if (rc) {
> +		free(ana_log);
> +		return ANA_PRIO_GETANALOG_FAILED;
> +	}

Please use a "goto error;" like construct here.

> +
> +	rc = get_ana_state(nsid, le32_to_cpu(ns.anagrpid), ana_log);
> +	if (rc < 0){
> +		free(ana_log);
> +		return ANA_PRIO_GETANAS_FAILED;
> +	}
> +
> +	free(ana_log);
> +	condlog(3, "%s: ana state = %02x [%s]", pp->dev, rc,
> aas_print_string(rc));

Please put an "error:" label here and free ana_log there.

> +
> +	return rc;
> +}
> +
> +int getprio(struct path * pp, char * args, unsigned int timeout)
> +{
> +	int rc;
> +
> +	if (pp->fd < 0)
> +		return ANA_PRIO_NO_INFORMATION;
> +
> +	rc = get_ana_info(pp, timeout);
> +	if (rc >= 0) {
> +		rc &= 0x0f;
> +		switch(rc) {
> +		case NVME_ANA_OPTIMIZED:
> +			rc = ANA_PRIO_OPTIMIZED;
> +			break;
> +		case NVME_ANA_NONOPTIMIZED:
> +			rc = ANA_PRIO_NONOPTIMIZED;
> +			break;
> +		case NVME_ANA_INACCESSIBLE:
> +			rc = ANA_PRIO_INACCESSIBLE;
> +			break;
> +		case NVME_ANA_PERSISTENT_LOSS:
> +			rc = ANA_PRIO_PERSISTENT_LOSS;
> +			break;
> +		case NVME_ANA_CHANGE:
> +			rc = ANA_PRIO_CHANGE;
> +			break;
> +		default:
> +			rc = ANA_PRIO_RESERVED;
> +		}
> +	} else {
> +		switch(rc) {
> +		case ANA_PRIO_GETCTRL_FAILED:
> +			condlog(0, "%s: couldn't get ctrl info", pp-
> >dev);
> +			break;
> +		case ANA_PRIO_NOT_SUPPORTED:
> +			condlog(0, "%s: ana not supported", pp->dev);
> +			break;
> +		case ANA_PRIO_GETANAS_FAILED:
> +			condlog(0, "%s: couldn't get ana state", pp-
> >dev);
> +			break;
> +		case ANA_PRIO_GETANALOG_FAILED:
> +			condlog(0, "%s: couldn't get ana log", pp-
> >dev);
> +			break;
> +		case ANA_PRIO_GETNS_FAILED:
> +			condlog(0, "%s: couldn't get namespace", pp-
> >dev);
> +			break;
> +		case ANA_PRIO_GETNSID_FAILED:
> +			condlog(0, "%s: couldn't get namespace id", pp-
> >dev);
> +			break;
> +		case ANA_PRIO_NO_MEMORY:
> +			condlog(0, "%s: couldn't alloc memory", pp-
> >dev);
> +			break;
> +		}
> +	}
> +	return rc;
> +}
> +
> diff --git a/libmultipath/prioritizers/ana.h
> b/libmultipath/prioritizers/ana.h
> new file mode 100644
> index 0000000..92cfa9e
> --- /dev/null
> +++ b/libmultipath/prioritizers/ana.h
> @@ -0,0 +1,221 @@
> +#ifndef _ANA_H
> +#define _ANA_H
> +
> +#include <linux/types.h>
> +
> +#define NVME_NSID_ALL			0xffffffff
> +#define NVME_IDENTIFY_DATA_SIZE 	4096
> +
> +#define NVME_LOG_ANA			0x0c
> +
> +/* Admin commands */
> +enum nvme_admin_opcode {
> +	nvme_admin_get_log_page		= 0x02,
> +	nvme_admin_identify		= 0x06,
> +};
> +
> +enum {
> +	NVME_ID_CNS_NS			= 0x00,
> +	NVME_ID_CNS_CTRL		= 0x01,
> +};
> +
> +/* nvme ioctl start */
> +struct nvme_passthru_cmd {
> +	__u8	opcode;
> +	__u8	flags;
> +	__u16	rsvd1;
> +	__u32	nsid;
> +	__u32	cdw2;
> +	__u32	cdw3;
> +	__u64	metadata;
> +	__u64	addr;
> +	__u32	metadata_len;
> +	__u32	data_len;
> +	__u32	cdw10;
> +	__u32	cdw11;
> +	__u32	cdw12;
> +	__u32	cdw13;
> +	__u32	cdw14;
> +	__u32	cdw15;
> +	__u32	timeout_ms;
> +	__u32	result;
> +};
> +
> +#define nvme_admin_cmd nvme_passthru_cmd
> +
> +#define NVME_IOCTL_ID		_IO('N', 0x40)
> +#define NVME_IOCTL_ADMIN_CMD	_IOWR('N', 0x41, struct nvme_admin_cmd)
> +/* nvme ioctl end */

Please #include <linux/nvme_ioctl.h> rather than copying its contents
here.

> +
> +/* nvme id ctrl start */
> +struct nvme_id_power_state {
> +	__le16			max_power;	/* centiwatts */
> +	__u8			rsvd2;
> +	__u8			flags;
> +	__le32			entry_lat;	/* microseconds */
> +	__le32			exit_lat;	/* microseconds */
> +	__u8			read_tput;
> +	__u8			read_lat;
> +	__u8			write_tput;
> +	__u8			write_lat;
> +	__le16			idle_power;
> +	__u8			idle_scale;
> +	__u8			rsvd19;
> +	__le16			active_power;
> +	__u8			active_work_scale;
> +	__u8			rsvd23[9];
> +};
> +
> +struct nvme_id_ctrl {
> +	__le16			vid;
> +	__le16			ssvid;
> +	char			sn[20];
> +	char			mn[40];
> +	char			fr[8];
> +	__u8			rab;
> +	__u8			ieee[3];
> +	__u8			cmic;
> +	__u8			mdts;
> +	__le16			cntlid;
> +	__le32			ver;
> +	__le32			rtd3r;
> +	__le32			rtd3e;
> +	__le32			oaes;
> +	__le32			ctratt;
> +	__u8			rsvd100[156];
> +	__le16			oacs;
> +	__u8			acl;
> +	__u8			aerl;
> +	__u8			frmw;
> +	__u8			lpa;
> +	__u8			elpe;
> +	__u8			npss;
> +	__u8			avscc;
> +	__u8			apsta;
> +	__le16			wctemp;
> +	__le16			cctemp;
> +	__le16			mtfa;
> +	__le32			hmpre;
> +	__le32			hmmin;
> +	__u8			tnvmcap[16];
> +	__u8			unvmcap[16];
> +	__le32			rpmbs;
> +	__le16			edstt;
> +	__u8			dsto;
> +	__u8			fwug;
> +	__le16			kas;
> +	__le16			hctma;
> +	__le16			mntmt;
> +	__le16			mxtmt;
> +	__le32			sanicap;
> +	__le32			hmminds;
> +	__le16			hmmaxd;
> +	__u8			rsvd338[4];
> +	__u8			anatt;
> +	__u8			anacap;
> +	__le32			anagrpmax;
> +	__le32			nanagrpid;
> +	__u8			rsvd352[160];
> +	__u8			sqes;
> +	__u8			cqes;
> +	__le16			maxcmd;
> +	__le32			nn;
> +	__le16			oncs;
> +	__le16			fuses;
> +	__u8			fna;
> +	__u8			vwc;
> +	__le16			awun;
> +	__le16			awupf;
> +	__u8			nvscc;
> +	__u8			nwpc;
> +	__le16			acwu;
> +	__u8			rsvd534[2];
> +	__le32			sgls;
> +	__le32			mnan;
> +	__u8			rsvd544[224];
> +	char			subnqn[256];
> +	__u8			rsvd1024[768];
> +	__le32			ioccsz;
> +	__le32			iorcsz;
> +	__le16			icdoff;
> +	__u8			ctrattr;
> +	__u8			msdbd;
> +	__u8			rsvd1804[244];
> +	struct nvme_id_power_state	psd[32];
> +	__u8			vs[1024];
> +};
> +/* nvme id ctrl end */
> +
> +/* nvme id ns start */
> +struct nvme_lbaf {
> +	__le16			ms;
> +	__u8			ds;
> +	__u8			rp;
> +};
> +
> +struct nvme_id_ns {
> +	__le64			nsze;
> +	__le64			ncap;
> +	__le64			nuse;
> +	__u8			nsfeat;
> +	__u8			nlbaf;
> +	__u8			flbas;
> +	__u8			mc;
> +	__u8			dpc;
> +	__u8			dps;
> +	__u8			nmic;
> +	__u8			rescap;
> +	__u8			fpi;
> +	__u8			rsvd33;
> +	__le16			nawun;
> +	__le16			nawupf;
> +	__le16			nacwu;
> +	__le16			nabsn;
> +	__le16			nabo;
> +	__le16			nabspf;
> +	__le16			noiob;
> +	__u8			nvmcap[16];
> +	__u8			rsvd64[28];
> +	__le32			anagrpid;
> +	__u8			rsvd96[3];
> +	__u8			nsattr;
> +	__u8			rsvd100[4];
> +	__u8			nguid[16];
> +	__u8			eui64[8];
> +	struct nvme_lbaf	lbaf[16];
> +	__u8			rsvd192[192];
> +	__u8			vs[3712];
> +};
> +/* nvme id ns end */
> +
> +/* nvme ana start */
> +enum nvme_ana_state {
> +	NVME_ANA_OPTIMIZED		= 0x01,
> +	NVME_ANA_NONOPTIMIZED		= 0x02,
> +	NVME_ANA_INACCESSIBLE		= 0x03,
> +	NVME_ANA_PERSISTENT_LOSS	= 0x04,
> +	NVME_ANA_CHANGE			= 0x0f,
> +	NVME_ANA_RESERVED		= 0x05,
> +};
> +
> +struct nvme_ana_rsp_hdr {
> +	__le64	chgcnt;
> +	__le16	ngrps;
> +	__le16	rsvd10[3];
> +};
> +
> +struct nvme_ana_group_desc {
> +	__le32	grpid;
> +	__le32	nnsids;
> +	__le64	chgcnt;
> +	__u8	state;
> +	__u8	rsvd17[15];
> +	__le32	nsids[];
> +};
> +
> +/* flag for the log specific field of the ANA log */
> +#define NVME_ANA_LOG_RGO	(1 << 0)
> +
> +/* nvme ana end */
> +
> +#endif
> diff --git a/multipath/multipath.conf.5 b/multipath/multipath.conf.5
> index 6333366..c5b2fb6 100644
> --- a/multipath/multipath.conf.5
> +++ b/multipath/multipath.conf.5
> @@ -334,6 +334,10 @@ priority provided as argument. Requires
> prio_args keyword.
>  Generate the path priority based on a latency algorithm.
>  Requires prio_args keyword.
>  .TP
> +.I ana
> +(Hardware-dependent)
> +Generate the path priority based on the NVMe ANA settings.
> +.TP
>  .I datacore
>  (Hardware-dependent)
>  Generate the path priority for some DataCore storage arrays.
> Requires prio_args
> @@ -1391,6 +1395,10 @@ Active/Standby mode exclusively.
>  .I 1 alua
>  (Hardware-dependent)
>  Hardware handler for SCSI-3 ALUA compatible arrays.
> +.TP
> +.I 1 ana
> +(Hardware-dependent)
> +Hardware handler for NVMe ANA compatible arrays.

No. hardware handlers are for SCSI only.

Regards,
Martin

-- 
Dr. Martin Wilck <mwilck@suse.com>, Tel. +49 (0)911 74053 2107
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
  2018-11-12 16:23 ` Martin Wilck
@ 2018-11-12 21:53     ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-12 21:53 UTC (permalink / raw)


On Mon, Nov 12 2018 at 11:23am -0500,
Martin Wilck <mwilck@suse.com> wrote:

> Hello Lijie,
> 
> > On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
> > Add support for Asynchronous Namespace Access as specified in NVMe
> > 1.3
> > TP 4004. The states are updated through reading the ANA log page.
> > 
> > By default, the native nvme multipath takes over the nvme device.
> > We can pass a false to the parameter 'multipath' of the nvme-core.ko
> > module,when we want to use multipath-tools.
> 
> Thank you for the patch. It looks quite good to me. I've tested it with
> a Linux target and found no problems so far.
> 
> I have a few questions and comments inline below.
> 
> I suggest you also have a look at detect_prio(); it seems to make sense
> to use the ana prioritizer for NVMe paths automatically if ANA is
> supported (with your patch, "detect_prio no" and "prio ana" have to be
> configured explicitly). But that can be done in a later patch.

I (and others) think it makes sense to at least triple check with the
NVMe developers (now cc'd) to see if we could get agreement on the nvme
driver providing the ANA state via sysfs (when modparam
nvme_core.multipath=N is set), like Hannes proposed here:
http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html

Then the userspace multipath-tools ANA support could just read sysfs
rather than reinvent harvesting the ANA state via ioctl.

But if we continue to hit a wall on that type of sharing of the nvme
driver's state then I'm fine with reinventing ANA state inquiry and
tracking like has been proposed here.

Mike

p.s. thanks for your review Martin, we really need to determine the way
forward for full multipath-tools support of NVMe with ANA.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
@ 2018-11-12 21:53     ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-12 21:53 UTC (permalink / raw)
  To: Martin Wilck
  Cc: linux-nvme, lijie, xose.vazquez, chengjike.cheng, shenhong09,
	dm-devel, wangzhoumengjian, sschremm

On Mon, Nov 12 2018 at 11:23am -0500,
Martin Wilck <mwilck@suse.com> wrote:

> Hello Lijie,
> 
> On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
> > Add support for Asynchronous Namespace Access as specified in NVMe
> > 1.3
> > TP 4004. The states are updated through reading the ANA log page.
> > 
> > By default, the native nvme multipath takes over the nvme device.
> > We can pass a false to the parameter 'multipath' of the nvme-core.ko
> > module,when we want to use multipath-tools.
> 
> Thank you for the patch. It looks quite good to me. I've tested it with
> a Linux target and found no problems so far.
> 
> I have a few questions and comments inline below.
> 
> I suggest you also have a look at detect_prio(); it seems to make sense
> to use the ana prioritizer for NVMe paths automatically if ANA is
> supported (with your patch, "detect_prio no" and "prio ana" have to be
> configured explicitly). But that can be done in a later patch.

I (and others) think it makes sense to at least triple check with the
NVMe developers (now cc'd) to see if we could get agreement on the nvme
driver providing the ANA state via sysfs (when modparam
nvme_core.multipath=N is set), like Hannes proposed here:
http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html

Then the userspace multipath-tools ANA support could just read sysfs
rather than reinvent harvesting the ANA state via ioctl.

But if we continue to hit a wall on that type of sharing of the nvme
driver's state then I'm fine with reinventing ANA state inquiry and
tracking like has been proposed here.

Mike

p.s. thanks for your review Martin, we really need to determine the way
forward for full multipath-tools support of NVMe with ANA.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
  2018-11-12 21:53     ` Mike Snitzer
@ 2018-11-13  6:59       ` Martin Wilck
  -1 siblings, 0 replies; 63+ messages in thread
From: Martin Wilck @ 2018-11-13  6:59 UTC (permalink / raw)


On Mon, 2018-11-12 at 16:53 -0500, Mike Snitzer wrote:
> 
> I (and others) think it makes sense to at least triple check with the
> NVMe developers (now cc'd) to see if we could get agreement on the
> nvme
> driver providing the ANA state via sysfs (when modparam
> nvme_core.multipath=N is set), like Hannes proposed here:
> http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> 
> Then the userspace multipath-tools ANA support could just read sysfs
> rather than reinvent harvesting the ANA state via ioctl.

If some kernels expose the sysfs attributes and some don't, what are we
supposed to do? In order to be portable, multipath-tools (and other
user space tools, for that matter) need to support both, and maintain
multiple code paths. Just like SCSI :-)

I'd really like to see this abstracted away with a "libnvme" (you name
it), which user space tools could simply call without having to worry
how it actually talks to the kernel.

Martin

-- 
Dr. Martin Wilck <mwilck@suse.com>, Tel. +49 (0)911 74053 2107
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
@ 2018-11-13  6:59       ` Martin Wilck
  0 siblings, 0 replies; 63+ messages in thread
From: Martin Wilck @ 2018-11-13  6:59 UTC (permalink / raw)
  To: Mike Snitzer, mwilck
  Cc: lijie, xose.vazquez, chengjike.cheng, linux-nvme, dm-devel,
	wangzhoumengjian, shenhong09, sschremm

On Mon, 2018-11-12 at 16:53 -0500, Mike Snitzer wrote:
> 
> I (and others) think it makes sense to at least triple check with the
> NVMe developers (now cc'd) to see if we could get agreement on the
> nvme
> driver providing the ANA state via sysfs (when modparam
> nvme_core.multipath=N is set), like Hannes proposed here:
> http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> 
> Then the userspace multipath-tools ANA support could just read sysfs
> rather than reinvent harvesting the ANA state via ioctl.

If some kernels expose the sysfs attributes and some don't, what are we
supposed to do? In order to be portable, multipath-tools (and other
user space tools, for that matter) need to support both, and maintain
multiple code paths. Just like SCSI :-)

I'd really like to see this abstracted away with a "libnvme" (you name
it), which user space tools could simply call without having to worry
how it actually talks to the kernel.

Martin

-- 
Dr. Martin Wilck <mwilck@suse.com>, Tel. +49 (0)911 74053 2107
SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton
HRB 21284 (AG Nürnberg)


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
  2018-11-12 21:53     ` Mike Snitzer
@ 2018-11-13 16:18       ` Keith Busch
  -1 siblings, 0 replies; 63+ messages in thread
From: Keith Busch @ 2018-11-13 16:18 UTC (permalink / raw)


On Mon, Nov 12, 2018 at 04:53:23PM -0500, Mike Snitzer wrote:
> On Mon, Nov 12 2018 at 11:23am -0500,
> Martin Wilck <mwilck@suse.com> wrote:
> 
> > Hello Lijie,
> > 
> > On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
> > > Add support for Asynchronous Namespace Access as specified in NVMe
> > > 1.3
> > > TP 4004. The states are updated through reading the ANA log page.
> > > 
> > > By default, the native nvme multipath takes over the nvme device.
> > > We can pass a false to the parameter 'multipath' of the nvme-core.ko
> > > module,when we want to use multipath-tools.
> > 
> > Thank you for the patch. It looks quite good to me. I've tested it with
> > a Linux target and found no problems so far.
> > 
> > I have a few questions and comments inline below.
> > 
> > I suggest you also have a look at detect_prio(); it seems to make sense
> > to use the ana prioritizer for NVMe paths automatically if ANA is
> > supported (with your patch, "detect_prio no" and "prio ana" have to be
> > configured explicitly). But that can be done in a later patch.
> 
> I (and others) think it makes sense to at least triple check with the
> NVMe developers (now cc'd) to see if we could get agreement on the nvme
> driver providing the ANA state via sysfs (when modparam
> nvme_core.multipath=N is set), like Hannes proposed here:
> http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> 
> Then the userspace multipath-tools ANA support could just read sysfs
> rather than reinvent harvesting the ANA state via ioctl.

I'd prefer not duplicating the log page parsing. Maybe nvme's shouldn't
even be tied to CONFIG_NVME_MULTIPATH so that the 'multipath' param
isn't even an issue.
 
> But if we continue to hit a wall on that type of sharing of the nvme
> driver's state then I'm fine with reinventing ANA state inquiry and
> tracking like has been proposed here.
> 
> Mike
> 
> p.s. thanks for your review Martin, we really need to determine the way
> forward for full multipath-tools support of NVMe with ANA.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
@ 2018-11-13 16:18       ` Keith Busch
  0 siblings, 0 replies; 63+ messages in thread
From: Keith Busch @ 2018-11-13 16:18 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: lijie, xose.vazquez, chengjike.cheng, shenhong09, dm-devel,
	wangzhoumengjian, linux-nvme, Martin Wilck, sschremm

On Mon, Nov 12, 2018 at 04:53:23PM -0500, Mike Snitzer wrote:
> On Mon, Nov 12 2018 at 11:23am -0500,
> Martin Wilck <mwilck@suse.com> wrote:
> 
> > Hello Lijie,
> > 
> > On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
> > > Add support for Asynchronous Namespace Access as specified in NVMe
> > > 1.3
> > > TP 4004. The states are updated through reading the ANA log page.
> > > 
> > > By default, the native nvme multipath takes over the nvme device.
> > > We can pass a false to the parameter 'multipath' of the nvme-core.ko
> > > module,when we want to use multipath-tools.
> > 
> > Thank you for the patch. It looks quite good to me. I've tested it with
> > a Linux target and found no problems so far.
> > 
> > I have a few questions and comments inline below.
> > 
> > I suggest you also have a look at detect_prio(); it seems to make sense
> > to use the ana prioritizer for NVMe paths automatically if ANA is
> > supported (with your patch, "detect_prio no" and "prio ana" have to be
> > configured explicitly). But that can be done in a later patch.
> 
> I (and others) think it makes sense to at least triple check with the
> NVMe developers (now cc'd) to see if we could get agreement on the nvme
> driver providing the ANA state via sysfs (when modparam
> nvme_core.multipath=N is set), like Hannes proposed here:
> http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> 
> Then the userspace multipath-tools ANA support could just read sysfs
> rather than reinvent harvesting the ANA state via ioctl.

I'd prefer not duplicating the log page parsing. Maybe nvme's shouldn't
even be tied to CONFIG_NVME_MULTIPATH so that the 'multipath' param
isn't even an issue.
 
> But if we continue to hit a wall on that type of sharing of the nvme
> driver's state then I'm fine with reinventing ANA state inquiry and
> tracking like has been proposed here.
> 
> Mike
> 
> p.s. thanks for your review Martin, we really need to determine the way
> forward for full multipath-tools support of NVMe with ANA.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
  2018-11-13 16:18       ` Keith Busch
@ 2018-11-13 18:00         ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-13 18:00 UTC (permalink / raw)
  To: Keith Busch, Sagi Grimberg, hch, axboe
  Cc: Martin Wilck, lijie, xose.vazquez, linux-nvme, chengjike.cheng,
	shenhong09, dm-devel, wangzhoumengjian, hare, christophe.varoqui,
	bmarzins, sschremm, linux-block, linux-kernel

On Tue, Nov 13 2018 at 11:18am -0500,
Keith Busch <keith.busch@intel.com> wrote:

> On Mon, Nov 12, 2018 at 04:53:23PM -0500, Mike Snitzer wrote:
> > On Mon, Nov 12 2018 at 11:23am -0500,
> > Martin Wilck <mwilck@suse.com> wrote:
> > 
> > > Hello Lijie,
> > > 
> > > On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
> > > > Add support for Asynchronous Namespace Access as specified in NVMe
> > > > 1.3
> > > > TP 4004. The states are updated through reading the ANA log page.
> > > > 
> > > > By default, the native nvme multipath takes over the nvme device.
> > > > We can pass a false to the parameter 'multipath' of the nvme-core.ko
> > > > module,when we want to use multipath-tools.
> > > 
> > > Thank you for the patch. It looks quite good to me. I've tested it with
> > > a Linux target and found no problems so far.
> > > 
> > > I have a few questions and comments inline below.
> > > 
> > > I suggest you also have a look at detect_prio(); it seems to make sense
> > > to use the ana prioritizer for NVMe paths automatically if ANA is
> > > supported (with your patch, "detect_prio no" and "prio ana" have to be
> > > configured explicitly). But that can be done in a later patch.
> > 
> > I (and others) think it makes sense to at least triple check with the
> > NVMe developers (now cc'd) to see if we could get agreement on the nvme
> > driver providing the ANA state via sysfs (when modparam
> > nvme_core.multipath=N is set), like Hannes proposed here:
> > http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> > 
> > Then the userspace multipath-tools ANA support could just read sysfs
> > rather than reinvent harvesting the ANA state via ioctl.
> 
> I'd prefer not duplicating the log page parsing. Maybe nvme's shouldn't
> even be tied to CONFIG_NVME_MULTIPATH so that the 'multipath' param
> isn't even an issue.

I like your instincts, we just need to take them a bit further.

Splitting out the kernel's ANA log page parsing won't buy us much given
it is userspace (multipath-tools) that needs to consume it.  The less
work userspace needs to do (because kernel has already done it) the
better.

If the NVMe driver is made to always track and export the ANA state via
sysfs [1] we'd avoid userspace parsing duplication "for free".  This
should occur regardless of what layer is reacting to the ANA state
changes (be it NVMe's native multipathing or multipath-tools).

ANA and NVMe multipathing really are disjoint, making them tightly
coupled only serves to force NVMe driver provided multipathing _or_
userspace ANA state tracking duplication that really isn't ideal [2].

We need a reasoned answer to the primary question of whether the NVMe
maintainers are willing to cooperate by providing this basic ANA sysfs
export even if nvme_core.multipath=N [1].

Christoph said "No" [3], but offered little _real_ justification for why
this isn't the right thing for NVMe in general -- even when asked the
question gets ignored [4].

The inability to provide proper justification for rejecting a patch
(that already had one co-maintainer's Reviewed-by [5]) _should_ render
that rejection baseless, and the patch applied (especially if there is
contributing subsystem developer interest in maintaining this support
over time, which there is).  At least that is what would happen in a
properly maintained kernel subsystem.

It'd really go a long way if senior Linux NVMe maintainers took steps to
accept reasonable changes.

Mike

[1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
[2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
[3]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020815.html
[4]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020846.html
[5]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020790.html

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
@ 2018-11-13 18:00         ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-13 18:00 UTC (permalink / raw)


On Tue, Nov 13 2018 at 11:18am -0500,
Keith Busch <keith.busch@intel.com> wrote:

> On Mon, Nov 12, 2018 at 04:53:23PM -0500, Mike Snitzer wrote:
> > On Mon, Nov 12 2018 at 11:23am -0500,
> > Martin Wilck <mwilck@suse.com> wrote:
> > 
> > > Hello Lijie,
> > > 
> > > On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
> > > > Add support for Asynchronous Namespace Access as specified in NVMe
> > > > 1.3
> > > > TP 4004. The states are updated through reading the ANA log page.
> > > > 
> > > > By default, the native nvme multipath takes over the nvme device.
> > > > We can pass a false to the parameter 'multipath' of the nvme-core.ko
> > > > module,when we want to use multipath-tools.
> > > 
> > > Thank you for the patch. It looks quite good to me. I've tested it with
> > > a Linux target and found no problems so far.
> > > 
> > > I have a few questions and comments inline below.
> > > 
> > > I suggest you also have a look at detect_prio(); it seems to make sense
> > > to use the ana prioritizer for NVMe paths automatically if ANA is
> > > supported (with your patch, "detect_prio no" and "prio ana" have to be
> > > configured explicitly). But that can be done in a later patch.
> > 
> > I (and others) think it makes sense to at least triple check with the
> > NVMe developers (now cc'd) to see if we could get agreement on the nvme
> > driver providing the ANA state via sysfs (when modparam
> > nvme_core.multipath=N is set), like Hannes proposed here:
> > http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> > 
> > Then the userspace multipath-tools ANA support could just read sysfs
> > rather than reinvent harvesting the ANA state via ioctl.
> 
> I'd prefer not duplicating the log page parsing. Maybe nvme's shouldn't
> even be tied to CONFIG_NVME_MULTIPATH so that the 'multipath' param
> isn't even an issue.

I like your instincts, we just need to take them a bit further.

Splitting out the kernel's ANA log page parsing won't buy us much given
it is userspace (multipath-tools) that needs to consume it.  The less
work userspace needs to do (because kernel has already done it) the
better.

If the NVMe driver is made to always track and export the ANA state via
sysfs [1] we'd avoid userspace parsing duplication "for free".  This
should occur regardless of what layer is reacting to the ANA state
changes (be it NVMe's native multipathing or multipath-tools).

ANA and NVMe multipathing really are disjoint, making them tightly
coupled only serves to force NVMe driver provided multipathing _or_
userspace ANA state tracking duplication that really isn't ideal [2].

We need a reasoned answer to the primary question of whether the NVMe
maintainers are willing to cooperate by providing this basic ANA sysfs
export even if nvme_core.multipath=N [1].

Christoph said "No" [3], but offered little _real_ justification for why
this isn't the right thing for NVMe in general -- even when asked the
question gets ignored [4].

The inability to provide proper justification for rejecting a patch
(that already had one co-maintainer's Reviewed-by [5]) _should_ render
that rejection baseless, and the patch applied (especially if there is
contributing subsystem developer interest in maintaining this support
over time, which there is).  At least that is what would happen in a
properly maintained kernel subsystem.

It'd really go a long way if senior Linux NVMe maintainers took steps to
accept reasonable changes.

Mike

[1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
[2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
[3]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020815.html
[4]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020846.html
[5]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020790.html

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
  2018-11-13 18:00         ` Mike Snitzer
@ 2018-11-14  5:38           ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-14  5:38 UTC (permalink / raw)
  To: Keith Busch, Sagi Grimberg, hch, axboe
  Cc: Martin Wilck, lijie, xose.vazquez, linux-nvme, chengjike.cheng,
	shenhong09, dm-devel, wangzhoumengjian, hare, christophe.varoqui,
	bmarzins, sschremm, linux-block, linux-kernel

On Tue, Nov 13 2018 at  1:00pm -0500,
Mike Snitzer <snitzer@redhat.com> wrote:

> On Tue, Nov 13 2018 at 11:18am -0500,
> Keith Busch <keith.busch@intel.com> wrote:
> 
> > On Mon, Nov 12, 2018 at 04:53:23PM -0500, Mike Snitzer wrote:
> > > On Mon, Nov 12 2018 at 11:23am -0500,
> > > Martin Wilck <mwilck@suse.com> wrote:
> > > 
> > > > Hello Lijie,
> > > > 
> > > > On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
> > > > > Add support for Asynchronous Namespace Access as specified in NVMe
> > > > > 1.3
> > > > > TP 4004. The states are updated through reading the ANA log page.
> > > > > 
> > > > > By default, the native nvme multipath takes over the nvme device.
> > > > > We can pass a false to the parameter 'multipath' of the nvme-core.ko
> > > > > module,when we want to use multipath-tools.
> > > > 
> > > > Thank you for the patch. It looks quite good to me. I've tested it with
> > > > a Linux target and found no problems so far.
> > > > 
> > > > I have a few questions and comments inline below.
> > > > 
> > > > I suggest you also have a look at detect_prio(); it seems to make sense
> > > > to use the ana prioritizer for NVMe paths automatically if ANA is
> > > > supported (with your patch, "detect_prio no" and "prio ana" have to be
> > > > configured explicitly). But that can be done in a later patch.
> > > 
> > > I (and others) think it makes sense to at least triple check with the
> > > NVMe developers (now cc'd) to see if we could get agreement on the nvme
> > > driver providing the ANA state via sysfs (when modparam
> > > nvme_core.multipath=N is set), like Hannes proposed here:
> > > http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> > > 
> > > Then the userspace multipath-tools ANA support could just read sysfs
> > > rather than reinvent harvesting the ANA state via ioctl.
> > 
> > I'd prefer not duplicating the log page parsing. Maybe nvme's shouldn't
> > even be tied to CONFIG_NVME_MULTIPATH so that the 'multipath' param
> > isn't even an issue.
> 
> I like your instincts, we just need to take them a bit further.
> 
> Splitting out the kernel's ANA log page parsing won't buy us much given
> it is userspace (multipath-tools) that needs to consume it.  The less
> work userspace needs to do (because kernel has already done it) the
> better.
> 
> If the NVMe driver is made to always track and export the ANA state via
> sysfs [1] we'd avoid userspace parsing duplication "for free".  This
> should occur regardless of what layer is reacting to the ANA state
> changes (be it NVMe's native multipathing or multipath-tools).
> 
> ANA and NVMe multipathing really are disjoint, making them tightly
> coupled only serves to force NVMe driver provided multipathing _or_
> userspace ANA state tracking duplication that really isn't ideal [2].
> 
> We need a reasoned answer to the primary question of whether the NVMe
> maintainers are willing to cooperate by providing this basic ANA sysfs
> export even if nvme_core.multipath=N [1].
> 
> Christoph said "No" [3], but offered little _real_ justification for why
> this isn't the right thing for NVMe in general.
...
> [1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> [2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
...

I knew there had to be a pretty tight coupling between the NVMe driver's
native multipathing and ANA support... and that the simplicity of
Hannes' patch [1] was too good to be true.

The real justification for not making Hannes' change is it'd effectively
be useless without first splitting out the ANA handling done during NVMe
request completion (NVME_SC_ANA_* cases in nvme_failover_req) that
triggers re-reading the ANA log page accordingly.

So without the ability to drive the ANA workqueue to trigger
nvme_read_ana_log() from the nvme driver's completion path -- even if
nvme_core.multipath=N -- it really doesn't buy multipath-tools anything
to have the NVMe driver export the ana state via sysfs, because that ANA
state will never get updated.

> The inability to provide proper justification for rejecting a patch
> (that already had one co-maintainer's Reviewed-by [5]) _should_ render
> that rejection baseless, and the patch applied (especially if there is
> contributing subsystem developer interest in maintaining this support
> over time, which there is).  At least that is what would happen in a
> properly maintained kernel subsystem.
> 
> It'd really go a long way if senior Linux NVMe maintainers took steps to
> accept reasonable changes.

Even though I'm frustrated I was clearly too harsh and regret my tone.
I promise to _try_ to suck less.

This dynamic of terse responses or no responses at all whenever NVMe
driver changes to ease multipath-tools NVMe support are floated is the
depressing gift that keeps on giving.  But enough excuses...

Not holding my breath BUT:
if decoupling the reading of ANA state from native NVMe multipathing
specific work during nvme request completion were an acceptable
advancement I'd gladly do the work.

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
@ 2018-11-14  5:38           ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-14  5:38 UTC (permalink / raw)


On Tue, Nov 13 2018 at  1:00pm -0500,
Mike Snitzer <snitzer@redhat.com> wrote:

> On Tue, Nov 13 2018 at 11:18am -0500,
> Keith Busch <keith.busch@intel.com> wrote:
> 
> > On Mon, Nov 12, 2018 at 04:53:23PM -0500, Mike Snitzer wrote:
> > > On Mon, Nov 12 2018 at 11:23am -0500,
> > > Martin Wilck <mwilck@suse.com> wrote:
> > > 
> > > > Hello Lijie,
> > > > 
> > > > On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
> > > > > Add support for Asynchronous Namespace Access as specified in NVMe
> > > > > 1.3
> > > > > TP 4004. The states are updated through reading the ANA log page.
> > > > > 
> > > > > By default, the native nvme multipath takes over the nvme device.
> > > > > We can pass a false to the parameter 'multipath' of the nvme-core.ko
> > > > > module,when we want to use multipath-tools.
> > > > 
> > > > Thank you for the patch. It looks quite good to me. I've tested it with
> > > > a Linux target and found no problems so far.
> > > > 
> > > > I have a few questions and comments inline below.
> > > > 
> > > > I suggest you also have a look at detect_prio(); it seems to make sense
> > > > to use the ana prioritizer for NVMe paths automatically if ANA is
> > > > supported (with your patch, "detect_prio no" and "prio ana" have to be
> > > > configured explicitly). But that can be done in a later patch.
> > > 
> > > I (and others) think it makes sense to at least triple check with the
> > > NVMe developers (now cc'd) to see if we could get agreement on the nvme
> > > driver providing the ANA state via sysfs (when modparam
> > > nvme_core.multipath=N is set), like Hannes proposed here:
> > > http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> > > 
> > > Then the userspace multipath-tools ANA support could just read sysfs
> > > rather than reinvent harvesting the ANA state via ioctl.
> > 
> > I'd prefer not duplicating the log page parsing. Maybe nvme's shouldn't
> > even be tied to CONFIG_NVME_MULTIPATH so that the 'multipath' param
> > isn't even an issue.
> 
> I like your instincts, we just need to take them a bit further.
> 
> Splitting out the kernel's ANA log page parsing won't buy us much given
> it is userspace (multipath-tools) that needs to consume it.  The less
> work userspace needs to do (because kernel has already done it) the
> better.
> 
> If the NVMe driver is made to always track and export the ANA state via
> sysfs [1] we'd avoid userspace parsing duplication "for free".  This
> should occur regardless of what layer is reacting to the ANA state
> changes (be it NVMe's native multipathing or multipath-tools).
> 
> ANA and NVMe multipathing really are disjoint, making them tightly
> coupled only serves to force NVMe driver provided multipathing _or_
> userspace ANA state tracking duplication that really isn't ideal [2].
> 
> We need a reasoned answer to the primary question of whether the NVMe
> maintainers are willing to cooperate by providing this basic ANA sysfs
> export even if nvme_core.multipath=N [1].
> 
> Christoph said "No" [3], but offered little _real_ justification for why
> this isn't the right thing for NVMe in general.
...
> [1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> [2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
...

I knew there had to be a pretty tight coupling between the NVMe driver's
native multipathing and ANA support... and that the simplicity of
Hannes' patch [1] was too good to be true.

The real justification for not making Hannes' change is it'd effectively
be useless without first splitting out the ANA handling done during NVMe
request completion (NVME_SC_ANA_* cases in nvme_failover_req) that
triggers re-reading the ANA log page accordingly.

So without the ability to drive the ANA workqueue to trigger
nvme_read_ana_log() from the nvme driver's completion path -- even if
nvme_core.multipath=N -- it really doesn't buy multipath-tools anything
to have the NVMe driver export the ana state via sysfs, because that ANA
state will never get updated.

> The inability to provide proper justification for rejecting a patch
> (that already had one co-maintainer's Reviewed-by [5]) _should_ render
> that rejection baseless, and the patch applied (especially if there is
> contributing subsystem developer interest in maintaining this support
> over time, which there is).  At least that is what would happen in a
> properly maintained kernel subsystem.
> 
> It'd really go a long way if senior Linux NVMe maintainers took steps to
> accept reasonable changes.

Even though I'm frustrated I was clearly too harsh and regret my tone.
I promise to _try_ to suck less.

This dynamic of terse responses or no responses at all whenever NVMe
driver changes to ease multipath-tools NVMe support are floated is the
depressing gift that keeps on giving.  But enough excuses...

Not holding my breath BUT:
if decoupling the reading of ANA state from native NVMe multipathing
specific work during nvme request completion were an acceptable
advancement I'd gladly do the work.

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
  2018-11-13 16:18       ` Keith Busch
@ 2018-11-14  7:24         ` Hannes Reinecke
  -1 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-14  7:24 UTC (permalink / raw)


On 11/13/18 5:18 PM, Keith Busch wrote:
> On Mon, Nov 12, 2018@04:53:23PM -0500, Mike Snitzer wrote:
>> On Mon, Nov 12 2018 at 11:23am -0500,
>> Martin Wilck <mwilck@suse.com> wrote:
>>
>>> Hello Lijie,
>>>
>>> On Thu, 2018-11-08@14:09 +0800, lijie wrote:
>>>> Add support for Asynchronous Namespace Access as specified in NVMe
>>>> 1.3
>>>> TP 4004. The states are updated through reading the ANA log page.
>>>>
>>>> By default, the native nvme multipath takes over the nvme device.
>>>> We can pass a false to the parameter 'multipath' of the nvme-core.ko
>>>> module,when we want to use multipath-tools.
>>>
>>> Thank you for the patch. It looks quite good to me. I've tested it with
>>> a Linux target and found no problems so far.
>>>
>>> I have a few questions and comments inline below.
>>>
>>> I suggest you also have a look at detect_prio(); it seems to make sense
>>> to use the ana prioritizer for NVMe paths automatically if ANA is
>>> supported (with your patch, "detect_prio no" and "prio ana" have to be
>>> configured explicitly). But that can be done in a later patch.
>>
>> I (and others) think it makes sense to at least triple check with the
>> NVMe developers (now cc'd) to see if we could get agreement on the nvme
>> driver providing the ANA state via sysfs (when modparam
>> nvme_core.multipath=N is set), like Hannes proposed here:
>> http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
>>
>> Then the userspace multipath-tools ANA support could just read sysfs
>> rather than reinvent harvesting the ANA state via ioctl.
> 
> I'd prefer not duplicating the log page parsing. Maybe nvme's shouldn't
> even be tied to CONFIG_NVME_MULTIPATH so that the 'multipath' param
> isn't even an issue.
>   
My argument here is that _ANA_ support should not be tied to the NVME 
native multipathing at all.

Whether or not ANA is present is a choice of the target implementation; 
the host has _zero_ influence on this. It may argue all it wants and 
doesn't even have to implement multipathing. But in the end if the 
target declares a path as 'inaccessible' the path _is_ inaccessible to 
the host. Even if that particular host doesn't implement or activate 
multipathing.
(You wouldn't believe how often I had this discussion with our QA team 
for ALUA ...)

Whether or not _multipathing_ is used is a _host_ implementation, and is 
actually independent of ANA support; linux itself proved the point here 
as we supported (symmetric) multipathing far longer than we had an ANA 
implementation.

So personally I don't see why the 'raw' ANA support (parsing log pages, 
figuring out path states etc) needs to be tied in with native NVMe 
multipathing. _Especially_ not as my patch really is trivial.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
@ 2018-11-14  7:24         ` Hannes Reinecke
  0 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-14  7:24 UTC (permalink / raw)
  To: Keith Busch, Mike Snitzer
  Cc: lijie, xose.vazquez, chengjike.cheng, shenhong09,
	Christoph Hellwig, dm-devel, wangzhoumengjian, linux-nvme,
	Martin Wilck, sschremm

On 11/13/18 5:18 PM, Keith Busch wrote:
> On Mon, Nov 12, 2018 at 04:53:23PM -0500, Mike Snitzer wrote:
>> On Mon, Nov 12 2018 at 11:23am -0500,
>> Martin Wilck <mwilck@suse.com> wrote:
>>
>>> Hello Lijie,
>>>
>>> On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
>>>> Add support for Asynchronous Namespace Access as specified in NVMe
>>>> 1.3
>>>> TP 4004. The states are updated through reading the ANA log page.
>>>>
>>>> By default, the native nvme multipath takes over the nvme device.
>>>> We can pass a false to the parameter 'multipath' of the nvme-core.ko
>>>> module,when we want to use multipath-tools.
>>>
>>> Thank you for the patch. It looks quite good to me. I've tested it with
>>> a Linux target and found no problems so far.
>>>
>>> I have a few questions and comments inline below.
>>>
>>> I suggest you also have a look at detect_prio(); it seems to make sense
>>> to use the ana prioritizer for NVMe paths automatically if ANA is
>>> supported (with your patch, "detect_prio no" and "prio ana" have to be
>>> configured explicitly). But that can be done in a later patch.
>>
>> I (and others) think it makes sense to at least triple check with the
>> NVMe developers (now cc'd) to see if we could get agreement on the nvme
>> driver providing the ANA state via sysfs (when modparam
>> nvme_core.multipath=N is set), like Hannes proposed here:
>> http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
>>
>> Then the userspace multipath-tools ANA support could just read sysfs
>> rather than reinvent harvesting the ANA state via ioctl.
> 
> I'd prefer not duplicating the log page parsing. Maybe nvme's shouldn't
> even be tied to CONFIG_NVME_MULTIPATH so that the 'multipath' param
> isn't even an issue.
>   
My argument here is that _ANA_ support should not be tied to the NVME 
native multipathing at all.

Whether or not ANA is present is a choice of the target implementation; 
the host has _zero_ influence on this. It may argue all it wants and 
doesn't even have to implement multipathing. But in the end if the 
target declares a path as 'inaccessible' the path _is_ inaccessible to 
the host. Even if that particular host doesn't implement or activate 
multipathing.
(You wouldn't believe how often I had this discussion with our QA team 
for ALUA ...)

Whether or not _multipathing_ is used is a _host_ implementation, and is 
actually independent of ANA support; linux itself proved the point here 
as we supported (symmetric) multipathing far longer than we had an ANA 
implementation.

So personally I don't see why the 'raw' ANA support (parsing log pages, 
figuring out path states etc) needs to be tied in with native NVMe 
multipathing. _Especially_ not as my patch really is trivial.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
  2018-11-14  5:38           ` Mike Snitzer
@ 2018-11-14  7:49             ` Hannes Reinecke
  -1 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-14  7:49 UTC (permalink / raw)
  To: Mike Snitzer, Keith Busch, Sagi Grimberg, hch, axboe
  Cc: Martin Wilck, lijie, xose.vazquez, linux-nvme, chengjike.cheng,
	shenhong09, dm-devel, wangzhoumengjian, christophe.varoqui,
	bmarzins, sschremm, linux-block, linux-kernel

On 11/14/18 6:38 AM, Mike Snitzer wrote:
> On Tue, Nov 13 2018 at  1:00pm -0500,
> Mike Snitzer <snitzer@redhat.com> wrote:
> 
>> On Tue, Nov 13 2018 at 11:18am -0500,
>> Keith Busch <keith.busch@intel.com> wrote:
>>
>>> On Mon, Nov 12, 2018 at 04:53:23PM -0500, Mike Snitzer wrote:
>>>> On Mon, Nov 12 2018 at 11:23am -0500,
>>>> Martin Wilck <mwilck@suse.com> wrote:
>>>>
>>>>> Hello Lijie,
>>>>>
>>>>> On Thu, 2018-11-08 at 14:09 +0800, lijie wrote:
>>>>>> Add support for Asynchronous Namespace Access as specified in NVMe
>>>>>> 1.3
>>>>>> TP 4004. The states are updated through reading the ANA log page.
>>>>>>
>>>>>> By default, the native nvme multipath takes over the nvme device.
>>>>>> We can pass a false to the parameter 'multipath' of the nvme-core.ko
>>>>>> module,when we want to use multipath-tools.
>>>>>
>>>>> Thank you for the patch. It looks quite good to me. I've tested it with
>>>>> a Linux target and found no problems so far.
>>>>>
>>>>> I have a few questions and comments inline below.
>>>>>
>>>>> I suggest you also have a look at detect_prio(); it seems to make sense
>>>>> to use the ana prioritizer for NVMe paths automatically if ANA is
>>>>> supported (with your patch, "detect_prio no" and "prio ana" have to be
>>>>> configured explicitly). But that can be done in a later patch.
>>>>
>>>> I (and others) think it makes sense to at least triple check with the
>>>> NVMe developers (now cc'd) to see if we could get agreement on the nvme
>>>> driver providing the ANA state via sysfs (when modparam
>>>> nvme_core.multipath=N is set), like Hannes proposed here:
>>>> http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
>>>>
>>>> Then the userspace multipath-tools ANA support could just read sysfs
>>>> rather than reinvent harvesting the ANA state via ioctl.
>>>
>>> I'd prefer not duplicating the log page parsing. Maybe nvme's shouldn't
>>> even be tied to CONFIG_NVME_MULTIPATH so that the 'multipath' param
>>> isn't even an issue.
>>
>> I like your instincts, we just need to take them a bit further.
>>
>> Splitting out the kernel's ANA log page parsing won't buy us much given
>> it is userspace (multipath-tools) that needs to consume it.  The less
>> work userspace needs to do (because kernel has already done it) the
>> better.
>>
>> If the NVMe driver is made to always track and export the ANA state via
>> sysfs [1] we'd avoid userspace parsing duplication "for free".  This
>> should occur regardless of what layer is reacting to the ANA state
>> changes (be it NVMe's native multipathing or multipath-tools).
>>
>> ANA and NVMe multipathing really are disjoint, making them tightly
>> coupled only serves to force NVMe driver provided multipathing _or_
>> userspace ANA state tracking duplication that really isn't ideal [2].
>>
>> We need a reasoned answer to the primary question of whether the NVMe
>> maintainers are willing to cooperate by providing this basic ANA sysfs
>> export even if nvme_core.multipath=N [1].
>>
>> Christoph said "No" [3], but offered little _real_ justification for why
>> this isn't the right thing for NVMe in general.
> ...
>> [1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
>> [2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
> ...
> 
> I knew there had to be a pretty tight coupling between the NVMe driver's
> native multipathing and ANA support... and that the simplicity of
> Hannes' patch [1] was too good to be true.
> 
> The real justification for not making Hannes' change is it'd effectively
> be useless without first splitting out the ANA handling done during NVMe
> request completion (NVME_SC_ANA_* cases in nvme_failover_req) that
> triggers re-reading the ANA log page accordingly.
> 
> So without the ability to drive the ANA workqueue to trigger
> nvme_read_ana_log() from the nvme driver's completion path -- even if
> nvme_core.multipath=N -- it really doesn't buy multipath-tools anything
> to have the NVMe driver export the ana state via sysfs, because that ANA
> state will never get updated.
> 
Hmm. Indeed, I was more focussed on having the sysfs attributes 
displayed, so yes, indeed it needs some more work.

>> The inability to provide proper justification for rejecting a patch
>> (that already had one co-maintainer's Reviewed-by [5]) _should_ render
>> that rejection baseless, and the patch applied (especially if there is
>> contributing subsystem developer interest in maintaining this support
>> over time, which there is).  At least that is what would happen in a
>> properly maintained kernel subsystem.
>>
>> It'd really go a long way if senior Linux NVMe maintainers took steps to
>> accept reasonable changes.
> 
> Even though I'm frustrated I was clearly too harsh and regret my tone.
> I promise to _try_ to suck less.
> 
> This dynamic of terse responses or no responses at all whenever NVMe
> driver changes to ease multipath-tools NVMe support are floated is the
> depressing gift that keeps on giving.  But enough excuses...
> 
> Not holding my breath BUT:
> if decoupling the reading of ANA state from native NVMe multipathing
> specific work during nvme request completion were an acceptable
> advancement I'd gladly do the work.
> 
I'd be happy to work on that, given that we'll have to have 'real' ANA 
support for device-mapper anyway for SLE12 SP4 etc.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
@ 2018-11-14  7:49             ` Hannes Reinecke
  0 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-14  7:49 UTC (permalink / raw)


On 11/14/18 6:38 AM, Mike Snitzer wrote:
> On Tue, Nov 13 2018 at  1:00pm -0500,
> Mike Snitzer <snitzer@redhat.com> wrote:
> 
>> On Tue, Nov 13 2018 at 11:18am -0500,
>> Keith Busch <keith.busch@intel.com> wrote:
>>
>>> On Mon, Nov 12, 2018@04:53:23PM -0500, Mike Snitzer wrote:
>>>> On Mon, Nov 12 2018 at 11:23am -0500,
>>>> Martin Wilck <mwilck@suse.com> wrote:
>>>>
>>>>> Hello Lijie,
>>>>>
>>>>> On Thu, 2018-11-08@14:09 +0800, lijie wrote:
>>>>>> Add support for Asynchronous Namespace Access as specified in NVMe
>>>>>> 1.3
>>>>>> TP 4004. The states are updated through reading the ANA log page.
>>>>>>
>>>>>> By default, the native nvme multipath takes over the nvme device.
>>>>>> We can pass a false to the parameter 'multipath' of the nvme-core.ko
>>>>>> module,when we want to use multipath-tools.
>>>>>
>>>>> Thank you for the patch. It looks quite good to me. I've tested it with
>>>>> a Linux target and found no problems so far.
>>>>>
>>>>> I have a few questions and comments inline below.
>>>>>
>>>>> I suggest you also have a look at detect_prio(); it seems to make sense
>>>>> to use the ana prioritizer for NVMe paths automatically if ANA is
>>>>> supported (with your patch, "detect_prio no" and "prio ana" have to be
>>>>> configured explicitly). But that can be done in a later patch.
>>>>
>>>> I (and others) think it makes sense to at least triple check with the
>>>> NVMe developers (now cc'd) to see if we could get agreement on the nvme
>>>> driver providing the ANA state via sysfs (when modparam
>>>> nvme_core.multipath=N is set), like Hannes proposed here:
>>>> http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
>>>>
>>>> Then the userspace multipath-tools ANA support could just read sysfs
>>>> rather than reinvent harvesting the ANA state via ioctl.
>>>
>>> I'd prefer not duplicating the log page parsing. Maybe nvme's shouldn't
>>> even be tied to CONFIG_NVME_MULTIPATH so that the 'multipath' param
>>> isn't even an issue.
>>
>> I like your instincts, we just need to take them a bit further.
>>
>> Splitting out the kernel's ANA log page parsing won't buy us much given
>> it is userspace (multipath-tools) that needs to consume it.  The less
>> work userspace needs to do (because kernel has already done it) the
>> better.
>>
>> If the NVMe driver is made to always track and export the ANA state via
>> sysfs [1] we'd avoid userspace parsing duplication "for free".  This
>> should occur regardless of what layer is reacting to the ANA state
>> changes (be it NVMe's native multipathing or multipath-tools).
>>
>> ANA and NVMe multipathing really are disjoint, making them tightly
>> coupled only serves to force NVMe driver provided multipathing _or_
>> userspace ANA state tracking duplication that really isn't ideal [2].
>>
>> We need a reasoned answer to the primary question of whether the NVMe
>> maintainers are willing to cooperate by providing this basic ANA sysfs
>> export even if nvme_core.multipath=N [1].
>>
>> Christoph said "No" [3], but offered little _real_ justification for why
>> this isn't the right thing for NVMe in general.
> ...
>> [1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
>> [2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
> ...
> 
> I knew there had to be a pretty tight coupling between the NVMe driver's
> native multipathing and ANA support... and that the simplicity of
> Hannes' patch [1] was too good to be true.
> 
> The real justification for not making Hannes' change is it'd effectively
> be useless without first splitting out the ANA handling done during NVMe
> request completion (NVME_SC_ANA_* cases in nvme_failover_req) that
> triggers re-reading the ANA log page accordingly.
> 
> So without the ability to drive the ANA workqueue to trigger
> nvme_read_ana_log() from the nvme driver's completion path -- even if
> nvme_core.multipath=N -- it really doesn't buy multipath-tools anything
> to have the NVMe driver export the ana state via sysfs, because that ANA
> state will never get updated.
> 
Hmm. Indeed, I was more focussed on having the sysfs attributes 
displayed, so yes, indeed it needs some more work.

>> The inability to provide proper justification for rejecting a patch
>> (that already had one co-maintainer's Reviewed-by [5]) _should_ render
>> that rejection baseless, and the patch applied (especially if there is
>> contributing subsystem developer interest in maintaining this support
>> over time, which there is).  At least that is what would happen in a
>> properly maintained kernel subsystem.
>>
>> It'd really go a long way if senior Linux NVMe maintainers took steps to
>> accept reasonable changes.
> 
> Even though I'm frustrated I was clearly too harsh and regret my tone.
> I promise to _try_ to suck less.
> 
> This dynamic of terse responses or no responses at all whenever NVMe
> driver changes to ease multipath-tools NVMe support are floated is the
> depressing gift that keeps on giving.  But enough excuses...
> 
> Not holding my breath BUT:
> if decoupling the reading of ANA state from native NVMe multipathing
> specific work during nvme request completion were an acceptable
> advancement I'd gladly do the work.
> 
I'd be happy to work on that, given that we'll have to have 'real' ANA 
support for device-mapper anyway for SLE12 SP4 etc.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [dm-devel] multipath-tools: add ANA support for NVMe device
  2018-11-14  7:49             ` Hannes Reinecke
@ 2018-11-14 10:36               ` Martin Wilck
  -1 siblings, 0 replies; 63+ messages in thread
From: Martin Wilck @ 2018-11-14 10:36 UTC (permalink / raw)
  To: Hannes Reinecke, Sagi Grimberg, Keith Busch, axboe, hch, Mike Snitzer
  Cc: xose.vazquez, chengjike.cheng, lijie, shenhong09,
	wangzhoumengjian, linux-nvme, sschremm, dm-devel, linux-block,
	linux-kernel

On Wed, 2018-11-14 at 08:49 +0100, Hannes Reinecke wrote:
> On 11/14/18 6:38 AM, Mike Snitzer wrote:
> > 
> > Not holding my breath BUT:
> > if decoupling the reading of ANA state from native NVMe
> > multipathing
> > specific work during nvme request completion were an acceptable
> > advancement I'd gladly do the work.
> > 
> I'd be happy to work on that, given that we'll have to have 'real'
> ANA 
> support for device-mapper anyway for SLE12 SP4 etc.

So, what's the way ahead for multipath-tools? 

Given that, IIUC, at least one official kernel (4.19) has been released
with ANA support but without the ANA sysfs attributes exported to user
space, multipath-tools can't rely on them being present. I for one have
no issue with copying code from nvme-cli and doing NVMe ioctls from
multipath-tools, as in Lijie's patch. When those sysfs attributes are
added (and work as expected), we will use them, but IMO we'll need to
fall back to ioctls until then, and also afterwards, because we can't
assume to be always running on the latest kernel.

Best,
Martin



^ permalink raw reply	[flat|nested] 63+ messages in thread

* [dm-devel] multipath-tools: add ANA support for NVMe device
@ 2018-11-14 10:36               ` Martin Wilck
  0 siblings, 0 replies; 63+ messages in thread
From: Martin Wilck @ 2018-11-14 10:36 UTC (permalink / raw)


On Wed, 2018-11-14@08:49 +0100, Hannes Reinecke wrote:
> On 11/14/18 6:38 AM, Mike Snitzer wrote:
> > 
> > Not holding my breath BUT:
> > if decoupling the reading of ANA state from native NVMe
> > multipathing
> > specific work during nvme request completion were an acceptable
> > advancement I'd gladly do the work.
> > 
> I'd be happy to work on that, given that we'll have to have 'real'
> ANA 
> support for device-mapper anyway for SLE12 SP4 etc.

So, what's the way ahead for multipath-tools? 

Given that, IIUC, at least one official kernel (4.19) has been released
with ANA support but without the ANA sysfs attributes exported to user
space, multipath-tools can't rely on them being present. I for one have
no issue with copying code from nvme-cli and doing NVMe ioctls from
multipath-tools, as in Lijie's patch. When those sysfs attributes are
added (and work as expected), we will use them, but IMO we'll need to
fall back to ioctls until then, and also afterwards, because we can't
assume to be always running on the latest kernel.

Best,
Martin

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
  2018-11-14  7:24         ` Hannes Reinecke
@ 2018-11-14 15:35           ` Christoph Hellwig
  -1 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-14 15:35 UTC (permalink / raw)


On Wed, Nov 14, 2018@08:24:05AM +0100, Hannes Reinecke wrote:
> My argument here is that _ANA_ support should not be tied to the NVME 
> native multipathing at all.

It should.  Because nvme driver multipathing is the only sanctioned
way to use it.  All other ways aren't supported and might break at
any time.

> So personally I don't see why the 'raw' ANA support (parsing log pages, 
> figuring out path states etc) needs to be tied in with native NVMe 
> multipathing. _Especially_ not as my patch really is trivial.

And not actually usable for anything..  The only thing you do is to
increase the code size for embedded nvme users that don't need any
multipathing.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
@ 2018-11-14 15:35           ` Christoph Hellwig
  0 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-14 15:35 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: lijie, xose.vazquez, Mike Snitzer, chengjike.cheng, linux-nvme,
	Christoph Hellwig, Keith Busch, dm-devel, wangzhoumengjian,
	shenhong09, Martin Wilck, sschremm

On Wed, Nov 14, 2018 at 08:24:05AM +0100, Hannes Reinecke wrote:
> My argument here is that _ANA_ support should not be tied to the NVME 
> native multipathing at all.

It should.  Because nvme driver multipathing is the only sanctioned
way to use it.  All other ways aren't supported and might break at
any time.

> So personally I don't see why the 'raw' ANA support (parsing log pages, 
> figuring out path states etc) needs to be tied in with native NVMe 
> multipathing. _Especially_ not as my patch really is trivial.

And not actually usable for anything..  The only thing you do is to
increase the code size for embedded nvme users that don't need any
multipathing.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
  2018-11-14 15:35           ` Christoph Hellwig
@ 2018-11-14 16:16             ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-14 16:16 UTC (permalink / raw)


On Wed, Nov 14 2018 at 10:35am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Wed, Nov 14, 2018@08:24:05AM +0100, Hannes Reinecke wrote:
> > My argument here is that _ANA_ support should not be tied to the NVME 
> > native multipathing at all.
> 
> It should.  Because nvme driver multipathing is the only sanctioned
> way to use it.  All other ways aren't supported and might break at
> any time.

Quite a few of us who are multipath-tools oriented would like the proper
separation rather than your more pragmatic isolated approach.  And we'd
fix it if it broke in the future.

> > So personally I don't see why the 'raw' ANA support (parsing log pages, 
> > figuring out path states etc) needs to be tied in with native NVMe 
> > multipathing. _Especially_ not as my patch really is trivial.
> 
> And not actually usable for anything..  They only thing you do is to
> increase the code size for embedded nvme users that don't need any
> multipathing.

Isn't that why CONFIG_NVME_MULTIPATH exists?  Embedded nvme users
wouldn't set that.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
@ 2018-11-14 16:16             ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-14 16:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: lijie, shenhong09, xose.vazquez, chengjike.cheng, linux-nvme,
	Keith Busch, dm-devel, wangzhoumengjian, Martin Wilck, sschremm

On Wed, Nov 14 2018 at 10:35am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Wed, Nov 14, 2018 at 08:24:05AM +0100, Hannes Reinecke wrote:
> > My argument here is that _ANA_ support should not be tied to the NVME 
> > native multipathing at all.
> 
> It should.  Because nvme driver multipathing is the only sanctioned
> way to use it.  All other ways aren't supported and might break at
> any time.

Quite a few of us who are multipath-tools oriented would like the proper
separation rather than your more pragmatic isolated approach.  And we'd
fix it if it broke in the future.

> > So personally I don't see why the 'raw' ANA support (parsing log pages, 
> > figuring out path states etc) needs to be tied in with native NVMe 
> > multipathing. _Especially_ not as my patch really is trivial.
> 
> And not actually usable for anything..  They only thing you do is to
> increase the code size for embedded nvme users that don't need any
> multipathing.

Isn't that why CONFIG_NVME_MULTIPATH exists?  Embedded nvme users
wouldn't set that.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
  2018-11-14  7:49             ` Hannes Reinecke
@ 2018-11-14 17:47               ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-14 17:47 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Keith Busch, Sagi Grimberg, hch, axboe, Martin Wilck, lijie,
	xose.vazquez, linux-nvme, chengjike.cheng, shenhong09, dm-devel,
	wangzhoumengjian, christophe.varoqui, bmarzins, sschremm,
	linux-block, linux-kernel

On Wed, Nov 14 2018 at  2:49am -0500,
Hannes Reinecke <hare@suse.de> wrote:

> On 11/14/18 6:38 AM, Mike Snitzer wrote:
> >On Tue, Nov 13 2018 at  1:00pm -0500,
> >Mike Snitzer <snitzer@redhat.com> wrote:
> >
> >>[1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> >>[2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
> >...
> >
> >I knew there had to be a pretty tight coupling between the NVMe driver's
> >native multipathing and ANA support... and that the simplicity of
> >Hannes' patch [1] was too good to be true.
> >
> >The real justification for not making Hannes' change is it'd effectively
> >be useless without first splitting out the ANA handling done during NVMe
> >request completion (NVME_SC_ANA_* cases in nvme_failover_req) that
> >triggers re-reading the ANA log page accordingly.
> >
> >So without the ability to drive the ANA workqueue to trigger
> >nvme_read_ana_log() from the nvme driver's completion path -- even if
> >nvme_core.multipath=N -- it really doesn't buy multipath-tools anything
> >to have the NVMe driver export the ana state via sysfs, because that ANA
> >state will never get updated.
> >
> Hmm. Indeed, I was more focussed on having the sysfs attributes
> displayed, so yes, indeed it needs some more work.
...
> >Not holding my breath BUT:
> >if decoupling the reading of ANA state from native NVMe multipathing
> >specific work during nvme request completion were an acceptable
> >advancement I'd gladly do the work.
> >
> I'd be happy to work on that, given that we'll have to have 'real'
> ANA support for device-mapper anyway for SLE12 SP4 etc.

I had a close enough look yesterday that I figured I'd just implement
what I reasoned through as one way forward, compile tested only (patch
relative to Jens' for-4.21/block):

 drivers/nvme/host/core.c      | 14 +++++++---
 drivers/nvme/host/multipath.c | 65 ++++++++++++++++++++++++++-----------------
 drivers/nvme/host/nvme.h      |  4 +++
 3 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f172d63db2b5..05313ab5d91e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -252,10 +252,16 @@ void nvme_complete_rq(struct request *req)
 	trace_nvme_complete_rq(req);
 
 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
-		    blk_path_error(status)) {
-			nvme_failover_req(req);
-			return;
+		if (blk_path_error(status)) {
+			struct nvme_ns *ns = req->q->queuedata;
+			u16 nvme_status = nvme_req(req)->status;
+
+			if (req->cmd_flags & REQ_NVME_MPATH) {
+				nvme_failover_req(req);
+				nvme_update_ana(ns, nvme_status);
+				return;
+			}
+			nvme_update_ana(ns, nvme_status);
 		}
 
 		if (!blk_queue_dying(req->q)) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5e3cc8c59a39..f7fbc161dc8c 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
 
 inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 {
-	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
+	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
 }
 
 /*
@@ -47,6 +47,17 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
+static bool nvme_ana_error(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		return true;
+	}
+	return false;
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -58,10 +69,7 @@ void nvme_failover_req(struct request *req)
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 	blk_mq_end_request(req, 0);
 
-	switch (status & 0x7ff) {
-	case NVME_SC_ANA_TRANSITION:
-	case NVME_SC_ANA_INACCESSIBLE:
-	case NVME_SC_ANA_PERSISTENT_LOSS:
+	if (nvme_ana_error(status)) {
 		/*
 		 * If we got back an ANA error we know the controller is alive,
 		 * but not ready to serve this namespaces.  The spec suggests
@@ -69,31 +77,38 @@ void nvme_failover_req(struct request *req)
 		 * that the admin and I/O queues are not serialized that is
 		 * fundamentally racy.  So instead just clear the current path,
 		 * mark the the path as pending and kick of a re-read of the ANA
-		 * log page ASAP.
+		 * log page ASAP (see nvme_update_ana() below).
 		 */
 		nvme_mpath_clear_current_path(ns);
-		if (ns->ctrl->ana_log_buf) {
-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
-			queue_work(nvme_wq, &ns->ctrl->ana_work);
+	} else {
+		switch (status & 0x7ff) {
+		case NVME_SC_HOST_PATH_ERROR:
+			/*
+			 * Temporary transport disruption in talking to the
+			 * controller.  Try to send on a new path.
+			 */
+			nvme_mpath_clear_current_path(ns);
+			break;
+		default:
+			/*
+			 * Reset the controller for any non-ANA error as we
+			 * don't know what caused the error.
+			 */
+			nvme_reset_ctrl(ns->ctrl);
+			break;
 		}
-		break;
-	case NVME_SC_HOST_PATH_ERROR:
-		/*
-		 * Temporary transport disruption in talking to the controller.
-		 * Try to send on a new path.
-		 */
-		nvme_mpath_clear_current_path(ns);
-		break;
-	default:
-		/*
-		 * Reset the controller for any non-ANA error as we don't know
-		 * what caused the error.
-		 */
-		nvme_reset_ctrl(ns->ctrl);
-		break;
 	}
+}
 
-	kblockd_schedule_work(&ns->head->requeue_work);
+void nvme_update_ana(struct nvme_ns *ns, u16 status)
+{
+	if (nvme_ana_error(status) && ns->ctrl->ana_log_buf) {
+		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+		queue_work(nvme_wq, &ns->ctrl->ana_work);
+	}
+
+	if (multipath)
+		kblockd_schedule_work(&ns->head->requeue_work);
 }
 
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 32a1f1cfdfb4..8b4bc2054b7a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -468,6 +468,7 @@ bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 			struct nvme_ctrl *ctrl, int *flags);
 void nvme_failover_req(struct request *req);
+void nvme_update_ana(struct nvme_ns *ns, u16 status);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -507,6 +508,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 static inline void nvme_failover_req(struct request *req)
 {
 }
+static inline void nvme_update_ana(struct nvme_ns *ns, u16 status)
+{
+}
 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 }

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
@ 2018-11-14 17:47               ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-14 17:47 UTC (permalink / raw)


On Wed, Nov 14 2018 at  2:49am -0500,
Hannes Reinecke <hare@suse.de> wrote:

> On 11/14/18 6:38 AM, Mike Snitzer wrote:
> >On Tue, Nov 13 2018 at  1:00pm -0500,
> >Mike Snitzer <snitzer@redhat.com> wrote:
> >
> >>[1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> >>[2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
> >...
> >
> >I knew there had to be a pretty tight coupling between the NVMe driver's
> >native multipathing and ANA support... and that the simplicity of
> >Hannes' patch [1] was too good to be true.
> >
> >The real justification for not making Hannes' change is it'd effectively
> >be useless without first splitting out the ANA handling done during NVMe
> >request completion (NVME_SC_ANA_* cases in nvme_failover_req) that
> >triggers re-reading the ANA log page accordingly.
> >
> >So without the ability to drive the ANA workqueue to trigger
> >nvme_read_ana_log() from the nvme driver's completion path -- even if
> >nvme_core.multipath=N -- it really doesn't buy multipath-tools anything
> >to have the NVMe driver export the ana state via sysfs, because that ANA
> >state will never get updated.
> >
> Hmm. Indeed, I was more focussed on having the sysfs attributes
> displayed, so yes, indeed it needs some more work.
...
> >Not holding my breath BUT:
> >if decoupling the reading of ANA state from native NVMe multipathing
> >specific work during nvme request completion were an acceptable
> >advancement I'd gladly do the work.
> >
> I'd be happy to work on that, given that we'll have to have 'real'
> ANA support for device-mapper anyway for SLE12 SP4 etc.

I had a close enough look yesterday that I figured I'd just implement
what I reasoned through as one way forward, compile tested only (patch
relative to Jens' for-4.21/block):

 drivers/nvme/host/core.c      | 14 +++++++---
 drivers/nvme/host/multipath.c | 65 ++++++++++++++++++++++++++-----------------
 drivers/nvme/host/nvme.h      |  4 +++
 3 files changed, 54 insertions(+), 29 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f172d63db2b5..05313ab5d91e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -252,10 +252,16 @@ void nvme_complete_rq(struct request *req)
 	trace_nvme_complete_rq(req);
 
 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
-		    blk_path_error(status)) {
-			nvme_failover_req(req);
-			return;
+		if (blk_path_error(status)) {
+			struct nvme_ns *ns = req->q->queuedata;
+			u16 nvme_status = nvme_req(req)->status;
+
+			if (req->cmd_flags & REQ_NVME_MPATH) {
+				nvme_failover_req(req);
+				nvme_update_ana(ns, nvme_status);
+				return;
+			}
+			nvme_update_ana(ns, nvme_status);
 		}
 
 		if (!blk_queue_dying(req->q)) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5e3cc8c59a39..f7fbc161dc8c 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
 
 inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 {
-	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
+	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
 }
 
 /*
@@ -47,6 +47,17 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
+static bool nvme_ana_error(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		return true;
+	}
+	return false;
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -58,10 +69,7 @@ void nvme_failover_req(struct request *req)
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 	blk_mq_end_request(req, 0);
 
-	switch (status & 0x7ff) {
-	case NVME_SC_ANA_TRANSITION:
-	case NVME_SC_ANA_INACCESSIBLE:
-	case NVME_SC_ANA_PERSISTENT_LOSS:
+	if (nvme_ana_error(status)) {
 		/*
 		 * If we got back an ANA error we know the controller is alive,
 		 * but not ready to serve this namespaces.  The spec suggests
@@ -69,31 +77,38 @@ void nvme_failover_req(struct request *req)
 		 * that the admin and I/O queues are not serialized that is
 		 * fundamentally racy.  So instead just clear the current path,
 		 * mark the the path as pending and kick of a re-read of the ANA
-		 * log page ASAP.
+		 * log page ASAP (see nvme_update_ana() below).
 		 */
 		nvme_mpath_clear_current_path(ns);
-		if (ns->ctrl->ana_log_buf) {
-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
-			queue_work(nvme_wq, &ns->ctrl->ana_work);
+	} else {
+		switch (status & 0x7ff) {
+		case NVME_SC_HOST_PATH_ERROR:
+			/*
+			 * Temporary transport disruption in talking to the
+			 * controller.  Try to send on a new path.
+			 */
+			nvme_mpath_clear_current_path(ns);
+			break;
+		default:
+			/*
+			 * Reset the controller for any non-ANA error as we
+			 * don't know what caused the error.
+			 */
+			nvme_reset_ctrl(ns->ctrl);
+			break;
 		}
-		break;
-	case NVME_SC_HOST_PATH_ERROR:
-		/*
-		 * Temporary transport disruption in talking to the controller.
-		 * Try to send on a new path.
-		 */
-		nvme_mpath_clear_current_path(ns);
-		break;
-	default:
-		/*
-		 * Reset the controller for any non-ANA error as we don't know
-		 * what caused the error.
-		 */
-		nvme_reset_ctrl(ns->ctrl);
-		break;
 	}
+}
 
-	kblockd_schedule_work(&ns->head->requeue_work);
+void nvme_update_ana(struct nvme_ns *ns, u16 status)
+{
+	if (nvme_ana_error(status) && ns->ctrl->ana_log_buf) {
+		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+		queue_work(nvme_wq, &ns->ctrl->ana_work);
+	}
+
+	if (multipath)
+		kblockd_schedule_work(&ns->head->requeue_work);
 }
 
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 32a1f1cfdfb4..8b4bc2054b7a 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -468,6 +468,7 @@ bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 			struct nvme_ctrl *ctrl, int *flags);
 void nvme_failover_req(struct request *req);
+void nvme_update_ana(struct nvme_ns *ns, u16 status);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -507,6 +508,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 static inline void nvme_failover_req(struct request *req)
 {
 }
+static inline void nvme_update_ana(struct nvme_ns *ns, u16 status)
+{
+}
 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 }

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
  2018-11-14 17:47               ` Mike Snitzer
@ 2018-11-14 18:51                 ` Hannes Reinecke
  -1 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-14 18:51 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Keith Busch, Sagi Grimberg, hch, axboe, Martin Wilck, lijie,
	xose.vazquez, linux-nvme, chengjike.cheng, shenhong09, dm-devel,
	wangzhoumengjian, christophe.varoqui, bmarzins, sschremm,
	linux-block, linux-kernel

On 11/14/18 6:47 PM, Mike Snitzer wrote:
> On Wed, Nov 14 2018 at  2:49am -0500,
> Hannes Reinecke <hare@suse.de> wrote:
> 
>> On 11/14/18 6:38 AM, Mike Snitzer wrote:
>>> On Tue, Nov 13 2018 at  1:00pm -0500,
>>> Mike Snitzer <snitzer@redhat.com> wrote:
>>>
>>>> [1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
>>>> [2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
>>> ...
>>>
>>> I knew there had to be a pretty tight coupling between the NVMe driver's
>>> native multipathing and ANA support... and that the simplicity of
>>> Hannes' patch [1] was too good to be true.
>>>
>>> The real justification for not making Hannes' change is it'd effectively
>>> be useless without first splitting out the ANA handling done during NVMe
>>> request completion (NVME_SC_ANA_* cases in nvme_failover_req) that
>>> triggers re-reading the ANA log page accordingly.
>>>
>>> So without the ability to drive the ANA workqueue to trigger
>>> nvme_read_ana_log() from the nvme driver's completion path -- even if
>>> nvme_core.multipath=N -- it really doesn't buy multipath-tools anything
>>> to have the NVMe driver export the ana state via sysfs, because that ANA
>>> state will never get updated.
>>>
>> Hmm. Indeed, I was more focussed on having the sysfs attributes
>> displayed, so yes, indeed it needs some more work.
> ...
>>> Not holding my breath BUT:
>>> if decoupling the reading of ANA state from native NVMe multipathing
>>> specific work during nvme request completion were an acceptable
>>> advancement I'd gladly do the work.
>>>
>> I'd be happy to work on that, given that we'll have to have 'real'
>> ANA support for device-mapper anyway for SLE12 SP4 etc.
> 
> I had a close enough look yesterday that I figured I'd just implement
> what I reasoned through as one way forward, compile tested only (patch
> relative to Jens' for-4.21/block):
> 
>  drivers/nvme/host/core.c      | 14 +++++++---
>  drivers/nvme/host/multipath.c | 65 ++++++++++++++++++++++++++-----------------
>  drivers/nvme/host/nvme.h      |  4 +++
>  3 files changed, 54 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index f172d63db2b5..05313ab5d91e 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -252,10 +252,16 @@ void nvme_complete_rq(struct request *req)
>  	trace_nvme_complete_rq(req);
>  
>  	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> -		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> -		    blk_path_error(status)) {
> -			nvme_failover_req(req);
> -			return;
> +		if (blk_path_error(status)) {
> +			struct nvme_ns *ns = req->q->queuedata;
> +			u16 nvme_status = nvme_req(req)->status;
> +
> +			if (req->cmd_flags & REQ_NVME_MPATH) {
> +				nvme_failover_req(req);
> +				nvme_update_ana(ns, nvme_status);
> +				return;
> +			}
> +			nvme_update_ana(ns, nvme_status);
>  		}
>  
>  		if (!blk_queue_dying(req->q)) {
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 5e3cc8c59a39..f7fbc161dc8c 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
>  
>  inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
>  {
> -	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
> +	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
>  }
>  
>  /*
> @@ -47,6 +47,17 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>  	}
>  }
>  
> +static bool nvme_ana_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
> +		return true;
> +	}
> +	return false;
> +}
> +
>  void nvme_failover_req(struct request *req)
>  {
>  	struct nvme_ns *ns = req->q->queuedata;
> @@ -58,10 +69,7 @@ void nvme_failover_req(struct request *req)
>  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>  	blk_mq_end_request(req, 0);
>  
> -	switch (status & 0x7ff) {
> -	case NVME_SC_ANA_TRANSITION:
> -	case NVME_SC_ANA_INACCESSIBLE:
> -	case NVME_SC_ANA_PERSISTENT_LOSS:
> +	if (nvme_ana_error(status)) {
>  		/*
>  		 * If we got back an ANA error we know the controller is alive,
>  		 * but not ready to serve this namespaces.  The spec suggests
> @@ -69,31 +77,38 @@ void nvme_failover_req(struct request *req)
>  		 * that the admin and I/O queues are not serialized that is
>  		 * fundamentally racy.  So instead just clear the current path,
>  		 * mark the the path as pending and kick of a re-read of the ANA
> -		 * log page ASAP.
> +		 * log page ASAP (see nvme_update_ana() below).
>  		 */
>  		nvme_mpath_clear_current_path(ns);
> -		if (ns->ctrl->ana_log_buf) {
> -			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> -			queue_work(nvme_wq, &ns->ctrl->ana_work);
> +	} else {
> +		switch (status & 0x7ff) {
> +		case NVME_SC_HOST_PATH_ERROR:
> +			/*
> +			 * Temporary transport disruption in talking to the
> +			 * controller.  Try to send on a new path.
> +			 */
> +			nvme_mpath_clear_current_path(ns);
> +			break;
> +		default:
> +			/*
> +			 * Reset the controller for any non-ANA error as we
> +			 * don't know what caused the error.
> +			 */
> +			nvme_reset_ctrl(ns->ctrl);
> +			break;
>  		}
> -		break;
> -	case NVME_SC_HOST_PATH_ERROR:
> -		/*
> -		 * Temporary transport disruption in talking to the controller.
> -		 * Try to send on a new path.
> -		 */
> -		nvme_mpath_clear_current_path(ns);
> -		break;
> -	default:
> -		/*
> -		 * Reset the controller for any non-ANA error as we don't know
> -		 * what caused the error.
> -		 */
> -		nvme_reset_ctrl(ns->ctrl);
> -		break;
>  	}
Maybe move nvme_mpath_clear_current_path() out of the loop (it wouldn't
matter if we clear the path if we reset the controller afterwards); that
might even clean up the code even more.

> +}
>  
> -	kblockd_schedule_work(&ns->head->requeue_work);
> +void nvme_update_ana(struct nvme_ns *ns, u16 status)
> +{
> +	if (nvme_ana_error(status) && ns->ctrl->ana_log_buf) {
> +		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> +		queue_work(nvme_wq, &ns->ctrl->ana_work);
> +	}
> +
> +	if (multipath)
> +		kblockd_schedule_work(&ns->head->requeue_work);
>  }

maybe use 'ns->head->disk' here; we only need to call this if we have a
multipath disk.

Remaining bits are okay.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
@ 2018-11-14 18:51                 ` Hannes Reinecke
  0 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-14 18:51 UTC (permalink / raw)


On 11/14/18 6:47 PM, Mike Snitzer wrote:
> On Wed, Nov 14 2018 at  2:49am -0500,
> Hannes Reinecke <hare@suse.de> wrote:
> 
>> On 11/14/18 6:38 AM, Mike Snitzer wrote:
>>> On Tue, Nov 13 2018 at  1:00pm -0500,
>>> Mike Snitzer <snitzer@redhat.com> wrote:
>>>
>>>> [1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
>>>> [2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
>>> ...
>>>
>>> I knew there had to be a pretty tight coupling between the NVMe driver's
>>> native multipathing and ANA support... and that the simplicity of
>>> Hannes' patch [1] was too good to be true.
>>>
>>> The real justification for not making Hannes' change is it'd effectively
>>> be useless without first splitting out the ANA handling done during NVMe
>>> request completion (NVME_SC_ANA_* cases in nvme_failover_req) that
>>> triggers re-reading the ANA log page accordingly.
>>>
>>> So without the ability to drive the ANA workqueue to trigger
>>> nvme_read_ana_log() from the nvme driver's completion path -- even if
>>> nvme_core.multipath=N -- it really doesn't buy multipath-tools anything
>>> to have the NVMe driver export the ana state via sysfs, because that ANA
>>> state will never get updated.
>>>
>> Hmm. Indeed, I was more focussed on having the sysfs attributes
>> displayed, so yes, indeed it needs some more work.
> ...
>>> Not holding my breath BUT:
>>> if decoupling the reading of ANA state from native NVMe multipathing
>>> specific work during nvme request completion were an acceptable
>>> advancement I'd gladly do the work.
>>>
>> I'd be happy to work on that, given that we'll have to have 'real'
>> ANA support for device-mapper anyway for SLE12 SP4 etc.
> 
> I had a close enough look yesterday that I figured I'd just implement
> what I reasoned through as one way forward, compile tested only (patch
> relative to Jens' for-4.21/block):
> 
>  drivers/nvme/host/core.c      | 14 +++++++---
>  drivers/nvme/host/multipath.c | 65 ++++++++++++++++++++++++++-----------------
>  drivers/nvme/host/nvme.h      |  4 +++
>  3 files changed, 54 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index f172d63db2b5..05313ab5d91e 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -252,10 +252,16 @@ void nvme_complete_rq(struct request *req)
>  	trace_nvme_complete_rq(req);
>  
>  	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> -		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> -		    blk_path_error(status)) {
> -			nvme_failover_req(req);
> -			return;
> +		if (blk_path_error(status)) {
> +			struct nvme_ns *ns = req->q->queuedata;
> +			u16 nvme_status = nvme_req(req)->status;
> +
> +			if (req->cmd_flags & REQ_NVME_MPATH) {
> +				nvme_failover_req(req);
> +				nvme_update_ana(ns, nvme_status);
> +				return;
> +			}
> +			nvme_update_ana(ns, nvme_status);
>  		}
>  
>  		if (!blk_queue_dying(req->q)) {
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 5e3cc8c59a39..f7fbc161dc8c 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
>  
>  inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
>  {
> -	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
> +	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
>  }
>  
>  /*
> @@ -47,6 +47,17 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>  	}
>  }
>  
> +static bool nvme_ana_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
> +		return true;
> +	}
> +	return false;
> +}
> +
>  void nvme_failover_req(struct request *req)
>  {
>  	struct nvme_ns *ns = req->q->queuedata;
> @@ -58,10 +69,7 @@ void nvme_failover_req(struct request *req)
>  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>  	blk_mq_end_request(req, 0);
>  
> -	switch (status & 0x7ff) {
> -	case NVME_SC_ANA_TRANSITION:
> -	case NVME_SC_ANA_INACCESSIBLE:
> -	case NVME_SC_ANA_PERSISTENT_LOSS:
> +	if (nvme_ana_error(status)) {
>  		/*
>  		 * If we got back an ANA error we know the controller is alive,
>  		 * but not ready to serve this namespaces.  The spec suggests
> @@ -69,31 +77,38 @@ void nvme_failover_req(struct request *req)
>  		 * that the admin and I/O queues are not serialized that is
>  		 * fundamentally racy.  So instead just clear the current path,
>  		 * mark the the path as pending and kick of a re-read of the ANA
> -		 * log page ASAP.
> +		 * log page ASAP (see nvme_update_ana() below).
>  		 */
>  		nvme_mpath_clear_current_path(ns);
> -		if (ns->ctrl->ana_log_buf) {
> -			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> -			queue_work(nvme_wq, &ns->ctrl->ana_work);
> +	} else {
> +		switch (status & 0x7ff) {
> +		case NVME_SC_HOST_PATH_ERROR:
> +			/*
> +			 * Temporary transport disruption in talking to the
> +			 * controller.  Try to send on a new path.
> +			 */
> +			nvme_mpath_clear_current_path(ns);
> +			break;
> +		default:
> +			/*
> +			 * Reset the controller for any non-ANA error as we
> +			 * don't know what caused the error.
> +			 */
> +			nvme_reset_ctrl(ns->ctrl);
> +			break;
>  		}
> -		break;
> -	case NVME_SC_HOST_PATH_ERROR:
> -		/*
> -		 * Temporary transport disruption in talking to the controller.
> -		 * Try to send on a new path.
> -		 */
> -		nvme_mpath_clear_current_path(ns);
> -		break;
> -	default:
> -		/*
> -		 * Reset the controller for any non-ANA error as we don't know
> -		 * what caused the error.
> -		 */
> -		nvme_reset_ctrl(ns->ctrl);
> -		break;
>  	}
Maybe move nvme_mpath_clear_current_path() out of the loop (it wouldn't
matter if we clear the path if we reset the controller afterwards); that
might even clean up the code even more.

> +}
>  
> -	kblockd_schedule_work(&ns->head->requeue_work);
> +void nvme_update_ana(struct nvme_ns *ns, u16 status)
> +{
> +	if (nvme_ana_error(status) && ns->ctrl->ana_log_buf) {
> +		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> +		queue_work(nvme_wq, &ns->ctrl->ana_work);
> +	}
> +
> +	if (multipath)
> +		kblockd_schedule_work(&ns->head->requeue_work);
>  }

maybe use 'ns->head->disk' here; we only need to call this if we have a
multipath disk.

Remaining bits are okay.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: multipath-tools: add ANA support for NVMe device
  2018-11-14 18:51                 ` Hannes Reinecke
@ 2018-11-14 19:26                   ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-14 19:26 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Keith Busch, Sagi Grimberg, hch, axboe, Martin Wilck, lijie,
	xose.vazquez, linux-nvme, chengjike.cheng, shenhong09, dm-devel,
	wangzhoumengjian, christophe.varoqui, bmarzins, sschremm,
	linux-block, linux-kernel

On Wed, Nov 14 2018 at  1:51pm -0500,
Hannes Reinecke <hare@suse.de> wrote:

> On 11/14/18 6:47 PM, Mike Snitzer wrote:
> > On Wed, Nov 14 2018 at  2:49am -0500,
> > Hannes Reinecke <hare@suse.de> wrote:
> > 
> >> On 11/14/18 6:38 AM, Mike Snitzer wrote:
> >>> On Tue, Nov 13 2018 at  1:00pm -0500,
> >>> Mike Snitzer <snitzer@redhat.com> wrote:
> >>>
> >>>> [1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> >>>> [2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
> >>> ...
> >>>
> >>> I knew there had to be a pretty tight coupling between the NVMe driver's
> >>> native multipathing and ANA support... and that the simplicity of
> >>> Hannes' patch [1] was too good to be true.
> >>>
> >>> The real justification for not making Hannes' change is it'd effectively
> >>> be useless without first splitting out the ANA handling done during NVMe
> >>> request completion (NVME_SC_ANA_* cases in nvme_failover_req) that
> >>> triggers re-reading the ANA log page accordingly.
> >>>
> >>> So without the ability to drive the ANA workqueue to trigger
> >>> nvme_read_ana_log() from the nvme driver's completion path -- even if
> >>> nvme_core.multipath=N -- it really doesn't buy multipath-tools anything
> >>> to have the NVMe driver export the ana state via sysfs, because that ANA
> >>> state will never get updated.
> >>>
> >> Hmm. Indeed, I was more focussed on having the sysfs attributes
> >> displayed, so yes, indeed it needs some more work.
> > ...
> >>> Not holding my breath BUT:
> >>> if decoupling the reading of ANA state from native NVMe multipathing
> >>> specific work during nvme request completion were an acceptable
> >>> advancement I'd gladly do the work.
> >>>
> >> I'd be happy to work on that, given that we'll have to have 'real'
> >> ANA support for device-mapper anyway for SLE12 SP4 etc.
> > 
> > I had a close enough look yesterday that I figured I'd just implement
> > what I reasoned through as one way forward, compile tested only (patch
> > relative to Jens' for-4.21/block):
> > 
> >  drivers/nvme/host/core.c      | 14 +++++++---
> >  drivers/nvme/host/multipath.c | 65 ++++++++++++++++++++++++++-----------------
> >  drivers/nvme/host/nvme.h      |  4 +++
> >  3 files changed, 54 insertions(+), 29 deletions(-)
> > 
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index f172d63db2b5..05313ab5d91e 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -252,10 +252,16 @@ void nvme_complete_rq(struct request *req)
> >  	trace_nvme_complete_rq(req);
> >  
> >  	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> > -		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> > -		    blk_path_error(status)) {
> > -			nvme_failover_req(req);
> > -			return;
> > +		if (blk_path_error(status)) {
> > +			struct nvme_ns *ns = req->q->queuedata;
> > +			u16 nvme_status = nvme_req(req)->status;
> > +
> > +			if (req->cmd_flags & REQ_NVME_MPATH) {
> > +				nvme_failover_req(req);
> > +				nvme_update_ana(ns, nvme_status);
> > +				return;
> > +			}
> > +			nvme_update_ana(ns, nvme_status);
> >  		}
> >  
> >  		if (!blk_queue_dying(req->q)) {
> > diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> > index 5e3cc8c59a39..f7fbc161dc8c 100644
> > --- a/drivers/nvme/host/multipath.c
> > +++ b/drivers/nvme/host/multipath.c
> > @@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
> >  
> >  inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
> >  {
> > -	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
> > +	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
> >  }
> >  
> >  /*
> > @@ -47,6 +47,17 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
> >  	}
> >  }
> >  
> > +static bool nvme_ana_error(u16 status)
> > +{
> > +	switch (status & 0x7ff) {
> > +	case NVME_SC_ANA_TRANSITION:
> > +	case NVME_SC_ANA_INACCESSIBLE:
> > +	case NVME_SC_ANA_PERSISTENT_LOSS:
> > +		return true;
> > +	}
> > +	return false;
> > +}
> > +
> >  void nvme_failover_req(struct request *req)
> >  {
> >  	struct nvme_ns *ns = req->q->queuedata;
> > @@ -58,10 +69,7 @@ void nvme_failover_req(struct request *req)
> >  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> >  	blk_mq_end_request(req, 0);
> >  
> > -	switch (status & 0x7ff) {
> > -	case NVME_SC_ANA_TRANSITION:
> > -	case NVME_SC_ANA_INACCESSIBLE:
> > -	case NVME_SC_ANA_PERSISTENT_LOSS:
> > +	if (nvme_ana_error(status)) {
> >  		/*
> >  		 * If we got back an ANA error we know the controller is alive,
> >  		 * but not ready to serve this namespaces.  The spec suggests
> > @@ -69,31 +77,38 @@ void nvme_failover_req(struct request *req)
> >  		 * that the admin and I/O queues are not serialized that is
> >  		 * fundamentally racy.  So instead just clear the current path,
> >  		 * mark the the path as pending and kick of a re-read of the ANA
> > -		 * log page ASAP.
> > +		 * log page ASAP (see nvme_update_ana() below).
> >  		 */
> >  		nvme_mpath_clear_current_path(ns);
> > -		if (ns->ctrl->ana_log_buf) {
> > -			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> > -			queue_work(nvme_wq, &ns->ctrl->ana_work);
> > +	} else {
> > +		switch (status & 0x7ff) {
> > +		case NVME_SC_HOST_PATH_ERROR:
> > +			/*
> > +			 * Temporary transport disruption in talking to the
> > +			 * controller.  Try to send on a new path.
> > +			 */
> > +			nvme_mpath_clear_current_path(ns);
> > +			break;
> > +		default:
> > +			/*
> > +			 * Reset the controller for any non-ANA error as we
> > +			 * don't know what caused the error.
> > +			 */
> > +			nvme_reset_ctrl(ns->ctrl);
> > +			break;
> >  		}
> > -		break;
> > -	case NVME_SC_HOST_PATH_ERROR:
> > -		/*
> > -		 * Temporary transport disruption in talking to the controller.
> > -		 * Try to send on a new path.
> > -		 */
> > -		nvme_mpath_clear_current_path(ns);
> > -		break;
> > -	default:
> > -		/*
> > -		 * Reset the controller for any non-ANA error as we don't know
> > -		 * what caused the error.
> > -		 */
> > -		nvme_reset_ctrl(ns->ctrl);
> > -		break;
> >  	}
> Maybe move nvme_mpath_clear_current_path() out of the loop (it wouldn't
> matter if we clear the path if we reset the controller afterwards); that
> might even clean up the code even more.

Not completely sure what you're suggesting.  But I was going for purely
functional equivalent of the existing upstream code -- just with
multipathing and ana split.

Could be that in future it wouldn't make sense to always
nvme_mpath_clear_current_path() for all cases?

> 
> > +}
> >  
> > -	kblockd_schedule_work(&ns->head->requeue_work);
> > +void nvme_update_ana(struct nvme_ns *ns, u16 status)
> > +{
> > +	if (nvme_ana_error(status) && ns->ctrl->ana_log_buf) {
> > +		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> > +		queue_work(nvme_wq, &ns->ctrl->ana_work);
> > +	}
> > +
> > +	if (multipath)
> > +		kblockd_schedule_work(&ns->head->requeue_work);
> >  }
> 
> maybe use 'ns->head->disk' here; we only need to call this if we have a
> multipath disk.

Sure.

> Remaining bits are okay.

Thanks,
Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* multipath-tools: add ANA support for NVMe device
@ 2018-11-14 19:26                   ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-14 19:26 UTC (permalink / raw)


On Wed, Nov 14 2018 at  1:51pm -0500,
Hannes Reinecke <hare@suse.de> wrote:

> On 11/14/18 6:47 PM, Mike Snitzer wrote:
> > On Wed, Nov 14 2018 at  2:49am -0500,
> > Hannes Reinecke <hare@suse.de> wrote:
> > 
> >> On 11/14/18 6:38 AM, Mike Snitzer wrote:
> >>> On Tue, Nov 13 2018 at  1:00pm -0500,
> >>> Mike Snitzer <snitzer@redhat.com> wrote:
> >>>
> >>>> [1]: http://lists.infradead.org/pipermail/linux-nvme/2018-November/020765.html
> >>>> [2]: https://www.redhat.com/archives/dm-devel/2018-November/msg00072.html
> >>> ...
> >>>
> >>> I knew there had to be a pretty tight coupling between the NVMe driver's
> >>> native multipathing and ANA support... and that the simplicity of
> >>> Hannes' patch [1] was too good to be true.
> >>>
> >>> The real justification for not making Hannes' change is it'd effectively
> >>> be useless without first splitting out the ANA handling done during NVMe
> >>> request completion (NVME_SC_ANA_* cases in nvme_failover_req) that
> >>> triggers re-reading the ANA log page accordingly.
> >>>
> >>> So without the ability to drive the ANA workqueue to trigger
> >>> nvme_read_ana_log() from the nvme driver's completion path -- even if
> >>> nvme_core.multipath=N -- it really doesn't buy multipath-tools anything
> >>> to have the NVMe driver export the ana state via sysfs, because that ANA
> >>> state will never get updated.
> >>>
> >> Hmm. Indeed, I was more focussed on having the sysfs attributes
> >> displayed, so yes, indeed it needs some more work.
> > ...
> >>> Not holding my breath BUT:
> >>> if decoupling the reading of ANA state from native NVMe multipathing
> >>> specific work during nvme request completion were an acceptable
> >>> advancement I'd gladly do the work.
> >>>
> >> I'd be happy to work on that, given that we'll have to have 'real'
> >> ANA support for device-mapper anyway for SLE12 SP4 etc.
> > 
> > I had a close enough look yesterday that I figured I'd just implement
> > what I reasoned through as one way forward, compile tested only (patch
> > relative to Jens' for-4.21/block):
> > 
> >  drivers/nvme/host/core.c      | 14 +++++++---
> >  drivers/nvme/host/multipath.c | 65 ++++++++++++++++++++++++++-----------------
> >  drivers/nvme/host/nvme.h      |  4 +++
> >  3 files changed, 54 insertions(+), 29 deletions(-)
> > 
> > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> > index f172d63db2b5..05313ab5d91e 100644
> > --- a/drivers/nvme/host/core.c
> > +++ b/drivers/nvme/host/core.c
> > @@ -252,10 +252,16 @@ void nvme_complete_rq(struct request *req)
> >  	trace_nvme_complete_rq(req);
> >  
> >  	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> > -		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> > -		    blk_path_error(status)) {
> > -			nvme_failover_req(req);
> > -			return;
> > +		if (blk_path_error(status)) {
> > +			struct nvme_ns *ns = req->q->queuedata;
> > +			u16 nvme_status = nvme_req(req)->status;
> > +
> > +			if (req->cmd_flags & REQ_NVME_MPATH) {
> > +				nvme_failover_req(req);
> > +				nvme_update_ana(ns, nvme_status);
> > +				return;
> > +			}
> > +			nvme_update_ana(ns, nvme_status);
> >  		}
> >  
> >  		if (!blk_queue_dying(req->q)) {
> > diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> > index 5e3cc8c59a39..f7fbc161dc8c 100644
> > --- a/drivers/nvme/host/multipath.c
> > +++ b/drivers/nvme/host/multipath.c
> > @@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
> >  
> >  inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
> >  {
> > -	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
> > +	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
> >  }
> >  
> >  /*
> > @@ -47,6 +47,17 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
> >  	}
> >  }
> >  
> > +static bool nvme_ana_error(u16 status)
> > +{
> > +	switch (status & 0x7ff) {
> > +	case NVME_SC_ANA_TRANSITION:
> > +	case NVME_SC_ANA_INACCESSIBLE:
> > +	case NVME_SC_ANA_PERSISTENT_LOSS:
> > +		return true;
> > +	}
> > +	return false;
> > +}
> > +
> >  void nvme_failover_req(struct request *req)
> >  {
> >  	struct nvme_ns *ns = req->q->queuedata;
> > @@ -58,10 +69,7 @@ void nvme_failover_req(struct request *req)
> >  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> >  	blk_mq_end_request(req, 0);
> >  
> > -	switch (status & 0x7ff) {
> > -	case NVME_SC_ANA_TRANSITION:
> > -	case NVME_SC_ANA_INACCESSIBLE:
> > -	case NVME_SC_ANA_PERSISTENT_LOSS:
> > +	if (nvme_ana_error(status)) {
> >  		/*
> >  		 * If we got back an ANA error we know the controller is alive,
> >  		 * but not ready to serve this namespaces.  The spec suggests
> > @@ -69,31 +77,38 @@ void nvme_failover_req(struct request *req)
> >  		 * that the admin and I/O queues are not serialized that is
> >  		 * fundamentally racy.  So instead just clear the current path,
> >  		 * mark the the path as pending and kick of a re-read of the ANA
> > -		 * log page ASAP.
> > +		 * log page ASAP (see nvme_update_ana() below).
> >  		 */
> >  		nvme_mpath_clear_current_path(ns);
> > -		if (ns->ctrl->ana_log_buf) {
> > -			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> > -			queue_work(nvme_wq, &ns->ctrl->ana_work);
> > +	} else {
> > +		switch (status & 0x7ff) {
> > +		case NVME_SC_HOST_PATH_ERROR:
> > +			/*
> > +			 * Temporary transport disruption in talking to the
> > +			 * controller.  Try to send on a new path.
> > +			 */
> > +			nvme_mpath_clear_current_path(ns);
> > +			break;
> > +		default:
> > +			/*
> > +			 * Reset the controller for any non-ANA error as we
> > +			 * don't know what caused the error.
> > +			 */
> > +			nvme_reset_ctrl(ns->ctrl);
> > +			break;
> >  		}
> > -		break;
> > -	case NVME_SC_HOST_PATH_ERROR:
> > -		/*
> > -		 * Temporary transport disruption in talking to the controller.
> > -		 * Try to send on a new path.
> > -		 */
> > -		nvme_mpath_clear_current_path(ns);
> > -		break;
> > -	default:
> > -		/*
> > -		 * Reset the controller for any non-ANA error as we don't know
> > -		 * what caused the error.
> > -		 */
> > -		nvme_reset_ctrl(ns->ctrl);
> > -		break;
> >  	}
> Maybe move nvme_mpath_clear_current_path() out of the loop (it wouldn't
> matter if we clear the path if we reset the controller afterwards); that
> might clean up the code even more.

Not completely sure what you're suggesting.  But I was going for purely
functional equivalent of the existing upstream code -- just with
multipathing and ana split.

Could be that in future it wouldn't make sense to always
nvme_mpath_clear_current_path() for all cases?

> 
> > +}
> >  
> > -	kblockd_schedule_work(&ns->head->requeue_work);
> > +void nvme_update_ana(struct nvme_ns *ns, u16 status)
> > +{
> > +	if (nvme_ana_error(status) && ns->ctrl->ana_log_buf) {
> > +		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> > +		queue_work(nvme_wq, &ns->ctrl->ana_work);
> > +	}
> > +
> > +	if (multipath)
> > +		kblockd_schedule_work(&ns->head->requeue_work);
> >  }
> 
> maybe use 'ns->head->disk' here; we only need to call this if we have a
> multipath disk.

Sure.

> Remaining bits are okay.

Thanks,
Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] nvme: allow ANA support to be independent of native multipathing
  2018-11-14 18:51                 ` Hannes Reinecke
@ 2018-11-15 17:46                   ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-15 17:46 UTC (permalink / raw)
  To: linux-nvme
  Cc: Hannes Reinecke, Keith Busch, Sagi Grimberg, hch, axboe,
	Martin Wilck, lijie, xose.vazquez, chengjike.cheng, shenhong09,
	dm-devel, wangzhoumengjian, christophe.varoqui, bmarzins,
	sschremm, linux-block, linux-kernel

Whether or not ANA is present is a choice of the target implementation;
the host (and whether it supports multipathing) has _zero_ influence on
this.  If the target declares a path as 'inaccessible' the path _is_
inaccessible to the host.  As such, ANA support should be functional
even if native multipathing is not.

Introduce ability to always re-read ANA log page as required due to ANA
error and make current ANA state available via sysfs -- even if native
multipathing is disabled on the host (e.g. nvme_core.multipath=N).

This affords userspace access to the current ANA state independent of
which layer might be doing multipathing.  It also allows multipath-tools
to rely on the NVMe driver for ANA support while dm-multipath takes care
of multipathing.

While implementing these changes care was taken to preserve the exact
ANA functionality and code sequence native multipathing has provided.
This manifests as native multipathing's nvme_failover_req() being
tweaked to call __nvme_update_ana() which was factored out to allow
nvme_update_ana() to be called independent of nvme_failover_req().

And as always, if embedded NVMe users do not want any performance
overhead associated with ANA or native NVMe multipathing they can
disable CONFIG_NVME_MULTIPATH.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/nvme/host/core.c      | 10 +++++----
 drivers/nvme/host/multipath.c | 49 +++++++++++++++++++++++++++++++++----------
 drivers/nvme/host/nvme.h      |  4 ++++
 3 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fe957166c4a9..3df607905628 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -255,10 +255,12 @@ void nvme_complete_rq(struct request *req)
 		nvme_req(req)->ctrl->comp_seen = true;
 
 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
-		    blk_path_error(status)) {
-			nvme_failover_req(req);
-			return;
+		if (blk_path_error(status)) {
+			if (req->cmd_flags & REQ_NVME_MPATH) {
+				nvme_failover_req(req);
+				return;
+			}
+			nvme_update_ana(req);
 		}
 
 		if (!blk_queue_dying(req->q)) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 8e03cda770c5..0adbcff5fba2 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
 
 inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 {
-	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
+	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
 }
 
 /*
@@ -47,6 +47,35 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
+static bool nvme_ana_error(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		return true;
+	}
+	return false;
+}
+
+static void __nvme_update_ana(struct nvme_ns *ns)
+{
+	if (!ns->ctrl->ana_log_buf)
+		return;
+
+	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+	queue_work(nvme_wq, &ns->ctrl->ana_work);
+}
+
+void nvme_update_ana(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	u16 status = nvme_req(req)->status;
+
+	if (nvme_ana_error(status))
+		__nvme_update_ana(ns);
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -58,25 +87,22 @@ void nvme_failover_req(struct request *req)
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 	blk_mq_end_request(req, 0);
 
-	switch (status & 0x7ff) {
-	case NVME_SC_ANA_TRANSITION:
-	case NVME_SC_ANA_INACCESSIBLE:
-	case NVME_SC_ANA_PERSISTENT_LOSS:
+	if (nvme_ana_error(status)) {
 		/*
 		 * If we got back an ANA error we know the controller is alive,
 		 * but not ready to serve this namespaces.  The spec suggests
 		 * we should update our general state here, but due to the fact
 		 * that the admin and I/O queues are not serialized that is
 		 * fundamentally racy.  So instead just clear the current path,
-		 * mark the the path as pending and kick of a re-read of the ANA
+		 * mark the path as pending and kick off a re-read of the ANA
 		 * log page ASAP.
 		 */
 		nvme_mpath_clear_current_path(ns);
-		if (ns->ctrl->ana_log_buf) {
-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
-			queue_work(nvme_wq, &ns->ctrl->ana_work);
-		}
-		break;
+		__nvme_update_ana(ns);
+		goto kick_requeue;
+	}
+
+	switch (status & 0x7ff) {
 	case NVME_SC_HOST_PATH_ERROR:
 		/*
 		 * Temporary transport disruption in talking to the controller.
@@ -93,6 +119,7 @@ void nvme_failover_req(struct request *req)
 		break;
 	}
 
+kick_requeue:
 	kblockd_schedule_work(&ns->head->requeue_work);
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 27663ce3044e..cbe4253f2d02 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -471,6 +471,7 @@ bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 			struct nvme_ctrl *ctrl, int *flags);
 void nvme_failover_req(struct request *req);
+void nvme_update_ana(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -510,6 +511,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 static inline void nvme_failover_req(struct request *req)
 {
 }
+static inline void nvme_update_ana(struct request *req)
+{
+}
 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 }
-- 
2.15.0


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH] nvme: allow ANA support to be independent of native multipathing
@ 2018-11-15 17:46                   ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-15 17:46 UTC (permalink / raw)


Whether or not ANA is present is a choice of the target implementation;
the host (and whether it supports multipathing) has _zero_ influence on
this.  If the target declares a path as 'inaccessible' the path _is_
inaccessible to the host.  As such, ANA support should be functional
even if native multipathing is not.

Introduce ability to always re-read ANA log page as required due to ANA
error and make current ANA state available via sysfs -- even if native
multipathing is disabled on the host (e.g. nvme_core.multipath=N).

This affords userspace access to the current ANA state independent of
which layer might be doing multipathing.  It also allows multipath-tools
to rely on the NVMe driver for ANA support while dm-multipath takes care
of multipathing.

While implementing these changes care was taken to preserve the exact
ANA functionality and code sequence native multipathing has provided.
This manifests as native multipathing's nvme_failover_req() being
tweaked to call __nvme_update_ana() which was factored out to allow
nvme_update_ana() to be called independent of nvme_failover_req().

And as always, if embedded NVMe users do not want any performance
overhead associated with ANA or native NVMe multipathing they can
disable CONFIG_NVME_MULTIPATH.

Signed-off-by: Mike Snitzer <snitzer at redhat.com>
---
 drivers/nvme/host/core.c      | 10 +++++----
 drivers/nvme/host/multipath.c | 49 +++++++++++++++++++++++++++++++++----------
 drivers/nvme/host/nvme.h      |  4 ++++
 3 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fe957166c4a9..3df607905628 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -255,10 +255,12 @@ void nvme_complete_rq(struct request *req)
 		nvme_req(req)->ctrl->comp_seen = true;
 
 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
-		    blk_path_error(status)) {
-			nvme_failover_req(req);
-			return;
+		if (blk_path_error(status)) {
+			if (req->cmd_flags & REQ_NVME_MPATH) {
+				nvme_failover_req(req);
+				return;
+			}
+			nvme_update_ana(req);
 		}
 
 		if (!blk_queue_dying(req->q)) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 8e03cda770c5..0adbcff5fba2 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
 
 inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 {
-	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
+	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
 }
 
 /*
@@ -47,6 +47,35 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
+static bool nvme_ana_error(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		return true;
+	}
+	return false;
+}
+
+static void __nvme_update_ana(struct nvme_ns *ns)
+{
+	if (!ns->ctrl->ana_log_buf)
+		return;
+
+	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+	queue_work(nvme_wq, &ns->ctrl->ana_work);
+}
+
+void nvme_update_ana(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	u16 status = nvme_req(req)->status;
+
+	if (nvme_ana_error(status))
+		__nvme_update_ana(ns);
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -58,25 +87,22 @@ void nvme_failover_req(struct request *req)
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 	blk_mq_end_request(req, 0);
 
-	switch (status & 0x7ff) {
-	case NVME_SC_ANA_TRANSITION:
-	case NVME_SC_ANA_INACCESSIBLE:
-	case NVME_SC_ANA_PERSISTENT_LOSS:
+	if (nvme_ana_error(status)) {
 		/*
 		 * If we got back an ANA error we know the controller is alive,
 		 * but not ready to serve this namespaces.  The spec suggests
 		 * we should update our general state here, but due to the fact
 		 * that the admin and I/O queues are not serialized that is
 		 * fundamentally racy.  So instead just clear the current path,
-		 * mark the the path as pending and kick of a re-read of the ANA
+		 * mark the path as pending and kick off a re-read of the ANA
 		 * log page ASAP.
 		 */
 		nvme_mpath_clear_current_path(ns);
-		if (ns->ctrl->ana_log_buf) {
-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
-			queue_work(nvme_wq, &ns->ctrl->ana_work);
-		}
-		break;
+		__nvme_update_ana(ns);
+		goto kick_requeue;
+	}
+
+	switch (status & 0x7ff) {
 	case NVME_SC_HOST_PATH_ERROR:
 		/*
 		 * Temporary transport disruption in talking to the controller.
@@ -93,6 +119,7 @@ void nvme_failover_req(struct request *req)
 		break;
 	}
 
+kick_requeue:
 	kblockd_schedule_work(&ns->head->requeue_work);
 }
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 27663ce3044e..cbe4253f2d02 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -471,6 +471,7 @@ bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 			struct nvme_ctrl *ctrl, int *flags);
 void nvme_failover_req(struct request *req);
+void nvme_update_ana(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -510,6 +511,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 static inline void nvme_failover_req(struct request *req)
 {
 }
+static inline void nvme_update_ana(struct request *req)
+{
+}
 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 }
-- 
2.15.0

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: [PATCH] nvme: allow ANA support to be independent of native multipathing
  2018-11-15 17:46                   ` Mike Snitzer
@ 2018-11-16  7:25                     ` Hannes Reinecke
  -1 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-16  7:25 UTC (permalink / raw)
  To: Mike Snitzer, linux-nvme
  Cc: Keith Busch, Sagi Grimberg, hch, axboe, Martin Wilck, lijie,
	xose.vazquez, chengjike.cheng, shenhong09, dm-devel,
	wangzhoumengjian, christophe.varoqui, bmarzins, sschremm,
	linux-block, linux-kernel

On 11/15/18 6:46 PM, Mike Snitzer wrote:
> Whether or not ANA is present is a choice of the target implementation;
> the host (and whether it supports multipathing) has _zero_ influence on
> this.  If the target declares a path as 'inaccessible' the path _is_
> inaccessible to the host.  As such, ANA support should be functional
> even if native multipathing is not.
> 
> Introduce ability to always re-read ANA log page as required due to ANA
> error and make current ANA state available via sysfs -- even if native
> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> 
> This affords userspace access to the current ANA state independent of
> which layer might be doing multipathing.  It also allows multipath-tools
> to rely on the NVMe driver for ANA support while dm-multipath takes care
> of multipathing.
> 
> While implementing these changes care was taken to preserve the exact
> ANA functionality and code sequence native multipathing has provided.
> This manifests as native multipathing's nvme_failover_req() being
> tweaked to call __nvme_update_ana() which was factored out to allow
> nvme_update_ana() to be called independent of nvme_failover_req().
> 
> And as always, if embedded NVMe users do not want any performance
> overhead associated with ANA or native NVMe multipathing they can
> disable CONFIG_NVME_MULTIPATH.
> 
> Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> ---
>   drivers/nvme/host/core.c      | 10 +++++----
>   drivers/nvme/host/multipath.c | 49 +++++++++++++++++++++++++++++++++----------
>   drivers/nvme/host/nvme.h      |  4 ++++
>   3 files changed, 48 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index fe957166c4a9..3df607905628 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -255,10 +255,12 @@ void nvme_complete_rq(struct request *req)
>   		nvme_req(req)->ctrl->comp_seen = true;
>   
>   	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> -		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> -		    blk_path_error(status)) {
> -			nvme_failover_req(req);
> -			return;
> +		if (blk_path_error(status)) {
> +			if (req->cmd_flags & REQ_NVME_MPATH) {
> +				nvme_failover_req(req);
> +				return;
> +			}
> +			nvme_update_ana(req);
>   		}
>   
>   		if (!blk_queue_dying(req->q)) {
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 8e03cda770c5..0adbcff5fba2 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
>   
>   inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
>   {
> -	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
> +	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
>   }
>   
>   /*
> @@ -47,6 +47,35 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>   	}
>   }
>   
> +static bool nvme_ana_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static void __nvme_update_ana(struct nvme_ns *ns)
> +{
> +	if (!ns->ctrl->ana_log_buf)
> +		return;
> +
> +	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> +	queue_work(nvme_wq, &ns->ctrl->ana_work);
> +}
> +
> +void nvme_update_ana(struct request *req)
> +{
> +	struct nvme_ns *ns = req->q->queuedata;
> +	u16 status = nvme_req(req)->status;
> +
> +	if (nvme_ana_error(status))
> +		__nvme_update_ana(ns);
> +}
> +
>   void nvme_failover_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> @@ -58,25 +87,22 @@ void nvme_failover_req(struct request *req)
>   	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>   	blk_mq_end_request(req, 0);
>   
> -	switch (status & 0x7ff) {
> -	case NVME_SC_ANA_TRANSITION:
> -	case NVME_SC_ANA_INACCESSIBLE:
> -	case NVME_SC_ANA_PERSISTENT_LOSS:
> +	if (nvme_ana_error(status)) {
>   		/*
>   		 * If we got back an ANA error we know the controller is alive,
>   		 * but not ready to serve this namespaces.  The spec suggests
>   		 * we should update our general state here, but due to the fact
>   		 * that the admin and I/O queues are not serialized that is
>   		 * fundamentally racy.  So instead just clear the current path,
> -		 * mark the the path as pending and kick of a re-read of the ANA
> +		 * mark the path as pending and kick off a re-read of the ANA
>   		 * log page ASAP.
>   		 */
>   		nvme_mpath_clear_current_path(ns);
> -		if (ns->ctrl->ana_log_buf) {
> -			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> -			queue_work(nvme_wq, &ns->ctrl->ana_work);
> -		}
> -		break;
> +		__nvme_update_ana(ns);
> +		goto kick_requeue;
> +	}
> +
> +	switch (status & 0x7ff) {
>   	case NVME_SC_HOST_PATH_ERROR:
>   		/*
>   		 * Temporary transport disruption in talking to the controller.
> @@ -93,6 +119,7 @@ void nvme_failover_req(struct request *req)
>   		break;
>   	}
>   
> +kick_requeue:
>   	kblockd_schedule_work(&ns->head->requeue_work);
>   }
>   
Doesn't this need to be protected by 'if (ns->head->disk)' or somesuch?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16  7:25                     ` Hannes Reinecke
  0 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-16  7:25 UTC (permalink / raw)


On 11/15/18 6:46 PM, Mike Snitzer wrote:
> Whether or not ANA is present is a choice of the target implementation;
> the host (and whether it supports multipathing) has _zero_ influence on
> this.  If the target declares a path as 'inaccessible' the path _is_
> inaccessible to the host.  As such, ANA support should be functional
> even if native multipathing is not.
> 
> Introduce ability to always re-read ANA log page as required due to ANA
> error and make current ANA state available via sysfs -- even if native
> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> 
> This affords userspace access to the current ANA state independent of
> which layer might be doing multipathing.  It also allows multipath-tools
> to rely on the NVMe driver for ANA support while dm-multipath takes care
> of multipathing.
> 
> While implementing these changes care was taken to preserve the exact
> ANA functionality and code sequence native multipathing has provided.
> This manifests as native multipathing's nvme_failover_req() being
> tweaked to call __nvme_update_ana() which was factored out to allow
> nvme_update_ana() to be called independent of nvme_failover_req().
> 
> And as always, if embedded NVMe users do not want any performance
> overhead associated with ANA or native NVMe multipathing they can
> disable CONFIG_NVME_MULTIPATH.
> 
> Signed-off-by: Mike Snitzer <snitzer at redhat.com>
> ---
>   drivers/nvme/host/core.c      | 10 +++++----
>   drivers/nvme/host/multipath.c | 49 +++++++++++++++++++++++++++++++++----------
>   drivers/nvme/host/nvme.h      |  4 ++++
>   3 files changed, 48 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index fe957166c4a9..3df607905628 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -255,10 +255,12 @@ void nvme_complete_rq(struct request *req)
>   		nvme_req(req)->ctrl->comp_seen = true;
>   
>   	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> -		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> -		    blk_path_error(status)) {
> -			nvme_failover_req(req);
> -			return;
> +		if (blk_path_error(status)) {
> +			if (req->cmd_flags & REQ_NVME_MPATH) {
> +				nvme_failover_req(req);
> +				return;
> +			}
> +			nvme_update_ana(req);
>   		}
>   
>   		if (!blk_queue_dying(req->q)) {
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 8e03cda770c5..0adbcff5fba2 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -22,7 +22,7 @@ MODULE_PARM_DESC(multipath,
>   
>   inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
>   {
> -	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
> +	return ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
>   }
>   
>   /*
> @@ -47,6 +47,35 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>   	}
>   }
>   
> +static bool nvme_ana_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
> +		return true;
> +	}
> +	return false;
> +}
> +
> +static void __nvme_update_ana(struct nvme_ns *ns)
> +{
> +	if (!ns->ctrl->ana_log_buf)
> +		return;
> +
> +	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> +	queue_work(nvme_wq, &ns->ctrl->ana_work);
> +}
> +
> +void nvme_update_ana(struct request *req)
> +{
> +	struct nvme_ns *ns = req->q->queuedata;
> +	u16 status = nvme_req(req)->status;
> +
> +	if (nvme_ana_error(status))
> +		__nvme_update_ana(ns);
> +}
> +
>   void nvme_failover_req(struct request *req)
>   {
>   	struct nvme_ns *ns = req->q->queuedata;
> @@ -58,25 +87,22 @@ void nvme_failover_req(struct request *req)
>   	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
>   	blk_mq_end_request(req, 0);
>   
> -	switch (status & 0x7ff) {
> -	case NVME_SC_ANA_TRANSITION:
> -	case NVME_SC_ANA_INACCESSIBLE:
> -	case NVME_SC_ANA_PERSISTENT_LOSS:
> +	if (nvme_ana_error(status)) {
>   		/*
>   		 * If we got back an ANA error we know the controller is alive,
>   		 * but not ready to serve this namespaces.  The spec suggests
>   		 * we should update our general state here, but due to the fact
>   		 * that the admin and I/O queues are not serialized that is
>   		 * fundamentally racy.  So instead just clear the current path,
> -		 * mark the the path as pending and kick of a re-read of the ANA
> +		 * mark the path as pending and kick off a re-read of the ANA
>   		 * log page ASAP.
>   		 */
>   		nvme_mpath_clear_current_path(ns);
> -		if (ns->ctrl->ana_log_buf) {
> -			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> -			queue_work(nvme_wq, &ns->ctrl->ana_work);
> -		}
> -		break;
> +		__nvme_update_ana(ns);
> +		goto kick_requeue;
> +	}
> +
> +	switch (status & 0x7ff) {
>   	case NVME_SC_HOST_PATH_ERROR:
>   		/*
>   		 * Temporary transport disruption in talking to the controller.
> @@ -93,6 +119,7 @@ void nvme_failover_req(struct request *req)
>   		break;
>   	}
>   
> +kick_requeue:
>   	kblockd_schedule_work(&ns->head->requeue_work);
>   }
>   
Doesn't this need to be protected by 'if (ns->head->disk)' or somesuch?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] nvme: allow ANA support to be independent of native multipathing
  2018-11-15 17:46                   ` Mike Snitzer
@ 2018-11-16  9:14                     ` Christoph Hellwig
  -1 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-16  9:14 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: linux-nvme, Hannes Reinecke, Keith Busch, Sagi Grimberg, hch,
	axboe, Martin Wilck, lijie, xose.vazquez, chengjike.cheng,
	shenhong09, dm-devel, wangzhoumengjian, christophe.varoqui,
	bmarzins, sschremm, linux-block, linux-kernel

On Thu, Nov 15, 2018 at 12:46:05PM -0500, Mike Snitzer wrote:
> Whether or not ANA is present is a choice of the target implementation;
> the host (and whether it supports multipathing) has _zero_ influence on
> this.  If the target declares a path as 'inaccessible' the path _is_
> inaccessible to the host.  As such, ANA support should be functional
> even if native multipathing is not.
> 
> Introduce ability to always re-read ANA log page as required due to ANA
> error and make current ANA state available via sysfs -- even if native
> multipathing is disabled on the host (e.g. nvme_core.multipath=N).

The first part I could see, but I still want to make it conditional
in some way as nvme is going into deeply embedded setups, and I don't
want to carry the weight of the ANA code around for everyone.

The second I fundamentally disagree with.  And even if you found agreement
it would have to be in a separate patch as it is a separate feature.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16  9:14                     ` Christoph Hellwig
  0 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-16  9:14 UTC (permalink / raw)


On Thu, Nov 15, 2018@12:46:05PM -0500, Mike Snitzer wrote:
> Whether or not ANA is present is a choice of the target implementation;
> the host (and whether it supports multipathing) has _zero_ influence on
> this.  If the target declares a path as 'inaccessible' the path _is_
> inaccessible to the host.  As such, ANA support should be functional
> even if native multipathing is not.
> 
> Introduce ability to always re-read ANA log page as required due to ANA
> error and make current ANA state available via sysfs -- even if native
> multipathing is disabled on the host (e.g. nvme_core.multipath=N).

The first part I could see, but I still want to make it conditional
in some way as nvme is going into deeply embedded setups, and I don't
want to carry the weight of the ANA code around for everyone.

The second I fundamentally disagree with.  And even if you found agreement
it would have to be in a separate patch as it is a separate feature.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] nvme: allow ANA support to be independent of native multipathing
  2018-11-16  9:14                     ` Christoph Hellwig
@ 2018-11-16  9:40                       ` Hannes Reinecke
  -1 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-16  9:40 UTC (permalink / raw)
  To: Christoph Hellwig, Mike Snitzer
  Cc: linux-nvme, Keith Busch, Sagi Grimberg, axboe, Martin Wilck,
	lijie, xose.vazquez, chengjike.cheng, shenhong09, dm-devel,
	wangzhoumengjian, christophe.varoqui, bmarzins, sschremm,
	linux-block, linux-kernel

On 11/16/18 10:14 AM, Christoph Hellwig wrote:
> On Thu, Nov 15, 2018 at 12:46:05PM -0500, Mike Snitzer wrote:
>> Whether or not ANA is present is a choice of the target implementation;
>> the host (and whether it supports multipathing) has _zero_ influence on
>> this.  If the target declares a path as 'inaccessible' the path _is_
>> inaccessible to the host.  As such, ANA support should be functional
>> even if native multipathing is not.
>>
>> Introduce ability to always re-read ANA log page as required due to ANA
>> error and make current ANA state available via sysfs -- even if native
>> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> 
> The first part I could see, but I still want to make it conditional
> in some way as nvme is going into deeply embedded setups, and I don't
> want to carry the weight of the ANA code around for everyone.
> 
Can you clarify this a bit?
We _do_ have the NVME multipath config option to deconfigure the whole 
thing during compile time; that isn't influenced with this patch.
So are you worried about the size of the ANA implementation itself?
Or are you worried about the size of the ANA structures?

> The second I fundamentally disagree with.  And even if you found agreement
> it would have to be in a separate patch as it is a separate feature.
> 
Why? Where's the problem with re-reading the ANA log pages if we get an 
event indicating that we should?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16  9:40                       ` Hannes Reinecke
  0 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-16  9:40 UTC (permalink / raw)


On 11/16/18 10:14 AM, Christoph Hellwig wrote:
> On Thu, Nov 15, 2018@12:46:05PM -0500, Mike Snitzer wrote:
>> Whether or not ANA is present is a choice of the target implementation;
>> the host (and whether it supports multipathing) has _zero_ influence on
>> this.  If the target declares a path as 'inaccessible' the path _is_
>> inaccessible to the host.  As such, ANA support should be functional
>> even if native multipathing is not.
>>
>> Introduce ability to always re-read ANA log page as required due to ANA
>> error and make current ANA state available via sysfs -- even if native
>> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> 
> The first part I could see, but I still want to make it conditional
> in some way as nvme is going into deeply embedded setups, and I don't
> want to carry the weight of the ANA code around for everyone.
> 
Can you clarify this a bit?
We _do_ have the NVME multipath config option to deconfigure the whole 
thing during compile time; that isn't influenced with this patch.
So are you worried about the size of the ANA implementation itself?
Or are you worried about the size of the ANA structures?

> The second I fundamentally disagree with.  And even if you found agreement
> it would have to be in a separate patch as it is a separate feature.
> 
Why? Where's the problem with re-reading the ANA log pages if we get an 
event indicating that we should?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] nvme: allow ANA support to be independent of native multipathing
  2018-11-16  9:40                       ` Hannes Reinecke
@ 2018-11-16  9:49                         ` Christoph Hellwig
  -1 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-16  9:49 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Mike Snitzer, linux-nvme, Keith Busch,
	Sagi Grimberg, axboe, Martin Wilck, lijie, xose.vazquez,
	chengjike.cheng, shenhong09, dm-devel, wangzhoumengjian,
	christophe.varoqui, bmarzins, sschremm, linux-block,
	linux-kernel

On Fri, Nov 16, 2018 at 10:40:40AM +0100, Hannes Reinecke wrote:
>>> Introduce ability to always re-read ANA log page as required due to ANA
>>> error and make current ANA state available via sysfs -- even if native
>>> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
>>
>> The first part I could see, but I still want to make it conditional
>> in some way as nvme is going into deeply embedded setups, and I don't
>> want to carry the weight of the ANA code around for everyone.
>>
> Can you clarify this a bit?
> We _do_ have the NVME multipath config option to deconfigure the whole 
> thing during compile time; that isn't influenced with this patch.
> So are you worried about the size of the ANA implementation itself?
> Or are you worried about the size of the ANA structures?

I just see the next step of wanting to move ANA code into the core
which is implied above.
>
>> The second I fundamentally disagree with.  And even if you found agreement
>> it would have to be in a separate patch as it is a separate feature.
>>
> Why? Where's the problem with re-reading the ANA log pages if we get an 
> event indicating that we should?

"second" here means the sysfs file.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16  9:49                         ` Christoph Hellwig
  0 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-16  9:49 UTC (permalink / raw)


On Fri, Nov 16, 2018@10:40:40AM +0100, Hannes Reinecke wrote:
>>> Introduce ability to always re-read ANA log page as required due to ANA
>>> error and make current ANA state available via sysfs -- even if native
>>> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
>>
>> The first part I could see, but I still want to make it conditional
>> in some way as nvme is going into deeply embedded setups, and I don't
>> want to carry the weight of the ANA code around for everyone.
>>
> Can you clarify this a bit?
> We _do_ have the NVME multipath config option to deconfigure the whole 
> thing during compile time; that isn't influenced with this patch.
> So are you worried about the size of the ANA implementation itself?
> Or are you worried about the size of the ANA structures?

I just see the next step of wanting to move ANA code into the core
which is implied above.
>
>> The second I fundamentally disagree with.  And even if you found agreement
>> it would have to be in a separate patch as it is a separate feature.
>>
> Why? Where's the problem with re-reading the ANA log pages if we get an 
> event indicating that we should?

"second" here means the sysfs file.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] nvme: allow ANA support to be independent of native multipathing
  2018-11-16  9:49                         ` Christoph Hellwig
@ 2018-11-16 10:06                           ` Hannes Reinecke
  -1 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-16 10:06 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mike Snitzer, linux-nvme, Keith Busch, Sagi Grimberg, axboe,
	Martin Wilck, lijie, xose.vazquez, chengjike.cheng, shenhong09,
	dm-devel, wangzhoumengjian, christophe.varoqui, bmarzins,
	sschremm, linux-block, linux-kernel

On 11/16/18 10:49 AM, Christoph Hellwig wrote:
> On Fri, Nov 16, 2018 at 10:40:40AM +0100, Hannes Reinecke wrote:
>>>> Introduce ability to always re-read ANA log page as required due to ANA
>>>> error and make current ANA state available via sysfs -- even if native
>>>> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
>>>
>>> The first part I could see, but I still want to make it conditional
>>> in some way as nvme is going into deeply embedded setups, and I don't
>>> want to carry the weight of the ANA code around for everyone.
>>>
>> Can you clarify this a bit?
>> We _do_ have the NVME multipath config option to deconfigure the whole
>> thing during compile time; that isn't influenced with this patch.
>> So are you worried about the size of the ANA implementation itself?
>> Or are you worried about the size of the ANA structures?
> 
> I just see the next step of wanting to move ANA code into the core
> which is implied above.

Really, I couldn't care less _where_ the ANA code lives.
If the size of which is any concern we can even make it configurable of 
sorts.

>>
>>> The second I fundamentally disagree with.  And even if you found agreement
>>> it would have to be in a separate patch as it is a separate feature.
>>>
>> Why? Where's the problem with re-reading the ANA log pages if we get an
>> event indicating that we should?
> 
> "second" here means the sysfs file.
> 
Ok, so would you be happy with making ANA support configurable?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare@suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16 10:06                           ` Hannes Reinecke
  0 siblings, 0 replies; 63+ messages in thread
From: Hannes Reinecke @ 2018-11-16 10:06 UTC (permalink / raw)


On 11/16/18 10:49 AM, Christoph Hellwig wrote:
> On Fri, Nov 16, 2018@10:40:40AM +0100, Hannes Reinecke wrote:
>>>> Introduce ability to always re-read ANA log page as required due to ANA
>>>> error and make current ANA state available via sysfs -- even if native
>>>> multipathing is disabled on the host (e.g. nvme_core.multipath=N).
>>>
>>> The first part I could see, but I still want to make it conditional
>>> in some way as nvme is going into deeply embedded setups, and I don't
>>> want to carry the weight of the ANA code around for everyone.
>>>
>> Can you clarify this a bit?
>> We _do_ have the NVME multipath config option to deconfigure the whole
>> thing during compile time; that isn't influenced with this patch.
>> So are you worried about the size of the ANA implementation itself?
>> Or are you worried about the size of the ANA structures?
> 
> I just see the next step of wanting to move ANA code into the core
> which is implied above.

Really, I couldn't care less _where_ the ANA code lives.
If the size of which is any concern we can even make it configurable of 
sorts.

>>
>>> The second I fundamentally disagree with.  And even if you found agreement
>>> it would have to be in a separate patch as it is a separate feature.
>>>
>> Why? Where's the problem with re-reading the ANA log pages if we get an
>> event indicating that we should?
> 
> "second" here means the sysfs file.
> 
Ok, so would you be happy with making ANA support configurable?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke		   Teamlead Storage & Networking
hare at suse.de			               +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: [PATCH] nvme: allow ANA support to be independent of native multipathing
  2018-11-16 10:06                           ` Hannes Reinecke
@ 2018-11-16 10:17                             ` Christoph Hellwig
  -1 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-16 10:17 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Christoph Hellwig, Mike Snitzer, linux-nvme, Keith Busch,
	Sagi Grimberg, axboe, Martin Wilck, lijie, xose.vazquez,
	chengjike.cheng, shenhong09, dm-devel, wangzhoumengjian,
	christophe.varoqui, bmarzins, sschremm, linux-block,
	linux-kernel

On Fri, Nov 16, 2018 at 11:06:32AM +0100, Hannes Reinecke wrote:
> Ok, so would you be happy with making ANA support configurable?

I've looked a bit over the whole situation, and what I think we need
to do is:

 a) warn if we see an ANA-capable device without multipath support
    so people know it is not going to work properly.
 b) deprecate the multipath module option.  It was only intended as
    a migration for any pre-existing PCIe multipath user if there
    were any, not to support any new functionality.  So for 4.20
    put in a patch that prints a clear warning when it is used,
    including a link to the nvme list, and then for 4.25 or so
    remove it entirely unless something unexpected comes up.

This whole drama of optional multipath use has wasted way too much
of everyone's time already.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH] nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16 10:17                             ` Christoph Hellwig
  0 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-16 10:17 UTC (permalink / raw)


On Fri, Nov 16, 2018@11:06:32AM +0100, Hannes Reinecke wrote:
> Ok, so would you be happy with making ANA support configurable?

I've looked a bit over the whole situation, and what I think we need
to do is:

 a) warn if we see an ANA-capable device without multipath support
    so people know it is not going to work properly.
 b) deprecate the multipath module option.  It was only intended as
    a migration for any pre-existing PCIe multipath user if there
    were any, not to support any new functionality.  So for 4.20
    put in a patch that prints a clear warning when it is used,
    including a link to the nvme list, and then for 4.25 or so
    remove it entirely unless something unexpected comes up.

This whole drama of optional multipath use has wasted way too much
of everyone's time already.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: nvme: allow ANA support to be independent of native multipathing
  2018-11-16  7:25                     ` Hannes Reinecke
@ 2018-11-16 14:01                       ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-16 14:01 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: linux-nvme, Keith Busch, Sagi Grimberg, hch, axboe, Martin Wilck,
	lijie, xose.vazquez, chengjike.cheng, shenhong09, dm-devel,
	wangzhoumengjian, christophe.varoqui, bmarzins, sschremm,
	linux-block, linux-kernel

On Fri, Nov 16 2018 at  2:25am -0500,
Hannes Reinecke <hare@suse.de> wrote:

> On 11/15/18 6:46 PM, Mike Snitzer wrote:
> >Whether or not ANA is present is a choice of the target implementation;
> >the host (and whether it supports multipathing) has _zero_ influence on
> >this.  If the target declares a path as 'inaccessible' the path _is_
> >inaccessible to the host.  As such, ANA support should be functional
> >even if native multipathing is not.
> >
> >Introduce ability to always re-read ANA log page as required due to ANA
> >error and make current ANA state available via sysfs -- even if native
> >multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> >
> >This affords userspace access to the current ANA state independent of
> >which layer might be doing multipathing.  It also allows multipath-tools
> >to rely on the NVMe driver for ANA support while dm-multipath takes care
> >of multipathing.
> >
> >While implementing these changes care was taken to preserve the exact
> >ANA functionality and code sequence native multipathing has provided.
> >This manifests as native multipathing's nvme_failover_req() being
> >tweaked to call __nvme_update_ana() which was factored out to allow
> >nvme_update_ana() to be called independent of nvme_failover_req().
> >
> >And as always, if embedded NVMe users do not want any performance
> >overhead associated with ANA or native NVMe multipathing they can
> >disable CONFIG_NVME_MULTIPATH.
> >
> >Signed-off-by: Mike Snitzer <snitzer@redhat.com>
> >---
> >  drivers/nvme/host/core.c      | 10 +++++----
> >  drivers/nvme/host/multipath.c | 49 +++++++++++++++++++++++++++++++++----------
> >  drivers/nvme/host/nvme.h      |  4 ++++
> >  3 files changed, 48 insertions(+), 15 deletions(-)
> >
> >diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> >index fe957166c4a9..3df607905628 100644
> >--- a/drivers/nvme/host/core.c
> >+++ b/drivers/nvme/host/core.c
> >@@ -255,10 +255,12 @@ void nvme_complete_rq(struct request *req)
> >  		nvme_req(req)->ctrl->comp_seen = true;
> >  	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> >-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> >-		    blk_path_error(status)) {
> >-			nvme_failover_req(req);
> >-			return;
> >+		if (blk_path_error(status)) {
> >+			if (req->cmd_flags & REQ_NVME_MPATH) {
> >+				nvme_failover_req(req);
> >+				return;
> >+			}
> >+			nvme_update_ana(req);
> >  		}
> >  		if (!blk_queue_dying(req->q)) {

...

> >diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> >index 8e03cda770c5..0adbcff5fba2 100644
> >--- a/drivers/nvme/host/multipath.c
> >+++ b/drivers/nvme/host/multipath.c
> >@@ -58,25 +87,22 @@ void nvme_failover_req(struct request *req)
> >  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> >  	blk_mq_end_request(req, 0);
> >-	switch (status & 0x7ff) {
> >-	case NVME_SC_ANA_TRANSITION:
> >-	case NVME_SC_ANA_INACCESSIBLE:
> >-	case NVME_SC_ANA_PERSISTENT_LOSS:
> >+	if (nvme_ana_error(status)) {
> >  		/*
> >  		 * If we got back an ANA error we know the controller is alive,
> >  		 * but not ready to serve this namespaces.  The spec suggests
> >  		 * we should update our general state here, but due to the fact
> >  		 * that the admin and I/O queues are not serialized that is
> >  		 * fundamentally racy.  So instead just clear the current path,
> >-		 * mark the the path as pending and kick of a re-read of the ANA
> >+		 * mark the path as pending and kick off a re-read of the ANA
> >  		 * log page ASAP.
> >  		 */
> >  		nvme_mpath_clear_current_path(ns);
> >-		if (ns->ctrl->ana_log_buf) {
> >-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> >-			queue_work(nvme_wq, &ns->ctrl->ana_work);
> >-		}
> >-		break;
> >+		__nvme_update_ana(ns);
> >+		goto kick_requeue;
> >+	}
> >+
> >+	switch (status & 0x7ff) {
> >  	case NVME_SC_HOST_PATH_ERROR:
> >  		/*
> >  		 * Temporary transport disruption in talking to the controller.
> >@@ -93,6 +119,7 @@ void nvme_failover_req(struct request *req)
> >  		break;
> >  	}
> >+kick_requeue:
> >  	kblockd_schedule_work(&ns->head->requeue_work);
> >  }
> Doesn't this need to be protected by 'if (ns->head->disk)' or somesuch?

No.  nvme_failover_req() is only ever called by native multipathing; see
nvme_complete_rq()'s check for req->cmd_flags & REQ_NVME_MPATH as the
condition for calling nvme_complete_rq().

The previous RFC-style patch I posted muddled ANA and multipathing in
nvme_update_ana() but this final patch submission was fixed because I
saw a cleaner way forward by having nvme_failover_req() also do ANA work
just like it always has -- albeit with new helpers that
nvme_update_ana() also calls.

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16 14:01                       ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-16 14:01 UTC (permalink / raw)


On Fri, Nov 16 2018 at  2:25am -0500,
Hannes Reinecke <hare@suse.de> wrote:

> On 11/15/18 6:46 PM, Mike Snitzer wrote:
> >Whether or not ANA is present is a choice of the target implementation;
> >the host (and whether it supports multipathing) has _zero_ influence on
> >this.  If the target declares a path as 'inaccessible' the path _is_
> >inaccessible to the host.  As such, ANA support should be functional
> >even if native multipathing is not.
> >
> >Introduce ability to always re-read ANA log page as required due to ANA
> >error and make current ANA state available via sysfs -- even if native
> >multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> >
> >This affords userspace access to the current ANA state independent of
> >which layer might be doing multipathing.  It also allows multipath-tools
> >to rely on the NVMe driver for ANA support while dm-multipath takes care
> >of multipathing.
> >
> >While implementing these changes care was taken to preserve the exact
> >ANA functionality and code sequence native multipathing has provided.
> >This manifests as native multipathing's nvme_failover_req() being
> >tweaked to call __nvme_update_ana() which was factored out to allow
> >nvme_update_ana() to be called independent of nvme_failover_req().
> >
> >And as always, if embedded NVMe users do not want any performance
> >overhead associated with ANA or native NVMe multipathing they can
> >disable CONFIG_NVME_MULTIPATH.
> >
> >Signed-off-by: Mike Snitzer <snitzer at redhat.com>
> >---
> >  drivers/nvme/host/core.c      | 10 +++++----
> >  drivers/nvme/host/multipath.c | 49 +++++++++++++++++++++++++++++++++----------
> >  drivers/nvme/host/nvme.h      |  4 ++++
> >  3 files changed, 48 insertions(+), 15 deletions(-)
> >
> >diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> >index fe957166c4a9..3df607905628 100644
> >--- a/drivers/nvme/host/core.c
> >+++ b/drivers/nvme/host/core.c
> >@@ -255,10 +255,12 @@ void nvme_complete_rq(struct request *req)
> >  		nvme_req(req)->ctrl->comp_seen = true;
> >  	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> >-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
> >-		    blk_path_error(status)) {
> >-			nvme_failover_req(req);
> >-			return;
> >+		if (blk_path_error(status)) {
> >+			if (req->cmd_flags & REQ_NVME_MPATH) {
> >+				nvme_failover_req(req);
> >+				return;
> >+			}
> >+			nvme_update_ana(req);
> >  		}
> >  		if (!blk_queue_dying(req->q)) {

...

> >diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> >index 8e03cda770c5..0adbcff5fba2 100644
> >--- a/drivers/nvme/host/multipath.c
> >+++ b/drivers/nvme/host/multipath.c
> >@@ -58,25 +87,22 @@ void nvme_failover_req(struct request *req)
> >  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> >  	blk_mq_end_request(req, 0);
> >-	switch (status & 0x7ff) {
> >-	case NVME_SC_ANA_TRANSITION:
> >-	case NVME_SC_ANA_INACCESSIBLE:
> >-	case NVME_SC_ANA_PERSISTENT_LOSS:
> >+	if (nvme_ana_error(status)) {
> >  		/*
> >  		 * If we got back an ANA error we know the controller is alive,
> >  		 * but not ready to serve this namespaces.  The spec suggests
> >  		 * we should update our general state here, but due to the fact
> >  		 * that the admin and I/O queues are not serialized that is
> >  		 * fundamentally racy.  So instead just clear the current path,
> >-		 * mark the the path as pending and kick of a re-read of the ANA
> >+		 * mark the path as pending and kick off a re-read of the ANA
> >  		 * log page ASAP.
> >  		 */
> >  		nvme_mpath_clear_current_path(ns);
> >-		if (ns->ctrl->ana_log_buf) {
> >-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> >-			queue_work(nvme_wq, &ns->ctrl->ana_work);
> >-		}
> >-		break;
> >+		__nvme_update_ana(ns);
> >+		goto kick_requeue;
> >+	}
> >+
> >+	switch (status & 0x7ff) {
> >  	case NVME_SC_HOST_PATH_ERROR:
> >  		/*
> >  		 * Temporary transport disruption in talking to the controller.
> >@@ -93,6 +119,7 @@ void nvme_failover_req(struct request *req)
> >  		break;
> >  	}
> >+kick_requeue:
> >  	kblockd_schedule_work(&ns->head->requeue_work);
> >  }
> Doesn't this need to be protected by 'if (ns->head->disk)' or somesuch?

No.  nvme_failover_req() is only ever called by native multipathing; see
nvme_complete_rq()'s check for req->cmd_flags & REQ_NVME_MPATH as the
condition for calling nvme_complete_rq().

The previous RFC-style patch I posted muddled ANA and multipathing in
nvme_update_ana() but this final patch submission was fixed because I
saw a cleaner way forward by having nvme_failover_req() also do ANA work
just like it always has -- albeit with new helpers that
nvme_update_ana() also calls.

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: nvme: allow ANA support to be independent of native multipathing
  2018-11-16  9:14                     ` Christoph Hellwig
@ 2018-11-16 14:12                       ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-16 14:12 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-nvme, Hannes Reinecke, Keith Busch, Sagi Grimberg, axboe,
	Martin Wilck, lijie, xose.vazquez, chengjike.cheng, shenhong09,
	dm-devel, wangzhoumengjian, christophe.varoqui, bmarzins,
	sschremm, linux-block, linux-kernel

On Fri, Nov 16 2018 at  4:14am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Thu, Nov 15, 2018 at 12:46:05PM -0500, Mike Snitzer wrote:
> > Whether or not ANA is present is a choice of the target implementation;
> > the host (and whether it supports multipathing) has _zero_ influence on
> > this.  If the target declares a path as 'inaccessible' the path _is_
> > inaccessible to the host.  As such, ANA support should be functional
> > even if native multipathing is not.
> > 
> > Introduce ability to always re-read ANA log page as required due to ANA
> > error and make current ANA state available via sysfs -- even if native
> > multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> 
> The first part I could see, but I still want to make it conditional
> in some way as nvme is going into deeply embedded setups, and I don't
> want to carry the weight of the ANA code around for everyone.

So disabling CONFIG_NVME_MULTIPATH isn't adequate?  You see a need for
enabling CONFIG_NVME_MULTIPATH but disallowing consideration of ANA
state management for these embedded cases?  That doesn't make much sense
in that ANA won't be enabled on the target, and if it is then shouldn't
the NVMe host enable ANA support?
 
> The second I fundamentally disagree with.  And even if you found agreement
> it would have to be in a separate patch as it is a separate feature.

In a later mail you clarified that by "second" you meant the sysfs
export.

nvme_ns_id_attrs_are_visible() calls nvme_ctrl_use_ana() to know whether
to export the ana state via sysfs.  Makes no sense to enable ANA to
work, independent of native multipathing, and then even though ANA is
active disallow exporting the associated ANA state.  We need
consistency.. if the sysfs file exists it means ANA state is being
managed.

All I can figure is that what you're suggesting is born out of
disallowing uses you fundamentally disagree with (e.g. multipath-tools
reading the ANA state from this sysfs file).

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16 14:12                       ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-16 14:12 UTC (permalink / raw)


On Fri, Nov 16 2018 at  4:14am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Thu, Nov 15, 2018@12:46:05PM -0500, Mike Snitzer wrote:
> > Whether or not ANA is present is a choice of the target implementation;
> > the host (and whether it supports multipathing) has _zero_ influence on
> > this.  If the target declares a path as 'inaccessible' the path _is_
> > inaccessible to the host.  As such, ANA support should be functional
> > even if native multipathing is not.
> > 
> > Introduce ability to always re-read ANA log page as required due to ANA
> > error and make current ANA state available via sysfs -- even if native
> > multipathing is disabled on the host (e.g. nvme_core.multipath=N).
> 
> The first part I could see, but I still want to make it conditional
> in some way as nvme is going into deeply embedded setups, and I don't
> want to carry the weight of the ANA code around for everyone.

So disabling CONFIG_NVME_MULTIPATH isn't adequate?  You see a need for
enabling CONFIG_NVME_MULTIPATH but disallowing consideration of ANA
state management for these embedded cases?  That doesn't make much sense
in that ANA won't be enabled on the target, and if it is then shouldn't
the NVMe host enable ANA support?
 
> The second I fundamentally disagree with.  And even if you found agreement
> it would have to be in a separate patch as it is a separate feature.

In a later mail you clarified that by "second" you meant the sysfs
export.

nvme_ns_id_attrs_are_visible() calls nvme_ctrl_use_ana() to know whether
to export the ana state via sysfs.  Makes no sense to enable ANA to
work, independent of native multipathing, and then even though ANA is
active disallow exporting the associated ANA state.  We need
consistency.. if the sysfs file exists it means ANA state is being
managed.

All I can figure is that what you're suggesting is born out of
disallowing uses you fundamentally disagree with (e.g. multipath-tools
reading the ANA state from this sysfs file).

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* [PATCH v2] nvme: allow ANA support to be independent of native multipathing
  2018-11-15 17:46                   ` Mike Snitzer
@ 2018-11-16 18:59                     ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-16 18:59 UTC (permalink / raw)
  To: linux-nvme
  Cc: axboe, linux-block, lijie, xose.vazquez, Sagi Grimberg,
	chengjike.cheng, linux-kernel, shenhong09, Martin Wilck,
	Keith Busch, dm-devel, wangzhoumengjian, hch, sschremm

Whether or not ANA is present is a choice of the target implementation;
the host (and whether it supports multipathing) has _zero_ influence on
this.  If the target declares a path as 'inaccessible' the path _is_
inaccessible to the host.  As such, ANA support should be functional
even if native multipathing is not.

Introduce ability to always re-read ANA log page as required due to ANA
error and make current ANA state available via sysfs -- even if native
multipathing is disabled on the host (via nvme_core.multipath=N).

While implementing these changes care was taken to preserve the exact
ANA functionality and code sequence native multipathing has provided.
This manifests as native multipathing's nvme_failover_req() being
tweaked to call __nvme_update_ana() which was factored out to allow
nvme_update_ana() to be called independent of nvme_failover_req().

Add new module param to allow ANA to be disabled via nvme_core.ana=N.
Also, emit warning if ANA is enabled but native multipathing isn't.

And as always, if embedded NVMe users do not want any performance
overhead associated with ANA or native NVMe multipathing they can
disable CONFIG_NVME_MULTIPATH.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
---
 drivers/nvme/host/core.c      | 10 +++++---
 drivers/nvme/host/multipath.c | 59 ++++++++++++++++++++++++++++++++++---------
 drivers/nvme/host/nvme.h      |  4 +++
 3 files changed, 57 insertions(+), 16 deletions(-)

v2: add nvme_core.ana modparam and emit warning if ana but !multipath

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fe957166c4a9..3df607905628 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -255,10 +255,12 @@ void nvme_complete_rq(struct request *req)
 		nvme_req(req)->ctrl->comp_seen = true;
 
 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
-		    blk_path_error(status)) {
-			nvme_failover_req(req);
-			return;
+		if (blk_path_error(status)) {
+			if (req->cmd_flags & REQ_NVME_MPATH) {
+				nvme_failover_req(req);
+				return;
+			}
+			nvme_update_ana(req);
 		}
 
 		if (!blk_queue_dying(req->q)) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 8e03cda770c5..8b45cad2734d 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -18,11 +18,16 @@
 static bool multipath = true;
 module_param(multipath, bool, 0444);
 MODULE_PARM_DESC(multipath,
-	"turn on native support for multiple controllers per subsystem");
+	"toggle native support for multiple controllers per subsystem");
+
+static bool ana = true;
+module_param(ana, bool, 0444);
+MODULE_PARM_DESC(ana,
+	"toggle support for Asynchronous Namespace Access");
 
 inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 {
-	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
+	return ana && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
 }
 
 /*
@@ -47,6 +52,35 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
+static bool nvme_ana_error(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		return true;
+	}
+	return false;
+}
+
+static void __nvme_update_ana(struct nvme_ns *ns)
+{
+	if (!ns->ctrl->ana_log_buf)
+		return;
+
+	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+	queue_work(nvme_wq, &ns->ctrl->ana_work);
+}
+
+void nvme_update_ana(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	u16 status = nvme_req(req)->status;
+
+	if (nvme_ana_error(status))
+		__nvme_update_ana(ns);
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -58,25 +92,22 @@ void nvme_failover_req(struct request *req)
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 	blk_mq_end_request(req, 0);
 
-	switch (status & 0x7ff) {
-	case NVME_SC_ANA_TRANSITION:
-	case NVME_SC_ANA_INACCESSIBLE:
-	case NVME_SC_ANA_PERSISTENT_LOSS:
+	if (nvme_ana_error(status)) {
 		/*
 		 * If we got back an ANA error we know the controller is alive,
 		 * but not ready to serve this namespaces.  The spec suggests
 		 * we should update our general state here, but due to the fact
 		 * that the admin and I/O queues are not serialized that is
 		 * fundamentally racy.  So instead just clear the current path,
-		 * mark the the path as pending and kick of a re-read of the ANA
+		 * mark the path as pending and kick off a re-read of the ANA
 		 * log page ASAP.
 		 */
 		nvme_mpath_clear_current_path(ns);
-		if (ns->ctrl->ana_log_buf) {
-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
-			queue_work(nvme_wq, &ns->ctrl->ana_work);
-		}
-		break;
+		__nvme_update_ana(ns);
+		goto kick_requeue;
+	}
+
+	switch (status & 0x7ff) {
 	case NVME_SC_HOST_PATH_ERROR:
 		/*
 		 * Temporary transport disruption in talking to the controller.
@@ -93,6 +124,7 @@ void nvme_failover_req(struct request *req)
 		break;
 	}
 
+kick_requeue:
 	kblockd_schedule_work(&ns->head->requeue_work);
 }
 
@@ -551,6 +583,9 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	if (!nvme_ctrl_use_ana(ctrl))
 		return 0;
 
+	if (!multipath)
+		dev_warn(ctrl->device, "allowing ANA without native multipathing due to nvme_core.multipath=N\n");
+
 	ctrl->anacap = id->anacap;
 	ctrl->anatt = id->anatt;
 	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 27663ce3044e..cbe4253f2d02 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -471,6 +471,7 @@ bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 			struct nvme_ctrl *ctrl, int *flags);
 void nvme_failover_req(struct request *req);
+void nvme_update_ana(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -510,6 +511,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 static inline void nvme_failover_req(struct request *req)
 {
 }
+static inline void nvme_update_ana(struct request *req)
+{
+}
 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 }
-- 
2.15.0


^ permalink raw reply related	[flat|nested] 63+ messages in thread

* [PATCH v2] nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16 18:59                     ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-16 18:59 UTC (permalink / raw)


Whether or not ANA is present is a choice of the target implementation;
the host (and whether it supports multipathing) has _zero_ influence on
this.  If the target declares a path as 'inaccessible' the path _is_
inaccessible to the host.  As such, ANA support should be functional
even if native multipathing is not.

Introduce ability to always re-read ANA log page as required due to ANA
error and make current ANA state available via sysfs -- even if native
multipathing is disabled on the host (via nvme_core.multipath=N).

While implementing these changes care was taken to preserve the exact
ANA functionality and code sequence native multipathing has provided.
This manifests as native multipathing's nvme_failover_req() being
tweaked to call __nvme_update_ana() which was factored out to allow
nvme_update_ana() to be called independent of nvme_failover_req().

Add new module param to allow ANA to be disabled via nvme_core.ana=N.
Also, emit warning if ANA is enabled but native multipathing isn't.

And as always, if embedded NVMe users do not want any performance
overhead associated with ANA or native NVMe multipathing they can
disable CONFIG_NVME_MULTIPATH.

Signed-off-by: Mike Snitzer <snitzer at redhat.com>
---
 drivers/nvme/host/core.c      | 10 +++++---
 drivers/nvme/host/multipath.c | 59 ++++++++++++++++++++++++++++++++++---------
 drivers/nvme/host/nvme.h      |  4 +++
 3 files changed, 57 insertions(+), 16 deletions(-)

v2: add nvme_core.ana modparam and emit warning if ana but !multipath

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index fe957166c4a9..3df607905628 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -255,10 +255,12 @@ void nvme_complete_rq(struct request *req)
 		nvme_req(req)->ctrl->comp_seen = true;
 
 	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
-		if ((req->cmd_flags & REQ_NVME_MPATH) &&
-		    blk_path_error(status)) {
-			nvme_failover_req(req);
-			return;
+		if (blk_path_error(status)) {
+			if (req->cmd_flags & REQ_NVME_MPATH) {
+				nvme_failover_req(req);
+				return;
+			}
+			nvme_update_ana(req);
 		}
 
 		if (!blk_queue_dying(req->q)) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 8e03cda770c5..8b45cad2734d 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -18,11 +18,16 @@
 static bool multipath = true;
 module_param(multipath, bool, 0444);
 MODULE_PARM_DESC(multipath,
-	"turn on native support for multiple controllers per subsystem");
+	"toggle native support for multiple controllers per subsystem");
+
+static bool ana = true;
+module_param(ana, bool, 0444);
+MODULE_PARM_DESC(ana,
+	"toggle support for Asynchronous Namespace Access");
 
 inline bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl)
 {
-	return multipath && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
+	return ana && ctrl->subsys && (ctrl->subsys->cmic & (1 << 3));
 }
 
 /*
@@ -47,6 +52,35 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 	}
 }
 
+static bool nvme_ana_error(u16 status)
+{
+	switch (status & 0x7ff) {
+	case NVME_SC_ANA_TRANSITION:
+	case NVME_SC_ANA_INACCESSIBLE:
+	case NVME_SC_ANA_PERSISTENT_LOSS:
+		return true;
+	}
+	return false;
+}
+
+static void __nvme_update_ana(struct nvme_ns *ns)
+{
+	if (!ns->ctrl->ana_log_buf)
+		return;
+
+	set_bit(NVME_NS_ANA_PENDING, &ns->flags);
+	queue_work(nvme_wq, &ns->ctrl->ana_work);
+}
+
+void nvme_update_ana(struct request *req)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	u16 status = nvme_req(req)->status;
+
+	if (nvme_ana_error(status))
+		__nvme_update_ana(ns);
+}
+
 void nvme_failover_req(struct request *req)
 {
 	struct nvme_ns *ns = req->q->queuedata;
@@ -58,25 +92,22 @@ void nvme_failover_req(struct request *req)
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
 	blk_mq_end_request(req, 0);
 
-	switch (status & 0x7ff) {
-	case NVME_SC_ANA_TRANSITION:
-	case NVME_SC_ANA_INACCESSIBLE:
-	case NVME_SC_ANA_PERSISTENT_LOSS:
+	if (nvme_ana_error(status)) {
 		/*
 		 * If we got back an ANA error we know the controller is alive,
 		 * but not ready to serve this namespaces.  The spec suggests
 		 * we should update our general state here, but due to the fact
 		 * that the admin and I/O queues are not serialized that is
 		 * fundamentally racy.  So instead just clear the current path,
-		 * mark the the path as pending and kick of a re-read of the ANA
+		 * mark the path as pending and kick off a re-read of the ANA
 		 * log page ASAP.
 		 */
 		nvme_mpath_clear_current_path(ns);
-		if (ns->ctrl->ana_log_buf) {
-			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
-			queue_work(nvme_wq, &ns->ctrl->ana_work);
-		}
-		break;
+		__nvme_update_ana(ns);
+		goto kick_requeue;
+	}
+
+	switch (status & 0x7ff) {
 	case NVME_SC_HOST_PATH_ERROR:
 		/*
 		 * Temporary transport disruption in talking to the controller.
@@ -93,6 +124,7 @@ void nvme_failover_req(struct request *req)
 		break;
 	}
 
+kick_requeue:
 	kblockd_schedule_work(&ns->head->requeue_work);
 }
 
@@ -551,6 +583,9 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
 	if (!nvme_ctrl_use_ana(ctrl))
 		return 0;
 
+	if (!multipath)
+		dev_warn(ctrl->device, "allowing ANA without native multipathing due to nvme_core.multipath=N\n");
+
 	ctrl->anacap = id->anacap;
 	ctrl->anatt = id->anatt;
 	ctrl->nanagrpid = le32_to_cpu(id->nanagrpid);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 27663ce3044e..cbe4253f2d02 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -471,6 +471,7 @@ bool nvme_ctrl_use_ana(struct nvme_ctrl *ctrl);
 void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 			struct nvme_ctrl *ctrl, int *flags);
 void nvme_failover_req(struct request *req);
+void nvme_update_ana(struct request *req);
 void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
 int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
 void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
@@ -510,6 +511,9 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
 static inline void nvme_failover_req(struct request *req)
 {
 }
+static inline void nvme_update_ana(struct request *req)
+{
+}
 static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
 {
 }
-- 
2.15.0

^ permalink raw reply related	[flat|nested] 63+ messages in thread

* Re: nvme: allow ANA support to be independent of native multipathing
  2018-11-16 10:17                             ` Christoph Hellwig
@ 2018-11-16 19:28                               ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-16 19:28 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Hannes Reinecke, linux-nvme, Keith Busch, Sagi Grimberg, axboe,
	Martin Wilck, lijie, xose.vazquez, chengjike.cheng, shenhong09,
	dm-devel, wangzhoumengjian, christophe.varoqui, bmarzins,
	sschremm, linux-block, linux-kernel

On Fri, Nov 16 2018 at  5:17am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Fri, Nov 16, 2018 at 11:06:32AM +0100, Hannes Reinecke wrote:
> > Ok, so would you be happy with making ANA support configurable?
> 
> I've looked a bit over the whole situation, and what I think we need
> to do is:
> 
>  a) warn if we see a ANA capable device without multipath support
>     so people know it is not going to work properly.

I disagree with your cynicism but v2 of this patch now emits a warning
accordingly.

>  b) deprecate the multipath module option.  It was only intended as
>     a migration for any pre-existing PCIe multipath user if there
>     were any, not to support any new functionality.  So for 4.20
>     put in a patch that prints a clear warning when it is used,
>     including a link to the nvme list, and then for 4.25 or so
>     remove it entirely unless something unexpected come up.

You rejected the idea of allowing fine-grained control over whether
native NVMe multipathing is enabled or not on a per-namespace basis.
All we have is the coarse-grained nvme_core.multipath=N knob.  Now
you're forecasting removing even that.  Please don't do that.

> This whole drama of optional multipath use has wasted way too much
> of everyones time already.

It has wasted _way_ too much time.

But the drama is born out of you rejecting that we need to preserve
multipath-tools and dm-multipath's ability to work across any
transport.  You don't need to do that work: Hannes, myself and others
have always been willing and able -- if you'd let us.

IIRC it was at 2016's LSF in Boston where Ewan Milne and I had a
face-to-face conversation with you in the hallway track where you agreed
that ANA support would be activated if the capability was advertised by
the target.  The model we discussed is that it would be comparable to
how ALUA gets enabled during SCSI LUN discovery.

I hope you can see your way forward to be more accommodating now.
Especially given the proposed changes are backed by NVMe standards.

Please, PLEASE take v2 of this patch.. please? ;)

Thanks,
Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16 19:28                               ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-16 19:28 UTC (permalink / raw)


On Fri, Nov 16 2018 at  5:17am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Fri, Nov 16, 2018@11:06:32AM +0100, Hannes Reinecke wrote:
> > Ok, so would you be happy with making ANA support configurable?
> 
> I've looked a bit over the whole situation, and what I think we need
> to do is:
> 
>  a) warn if we see a ANA capable device without multipath support
>     so people know it is not going to work properly.

I disagree with your cynicism but v2 of this patch now emits a warning
accordingly.

>  b) deprecate the multipath module option.  It was only intended as
>     a migration for any pre-existing PCIe multipath user if there
>     were any, not to support any new functionality.  So for 4.20
>     put in a patch that prints a clear warning when it is used,
>     including a link to the nvme list, and then for 4.25 or so
>     remove it entirely unless something unexpected come up.

You rejected the idea of allowing fine-grained control over whether
native NVMe multipathing is enabled or not on a per-namespace basis.
All we have is the coarse-grained nvme_core.multipath=N knob.  Now
you're forecasting removing even that.  Please don't do that.

> This whole drama of optional multipath use has wasted way too much
> of everyones time already.

It has wasted _way_ too much time.

But the drama is born out of you rejecting that we need to preserve
multipath-tools and dm-multipath's ability to work across any
transport.  You don't need to do that work: Hannes, myself and others
have always been willing and able -- if you'd let us.

IIRC it was at 2016's LSF in Boston where Ewan Milne and I had a
face-to-face conversation with you in the hallway track where you agreed
that ANA support would be activated if the capability was advertised by
the target.  The model we discussed is that it would be comparable to
how ALUA gets enabled during SCSI LUN discovery.

I hope you can see your way forward to be more accommodating now.
Especially given the proposed changes are backed by NVMe standards.

Please, PLEASE take v2 of this patch.. please? ;)

Thanks,
Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: nvme: allow ANA support to be independent of native multipathing
  2018-11-16 19:28                               ` Mike Snitzer
@ 2018-11-16 19:34                                 ` Laurence Oberman
  -1 siblings, 0 replies; 63+ messages in thread
From: Laurence Oberman @ 2018-11-16 19:34 UTC (permalink / raw)
  To: Mike Snitzer, Christoph Hellwig
  Cc: Hannes Reinecke, linux-nvme, Keith Busch, Sagi Grimberg, axboe,
	Martin Wilck, lijie, xose.vazquez, chengjike.cheng, shenhong09,
	dm-devel, wangzhoumengjian, christophe.varoqui, bmarzins,
	sschremm, linux-block, linux-kernel

On Fri, 2018-11-16 at 14:28 -0500, Mike Snitzer wrote:
> On Fri, Nov 16 2018 at  5:17am -0500,
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > On Fri, Nov 16, 2018 at 11:06:32AM +0100, Hannes Reinecke wrote:
> > > Ok, so would you be happy with making ANA support configurable?
> > 
> > I've looked a bit over the whole situation, and what I think we
> > need
> > to do is:
> > 
> >  a) warn if we see a ANA capable device without multipath support
> >     so people know it is not going to work properly.
> 
> I disagree with your cynicism but v2 of this patch now emits a
> warning
> accordingly.
> 
> >  b) deprecate the multipath module option.  It was only intended as
> >     a migration for any pre-existing PCIe multipath user if there
> >     were any, not to support any new functionality.  So for 4.20
> >     put in a patch that prints a clear warning when it is used,
> >     including a link to the nvme list, and then for 4.25 or so
> >     remove it entirely unless something unexpected come up.
> 
> You rejected the idea of allowing fine-grained control over whether
> native NVMe multipathing is enabled or not on a per-namespace basis.
> All we have is the coarse-grained nvme_core.multipath=N knob.  Now
> you're forecasting removing even that.  Please don't do that.
> 
> > This whole drama of optional multipath use has wasted way too much
> > of everyones time already.
> 
> It has wasted _way_ too much time.
> 
> But the drama is born out of you rejecting that we need to preserve
> multipath-tools and dm-multipath's ability to work across any
> transport.  You don't need to do that work: Hannes, myself and others
> have always been willing and able -- if you'd let us.
> 
> IIRC it was at 2016's LSF in Boston where Ewan Milne and I had a
> face-to-face conversation with you in the hallway track where you
> agreed
> that ANA support would be activated if the capability was advertised
> by
> the target.  The model we discussed is that it would be comparable to
> how ALUA gets enabled during SCSI LUN discovery.
> 
> I hope you can see your way forward to be more accommodating now.
> Especially given the proposed changes are backed by NVMe standards.
> 
> Please, PLEASE take v2 of this patch.. please? ;)
> 
> Thanks,
> Mike

I am begging you take it too please 
Thanks
Laurence

^ permalink raw reply	[flat|nested] 63+ messages in thread

* nvme: allow ANA support to be independent of native multipathing
@ 2018-11-16 19:34                                 ` Laurence Oberman
  0 siblings, 0 replies; 63+ messages in thread
From: Laurence Oberman @ 2018-11-16 19:34 UTC (permalink / raw)


On Fri, 2018-11-16@14:28 -0500, Mike Snitzer wrote:
> On Fri, Nov 16 2018 at  5:17am -0500,
> Christoph Hellwig <hch@lst.de> wrote:
> 
> > On Fri, Nov 16, 2018@11:06:32AM +0100, Hannes Reinecke wrote:
> > > Ok, so would you be happy with making ANA support configurable?
> > 
> > I've looked a bit over the whole situation, and what I think we
> > need
> > to do is:
> > 
> >  a) warn if we see a ANA capable device without multipath support
> >     so people know it is not going to work properly.
> 
> I disagree with your cynicism but v2 of this patch now emits a
> warning
> accordingly.
> 
> >  b) deprecate the multipath module option.  It was only intended as
> >     a migration for any pre-existing PCIe multipath user if there
> >     were any, not to support any new functionality.  So for 4.20
> >     put in a patch that prints a clear warning when it is used,
> >     including a link to the nvme list, and then for 4.25 or so
> >     remove it entirely unless something unexpected come up.
> 
> You rejected the idea of allowing fine-grained control over whether
> native NVMe multipathing is enabled or not on a per-namespace basis.
> All we have is the coarse-grained nvme_core.multipath=N knob.  Now
> you're forecasting removing even that.  Please don't do that.
> 
> > This whole drama of optional multipath use has wasted way too much
> > of everyones time already.
> 
> It has wasted _way_ too much time.
> 
> But the drama is born out of you rejecting that we need to preserve
> multipath-tools and dm-multipath's ability to work across any
> transport.  You don't need to do that work: Hannes, myself and others
> have always been willing and able -- if you'd let us.
> 
> IIRC it was at 2016's LSF in Boston where Ewan Milne and I had a
> face-to-face conversation with you in the hallway track where you
> agreed
> that ANA support would be activated if the capability was advertised
> by
> the target.  The model we discussed is that it would be comparable to
> how ALUA gets enabled during SCSI LUN discovery.
> 
> I hope you can see your way forward to be more accommodating now.
> Especially given the proposed changes are backed by NVMe standards.
> 
> Please, PLEASE take v2 of this patch.. please? ;)
> 
> Thanks,
> Mike

I am begging you take it too please 
Thanks
Laurence

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: nvme: allow ANA support to be independent of native multipathing
  2018-11-16 19:28                               ` Mike Snitzer
@ 2018-11-19  9:39                                 ` Christoph Hellwig
  -1 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-19  9:39 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Christoph Hellwig, Hannes Reinecke, linux-nvme, Keith Busch,
	Sagi Grimberg, axboe, Martin Wilck, lijie, xose.vazquez,
	chengjike.cheng, shenhong09, dm-devel, wangzhoumengjian,
	christophe.varoqui, bmarzins, sschremm, linux-block,
	linux-kernel

On Fri, Nov 16, 2018 at 02:28:02PM -0500, Mike Snitzer wrote:
> You rejected the idea of allowing fine-grained control over whether
> native NVMe multipathing is enabled or not on a per-namespace basis.
> All we have is the coarse-grained nvme_core.multipath=N knob.  Now
> you're forecasting removing even that.  Please don't do that.

The whole point is that this hook was intended as a band aid for the
hypothetical pre-existing setups.  Not ever for new development.

> Please, PLEASE take v2 of this patch.. please? ;)

See the previous mail for the plan ahead.   I'm sick and tired of you
trying to sneak your new developments back in.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* nvme: allow ANA support to be independent of native multipathing
@ 2018-11-19  9:39                                 ` Christoph Hellwig
  0 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-19  9:39 UTC (permalink / raw)


On Fri, Nov 16, 2018@02:28:02PM -0500, Mike Snitzer wrote:
> You rejected the idea of allowing fine-grained control over whether
> native NVMe multipathing is enabled or not on a per-namespace basis.
> All we have is the coarse-grained nvme_core.multipath=N knob.  Now
> you're forecasting removing even that.  Please don't do that.

The whole point is that this hook was intended as a band aid for the
hypothetical pre-existing setups.  Not ever for new development.

> Please, PLEASE take v2 of this patch.. please? ;)

See the previous mail for the plan ahead.   I'm sick and tired of you
trying to sneak your new developments back in.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: nvme: allow ANA support to be independent of native multipathing
  2018-11-19  9:39                                 ` Christoph Hellwig
  (?)
@ 2018-11-19 14:56                                   ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-19 14:56 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Hannes Reinecke, linux-nvme, Keith Busch, Sagi Grimberg, axboe,
	Martin Wilck, lijie, xose.vazquez, chengjike.cheng, shenhong09,
	dm-devel, wangzhoumengjian, christophe.varoqui, bmarzins,
	sschremm, linux-block, linux-kernel

On Mon, Nov 19 2018 at  4:39am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Fri, Nov 16, 2018 at 02:28:02PM -0500, Mike Snitzer wrote:
> > You rejected the idea of allowing fine-grained control over whether
> > native NVMe multipathing is enabled or not on a per-namespace basis.
> > All we have is the coarse-grained nvme_core.multipath=N knob.  Now
> > you're forecasting removing even that.  Please don't do that.
> 
> The whole point is that this hook was intended as a band aid for the
> hypothetical pre-existing setups.  Not ever for new development.

It pains me that you're marching us towards increased conflict that
needs resolution through more formal means.  And that now I'm going to
have to document the timeline of your authoritarian approach to
stifling another Linux maintainer's freedom to do his job of delivering
on functionality I've been depended on for many years.

> > Please, PLEASE take v2 of this patch.. please? ;)
> 
> See the previous mail for the plan ahead.   I'm sick and tired of you
> trying to sneak your new developemts back in.

I've only ever posted patches in the open and never has it been with the
idea of sneaking anything in.  Miscategorizing my actions as such is a
gross abuse that I will not tolerate.

If we were to confine ourselves to this v2 patch I pleaded with you to
take: https://lkml.org/lkml/2018/11/17/4

1) Hannes proposed a simplistic patch that didn't account for the fact
   that NVMe's ana workqueue wouldn't get kicked, his intent was to make
   ANA work independent of your native multipathing.  (The fact he or I
   even need to post such patches, to unwind your tight-coupling of ANA
   and multipathing, speaks to how you've calculatingly undermined any
   effort to implement proper NVMe multipathing outside of the NVMe
   driver)

2) ANA and multipathing are completely disjoint on an NVMe spec level.
   You know this.

SO: will you be taking my v2 patch for 4.21 or not?

Please advise, thanks.
Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* nvme: allow ANA support to be independent of native multipathing
@ 2018-11-19 14:56                                   ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-19 14:56 UTC (permalink / raw)


On Mon, Nov 19 2018 at  4:39am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Fri, Nov 16, 2018@02:28:02PM -0500, Mike Snitzer wrote:
> > You rejected the idea of allowing fine-grained control over whether
> > native NVMe multipathing is enabled or not on a per-namespace basis.
> > All we have is the coarse-grained nvme_core.multipath=N knob.  Now
> > you're forecasting removing even that.  Please don't do that.
> 
> The whole point is that this hook was intended as a band aid for the
> hypothetical pre-existing setups.  Not ever for new development.

It pains me that you're marching us towards increased conflict that
needs resolution through more formal means.  And that now I'm going to
have to document the timeline of your authoritarian approach to
stifling another Linux maintainer's freedom to do his job of delivering
on functionality I've been depended on for many years.

> > Please, PLEASE take v2 of this patch.. please? ;)
> 
> See the previous mail for the plan ahead.   I'm sick and tired of you
> trying to sneak your new developemts back in.

I've only ever posted patches in the open and never has it been with the
idea of sneaking anything in.  Miscategorizing my actions as such is a
gross abuse that I will not tolerate.

If we were to confine ourselves to this v2 patch I pleaded with you to
take: https://lkml.org/lkml/2018/11/17/4

1) Hannes proposed a simplistic patch that didn't account for the fact
   that NVMe's ana workqueue wouldn't get kicked, his intent was to make
   ANA work independent of your native multipathing.  (The fact he or I
   even need to post such patches, to unwind your tight-coupling of ANA
   and multipathing, speaks to how you've calculatingly undermined any
   effort to implement proper NVMe multipathing outside of the NVMe
   driver)

2) ANA and multipathing are completely disjoint on an NVMe spec level.
   You know this.

SO: will you be taking my v2 patch for 4.21 or not?

Please advise, thanks.
Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: nvme: allow ANA support to be independent of native multipathing
@ 2018-11-19 14:56                                   ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-19 14:56 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: axboe, linux-block, lijie, shenhong09, xose.vazquez,
	Sagi Grimberg, chengjike.cheng, linux-kernel, linux-nvme,
	Keith Busch, dm-devel, wangzhoumengjian, Martin Wilck, sschremm

On Mon, Nov 19 2018 at  4:39am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Fri, Nov 16, 2018 at 02:28:02PM -0500, Mike Snitzer wrote:
> > You rejected the idea of allowing fine-grained control over whether
> > native NVMe multipathing is enabled or not on a per-namespace basis.
> > All we have is the coarse-grained nvme_core.multipath=N knob.  Now
> > you're forecasting removing even that.  Please don't do that.
> 
> The whole point is that this hook was intended as a band aid for the
> hypothetical pre-existing setups.  Not ever for new development.

It pains me that you're marching us towards increased conflict that
needs resolution through more formal means.  And that now I'm going to
have to document the timeline of your authoritarian approach to
stifling another Linux maintainer's freedom to do his job of delivering
on functionality I've been depended on for many years.

> > Please, PLEASE take v2 of this patch.. please? ;)
> 
> See the previous mail for the plan ahead.   I'm sick and tired of you
> trying to sneak your new developments back in.

I've only ever posted patches in the open and never has it been with the
idea of sneaking anything in.  Miscategorizing my actions as such is a
gross abuse that I will not tolerate.

If we were to confine ourselves to this v2 patch I pleaded with you to
take: https://lkml.org/lkml/2018/11/17/4

1) Hannes proposed a simplistic patch that didn't account for the fact
   that NVMe's ana workqueue wouldn't get kicked, his intent was to make
   ANA work independent of your native multipathing.  (The fact he or I
   even need to post such patches, to unwind your tight-coupling of ANA
   and multipathing, speaks to how you've calculatingly undermined any
   effort to implement proper NVMe multipathing outside of the NVMe
   driver)

2) ANA and multipathing are completely disjoint on an NVMe spec level.
   You know this.

SO: will you be taking my v2 patch for 4.21 or not?

Please advise, thanks.
Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: nvme: allow ANA support to be independent of native multipathing
  2018-11-19 14:56                                   ` Mike Snitzer
@ 2018-11-20  9:42                                     ` Christoph Hellwig
  -1 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-20  9:42 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Christoph Hellwig, Hannes Reinecke, linux-nvme, Keith Busch,
	Sagi Grimberg, axboe, Martin Wilck, lijie, xose.vazquez,
	chengjike.cheng, shenhong09, dm-devel, wangzhoumengjian,
	christophe.varoqui, bmarzins, sschremm, linux-block,
	linux-kernel

On Mon, Nov 19, 2018 at 09:56:50AM -0500, Mike Snitzer wrote:
> SO: will you be taking my v2 patch for 4.21 or not?

No.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* nvme: allow ANA support to be independent of native multipathing
@ 2018-11-20  9:42                                     ` Christoph Hellwig
  0 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-20  9:42 UTC (permalink / raw)


On Mon, Nov 19, 2018@09:56:50AM -0500, Mike Snitzer wrote:
> SO: will you be taking my v2 patch for 4.21 or not?

No.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: nvme: allow ANA support to be independent of native multipathing
  2018-11-20  9:42                                     ` Christoph Hellwig
@ 2018-11-20 13:37                                       ` Mike Snitzer
  -1 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-20 13:37 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Hannes Reinecke, linux-nvme, Keith Busch, Sagi Grimberg, axboe,
	Martin Wilck, lijie, xose.vazquez, chengjike.cheng, shenhong09,
	dm-devel, wangzhoumengjian, christophe.varoqui, bmarzins,
	sschremm, linux-block, linux-kernel

On Tue, Nov 20 2018 at  4:42am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Mon, Nov 19, 2018 at 09:56:50AM -0500, Mike Snitzer wrote:
> > SO: will you be taking my v2 patch for 4.21 or not?
> 
> No.

This isn't how a Linux maintainer engages in technical discussion.

You _clearly_ just want to prevent further use of multipath-tools and
DM-multipath.  You will resort to rejecting a patch that improves the
NVMe driver's standards compliance if it allows you to hijack NVMe
multipathing because you think you have the best way and nobody else
should be allowed to use a well established competing _open_ _Linux_
solution.  Simple as that.

You haven't made a single technical argument against my v2 patch, yet
you're rejecting it.  Purely on the basis that having NVMe's ANA updates
work independently of native NVMe multipathing happens to benefit an
alternative (and that benefit is just to not have multipath-tools to be
so crude with a pure userspace ANA state tracking).

Jens, this entire situation has gotten well beyond acceptable and you or
other NVMe co-maintainers need to step in.  We need reasoned _technical_
discussion or this entire process falls apart.

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* nvme: allow ANA support to be independent of native multipathing
@ 2018-11-20 13:37                                       ` Mike Snitzer
  0 siblings, 0 replies; 63+ messages in thread
From: Mike Snitzer @ 2018-11-20 13:37 UTC (permalink / raw)


On Tue, Nov 20 2018 at  4:42am -0500,
Christoph Hellwig <hch@lst.de> wrote:

> On Mon, Nov 19, 2018@09:56:50AM -0500, Mike Snitzer wrote:
> > SO: will you be taking my v2 patch for 4.21 or not?
> 
> No.

This isn't how a Linux maintainer engages in technical discussion.

You _clearly_ just want to prevent further use of multipath-tools and
DM-multipath.  You will resort to rejecting a patch that improves the
NVMe driver's standards compliance if it allows you to hijack NVMe
multipathing because you think you have the best way and nobody else
should be allowed to use a well established competing _open_ _Linux_
solution.  Simple as that.

You haven't made a single technical argument against my v2 patch, yet
you're rejecting it.  Purely on the basis that having NVMe's ANA updates
work independently of native NVMe multipathing happens to benefit an
alternative (and that benefit is just to not have multipath-tools to be
so crude with a pure userspace ANA state tracking).

Jens, this entire situation has gotten well beyond acceptable and you or
other NVMe co-maintainers need to step in.  We need reasoned _technical_
discussion or this entire process falls apart.

Mike

^ permalink raw reply	[flat|nested] 63+ messages in thread

* Re: nvme: allow ANA support to be independent of native multipathing
  2018-11-20 13:37                                       ` Mike Snitzer
@ 2018-11-20 16:23                                         ` Christoph Hellwig
  -1 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-20 16:23 UTC (permalink / raw)
  To: Mike Snitzer
  Cc: Christoph Hellwig, Hannes Reinecke, linux-nvme, Keith Busch,
	Sagi Grimberg, axboe, Martin Wilck, lijie, xose.vazquez,
	chengjike.cheng, shenhong09, dm-devel, wangzhoumengjian,
	christophe.varoqui, bmarzins, sschremm, linux-block,
	linux-kernel

On Tue, Nov 20, 2018 at 08:37:19AM -0500, Mike Snitzer wrote:
> This isn't how a Linux maintainer engages in technical discussion.

And this isn't a technical discussion.   As told before the only
reason to not build the multipath code is to save space, and
the only reason to disable multipath at runtime is for potential
pre-existing setups, which obviously are not ANA capable.

The right fix is to warn users if they use a driver without
CONFIG_NVME_MULTIPATH on a multi-port device and to phase out the
multipath=0 option in the long run, again with a warning.  Any new
additions to "improve" these cases simply don't make sense.

^ permalink raw reply	[flat|nested] 63+ messages in thread

* nvme: allow ANA support to be independent of native multipathing
@ 2018-11-20 16:23                                         ` Christoph Hellwig
  0 siblings, 0 replies; 63+ messages in thread
From: Christoph Hellwig @ 2018-11-20 16:23 UTC (permalink / raw)


On Tue, Nov 20, 2018@08:37:19AM -0500, Mike Snitzer wrote:
> This isn't how a Linux maintainer engages in technical discussion.

And this isn't a technical discussion.   As told before the only
reason to not build the multipath code is to save space, and
the only reason to disable multipath at runtime is for potential
pre-existing setups, which obviously are not ANA capable.

The right fix is to warn users if they use a driver without
CONFIG_NVME_MULTIPATH on a multi-port device and to phase out the
multipath=0 option in the long run, again with a warning.  Any new
additions to "improve" these cases simply don't make sense.

^ permalink raw reply	[flat|nested] 63+ messages in thread

end of thread, other threads:[~2018-11-20 16:24 UTC | newest]

Thread overview: 63+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-11-08  6:09 [PATCH] multipath-tools: add ANA support for NVMe device lijie
2018-11-12 16:23 ` Martin Wilck
2018-11-12 21:53   ` Mike Snitzer
2018-11-12 21:53     ` Mike Snitzer
2018-11-13  6:59     ` Martin Wilck
2018-11-13  6:59       ` Martin Wilck
2018-11-13 16:18     ` Keith Busch
2018-11-13 16:18       ` Keith Busch
2018-11-13 18:00       ` Mike Snitzer
2018-11-13 18:00         ` Mike Snitzer
2018-11-14  5:38         ` Mike Snitzer
2018-11-14  5:38           ` Mike Snitzer
2018-11-14  7:49           ` Hannes Reinecke
2018-11-14  7:49             ` Hannes Reinecke
2018-11-14 10:36             ` [dm-devel] " Martin Wilck
2018-11-14 10:36               ` Martin Wilck
2018-11-14 17:47             ` Mike Snitzer
2018-11-14 17:47               ` Mike Snitzer
2018-11-14 18:51               ` Hannes Reinecke
2018-11-14 18:51                 ` Hannes Reinecke
2018-11-14 19:26                 ` Mike Snitzer
2018-11-14 19:26                   ` Mike Snitzer
2018-11-15 17:46                 ` [PATCH] nvme: allow ANA support to be independent of native multipathing Mike Snitzer
2018-11-15 17:46                   ` Mike Snitzer
2018-11-16  7:25                   ` Hannes Reinecke
2018-11-16  7:25                     ` Hannes Reinecke
2018-11-16 14:01                     ` Mike Snitzer
2018-11-16 14:01                       ` Mike Snitzer
2018-11-16  9:14                   ` [PATCH] " Christoph Hellwig
2018-11-16  9:14                     ` Christoph Hellwig
2018-11-16  9:40                     ` Hannes Reinecke
2018-11-16  9:40                       ` Hannes Reinecke
2018-11-16  9:49                       ` Christoph Hellwig
2018-11-16  9:49                         ` Christoph Hellwig
2018-11-16 10:06                         ` Hannes Reinecke
2018-11-16 10:06                           ` Hannes Reinecke
2018-11-16 10:17                           ` Christoph Hellwig
2018-11-16 10:17                             ` Christoph Hellwig
2018-11-16 19:28                             ` Mike Snitzer
2018-11-16 19:28                               ` Mike Snitzer
2018-11-16 19:34                               ` Laurence Oberman
2018-11-16 19:34                                 ` Laurence Oberman
2018-11-19  9:39                               ` Christoph Hellwig
2018-11-19  9:39                                 ` Christoph Hellwig
2018-11-19 14:56                                 ` Mike Snitzer
2018-11-19 14:56                                   ` Mike Snitzer
2018-11-19 14:56                                   ` Mike Snitzer
2018-11-20  9:42                                   ` Christoph Hellwig
2018-11-20  9:42                                     ` Christoph Hellwig
2018-11-20 13:37                                     ` Mike Snitzer
2018-11-20 13:37                                       ` Mike Snitzer
2018-11-20 16:23                                       ` Christoph Hellwig
2018-11-20 16:23                                         ` Christoph Hellwig
2018-11-16 14:12                     ` Mike Snitzer
2018-11-16 14:12                       ` Mike Snitzer
2018-11-16 18:59                   ` [PATCH v2] " Mike Snitzer
2018-11-16 18:59                     ` Mike Snitzer
2018-11-14  7:24       ` multipath-tools: add ANA support for NVMe device Hannes Reinecke
2018-11-14  7:24         ` Hannes Reinecke
2018-11-14 15:35         ` Christoph Hellwig
2018-11-14 15:35           ` Christoph Hellwig
2018-11-14 16:16           ` Mike Snitzer
2018-11-14 16:16             ` Mike Snitzer

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.