* [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. [not found] <20210415192848.962891-1-nitram_67@hotmail.com> @ 2021-04-15 19:28 ` Martin Belanger 2021-05-01 11:34 ` Hannes Reinecke 2021-05-04 19:56 ` Sagi Grimberg 0 siblings, 2 replies; 23+ messages in thread From: Martin Belanger @ 2021-04-15 19:28 UTC (permalink / raw) To: linux-nvme; +Cc: kbusch, axboe, hch, sagi, Martin Belanger From: Martin Belanger <martin.belanger@dell.com> --- drivers/nvme/host/core.c | 5 +++++ drivers/nvme/host/fabrics.c | 14 +++++++++++++ drivers/nvme/host/fabrics.h | 6 +++++- drivers/nvme/host/tcp.c | 41 ++++++++++++++++++++++++++++++++++--- 4 files changed, 62 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 288ac47ff5b4..91ae11a1ae26 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3961,6 +3961,11 @@ static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", opts->host_traddr ?: "none"); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_HOST_TRIFACE=%s", + opts->host_triface ?: "none"); } return ret; } diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 604ab0e5a2ad..f5d0d760b53b 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -112,6 +112,9 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) len += scnprintf(buf + len, size - len, "%shost_traddr=%s", (len) ? "," : "", ctrl->opts->host_traddr); + if (ctrl->opts->mask & NVMF_OPT_HOST_TRIFACE) + len += scnprintf(buf + len, size - len, "%shost_triface=%s", + (len) ? "," : "", ctrl->opts->host_triface); len += scnprintf(buf + len, size - len, "\n"); return len; @@ -604,6 +607,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, + { NVMF_OPT_HOST_TRIFACE, "host_triface=%s" }, { NVMF_OPT_HOST_ID, "hostid=%s" }, { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" }, @@ -813,6 +817,15 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, kfree(opts->host_traddr); opts->host_traddr = p; break; + case NVMF_OPT_HOST_TRIFACE: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + kfree(opts->host_triface); + opts->host_triface = p; + break; case NVMF_OPT_HOST_ID: p = match_strdup(args); if (!p) { @@ -997,6 +1010,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts) kfree(opts->trsvcid); kfree(opts->subsysnqn); kfree(opts->host_traddr); + kfree(opts->host_triface); kfree(opts); } EXPORT_SYMBOL_GPL(nvmf_free_options); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 733010d2eafd..17c64ff4db8c 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -59,6 +59,7 @@ enum { NVMF_OPT_NR_POLL_QUEUES = 1 << 18, NVMF_OPT_TOS = 1 << 19, NVMF_OPT_FAIL_FAST_TMO = 1 << 20, + NVMF_OPT_HOST_TRIFACE = 1 << 21, }; /** @@ -76,7 +77,9 @@ enum { * @trsvcid: The transport-specific TRSVCID field for a port on the * subsystem which is adding a controller. * @host_traddr: A transport-specific field identifying the NVME host port - * to use for the connection to the controller. + * to use for the connection to the controller. + * @host_triface: A transport-specific field identifying the NVME host + * interface to use for the connection to the controller. * @queue_size: Number of IO queue elements. * @nr_io_queues: Number of controller IO queues that will be established. * @reconnect_delay: Time between two consecutive reconnect attempts. @@ -101,6 +104,7 @@ struct nvmf_ctrl_options { char *traddr; char *trsvcid; char *host_traddr; + char *host_triface; size_t queue_size; unsigned int nr_io_queues; unsigned int reconnect_delay; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 8e55d8bc0c50..28eb7f88b487 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1447,6 +1447,20 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, } } + if (nctrl->opts->mask & NVMF_OPT_HOST_TRIFACE) { + char *iface = nctrl->opts->host_triface; + sockptr_t optval = KERNEL_SOCKPTR(iface); + + ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE, + optval, strlen(iface)); + if (ret) { + dev_err(nctrl->device, + "failed to bind to interface %s queue %d err %d\n", + iface, qid, ret); + goto err_sock; + } + } + queue->hdr_digest = nctrl->opts->hdr_digest; queue->data_digest = nctrl->opts->data_digest; if (queue->hdr_digest || queue->data_digest) { @@ -2457,6 +2471,10 @@ nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts) { + const char *iface_key = ""; + const char *iface_val = ""; + const char *srce_key = ""; + const char *srce_val = ""; struct nvme_tcp_ctrl *ctrl; int ret; @@ -2502,6 +2520,22 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, opts->host_traddr); goto out_free_ctrl; } + srce_key = ", src-addr "; + srce_val = opts->host_traddr; + } + + if (opts->mask & NVMF_OPT_HOST_TRIFACE) { + struct net_device *ndev; + + ndev = dev_get_by_name(&init_net, opts->host_triface); + if (!ndev) { + pr_err("invalid interface passed: %s\n", + opts->host_triface); + ret = -ENODEV; + goto out_free_ctrl; + } + iface_key = ", iface "; + iface_val = opts->host_triface; } if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) { @@ -2530,8 +2564,9 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, if (ret) goto out_uninit_ctrl; - dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", - ctrl->ctrl.opts->subsysnqn, &ctrl->addr); + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp%s%s%s%s\n", + ctrl->ctrl.opts->subsysnqn, &ctrl->addr, + srce_key, srce_val, iface_key, iface_val); mutex_lock(&nvme_tcp_ctrl_mutex); list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); @@ -2560,7 +2595,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = { NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | - NVMF_OPT_TOS, + NVMF_OPT_TOS | NVMF_OPT_HOST_TRIFACE, .create_ctrl = nvme_tcp_create_ctrl, }; -- 2.25.1 _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-04-15 19:28 ` [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting Martin Belanger @ 2021-05-01 11:34 ` Hannes Reinecke 2021-05-03 16:59 ` Belanger, Martin 2021-05-04 19:56 ` Sagi Grimberg 1 sibling, 1 reply; 23+ messages in thread From: Hannes Reinecke @ 2021-05-01 11:34 UTC (permalink / raw) To: Martin Belanger, linux-nvme; +Cc: kbusch, axboe, hch, sagi, Martin Belanger On 4/15/21 9:28 PM, Martin Belanger wrote: > From: Martin Belanger <martin.belanger@dell.com> > Please fix up the subject and description. > --- > drivers/nvme/host/core.c | 5 +++++ > drivers/nvme/host/fabrics.c | 14 +++++++++++++ > drivers/nvme/host/fabrics.h | 6 +++++- > drivers/nvme/host/tcp.c | 41 ++++++++++++++++++++++++++++++++++--- > 4 files changed, 62 insertions(+), 4 deletions(-) > > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c > index 288ac47ff5b4..91ae11a1ae26 100644 > --- a/drivers/nvme/host/core.c > +++ b/drivers/nvme/host/core.c > @@ -3961,6 +3961,11 @@ static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) > > ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", > opts->host_traddr ?: "none"); > + if (ret) > + return ret; > + > + ret = add_uevent_var(env, "NVME_HOST_TRIFACE=%s", > + opts->host_triface ?: "none"); > } > return ret; > } Why not simply 'host_iface' ? 'triface' is a bit awkward. > diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c > index 604ab0e5a2ad..f5d0d760b53b 100644 > --- a/drivers/nvme/host/fabrics.c > +++ b/drivers/nvme/host/fabrics.c > @@ -112,6 +112,9 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) > if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) > len += scnprintf(buf + len, size - len, "%shost_traddr=%s", > (len) ? "," : "", ctrl->opts->host_traddr); > + if (ctrl->opts->mask & NVMF_OPT_HOST_TRIFACE) > + len += scnprintf(buf + len, size - len, "%shost_triface=%s", > + (len) ? "," : "", ctrl->opts->host_triface); > len += scnprintf(buf + len, size - len, "\n"); > > return len; > @@ -604,6 +607,7 @@ static const match_table_t opt_tokens = { > { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, > { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, > { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, > + { NVMF_OPT_HOST_TRIFACE, "host_triface=%s" }, > { NVMF_OPT_HOST_ID, "hostid=%s" }, > { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, > { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" }, > @@ -813,6 +817,15 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, > kfree(opts->host_traddr); > opts->host_traddr = p; > break; > + case NVMF_OPT_HOST_TRIFACE: > + p = match_strdup(args); > + if (!p) { > + ret = -ENOMEM; > + goto out; > + } > + kfree(opts->host_triface); > + opts->host_triface = p; > + break; > case NVMF_OPT_HOST_ID: > p = match_strdup(args); > if (!p) { > @@ -997,6 +1010,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts) > kfree(opts->trsvcid); > kfree(opts->subsysnqn); > kfree(opts->host_traddr); > + kfree(opts->host_triface); > kfree(opts); > } > EXPORT_SYMBOL_GPL(nvmf_free_options); > diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h > index 733010d2eafd..17c64ff4db8c 100644 > --- a/drivers/nvme/host/fabrics.h > +++ b/drivers/nvme/host/fabrics.h > @@ -59,6 +59,7 @@ enum { > NVMF_OPT_NR_POLL_QUEUES = 1 << 18, > NVMF_OPT_TOS = 1 << 19, > NVMF_OPT_FAIL_FAST_TMO = 1 << 20, > + NVMF_OPT_HOST_TRIFACE = 1 << 21, > }; > > /** > @@ -76,7 +77,9 @@ enum { > * @trsvcid: The transport-specific TRSVCID field for a port on the > * subsystem which is adding a controller. > * @host_traddr: A transport-specific field identifying the NVME host port > - * to use for the connection to the controller. > + * to use for the connection to the controller. > + * @host_triface: A transport-specific field identifying the NVME host > + * interface to use for the connection to the controller. > * @queue_size: Number of IO queue elements. > * @nr_io_queues: Number of controller IO queues that will be established. > * @reconnect_delay: Time between two consecutive reconnect attempts. > @@ -101,6 +104,7 @@ struct nvmf_ctrl_options { > char *traddr; > char *trsvcid; > char *host_traddr; > + char *host_triface; > size_t queue_size; > unsigned int nr_io_queues; > unsigned int reconnect_delay; > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c > index 8e55d8bc0c50..28eb7f88b487 100644 > --- a/drivers/nvme/host/tcp.c > +++ b/drivers/nvme/host/tcp.c > @@ -1447,6 +1447,20 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, > } > } > > + if (nctrl->opts->mask & NVMF_OPT_HOST_TRIFACE) { > + char *iface = nctrl->opts->host_triface; > + sockptr_t optval = KERNEL_SOCKPTR(iface); > + > + ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE, > + optval, strlen(iface)); > + if (ret) { > + dev_err(nctrl->device, > + "failed to bind to interface %s queue %d err %d\n", > + iface, qid, ret); > + goto err_sock; > + } > + } > + > queue->hdr_digest = nctrl->opts->hdr_digest; > queue->data_digest = nctrl->opts->data_digest; > if (queue->hdr_digest || queue->data_digest) { Is this valid for all transports? I guess it would only work for 'tcp', and maybe 'rdma' if one would be running ROCE. Shouldn't we error out on other transports like 'fc' or 'loop'? > @@ -2457,6 +2471,10 @@ nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) > static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, > struct nvmf_ctrl_options *opts) > { > + const char *iface_key = ""; > + const char *iface_val = ""; > + const char *srce_key = ""; > + const char *srce_val = ""; > struct nvme_tcp_ctrl *ctrl; > int ret; > > @@ -2502,6 +2520,22 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, > opts->host_traddr); > goto out_free_ctrl; > } > + srce_key = ", src-addr "; > + srce_val = opts->host_traddr; > + } > + > + if (opts->mask & NVMF_OPT_HOST_TRIFACE) { > + struct net_device *ndev; > + > + ndev = dev_get_by_name(&init_net, opts->host_triface); > + if (!ndev) { > + pr_err("invalid interface passed: %s\n", > + opts->host_triface); > + ret = -ENODEV; > + goto out_free_ctrl; > + } > + iface_key = ", iface "; > + iface_val = opts->host_triface; > } > > if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) { Normally the options are just parts of the 'address' string; why didn't you use that approach here? > @@ -2530,8 +2564,9 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, > if (ret) > goto out_uninit_ctrl; > > - dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", > - ctrl->ctrl.opts->subsysnqn, &ctrl->addr); > + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp%s%s%s%s\n", > + ctrl->ctrl.opts->subsysnqn, &ctrl->addr, > + srce_key, srce_val, iface_key, iface_val); > > mutex_lock(&nvme_tcp_ctrl_mutex); > list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); > @@ -2560,7 +2595,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = { > NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | > NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | > NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | > - NVMF_OPT_TOS, > + NVMF_OPT_TOS | NVMF_OPT_HOST_TRIFACE, > .create_ctrl = nvme_tcp_create_ctrl, > }; > > Cheers, Hannes -- Dr. Hannes Reinecke Kernel Storage Architect hare@suse.de +49 911 74053 688 SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-01 11:34 ` Hannes Reinecke @ 2021-05-03 16:59 ` Belanger, Martin 2021-05-04 13:25 ` Hannes Reinecke 0 siblings, 1 reply; 23+ messages in thread From: Belanger, Martin @ 2021-05-03 16:59 UTC (permalink / raw) To: Hannes Reinecke, Martin Belanger, linux-nvme; +Cc: kbusch, axboe, hch, sagi Hi Hannes, I just noticed there were in-line comments. to answer you questions: Q1) Why not simply 'host_iface' ? 'triface' is a bit awkward. A1) I used TRIFACE to keep consistency with all other transport options: traddr, trsvcid, host_traddr. I will rename to host_iface at your suggestion. Q2) Is this valid for all transports? I guess it would only work for 'tcp', and maybe 'rdma' if one would be running ROCE. Shouldn't we error out on other transports like 'fc' or 'loop'? A2) This is only for TCP, and we do check that this option is only allowed for TCP by specifying it in the "allowed_opts" as follows (see file tcp.c): static struct nvmf_transport_ops nvme_tcp_transport = { .name = "tcp", .module = THIS_MODULE, .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | NVMF_OPT_TOS | NVMF_OPT_HOST_TRIFACE, .create_ctrl = nvme_tcp_create_ctrl, }; Q3) Normally the options are just parts of the 'address' string; why didn't you use that approach here? A3) I don't understand what you mean? Regards, Martin ________________________________________ From: Hannes Reinecke <hare@suse.de> Sent: Saturday, May 1, 2021 07:34 To: Martin Belanger; linux-nvme@lists.infradead.org Cc: kbusch@kernel.org; axboe@fb.com; hch@lst.de; sagi@grimberg.me; Belanger, Martin Subject: Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. [EXTERNAL EMAIL] On 4/15/21 9:28 PM, Martin Belanger wrote: > From: Martin Belanger <martin.belanger@dell.com> > Please fix up the subject and description. > --- > drivers/nvme/host/core.c | 5 +++++ > drivers/nvme/host/fabrics.c | 14 +++++++++++++ > drivers/nvme/host/fabrics.h | 6 +++++- > drivers/nvme/host/tcp.c | 41 ++++++++++++++++++++++++++++++++++--- > 4 files changed, 62 insertions(+), 4 deletions(-) > > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c > index 288ac47ff5b4..91ae11a1ae26 100644 > --- a/drivers/nvme/host/core.c > +++ b/drivers/nvme/host/core.c > @@ -3961,6 +3961,11 @@ static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) > > ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", > opts->host_traddr ?: "none"); > + if (ret) > + return ret; > + > + ret = add_uevent_var(env, "NVME_HOST_TRIFACE=%s", > + opts->host_triface ?: "none"); > } > return ret; > } Why not simply 'host_iface' ? 'triface' is a bit awkward. > diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c > index 604ab0e5a2ad..f5d0d760b53b 100644 > --- a/drivers/nvme/host/fabrics.c > +++ b/drivers/nvme/host/fabrics.c > @@ -112,6 +112,9 @@ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) > if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) > len += scnprintf(buf + len, size - len, "%shost_traddr=%s", > (len) ? "," : "", ctrl->opts->host_traddr); > + if (ctrl->opts->mask & NVMF_OPT_HOST_TRIFACE) > + len += scnprintf(buf + len, size - len, "%shost_triface=%s", > + (len) ? "," : "", ctrl->opts->host_triface); > len += scnprintf(buf + len, size - len, "\n"); > > return len; > @@ -604,6 +607,7 @@ static const match_table_t opt_tokens = { > { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, > { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, > { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, > + { NVMF_OPT_HOST_TRIFACE, "host_triface=%s" }, > { NVMF_OPT_HOST_ID, "hostid=%s" }, > { NVMF_OPT_DUP_CONNECT, "duplicate_connect" }, > { NVMF_OPT_DISABLE_SQFLOW, "disable_sqflow" }, > @@ -813,6 +817,15 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, > kfree(opts->host_traddr); > opts->host_traddr = p; > break; > + case NVMF_OPT_HOST_TRIFACE: > + p = match_strdup(args); > + if (!p) { > + ret = -ENOMEM; > + goto out; > + } > + kfree(opts->host_triface); > + opts->host_triface = p; > + break; > case NVMF_OPT_HOST_ID: > p = match_strdup(args); > if (!p) { > @@ -997,6 +1010,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts) > kfree(opts->trsvcid); > kfree(opts->subsysnqn); > kfree(opts->host_traddr); > + kfree(opts->host_triface); > kfree(opts); > } > EXPORT_SYMBOL_GPL(nvmf_free_options); > diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h > index 733010d2eafd..17c64ff4db8c 100644 > --- a/drivers/nvme/host/fabrics.h > +++ b/drivers/nvme/host/fabrics.h > @@ -59,6 +59,7 @@ enum { > NVMF_OPT_NR_POLL_QUEUES = 1 << 18, > NVMF_OPT_TOS = 1 << 19, > NVMF_OPT_FAIL_FAST_TMO = 1 << 20, > + NVMF_OPT_HOST_TRIFACE = 1 << 21, > }; > > /** > @@ -76,7 +77,9 @@ enum { > * @trsvcid: The transport-specific TRSVCID field for a port on the > * subsystem which is adding a controller. > * @host_traddr: A transport-specific field identifying the NVME host port > - * to use for the connection to the controller. > + * to use for the connection to the controller. > + * @host_triface: A transport-specific field identifying the NVME host > + * interface to use for the connection to the controller. > * @queue_size: Number of IO queue elements. > * @nr_io_queues: Number of controller IO queues that will be established. > * @reconnect_delay: Time between two consecutive reconnect attempts. > @@ -101,6 +104,7 @@ struct nvmf_ctrl_options { > char *traddr; > char *trsvcid; > char *host_traddr; > + char *host_triface; > size_t queue_size; > unsigned int nr_io_queues; > unsigned int reconnect_delay; > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c > index 8e55d8bc0c50..28eb7f88b487 100644 > --- a/drivers/nvme/host/tcp.c > +++ b/drivers/nvme/host/tcp.c > @@ -1447,6 +1447,20 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, > } > } > > + if (nctrl->opts->mask & NVMF_OPT_HOST_TRIFACE) { > + char *iface = nctrl->opts->host_triface; > + sockptr_t optval = KERNEL_SOCKPTR(iface); > + > + ret = sock_setsockopt(queue->sock, SOL_SOCKET, SO_BINDTODEVICE, > + optval, strlen(iface)); > + if (ret) { > + dev_err(nctrl->device, > + "failed to bind to interface %s queue %d err %d\n", > + iface, qid, ret); > + goto err_sock; > + } > + } > + > queue->hdr_digest = nctrl->opts->hdr_digest; > queue->data_digest = nctrl->opts->data_digest; > if (queue->hdr_digest || queue->data_digest) { Is this valid for all transports? I guess it would only work for 'tcp', and maybe 'rdma' if one would be running ROCE. Shouldn't we error out on other transports like 'fc' or 'loop'? > @@ -2457,6 +2471,10 @@ nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts) > static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, > struct nvmf_ctrl_options *opts) > { > + const char *iface_key = ""; > + const char *iface_val = ""; > + const char *srce_key = ""; > + const char *srce_val = ""; > struct nvme_tcp_ctrl *ctrl; > int ret; > > @@ -2502,6 +2520,22 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, > opts->host_traddr); > goto out_free_ctrl; > } > + srce_key = ", src-addr "; > + srce_val = opts->host_traddr; > + } > + > + if (opts->mask & NVMF_OPT_HOST_TRIFACE) { > + struct net_device *ndev; > + > + ndev = dev_get_by_name(&init_net, opts->host_triface); > + if (!ndev) { > + pr_err("invalid interface passed: %s\n", > + opts->host_triface); > + ret = -ENODEV; > + goto out_free_ctrl; > + } > + iface_key = ", iface "; > + iface_val = opts->host_triface; > } > > if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) { Normally the options are just parts of the 'address' string; why didn't you use that approach here? > @@ -2530,8 +2564,9 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, > if (ret) > goto out_uninit_ctrl; > > - dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp\n", > - ctrl->ctrl.opts->subsysnqn, &ctrl->addr); > + dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp%s%s%s%s\n", > + ctrl->ctrl.opts->subsysnqn, &ctrl->addr, > + srce_key, srce_val, iface_key, iface_val); > > mutex_lock(&nvme_tcp_ctrl_mutex); > list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list); > @@ -2560,7 +2595,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = { > NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | > NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | > NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | > - NVMF_OPT_TOS, > + NVMF_OPT_TOS | NVMF_OPT_HOST_TRIFACE, > .create_ctrl = nvme_tcp_create_ctrl, > }; > > Cheers, Hannes -- Dr. Hannes Reinecke Kernel Storage Architect hare@suse.de +49 911 74053 688 SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-03 16:59 ` Belanger, Martin @ 2021-05-04 13:25 ` Hannes Reinecke 0 siblings, 0 replies; 23+ messages in thread From: Hannes Reinecke @ 2021-05-04 13:25 UTC (permalink / raw) To: Belanger, Martin, Martin Belanger, linux-nvme; +Cc: kbusch, axboe, hch, sagi On 5/3/21 6:59 PM, Belanger, Martin wrote: > Hi Hannes, > > I just noticed there were in-line comments. to answer you questions: > This is the recommended style when working with linux patches; please do not top-post. > Q1) Why not simply 'host_iface' ? 'triface' is a bit awkward. > A1) I used TRIFACE to keep consistency with all other transport options: traddr, trsvcid, host_traddr. > I will rename to host_iface at your suggestion. > > Q2) Is this valid for all transports? I guess it would only work for 'tcp', and maybe 'rdma' if one > would be running ROCE. Shouldn't we error out on other transports like 'fc' or 'loop'? > A2) This is only for TCP, and we do check that this option is only allowed for TCP by specifying it > in the "allowed_opts" as follows (see file tcp.c): > > static struct nvmf_transport_ops nvme_tcp_transport = { > .name = "tcp", > .module = THIS_MODULE, > .required_opts = NVMF_OPT_TRADDR, > .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | > NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | > NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | > NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | > NVMF_OPT_TOS | NVMF_OPT_HOST_TRIFACE, > .create_ctrl = nvme_tcp_create_ctrl, > }; > Right, okay. > Q3) Normally the options are just parts of the 'address' string; why didn't > you use that approach here? > A3) I don't understand what you mean? > The 'address' string contains all transport arguments (without the transport type), concatenated by a ','. So the 'host_iface' value should just be added to the address string itself, and not added as separate argument to the sysfs printf() call. Cheers, Hannes -- Dr. Hannes Reinecke Kernel Storage Architect hare@suse.de +49 911 74053 688 SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-04-15 19:28 ` [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting Martin Belanger 2021-05-01 11:34 ` Hannes Reinecke @ 2021-05-04 19:56 ` Sagi Grimberg 2021-05-05 8:47 ` Hannes Reinecke 1 sibling, 1 reply; 23+ messages in thread From: Sagi Grimberg @ 2021-05-04 19:56 UTC (permalink / raw) To: Martin Belanger, linux-nvme; +Cc: kbusch, axboe, hch, Martin Belanger > From: Martin Belanger <martin.belanger@dell.com> Change log is missing... > > --- > drivers/nvme/host/core.c | 5 +++++ > drivers/nvme/host/fabrics.c | 14 +++++++++++++ > drivers/nvme/host/fabrics.h | 6 +++++- > drivers/nvme/host/tcp.c | 41 ++++++++++++++++++++++++++++++++++--- > 4 files changed, 62 insertions(+), 4 deletions(-) > > diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c > index 288ac47ff5b4..91ae11a1ae26 100644 > --- a/drivers/nvme/host/core.c > +++ b/drivers/nvme/host/core.c > @@ -3961,6 +3961,11 @@ static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) > > ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", > opts->host_traddr ?: "none"); > + if (ret) > + return ret; > + > + ret = add_uevent_var(env, "NVME_HOST_TRIFACE=%s", > + opts->host_triface ?: "none"); Given that this was the original intent for host_traddr, why not have host_traddr resolve the iface from the address and set sockopt SO_BINDTODEVICE on it? _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-04 19:56 ` Sagi Grimberg @ 2021-05-05 8:47 ` Hannes Reinecke 2021-05-05 14:31 ` Belanger, Martin 0 siblings, 1 reply; 23+ messages in thread From: Hannes Reinecke @ 2021-05-05 8:47 UTC (permalink / raw) To: Sagi Grimberg, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch, Martin Belanger On 5/4/21 9:56 PM, Sagi Grimberg wrote: > >> From: Martin Belanger <martin.belanger@dell.com> > > Change log is missing... > >> >> --- >> drivers/nvme/host/core.c | 5 +++++ >> drivers/nvme/host/fabrics.c | 14 +++++++++++++ >> drivers/nvme/host/fabrics.h | 6 +++++- >> drivers/nvme/host/tcp.c | 41 ++++++++++++++++++++++++++++++++++--- >> 4 files changed, 62 insertions(+), 4 deletions(-) >> >> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c >> index 288ac47ff5b4..91ae11a1ae26 100644 >> --- a/drivers/nvme/host/core.c >> +++ b/drivers/nvme/host/core.c >> @@ -3961,6 +3961,11 @@ static int nvme_class_uevent(struct device >> *dev, struct kobj_uevent_env *env) >> ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", >> opts->host_traddr ?: "none"); >> + if (ret) >> + return ret; >> + >> + ret = add_uevent_var(env, "NVME_HOST_TRIFACE=%s", >> + opts->host_triface ?: "none"); > > Given that this was the original intent for host_traddr, why not have > host_traddr resolve the iface from the address and set sockopt > SO_BINDTODEVICE on it? > That was my question, too. I would vastly prefer to not have another option to deal with (as it raises the question whether to add it eg during 'nvme connect-all') And one could argue that this was the intention of _having_ the host_traddr argument in the first place ... Cheers, Hannes -- Dr. Hannes Reinecke Kernel Storage Architect hare@suse.de +49 911 74053 688 SUSE Software Solutions Germany GmbH, 90409 Nürnberg GF: F. Imendörffer, HRB 36809 (AG Nürnberg) _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-05 8:47 ` Hannes Reinecke @ 2021-05-05 14:31 ` Belanger, Martin 2021-05-05 18:33 ` James Smart ` (2 more replies) 0 siblings, 3 replies; 23+ messages in thread From: Belanger, Martin @ 2021-05-05 14:31 UTC (permalink / raw) To: Hannes Reinecke, Sagi Grimberg, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch > -----Original Message----- > From: Hannes Reinecke <hare@suse.de> > Sent: Wednesday, May 5, 2021 4:47 AM > To: Sagi Grimberg; Martin Belanger; linux-nvme@lists.infradead.org > Cc: kbusch@kernel.org; axboe@fb.com; hch@lst.de; Belanger, Martin > Subject: Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can > be used to specify the IP interface to use for the connection. The driver uses > that to set SO_BINDTODEVICE on the socket before connecting. > > > [EXTERNAL EMAIL] > > On 5/4/21 9:56 PM, Sagi Grimberg wrote: > > > >> From: Martin Belanger <martin.belanger@dell.com> > > > > Change log is missing... > > > >> > >> --- > >> drivers/nvme/host/core.c | 5 +++++ > >> drivers/nvme/host/fabrics.c | 14 +++++++++++++ > >> drivers/nvme/host/fabrics.h | 6 +++++- > >> drivers/nvme/host/tcp.c | 41 > >> ++++++++++++++++++++++++++++++++++--- > >> 4 files changed, 62 insertions(+), 4 deletions(-) > >> > >> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c > >> index 288ac47ff5b4..91ae11a1ae26 100644 > >> --- a/drivers/nvme/host/core.c > >> +++ b/drivers/nvme/host/core.c > >> @@ -3961,6 +3961,11 @@ static int nvme_class_uevent(struct device > >> *dev, struct kobj_uevent_env *env) > >> ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", > >> opts->host_traddr ?: "none"); > >> + if (ret) > >> + return ret; > >> + > >> + ret = add_uevent_var(env, "NVME_HOST_TRIFACE=%s", > >> + opts->host_triface ?: "none"); > > > > Given that this was the original intent for host_traddr, why not have > > host_traddr resolve the iface from the address and set sockopt > > SO_BINDTODEVICE on it? > > > That was my question, too. > > I would vastly prefer to not have another option to deal with (as it raises the > question whether to add it eg during 'nvme connect-all') And one could > argue that this was the intention of _having_ the host_traddr argument in > the first place ... > > Cheers, > > Hannes > -- > Dr. Hannes Reinecke Kernel Storage Architect > hare@suse.de +49 911 74053 688 > SUSE Software Solutions Germany GmbH, 90409 Nürnberg > GF: F. Imendörffer, HRB 36809 (AG Nürnberg) Hi Sagi and Hannes, Correct me if I'm wrong, but it sounds like host_traddr was primarily added for FC (at least it wasn't tested for TCP since it does not work in its current state). I'm not an expert on FC and maybe specifying an address is the right (and only) way to specify and interface for FC. For TCP, however, it's not advisable. Specifying an interface by its associated IP address is less intuitive than specifying the actual interface name and, in some cases, it simply won't work. That's because the association between interfaces and IP addresses is not predictable. IP addresses can be changed or can change by themselves over time (e.g. DHCP). Interface names are predictable [1] and will persist over time. Consider the following configuration. 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 inet 100.0.0.100/24 scope global lo valid_lft forever preferred_lft forever 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff inet 100.0.0.100/24 scope global enp0s3 valid_lft forever preferred_lft forever 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 100.0.0.100/24 scope global enp0s8 valid_lft forever preferred_lft forever The above is a VM that I configured with the same IP address (100.0.0.100) on all interfaces. Doing a reverse lookup to identify the unique interface associated with 100.0.0.100 would simply not work here. And this is why the option host_iface is required. I understand that the above config does not represent a standard host system, but I'm using this to prove a point: "we can never know how a user will configure their system and the above configuration is perfectly fine by Linux". The current TCP implementation for host_traddr uses bind()-before-connect(). This is a common construct to set the source IP address on the socket before connecting. This has no effect on how Linux will select the interface for the connection. That's because Linux uses the Weak End System model as described in RFC1122 [2]. Setting the source address on a connection is a common requirement that linux-nvme needs to support. In fact, specifying the Source IP address is a mandatory FedGov requirement (e.g. connection to a RADIUS/TACACS+ server). Consider the following configuration. $ ip addr list dev enp0s8 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 192.168.56.101/24 brd 192.168.56.255 scope global dynamic noprefixroute enp0s8 valid_lft 426sec preferred_lft 426sec inet 192.168.56.102/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.103/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.104/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever Here we can see that several addresses are associated with interface enp0s8. By default, Linux will select the default IP address, 192.168.56.101, as the source address when connecting over interface enp0s8. Some users, however, want the ability to specify a different address (e.g., 192.168.56.103) to be used as the source address. The option host_traddr can be used as-is to perform this function (I tested it). In conclusion, I believe that for TCP we need 2 options. One that can be used to specify an interface. And one that can be used to set the source address. And users should be allowed to use one or the other, or both, or none. Of course, the documentation for host_traddr will need some clarification. It should state that when used for TCP connection, this option only sets the source address. And the documentation for host_iface should say that this option only applies to TCP connections. References: [1] https://www.freedesktop.org/wiki/Software/systemd/PredictableNetworkInterfaceNames/ [2] https://tools.ietf.org/html/rfc1122 Regards, Martin Belanger Dell Inc. _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-05 14:31 ` Belanger, Martin @ 2021-05-05 18:33 ` James Smart 2021-05-05 20:32 ` Sagi Grimberg 2021-05-06 6:05 ` Hannes Reinecke 2 siblings, 0 replies; 23+ messages in thread From: James Smart @ 2021-05-05 18:33 UTC (permalink / raw) To: Belanger, Martin, Hannes Reinecke, Sagi Grimberg, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch On 5/5/2021 7:31 AM, Belanger, Martin wrote: ... >>> >>> Given that this was the original intent for host_traddr, why not have >>> host_traddr resolve the iface from the address and set sockopt >>> SO_BINDTODEVICE on it? >>> >> That was my question, too. >> >> I would vastly prefer to not have another option to deal with (as it raises the >> question whether to add it eg during 'nvme connect-all') And one could >> argue that this was the intention of _having_ the host_traddr argument in >> the first place ... >> >> Cheers, >> >> Hannes > > Hi Sagi and Hannes, > > Correct me if I'm wrong, but it sounds like host_traddr was primarily added for FC (at least it wasn't tested for TCP since it does not work in its current state). I'm not an expert on FC and maybe specifying an address is the right (and only) way to specify and interface for FC. For TCP, however, it's not advisable. Specifying an interface by its associated IP address is less intuitive than specifying the actual interface name and, in some cases, it simply won't work. That's because the association between interfaces and IP addresses is not predictable. IP addresses can be changed or can change by themselves over time (e.g. DHCP). HOST_TRADDR exists only in linux and has no strict definition other than what we define for it. I fully expect its value, just like TRADDR, will be transport specific. For FC, we didn't want to use names based on driver instances, or on FC link addresses, so we chose WWNs that were specific to a FC port to identify it. The transport has the mechanisms to map the WWNs to a nvme-supporting FC host port. Define whatever is necessary for rdma or TCP and what they put into HOST_TRADDR. ... > > In conclusion, I believe that for TCP we need 2 options. One that can be used to specify an interface. And one that can be used to set the source address. And users should be allowed to use one or the other, or both, or none. Of course, the documentation for host_traddr will need some clarification. It should state that when used for TCP connection, this option only sets the source address. And the documentation for host_iface should say that this option only applies to TCP connections. I don't like seeing 2 options. I'd rather you stuck with a single option and added formatting to the option so you could specify one or the other or both (or none by not specifying the option). -- james _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-05 14:31 ` Belanger, Martin 2021-05-05 18:33 ` James Smart @ 2021-05-05 20:32 ` Sagi Grimberg 2021-05-06 18:27 ` Michael Christie 2021-05-06 6:05 ` Hannes Reinecke 2 siblings, 1 reply; 23+ messages in thread From: Sagi Grimberg @ 2021-05-05 20:32 UTC (permalink / raw) To: Belanger, Martin, Hannes Reinecke, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch >>> Given that this was the original intent for host_traddr, why not have >>> host_traddr resolve the iface from the address and set sockopt >>> SO_BINDTODEVICE on it? >>> >> That was my question, too. >> >> I would vastly prefer to not have another option to deal with (as it raises the >> question whether to add it eg during 'nvme connect-all') And one could >> argue that this was the intention of _having_ the host_traddr argument in >> the first place ... >> >> Cheers, >> >> Hannes >> -- >> Dr. Hannes Reinecke Kernel Storage Architect >> hare@suse.de +49 911 74053 688 >> SUSE Software Solutions Germany GmbH, 90409 Nürnberg >> GF: F. Imendörffer, HRB 36809 (AG Nürnberg) > > Hi Sagi and Hannes, > > Correct me if I'm wrong, but it sounds like host_traddr was primarily added for FC (at least it wasn't tested for TCP since it does not work in its current state). I'm not an expert on FC and maybe specifying an address is the right (and only) way to specify and interface for FC. For TCP, however, it's not advisable. Specifying an interface by its associated IP address is less intuitive than specifying the actual interface name and, in some cases, it simply won't work. That's because the association between interfaces and IP addresses is not predictable. IP addresses can be changed or can change by themselves over time (e.g. DHCP). Interface names are predictable [1] and will persist over time. Consider the following configuration. > > 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 > link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 > inet 100.0.0.100/24 scope global lo > valid_lft forever preferred_lft forever > 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 > link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff > inet 100.0.0.100/24 scope global enp0s3 > valid_lft forever preferred_lft forever > 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 > link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff > inet 100.0.0.100/24 scope global enp0s8 > valid_lft forever preferred_lft forever > > The above is a VM that I configured with the same IP address (100.0.0.100) on all interfaces. Doing a reverse lookup to identify the unique interface associated with 100.0.0.100 would simply not work here. And this is why the option host_iface is required. I understand that the above config does not represent a standard host system, but I'm using this to prove a point: "we can never know how a user will configure their system and the above configuration is perfectly fine by Linux". Is this a common setting? I'd say that we should probably see a real life need for it before adding a user interface for it. What if we start with doing bind + BINDTODEVICE sockopt and if interface resolution results in multiple devices we just skip the BINDTODEVICE sockopt (which will not introduce a regression). If this will cover 99% of the use-cases we are in good shape and we didn't introduce yet another ABI that may be just confusing... Having said that, if this setting is a real use-case we need to support then there is no alternative to have the two options. So I'm a bit on the fence here. > The current TCP implementation for host_traddr uses bind()-before-connect(). This is a common construct to set the source IP address on the socket before connecting. This has no effect on how Linux will select the interface for the connection. That's because Linux uses the Weak End System model as described in RFC1122 [2]. Setting the source address on a connection is a common requirement that linux-nvme needs to support. In fact, specifying the Source IP address is a mandatory FedGov requirement (e.g. connection to a RADIUS/TACACS+ server). Consider the following configuration. > > $ ip addr list dev enp0s8 > 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 > link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff > inet 192.168.56.101/24 brd 192.168.56.255 scope global dynamic noprefixroute enp0s8 > valid_lft 426sec preferred_lft 426sec > inet 192.168.56.102/24 scope global secondary enp0s8 > valid_lft forever preferred_lft forever > inet 192.168.56.103/24 scope global secondary enp0s8 > valid_lft forever preferred_lft forever > inet 192.168.56.104/24 scope global secondary enp0s8 > valid_lft forever preferred_lft forever > > Here we can see that several addresses are associated with interface enp0s8. By default, Linux will select the default IP address, 192.168.56.101, as the source address when connecting over interface enp0s8. Some users, however, want the ability to specify a different address (e.g., 192.168.56.103) to be used as the source address. The option host_traddr can be used as-is to perform this function (I tested it). > I meant that we do both bind and sockopt for host_traddr. _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-05 20:32 ` Sagi Grimberg @ 2021-05-06 18:27 ` Michael Christie 0 siblings, 0 replies; 23+ messages in thread From: Michael Christie @ 2021-05-06 18:27 UTC (permalink / raw) To: Sagi Grimberg Cc: Belanger, Martin, Hannes Reinecke, Martin Belanger, linux-nvme, kbusch, axboe, hch > On May 5, 2021, at 3:32 PM, Sagi Grimberg <sagi@grimberg.me> wrote: > > >>>> Given that this was the original intent for host_traddr, why not have >>>> host_traddr resolve the iface from the address and set sockopt >>>> SO_BINDTODEVICE on it? >>>> >>> That was my question, too. >>> >>> I would vastly prefer to not have another option to deal with (as it raises the >>> question whether to add it eg during 'nvme connect-all') And one could >>> argue that this was the intention of _having_ the host_traddr argument in >>> the first place ... >>> >>> Cheers, >>> >>> Hannes >>> -- >>> Dr. Hannes Reinecke Kernel Storage Architect >>> hare@suse.de +49 911 74053 688 >>> SUSE Software Solutions Germany GmbH, 90409 Nürnberg >>> GF: F. Imendörffer, HRB 36809 (AG Nürnberg) >> Hi Sagi and Hannes, >> Correct me if I'm wrong, but it sounds like host_traddr was primarily added for FC (at least it wasn't tested for TCP since it does not work in its current state). I'm not an expert on FC and maybe specifying an address is the right (and only) way to specify and interface for FC. For TCP, however, it's not advisable. Specifying an interface by its associated IP address is less intuitive than specifying the actual interface name and, in some cases, it simply won't work. That's because the association between interfaces and IP addresses is not predictable. IP addresses can be changed or can change by themselves over time (e.g. DHCP). Interface names are predictable [1] and will persist over time. Consider the following configuration. >> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 >> link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 >> inet 100.0.0.100/24 scope global lo >> valid_lft forever preferred_lft forever >> 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 >> link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff >> inet 100.0.0.100/24 scope global enp0s3 >> valid_lft forever preferred_lft forever >> 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 >> link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff >> inet 100.0.0.100/24 scope global enp0s8 >> valid_lft forever preferred_lft forever >> The above is a VM that I configured with the same IP address (100.0.0.100) on all interfaces. Doing a reverse lookup to identify the unique interface associated with 100.0.0.100 would simply not work here. And this is why the option host_iface is required. I understand that the above config does not represent a standard host system, but I'm using this to prove a point: "we can never know how a user will configure their system and the above configuration is perfectly fine by Linux". > > Is this a common setting? I'd say that we should probably see a real > life need for it before adding a user interface for it. For iscsi, people used SO_BINDTODEVICE because for whatver reason they really like putting multiple interfaces on the same subnet. They would then want to have connection1 use enp0s0 and connection2 use enp0s1, etc. I can’t remember why (it could be a bug or maybe the user can’t config the routing table) but just doing a bind() will sometimes not work because packets will come in on enp0s0 but when you do a send() it would go out on enp0s. But enp0s1 might be connected in way it can only reach certain target ports. The target would then not see the response. _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-05 14:31 ` Belanger, Martin 2021-05-05 18:33 ` James Smart 2021-05-05 20:32 ` Sagi Grimberg @ 2021-05-06 6:05 ` Hannes Reinecke 2021-05-06 7:00 ` Hannes Reinecke 2 siblings, 1 reply; 23+ messages in thread From: Hannes Reinecke @ 2021-05-06 6:05 UTC (permalink / raw) To: Belanger, Martin, Sagi Grimberg, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch On 5/5/21 4:31 PM, Belanger, Martin wrote: >> -----Original Message----- >> From: Hannes Reinecke <hare@suse.de> >> Sent: Wednesday, May 5, 2021 4:47 AM >> To: Sagi Grimberg; Martin Belanger; linux-nvme@lists.infradead.org >> Cc: kbusch@kernel.org; axboe@fb.com; hch@lst.de; Belanger, Martin >> Subject: Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can >> be used to specify the IP interface to use for the connection. The driver uses >> that to set SO_BINDTODEVICE on the socket before connecting. >> >> >> [EXTERNAL EMAIL] >> >> On 5/4/21 9:56 PM, Sagi Grimberg wrote: >>> >>>> From: Martin Belanger <martin.belanger@dell.com> >>> >>> Change log is missing... >>> >>>> >>>> --- >>>> drivers/nvme/host/core.c | 5 +++++ >>>> drivers/nvme/host/fabrics.c | 14 +++++++++++++ >>>> drivers/nvme/host/fabrics.h | 6 +++++- >>>> drivers/nvme/host/tcp.c | 41 >>>> ++++++++++++++++++++++++++++++++++--- >>>> 4 files changed, 62 insertions(+), 4 deletions(-) >>>> >>>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c >>>> index 288ac47ff5b4..91ae11a1ae26 100644 >>>> --- a/drivers/nvme/host/core.c >>>> +++ b/drivers/nvme/host/core.c >>>> @@ -3961,6 +3961,11 @@ static int nvme_class_uevent(struct device >>>> *dev, struct kobj_uevent_env *env) >>>> ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", >>>> opts->host_traddr ?: "none"); >>>> + if (ret) >>>> + return ret; >>>> + >>>> + ret = add_uevent_var(env, "NVME_HOST_TRIFACE=%s", >>>> + opts->host_triface ?: "none"); >>> >>> Given that this was the original intent for host_traddr, why not have >>> host_traddr resolve the iface from the address and set sockopt >>> SO_BINDTODEVICE on it? >>> >> That was my question, too. >> >> I would vastly prefer to not have another option to deal with (as it raises the >> question whether to add it eg during 'nvme connect-all') And one could >> argue that this was the intention of _having_ the host_traddr argument in >> the first place ... >> >> Cheers, >> >> Hannes >> -- >> Dr. Hannes Reinecke Kernel Storage Architect >> hare@suse.de +49 911 74053 688 >> SUSE Software Solutions Germany GmbH, 90409 Nürnberg >> GF: F. Imendörffer, HRB 36809 (AG Nürnberg) > > Hi Sagi and Hannes, > > Correct me if I'm wrong, but it sounds like host_traddr was primarily added for FC (at least it wasn't tested for TCP since it does not work in its current state). I'm not an expert on FC and maybe specifying an address is the right (and only) way to specify and interface for FC. For TCP, however, it's not advisable. Specifying an interface by its associated IP address is less intuitive than specifying the actual interface name and, in some cases, it simply won't work. That's because the association between interfaces and IP addresses is not predictable. IP addresses can be changed or can change by themselves over time (e.g. DHCP). Interface names are predictable [1] and will persist over time. Consider the following configuration. > > 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN group default qlen 1000 > link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 > inet 100.0.0.100/24 scope global lo > valid_lft forever preferred_lft forever > 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 > link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff > inet 100.0.0.100/24 scope global enp0s3 > valid_lft forever preferred_lft forever > 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 > link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff > inet 100.0.0.100/24 scope global enp0s8 > valid_lft forever preferred_lft forever > > The above is a VM that I configured with the same IP address (100.0.0.100) on all interfaces. Doing a reverse > lookup to identify the unique interface associated with 100.0.0.100 would simply not work here. And this is why > the option host_iface is required. I understand that the above config does not represent a standard host system, > but I'm using this to prove a point: "we can never know how a user will configure their system and the above > configuration is perfectly fine by Linux". > ... and messing up any switch MAC address caching when doing so. I guess the network admin will come down hard on you if you try that on a production system. And I sincerely question whether this is a valid use-case; I'm already getting grief from our network admins if I dare to put two network interfaces from the same machine in the same network. > The current TCP implementation for host_traddr uses bind()-before-connect(). This is a common construct to set the > source IP address on the socket before connecting. This has no effect on how Linux will select the interface for the > connection. That's because Linux uses the Weak End System model as described in RFC1122 [2]. Setting the source address > on a connection is a common requirement that linux-nvme needs to support. In fact, specifying the Source IP address > is a mandatory FedGov requirement (e.g. connection to a RADIUS/TACACS+ server). Consider the following configuration. > > $ ip addr list dev enp0s8 > 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel state UP group default qlen 1000 > link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff > inet 192.168.56.101/24 brd 192.168.56.255 scope global dynamic noprefixroute enp0s8 > valid_lft 426sec preferred_lft 426sec > inet 192.168.56.102/24 scope global secondary enp0s8 > valid_lft forever preferred_lft forever > inet 192.168.56.103/24 scope global secondary enp0s8 > valid_lft forever preferred_lft forever > inet 192.168.56.104/24 scope global secondary enp0s8 > valid_lft forever preferred_lft forever > > Here we can see that several addresses are associated with interface enp0s8. By default, Linux will select the > default IP address, 192.168.56.101, as the source address when connecting over interface enp0s8. Some users, > however, want the ability to specify a different address (e.g., 192.168.56.103) to be used as the source address. > The option host_traddr can be used as-is to perform this function (I tested it). > No disagreement here. > In conclusion, I believe that for TCP we need 2 options. One that can be used to specify an interface. And one > that can be used to set the source address. And users should be allowed to use one or the other, or both, or none. > Of course, the documentation for host_traddr will need some clarification. It should state that when used for TCP > connection, this option only sets the source address. And the documentation for host_iface should say that this > option only applies to TCP connections. > I'm with James Smart here. I do fail to see the need for 'host_iface' _without_ 'host_traddr'; especially for IPv6 where several addresses are standard just specifying 'host_iface' simply is not enough, and one has to specify 'host_traddr' additionally. So 'host_iface' should be contingent on 'host_traddr', meaning we can just expand the syntax of 'host_traddr'. One easy possibility would be to add ',nobind' to the host_traddr syntax which would indicate that we should _not_ bind to the underlying interface; I do think that binding to the respective interface should be the default. Cheers, Hannes -- Dr. Hannes Reinecke Kernel Storage Architect hare@suse.de +49 911 74053 688 SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-06 6:05 ` Hannes Reinecke @ 2021-05-06 7:00 ` Hannes Reinecke 2021-05-06 15:46 ` Belanger, Martin 0 siblings, 1 reply; 23+ messages in thread From: Hannes Reinecke @ 2021-05-06 7:00 UTC (permalink / raw) To: Belanger, Martin, Sagi Grimberg, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch On 5/6/21 8:05 AM, Hannes Reinecke wrote: > On 5/5/21 4:31 PM, Belanger, Martin wrote: [ .. ] >> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN >> group default qlen 1000 >> link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 >> inet 100.0.0.100/24 scope global lo >> valid_lft forever preferred_lft forever >> 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel >> state UP group default qlen 1000 >> link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff >> inet 100.0.0.100/24 scope global enp0s3 >> valid_lft forever preferred_lft forever >> 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel >> state UP group default qlen 1000 >> link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff >> inet 100.0.0.100/24 scope global enp0s8 >> valid_lft forever preferred_lft forever >> >> The above is a VM that I configured with the same IP address >> (100.0.0.100) on all interfaces. Doing a reverse >> lookup to identify the unique interface associated with 100.0.0.100 >> would simply not work here. And this is why >> the option host_iface is required. I understand that the above config >> does not represent a standard host system, >> but I'm using this to prove a point: "we can never know how a user >> will configure their system and the above >> configuration is perfectly fine by Linux". >> > > ... and messing up any switch MAC address caching when doing so. I guess > the network admin will come down hard on you if you try that on a > production system. > And I sincerely question whether this is a valid use-case; I'm already > getting grief from our network admins if I dare to put two network > interfaces from the same machine in the same network. > >> The current TCP implementation for host_traddr uses >> bind()-before-connect(). This is a common construct to set the >> source IP address on the socket before connecting. This has no effect >> on how Linux will select the interface for the >> connection. That's because Linux uses the Weak End System model as >> described in RFC1122 [2]. Setting the source address >> on a connection is a common requirement that linux-nvme needs to >> support. In fact, specifying the Source IP address >> is a mandatory FedGov requirement (e.g. connection to a RADIUS/TACACS+ >> server). Consider the following configuration. >> >> $ ip addr list dev enp0s8 >> 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc fq_codel >> state UP group default qlen 1000 >> link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff >> inet 192.168.56.101/24 brd 192.168.56.255 scope global dynamic >> noprefixroute enp0s8 >> valid_lft 426sec preferred_lft 426sec >> inet 192.168.56.102/24 scope global secondary enp0s8 >> valid_lft forever preferred_lft forever >> inet 192.168.56.103/24 scope global secondary enp0s8 >> valid_lft forever preferred_lft forever >> inet 192.168.56.104/24 scope global secondary enp0s8 >> valid_lft forever preferred_lft forever >> >> Here we can see that several addresses are associated with interface >> enp0s8. By default, Linux will select the >> default IP address, 192.168.56.101, as the source address when >> connecting over interface enp0s8. Some users, >> however, want the ability to specify a different address (e.g., >> 192.168.56.103) to be used as the source address. >> The option host_traddr can be used as-is to perform this function (I >> tested it). >> > > No disagreement here. > >> In conclusion, I believe that for TCP we need 2 options. One that can >> be used to specify an interface. And one >> that can be used to set the source address. And users should be >> allowed to use one or the other, or both, or none. >> Of course, the documentation for host_traddr will need some >> clarification. It should state that when used for TCP >> connection, this option only sets the source address. And the >> documentation for host_iface should say that this >> option only applies to TCP connections. >> > > I'm with James Smart here. I do fail to see the need for 'host_iface' > _without_ 'host_traddr'; especially for IPv6 where several addresses are > standard just specifying 'host_iface' simply is not enough, and one has > to specify 'host_traddr' additionally. > > So 'host_iface' should be contingent on 'host_traddr', meaning we can > just expand the syntax of 'host_traddr'. > One easy possibility would be to add ',nobind' to the host_traddr syntax > which would indicate that we should _not_ bind to the underlying > interface; I do think that binding to the respective interface should be > the default. > A-ha. Just spoke to our network folks, and they clarified the usage of binding to an IP address vs binding to a network interface. Apparently, binding to a source IP address does just that, setting the source IP address of the outgoing packet. That packet will _still_ be subjected to the normal routing table, as the routing table is just influenced by the _destination_ IP address. So if we want to have it routed via a specific interface (and thereby influencing the routing table) we need to bind it to that interface. The only valid scenario our network folks could come up with where we do _not_ want to bind to an interface is for asymmetric flows, ie in cases where the outgoing flow is routed to one interface and the incoming flow is arriving on another interface. But even they admitted that it's not a common scenario, and probably will be killed by anti-spoofing software running on the core switches ... But if we want to support _that_ then clearly binding to a specific interface doesn't work. So I would vote for making binding to the network interface holding the IP address the default, and add an option ',nobind' to host_traddr to skip it. Cheers, Hannes -- Dr. Hannes Reinecke Kernel Storage Architect hare@suse.de +49 911 74053 688 SUSE Software Solutions Germany GmbH, 90409 Nürnberg GF: F. Imendörffer, HRB 36809 (AG Nürnberg) _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-06 7:00 ` Hannes Reinecke @ 2021-05-06 15:46 ` Belanger, Martin 2021-05-07 18:20 ` Sagi Grimberg 0 siblings, 1 reply; 23+ messages in thread From: Belanger, Martin @ 2021-05-06 15:46 UTC (permalink / raw) To: Hannes Reinecke, Sagi Grimberg, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch > On 5/6/21 8:05 AM, Hannes Reinecke wrote: > > On 5/5/21 4:31 PM, Belanger, Martin wrote: > [ .. ] > >> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state > UNKNOWN > >> group default qlen 1000 > >> link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 > >> inet 100.0.0.100/24 scope global lo > >> valid_lft forever preferred_lft forever > >> 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc > fq_codel > >> state UP group default qlen 1000 > >> link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff > >> inet 100.0.0.100/24 scope global enp0s3 > >> valid_lft forever preferred_lft forever > >> 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc > fq_codel > >> state UP group default qlen 1000 > >> link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff > >> inet 100.0.0.100/24 scope global enp0s8 > >> valid_lft forever preferred_lft forever > >> > >> The above is a VM that I configured with the same IP address > >> (100.0.0.100) on all interfaces. Doing a reverse lookup to identify > >> the unique interface associated with 100.0.0.100 would simply not > >> work here. And this is why the option host_iface is required. I > >> understand that the above config does not represent a standard host > >> system, but I'm using this to prove a point: "we can never know how a > >> user will configure their system and the above configuration is > >> perfectly fine by Linux". > >> > > > > ... and messing up any switch MAC address caching when doing so. I > > guess the network admin will come down hard on you if you try that on > > a production system. > > And I sincerely question whether this is a valid use-case; I'm already > > getting grief from our network admins if I dare to put two network > > interfaces from the same machine in the same network. > > > >> The current TCP implementation for host_traddr uses > >> bind()-before-connect(). This is a common construct to set the source > >> IP address on the socket before connecting. This has no effect on how > >> Linux will select the interface for the connection. That's because > >> Linux uses the Weak End System model as described in RFC1122 [2]. > >> Setting the source address on a connection is a common requirement > >> that linux-nvme needs to support. In fact, specifying the Source IP > >> address is a mandatory FedGov requirement (e.g. connection to a > >> RADIUS/TACACS+ server). Consider the following configuration. > >> > >> $ ip addr list dev enp0s8 > >> 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc > fq_codel > >> state UP group default qlen 1000 > >> link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff > >> inet 192.168.56.101/24 brd 192.168.56.255 scope global dynamic > >> noprefixroute enp0s8 > >> valid_lft 426sec preferred_lft 426sec > >> inet 192.168.56.102/24 scope global secondary enp0s8 > >> valid_lft forever preferred_lft forever > >> inet 192.168.56.103/24 scope global secondary enp0s8 > >> valid_lft forever preferred_lft forever > >> inet 192.168.56.104/24 scope global secondary enp0s8 > >> valid_lft forever preferred_lft forever > >> > >> Here we can see that several addresses are associated with interface > >> enp0s8. By default, Linux will select the default IP address, > >> 192.168.56.101, as the source address when connecting over interface > >> enp0s8. Some users, however, want the ability to specify a different > >> address (e.g., > >> 192.168.56.103) to be used as the source address. > >> The option host_traddr can be used as-is to perform this function (I > >> tested it). > >> > > > > No disagreement here. > > > >> In conclusion, I believe that for TCP we need 2 options. One that can > >> be used to specify an interface. And one that can be used to set the > >> source address. And users should be allowed to use one or the other, > >> or both, or none. > >> Of course, the documentation for host_traddr will need some > >> clarification. It should state that when used for TCP connection, > >> this option only sets the source address. And the documentation for > >> host_iface should say that this option only applies to TCP > >> connections. > >> > > > > I'm with James Smart here. I do fail to see the need for 'host_iface' > > _without_ 'host_traddr'; especially for IPv6 where several addresses > > are standard just specifying 'host_iface' simply is not enough, and > > one has to specify 'host_traddr' additionally. > > > > So 'host_iface' should be contingent on 'host_traddr', meaning we can > > just expand the syntax of 'host_traddr'. > > One easy possibility would be to add ',nobind' to the host_traddr > > syntax which would indicate that we should _not_ bind to the > > underlying interface; I do think that binding to the respective > > interface should be the default. > > > A-ha. Just spoke to our network folks, and they clarified the usage of binding > to an IP address vs binding to a network interface. > Apparently, binding to a source IP address does just that, setting the source > IP address of the outgoing packet. That packet will _still_ be subjected to the > normal routing table, as the routing table is just influenced by the > _destination_ IP address. > So if we want to have it routed via a specific interface (and thereby > influencing the routing table) we need to bind it to that interface. > > The only valid scenario our network folks could come up with where we do > _not_ want to bind to an interface is for asymmetric flows, ie in cases where > the outgoing flow is routed to one interface and the incoming flow is arriving > on another interface. But even they admitted that it's not a common > scenario, and probably will be killed by anti-spoofing software running on > the core switches ... > > But if we want to support _that_ then clearly binding to a specific interface > doesn't work. > > So I would vote for making binding to the network interface holding the IP > address the default, and add an option ',nobind' to host_traddr to skip it. > > Cheers, > > Hannes > -- > Dr. Hannes Reinecke Kernel Storage Architect > hare@suse.de +49 911 74053 688 > SUSE Software Solutions Germany GmbH, 90409 Nürnberg > GF: F. Imendörffer, HRB 36809 (AG Nürnberg) Hi Hannes, If the only concern here is the addition of yet another option (--host-iface), then may I suggest a simpler approach. What I'm proposing adheres to RFC4007 [1], which defines a way to specify an interface by using the '%' delimiter between the Destination IP address and the Interface. In fact, "ping" uses this approach [2]. With ping, one can force the connection to go a specific interface like this: ping <dest-ip-addr>%<interface> Extending this approach to nvme-cli we arrive to something like this: nvme discover --traddr 100.64.29.2%enp0s8 --host-traddr 192.168.56.102 .... This tells nvme to connect to 100.64.29.2 on interface enp0s8. We make no change to the --host-traddr option. It continues to be used to specify the Source IP address only (for the rare cases where users want to specify a Source Address other than the default). With this, the interface is specified by name and not by its associated address. This is not only more intuitive, but, as I stated before, eliminates the problem caused by mapping the same IP address to multiple interfaces (not to mention that doing a reverse lookup on an IP address to find the interface is extra work that we don’t need to do in kernel space). This gives us full control. It allows us to set the "interface" and the "source address". We can set both, or only one of the two, or none. It follows standards and there is a precedent with "ping". NOTE: RFC4007 specifically discusses specifying an Interface for IPv6, but I don’t see any issue extending the use of the '%' delimiter to IPv4 as well. Ref: [1] https://tools.ietf.org/html/rfc4007 [2] https://man7.org/linux/man-pages/man8/ping.8.html Regards, Martin Belanger Dell EMC _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-06 15:46 ` Belanger, Martin @ 2021-05-07 18:20 ` Sagi Grimberg 2021-05-10 13:49 ` Belanger, Martin 0 siblings, 1 reply; 23+ messages in thread From: Sagi Grimberg @ 2021-05-07 18:20 UTC (permalink / raw) To: Belanger, Martin, Hannes Reinecke, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch On 5/6/21 8:46 AM, Belanger, Martin wrote: >> On 5/6/21 8:05 AM, Hannes Reinecke wrote: >>> On 5/5/21 4:31 PM, Belanger, Martin wrote: >> [ .. ] >>>> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state >> UNKNOWN >>>> group default qlen 1000 >>>> link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 >>>> inet 100.0.0.100/24 scope global lo >>>> valid_lft forever preferred_lft forever >>>> 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc >> fq_codel >>>> state UP group default qlen 1000 >>>> link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff >>>> inet 100.0.0.100/24 scope global enp0s3 >>>> valid_lft forever preferred_lft forever >>>> 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc >> fq_codel >>>> state UP group default qlen 1000 >>>> link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff >>>> inet 100.0.0.100/24 scope global enp0s8 >>>> valid_lft forever preferred_lft forever >>>> >>>> The above is a VM that I configured with the same IP address >>>> (100.0.0.100) on all interfaces. Doing a reverse lookup to identify >>>> the unique interface associated with 100.0.0.100 would simply not >>>> work here. And this is why the option host_iface is required. I >>>> understand that the above config does not represent a standard host >>>> system, but I'm using this to prove a point: "we can never know how a >>>> user will configure their system and the above configuration is >>>> perfectly fine by Linux". >>>> >>> >>> ... and messing up any switch MAC address caching when doing so. I >>> guess the network admin will come down hard on you if you try that on >>> a production system. >>> And I sincerely question whether this is a valid use-case; I'm already >>> getting grief from our network admins if I dare to put two network >>> interfaces from the same machine in the same network. >>> >>>> The current TCP implementation for host_traddr uses >>>> bind()-before-connect(). This is a common construct to set the source >>>> IP address on the socket before connecting. This has no effect on how >>>> Linux will select the interface for the connection. That's because >>>> Linux uses the Weak End System model as described in RFC1122 [2]. >>>> Setting the source address on a connection is a common requirement >>>> that linux-nvme needs to support. In fact, specifying the Source IP >>>> address is a mandatory FedGov requirement (e.g. connection to a >>>> RADIUS/TACACS+ server). Consider the following configuration. >>>> >>>> $ ip addr list dev enp0s8 >>>> 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc >> fq_codel >>>> state UP group default qlen 1000 >>>> link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff >>>> inet 192.168.56.101/24 brd 192.168.56.255 scope global dynamic >>>> noprefixroute enp0s8 >>>> valid_lft 426sec preferred_lft 426sec >>>> inet 192.168.56.102/24 scope global secondary enp0s8 >>>> valid_lft forever preferred_lft forever >>>> inet 192.168.56.103/24 scope global secondary enp0s8 >>>> valid_lft forever preferred_lft forever >>>> inet 192.168.56.104/24 scope global secondary enp0s8 >>>> valid_lft forever preferred_lft forever >>>> >>>> Here we can see that several addresses are associated with interface >>>> enp0s8. By default, Linux will select the default IP address, >>>> 192.168.56.101, as the source address when connecting over interface >>>> enp0s8. Some users, however, want the ability to specify a different >>>> address (e.g., >>>> 192.168.56.103) to be used as the source address. >>>> The option host_traddr can be used as-is to perform this function (I >>>> tested it). >>>> >>> >>> No disagreement here. >>> >>>> In conclusion, I believe that for TCP we need 2 options. One that can >>>> be used to specify an interface. And one that can be used to set the >>>> source address. And users should be allowed to use one or the other, >>>> or both, or none. >>>> Of course, the documentation for host_traddr will need some >>>> clarification. It should state that when used for TCP connection, >>>> this option only sets the source address. And the documentation for >>>> host_iface should say that this option only applies to TCP >>>> connections. >>>> >>> >>> I'm with James Smart here. I do fail to see the need for 'host_iface' >>> _without_ 'host_traddr'; especially for IPv6 where several addresses >>> are standard just specifying 'host_iface' simply is not enough, and >>> one has to specify 'host_traddr' additionally. >>> >>> So 'host_iface' should be contingent on 'host_traddr', meaning we can >>> just expand the syntax of 'host_traddr'. >>> One easy possibility would be to add ',nobind' to the host_traddr >>> syntax which would indicate that we should _not_ bind to the >>> underlying interface; I do think that binding to the respective >>> interface should be the default. >>> >> A-ha. Just spoke to our network folks, and they clarified the usage of binding >> to an IP address vs binding to a network interface. >> Apparently, binding to a source IP address does just that, setting the source >> IP address of the outgoing packet. That packet will _still_ be subjected to the >> normal routing table, as the routing table is just influenced by the >> _destination_ IP address. >> So if we want to have it routed via a specific interface (and thereby >> influencing the routing table) we need to bind it to that interface. >> >> The only valid scenario our network folks could come up with where we do >> _not_ want to bind to an interface is for asymmetric flows, ie in cases where >> the outgoing flow is routed to one interface and the incoming flow is arriving >> on another interface. But even they admitted that it's not a common >> scenario, and probably will be killed by anti-spoofing software running on >> the core switches ... >> >> But if we want to support _that_ then clearly binding to a specific interface >> doesn't work. >> >> So I would vote for making binding to the network interface holding the IP >> address the default, and add an option ',nobind' to host_traddr to skip it. >> >> Cheers, >> >> Hannes >> -- >> Dr. Hannes Reinecke Kernel Storage Architect >> hare@suse.de +49 911 74053 688 >> SUSE Software Solutions Germany GmbH, 90409 Nürnberg >> GF: F. Imendörffer, HRB 36809 (AG Nürnberg) > > Hi Hannes, > > If the only concern here is the addition of yet another option (--host-iface), then may I suggest a simpler approach. What I'm proposing adheres to RFC4007 [1], which defines a way to specify an interface by using the '%' delimiter between the Destination IP address and the Interface. In fact, "ping" uses this approach [2]. With ping, one can force the connection to go a specific interface like this: > > ping <dest-ip-addr>%<interface> Ping only supports this syntax for IPv6 no? > Extending this approach to nvme-cli we arrive to something like this: > > nvme discover --traddr 100.64.29.2%enp0s8 --host-traddr 192.168.56.102 .... We already support this for IPv6, we can do that also for IPv4, but this syntax may not be trivially expected for ipv4? > This tells nvme to connect to 100.64.29.2 on interface enp0s8. We make no change to the --host-traddr option. It continues to be used to specify the Source IP address only (for the rare cases where users want to specify a Source Address other than the default). With this, the interface is specified by name and not by its associated address. This is not only more intuitive, but, as I stated before, eliminates the problem caused by mapping the same IP address to multiple interfaces (not to mention that doing a reverse lookup on an IP address to find the interface is extra work that we don’t need to do in kernel space). Maybe we do something like ping -I for host_traddr, from ping man pages: -I interface interface is either an address, an interface name or a VRF name. If interface is an address, it sets source address to specified interface address. If interface is an interface name, it sets source interface to specified interface. If interface is a VRF name, each packet is routed using the corresponding routing table; in this case, the -I option can be repeated to specify a source address. NOTE: For IPv6, when doing ping to a link-local scope address, link specification (by the '%'-notation in destination, or by this option) can be used but it is no longer required. Without the repetition though, unless we need to support two interfaces that share the same multiple addresses in the same subnet, which sounds completely crazy to me... _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-07 18:20 ` Sagi Grimberg @ 2021-05-10 13:49 ` Belanger, Martin 2021-05-10 18:13 ` Sagi Grimberg 0 siblings, 1 reply; 23+ messages in thread From: Belanger, Martin @ 2021-05-10 13:49 UTC (permalink / raw) To: Sagi Grimberg, Hannes Reinecke, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch > On 5/6/21 8:46 AM, Belanger, Martin wrote: > >> On 5/6/21 8:05 AM, Hannes Reinecke wrote: > >>> On 5/5/21 4:31 PM, Belanger, Martin wrote: > >> [ .. ] > >>>> 1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state > >> UNKNOWN > >>>> group default qlen 1000 > >>>> link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00 > >>>> inet 100.0.0.100/24 scope global lo > >>>> valid_lft forever preferred_lft forever > >>>> 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc > >> fq_codel > >>>> state UP group default qlen 1000 > >>>> link/ether 08:00:27:21:65:ec brd ff:ff:ff:ff:ff:ff > >>>> inet 100.0.0.100/24 scope global enp0s3 > >>>> valid_lft forever preferred_lft forever > >>>> 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc > >> fq_codel > >>>> state UP group default qlen 1000 > >>>> link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff > >>>> inet 100.0.0.100/24 scope global enp0s8 > >>>> valid_lft forever preferred_lft forever > >>>> > >>>> The above is a VM that I configured with the same IP address > >>>> (100.0.0.100) on all interfaces. Doing a reverse lookup to identify > >>>> the unique interface associated with 100.0.0.100 would simply not > >>>> work here. And this is why the option host_iface is required. I > >>>> understand that the above config does not represent a standard host > >>>> system, but I'm using this to prove a point: "we can never know how > >>>> a user will configure their system and the above configuration is > >>>> perfectly fine by Linux". > >>>> > >>> > >>> ... and messing up any switch MAC address caching when doing so. I > >>> guess the network admin will come down hard on you if you try that > >>> on a production system. > >>> And I sincerely question whether this is a valid use-case; I'm > >>> already getting grief from our network admins if I dare to put two > >>> network interfaces from the same machine in the same network. > >>> > >>>> The current TCP implementation for host_traddr uses > >>>> bind()-before-connect(). This is a common construct to set the > >>>> source IP address on the socket before connecting. This has no > >>>> effect on how Linux will select the interface for the connection. > >>>> That's because Linux uses the Weak End System model as described in > RFC1122 [2]. > >>>> Setting the source address on a connection is a common requirement > >>>> that linux-nvme needs to support. In fact, specifying the Source IP > >>>> address is a mandatory FedGov requirement (e.g. connection to a > >>>> RADIUS/TACACS+ server). Consider the following configuration. > >>>> > >>>> $ ip addr list dev enp0s8 > >>>> 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc > >> fq_codel > >>>> state UP group default qlen 1000 > >>>> link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff > >>>> inet 192.168.56.101/24 brd 192.168.56.255 scope global > >>>> dynamic noprefixroute enp0s8 > >>>> valid_lft 426sec preferred_lft 426sec > >>>> inet 192.168.56.102/24 scope global secondary enp0s8 > >>>> valid_lft forever preferred_lft forever > >>>> inet 192.168.56.103/24 scope global secondary enp0s8 > >>>> valid_lft forever preferred_lft forever > >>>> inet 192.168.56.104/24 scope global secondary enp0s8 > >>>> valid_lft forever preferred_lft forever > >>>> > >>>> Here we can see that several addresses are associated with > >>>> interface enp0s8. By default, Linux will select the default IP > >>>> address, 192.168.56.101, as the source address when connecting over > >>>> interface enp0s8. Some users, however, want the ability to specify > >>>> a different address (e.g., > >>>> 192.168.56.103) to be used as the source address. > >>>> The option host_traddr can be used as-is to perform this function > >>>> (I tested it). > >>>> > >>> > >>> No disagreement here. > >>> > >>>> In conclusion, I believe that for TCP we need 2 options. One that > >>>> can be used to specify an interface. And one that can be used to > >>>> set the source address. And users should be allowed to use one or > >>>> the other, or both, or none. > >>>> Of course, the documentation for host_traddr will need some > >>>> clarification. It should state that when used for TCP connection, > >>>> this option only sets the source address. And the documentation for > >>>> host_iface should say that this option only applies to TCP > >>>> connections. > >>>> > >>> > >>> I'm with James Smart here. I do fail to see the need for 'host_iface' > >>> _without_ 'host_traddr'; especially for IPv6 where several addresses > >>> are standard just specifying 'host_iface' simply is not enough, and > >>> one has to specify 'host_traddr' additionally. > >>> > >>> So 'host_iface' should be contingent on 'host_traddr', meaning we > >>> can just expand the syntax of 'host_traddr'. > >>> One easy possibility would be to add ',nobind' to the host_traddr > >>> syntax which would indicate that we should _not_ bind to the > >>> underlying interface; I do think that binding to the respective > >>> interface should be the default. > >>> > >> A-ha. Just spoke to our network folks, and they clarified the usage > >> of binding to an IP address vs binding to a network interface. > >> Apparently, binding to a source IP address does just that, setting > >> the source IP address of the outgoing packet. That packet will > >> _still_ be subjected to the normal routing table, as the routing > >> table is just influenced by the _destination_ IP address. > >> So if we want to have it routed via a specific interface (and thereby > >> influencing the routing table) we need to bind it to that interface. > >> > >> The only valid scenario our network folks could come up with where we > >> do _not_ want to bind to an interface is for asymmetric flows, ie in > >> cases where the outgoing flow is routed to one interface and the > >> incoming flow is arriving on another interface. But even they > >> admitted that it's not a common scenario, and probably will be killed > >> by anti-spoofing software running on the core switches ... > >> > >> But if we want to support _that_ then clearly binding to a specific > >> interface doesn't work. > >> > >> So I would vote for making binding to the network interface holding > >> the IP address the default, and add an option ',nobind' to host_traddr to > skip it. > >> > >> Cheers, > >> > >> Hannes > >> -- > >> Dr. Hannes Reinecke Kernel Storage Architect > >> hare@suse.de +49 911 74053 688 > >> SUSE Software Solutions Germany GmbH, 90409 Nürnberg > >> GF: F. Imendörffer, HRB 36809 (AG Nürnberg) > > > > Hi Hannes, > > > > If the only concern here is the addition of yet another option (--host-iface), > then may I suggest a simpler approach. What I'm proposing adheres to > RFC4007 [1], which defines a way to specify an interface by using the '%' > delimiter between the Destination IP address and the Interface. In fact, > "ping" uses this approach [2]. With ping, one can force the connection to go > a specific interface like this: > > > > ping <dest-ip-addr>%<interface> > > Ping only supports this syntax for IPv6 no? > > > Extending this approach to nvme-cli we arrive to something like this: > > > > nvme discover --traddr 100.64.29.2%enp0s8 --host-traddr 192.168.56.102 > .... > > We already support this for IPv6, we can do that also for IPv4, but this syntax > may not be trivially expected for ipv4? I tried this for IPv6 and it doesn't work. Here's what I get: $ sudo nvme discover -g -G -t tcp -s 8009 -a fe80::800:27ff:fe00:0 Failed to write to /dev/nvme-fabrics: Invalid argument $ sudo nvme discover -g -G -t tcp -s 8009 -a fe80::800:27ff:fe00:0%enp0s8 Failed to write to /dev/nvme-fabrics: Invalid argument $ sudo nvme discover -g -G -t tcp -s 8009 -a [fe80::800:27ff:fe00:0] failed to resolve host [fe80::800:27ff:fe00:0] info $ sudo nvme discover -g -G -t tcp -s 8009 -a [fe80::800:27ff:fe00:0%enp0s8] failed to resolve host [fe80::800:27ff:fe00:0%enp0s8] info > > > This tells nvme to connect to 100.64.29.2 on interface enp0s8. We make no > change to the --host-traddr option. It continues to be used to specify the > Source IP address only (for the rare cases where users want to specify a > Source Address other than the default). With this, the interface is specified > by name and not by its associated address. This is not only more intuitive, > but, as I stated before, eliminates the problem caused by mapping the same > IP address to multiple interfaces (not to mention that doing a reverse lookup > on an IP address to find the interface is extra work that we don’t need to do > in kernel space). > > Maybe we do something like ping -I for host_traddr, from ping man pages: > > -I interface > interface is either an address, an interface name or a VRF name. If > interface is an address, it sets source address to specified interface address. > If interface is an > interface name, it sets source interface to specified interface. If > interface is a VRF name, each packet is routed using the corresponding > routing table; in this case, the -I > option can be repeated to specify a source address. NOTE: > For IPv6, when doing ping to a link-local scope address, link specification (by > the '%'-notation in destination, or > by this option) can be used but it is no longer required. > > > Without the repetition though, unless we need to support two interfaces > that share the same multiple addresses in the same subnet, which sounds > completely crazy to me... Hi Sagi, If we want to follow ping as an example, the repetition is needed not to specify two interfaces, but to specify an interface and the source address. In a previous example (reproduced below), I described a configuration where an interface had several addresses assigned to it. By default, Linux always picks the same Source address (i.e. 192.168.56.101 in this example) when connecting. If a user wants a different source address they need a way to specify it (currently with --host-traddr). Users also need a way to specify an interface separately from the source address (either with a new option like --host-iface or by repeating --host-traddr). With the example below, if we wanted to force ping to use interface enp0s8 and source address 192.168.56.103, we would repeat the -I option, for example "ping -I enp0s8 -I 192.168.56.103". We need a way to do the same with nvme-cli. I thought that introducing a new option, "--host-iface", had the smallest impact since it requires less code changes, but that was turned down (not sure exactly why). I then suggested that we use the '%' delimiter for IPv4 and IPv6. I agree that it is not 100% the same as ping since ping only allows the '%' delimiter for IPv6 addresses (as per RFC4007). As you suggested, we could repeat the --host-traddr option (e.g. --host-traddr enp0s8 --host-traddr 192.168.56.103), but this is more impactful to the code than adding a separate --host-iface option. EXAMPLE: Interface with several addresses assigned: $ ip addr list dev enp0s8 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 ... link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff inet 192.168.56.101/24 brd 192.168.56.255 scope ... valid_lft 426sec preferred_lft 426sec inet 192.168.56.102/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.103/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever inet 192.168.56.104/24 scope global secondary enp0s8 valid_lft forever preferred_lft forever In the end, it doesn't really matter (to me) how it is implemented. However, a solution that have little to no impact on existing code would be nice. Just like ping, we need a way to specify an interface by its **interface name** (and not by its associated IP address), and we need to allow users to select which Source IP address to use when there are multiple addresses associated with an interface. Regards, Martin _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-10 13:49 ` Belanger, Martin @ 2021-05-10 18:13 ` Sagi Grimberg 2021-05-10 19:18 ` Belanger, Martin 0 siblings, 1 reply; 23+ messages in thread From: Sagi Grimberg @ 2021-05-10 18:13 UTC (permalink / raw) To: Belanger, Martin, Hannes Reinecke, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch >>> ping <dest-ip-addr>%<interface> >> >> Ping only supports this syntax for IPv6 no? >> >>> Extending this approach to nvme-cli we arrive to something like this: >>> >>> nvme discover --traddr 100.64.29.2%enp0s8 --host-traddr 192.168.56.102 >> .... >> >> We already support this for IPv6, we can do that also for IPv4, but this syntax >> may not be trivially expected for ipv4? > > I tried this for IPv6 and it doesn't work. Here's what I get: > $ sudo nvme discover -g -G -t tcp -s 8009 -a fe80::800:27ff:fe00:0 > Failed to write to /dev/nvme-fabrics: Invalid argument > $ sudo nvme discover -g -G -t tcp -s 8009 -a fe80::800:27ff:fe00:0%enp0s8 > Failed to write to /dev/nvme-fabrics: Invalid argument > $ sudo nvme discover -g -G -t tcp -s 8009 -a [fe80::800:27ff:fe00:0] > failed to resolve host [fe80::800:27ff:fe00:0] info > $ sudo nvme discover -g -G -t tcp -s 8009 -a [fe80::800:27ff:fe00:0%enp0s8] > failed to resolve host [fe80::800:27ff:fe00:0%enp0s8] info # nvme discover -t tcp -a fe80::5054:ff:fef1:9f3b -w fe80::5054:ff:fe28:5edb%enp6s0 Discovery Log Number of Records 1, Generation counter 5 =====Discovery Log Entry 0====== trtype: tcp adrfam: ipv6 subtype: nvme subsystem treq: not specified, sq flow control disable supported portid: 3 trsvcid: 8009 subnqn: testnqn1 traddr: fe80::5054:ff:fef1:9f3b%enp6s0 sectype: none > >> >>> This tells nvme to connect to 100.64.29.2 on interface enp0s8. We make no >> change to the --host-traddr option. It continues to be used to specify the >> Source IP address only (for the rare cases where users want to specify a >> Source Address other than the default). With this, the interface is specified >> by name and not by its associated address. This is not only more intuitive, >> but, as I stated before, eliminates the problem caused by mapping the same >> IP address to multiple interfaces (not to mention that doing a reverse lookup >> on an IP address to find the interface is extra work that we don’t need to do >> in kernel space). >> >> Maybe we do something like ping -I for host_traddr, from ping man pages: >> >> -I interface >> interface is either an address, an interface name or a VRF name. If >> interface is an address, it sets source address to specified interface address. >> If interface is an >> interface name, it sets source interface to specified interface. If >> interface is a VRF name, each packet is routed using the corresponding >> routing table; in this case, the -I >> option can be repeated to specify a source address. NOTE: >> For IPv6, when doing ping to a link-local scope address, link specification (by >> the '%'-notation in destination, or >> by this option) can be used but it is no longer required. >> >> >> Without the repetition though, unless we need to support two interfaces >> that share the same multiple addresses in the same subnet, which sounds >> completely crazy to me... > > Hi Sagi, > > If we want to follow ping as an example, the repetition is needed not to specify two interfaces, but to specify an interface and the source address. In a previous example (reproduced below), I described a configuration where an interface had several addresses assigned to it. By default, Linux always picks the same Source address (i.e. 192.168.56.101 in this example) when connecting. If a user wants a different source address they need a way to specify it (currently with --host-traddr). Users also need a way to specify an interface separately from the source address (either with a new option like --host-iface or by repeating --host-traddr). With the example below, if we wanted to force ping to use interface enp0s8 and source address 192.168.56.103, we would repeat the -I option, for example "ping -I enp0s8 -I 192.168.56.103". We need a way to do the same with nvme-cli. > > I thought that introducing a new option, "--host-iface", had the smallest impact since it requires less code changes, but that was turned down (not sure exactly why). I then suggested that we use the '%' delimiter for IPv4 and IPv6. I agree that it is not 100% the same as ping since ping only allows the '%' delimiter for IPv6 addresses (as per RFC4007). As you suggested, we could repeat the --host-traddr option (e.g. --host-traddr enp0s8 --host-traddr 192.168.56.103), but this is more impactful to the code than adding a separate --host-iface option. It's less about code-changes and more on adding a new user ABI, that is the reason why (at least I'm fully on board just yet). > EXAMPLE: Interface with several addresses assigned: > $ ip addr list dev enp0s8 > 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 ... > link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff > inet 192.168.56.101/24 brd 192.168.56.255 scope ... > valid_lft 426sec preferred_lft 426sec > inet 192.168.56.102/24 scope global secondary enp0s8 > valid_lft forever preferred_lft forever > inet 192.168.56.103/24 scope global secondary enp0s8 > valid_lft forever preferred_lft forever > inet 192.168.56.104/24 scope global secondary enp0s8 > valid_lft forever preferred_lft forever > > In the end, it doesn't really matter (to me) how it is implemented. However, a solution that have little to no impact on existing code would be nice. Just like ping, we need a way to specify an interface by its **interface name** (and not by its associated IP address), and we need to allow users to select which Source IP address to use when there are multiple addresses associated with an interface. The '%' may be confusing when it comes to other transports as well (e.g. rdma/fc would have to either reject or ignore it, but regardless of how we add it that would be the case). Having host-traddr accept either ip or interface seems the most desirable, however that won't work if there are 2 interfaces that share multiple ip addresses. So if this is a requirement we'll probably need to add --host-iface as another option... _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-10 18:13 ` Sagi Grimberg @ 2021-05-10 19:18 ` Belanger, Martin 2021-05-11 0:28 ` Sagi Grimberg 0 siblings, 1 reply; 23+ messages in thread From: Belanger, Martin @ 2021-05-10 19:18 UTC (permalink / raw) To: Sagi Grimberg, Hannes Reinecke, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch > >>> ping <dest-ip-addr>%<interface> > >> > >> Ping only supports this syntax for IPv6 no? > >> > >>> Extending this approach to nvme-cli we arrive to something like this: > >>> > >>> nvme discover --traddr 100.64.29.2%enp0s8 --host-traddr > >>> 192.168.56.102 > >> .... > >> > >> We already support this for IPv6, we can do that also for IPv4, but > >> this syntax may not be trivially expected for ipv4? > > > > I tried this for IPv6 and it doesn't work. Here's what I get: > > $ sudo nvme discover -g -G -t tcp -s 8009 -a fe80::800:27ff:fe00:0 > > Failed to write to /dev/nvme-fabrics: Invalid argument $ sudo nvme > > discover -g -G -t tcp -s 8009 -a fe80::800:27ff:fe00:0%enp0s8 Failed > > to write to /dev/nvme-fabrics: Invalid argument $ sudo nvme discover > > -g -G -t tcp -s 8009 -a [fe80::800:27ff:fe00:0] failed to resolve host > > [fe80::800:27ff:fe00:0] info $ sudo nvme discover -g -G -t tcp -s 8009 > > -a [fe80::800:27ff:fe00:0%enp0s8] failed to resolve host > > [fe80::800:27ff:fe00:0%enp0s8] info > > # nvme discover -t tcp -a fe80::5054:ff:fef1:9f3b -w > fe80::5054:ff:fe28:5edb%enp6s0 Thanks for clarifying the syntax. However, that doesn't work for me. # nvme discover -t tcp -a fe80::800:27ff:fe00:0 -w fe80::9266:4855:6cf2:f7e9%enp0s8 Failed to write to /dev/nvme-fabrics: Connection refused Note that the above syntax does not comply with RFC4007. The '%' delimiter is supposed to be appended to the Destination IP address and not the Source Address. In other words, to be RFC4007-compliant, the syntax should be (using your example): # nvme discover -t tcp -a fe80::5054:ff:fef1:9f3b%enp6s0 -w fe80::5054:ff:fe28:5edb This tells nvme-cli to connect to a controller at address fe80::5054:ff:fef1:9f3b using interface enp6s0 for the connection. And set the Source address to fe80::5054:ff:fe28:5edb. > > Discovery Log Number of Records 1, Generation counter 5 =====Discovery > Log Entry 0====== > trtype: tcp > adrfam: ipv6 > subtype: nvme subsystem > treq: not specified, sq flow control disable supported > portid: 3 > trsvcid: 8009 > subnqn: testnqn1 > traddr: fe80::5054:ff:fef1:9f3b%enp6s0 > sectype: none > > > > >> > >>> This tells nvme to connect to 100.64.29.2 on interface enp0s8. We > >>> make no > >> change to the --host-traddr option. It continues to be used to > >> specify the Source IP address only (for the rare cases where users > >> want to specify a Source Address other than the default). With this, > >> the interface is specified by name and not by its associated address. > >> This is not only more intuitive, but, as I stated before, eliminates > >> the problem caused by mapping the same IP address to multiple > >> interfaces (not to mention that doing a reverse lookup on an IP > >> address to find the interface is extra work that we don’t need to do in > kernel space). > >> > >> Maybe we do something like ping -I for host_traddr, from ping man > pages: > >> > >> -I interface > >> interface is either an address, an interface name or a > >> VRF name. If interface is an address, it sets source address to specified > interface address. > >> If interface is an > >> interface name, it sets source interface to specified > >> interface. If interface is a VRF name, each packet is routed using > >> the corresponding routing table; in this case, the -I > >> option can be repeated to specify a source address. NOTE: > >> For IPv6, when doing ping to a link-local scope address, link > >> specification (by the '%'-notation in destination, or > >> by this option) can be used but it is no longer required. > >> > >> > >> Without the repetition though, unless we need to support two > >> interfaces that share the same multiple addresses in the same subnet, > >> which sounds completely crazy to me... > > > > Hi Sagi, > > > > If we want to follow ping as an example, the repetition is needed not to > specify two interfaces, but to specify an interface and the source address. In > a previous example (reproduced below), I described a configuration where > an interface had several addresses assigned to it. By default, Linux always > picks the same Source address (i.e. 192.168.56.101 in this example) when > connecting. If a user wants a different source address they need a way to > specify it (currently with --host-traddr). Users also need a way to specify an > interface separately from the source address (either with a new option like -- > host-iface or by repeating --host-traddr). With the example below, if we > wanted to force ping to use interface enp0s8 and source address > 192.168.56.103, we would repeat the -I option, for example "ping -I enp0s8 -I > 192.168.56.103". We need a way to do the same with nvme-cli. > > > > I thought that introducing a new option, "--host-iface", had the smallest > impact since it requires less code changes, but that was turned down (not > sure exactly why). I then suggested that we use the '%' delimiter for IPv4 and > IPv6. I agree that it is not 100% the same as ping since ping only allows the > '%' delimiter for IPv6 addresses (as per RFC4007). As you suggested, we could > repeat the --host-traddr option (e.g. --host-traddr enp0s8 --host-traddr > 192.168.56.103), but this is more impactful to the code than adding a > separate --host-iface option. > > It's less about code-changes and more on adding a new user ABI, that is the > reason why (at least I'm fully on board just yet). > > > EXAMPLE: Interface with several addresses assigned: > > $ ip addr list dev enp0s8 > > 3: enp0s8: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 ... > > link/ether 08:00:27:4f:95:5c brd ff:ff:ff:ff:ff:ff > > inet 192.168.56.101/24 brd 192.168.56.255 scope ... > > valid_lft 426sec preferred_lft 426sec > > inet 192.168.56.102/24 scope global secondary enp0s8 > > valid_lft forever preferred_lft forever > > inet 192.168.56.103/24 scope global secondary enp0s8 > > valid_lft forever preferred_lft forever > > inet 192.168.56.104/24 scope global secondary enp0s8 > > valid_lft forever preferred_lft forever > > > > In the end, it doesn't really matter (to me) how it is implemented. > However, a solution that have little to no impact on existing code would be > nice. Just like ping, we need a way to specify an interface by its **interface > name** (and not by its associated IP address), and we need to allow users to > select which Source IP address to use when there are multiple addresses > associated with an interface. > > The '%' may be confusing when it comes to other transports as well (e.g. > rdma/fc would have to either reject or ignore it, but regardless of how we > add it that would be the case). Having host-traddr accept either ip or > interface seems the most desirable, however that won't work if there are 2 > interfaces that share multiple ip addresses. So if this is a requirement we'll > probably need to add --host-iface as another option... I don’t grok what you mean by "that won't work if there are 2 interfaces that share multiple ip addresses". Why not? If one specifies the interface by its name (e.g. enp0s8), there is no possible confusion even if multiple interfaces share the same IP addresses. The following are some examples of how nvme-cli should work to comply with RFC4007 and be consistent to the way ping operates. Example 1 - IPv4, Specify Interface with -w and let Linux select Source address: nvme discover -t tcp -a 192.168.1.9 -w enp0s8 Example 2 - IPv4, Specify Interface and Source address with repeated -w: nvme discover -t tcp -a 192.168.1.9 -w enp0s8 -w 192.168.56.103 Example 3 - IPv6, Specify Interface with'%' delimiter and let Linux select Source address: nvme discover -t tcp -a fe80::800:27ff:fe00:0%enp0s8 Example 4 - IPv6, Specify Interface with -w and let Linux select Source address: nvme discover -t tcp -a fe80::800:27ff:fe00:0 -w enp0s8 Example 5 - IPv6, Specify Interface with'%' delimiter and Source address with -w: nvme discover -t tcp -a fe80::800:27ff:fe00:0%enp0s8 -w fe80::9266:4855:6cf2:f7e9 Example 6 - IPv6, Specify Interface and Source address with repeated -w: nvme discover -t tcp -a fe80::800:27ff:fe00:0 -w enp0s8 -w fe80::9266:4855:6cf2:f7e9 Martin _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-10 19:18 ` Belanger, Martin @ 2021-05-11 0:28 ` Sagi Grimberg 2021-05-11 13:41 ` Belanger, Martin 0 siblings, 1 reply; 23+ messages in thread From: Sagi Grimberg @ 2021-05-11 0:28 UTC (permalink / raw) To: Belanger, Martin, Hannes Reinecke, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch >>>> We already support this for IPv6, we can do that also for IPv4, but >>>> this syntax may not be trivially expected for ipv4? >>> >>> I tried this for IPv6 and it doesn't work. Here's what I get: >>> $ sudo nvme discover -g -G -t tcp -s 8009 -a fe80::800:27ff:fe00:0 >>> Failed to write to /dev/nvme-fabrics: Invalid argument $ sudo nvme >>> discover -g -G -t tcp -s 8009 -a fe80::800:27ff:fe00:0%enp0s8 Failed >>> to write to /dev/nvme-fabrics: Invalid argument $ sudo nvme discover >>> -g -G -t tcp -s 8009 -a [fe80::800:27ff:fe00:0] failed to resolve host >>> [fe80::800:27ff:fe00:0] info $ sudo nvme discover -g -G -t tcp -s 8009 >>> -a [fe80::800:27ff:fe00:0%enp0s8] failed to resolve host >>> [fe80::800:27ff:fe00:0%enp0s8] info >> >> # nvme discover -t tcp -a fe80::5054:ff:fef1:9f3b -w >> fe80::5054:ff:fe28:5edb%enp6s0 > > Thanks for clarifying the syntax. However, that doesn't work for me. > > # nvme discover -t tcp -a fe80::800:27ff:fe00:0 -w fe80::9266:4855:6cf2:f7e9%enp0s8 > Failed to write to /dev/nvme-fabrics: Connection refused Are you using the linux target? connection refused means that you don't have a listener on it, it's not a resolution error. did you have the target listen on fe80::800:27ff:fe00:0%<intf> ? > > Note that the above syntax does not comply with RFC4007. The '%' delimiter is supposed to be appended to the Destination IP address and not the Source Address. In other words, to be RFC4007-compliant, the syntax should be (using your example): > > # nvme discover -t tcp -a fe80::5054:ff:fef1:9f3b%enp6s0 -w fe80::5054:ff:fe28:5edb > > This tells nvme-cli to connect to a controller at address fe80::5054:ff:fef1:9f3b using interface enp6s0 for the connection. And set the Source address to fe80::5054:ff:fe28:5edb. This also seems to work, not sure that it does what we want though... nvme discover -t tcp -a fe80::5054:ff:fef1:9f3b%enp6s0 -w fe80::5054:ff:fe28:5edb%enp6s0 Discovery Log Number of Records 1, Generation counter 5 =====Discovery Log Entry 0====== trtype: tcp adrfam: ipv6 subtype: nvme subsystem treq: not specified, sq flow control disable supported portid: 3 trsvcid: 8009 subnqn: testnqn1 traddr: fe80::5054:ff:fef1:9f3b%enp6s0 sectype: none >> The '%' may be confusing when it comes to other transports as well (e.g. >> rdma/fc would have to either reject or ignore it, but regardless of how we >> add it that would be the case). Having host-traddr accept either ip or >> interface seems the most desirable, however that won't work if there are 2 >> interfaces that share multiple ip addresses. So if this is a requirement we'll >> probably need to add --host-iface as another option... > > I don’t grok what you mean by "that won't work if there are 2 interfaces that share multiple ip addresses". Why not? If one specifies the interface by its name (e.g. enp0s8), there is no possible confusion even if multiple interfaces share the same IP addresses. > > The following are some examples of how nvme-cli should work to comply with RFC4007 and be consistent to the way ping operates. > Example 1 - IPv4, Specify Interface with -w and let Linux select Source address: > nvme discover -t tcp -a 192.168.1.9 -w enp0s8 > > Example 2 - IPv4, Specify Interface and Source address with repeated -w: > nvme discover -t tcp -a 192.168.1.9 -w enp0s8 -w 192.168.56.103 I meant without the repetitions, which you only need if you have 2 devices that share more than one address, which again, is not a clear use-case to me, but without repetitions we won't support that. > Example 3 - IPv6, Specify Interface with'%' delimiter and let Linux select Source address: > nvme discover -t tcp -a fe80::800:27ff:fe00:0%enp0s8 > > Example 4 - IPv6, Specify Interface with -w and let Linux select Source address: > nvme discover -t tcp -a fe80::800:27ff:fe00:0 -w enp0s8 > > Example 5 - IPv6, Specify Interface with'%' delimiter and Source address with -w: > nvme discover -t tcp -a fe80::800:27ff:fe00:0%enp0s8 -w fe80::9266:4855:6cf2:f7e9 > > Example 6 - IPv6, Specify Interface and Source address with repeated -w: > nvme discover -t tcp -a fe80::800:27ff:fe00:0 -w enp0s8 -w fe80::9266:4855:6cf2:f7e9 > > Martin > _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-11 0:28 ` Sagi Grimberg @ 2021-05-11 13:41 ` Belanger, Martin 2021-05-11 17:13 ` Sagi Grimberg 0 siblings, 1 reply; 23+ messages in thread From: Belanger, Martin @ 2021-05-11 13:41 UTC (permalink / raw) To: Sagi Grimberg, Hannes Reinecke, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch > >>>> We already support this for IPv6, we can do that also for IPv4, but > >>>> this syntax may not be trivially expected for ipv4? > >>> > >>> I tried this for IPv6 and it doesn't work. Here's what I get: > >>> $ sudo nvme discover -g -G -t tcp -s 8009 -a fe80::800:27ff:fe00:0 > >>> Failed to write to /dev/nvme-fabrics: Invalid argument $ sudo nvme > >>> discover -g -G -t tcp -s 8009 -a fe80::800:27ff:fe00:0%enp0s8 Failed > >>> to write to /dev/nvme-fabrics: Invalid argument $ sudo nvme discover > >>> -g -G -t tcp -s 8009 -a [fe80::800:27ff:fe00:0] failed to resolve > >>> host [fe80::800:27ff:fe00:0] info $ sudo nvme discover -g -G -t tcp > >>> -s 8009 -a [fe80::800:27ff:fe00:0%enp0s8] failed to resolve host > >>> [fe80::800:27ff:fe00:0%enp0s8] info > >> > >> # nvme discover -t tcp -a fe80::5054:ff:fef1:9f3b -w > >> fe80::5054:ff:fe28:5edb%enp6s0 > > > > Thanks for clarifying the syntax. However, that doesn't work for me. > > > > # nvme discover -t tcp -a fe80::800:27ff:fe00:0 -w > > fe80::9266:4855:6cf2:f7e9%enp0s8 Failed to write to /dev/nvme-fabrics: > > Connection refused > > Are you using the linux target? connection refused means that you don't > have a listener on it, it's not a resolution error. > > did you have the target listen on fe80::800:27ff:fe00:0%<intf> ? Doh! You are correct. In my setup, I run the nvme-cli client on a VM and I run the target (nvmet) on the host computer. I had nvmet configured for "0.0.0.0" instead of "::" (i.e. listen on all interfaces). After changing nvmet's configuration, I was able to query the discovery log pages, using this syntax: nvme discover -t tcp -a fe80::800:27ff:fe00:0 -w fe80::9266:4855:6cf2:f7ea%enp0s8 Note that it doesn't work when I append the interface to the Destination IP address as per RFC4007 (like ping) as follows. nvme discover -t tcp -a fe80::800:27ff:fe00:0%enp0s8 -w fe80::9266:4855:6cf2:f7ea > > > > > Note that the above syntax does not comply with RFC4007. The '%' > delimiter is supposed to be appended to the Destination IP address and not > the Source Address. In other words, to be RFC4007-compliant, the syntax > should be (using your example): > > > > # nvme discover -t tcp -a fe80::5054:ff:fef1:9f3b%enp6s0 -w > > fe80::5054:ff:fe28:5edb > > > > This tells nvme-cli to connect to a controller at address > fe80::5054:ff:fef1:9f3b using interface enp6s0 for the connection. And set the > Source address to fe80::5054:ff:fe28:5edb. > > This also seems to work, not sure that it does what we want though... > nvme discover -t tcp -a fe80::5054:ff:fef1:9f3b%enp6s0 -w > fe80::5054:ff:fe28:5edb%enp6s0 > > Discovery Log Number of Records 1, Generation counter 5 =====Discovery > Log Entry 0====== > trtype: tcp > adrfam: ipv6 > subtype: nvme subsystem > treq: not specified, sq flow control disable supported > portid: 3 > trsvcid: 8009 > subnqn: testnqn1 > traddr: fe80::5054:ff:fef1:9f3b%enp6s0 > sectype: none > > > >> The '%' may be confusing when it comes to other transports as well (e.g. > >> rdma/fc would have to either reject or ignore it, but regardless of > >> how we add it that would be the case). Having host-traddr accept > >> either ip or interface seems the most desirable, however that won't > >> work if there are 2 interfaces that share multiple ip addresses. So > >> if this is a requirement we'll probably need to add --host-iface as another > option... > > > > I don’t grok what you mean by "that won't work if there are 2 interfaces > that share multiple ip addresses". Why not? If one specifies the interface by > its name (e.g. enp0s8), there is no possible confusion even if multiple > interfaces share the same IP addresses. > > > > The following are some examples of how nvme-cli should work to comply > with RFC4007 and be consistent to the way ping operates. > > Example 1 - IPv4, Specify Interface with -w and let Linux select Source > address: > > nvme discover -t tcp -a 192.168.1.9 -w enp0s8 > > > > Example 2 - IPv4, Specify Interface and Source address with repeated -w: > > nvme discover -t tcp -a 192.168.1.9 -w enp0s8 -w 192.168.56.103 > > I meant without the repetitions, which you only need if you have 2 devices > that share more than one address, which again, is not a clear use-case to > me, but without repetitions we won't support that. I've been thinking about what you said regarding the need to repeat the -w option when two interfaces share the same IP address. I think we're looking at the problem from a different point of view. The current implementation uses an IP address to identify an interface. I, on the other hand, believe that the best way to identify an interface is by its "interface name or index". In previous emails, I provided examples of the problems that may occur when using an IP address to identify an interface. For example, one can assign the same IP address to different interfaces making it impossible to distinguish interfaces by their IP address alone. Another example is that the low level APIs (e.g. setsockopt(SO_BINDTODEVICE) don’t even require the source IP address. They only need the interface name/index. So, why go through the trouble of performing a reverse address lookup to retrieve the interface name/index when the address is not used at all? By the way, if nvme-cli/linux-nvme allowed specifying interfaces by name/index, then we would not really need to repeat the -w option unless we also wanted to set the source address at the same time. Setting the source address is a completely different thing from setting the interface. One should be allowed to set one independently from the other, or both, or none. If you look at how ping is implemented, they do not infer the interface from the IP address. If one wants to force ping to go over an interface, then one must provide the interface by name/index using the -I option. If one wants to change the source IP address (without forcing a specific interface), then one provides the IP address to the -I option. It's simple and intuitive. And ping also supports appending the interface to the Destination IP using the '%' delimiter for IPv6-only as per RFC4007. I think that nvme-cli/linux-nvme should follow the ping approach. Interfaces should never be inferred from source IP addresses, but instead be clearly identified by their name or index. And setting the source address should be independent from setting the interface. Regards, Martin > > > Example 3 - IPv6, Specify Interface with'%' delimiter and let Linux select > Source address: > > nvme discover -t tcp -a fe80::800:27ff:fe00:0%enp0s8 > > > > Example 4 - IPv6, Specify Interface with -w and let Linux select Source > address: > > nvme discover -t tcp -a fe80::800:27ff:fe00:0 -w enp0s8 > > > > Example 5 - IPv6, Specify Interface with'%' delimiter and Source address > with -w: > > nvme discover -t tcp -a fe80::800:27ff:fe00:0%enp0s8 -w > > fe80::9266:4855:6cf2:f7e9 > > > > Example 6 - IPv6, Specify Interface and Source address with repeated -w: > > nvme discover -t tcp -a fe80::800:27ff:fe00:0 -w enp0s8 -w > > fe80::9266:4855:6cf2:f7e9 > > > > Martin > > _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-11 13:41 ` Belanger, Martin @ 2021-05-11 17:13 ` Sagi Grimberg 2021-05-12 6:09 ` Hannes Reinecke 0 siblings, 1 reply; 23+ messages in thread From: Sagi Grimberg @ 2021-05-11 17:13 UTC (permalink / raw) To: Belanger, Martin, Hannes Reinecke, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch > I've been thinking about what you said regarding the need to repeat the -w option when two interfaces share the same IP address. I think we're looking at the problem from a different point of view. The current implementation uses an IP address to identify an interface. I, on the other hand, believe that the best way to identify an interface is by its "interface name or index". In previous emails, I provided examples of the problems that may occur when using an IP address to identify an interface. For example, one can assign the same IP address to different interfaces making it impossible to distinguish interfaces by their IP address alone. Another example is that the low level APIs (e.g. setsockopt(SO_BINDTODEVICE) don’t even require the source IP address. They only need the interface name/index. So, why go through the trouble of performing a reverse address lookup to retrieve the interface name/index when the address is not used at all? > > By the way, if nvme-cli/linux-nvme allowed specifying interfaces by name/index, then we would not really need to repeat the -w option unless we also wanted to set the source address at the same time. Setting the source address is a completely different thing from setting the interface. One should be allowed to set one independently from the other, or both, or none. > > If you look at how ping is implemented, they do not infer the interface from the IP address. If one wants to force ping to go over an interface, then one must provide the interface by name/index using the -I option. If one wants to change the source IP address (without forcing a specific interface), then one provides the IP address to the -I option. It's simple and intuitive. And ping also supports appending the interface to the Destination IP using the '%' delimiter for IPv6-only as per RFC4007. > > I think that nvme-cli/linux-nvme should follow the ping approach. Interfaces should never be inferred from source IP addresses, but instead be clearly identified by their name or index. And setting the source address should be independent from setting the interface. I'm starting to think that we are going in circles, I'm getting to the point that having host_iface is the right way to go. We can have nvme-cli convert <addr>%iface notation to "..,host_traddr=<addr>,host_iface=<iface>,.." when creating the controller string... _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-11 17:13 ` Sagi Grimberg @ 2021-05-12 6:09 ` Hannes Reinecke 2021-05-12 12:12 ` Belanger, Martin 0 siblings, 1 reply; 23+ messages in thread From: Hannes Reinecke @ 2021-05-12 6:09 UTC (permalink / raw) To: Sagi Grimberg, Belanger, Martin, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch On 5/11/21 7:13 PM, Sagi Grimberg wrote: > >> I've been thinking about what you said regarding the need to repeat >> the -w option when two interfaces share the same IP address. I think >> we're looking at the problem from a different point of view. The >> current implementation uses an IP address to identify an interface. I, >> on the other hand, believe that the best way to identify an interface >> is by its "interface name or index". In previous emails, I provided >> examples of the problems that may occur when using an IP address to >> identify an interface. For example, one can assign the same IP address >> to different interfaces making it impossible to distinguish interfaces >> by their IP address alone. Another example is that the low level APIs >> (e.g. setsockopt(SO_BINDTODEVICE) don’t even require the source IP >> address. They only need the interface name/index. So, why go through >> the trouble of performing a reverse address lookup to retrieve the >> interface name/index when the address is not used at all? >> >> By the way, if nvme-cli/linux-nvme allowed specifying interfaces by >> name/index, then we would not really need to repeat the -w option >> unless we also wanted to set the source address at the same time. >> Setting the source address is a completely different thing from >> setting the interface. One should be allowed to set one independently >> from the other, or both, or none. >> >> If you look at how ping is implemented, they do not infer the >> interface from the IP address. If one wants to force ping to go over >> an interface, then one must provide the interface by name/index using >> the -I option. If one wants to change the source IP address (without >> forcing a specific interface), then one provides the IP address to the >> -I option. It's simple and intuitive. And ping also supports appending >> the interface to the Destination IP using the '%' delimiter for >> IPv6-only as per RFC4007. >> >> I think that nvme-cli/linux-nvme should follow the ping approach. >> Interfaces should never be inferred from source IP addresses, but >> instead be clearly identified by their name or index. And setting the >> source address should be independent from setting the interface. > > I'm starting to think that we are going in circles, I'm getting to > the point that having host_iface is the right way to go. > > We can have nvme-cli convert <addr>%iface notation to > "..,host_traddr=<addr>,host_iface=<iface>,.." when creating the > controller string... That was my thinking, too; the only _other_ way would be to have a host_traddr _imply_ a binding to the underlying interface. We could define that, but having a single option doing _two_ different things is not good software design. So let's do the 'host_iface' thingie. Cheers, Hannes -- Dr. Hannes Reinecke Kernel Storage Architect hare@suse.de +49 911 74053 688 SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg HRB 36809 (AG Nürnberg), Geschäftsführer: Felix Imendörffer _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-12 6:09 ` Hannes Reinecke @ 2021-05-12 12:12 ` Belanger, Martin 2021-05-12 22:12 ` Sagi Grimberg 0 siblings, 1 reply; 23+ messages in thread From: Belanger, Martin @ 2021-05-12 12:12 UTC (permalink / raw) To: Hannes Reinecke, Sagi Grimberg, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch > On 5/11/21 7:13 PM, Sagi Grimberg wrote: > > > >> I've been thinking about what you said regarding the need to repeat > >> the -w option when two interfaces share the same IP address. I think > >> we're looking at the problem from a different point of view. The > >> current implementation uses an IP address to identify an interface. > >> I, on the other hand, believe that the best way to identify an > >> interface is by its "interface name or index". In previous emails, I > >> provided examples of the problems that may occur when using an IP > >> address to identify an interface. For example, one can assign the > >> same IP address to different interfaces making it impossible to > >> distinguish interfaces by their IP address alone. Another example is > >> that the low level APIs (e.g. setsockopt(SO_BINDTODEVICE) don’t even > >> require the source IP address. They only need the interface > >> name/index. So, why go through the trouble of performing a reverse > >> address lookup to retrieve the interface name/index when the address is > not used at all? > >> > >> By the way, if nvme-cli/linux-nvme allowed specifying interfaces by > >> name/index, then we would not really need to repeat the -w option > >> unless we also wanted to set the source address at the same time. > >> Setting the source address is a completely different thing from > >> setting the interface. One should be allowed to set one independently > >> from the other, or both, or none. > >> > >> If you look at how ping is implemented, they do not infer the > >> interface from the IP address. If one wants to force ping to go over > >> an interface, then one must provide the interface by name/index using > >> the -I option. If one wants to change the source IP address (without > >> forcing a specific interface), then one provides the IP address to > >> the -I option. It's simple and intuitive. And ping also supports > >> appending the interface to the Destination IP using the '%' delimiter > >> for IPv6-only as per RFC4007. > >> > >> I think that nvme-cli/linux-nvme should follow the ping approach. > >> Interfaces should never be inferred from source IP addresses, but > >> instead be clearly identified by their name or index. And setting the > >> source address should be independent from setting the interface. > > > > I'm starting to think that we are going in circles, I'm getting to the > > point that having host_iface is the right way to go. > > > > We can have nvme-cli convert <addr>%iface notation to > > "..,host_traddr=<addr>,host_iface=<iface>,.." when creating the > > controller string... > > That was my thinking, too; the only _other_ way would be to have a > host_traddr _imply_ a binding to the underlying interface. > We could define that, but having a single option doing _two_ different > things is not good software design. > So let's do the 'host_iface' thingie. > > Cheers, > > Hannes > -- > Dr. Hannes Reinecke Kernel Storage Architect > hare@suse.de +49 911 74053 688 > SUSE Software Solutions GmbH, Maxfeldstr. 5, 90409 Nürnberg HRB 36809 > (AG Nürnberg), Geschäftsführer: Felix Imendörffer Thanks Hannes and Sagi. I will resubmit the 'host_iface' patch after running more IPv6 tests. Sagi made me realize that IPv6 has a few subtle differences and I need to double check that the current host_iface implementation works as well for IPv6 as it does for IPv4. Martin _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting. 2021-05-12 12:12 ` Belanger, Martin @ 2021-05-12 22:12 ` Sagi Grimberg 0 siblings, 0 replies; 23+ messages in thread From: Sagi Grimberg @ 2021-05-12 22:12 UTC (permalink / raw) To: Belanger, Martin, Hannes Reinecke, Martin Belanger, linux-nvme Cc: kbusch, axboe, hch > Thanks Hannes and Sagi. I will resubmit the 'host_iface' patch after running more IPv6 tests. Sagi made me realize that IPv6 has a few subtle differences and I need to double check that the current host_iface implementation works as well for IPv6 as it does for IPv4. Cool, thanks. _______________________________________________ Linux-nvme mailing list Linux-nvme@lists.infradead.org http://lists.infradead.org/mailman/listinfo/linux-nvme ^ permalink raw reply [flat|nested] 23+ messages in thread
end of thread, other threads:[~2021-05-12 22:12 UTC | newest] Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- [not found] <20210415192848.962891-1-nitram_67@hotmail.com> 2021-04-15 19:28 ` [PATCH 1/1] Add 'Transport Interface' (triface) option. This can be used to specify the IP interface to use for the connection. The driver uses that to set SO_BINDTODEVICE on the socket before connecting Martin Belanger 2021-05-01 11:34 ` Hannes Reinecke 2021-05-03 16:59 ` Belanger, Martin 2021-05-04 13:25 ` Hannes Reinecke 2021-05-04 19:56 ` Sagi Grimberg 2021-05-05 8:47 ` Hannes Reinecke 2021-05-05 14:31 ` Belanger, Martin 2021-05-05 18:33 ` James Smart 2021-05-05 20:32 ` Sagi Grimberg 2021-05-06 18:27 ` Michael Christie 2021-05-06 6:05 ` Hannes Reinecke 2021-05-06 7:00 ` Hannes Reinecke 2021-05-06 15:46 ` Belanger, Martin 2021-05-07 18:20 ` Sagi Grimberg 2021-05-10 13:49 ` Belanger, Martin 2021-05-10 18:13 ` Sagi Grimberg 2021-05-10 19:18 ` Belanger, Martin 2021-05-11 0:28 ` Sagi Grimberg 2021-05-11 13:41 ` Belanger, Martin 2021-05-11 17:13 ` Sagi Grimberg 2021-05-12 6:09 ` Hannes Reinecke 2021-05-12 12:12 ` Belanger, Martin 2021-05-12 22:12 ` Sagi Grimberg
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).