All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
@ 2009-10-05 17:43 Sean Hefty
       [not found] ` <F0EFC2D8E6A340D48497497670C5969C-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-05 17:43 UTC (permalink / raw)
  To: linux-rdma, Roland Dreier; +Cc: Hefty, Sean

Export rdma_set_ib_paths to user space to allow applications to
manually set the IB path used for connections.  This allows
alternative ways for a user space application or library to obtain
path record information, including retrieving path information
from cached data, avoiding direct interaction with the IB SA.
The IB SA is a single, centralized entity that can limit scaling
on large clusters running MPI applications.

Signed-off-by: Sean Hefty <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
I'd like to get feedback on this approach with the possibility of merging
for 2.6.33.

 drivers/infiniband/core/ucma.c |   40 ++++++++++++++++++++++++++++++++++++++++
 include/rdma/rdma_user_cm.h    |    7 +++++--
 2 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 4346a24..1359727 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -42,6 +42,7 @@
 #include <rdma/rdma_user_cm.h>
 #include <rdma/ib_marshall.h>
 #include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
@@ -811,6 +812,42 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
 	return ret;
 }
 
+static int ucma_set_ib_path(struct ucma_context *ctx,
+			    struct ib_user_path_rec *upath, size_t optlen)
+{
+	struct ib_sa_path_rec sa_path;
+	struct rdma_cm_event event;
+	int ret;
+
+	if (optlen != sizeof(*upath))
+		return -EINVAL;
+
+	ib_copy_path_rec_from_user(&sa_path, upath);
+	ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
+	if (ret)
+		return ret;
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	return ucma_event_handler(ctx->cm_id, &event);
+}
+
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	int ret;
+
+	switch (optname) {
+	case RDMA_OPTION_IB_PATH:
+		ret = ucma_set_ib_path(ctx, optval, optlen);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
 static int ucma_set_option_level(struct ucma_context *ctx, int level,
 				 int optname, void *optval, size_t optlen)
 {
@@ -820,6 +857,9 @@ static int ucma_set_option_level(struct ucma_context *ctx, int level,
 	case RDMA_OPTION_ID:
 		ret = ucma_set_option_id(ctx, optname, optval, optlen);
 		break;
+	case RDMA_OPTION_IB:
+		ret = ucma_set_option_ib(ctx, optname, optval, optlen);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h
index c557054..d7829f4 100644
--- a/include/rdma/rdma_user_cm.h
+++ b/include/rdma/rdma_user_cm.h
@@ -215,12 +215,15 @@ struct rdma_ucm_event_resp {
 
 /* Option levels */
 enum {
-	RDMA_OPTION_ID		= 0
+	RDMA_OPTION_ID		= 0,
+	RDMA_OPTION_IB		= 1
 };
 
 /* Option details */
 enum {
-	RDMA_OPTION_ID_TOS	= 0
+	RDMA_OPTION_ID_TOS	= 0,
+
+	RDMA_OPTION_IB_PATH	= 1
 };
 
 struct rdma_ucm_set_option {



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 76+ messages in thread

* [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found] ` <F0EFC2D8E6A340D48497497670C5969C-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-05 17:45   ` Sean Hefty
       [not found]     ` <F451C333D8CB45E4B4642C6BD1EDD3C3-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-05 17:56   ` [PATCH 1/2] rdma/cm: support option to allow manually setting IB path Jason Gunthorpe
  2009-10-09 21:48   ` Sean Hefty
  2 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-05 17:45 UTC (permalink / raw)
  To: linux-rdma, Roland Dreier; +Cc: Hefty, Sean

Provide an option for users to manually specify the socket address
to DGID mapping on InfiniBand.  Currently, all mappings are done
using ipoib, and involve ARP.  This will not work across IP subnets,
and alternative mechanisms of resolving the mapping are being explored.
The latter can be more efficient if combined with route resolution
as well.

Signed-off-by: Sean Hefty <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---

 drivers/infiniband/core/cma.c  |   15 +++++++++++++++
 drivers/infiniband/core/ucma.c |   23 +++++++++++++++++++++++
 include/rdma/rdma_cm_ib.h      |   14 ++++++++++++++
 include/rdma/rdma_user_cm.h    |    6 ++++++
 4 files changed, 58 insertions(+), 0 deletions(-)

diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
index 0753178..9adf8fb 100644
--- a/drivers/infiniband/core/cma.c
+++ b/drivers/infiniband/core/cma.c
@@ -1919,6 +1919,21 @@ err:
 }
 EXPORT_SYMBOL(rdma_resolve_addr);
 
+int rdma_set_ib_dest(struct rdma_cm_id *id, struct sockaddr *dst_addr,
+		     union ib_gid *dgid)
+{
+	struct rdma_id_private *id_priv;
+
+	id_priv = container_of(id, struct rdma_id_private, id);
+	if (!cma_comp_exch(id_priv, CMA_ADDR_BOUND, CMA_ADDR_RESOLVED))
+		return -EINVAL;
+
+	memcpy(&id->route.addr.dst_addr, dst_addr, ip_addr_size(dst_addr));
+	ib_addr_set_dgid(&id->route.addr.dev_addr, dgid);
+	return 0;
+}
+EXPORT_SYMBOL(rdma_set_ib_dest);
+
 static void cma_bind_port(struct rdma_bind_list *bind_list,
 			  struct rdma_id_private *id_priv)
 {
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 1359727..3a252e6 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -832,12 +832,35 @@ static int ucma_set_ib_path(struct ucma_context *ctx,
 	return ucma_event_handler(ctx->cm_id, &event);
 }
 
+static int ucma_set_ib_dest(struct ucma_context *ctx,
+			    struct rdma_ucm_ib_dest *ib_dest, size_t optlen)
+{
+	union ib_gid dgid;
+	struct rdma_cm_event event;
+	int ret;
+
+	if (optlen < sizeof(*ib_dest))
+		return -EINVAL;
+
+	memcpy(&dgid, ib_dest->dgid, sizeof dgid);
+	ret = rdma_set_ib_dest(ctx->cm_id, (struct sockaddr *) &ib_dest->dst_addr, &dgid);
+	if (ret)
+		return ret;
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+	return ucma_event_handler(ctx->cm_id, &event);
+}
+
 static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
 			      void *optval, size_t optlen)
 {
 	int ret;
 
 	switch (optname) {
+	case RDMA_OPTION_IB_DEST:
+		ret = ucma_set_ib_dest(ctx, optval, optlen);
+		break;
 	case RDMA_OPTION_IB_PATH:
 		ret = ucma_set_ib_path(ctx, optval, optlen);
 		break;
diff --git a/include/rdma/rdma_cm_ib.h b/include/rdma/rdma_cm_ib.h
index 2389c3b..7326d35 100644
--- a/include/rdma/rdma_cm_ib.h
+++ b/include/rdma/rdma_cm_ib.h
@@ -48,6 +48,20 @@
 int rdma_set_ib_paths(struct rdma_cm_id *id,
 		      struct ib_sa_path_rec *path_rec, int num_paths);
 
+/**
+ * rdma_set_ib_dest - Manually set the destination address
+ * @id: Connection identifier associated with the request.
+ * @dst_addr: Destination address information.
+ * @dgid: Destination device address information.
+ *
+ * This call allows the user to specify address mappings for rdma_cm_id's
+ * bound to an Infiniband device.  It is called on the client side of a
+ * connection and combined with rdma_bind_addr, replaces the call to
+ * rdma_resolve_addr.
+ */
+int rdma_set_ib_dest(struct rdma_cm_id *id, struct sockaddr *dst_addr,
+		     union ib_gid *dgid);
+
 /* Global qkey for UDP QPs and multicast groups. */
 #define RDMA_UDP_QKEY 0x01234567
 
diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h
index d7829f4..a908b89 100644
--- a/include/rdma/rdma_user_cm.h
+++ b/include/rdma/rdma_user_cm.h
@@ -223,6 +223,7 @@ enum {
 enum {
 	RDMA_OPTION_ID_TOS	= 0,
 
+	RDMA_OPTION_IB_DEST	= 0,
 	RDMA_OPTION_IB_PATH	= 1
 };
 
@@ -234,6 +235,11 @@ struct rdma_ucm_set_option {
 	__u32 optlen;
 };
 
+struct rdma_ucm_ib_dest {
+	__u8 dgid[16];
+	struct sockaddr_in6 dst_addr;
+};
+
 struct rdma_ucm_migrate_id {
 	__u64 response;
 	__u32 id;



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 76+ messages in thread

* Re: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found] ` <F0EFC2D8E6A340D48497497670C5969C-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-05 17:45   ` [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping Sean Hefty
@ 2009-10-05 17:56   ` Jason Gunthorpe
       [not found]     ` <20091005175656.GK5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2009-10-09 21:48   ` Sean Hefty
  2 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-05 17:56 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma, Roland Dreier

On Mon, Oct 05, 2009 at 10:43:44AM -0700, Sean Hefty wrote:
> Export rdma_set_ib_paths to user space to allow applications to
> manually set the IB path used for connections.  This allows
> alternative ways for a user space application or library to obtain
> path record information, including retrieving path information
> from cached data, avoiding direct interaction with the IB SA.
> The IB SA is a single, centralized entity that can limit scaling
> on large clusters running MPI applications.

Um, isn't this kind of low level control exactly why we have the IB
CM?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]     ` <20091005175656.GK5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-05 18:08       ` Sean Hefty
       [not found]         ` <F7D418716F3A4A0DACE42CC449624298-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-05 18:08 UTC (permalink / raw)
  To: 'Jason Gunthorpe'; +Cc: linux-rdma, Roland Dreier

>On Mon, Oct 05, 2009 at 10:43:44AM -0700, Sean Hefty wrote:
>> Export rdma_set_ib_paths to user space to allow applications to
>> manually set the IB path used for connections.  This allows
>> alternative ways for a user space application or library to obtain
>> path record information, including retrieving path information
>> from cached data, avoiding direct interaction with the IB SA.
>> The IB SA is a single, centralized entity that can limit scaling
>> on large clusters running MPI applications.
>
>Um, isn't this kind of low level control exactly why we have the IB
>CM?

There are very few apps that I'm aware of (one to be precise) that have coded
directly to the libibcm.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]         ` <F7D418716F3A4A0DACE42CC449624298-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-05 18:15           ` Jason Gunthorpe
       [not found]             ` <20091005181525.GL5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-05 18:15 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma, Roland Dreier

On Mon, Oct 05, 2009 at 11:08:51AM -0700, Sean Hefty wrote:
> >On Mon, Oct 05, 2009 at 10:43:44AM -0700, Sean Hefty wrote:
> >> Export rdma_set_ib_paths to user space to allow applications to
> >> manually set the IB path used for connections.  This allows
> >> alternative ways for a user space application or library to obtain
> >> path record information, including retrieving path information
> >> from cached data, avoiding direct interaction with the IB SA.
> >> The IB SA is a single, centralized entity that can limit scaling
> >> on large clusters running MPI applications.
> >
> >Um, isn't this kind of low level control exactly why we have the IB
> >CM?
> 
> There are very few apps that I'm aware of (one to be precise) that have coded
> directly to the libibcm.

I've done several..

But that isn't really the point, IB CM already provides this API so,
why corrupt the RDMA CM abstraction with this?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]             ` <20091005181525.GL5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-05 19:20               ` Sean Hefty
  2009-10-06 15:01               ` Todd Rimmer
  1 sibling, 0 replies; 76+ messages in thread
From: Sean Hefty @ 2009-10-05 19:20 UTC (permalink / raw)
  To: 'Jason Gunthorpe'; +Cc: linux-rdma, Roland Dreier

>But that isn't really the point, IB CM already provides this API so,
>why corrupt the RDMA CM abstraction with this?

The RDMA CM abstraction does not work on larger cluster sizes.  Either we can
fix the RDMA CM or force all RDMA CM applications to be re-written and become
transport aware.  This allows the librdmacm to implement a fix and communicate
the necessary information to the kernel for connection establishment using
generic mechanisms and an existing RDMA CM API.  (One alternative is for the
kernel RDMA CM to communicate directly with the IB ACM or other external
service, but I'd like to see much wider testing, acceptance, and standardization
before attempting that.)  I could not find an efficient way of implementing the
entire solution in user space because of the event reporting and port space
handling.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]     ` <F451C333D8CB45E4B4642C6BD1EDD3C3-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-06  8:00       ` Or Gerlitz
       [not found]         ` <4ACAF913.3050909-smomgflXvOZWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Or Gerlitz @ 2009-10-06  8:00 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma, Roland Dreier

Sean Hefty wrote:
> Provide an option for user's to manually specify the socket address to DGID mapping on InfiniBand.  Currently, all mappings are done using ipoib, and involve ARP.  This will not work across IP subnets, and alternative mechanisms of resolving the mapping are being explored. The latter can be more efficient if combined with route resolution as well.
Sean, 

If I understand correctly, your suggested changes are to optionally let an
application to - instead of the following sequence of calls

rdma_resolve_addr  / addr resolved event
rdma_create_qp
rdma_resolve_route  / route resolved event
rdma_connect / cm events

do

rdma_set_ib_path
rdma_create_qp
rdma_connect / cm events

So in that respect, I am not sure how rdma_set_ib_dest serves you. Further,
rdma_resolve_addr does three resolutions

1. the local device and source gid
2. the PKEY (VLAN) to use
3. the destination gid

so in that respect, rdma_set_ib_path replaces both rdma_resolve_addr and
rdma_resolve_route?

I would prefer to have a solution where the app flow isn't touched, 
something like the kernel rdma-cm to communicate with the user space ACM 
daemon to get address and route resolutions.  Does such a design make
sense to you?


Or

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]             ` <20091005181525.GL5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2009-10-05 19:20               ` Sean Hefty
@ 2009-10-06 15:01               ` Todd Rimmer
       [not found]                 ` <5AEC2602AE03EB46BFC16C6B9B200DA8168EFD82BA-e4KNYiSEog6Xx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
  1 sibling, 1 reply; 76+ messages in thread
From: Todd Rimmer @ 2009-10-06 15:01 UTC (permalink / raw)
  To: Jason Gunthorpe, Sean Hefty; +Cc: linux-rdma, Roland Dreier

> From: Jason Gunthorpe
> Sent: Monday, October 05, 2009 2:15 PM
> To: Sean Hefty
> Cc: linux-rdma; Roland Dreier
> Subject: Re: [PATCH 1/2] rdma/cm: support option to allow manually
> setting IB path
> 
> On Mon, Oct 05, 2009 at 11:08:51AM -0700, Sean Hefty wrote:
> > >On Mon, Oct 05, 2009 at 10:43:44AM -0700, Sean Hefty wrote:
> > >> Export rdma_set_ib_paths to user space to allow applications to
> > >> manually set the IB path used for connections.  This allows
> > >> alternative ways for a user space application or library to obtain
> > >> path record information, including retrieving path information
> > >> from cached data, avoiding direct interaction with the IB SA.
> > >> The IB SA is a single, centralized entity that can limit scaling
> > >> on large clusters running MPI applications.
> > >
> > >Um, isn't this kind of low level control exactly why we have the IB
> > >CM?
> >
> > There are very few apps that I'm aware of (one to be precise) that
> have coded
> > directly to the libibcm.
> 
> I've done several..
> 
> But that isn't really the point, IB CM already provides this API so,
> why corrupt the RDMA CM abstraction with this?
> 
> Jason


Ideally the best approach would be to have a mux at the ib_mad level.  We could allow a user space application to intercept all outbound MADs for a given class and/or attribute.  Unlike the present "snooping" of MADs, this would literally be an interception.  This would provide a number of key advantages:

1. outbound queries from all sources (ib_cm, rdma_cm, kernel ULPs, user space applications, saquery tool, etc) could all be intercepted and processed the same way

2. The "cache" could be in user space and be optional.  If the cache application is not running, the intercept is disabled and MADs flow as they do now.  However when the cache application is running, it can intercept the MADs.  The cache may then choose to directly respond from the cache (without sending a MAD on the wire) or issue the MAD (or a modified version of it) on the wire, get the response, cache it, then answer the original requester.

3. This approach could also provide opportunities for interesting IB packet tracing facilities.  ib_madeye is quite primitive in its ability to show and filter packets.  As a result, ib_madeye is useful for low speed analysis, but is difficult to use for high message rate MAD traffic.  It also doesn't show source/dest address information.  By having a mux there would be the opportunity to capture binary packets into user space for offline analysis, perhaps with tools like Wireshark/Ethereal or other hand crafted packet analysis/dump/debug tools.

Todd Rimmer
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]         ` <4ACAF913.3050909-smomgflXvOZWk0Htik3J/w@public.gmane.org>
@ 2009-10-06 19:05           ` Sean Hefty
       [not found]             ` <AA7E7C8FC2A04B9688CD69CEB7355DF8-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-06 19:05 UTC (permalink / raw)
  To: 'Or Gerlitz'; +Cc: linux-rdma, Roland Dreier

>If I understand correct your suggested changes are to optionally let an
>application to - instead of the following sequence of calls
>
>rdma_resolve_addr  / addr resolved event
>rdma_create_qp
>rdma_resolve_route  / route resolved event
>rdma_connect / cm events
>
>do
>
>rdma_set_ib_path
>rdma_create_qp
>rdma_connect / cm events

>From user space, the call sequence does not change.  The user calls
rdma_resolve_addr, rdma_resolve_route, rdma_connect, etc.  It is up to the
librdmacm to perform the resolution.  Today, the resolution request is simply
passed down to the kernel, which restricts how the resolution can be performed.

>So in that respect, I am not sure how rdma_set_dest serves you. Further,
>rdma_resolve_addr does three resolutions
>
>1. the local device and source gid
>2. the PKEY (VLAN) to use
>3. the destination gid
>
>so in that respect, rdma_set_ib_path replaces both rdma_resolve_addr and
>rdma_resolve_route?

I kept resolving the address and route separate.  rdma_set_ib_path, which has
always existed btw, simply sets the route/path.   The new call,
rdma_set_ib_dest, sets the address mapping.  To use rdma_set_ib_dest, the user
must have called rdma_bind_addr first, which covers steps 1 & 2 that you
mentioned above.  The rdma_bind_addr call can be done internally to the
librdmacm as part of the rdma_resolve_addr implementation.  If a user sets the
wrong address mapping or route, they should only affect themselves.

(FYI - I have not yet implemented the librdmacm to call rdma_bind_addr as part
of rdma_resolve_route on linux.  I did not see an easy way to convert a
destination IP address to a source IP address.  If anyone knows how, please let
me know.)

>I would prefer to have a solution where the app flow isn't touched,
>something like the kernel rdma-cm to communicate with the user space ACM
>daemon to get address and route resolutions.  Does such a design makes
>sense to you?

Long term, this is exactly the type of flow that I envision.  I'd like to have
real data to show that the ACM implementation scales first, which is part of my
problem.  I do not have the ability to easily change kernel drivers on any
larger sized clusters.  My approach is to allow user space to perform the
address and route resolution and pass the data to the kernel.  This way, we have
the freedom to test multiple solutions, until we can settle on what works.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]                 ` <5AEC2602AE03EB46BFC16C6B9B200DA8168EFD82BA-e4KNYiSEog6Xx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
@ 2009-10-06 19:05                   ` Sean Hefty
       [not found]                     ` <D61F37041B6F49ACB0AC64FBF2DC4D00-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-06 19:05 UTC (permalink / raw)
  To: 'Todd Rimmer', Jason Gunthorpe; +Cc: linux-rdma, Roland Dreier

>Ideally the best approach would be to have a mux at the ib_mad level.  We could
>allow a user space application to intercept all outbound MADs for a given class
>and/or attribute.  Unlike the present "snooping" of mads, this would literally
>be a interception.  This would provide a number of key advantages:

I agree that this is a good idea, and I mentioned something similar to this
before on the list.  The idea was rejected in favor of using standard SA
redirection.

That said, it may not be the best approach in all situations.  It restricts any
solution to the MAD layer only.  If you look at how IB ACM approaches the
problem, it combines the name/address resolution (ARP) with route resolution (PR
query) without sending QP 1 traffic.  A solution limited to snooping MADs would
be less efficient.

For these two patches, if user space can determine the address mapping that they
want for their connection and/or the path that should be used, do we allow the
user to specify either or both?  I'd like to keep this discussion somewhat
separate from how those were obtained.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]             ` <AA7E7C8FC2A04B9688CD69CEB7355DF8-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-06 20:07               ` Jason Gunthorpe
       [not found]                 ` <20091006200739.GP5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2009-10-07 22:23               ` Or Gerlitz
  1 sibling, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-06 20:07 UTC (permalink / raw)
  To: Sean Hefty; +Cc: 'Or Gerlitz', linux-rdma, Roland Dreier

On Tue, Oct 06, 2009 at 12:05:02PM -0700, Sean Hefty wrote:

> From user space, the call sequence does not change.  The user calls
> rdma_resolve_addr, rdma_resolve_route, rdma_connect, etc.  It is up to the
> librdmacm to perform the resolution.  Today, the resolution request is simply
> passed down to the kernel, which restricts how the resolution can be performed.

Well, that is the whole trade off for using IP based addressing via
RDMA CM - you completely give up the IB addressing
architecture. Having an option for user space to specify an IP address
and a HW address is just really very, very strange.

I understand why you'd want to do this, but it is just such a brutal
hack..

IMHO, I think the fundamental problem is that RDMA CM is tied to IP
(ie iWarp addressing). This was probably a mistake. We really needed a
protocol and address agnostic CM API that can work with RDMA IB, iWARP
and standard IB CM protocols, using the appropriate protocol specific
addressing family and mechanism in each case.

Then you could have ACM provide GID based PR addressing using IB CM
within the RDMA CM API and it doesn't create such a complex mess. This
would then mirror how sockets work.

Actually, thinking about it some more, that would be very helpful. As
I said before, I have worked on apps using IB CM. The only reason is
to have complete control over the addressing. If I could use RDMA CM
API in some kind of AF_GID addressing and service ID space, it would
basically eliminate the need for IB CM entirely and make it a lot less
trouble to support things like iWarp, since it now just another AF/PF
in the same API family.

> (FYI - I have not yet implemented the librdmacm to call
> rdma_bind_addr as part of rdma_resolve_route on linux.  I did not
> see an easy way to convert a destination IP address to a source IP
> address.  If anyone knows how, please let me know.)

Isn't this related to the now several threads discussing how route
resolution in the RDMA CM is busted?

http://lists.openfabrics.org/pipermail/general/2009-July/060612.html

You get the source address via the user (netlink) or kernel
(ip_route_output_key) equivalent of 'ip route get x.x.x.x dev XXX'

I wonder if Leo ever figured this out?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                 ` <20091006200739.GP5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-06 22:53                   ` Sean Hefty
       [not found]                     ` <B266C10D3C26431E8FF5012420132452-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-06 22:53 UTC (permalink / raw)
  To: 'Jason Gunthorpe'; +Cc: 'Or Gerlitz', linux-rdma, Roland Dreier

>Actually, thinking about it some more, that would be very helpful. As
>I said before, I have worked on apps using IB CM. The only reason is
>to have complete control over the addressing. If I could use RDMA CM
>API in some kind of AF_GID addressing and service ID space, it would
>basically eliminate the need for IB CM entirely and make it alot less
>trouble to support things like iWarp, since it now just another AF/PF
>in the same API family.

In order to maintain application level compatibility, there are a few
requirements for the changes in this patch.  An event needs to be queued
indicating that the librdmacm rdma_resolve_addr() call is complete.  The IB CM
REQ message should carry the IP address, so that data should be set.  And the
state of the rdma_cm_id needs to change.

I did consider the possibility of having the sockaddr contain some IB related
address, with user space performing the mapping.  My thought was that the IP
address needed to be given to the kernel since the IB CM message carries the IP
address in the private data.  The GID could actually be extracted from the
rdma_set_ib_paths() call.

I'm not sure about defining a new address family for GIDs, given that a GID is
already supposed to be an IPv6 address.  Maybe the RDMA CM could check whether
an address mapped to IB GID or not.  If the source address of either an
rdma_bind_addr or rdma_resolve_addr call were an actual GID, it could assume the
same of the destination address.  Something would need to be done to determine
what would go into the IB CM REQ, but that may introduce incompatibilities. 

Note that between the two patches, this one is less important to scaling than
the other one.  It would be ideal to avoid sending ARP requests when they are
not needed. 

>You get the source address via the user (netlink) or kernel
>(ip_route_output_key) equivalent of 'ip route get x.x.x.x dev XXX'

Yes - ip route get gives what's needed.  Is there a simple way to obtain that
same data from within a program?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                     ` <B266C10D3C26431E8FF5012420132452-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-06 23:17                       ` Jason Gunthorpe
       [not found]                         ` <20091006231720.GR5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-06 23:17 UTC (permalink / raw)
  To: Sean Hefty; +Cc: 'Or Gerlitz', linux-rdma, Roland Dreier

On Tue, Oct 06, 2009 at 03:53:21PM -0700, Sean Hefty wrote:
> >Actually, thinking about it some more, that would be very helpful. As
> >I said before, I have worked on apps using IB CM. The only reason is
> >to have complete control over the addressing. If I could use RDMA CM
> >API in some kind of AF_GID addressing and service ID space, it would
> >basically eliminate the need for IB CM entirely and make it a lot less
> >trouble to support things like iWarp, since it now just another AF/PF
> >in the same API family.
> 
> In order to maintain application level compatibility, there are a few
> requirements for the changes in this patch.  An event needs to be queued
> indicating that the librdmacm rdma_resolve_addr() call is complete.  The IB CM
> REQ message should carry the IP address, so that data should be set.  And the
> state of the rdma_cm_id needs to change.

All these APIs were put together pretty quickly, if we can move ahead
in a significant way by making minor adjustments (like adding a family
field here and there) then I think it is worth doing.

> I did consider the possibility of having the sockaddr contain some
> IB related address, with user space performing the mapping.  My
> thought was that the IP address needed to be given to the kernel
> since the IB CM message carries the IP address in the private data.
> The GID could actually be extracted from the rdma_set_ib_paths()
> call.

I'm not necessarily proposing that an IB centric RDMA CM interface
continue to use IP addresses, but that I can provide IB addresses
through the RDMA CM API and create IB CM connections. To me this is
really what your acm patch is attempting to do. That there is IP
addresses at all seems more of a convenience.

So, an AF_GID RDMA CM connection process would not (directly)
interoperate with an AF_IP/AF_IPV6 RDMA CM connection process.

> I'm not sure about defining a new address family for GIDs, given
> that a GID is already supposed to be an IPv6 address.  Maybe the
> RDMA CM could check whether an address mapped to IB GID or not.  If
> the source address of either an

GIDs are addresses that are formed like IPv6 addresses that occupy a
completely disjoint address space. It is correct to have them exist
in their own family (ie AF_GID). That is the only way to disambiguate
them from IPv6 addresses.

IETF has not (and probably will not) reserve an IPv6 prefix space for
GIDs, so there is no other way.

> could assume the same of the destination address.  Something would
> need to be done to determine what would go into the IB CM REQ, but
> that may introduce incompatibilities.

The same approach that the IB CM uses today would have to be
used. There would need to be technology specific APIs to set ancillary
data. The IP version already has APIs to set port numbers, GID based
RDMA CM would need APIs to set service IDs and so on, just like in
the IB CM case.

I'm not suggesting that you implement RDMA CM IP semantics in
userspace using the IB CM, I'm suggesting you expose the IB CM GID
semantics through the RDMA CM API exactly as they are. Your IBACM
would then become an enhanced path resolution module to the RDMA CM, 
much like getaddrinfo is to socket()/bind()/connect().

So the output from IBACM would specify an AF_GID address family and
include opaque data blobs that are passed through the RDMA CM API that
contain all the PR records, service ID, etc. If used on non-IB then
IBACM could just return AF_IP/AF_IPV6 and related blobs. Thus the
consumer of the API gets transparency and network protocol agility,
and all the mess can be hid in the address resolution API.

Like getaddrinfo it could be string based, and perhaps with some
careful thought we can make a string descriptor that can actually
expose some of the good IB functionality, like multipath, APM, etc.

Ie, perhaps if you get
 getrdmaaddrinfo("gid=fd83:609c:bdc8:1:213:72ff:fe29:e65d","123123232");
you would get data describing an IB CM connection using service ID
123123232 to GID fd83:609c:bdc8:1:213:72ff:fe29:e65d, while
 getrdmaaddrinfo("192.168.122.1%eth2","1243");
Would describe an IP based RDMA connection using device eth2 and port
1243.

And maybe, say
 getrdmaaddrinfo("acm=192.168.122.1%eth2","1243");
Invokes your new module, but the result is an AF_GID family connection.

Like in IP/IPv6 the connection process would proceed in exactly the
same way no matter if it is iWARP, IB RDMA, CEE RDMA, or
whatever. This model has worked very well for writing dual stack
IPv4/IPv6 applications.

> Note that between the two patches, this one is less important to
> scaling than the other one.  It would be ideal to avoid sending ARP
> requests when they are not needed.

Yes, I see that, but the ARP request is an absolutely critical part of
the IP world, to eliminate it, but still pretend to be IP really is
cheating too much, IMHO. :)

> >You get the source address via the user (netlink) or kernel
> >(ip_route_output_key) equivalent of 'ip route get x.x.x.x dev XXX'
> 
> Yes - ip route get gives what's needed.  Is there a simple way to
> obtain that same data from within a program?
 
Another topic, but yes, ip route get just does a netlink
query. I can give you all the details if you want to try it.

However as I explained in the thread, I am highly skeptical about all of
this. That query needs to be done exactly once and the connection must
be bound to that result from then on. Currently too many route lookups
are done, and adding more to userspace does not seem to be the right
direction - unless the userspace one replaces all the kernel lookups..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                         ` <20091006231720.GR5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-07  1:20                           ` Sean Hefty
       [not found]                             ` <3F7D26D4BA1C46F18F2F87BDD7EB7F36-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-07  1:20 UTC (permalink / raw)
  To: 'Jason Gunthorpe'; +Cc: 'Or Gerlitz', linux-rdma, Roland Dreier

>I'm not suggesting that you implement RMDA CM IP semantics in
>userspace using the IB CM, I'm suggesting you expose the IB CM GID
>semantics through the RDMA CM API exactly as they are. Your IBACM
>would then become an enhanced path resolution module to the RDMA CM,
>much like getaddrinfo is to socket()/bind()/connect().

There are 3 interfaces of interest here.  The librdmacm API, the rdma_ucm user
to kernel interface, and the rdma_cm interface.  These patches are looking to
change the rdma_ucm interface.  I want to avoid changing the API or behavior of
the librdmacm in a way that requires changes to existing applications in order
to run on larger clusters. 

Adding support for AF_GID at the librdmacm level is fine, but it doesn't help
existing apps.  Adding support for AF_GID at the rdma_ucm and rdma_cm interfaces
may help, provided that the behavior of librdmacm calls: rdma_resolve_addr,
rdma_bind_addr, rdma_resolve_route, and rdma_connect are maintained.  If you
have a specific idea on a better way to change the rdma_ucm interface than
'set_option' calls, let me know.

I will look at the details of what would happen if the librdmacm converted an
AF_INET address into an AF_GID address and used that in the down call to the
rdma_ucm:rdma_resolve_addr.  I suspect that since the original AF_INET address
is not carried in the IB CM REQ, there will be issues matching the REQ with
listens on specific addresses.  This may not be a big deal in practice.  The
rdma_cm would still need the path record data.

>So the output from IBACM would specify on AF_GID address family and
>include opaque data blobs that are passed through the RDMA CM API that
>contain all the PR records, service ID, etc. If used on non-IB then
>IBACM could just return AF_IP/AF_IPV6 and related blobs. Thus the
>consumer of the API gets transparency and network protocol agility,
>and all the mess can be hid in the address resolution API.

This is just debating where the transport abstraction occurs, but IMO the IB ACM
should be IB centric.  Transport abstraction should occur somewhere above it.
This has been the role of the librdmacm.  Adding a new call similar to
getaddrinfo to the librdmacm should be possible, and could actually take
advantage of the IB ACM resolution that converts a host name directly into IB
path data.

This still leaves open the issue of how to communicate that data to the kernel
so that the rdma_cm can format the IB CM REQ correctly and send it on its merry
little way.

>Yes, I see that, but the ARP request is an absolutely critical part of
>the IP world, to eliminate it, but still pretend to be IP really is
>cheating too much, IMHO. :)

We aren't completely getting rid of ARP, we just support an alternate,
non-standard, proprietary address resolution mechanism instead.

>Another topic, but yes, ip route get just does a netlink
>queury. I can give you all the details if you want to try it.

Yes, please - see below

>However as I explained in the thread, I highly skeptical about all of
>this. That query needs to be done exactly once and the connection must
>be bound to that result from then on. Currently too many route lookups
>are done, and adding more to userspace does not seem to be the right
>direction - unless the userspace one replaces all the kernel lookups..

The librdmacm rdma_resolve_addr() call allows a user to specify a destination
address only.  A suitable source address will be selected, and the rdma_cm_id
will be bound to the corresponding RDMA device.  If the librdmacm can
efficiently determine the source address, it can call the IB ACM to resolve the
addresses and obtain the path data.  Otherwise, the call to librdmacm
rdma_resolve_addr() drops into the kernel and operates as it does today, which
can involve sending an ARP.

I haven't been overly concerned about this yet, because the application I'm most
concerned with always calls rdma_bind_addr().

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                             ` <3F7D26D4BA1C46F18F2F87BDD7EB7F36-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-07  5:26                               ` Jason Gunthorpe
       [not found]                                 ` <20091007052639.GB18578-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-07  5:26 UTC (permalink / raw)
  To: Sean Hefty; +Cc: 'Or Gerlitz', linux-rdma, Roland Dreier

On Tue, Oct 06, 2009 at 06:20:05PM -0700, Sean Hefty wrote:

> There are 3 interfaces of interest here.  The librdmacm API, the rdma_ucm user
> to kernel interface, and the rdma_cm interface.  These patches are looking to
> change the rdma_ucm interface.  I want to avoid changing the API or behavior of
> the librdmacm in a way that requires changes to existing applications in order
> to run on larger clusters. 

So, I'm just talking about the user space API, the others can be
changed as necessary to align with it. 

This is open source, so choosing a technically better solution over a
endlessly backwards compatible solution is done all the time and is
normal, expected, etc. Cost of progress - that is the underlying
rationale of Documentation/stable_api_nonsense.txt, and it applies just
as well to niche little user space libraries like these :)

> >So the output from IBACM would specify on AF_GID address family and
> >include opaque data blobs that are passed through the RDMA CM API that
> >contain all the PR records, service ID, etc. If used on non-IB then
> >IBACM could just return AF_IP/AF_IPV6 and related blobs. Thus the
> >consumer of the API gets transparency and network protocol agility,
> >and all the mess can be hid in the address resolution API.
> 
> This is just debating where the transport abstraction occurs, but IMO the IB ACM
> should be IB centric.  Transport abstraction should occur somewhere
> above it.

Actually, I'm arguing that RDMA CM should have been a transport
mux/switch like socket() rather than just an IP addressing abstraction.

If it is a mux then an app coded to RDMA CM could speak native IB GID
addressing on the same API and the scaling problems related to arp and
the implicit kernel PR query of the IP abstraction can be neatly
eliminated, by eliminating the abstraction. It is the abstraction to
IP addresses that is the root inefficiency - we need a transport
protocol agnostic API for CM that lets the native transport addressing
be used - not an abstraction layer (abstractions are always the bane
of efficiency)..

If you want to have a naming layer in user space that converts
*whatever* to GIDs then fine, great, but lets call it that and not
co-mingle it with the IP address abstraction layer.

Adding IB CM semantics to the RDMA CM API does not seem to be too
hard:
 rdma_create_id uses an new RDMA_PS_IB to signal IB CM behavior
 rdma_resolve_addr and all other functions use a sockaddr that is an
   IB GID, pkey, service ID etc. rdma_resolve_addr at least grows a
   new parameter which is a 'hw address'. This is mandatory for
   RDMA_PS_IB and is up to 5 PR records. (1 for CM path,
   forward/reverse for primary, and forward/reverse for alternative)

 rdma_get_addr_info return a struct with the rdma_port_space, sockaddr
 src, sockaddr dest and 'hw address' values that the app blindly plugs
 into the calls. libacm type function is implemented entirely in the
 rdma_get_addr_info.

The trade off is that apps that want to scale use IB GID addressing,
IB GID CM, and IB service ID at the rdma_cm layer, and libacm provides
a name mapping from hostname or IP to GID, if its being used. No IP
addresses, no IP listen matching, no port space TCP issues. Just
straight IB services IDs.

Pretty much the symmetry is simple.. The Kernel always takes care of
IP hardware addressing, userspace always takes care of IB
hardware addressing. Nature of the two protocols.

> This still leaves open the issue of how to communicate that data to the kernel
> so that the rdma_cm can format the IB CM REQ correctly and send it on its merry
> little way.

The rdma_ucm interface would have to be extended to be able to do 100%
of the functionality of the ib cm interface using the rdma_cm_id
abstraction. This is very useful in and of itself and much better than
adding an obscure option to override the ARP query. For instance,
other MPIs could immediately provide their users an option to use GID
addresses directly and cut out the ARP overhead instantly with
little code change.

> >Another topic, but yes, ip route get just does a netlink
> >queury. I can give you all the details if you want to try it.
> 
> Yes, please - see below

I'll look in my codes, remind me if I forget, I can't do it just now
..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]                     ` <D61F37041B6F49ACB0AC64FBF2DC4D00-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-07  5:30                       ` Jason Gunthorpe
  0 siblings, 0 replies; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-07  5:30 UTC (permalink / raw)
  To: Sean Hefty; +Cc: 'Todd Rimmer', linux-rdma, Roland Dreier

On Tue, Oct 06, 2009 at 12:05:02PM -0700, Sean Hefty wrote:
> >Ideally the best approach would be to have a mux at the ib_mad level.  We could
> >allow a user space application to intercept all outbound MADs for a given class
> >and/or attribute.  Unlike the present "snooping" of mads, this would literally
> >be a interception.  This would provide a number of key advantages:
> 
> I agree that this is a good idea, and I mentioned something similar to this
> before on the list.  The idea was rejected in favor of using standard SA
> redirection.

Actually, I think MAD capture is much too low level.

As in the other message, if we could have a rdma_get_addr_info style
API then it would be wonderful to do as glibc does and trap that out via
a socket to a nscd-like caching/whatever daemon.

If the kernel was also fixed up to be able to take addressing data (ie a
IB PR set) for NFS and SDP connection setup then this same API could
be used to provide caching/whatever for the majority of cases..

Maybe someday? :)

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                 ` <20091007052639.GB18578-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-07 19:16                                   ` Sean Hefty
       [not found]                                     ` <20ADF14BE2B24B459DC3921F69449E61-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-07 19:16 UTC (permalink / raw)
  To: 'Jason Gunthorpe'; +Cc: 'Or Gerlitz', linux-rdma, Roland Dreier

>So, I'm just talking about the user space API, the others can be
>changed as necessary to align with it.
>
>This is open source, so choosing a technically better solution over a
>endlessly backwards compatible solution is done all the time and is
>normal, expected, etc. Cost of progress - that is the underlying
>rational of Documentation/stable_api_nonsense.txt, and it applies just
>as well to niche little user space libraries like these :)

stable_api_nonsense.txt only applies to the rdma_cm interface, not the ABI or a
user space library.  I believe that any change to the library or ABI that forces
applications to change would be detrimental to OFA and the stack as a whole, and
I do not see a compelling reason to make such a change.

Discarding the existing librdmacm interface and ABI are not viable options in my
opinion.

>The rdma_ucm interface would have to be extended to be able to do 100%
>of the functionality of the ib cm interface using the rdma_cm_id
>abstraction. This is very useful in of itself and much better than
>adding an obscure option to override the ARP query. For instance,
>other MPIs could immediately provide their users an option to use GID
>addresess directly and cut out the ARP overhead instantly with
>little code change.

rdma_cm rdma_resolve_addr may result in issuing an ARP query - it depends on the
transport and device capabilities.  I want to keep the other behavior of
librdmacm rdma_resolve_addr, and eliminate the ARP as unnecessary.  Other
options I looked at were using fields inside the struct sockaddr_in6 (yuck) or
letting a timeout of 0 indicate that ARP should not be used.  The latter leaves
the ABI intact.  The drawback is that the DGID could still be unknown, which
would result in rdma_cm rdma_resolve_route failing.  This may be acceptable.
 
An extension to the ABI is needed to allow user space to set the IB path.  The
proposed 'set_option' ABI could support passing multiple PRs to the kernel.  The
kernel implementation only handles one currently.

From your other mails, it doesn't sound like you have an issue with an ABI
extension that allows setting the IB path record directly.  Is this correct, and
do you have an issue with the proposed implementation of that?

You do seem to disagree with the changes to allow user space to specify the IP
to DGID mapping.  Is there an alternative that you would agree with?

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                     ` <20ADF14BE2B24B459DC3921F69449E61-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-07 20:32                                       ` Jason Gunthorpe
       [not found]                                         ` <20091007203257.GT5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-07 20:32 UTC (permalink / raw)
  To: Sean Hefty; +Cc: 'Or Gerlitz', linux-rdma, Roland Dreier

On Wed, Oct 07, 2009 at 12:16:56PM -0700, Sean Hefty wrote:
> >So, I'm just talking about the user space API, the others can be
> >changed as necessary to align with it.
> >
> >This is open source, so choosing a technically better solution over a
> >endlessly backwards compatible solution is done all the time and is
> >normal, expected, etc. Cost of progress - that is the underlying
> >rational of Documentation/stable_api_nonsense.txt, and it applies just
> >as well to niche little user space libraries like these :)
> 
> stable_api_nonsense.txt only applies to the rdma_cm interface, not the ABI or a
> user space library.  I believe that any change to the library or ABI that forces
> applications to change would be detrimental to OFA and the stack as a whole, and
> I do not see a compelling reason to make such a change.

Well, you may think that, but look at the past sonoma
conferences. Some of the current APIs are *BAD* - they are hard to
use, complex, inflexible, incomplete and sometimes even
non-performant. You just can't fix bad APIs without changing them. Bad
APIs are, IMHO, a much bigger detriment to the goals of OFA than some
small software churn in existing apps. They make it less likely that
'killer RDMA apps' will emerge to widen the use of RDMA technologies.

> Discarding the existing librdmacm interface and ABI are not viable
> options in my opinion.

Probably not discarding, but some updates here and there. This stuff
happens. The main thing is that the old APIs in binary form continue
to exist for linking purposes. New software has to patch a little to
use new library versions to get new features. There are countless
examples of this in open source.

> >The rdma_ucm interface would have to be extended to be able to do 100%
> >of the functionality of the ib cm interface using the rdma_cm_id
> >abstraction. This is very useful in of itself and much better than
> >adding an obscure option to override the ARP query. For instance,
> >other MPIs could immediately provide their users an option to use GID
> >addresess directly and cut out the ARP overhead instantly with
> >little code change.
> 
> rdma_cm rdma_resolve_addr may result in issuing an ARP query - it
> depends on the transport and device capabilities.  I want to keep
> the other behavior of librdmacm rdma_resolve_addr, and eliminate the
> ARP as unnecessary.  Other options I looked at were using fields
> inside the struct sockaddr_in6 (yuck) or letting a timeout of 0
> indicate that ARP should not be used.  The latter leaves the ABI
> intact.  The drawback is that the DGID could still be unknown, which
> would result in rdma_cm rdma_resolve_route failing.  This may be
> acceptable.

You are trying to make the smallest change possible to work around a
performance problem caused by inefficient abstractions by completely
breaking the abstraction.

I understand why you want to do this, it is simple, 'tidy', fits in
with DAPL and seems to be easy.. But it doesn't really move anything
forward, it raises new problems, and just seems wrong.

IP RDMA already gets a lot of criticism because it does not fit
properly into the IP stack, I don't think diverging further is the way
to go. Establishing IP-like connections without neighbor entries, and
without respecting static neighbor entries is just more deviation.

IP RDMA addressing on IB - I think - should be regarded as a
non-performant convenience API that is built to be similar to iWarp,
and honours the Linux IP stack. MPIs should not be surprised they get
bad performance from this method!! It is not a bug to be fixed that
this overhead is present.

> An extension to the ABI is needed to allow user space to set the IB
> path.  The proposed 'set_option' ABI could support passing multiple
> PRs to the kernel.  The kernel implementation only handles one
> currently.

If you do go ahead with this, then please at least build in forward
support for passing all 5 PRs, that is one of the bugs with the
current code that does badly affect people. Ie the folks working on
the torus routing are forced to solve a much harder problem since the
Linux stack does not yet support asymmetric paths.

> From your other mails, it doesn't sound like you have an issue with
> an ABI extension that allows setting the IB path record directly.
> Is this correct, and do you have an issue with the proposed
> implementation of that?

> You do seem to disagree with the changes to allow user space to specify the IP
> to DGID mapping.  Is there an alternative that you would agree with?

I don't like moving, effectively, HW address selection into user space
for IP addressing applications. That just seems really wrong.

Like I've said, I think the MPIs should use the IB CM (ideally through
RDMA CM API), and GID addresses + service IDs. That eliminates
inherent overhead from IP RDMA CM without breaking the IP stack
integration of that scheme.

Yes, I know this is harder, I know it requires some API updates, I
know something will probably have to be done to DAPL. But it is the
long term right approach, IMHO.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                         ` <20091007203257.GT5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-07 21:20                                           ` Hefty, Sean
       [not found]                                             ` <CF9C39F99A89134C9CF9C4CCB68B8DDF12C180FFD1-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Hefty, Sean @ 2009-10-07 21:20 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: 'Or Gerlitz', linux-rdma, Roland Dreier

>You are trying to make the smallest change possible to work around a
>performance problem caused by ineffecient abstractions by completely
>breaking the abstraction.

The performance problem being addressed here is not caused by an inefficient abstraction.  It's caused by having a single, centralized SM incapable of scaling to the cluster sizes that customers want to use.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                             ` <CF9C39F99A89134C9CF9C4CCB68B8DDF12C180FFD1-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
@ 2009-10-07 21:45                                               ` Jason Gunthorpe
  0 siblings, 0 replies; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-07 21:45 UTC (permalink / raw)
  To: Hefty, Sean; +Cc: 'Or Gerlitz', linux-rdma, Roland Dreier

On Wed, Oct 07, 2009 at 02:20:29PM -0700, Hefty, Sean wrote:
> >You are trying to make the smallest change possible to work around a
> >performance problem caused by ineffecient abstractions by completely
> >breaking the abstraction.
> 
> The performance problem being addressed here is not caused by an
> inefficient abstraction.  It's caused by having a single,
> centralized SM incapable of scaling to the cluster sizes that
> customers want to use.

The SM scaling is a side effect of the IP RDMA CM design. Since the IP
RDMA CM design and abstraction require the ARP and PR queries - it is
the root inefficiency - the SM scaling is the consequence. IB GID CM
does not have this mandatory behavior, you can get your PR data from
wherever you want.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]             ` <AA7E7C8FC2A04B9688CD69CEB7355DF8-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-06 20:07               ` Jason Gunthorpe
@ 2009-10-07 22:23               ` Or Gerlitz
       [not found]                 ` <15ddcffd0910071523w4f229b14j905ad170ceb8c21f-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 76+ messages in thread
From: Or Gerlitz @ 2009-10-07 22:23 UTC (permalink / raw)
  To: Sean Hefty; +Cc: Or Gerlitz, linux-rdma, Roland Dreier

Sean Hefty wrote:

> From user space, the call sequence does not change.  The user calls
> rdma_resolve_addr, rdma_resolve_route, rdma_connect, etc.  It is up to the
> librdmacm to perform the resolution.  Today, the resolution request is simply
> passed down to the kernel, which restricts how the resolution can be performed.

good, fair-enough

> I kept resolving the address and route separate.  rdma_set_ib_path, which has
> always existed btw, simply sets the route/path.   The new call,
> rdma_set_ib_dest, sets the address mapping.  To use rdma_set_ib_dest, the user
> must have called rdma_bind_addr first, which covers steps 1 & 2 that you
> mentioned above.  The rdma_bind_addr call can be done internally to the
> librdmacm as part of the rdma_resolve_addr implementation.

I understand that rdma_bind_address covers the local device and vlan
resolutions, but I think we should also --keep-- supporting
applications that use an explicit source address in rdma_resolve_addr
or that don't do bind, provide src=NULL to resolve_addr and rely on
the rdma-cm to use route lookup (as the rdma_resolve_addr man page
indicates) for the device/vlan resolution.

> If a user sets the wrong address mapping or route, they should only affect themselves

I wasn't sure to follow this comment, can you elaborate a bit more?

> (FYI - I have not yet implemented the librdmacm to call rdma_bind_addr as part
> of rdma_resolve_route on linux.  I did not see an easy way to convert a
> destination IP address to a source IP address.  If anyone knows how, please let
> me know.)

I assume you were referring to rdma_resolve_addr, correct? There should be
a way to do that from user space and if not, you can go down to the
kernel, resolve the device/vlan and then call ACM to resolve the
destination. It seems that you must resolve the dev/vlan for issuing
the ACM ARP replacement...

> >I would prefer to have a solution where the app flow isn't touched,
> >something like the kernel rdma-cm to communicate with the user space ACM
> >daemon to get address and route resolutions.  Does such a design makes
> >sense to you?

> Long term, this is exactly the type of flow that I envision.  I'd like to have
> real data to show that the ACM implementation scales first, which is part of my
> problem.  I do not have the ability to easily change kernel drivers on any
> larger sized clusters.  My approach is to allow user space to perform the
> address and route resolution and pass the data to the kernel.  This way, we have
> the freedom to test multiple solutions, until we can settle on what works.

I am not sure to fully follow on the easily-change-kernel-drivers
claim, isn't some change to the kernel rdma-cm being a must for the
ACM + librdmacm solution to work? suppose you have a way to fully do
the addr+route resolutions from user space, will the kernel rdma-cm
state machine will be willing to issue

rdma_create_id
rdma_set_ib_path (you said this exists today?)
rdma_create_qp
rdma_connect

???

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                 ` <15ddcffd0910071523w4f229b14j905ad170ceb8c21f-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2009-10-07 23:42                   ` Sean Hefty
       [not found]                     ` <9F4DE6A2B4F644698E94F00C4FEEF30A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-07 23:42 UTC (permalink / raw)
  To: 'Or Gerlitz'; +Cc: Or Gerlitz, linux-rdma, Roland Dreier

>I understand that rdma_bind_address covers the local device and vlan
>resolutions, but I we should also --keep-- supporting also
>applications that use an explicit source address in rdma_resolve_addr
>or that don't do bind, provide src=NULL to resolve_addr and rely on
>the rdma-cm to use route lookup (as the rdma_resolve_addr man page
>indicates) for the device/vlan resolution.

My intent, which differs from Jason's, was to fully support the existing
librdmacm interfaces as they are defined.

Implementation wise, if the user of the librdmacm calls rdma_resolve_addr with a
src address, it's easy.  Without the src address, it's hard, but I may just be
missing some easy interface for finding the src address.

Between the two patches, I'm fine with the change to set the PR and still
looking at ways to handle rdma_resolve_addr.

>> If a user sets the wrong address mapping or route, they should only affect
>themselves
>
>I wasn't sure to follow this comment, can you elaborate a bit more?

I meant that if some bogus app wants to specify an IP to GID mapping that's
invalid, the incorrect mapping should only affect connections for that app.
When used over IB, the IP address is little more than a qualifier contained
within the IB CM REQ private data.

>I assume you was referring rdma_resolve_addr, correct? there should be
>a way to do that from user space and if not, you can go down to the
>kernel, resolve the device/vlan and then call ACM to resolve the
>destination. It seems that you must resolve the dev/vlan for issuing
>the ACM ARP replacement...

Technically, rdma_resolve_addr could remain unchanged, in which case it will do
everything it does today, which may include sending an ARP.  This is the
specific operation that I'd like to avoid.

>I am not sure to fully follow on the easily-change-kernel-drivers
>claim, isn't some change to the kernel rdma-cm being a must for the
>ACM + librdmacm solution to work? suppose you have a way to fully do
>the addr+route resolutions from user space, will the kernel rdma-cm
>state machine will be willing to issue

All of our larger clusters are for production use.  Asking them to install and
use a supported kernel / OFED release is different than me asking them to apply
a patch to the kernel on all systems.

I can somewhat implement an ACM + librdmacm solution entirely in user space by
layering the librdmacm over libibcm.  Because of the event reporting, it would
be limited in how it could be used, and is unlikely to be something that would
ever be supported.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                     ` <9F4DE6A2B4F644698E94F00C4FEEF30A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-08  0:38                       ` Sean Hefty
       [not found]                         ` <0A383504E0E54C949DEF84405E3AE92F-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-08 23:13                       ` Or Gerlitz
  1 sibling, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-08  0:38 UTC (permalink / raw)
  To: Hefty, Sean, 'Or Gerlitz', 'Jason Gunthorpe'
  Cc: Or Gerlitz, linux-rdma, Roland Dreier

>When used over IB, the IP address is little more than a qualifier contained
>within the IB CM REQ private data.

If we added support for AF_GID/AF_IB to the kernel, the rdma_cm could leave all
of the private data carried in the IB CM REQ entirely up to the user.  If the
user happens to format that data to look like the CMA header, so be it.  I
believe this would allow for a 'clean' implementation of rdma_resolve_addr,
preserve the ABI, and still allow a library to provide backwards compatibility.

The following information should be known after calling rdma_resolve_addr: sgid,
dgid, pkey, source port/sid, destination port/sid.  The address structure for
AF_IB should be defined to capture this information.  (The port / service ID
needs to be worked out.)

Would this approach combined with the ability to set the route work for
everyone?

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                         ` <0A383504E0E54C949DEF84405E3AE92F-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-08  0:54                           ` Jason Gunthorpe
       [not found]                             ` <20091008005425.GW5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2009-10-08 23:33                           ` Or Gerlitz
  1 sibling, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-08  0:54 UTC (permalink / raw)
  To: Sean Hefty; +Cc: 'Or Gerlitz', Or Gerlitz, linux-rdma, Roland Dreier

On Wed, Oct 07, 2009 at 05:38:27PM -0700, Sean Hefty wrote:
> >When used over IB, the IP address is little more than a qualifier contained
> >within the IB CM REQ private data.
> 
> If we added support for AF_GID/AF_IB to the kernel, the rdma_cm
> could leave all of the private data carried in the IB CM REQ
> entirely up to the user.  If the user happens to format that data to
> look like the CMA header, so be it.  I believe this would allow for
> a 'clean' implementation of rdma_resolve_addr, preserve the ABI, and
> still allow a library to provide backwards compatibility.

Yep, not sure how you handle the listening side without port
conflicts?? But that doesn't seem to be a huge problem. TBH - since
ACM is kinda its own little world, it could just use a separate
service ID space from RDMA CM?

> The following information should be known after calling
> rdma_resolve_addr: sgid, dgid, pkey, source port/sid, destination
> port/sid.  The address structure for AF_IB should be defined to
> capture this information.  (The port / service ID needs to be worked
> out.)

Yes, that seems great..

What API would you use to pass the PR data?

What do you think of a 'rdma_get_addr_info' that could be where libacm
hooks?

> Would this approach combined with the ability to set the route work for
> everyone?

'set the route' ?

You mean the 'ip route get' thingy?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                             ` <20091008005425.GW5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-08  6:46                               ` Sean Hefty
       [not found]                                 ` <3BA5B96263EC4ACA8FF3C4D8DCF47C69-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-08  6:46 UTC (permalink / raw)
  To: 'Jason Gunthorpe'
  Cc: 'Or Gerlitz', Or Gerlitz, linux-rdma, Roland Dreier

>Yep, not sure how you handle the listening side without port
>conflicts?? But that doesn't seem to be a huge problem. TBH - since
>ACM is kinda its own little world, it could just use a separate
>service ID space from RDMA CM?

I'm not sure how to handle the port space yet.  The port space is specified when
the rdma_cm_id is created.  I don't think there's an immediate need to change
anything on the listen side, but if we add AF_IB, then adding RDMA_PS_IB may
make sense.  This could be the full 64-bit service ID.  (We can determine the
right name for AF_IB / AF_GID based on what's actually in the structure.)

>> The following information should be known after calling
>> rdma_resolve_addr: sgid, dgid, pkey, source port/sid, destination
>> port/sid.  The address structure for AF_IB should be defined to
>> capture this information.  (The port / service ID needs to be worked
>> out.)
>
>Yes, that seems great..

On second thought, I'm not sure about _needing_ the pkey.  My first draft of
this is:

struct sockaddr_ib {
	unsigned short int ib_family;
	__u16 reserved;
	union ib_gid gid;
	__be64 ib_service_id; 
};

Although I considered dividing the service id and putting the low order bytes
where reserved is, or only supporting the RDMA IP CM service ID format, possibly
using sockaddr_in6 directly.

>What API would you use to pass the PR data?

API at which level?

>What do you think of a 'rdma_get_addr_info' that could be where libacm
>hooks?

I have no objections to extending the librdmacm API.

>> Would this approach combined with the ability to set the route work for
>> everyone?
>
>'set the route' ?

Pass the path record to the kernel.  This piece is still missing to allow user
space to own the policy for obtaining path information.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                 ` <3BA5B96263EC4ACA8FF3C4D8DCF47C69-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-08 17:21                                   ` Jason Gunthorpe
       [not found]                                     ` <20091008172120.GX5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-08 17:21 UTC (permalink / raw)
  To: Sean Hefty; +Cc: 'Or Gerlitz', Or Gerlitz, linux-rdma, Roland Dreier

On Wed, Oct 07, 2009 at 11:46:24PM -0700, Sean Hefty wrote:
> >Yep, not sure how you handle the listening side without port
> >conflicts?? But that doesn't seem to be a huge problem. TBH - since
> >ACM is kinda its own little world, it could just use a separate
> >service ID space from RDMA CM?
 
> I'm not sure how to handle the port space yet.  The port space is
> specified when the rdma_cm_id is created.  I don't think there's an
> immediate need to change anything on the listen side, but if we add
> AF_IB, then adding RDMA_PS_IB may make sense.  This could be the
> full 64-bit service ID.  (We can determine the right name for AF_IB
> / AF_GID based on what's actually in the structure.)

RDMA_PS_IB I think is necessary for this scheme to make sense. If the
listening side continues to use the IP mode to listen then I guess the
client can compute an appropriate service ID, but it seems a bit
strange for one side to use IP and the other side to use the ACM
method? I was imagining you'd configure both sides to use the same
method.

> >> The following information should be known after calling
> >> rdma_resolve_addr: sgid, dgid, pkey, source port/sid, destination
> >> port/sid.  The address structure for AF_IB should be defined to
> >> capture this information.  (The port / service ID needs to be worked
> >> out.)
> >
> >Yes, that seems great..
> 
> On second thought, I'm not sure about _needing_ the pkey.  My first draft of
> this is:
> 
> struct sockaddr_ib {
> 	unsigned short int ib_family;
> 	__u16 reserved;
> 	union ib_gid gid;
> 	__be64 ib_service_id; 
> };
> 
> Although I considered dividing the service id and putting the low
> order bytes where reserved is, or only supporting the RDMA IP CM
> service ID format, possibly using sockaddr_in6 directly.

Well, it seems to me, within the RDMA CM API in GID mode the only
purpose of the sockaddr is to select the device. In APM cases there
may actually be multiple gids on either side.. Doubling up and using
it as a way to pass the service ID seems fine to me. the RDMA CM API
would then ignore the gid portion of the destination address, use the
GID portion of the source address to choose the device and record the
service IDs in both to use in the CM protocol.

IPv6 has the notion of a scope_id which is used to select the device
in ambiguous cases. I don't think that is needed here, the source GID
should be unambiguous and the destination GID isn't used for device
selection.

Also, the naming scheme should probably use sib_ as a prefix for
consistency with POSIX. 

BTW, sockaddrs should also always be accompanied by a socklen_t to
indicate their length (for alignment with POSIX). I noticed the
current CM API doesn't do that..

> >What API would you use to pass the PR data?
> 
> API at which level?

User space librdmacm
 
> >> Would this approach combined with the ability to set the route work for
> >> everyone?
> >
> >'set the route' ?
> 
> Pass the path record to the kernel.  This piece is still missing to
> allow user space to own the policy for obtaining path information.

Right, I think you can reasonably use the option approach to
communicate with the kernel, but I think something more standardized
is needed for the user space API.

If the flow is:
rdma_get_addr_info("foo","123",&hints,&result);
rdma_create_id(chan,&id,result[0].port_space);
rdma_resolve_addr(id,result[0].source,result[0].dest,0);
[..]
rdma_resolve_route(id,0);

Maybe the best approach is to change rdma_resolve_route to:
 rdma_resolve_routex(id,result[0].route_data,result[0].route_data_len,0);

The reason to do this is so that the API is clean, calling an IB
specific option in what should be a protocol neutral code path is not
too nice. The two existing PS's would simply use 0 for the route_data.

route_data would be AF/PS defined structure - for IB it would be the
up to 5 PRs.

Under the covers it can use the existing option API to the kernel,
that is not really too important.

Is there anything else the IB CM API can do that this could not?

[** The other choice I could see is to use the sockaddr_ib to pass the
 PR data. The problem with this is that the existing API doesn't use
 socklen_t and sockaddr_ib with 3 PRs would overflow sockaddr_storage,
 so the resulting API would be 'broken by design' on the Rusty scale
 :( ]

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                     ` <9F4DE6A2B4F644698E94F00C4FEEF30A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-08  0:38                       ` Sean Hefty
@ 2009-10-08 23:13                       ` Or Gerlitz
  1 sibling, 0 replies; 76+ messages in thread
From: Or Gerlitz @ 2009-10-08 23:13 UTC (permalink / raw)
  To: Sean Hefty; +Cc: Or Gerlitz, linux-rdma, Roland Dreier

On Thu, Oct 8, 2009 at 1:42 AM, Sean Hefty <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
> My intent, which differs from Jason's, was to fully support the existing
> librdmacm interfaces as they are defined.

yes, I agree this is the way to go

> Implementation wise, if the user of the librdmacm calls rdma_resolve_addr with a
> src address, it's easy.  Without the src address, it's hard, but I may just be
> missing some easy interface for finding the src address.

note that dst can map to multiple src addresses, so you're just
looking for one of them... its doable, I will get you the details if
you still need them


>>> If a user sets the wrong address mapping or route, they should only affect themselves

>> I wasn't sure to follow this comment, can you elaborate a bit more?

> I meant that if some bogus app wants to specify an IP to GID mapping that's
> invalid, the incorrect mapping should only affect connections for that app.

yes, this makes sense and I believe the rdma-cm code is written such
that one bogus ID doesn't leak its defects to other IDs

> I can somewhat implement an ACM + librdmacm solution entirely in user space by
> layering the librdmacm over libibcm.  Because of the event reporting, it would be limited
> in how it could be use, and is unlikely to be something that would ever be supported.

yes, it would be limited and not really supportable, going that way
for research / experimentation and development is fine, just make sure
to never release that...

> Technically, rdma_resolve_addr could remain unchanged, in which case it will do
> everything it does today, which may include sending an ARP.  This is the
> specific operation that I'd like to avoid.

again, apps (both user and kernel ones) do use rdma_resolve_addr and
we want them to keep doing so (I thought we agreed on that). For
staging you may develop the type II address resolution prototype on
top of libibcm but later rdma_resolve_addr would call IBACM and then
sync with the kernel.

Basically, can we agree that rdma_resolve_addr(src, dst, timeout) of
type II would look like

if (src)
  rdma_bind(src)
else
   call_some_user_space_networking_api_to_convert_dst_to_netdev/src
next,  now we have dev/pkey
- call ACM to resolve dst IP to GID and use dev/pkey for that
- sync the kernel rdma_cm with the resolution if needed for the
state-machine (hopefully its not a must at this point and can be done
when calling set_path).

Or
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                         ` <0A383504E0E54C949DEF84405E3AE92F-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-08  0:54                           ` Jason Gunthorpe
@ 2009-10-08 23:33                           ` Or Gerlitz
       [not found]                             ` <15ddcffd0910081633q20d98abfg41a9f4e781e486b1-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 76+ messages in thread
From: Or Gerlitz @ 2009-10-08 23:33 UTC (permalink / raw)
  To: Sean Hefty; +Cc: Jason Gunthorpe, Or Gerlitz, linux-rdma, Roland Dreier

Sean Hefty <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org> wrote:
>>When used over IB, the IP address is little more than a qualifier contained
>>within the IB CM REQ private data.
>
> If we added support for AF_GID/AF_IB to the kernel, the rdma_cm could leave all
> of the private data carried in the IB CM REQ entirely up to the user.  If the
> user happens to format that data to look like the CMA header, so be it.  I
> believe this would allow for a 'clean' implementation of rdma_resolve_addr,
> preserve the ABI, and still allow a library to provide backwards compatibility.

Sean,

So in this design librdmacm will change the user supplied AF_XXX in
the provided sock address and set it to AF_GID/IB, sounds okay.

> Would this approach combined with the ability to set the route work for everyone?

yes, it makes sense.

However, I don't manage to follow on your port space discussion with
Jason. Some apps may have client in user space and server in the
kernel or vice versa. I wouldn't tie PS_IB or the like to ACM. The ACM
ARP replacement protocol will reply only if the ip address specified
in the broadcast request is an ip of this host on that pkey and a port
connected to that fabric, correct?

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                     ` <20091008172120.GX5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-08 23:39                                       ` Or Gerlitz
  0 siblings, 0 replies; 76+ messages in thread
From: Or Gerlitz @ 2009-10-08 23:39 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Sean Hefty, Or Gerlitz, linux-rdma, Roland Dreier

Jason Gunthorpe <jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org> wrote:

> If the listening side continues to use the IP mode to listen then I guess the
> client can compute an appropriate service ID, but it seems a bit
> strange for one side to use IP and the other side to use the ACM
> method? I was imagining you'd configure both sides to use the same  method.

1st, unlike IP, in IB only the active/connecting side does address
resolution, 2nd, the listener may be in the kernel where the active
may be user space, but anyway, ACM is an alternative way to do
destination gid resolution and path query emulation, I don't see what
it has to do with the CM protocol except for keeping things the way
they were in this respect (rdma-cm IP header in the REQ, etc).

I don't see why if someone is resolving address through ACM they
aren't PS_TCP consumers.

Or
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                             ` <15ddcffd0910081633q20d98abfg41a9f4e781e486b1-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2009-10-09  0:24                               ` Sean Hefty
       [not found]                                 ` <859D79BFCA4741F393AABF76BBCA4F7B-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-09  0:33                               ` Jason Gunthorpe
  1 sibling, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-09  0:24 UTC (permalink / raw)
  To: 'Or Gerlitz'
  Cc: Jason Gunthorpe, Or Gerlitz, linux-rdma, Roland Dreier

>However, I don't manage to follow on your port space discussion with
>Jason. Some apps may have client in user space and server in the
>kernel or vise versa. I wouldn't tie PS_IB or a like with ACM. The ACM
>ARP replacement protocol will reply only if the ip address specified
>in the broadcast request is an ip of this host on that pkey and a port
>connected to that fabric, correct?

Nothing in any of this would require the use of a specific method for path
resolution.  The rdma_resolve_route() API does not impose any such limitation.
The problem is that the kernel implementation does.  The purpose of these
patches is to allow user space to perform the resolution using whatever
mechanism it chooses, and convey that to the kernel.  rdma_resolve_addr() is
similar.  Basically treat any mention of ACM in this discussion as an example.

>From the perspective of IB, the RDMA CM simply defines a specific format to
private data and service ID carried in the IB CM REQ.  As long as any use
adheres to that protocol, interoperability won't be an issue.

If an application uses RDMA_PS_TCP, they would expect a 16-bit port number
mapped appropriately to the 64-bit service id.  The concept of adding RDMA_PS_IB
would be to expose the full 64-bit service id.  Essentially, the RDMA CM
interface would become capable of connecting to any IB application.  (I really
haven't thought through the details yet, and the addition of RDMA_PS_IB
shouldn't be part of the initial patch submission.)

As for your ACM specific questions - the ACM responds based on a configuration
file.  The ib_acme utility can create that file using the active IP, pkey, port
information of the system, but the current ACM implementation does not adjust to
dynamic changes or detect misconfigurations or other made up words.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                             ` <15ddcffd0910081633q20d98abfg41a9f4e781e486b1-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2009-10-09  0:24                               ` Sean Hefty
@ 2009-10-09  0:33                               ` Jason Gunthorpe
  1 sibling, 0 replies; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-09  0:33 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: Sean Hefty, Or Gerlitz, linux-rdma, Roland Dreier

On Fri, Oct 09, 2009 at 01:33:56AM +0200, Or Gerlitz wrote:

> However, I don't manage to follow on your port space discussion with
> Jason. Some apps may have client in user space and server in the
> kernel or vise versa. I wouldn't tie PS_IB or a like with ACM. The ACM
> ARP replacement protocol will reply only if the ip address specified
> in the broadcast request is an ip of this host on that pkey and a port
> connected to that fabric, correct?

The PS and sockaddr type are linked together, it makes no sense to use
a PS_TCP and a sockaddr_ib together - sockaddr_ib doesn't have a port
number, or IP address.

So AF_GID/IB must also use PS_IB.

The real complex question is if the scheme should have a transparent
way for something like rdma_get_addr_info to return a description that
uses PS_IB but sets things up to be compatible with a PS_TCP listener.

It would be easy to do this..

I really don't know, bypassing the IP stack so completely is really
not a great idea from an administration standpoint..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found] ` <F0EFC2D8E6A340D48497497670C5969C-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-05 17:45   ` [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping Sean Hefty
  2009-10-05 17:56   ` [PATCH 1/2] rdma/cm: support option to allow manually setting IB path Jason Gunthorpe
@ 2009-10-09 21:48   ` Sean Hefty
       [not found]     ` <A08104C1CF70400F8BEF492AD49C8491-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-09 21:48 UTC (permalink / raw)
  To: Hefty, Sean, linux-rdma, Roland Dreier

Before spending any more time on this patch series, is there any disagreement to
accepting this patch (as is or slightly modified) upstream?

- Sean

>Export rdma_set_ib_paths to user space to allow applications to
>manually set the IB path used for connections.  This allows
>alternative ways for a user space application or library to obtain
>path record information, including retrieving path information
>from cached data, avoiding direct interaction with the IB SA.
>The IB SA is a single, centralized entity that can limit scaling
>on large clusters running MPI applications.
>
>Signed-off-by: Sean Hefty <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
>---
>I'd like to get feedback on this approach with the possibility of merging
>for 2.6.33.
>
> drivers/infiniband/core/ucma.c |   40 ++++++++++++++++++++++++++++++++++++++++
> include/rdma/rdma_user_cm.h    |    7 +++++--
> 2 files changed, 45 insertions(+), 2 deletions(-)
>
>diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
>index 4346a24..1359727 100644
>--- a/drivers/infiniband/core/ucma.c
>+++ b/drivers/infiniband/core/ucma.c
>@@ -42,6 +42,7 @@
> #include <rdma/rdma_user_cm.h>
> #include <rdma/ib_marshall.h>
> #include <rdma/rdma_cm.h>
>+#include <rdma/rdma_cm_ib.h>
>
> MODULE_AUTHOR("Sean Hefty");
> MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
>@@ -811,6 +812,42 @@ static int ucma_set_option_id(struct ucma_context *ctx,
>int optname,
> 	return ret;
> }
>
>+static int ucma_set_ib_path(struct ucma_context *ctx,
>+			    struct ib_user_path_rec *upath, size_t optlen)
>+{
>+	struct ib_sa_path_rec sa_path;
>+	struct rdma_cm_event event;
>+	int ret;
>+
>+	if (optlen != sizeof(*upath))
>+		return -EINVAL;
>+
>+	ib_copy_path_rec_from_user(&sa_path, upath);
>+	ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
>+	if (ret)
>+		return ret;
>+
>+	memset(&event, 0, sizeof event);
>+	event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
>+	return ucma_event_handler(ctx->cm_id, &event);
>+}
>+
>+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
>+			      void *optval, size_t optlen)
>+{
>+	int ret;
>+
>+	switch (optname) {
>+	case RDMA_OPTION_IB_PATH:
>+		ret = ucma_set_ib_path(ctx, optval, optlen);
>+		break;
>+	default:
>+		ret = -ENOSYS;
>+	}
>+
>+	return ret;
>+}
>+
> static int ucma_set_option_level(struct ucma_context *ctx, int level,
> 				 int optname, void *optval, size_t optlen)
> {
>@@ -820,6 +857,9 @@ static int ucma_set_option_level(struct ucma_context *ctx,
>int level,
> 	case RDMA_OPTION_ID:
> 		ret = ucma_set_option_id(ctx, optname, optval, optlen);
> 		break;
>+	case RDMA_OPTION_IB:
>+		ret = ucma_set_option_ib(ctx, optname, optval, optlen);
>+		break;
> 	default:
> 		ret = -ENOSYS;
> 	}
>diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h
>index c557054..d7829f4 100644
>--- a/include/rdma/rdma_user_cm.h
>+++ b/include/rdma/rdma_user_cm.h
>@@ -215,12 +215,15 @@ struct rdma_ucm_event_resp {
>
> /* Option levels */
> enum {
>-	RDMA_OPTION_ID		= 0
>+	RDMA_OPTION_ID		= 0,
>+	RDMA_OPTION_IB		= 1
> };
>
> /* Option details */
> enum {
>-	RDMA_OPTION_ID_TOS	= 0
>+	RDMA_OPTION_ID_TOS	= 0,
>+
>+	RDMA_OPTION_IB_PATH	= 1
> };
>
> struct rdma_ucm_set_option {
>
>


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]     ` <A08104C1CF70400F8BEF492AD49C8491-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-13 13:06       ` Or Gerlitz
       [not found]         ` <4AD47B40.8070800-smomgflXvOZWk0Htik3J/w@public.gmane.org>
  2009-10-20 18:14       ` Jason Gunthorpe
  1 sibling, 1 reply; 76+ messages in thread
From: Or Gerlitz @ 2009-10-13 13:06 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma, Roland Dreier

Sean Hefty wrote:
> Before spending any more time on this patch series, is there any disagreement to
> accepting this patch (as is or slightly modified) upstream?
Hi Sean,

This patch just sets a route to the kernel and has the kernel issue a
route resolved event in return; sounds good to me, I don't see any
problem with merging it upstream.

However, we still have a discussion to continue on the slightly bigger 
picture which is related to how address resolution is "set" to the 
kernel, what port spaces would be supported, etc., and this discussion
somehow gets closer to the ACM design... let's continue with that on the
"rdma/cm: allow user to specify IP to DGID mapping" thread.

Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                 ` <859D79BFCA4741F393AABF76BBCA4F7B-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-20 10:22                                   ` Or Gerlitz
       [not found]                                     ` <4ADD8F5F.3010008-smomgflXvOZWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Or Gerlitz @ 2009-10-20 10:22 UTC (permalink / raw)
  To: Sean Hefty
  Cc: 'Or Gerlitz', Jason Gunthorpe, linux-rdma, Roland Dreier

Sean Hefty wrote:
> From the perspective of IB, the RDMA CM simply defines a specific format to private data and service ID carried in the IB CM REQ.  As long as any use adheres to that protocol, interoperability won't be an issue.
okay, I just wanted to make sure that the whole thing (ACM + modified
librdmacm + modified rdma-cm) is applicable AND inter-operable for
AF_INET / PS_TCP applications.
Looking at the kernel cma.c format_hdr code, it first branches on the address
family and next on the port space.

Going with your proposed flow, I understand that an app call to 
rdma_resolve_addr will be broken down to rdma_bind_addr, ACM resolution 
of the destination GID and then rdma_set_ib_dest, so things should work 
perfect for AF_INET / PS_TCP apps, correct?

The only missing piece here is the route lookup from user space for 
applications not specifying a source address in their rdma_resolve_addr 
invocation, do you still need help to implement that?

> Essentially, the RDMA CM interface would become capable of connecting to any IB application.  (I really haven't thought through the details yet, and the addition of RDMA_PS_IB shouldn't be part of the initial patch submission.)

fair-enough, I just wanted to make sure with you that AF_IB / PS_IB 
aren't tightly coupled with the proposed change and you have clarified that.

> The ACM responds based on a configuration file.  The ib_acme utility can create that file using the active IP, pkey, port information of the system, but the current ACM implementation does not adjust to dynamic changes or detect misconfigurations or other made up words.
I see. Is the new flow of librdmacm going to be under a new API, e.g.
rdma_resolve_addr/route_ext, or the same API optionally talking to ACM
through some IPC if the ACM daemon is running, or something else?

Or.


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]         ` <4AD47B40.8070800-smomgflXvOZWk0Htik3J/w@public.gmane.org>
@ 2009-10-20 10:23           ` Or Gerlitz
       [not found]             ` <4ADD8FAA.902-smomgflXvOZWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Or Gerlitz @ 2009-10-20 10:23 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma

Or Gerlitz wrote:
> sounds good to me, I don't see any problem with merging it upstream.
Hi Sean,

Are you moving forward with these patches to 2.6.33 ?

Or
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]             ` <4ADD8FAA.902-smomgflXvOZWk0Htik3J/w@public.gmane.org>
@ 2009-10-20 15:52               ` Sean Hefty
  0 siblings, 0 replies; 76+ messages in thread
From: Sean Hefty @ 2009-10-20 15:52 UTC (permalink / raw)
  To: 'Or Gerlitz'; +Cc: linux-rdma

>Are you moving forward with these patches to 2.6.33 ?

yes - though I'm trying to keep the changes to the current upstream code minimal

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                     ` <4ADD8F5F.3010008-smomgflXvOZWk0Htik3J/w@public.gmane.org>
@ 2009-10-20 18:08                                       ` Sean Hefty
       [not found]                                         ` <9F76F7CD7B9048E8821A1B05CC5FAFE8-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-20 18:08 UTC (permalink / raw)
  To: 'Or Gerlitz'
  Cc: 'Or Gerlitz', Jason Gunthorpe, linux-rdma, Roland Dreier

>okay, I just wanted to make sure that the whole thing (ACM + modified
>librdmacm + modifed rdma-cm) is applicable AND inter-operable for
>AF_INET / PS_TCP applications.

I do not intend to have any changes that break anything.

>Looking on kernel cma.c format_hdr code it first branches on the address
>family and next of the port space.
>
>Going with your proposed flow, I understand that an app call to
>rdma_resolve_addr will be broken down to rdma_bind_addr, ACM resolution
>of the destination GID and then rdma_set_ib_dest, so things should work
>perfect for AF_INET / PS_TCP apps, correct?

This is my current plan for the kernel:

Export rdma_set_ib_paths to user space.  Submit a patch.  Get it accepted
upstream.  Eat ice cream to celebrate.

Define AF_IB and struct sockaddr_ib (contains a gid and service id).  Update
rdma_bind_addr, rdma_resolve_addr, and rdma_connect to handle AF_IB.

rdma_bind_addr fills in the service id according to the RDMA IP CM service annex.
rdma_resolve_addr just needs to save the GIDs.
rdma_connect will not modify the private data in the CM REQ for AF_IB.

For user space:

Initially, ACM support must be specifically built in and enabled using an
environment variable.  If it's enabled, the librdmacm will convert AF_INET and
pass AF_IB to the kernel.  The librdmacm will fall back to normal operation if
ACM fails.

With AF_IB, the flow would probably be something like: rdma_resolve_addr -> ACM
resolution, ucma_resolve_addr, then rdma_resolve_route -> ucma_set_ib_paths.

>The only missing piece here is the route lookup from user space for
>applications not specifying a source address in their rdma_resolve_addr
>invocation, do you still need help to implement that?

This is still missing, and help would be great.

>> The ACM responds based on a configuration file.  The ib_acme utility can
>create that file using the active IP, pkey, port information of the system, but
>the current ACM implementation does not adjust to dynamic changes or detect
>misconfigurations or other made up words.

>I see. Does the new flow of librdmacm is going to be under new API, eg
>rdma_resolve_addr/route_ext  or the same API optionally talking to ACM
>through some IPC if the ACM daemon is running, or something else?

I hope we can do this under the existing APIs with an optional ability to use
ACM, but I like the idea of adding rdma_getaddrinfo.

- Sean 

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]     ` <A08104C1CF70400F8BEF492AD49C8491-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-13 13:06       ` Or Gerlitz
@ 2009-10-20 18:14       ` Jason Gunthorpe
       [not found]         ` <20091020181458.GD14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  1 sibling, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-20 18:14 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma, Roland Dreier

On Fri, Oct 09, 2009 at 02:48:03PM -0700, Sean Hefty wrote:
> Before spending any more time on this patch series, is there any disagreement to
> accepting this patch (as is or slightly modified) upstream?

Can you please have some way for this to pass APM data and the
reversible GMP path as well? We know this is a problem, let's not
introduce new userspace APIs that further enshrine it..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                         ` <9F76F7CD7B9048E8821A1B05CC5FAFE8-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-20 18:31                                           ` Jason Gunthorpe
       [not found]                                             ` <20091020183132.GE14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2009-10-22 15:41                                           ` Or Gerlitz
  1 sibling, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-20 18:31 UTC (permalink / raw)
  To: Sean Hefty
  Cc: 'Or Gerlitz', 'Or Gerlitz', linux-rdma, Roland Dreier

On Tue, Oct 20, 2009 at 11:08:58AM -0700, Sean Hefty wrote:
> >Looking on kernel cma.c format_hdr code it first branches on the address
> >family and next of the port space.
> >
> >Going with your proposed flow, I understand that an app call to
> >rdma_resolve_addr will be broken down to rdma_bind_addr, ACM resolution
> >of the destination GID and then rdma_set_ib_dest, so things should work
> >perfect for AF_INET / PS_TCP apps, correct?
> 
> This is my current plan for the kernel:
> 
> Export rdma_set_ib_paths to user space.  Submit a patch.  Get it accepted
> upstream.  Eat ice cream to celebrate.
> 
> Define AF_IB and struct sockaddr_ib (contains a gid and service id).  Update
> rdma_bind_addr, rdma_resolve_addr, and rdma_connect to handle AF_IB.
> 
> rdma_bind_addr fills in the service id according RDMA IP CM service
> annex.

Hm? If the sockaddr_ib contains the service ID why override it in the
kernel?

The AF_IB/PS_IB flow in the kernel side must just use straight service
IDs.

This is why I suggested rdma_getaddrinfo - if you specify an address
string that uses IP port notation but rdma_getaddrinfo decides to use
AF_GID/PS_IB then it can create the proper service ID for that port
number and build it into the sockaddr_ib.

Like in IP you'd use rdma_getaddrinfo with hints.flags = AI_PASSIVE to
construct a description for a listening RDMA connection, and it can
construct either a AF_GID with a service ID or AF_IP with the port.

> >> The ACM responds based on a configuration file.  The ib_acme utility can
> >create that file using the active IP, pkey, port information of the system, but
> >the current ACM implementation does not adjust to dynamic changes or detect
> >misconfigurations or other made up words.
> 
> >I see. Does the new flow of librdmacm is going to be under new API, eg
> >rdma_resolve_addr/route_ext  or the same API optionally talking to ACM
> >through some IPC if the ACM daemon is running, or something else?
> 
> I hope we can do this under the existing APIs with an optional ability to use
> ACM, but I like the idea of adding rdma_getaddrinfo.

IMHO, I would much rather see your ACM as part of a rdma_getaddrinfo
API. If you implement the nscd-like unix domain socket then ACM can
completely live outside the main stack as a bolt-on.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]         ` <20091020181458.GD14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-20 18:34           ` Sean Hefty
       [not found]             ` <46770152ACA04B6C8AA9497C45AC8FD0-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-20 18:34 UTC (permalink / raw)
  To: 'Jason Gunthorpe'; +Cc: linux-rdma, Roland Dreier

>Can you please have some way for this to pass APM data and the
>reversible GMP path as well? We know this is a problem, lets not
>introduce new userspace APIs that further enshrine it..

Did you have something specific in mind?

ucma_set_ib_paths should be able to accommodate this; we just need some rules
defined.  More invasive kernel changes are needed to do anything with the extra
paths.

For APM, I'm guessing that you'd like a way to set a new alternate path after
establishing a connection.  ucma_set_ib_paths could still do this based on the
state of the connection.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                             ` <20091020183132.GE14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-20 19:13                                               ` Sean Hefty
       [not found]                                                 ` <A47D2FC6B143436DB87704307B0E715D-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-20 19:13 UTC (permalink / raw)
  To: 'Jason Gunthorpe'
  Cc: 'Or Gerlitz', 'Or Gerlitz', linux-rdma, Roland Dreier

>> rdma_bind_addr fills in the service id according RDMA IP CM service
>> annex.
>
>Hm? If the sockaddr_ib contains the service ID why override it in the
>kernel?
>
>The AF_IB/PS_IB flow in the kernel side must just use straight service
>IDs.

I agree.  But we can still support AF_IB/PS_TCP by simply assigning the service
ID correctly.  rdma_bind_addr only needs to fill in a service id if one is not
given.  This should enable interoperability.  I realize that IB doesn't have the
concept of a service ID for the active side of a connection, but the RDMA IP CM
Service header carries the source port.  And rdma_bind_addr is called by
rdma_resolve_addr, so it needs to be updated. 

Even once PS_IB is added, we need to make sure that it doesn't collide with
PS_TCP at the IB service ID level, or IB will be in the same situation that
iWarp is in with space collisions.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]             ` <46770152ACA04B6C8AA9497C45AC8FD0-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-20 19:14               ` Jason Gunthorpe
       [not found]                 ` <20091020191404.GH14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-20 19:14 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma, Roland Dreier

On Tue, Oct 20, 2009 at 11:34:01AM -0700, Sean Hefty wrote:
> >Can you please have some way for this to pass APM data and the
> >reversible GMP path as well? We know this is a problem, lets not
> >introduce new userspace APIs that further enshrine it..
> 
> Did you have something specific in mind?

Maybe something simple:

struct ibv_kern_path_rec2
{
   u32 flags;
   struct ibv_kern_path_rec rec;
}

(actually it would be really nice if ibv_kern_path_rec could be in
 MAD format not yet again another format)

Input to RDMA_OPTION_IB is an array of ibv_kern_path_rec2

flags is a combination of the following
 - GMP_PRIMARY
 - FORWARD_PRIMARY
 - RETURN_PRIMARY
 - RETURN_PRIMARY_REV
 - GMP_SECONDARY
 - FORWARD_SECONDARY
 - RETURN_SECONDARY
 - RETURN_SECONDARY_REV

The _REV notation indicates the path is stored in reversed format.

Today the kernel only supports up to two paths, with flags:
 GMP_PRIMARY | FORWARD_PRIMARY | RETURN_PRIMARY_REV
 FORWARD_SECONDARY | RETURN_SECONDARY_REV

Future kernels can support up to 6 paths labeled:
 GMP_PRIMARY
 FORWARD_PRIMARY
 RETURN_PRIMARY
 GMP_SECONDARY
 FORWARD_SECONDARY
 RETURN_SECONDARY

The rdma_getaddrinfo resolver would ask the SA for a FORWARD path, if
the result comes back with reversible set then it just passes it to
the kernel as a single:
 GMP_PRIMARY | FORWARD_PRIMARY | RETURN_PRIMARY_REV
Otherwise the resolver does two more queries to get a GMP reversible
path and a return path, and the kernel gets 3 records.

A successful RDMA_OPTION_IB must locate at least GMP_PRIMARY,
FORWARD_PRIMARY(_REV), and RETURN_PRIMARY(_REV) paths in the included
description. Kernel searches in order.

The kernel supported capabilities should be viewable from a sysfs
location. When the kernel learns to do GMP_PRIMARY and RETURN_PRIMARY
standalone, then userspace should be able to know that prior to
constructing the array. (ie once the kernel learns to do that then the
resolver should not ask the SA for a reversible FORWARD path.)

But even so, this resolver should be able to construct this data blob:
 FORWARD_PRIMARY
 RETURN_PRIMARY
 GMP_PRIMARY | FORWARD_PRIMARY | RETURN_PRIMARY_REV

Current kernels will ignore the first two flag sets (does not
understand that combination) and fall through to the last one. Someday
new kernels will pick up the FORWARD/RETURN paths from the earlier two
records and ignore the latter FORWARD_PRIMARY | RETURN_PRIMARY_REV
flag.

This lets new path types be added in the future too, using the same basic
scheme.

This would be the same format returned by a rdma_getaddrinfo call.

> ucma_set_ib_paths should be able to accommodate this; we just need
> some rules defined.  More invasive kernel changes are needed to do
> anything with the extra paths.

Yes, it isn't something that needs to be done right away, but having
the API means that someone could do the kernel work someday. As I
would see this working an opaque channel from the rdma_getaddrinfo
call to the kernel must be provided for this data to flow.

Passing a new SECONDARY/PRIMARY path through RDMA_OPTION_IB seems
reasonable to me.

> For APM, I'm guessing that you'd like a way to set a new alternate
> path after establishing a connection.  ucma_set_ib_paths could still
> do this based on the state of the connection.

Yes, that would be necessary to obsolete the IB UCM API.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                                 ` <A47D2FC6B143436DB87704307B0E715D-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-20 19:18                                                   ` Jason Gunthorpe
       [not found]                                                     ` <20091020191821.GI14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-20 19:18 UTC (permalink / raw)
  To: Sean Hefty
  Cc: 'Or Gerlitz', 'Or Gerlitz', linux-rdma, Roland Dreier

On Tue, Oct 20, 2009 at 12:13:03PM -0700, Sean Hefty wrote:
> >> rdma_bind_addr fills in the service id according RDMA IP CM service
> >> annex.
> >
> >Hm? If the sockaddr_ib contains the service ID why override it in the
> >kernel?
> >
> >The AF_IB/PS_IB flow in the kernel side must just use straight service
> >IDs.
> 
> I agree.  But we can still support AF_IB/PS_TCP by simply assigning
> the service ID correctly.  rdma_bind_addr only needs to fill in a
> service id if one is not given.  This should enable

'one is not given' == 0 service ID?

Is there any reason to ever use AF_IB/PS_TCP on the listening side?
What functional difference are you imagining compared to listening on
AF_INET/PS_TCP?

> interoperability.  I realize that IB doesn't have the concept of a
> service ID for the active side of a connection, but the RDMA IP CM
> Service header carries the source port.  And rdma_bind_addr is
> called by rdma_resolve_addr, so it needs to be updated.

Hmm, yes, messy..

> Even once PS_IB is added, we need to make sure that it doesn't collide with
> PS_TCP at the IB service ID level, or IB will be in the same situation that
> iWarp is in with space collisions.

I thought there was already a single service ID registration list that was
shared between RDMA CM and IB CM?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                                     ` <20091020191821.GI14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-20 20:05                                                       ` Sean Hefty
       [not found]                                                         ` <8D09997BDBC5482C86EAD338F19C8030-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-20 20:05 UTC (permalink / raw)
  To: 'Jason Gunthorpe'
  Cc: 'Or Gerlitz', 'Or Gerlitz', linux-rdma, Roland Dreier

>> I agree.  But we can still support AF_IB/PS_TCP by simply assigning
>> the service ID correctly.  rdma_bind_addr only needs to fill in a
>> service id if one is not given.  This should enable
>
>'one is not given' == 0 service ID?

yes

I'm guessing PS_IB would need to assign from the IB CM assigned range starting
at 0x020::0 to avoid conflicts.

>Is there any reason to ever use AF_IB/PS_TCP on the listening side?

I don't think so.  I was thinking of this combination on the active side in
order to get the port number to place in the private data.

>What functional difference are you imagining compared to listening on
>AF_INET/PS_TCP?

I hadn't thought about this, but making stuff up, I guess this could allow using
the rdma_cm without ipoib running...?  From IB's perspective, PS_TCP is just a
specific subset of the service ID range.

>> Even once PS_IB is added, we need to make sure that it doesn't collide with
>> PS_TCP at the IB service ID level, or IB will be in the same situation that
>> iWarp is in with space collisions.
>
>I thought there was already a single service ID registration list that was
>shared between RDMA CM and IB CM?

Not really - the rdma cm constructs the service id, and the ib cm checks that
it's usable, but not until listen is called.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                                         ` <8D09997BDBC5482C86EAD338F19C8030-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-20 20:29                                                           ` Jason Gunthorpe
       [not found]                                                             ` <20091020202902.GJ14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-20 20:29 UTC (permalink / raw)
  To: Sean Hefty
  Cc: 'Or Gerlitz', 'Or Gerlitz', linux-rdma, Roland Dreier

On Tue, Oct 20, 2009 at 01:05:15PM -0700, Sean Hefty wrote:
> >> I agree.  But we can still support AF_IB/PS_TCP by simply assigning
> >> the service ID correctly.  rdma_bind_addr only needs to fill in a
> >> service id if one is not given.  This should enable
> >
> >'one is not given' == 0 service ID?
> 
> yes
> 
> I'm guessing PS_IB would need to assign from the IB CM assigned range starting
> at 0x020::0 to avoid conflicts.

Something like that..

Ok, this seems sensible to me. So to recap the main differences:

Private data:
- AF_IB/PS_TCP - the kernel munges the private data to be compatible
  with AF_INET/PS_TCP, but otherwise is the same.
- AF_IB/PS_IB - the kernel doesn't touch the private data.

Service ID for bind or connect:
 - AF_IB/PS_TCP - the service ID passed in through the sockaddr_ib
   must be 0 or correctly formed as a IP RDMA CM service ID indicating
   a port number. This is true in the bind or connect case.
 - AF_IB/PS_IB - the service ID is just a service ID. No restrictions
   or alteration are done. The bind'd service ID is ignored on connect

0 Service ID on bind:
 - AF_IB/PS_TCP - a new random unused port is allocated and converted to a
   service ID
 - AF_IB/PS_IB - a new random unused service ID within the proper prefix is
   allocated (useful!)

What about the service_mask feature of IB CM?

How are the IP source and dest IPs going to be picked in for PS_TCP
mode?

I guess user space does that and passes it through to the kernel?
Another bit of binary blob data from rdma_getaddrinfo I guess. I
suppose rdma_getaddrinfo could return an array of option blobs that
must be copied to the kernel to setup the connection, or something
like that.

> >What functional difference are you imagining compared to listening on
> >AF_INET/PS_TCP?
> 
> I hadn't thought about this, but making stuff up, I guess this could allow using
> the rdma_cm without ipoib running...?  From IB's perspective, PS_TCP is just a
> specific subset of the service ID range.

Plus the IP addies in the REQ private data.. 

I think it would be fine to EINVAL on AF_IB/PS_TCP listen().

> >> Even once PS_IB is added, we need to make sure that it doesn't collide with
> >> PS_TCP at the IB service ID level, or IB will be in the same situation that
> >> iWarp is in with space collisions.
> >
> >I thought there was already a single service ID registration list that was
> >shared between RDMA CM and IB CM?
> 
> Not really - the rdma cm constructs the service id, and the ib cm checks that
> it's usable, but not until listen is called.

Oh I see, well, a 'bind' api into the IB CM should take care of that? 

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                                             ` <20091020202902.GJ14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-20 20:48                                                               ` Sean Hefty
       [not found]                                                                 ` <B7BCBF813BF447B28330C2DB8F1437D6-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-20 20:48 UTC (permalink / raw)
  To: 'Jason Gunthorpe'
  Cc: 'Or Gerlitz', 'Or Gerlitz', linux-rdma, Roland Dreier

>Private data:
>- AF_IB/PS_TCP - the kernel munges the private data to be compatible
>  with AF_INET/PS_TCP, but otherwise is the same.
>- AF_IB/PS_IB - the kernel doesn't touch the private data.

I was thinking AF_IB/* - kernel doesn't touch the private data, as it lacks the
necessary information.

>What about the service_mask feature of IB CM?

not sure - Is it needed in user space?

>How are the IP source and dest IPs going to be picked in for PS_TCP
>mode?
>
>I guess user space does that and passes it through to the kernel?

It could pass the private data when calling rdma_connect.

>I think it would be fine to EINVAL on AF_IB/PS_TCP listen().

I'm good with that, but want to give this some thought.

>Oh I see, well, a 'bind' api into the IB CM should take care of that?

I think so.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                                                 ` <B7BCBF813BF447B28330C2DB8F1437D6-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-20 21:30                                                                   ` Jason Gunthorpe
  0 siblings, 0 replies; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-20 21:30 UTC (permalink / raw)
  To: Sean Hefty
  Cc: 'Or Gerlitz', 'Or Gerlitz', linux-rdma, Roland Dreier

On Tue, Oct 20, 2009 at 01:48:34PM -0700, Sean Hefty wrote:
> >Private data:
> >- AF_IB/PS_TCP - the kernel munges the private data to be compatible
> >  with AF_INET/PS_TCP, but otherwise is the same.
> >- AF_IB/PS_IB - the kernel doesn't touch the private data.
> 
> I was thinking AF_IB/* - kernel doesn't touch the private data, as
> it lacks the necessary information.

That was my first thinking as well.. 

If you want to go that way then I suggest completely ditching
AF_IB/PS_TCP and extend PS_IB to include the service ID mask.

Basically, in PS_IB, you get to specify a service ID and a service ID
mask. When you bind the kernel keeps the unmasked bits and computes
masked bits.

Presenting a service ID + mask in the IP RDMA CM service ID format
will cause the kernel to allocate a 'port'.

This scheme is then also re-usable for other things, like IBTA defined
SDP port service ID allocation, and 'Local OS Administered Service IDs'

Basically, we just deal with the port problem as a sub-case of the
general service ID allocation problem.

The IB CM learns how to do this and the RDMA CM AF_INET/PS_TCP just
does exactly the above inside the kernel to avoid collision problems.
(IB CM just does while (!exists((rand() & ~mask | serviceID))) to
 choose an appropriate random unused service ID.)

Actually, this is pretty nice - probably should do it no matter what
for AF_IB/PS_IB

Could we get rid of PS_SDP too like this?

> >What about the service_mask feature of IB CM?
> 
> not sure - Is it needed in user space?

I don't have a clear idea how the service mask was intended to
be used to comment much on this. But it seems useful for the above.

> >How are the IP source and dest IPs going to be picked in for PS_TCP
> >mode?
> >
> >I guess user space does that and passes it through to the kernel?
> 
> It could pass the private data when calling rdma_connect.

Well.. The main problem I see with this is that it does not fit very
well into a rdma_getaddrinfo model. rdma_getaddrinfo should not
allocate any resources - but it should provide the private data
prefix (if any).

So.. the private data is either constructed in the kernel, or in
librdmacm. Both have different issues, librdmacm needs to retrieve a
source port from the kernel, or the kernel needs to get the IP data
from userspace. Both troublesome.

Can we just do away with the source port? Set it to 1234 or rand() or
something?  If this is only going to be used for ACM I'd be inclined
to do this because it is nice and simplifying.

The source port is pretty useless anyhow, the only app level purpose it
ever served in IP was to determine if a remote process might be root
by having it present a < 1024 source port. But that usage is extremely
rare (and broken with IB, nothing enforces this).

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]                 ` <20091020191404.GH14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-22  0:14                   ` Sean Hefty
       [not found]                     ` <9DFD8E65325F4EE990749EEBE4BC33CA-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-22  0:14 UTC (permalink / raw)
  To: 'Jason Gunthorpe'; +Cc: linux-rdma

>Maybe something simple:
>
>struct ibv_kern_path_rec2
>{
>   u32 flags;
>   struct ibv_kern_path_rec rec;
>}

This is more as an RFC:

Looking at ib_user_path_rec / ibv_kern_path_rec, we could just make use of the
existing bits that are available.  We have 2 32-bit fields that are only used to
record a single bit of data, the gids, plus numb_path and preference fields.

We should be able to determine if a path is forward, reverse, or both by looking
at the sgid and the reversible bit.  Preference (flags) could be used to
indicate if a path is primary, alternate, or for the CM only.  With the
exception of the marking a path for the CM, the use of the fields in this manner
seems somewhat natural to me. 


--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]                     ` <9DFD8E65325F4EE990749EEBE4BC33CA-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-22  0:42                       ` Jason Gunthorpe
       [not found]                         ` <20091022004245.GV14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-22  0:42 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma

On Wed, Oct 21, 2009 at 05:14:59PM -0700, Sean Hefty wrote:
> >Maybe something simple:
> >
> >struct ibv_kern_path_rec2
> >{
> >   u32 flags;
> >   struct ibv_kern_path_rec rec;
> >}
> 
> This is more as an RFC:
> 
> Looking at ib_user_path_rec / ibv_kern_path_rec, we could just make
> use of the existing bits that are available.  We have 2 32-bit
> fields that are only used to record a single bit of data, the gids,
> plus numb_path and preference fields.
> 
> We should be able to determine if a path is forward, reverse, or
> both by looking at the sgid and the reversible bit.  Preference
> (flags) could be used to indicate if a path is primary, alternate,
> or for the CM only.  With the exception of the marking a path for
> the CM, the use of the fields in this manner seems somewhat natural
> to me.

I'm reluctant to override fields like this to save 4 bytes. The
clarity and extensibility of using an additional flags field seems
worth it to me, and the processing code is not complex. I cannot think
of a motivation to save the 4 bytes?

As I said, in many ways I would actually much prefer it if the
structure was:

struct ibv_kern_path_rec2
{
    u32 flags;
    u32 pr[512/8/4];
}

Where 'pr' is simply the 64 bytes of data directly from the MAD. The
kernel already has parsing code for it. As it is with the libibcm
things are kind of insane, first we decode the PR mad into an
ibv_path_rec, then convert that into an ibv_kern_path_rec, which is
unpacked into various other structures anyhow. Ik.

The main advantage of the above is that the resolver function can
directly pass the bytes from the SM into the kernel, so if the spec is
extended the path to flow the extension from the SM to the kernel is
already in place.

It is actually very easy to process 'PR', first do this:
for (unsigned int I = 0; I != 16; I++)
   pr[I] = be_to_cpu32(pr[I]);

Then cast to this structure:

#if __BYTE_ORDER == __LITTLE_ENDIAN
struct SAPathRecord {
    uint32_t rsv0;

    uint32_t rsv1;

    uint32_t DGID[4];

    uint32_t SGID[4];

    uint16_t SLID;
    uint16_t DLID;

    uint32_t hopLimit:8;
    uint32_t flowLabel:20;
    uint32_t rsv2:3;
    uint32_t rawTraffic:1;

    uint16_t PKey;
    uint8_t numbPath:7;
    uint8_t reversible:1;
    uint8_t TClass;

    uint8_t rate:6;
    uint8_t rateSelector:2;
    uint8_t MTU:6;
    uint8_t MTUSelector:2;
    uint16_t SL:4;
    uint16_t rsv3:12;

    uint16_t rsv4;
    uint8_t preference;
    uint8_t packetLifeTime:6;
    uint8_t packetLifeTimeSelector:2;

    uint32_t rsv5;
};
#else
struct SAPathRecord {
    uint32_t rsv0;

    uint32_t rsv1;

    uint32_t DGID[4];

    uint32_t SGID[4];

    uint16_t DLID;
    uint16_t SLID;

    uint32_t rawTraffic:1;
    uint32_t rsv2:3;
    uint32_t flowLabel:20;
    uint32_t hopLimit:8;

    uint8_t TClass;
    uint8_t reversible:1;
    uint8_t numbPath:7;
    uint16_t PKey;

    uint16_t rsv3:12;
    uint16_t SL:4;
    uint8_t MTUSelector:2;
    uint8_t MTU:6;
    uint8_t rateSelector:2;
    uint8_t rate:6;

    uint8_t packetLifeTimeSelector:2;
    uint8_t packetLifeTime:6;
    uint8_t preference;
    uint16_t rsv4;

    uint32_t rsv5;
};
#endif

Fin. Super simple. The horror that the kernel uses now is silly code
bloatery :)

Pretty much the same technique that is used for the IP/TCP header in
certain places in the kernel.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]                         ` <20091022004245.GV14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-22  1:07                           ` Sean Hefty
       [not found]                             ` <AE35305D45DB49F591A45DADD822209A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-22  1:07 UTC (permalink / raw)
  To: 'Jason Gunthorpe'; +Cc: linux-rdma

>I'm reluctant to override fields like this to save 4 bytes. The
>clarity and extensibility of using an additional flags field seems
>worth it to me, and the processing code is not complex. I cannot think
>of a motivation to save the 4 bytes?

I wasn't thinking about saving the 4 bytes.  struct ib_user_path_rec is already
part of the ABI for query route.  If the preference were used to indicate
primary versus alternate path, which seems reasonable, then query route can more
easily convey that information.

Preference is implementation defined and meaningless once a path has been
selected, so I really don't see any issue in using it however we want.  I
thought about using it to convey all of the flags, but it doesn't give much room
for expansion.  That led me to consider using the sgid and reversible bit to
determine if a path is outbound or inbound.

>As I said, in many ways I would actually much prefer it if the
>structure was:
>
>struct ibv_kern_path_rec2
>{
>    u32 flags;
>    u32 pr[512/8/4];
>}

I would have preferred using MAD definitions directly everywhere myself, but I
don't think it's worth breaking the ABI to change this.  struct ib_user_path_rec
is already in use by the ib_cm and rdma_cm at this point.  (The rdma_cm simply
copied its use by the ib_cm.)

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 1/2] rdma/cm: support option to allow manually setting IB path
       [not found]                             ` <AE35305D45DB49F591A45DADD822209A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-22  1:35                               ` Jason Gunthorpe
       [not found]                                 ` <20091022013542.GX14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-22  1:35 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma

On Wed, Oct 21, 2009 at 06:07:54PM -0700, Sean Hefty wrote:
> >I'm reluctant to override fields like this to save 4 bytes. The
> >clarity and extensibility of using an additional flags field seems
> >worth it to me, and the processing code is not complex. I cannot think
> >of a motivation to save the 4 bytes?
> 
> I wasn't thinking about saving the 4 bytes.  struct ib_user_path_rec is already
> part of the ABI for query route.  If the preference were used to indicate
> primary versus alternate path, which seems reasonable, then query route can more
> easily convey that information.

I'm not sure why we need to conflate these two functions..

But ucma_abi_query_route_resp is already broken as soon as the kernel
can support more than 2 paths on a QP. The entire point of the flags
scheme is to support more than 2 paths in future.

So ucma_abi_query_route_resp has to be churned in the future, and at
that time it can be synch'd to use the same structure as the path
update, and ibv_kern_path_rec can go away.

In light of that, I'd say pick the flag + raw PR method. That clearly
has the best long term API.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v2] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                 ` <20091022013542.GX14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-22  8:10                                   ` Sean Hefty
       [not found]                                     ` <B7E97540810E4A2785FF1FC8CB96F453-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-22  8:10 UTC (permalink / raw)
  To: 'Jason Gunthorpe', linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

Export rdma_set_ib_paths to user space to allow applications to
manually set the IB path used for connections.  This allows
alternative ways for a user space application or library to obtain
path record information, including retrieving path information
from cached data, avoiding direct interaction with the IB SA.
The IB SA is a single, centralized entity that can limit scaling
on large clusters running MPI applications.

Signed-off-by: Sean Hefty <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
Changes from v1:
Use MAD attribute structure format for path record data.
Add flags to indicate how a path should be used.  This allows separate
forward and reverse paths, and could support APM.

Patch is compile tested only.

 drivers/infiniband/core/sa_query.c |    6 +++++
 drivers/infiniband/core/ucma.c     |   44 ++++++++++++++++++++++++++++++++++++
 include/rdma/ib_sa.h               |    6 +++++
 include/rdma/ib_user_sa.h          |   14 +++++++++++
 include/rdma/rdma_user_cm.h        |    7 ++++--
 5 files changed, 75 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 1865049..2e73dcc 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -604,6 +604,12 @@ retry:
 	return ret ? ret : id;
 }
 
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec)
+{
+	ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
 static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
 				    int status,
 				    struct ib_sa_mad *mad)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 4346a24..996a521 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -42,6 +42,7 @@
 #include <rdma/rdma_user_cm.h>
 #include <rdma/ib_marshall.h>
 #include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
@@ -811,6 +812,46 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
 	return ret;
 }
 
+static int ucma_set_ib_path(struct ucma_context *ctx,
+			    struct ib_path_rec_data *path_data, size_t optlen)
+{
+	struct ib_sa_path_rec sa_path;
+	struct rdma_cm_event event;
+	int ret;
+
+	if (optlen != sizeof(*path_data))
+		return -EINVAL;
+
+	if (path_data->flags != IB_PATH_GMP | IB_PATH_PRIMARY |
+				IB_PATH_OUTBOUND | IB_PATH_INBOUND)
+		return -EINVAL;
+
+	ib_sa_unpack_path(path_data->path_rec, &sa_path);
+	ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
+	if (ret)
+		return ret;
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	return ucma_event_handler(ctx->cm_id, &event);
+}
+
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	int ret;
+
+	switch (optname) {
+	case RDMA_OPTION_IB_PATH:
+		ret = ucma_set_ib_path(ctx, optval, optlen);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
 static int ucma_set_option_level(struct ucma_context *ctx, int level,
 				 int optname, void *optval, size_t optlen)
 {
@@ -820,6 +861,9 @@ static int ucma_set_option_level(struct ucma_context *ctx, int level,
 	case RDMA_OPTION_ID:
 		ret = ucma_set_option_id(ctx, optname, optval, optlen);
 		break;
+	case RDMA_OPTION_IB:
+		ret = ucma_set_option_ib(ctx, optname, optval, optlen);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 3841c1a..1082afa 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -379,4 +379,10 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
 			 struct ib_sa_path_rec *rec,
 			 struct ib_ah_attr *ah_attr);
 
+/**
+ * ib_sa_unpack_path - Convert a path record from MAD format to struct
+ * ib_sa_path_rec.
+ */
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec);
+
 #endif /* IB_SA_H */
diff --git a/include/rdma/ib_user_sa.h b/include/rdma/ib_user_sa.h
index 6591201..c2c2504 100644
--- a/include/rdma/ib_user_sa.h
+++ b/include/rdma/ib_user_sa.h
@@ -35,6 +35,20 @@
 
 #include <linux/types.h>
 
+enum {
+	IB_PATH_GMP	  = 1,
+	IB_PATH_PRIMARY   = (1<<1),
+	IB_PATH_ALTERNATE = (1<<2),
+	IB_PATH_OUTBOUND  = (1<<3),
+	IB_PATH_INBOUND   = (1<<4)
+};
+
+struct ib_path_rec_data {
+	__u32	flags;
+	__u32	reserved;
+	__u32	path_rec[16];
+};
+
 struct ib_user_path_rec {
 	__u8	dgid[16];
 	__u8	sgid[16];
diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h
index c557054..d7829f4 100644
--- a/include/rdma/rdma_user_cm.h
+++ b/include/rdma/rdma_user_cm.h
@@ -215,12 +215,15 @@ struct rdma_ucm_event_resp {
 
 /* Option levels */
 enum {
-	RDMA_OPTION_ID		= 0
+	RDMA_OPTION_ID		= 0,
+	RDMA_OPTION_IB		= 1
 };
 
 /* Option details */
 enum {
-	RDMA_OPTION_ID_TOS	= 0
+	RDMA_OPTION_ID_TOS	= 0,
+
+	RDMA_OPTION_IB_PATH	= 1
 };
 
 struct rdma_ucm_set_option {



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                         ` <9F76F7CD7B9048E8821A1B05CC5FAFE8-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-20 18:31                                           ` Jason Gunthorpe
@ 2009-10-22 15:41                                           ` Or Gerlitz
       [not found]                                             ` <4AE07D41.7040300-smomgflXvOZWk0Htik3J/w@public.gmane.org>
  1 sibling, 1 reply; 76+ messages in thread
From: Or Gerlitz @ 2009-10-22 15:41 UTC (permalink / raw)
  To: Sean Hefty; +Cc: Jason Gunthorpe, linux-rdma, Roland Dreier

Sean Hefty wrote:
>> okay, I just wanted to make sure that the whole thing (ACM + modified librdmacm + modifed rdma-cm) is applicable AND inter-operable for AF_INET / PS_TCP applications
> I do not intend to have any changes that break anything
my question went beyond whether things are going to be broken (they 
aren't as you said), but rather whether ACM is going to be 
***applicable*** for AF_INET/PS_TCP applications. From your reply and 
the discussion that followed between you and Jason, I got the impression 
that the answer is "not really" b/c if for example the server side 
thinks it would be getting the IP address of the connecting side in the 
REQ private header, once this REQ was sent in the flow of AF_INET which 
was converted to AF_IB, this is not going to happen. Moreover, if the 
SID constructed by AF_INET / PS_TCP call to rdma_resolve_address which 
uses the librdmacm-ACM flow wouldn't match the SID constructed in the 
passive side which didn't use this flow (e.g user --> kernel or kernel 
--> user app), the REQ wouldn't be getting anywhere and be rejected by 
the CM on the passive side :(

>> Going with your proposed flow, I understand that an app call to rdma_resolve_addr will be broken down to rdma_bind_addr, ACM resolution of the destination GID and then rdma_set_ib_dest, so things should work perfect for AF_INET / PS_TCP apps, correct?
> This is my current plan for the kernel: export rdma_set_ib_paths to user space.  Submit a patch.  Get it accepted upstream.  Eat ice cream to celebrate.
again, rdma_set_ib_path for itself is quite innocent... as I wrote you 
couple of days ago, it can be merged anytime, the big thing is the bind 
/ address resolution modified flow which affects the connect/listen, 
etc.  So just for this patch, I would go on a small size ice-cream, 
where once the design for the bigger picture is in place, go for a pint...

> Define AF_IB and struct sockaddr_ib (contains a gid and service id).  Update rdma_bind_addr, rdma_resolve_addr, and rdma_connect to handle AF_IB. rdma_bind_addr fills in the sid according RDMA IP CM service annex. rdma_resolve_addr just needs to save the GIDs. rdma_connect will not modify the private data in the CM REQ for AF_IB.
I really tried to follow the thread between you and Jason with quite 
little success, and I am going to give it more tries... in parallel, 
could you help me understand what is the --drive/reasoning-- from your 
perspective to add AF_IB / PS_IB here?

I believe that the suggestion I brought of: converting rdma_resolve_addr 
with null src addr to route lookup and following that rdma_bind_addr, 
with a similar/same flow for rdma_resolve_addr with src address, next do 
the ACM dgid resolution, call the rdma_set_dgid call. Would allow to 
serve AF_INET / PS_TCP with ACM.

If from other reasons, people want the rdma-cm to support AF_IB and/or 
PS_IB, we can do that as well, but why force doing that behind the cover 
each time ACM is used?!

Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                             ` <4AE07D41.7040300-smomgflXvOZWk0Htik3J/w@public.gmane.org>
@ 2009-10-22 16:39                                               ` Jason Gunthorpe
       [not found]                                                 ` <20091022163904.GC26003-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2009-10-22 19:14                                               ` Sean Hefty
  1 sibling, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-22 16:39 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: Sean Hefty, linux-rdma, Roland Dreier

On Thu, Oct 22, 2009 at 05:41:53PM +0200, Or Gerlitz wrote:

> If from other reasons, people want the rdma-cm to support AF_IB and/or 
> PS_IB, we can do that as well, but why force doing that behind the cover 
> each time ACM is used?!

My view is that ultimately ACM has at best a niche application. The
various limitations with the approach are not suitable for many cases.
I also don't think it would be the final word on this subject.

So why not have a more general, flexible approach? Isolating ACM from
librdmacm by using AF_IB is a good idea, it keeps them separate and
lets ACM and future go where ever.

I hope Sean can make it work with the rdma_getaddrinfo idea, that would
completely separate ACM and librdmacm.

Attempting to bake it into AF_INET means that librdmacm, possibly the
kernel and maybe even the apps need to be contaminated with ACM
specific code, and that just doesn't seem desirable to me. What
happens when someone invents BCM or CCM? More mess.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v2] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                     ` <B7E97540810E4A2785FF1FC8CB96F453-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-22 16:54                                       ` Jason Gunthorpe
       [not found]                                         ` <20091022165414.GH26003-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  2009-10-22 20:26                                       ` [PATCH v3] " Sean Hefty
  1 sibling, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-22 16:54 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

On Thu, Oct 22, 2009 at 01:10:09AM -0700, Sean Hefty wrote:

> +static int ucma_set_ib_path(struct ucma_context *ctx,
> +			    struct ib_path_rec_data *path_data, size_t optlen)
> +{
> +	struct ib_sa_path_rec sa_path;
> +	struct rdma_cm_event event;
> +	int ret;
> +
> +	if (optlen != sizeof(*path_data))
> +		return -EINVAL;
> +
> +	if (path_data->flags != IB_PATH_GMP | IB_PATH_PRIMARY |
> +				IB_PATH_OUTBOUND | IB_PATH_INBOUND)
> +		return -EINVAL;

This should accept an array here, to aid easing in APM support:

if ((optlen % sizeof(*path_data)) != 0) return -EINVAL;

for (; optlen != 0; optlen -= sizeof(*path_data), path_data++)
   if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |                                  
                            IB_PATH_OUTBOUND | IB_PATH_INBOUND))
       break;

if (optlen == 0) return -EINVAL;

Could you do basic APM support right now, exactly with the same
capability as ucm has?

> +enum {
> +	IB_PATH_GMP	  = 1,
> +	IB_PATH_PRIMARY   = (1<<1),
> +	IB_PATH_ALTERNATE = (1<<2),
> +	IB_PATH_OUTBOUND  = (1<<3),
> +	IB_PATH_INBOUND   = (1<<4)
> +};

I like the PATH_PRIMARY/PATH_ALTERNATE idea,

But I think IB_PATH_OUTBOUND_REV is still required.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH v2] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                         ` <20091022165414.GH26003-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-22 17:52                                           ` Sean Hefty
       [not found]                                             ` <1438C87E89284364A56E08A40DFE199E-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-22 17:52 UTC (permalink / raw)
  To: 'Jason Gunthorpe'; +Cc: linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

>> +static int ucma_set_ib_path(struct ucma_context *ctx,
>> +			    struct ib_path_rec_data *path_data, size_t optlen)
>> +{
>> +	struct ib_sa_path_rec sa_path;
>> +	struct rdma_cm_event event;
>> +	int ret;
>> +
>> +	if (optlen != sizeof(*path_data))
>> +		return -EINVAL;
>> +
>> +	if (path_data->flags != IB_PATH_GMP | IB_PATH_PRIMARY |
>> +				IB_PATH_OUTBOUND | IB_PATH_INBOUND)
>> +		return -EINVAL;
>
>This should accept an array here, to aid easing in APM support:

I agree - I thought about this after sending the patch.  This is more what I
think is needed at this point:

if ((optlen % sizeof(*path_data)) != 0) return -EINVAL;

if (number of paths > 1)
	return -ENOSYS;

if (path_data->flags != (IB_PATH_GMP | IB_PATH_PRIMARY |
                            IB_PATH_OUTBOUND | IB_PATH_INBOUND))

	return -EINVAL;

>Could you do basic APM support right now, exactly with the same
>capability as ucm has?

The rdma_cm is not coded to handle APM.  That should be a separate patch.

>
>> +enum {
>> +	IB_PATH_GMP	  = 1,
>> +	IB_PATH_PRIMARY   = (1<<1),
>> +	IB_PATH_ALTERNATE = (1<<2),
>> +	IB_PATH_OUTBOUND  = (1<<3),
>> +	IB_PATH_INBOUND   = (1<<4)
>> +};
>
>I like the PATH_PRIMARY/PATH_ALTERNATE idea,
>
>But I think IB_PATH_OUTBOUND_REV is still required.

I'm not fond of the reverse idea.  Why do you think it's needed?

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v2] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                             ` <1438C87E89284364A56E08A40DFE199E-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-22 18:11                                               ` Jason Gunthorpe
       [not found]                                                 ` <20091022181101.GY14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-22 18:11 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

On Thu, Oct 22, 2009 at 10:52:13AM -0700, Sean Hefty wrote:
> >> +static int ucma_set_ib_path(struct ucma_context *ctx,
> >> +			    struct ib_path_rec_data *path_data, size_t optlen)
> >> +{
> >> +	struct ib_sa_path_rec sa_path;
> >> +	struct rdma_cm_event event;
> >> +	int ret;
> >> +
> >> +	if (optlen != sizeof(*path_data))
> >> +		return -EINVAL;
> >> +
> >> +	if (path_data->flags != IB_PATH_GMP | IB_PATH_PRIMARY |
> >> +				IB_PATH_OUTBOUND | IB_PATH_INBOUND)
> >> +		return -EINVAL;
> >
> >This should accept an array here, to aid easing in APM support:
> 
> I agree - I thought about this after sending the patch.  This is more what I
> think is needed at this point:
> 
> if ((optlen % sizeof(*path_data)) != 0) return -EINVAL;
> 
> if (number of paths > 1)
> 	return -ENOSYS;

This is really no different than what you had at first.

My idea was to have the kernel search the array for the entries it
needs/supports. First one found wins. This provides future
API compatibility.

Some future userspace, to support some future kernel APM, would pass in 2
entries:
 IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_OUTBOUND | IB_PATH_INBOUND
 IB_PATH_GMP | IB_PATH_ALTERNATE | IB_PATH_OUTBOUND | IB_PATH_INBOUND

This shouldn't break old kernels.

Again, some even later kernel would support 6 entries, and future user
space would send this:
 IB_PATH_PRIMARY | IB_PATH_INBOUND
 IB_PATH_PRIMARY | IB_PATH_OUTBOUND
 IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_OUTBOUND | IB_PATH_INBOUND
 IB_PATH_ALTERNATE | IB_PATH_INBOUND
 IB_PATH_ALTERNATE | IB_PATH_OUTBOUND
 IB_PATH_GMP | IB_PATH_ALTERNATE | IB_PATH_OUTBOUND | IB_PATH_INBOUND

This shouldn't break old kernels either.

Maybe it is worth returning a positive integer from the syscall
indicating the number of paths the kernel accepted. Although I suspect
most use models of this will ignore that.

> >I like the PATH_PRIMARY/PATH_ALTERNATE idea,
> >
> >But I think IB_PATH_OUTBOUND_REV is still required.
> 
> I'm not fond of the reverse idea.  Why do you think it's needed?

Alignment with the SM. (Actually, it would be IB_PATH_INBOUND_REV) 

You have a PR query where SGID=local, DGID=remote that gives you
IB_PATH_OUTBOUND. Then you do a query with SGID=remote, DGID=local,
that gives you IB_PATH_INBOUND. ie the SGID is different in each
result. This is necessary, non-reversible paths are uni-directional.

If you use a reversible path then the IB_PATH_OUTBOUND == IB_PATH_GMP
== IB_PATH_INBOUND_REV (ie the inbound side is always the one that has
SGID/DGID direction swapped)

Since this is going with the raw PR from the SA, I think it is wise to
not require any modification of the PR as it flows from the SA to the
kernel. Thus there are two PR representations for IB_PATH_INBOUND -
one that has SGID=local (_REV) and one that has SGID=remote.

So either IB_PATH_INBOUND_REV is explicitly part of the flags
structure, or IB_PATH_INBOUND | IB_PATH_OUTBOUND implicitly means
IB_PATH_INBOUND_REV | IB_PATH_OUTBOUND

The implicit behavior seems way too subtle to me; making it explicit
clears up which is which.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH v2] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                                 ` <20091022181101.GY14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-22 18:28                                                   ` Sean Hefty
       [not found]                                                     ` <67280F81CB6F417DA6EEE22448ED5500-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-22 18:28 UTC (permalink / raw)
  To: 'Jason Gunthorpe'; +Cc: linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

>> if (number of paths > 1)
>> 	return -ENOSYS;
>
>This is really no different than what you had at first.

The main difference is returning ENOSYS to indicate that only one path is
acceptable.

>My idea was to have the kernel search the array for the entries it
>needs/supports. First one found wins. This provides future
>API compatability.
>
>Some future userspace, to support some future kernel APM, would pass in 2
>entries:
> IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_OUTBOUND | IB_PATH_INBOUND
> IB_PATH_GMP | IB_PATH_ALTERNATE | IB_PATH_OUTBOUND | IB_PATH_INBOUND
>
>This shouldn't break old kernels.

I don't like the idea of the kernel silently ignoring the alternate path.
Returning an error seems like a better idea.

>Again, some even future kernel would support 6 entries, future user
>space would send this:
> IB_PATH_PRIMARY | IB_PATH_INBOUND
> IB_PATH_PRIMARY | IB_PATH_OUTBOUND
> IB_PATH_GMP | IB_PATH_PRIMARY | IB_PATH_OUTBOUND | IB_PATH_INBOUND
> IB_PATH_ALTERNATE | IB_PATH_INBOUND
> IB_PATH_ALTERNATE | IB_PATH_OUTBOUND
> IB_PATH_GMP | IB_PATH_ALTERNATE | IB_PATH_OUTBOUND | IB_PATH_INBOUND
>
>This shouldn't break old kernels either.

To me, this sets contradictory paths.  The pairings <PRI | IN>, <PRI | OUT>,
<ALT | IN>, <ALT | OUT> should appear exactly once in a set of paths.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v2] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                                     ` <67280F81CB6F417DA6EEE22448ED5500-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-22 18:36                                                       ` Jason Gunthorpe
  0 siblings, 0 replies; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-22 18:36 UTC (permalink / raw)
  To: Sean Hefty; +Cc: linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

On Thu, Oct 22, 2009 at 11:28:11AM -0700, Sean Hefty wrote:

> I don't like the idea of the kernel silently ignoring the alternate path.
> Returning an error seems like a better idea.

Then provide a way for userspace to know WTF to do. Without a
negotiation process this is now an 'impossible to use right' API.

There are other kernel protocols that work like this, some netlink
stuff works like this for instance. The notion of providing multiple
options and having the kernel pick the ones it supports is a standard
technique for forward API compatibility.

How about a compromise. Add a IB_PATH_MUST or something, if the kernel
skips over an entry marked like that due to no support then it returns
-ENOSYS. Userspace that really cares can use that flag. I expect most
cases won't care..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                             ` <4AE07D41.7040300-smomgflXvOZWk0Htik3J/w@public.gmane.org>
  2009-10-22 16:39                                               ` Jason Gunthorpe
@ 2009-10-22 19:14                                               ` Sean Hefty
       [not found]                                                 ` <9574E625AB3C48E6A7DF1A2760882363-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  1 sibling, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-22 19:14 UTC (permalink / raw)
  To: 'Or Gerlitz'; +Cc: Jason Gunthorpe, linux-rdma, Roland Dreier

>I really tried to follow the thread between you and Jason with quite
>little success, and I am going to give it more tries... in parallel,
>could you help me understand what is the --drive/reasoning-- from your
>perspective to add AF_IB / PS_IB here?

These are the things done today in the kernel wrt IB:

* Map a local or remote IP address to a GID.
* If a local address is not given, provide a usable address
  based on the destination address.
* Acquire a path between the source and destination.
* Format the first 36 bytes of private data in the CM REQ.

Any or all of these could be done in user space instead.  Adding AF_IB to the
kernel can provide a clean way of enabling this.  It can also allow full support
of IB CM functionality through the RDMA CM interfaces.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                     ` <B7E97540810E4A2785FF1FC8CB96F453-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-22 16:54                                       ` Jason Gunthorpe
@ 2009-10-22 20:26                                       ` Sean Hefty
       [not found]                                         ` <DC0770A17FDC4DACAC0251A3362CE87A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  1 sibling, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-22 20:26 UTC (permalink / raw)
  To: 'Jason Gunthorpe', linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

Export rdma_set_ib_paths to user space to allow applications to
manually set the IB path used for connections.  This allows
alternative ways for a user space application or library to obtain
path record information, including retrieving path information
from cached data, avoiding direct interaction with the IB SA.
The IB SA is a single, centralized entity that can limit scaling
on large clusters running MPI applications.

Future changes to the rdma cm can expand on this framework to
support the full range of features allowed by the IB CM, such as
separate forward and reverse paths and APM.

Signed-off-by: Sean Hefty <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
---
Changes from v1:
Use MAD attribute structure format for path record data.
Add flags to indicate how a path should be used.  This allows separate
forward and reverse paths, and could support APM.

Changes from v2:
Handle an array of paths being set.
Add inbound_reverse and bidirectional flags to describe paths.

I think Jason's proposal works fine here.  The user can determine if an
alternate path was accepted by looking at the paths returned from query
route.  This is needed anyway, since the remote side can reject the
alternate path, but still accept the connection.  And marking more than
one path as <PRI, IN> or <PRI, OUT> indicates that the kernel
can select any of the marked paths.

Compile tested only

 drivers/infiniband/core/sa_query.c |    6 ++++
 drivers/infiniband/core/ucma.c     |   49 ++++++++++++++++++++++++++++++++++++
 include/rdma/ib_sa.h               |    6 ++++
 include/rdma/ib_user_sa.h          |   16 ++++++++++++
 include/rdma/rdma_user_cm.h        |    7 ++++-
 5 files changed, 82 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 1865049..2e73dcc 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -604,6 +604,12 @@ retry:
 	return ret ? ret : id;
 }
 
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec)
+{
+	ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
 static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
 				    int status,
 				    struct ib_sa_mad *mad)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 4346a24..23d9939 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -42,6 +42,7 @@
 #include <rdma/rdma_user_cm.h>
 #include <rdma/ib_marshall.h>
 #include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
@@ -811,6 +812,51 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
 	return ret;
 }
 
+static int ucma_set_ib_path(struct ucma_context *ctx,
+			    struct ib_path_rec_data *path_data, size_t optlen)
+{
+	struct ib_sa_path_rec sa_path;
+	struct rdma_cm_event event;
+	int ret;
+
+	if (optlen % sizeof(*path_data))
+		return -EINVAL;
+
+	for (; optlen; optlen -= sizeof(*path_data), path_data++) {
+		if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
+					 IB_PATH_BIDIRECTIONAL))
+			break;
+	}
+
+	if (!optlen)
+		return -EINVAL;
+
+	ib_sa_unpack_path(path_data->path_rec, &sa_path);
+	ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
+	if (ret)
+		return ret;
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	return ucma_event_handler(ctx->cm_id, &event);
+}
+
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	int ret;
+
+	switch (optname) {
+	case RDMA_OPTION_IB_PATH:
+		ret = ucma_set_ib_path(ctx, optval, optlen);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
 static int ucma_set_option_level(struct ucma_context *ctx, int level,
 				 int optname, void *optval, size_t optlen)
 {
@@ -820,6 +866,9 @@ static int ucma_set_option_level(struct ucma_context *ctx, int level,
 	case RDMA_OPTION_ID:
 		ret = ucma_set_option_id(ctx, optname, optval, optlen);
 		break;
+	case RDMA_OPTION_IB:
+		ret = ucma_set_option_ib(ctx, optname, optval, optlen);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 3841c1a..1082afa 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -379,4 +379,10 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
 			 struct ib_sa_path_rec *rec,
 			 struct ib_ah_attr *ah_attr);
 
+/**
+ * ib_sa_unpack_path - Convert a path record from MAD format to struct
+ * ib_sa_path_rec.
+ */
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec);
+
 #endif /* IB_SA_H */
diff --git a/include/rdma/ib_user_sa.h b/include/rdma/ib_user_sa.h
index 6591201..cfc7c9b 100644
--- a/include/rdma/ib_user_sa.h
+++ b/include/rdma/ib_user_sa.h
@@ -35,6 +35,22 @@
 
 #include <linux/types.h>
 
+enum {
+	IB_PATH_GMP		= 1,
+	IB_PATH_PRIMARY		= (1<<1),
+	IB_PATH_ALTERNATE	= (1<<2),
+	IB_PATH_OUTBOUND	= (1<<3),
+	IB_PATH_INBOUND		= (1<<4),
+	IB_PATH_INBOUND_REVERSE = (1<<5),
+	IB_PATH_BIDIRECTIONAL	= IB_PATH_OUTBOUND | IB_PATH_INBOUND_REVERSE
+};
+
+struct ib_path_rec_data {
+	__u32	flags;
+	__u32	reserved;
+	__u32	path_rec[16];
+};
+
 struct ib_user_path_rec {
 	__u8	dgid[16];
 	__u8	sgid[16];
diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h
index c557054..d7829f4 100644
--- a/include/rdma/rdma_user_cm.h
+++ b/include/rdma/rdma_user_cm.h
@@ -215,12 +215,15 @@ struct rdma_ucm_event_resp {
 
 /* Option levels */
 enum {
-	RDMA_OPTION_ID		= 0
+	RDMA_OPTION_ID		= 0,
+	RDMA_OPTION_IB		= 1
 };
 
 /* Option details */
 enum {
-	RDMA_OPTION_ID_TOS	= 0
+	RDMA_OPTION_ID_TOS	= 0,
+
+	RDMA_OPTION_IB_PATH	= 1
 };
 
 struct rdma_ucm_set_option {



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                                 ` <9574E625AB3C48E6A7DF1A2760882363-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-25 11:25                                                   ` Or Gerlitz
       [not found]                                                     ` <4AE435A1.6040309-smomgflXvOZWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Or Gerlitz @ 2009-10-25 11:25 UTC (permalink / raw)
  To: Sean Hefty; +Cc: Jason Gunthorpe, linux-rdma, Roland Dreier

Sean Hefty wrote:
> These are the things done today in the kernel wrt IB:
> * Map a local or remote IP address to a GID
> * If a local address is not given, provide a usable address based on the destination address
> * Acquire a path between the source and destination
> * Format the first 36 bytes of private data in the CM REQ
> Any or all of these could be done in user space instead.  Adding AF_IB to the kernel can provide a clean way of enabling this.  It can also allow full support of IB CM functionality through the RDMA CM interfaces
Sean,

First, on top of what you have mentioned above, the kernel also 
generates the SID to connect to / listen on, maintains a "binding" 
(mapping) between an rdma-cm id to a netdevice which today is used for 
generating address change events, and maybe some more tasks which
neither of us brought up. From what you write here I understand that the
reasoning is something like:

1. we can do all this in user space
2. for that end AF_INET/PS_TCP flow has to be converted to AF_IB/PS_IB 
behind the cover

well, you didn't address some of my comments (not the ice-cream 
ones...), which come to say that this wouldn't be inter-operable if for 
one side you convert INET/TCP to IB/IB and for the other side you don't 
(e.g userA/userB user/kernel kernel/user etc schemes). Also the 
functionality added under the bonding scheme is lost, etc.

I am asking you to have INET/TCP apps enjoy both ACM's DGID and route 
resolution without being converted to IB/IB, simple as that. If needed 
I'd be happy to assist in making this flow happen.

The rdma-cm was born first and foremost to serve as a glue between the IP
and RDMA worlds, and I just ask you, as the maintainer, to keep this 
well-going-glue happening also under ACM.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                                 ` <20091022163904.GC26003-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-25 11:32                                                   ` Or Gerlitz
       [not found]                                                     ` <4AE4374B.6020104-smomgflXvOZWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Or Gerlitz @ 2009-10-25 11:32 UTC (permalink / raw)
  To: Jason Gunthorpe; +Cc: Sean Hefty, linux-rdma, Roland Dreier

Jason Gunthorpe wrote:
> So why not have a more general, flexible approach? Isolating ACM from librdmacm by using AF_IB is a good idea, it keeps them separate and lets ACM and future go wherever. I hope Sean can make it work with the rdma_getaddrinfo idea, that would completely separate ACM and librdmacm
Generally speaking, AF_IB/PS_IB sounds okay to me, even though I am not 
clear what applications are going to use it, maybe some examples please?
> Attempting to bake it into AF_INET means that librdmacm, possibly the kernel and maybe even the apps need to be contaminated with ACM specific code, and that just doesn't seem desirable to me. What happens when someone invents BCM or CCM? More mess
I don't agree, the only place where librdmacm goes to ACM is to resolve 
DGID and a route. This can be done with rdma_getdgidinfo & 
rdma_getrouteinfo if you like (maybe you do the implementation?), or 
with ACM (later BCM, CCM) plugin used by librdmacm or by calls from 
librdmacm to ACM. But in any case, neither the kernel code nor the app will
be contaminated.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                                     ` <4AE435A1.6040309-smomgflXvOZWk0Htik3J/w@public.gmane.org>
@ 2009-10-25 18:32                                                       ` Jason Gunthorpe
  0 siblings, 0 replies; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-25 18:32 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: Sean Hefty, linux-rdma

On Sun, Oct 25, 2009 at 01:25:21PM +0200, Or Gerlitz wrote:

> well, you didn't address some of my comments (not the ice-cream 
> ones...), which come to say that this wouldn't be inter-operable if for 
> one side you convert INET/TCP to IB/IB and for the other side you don't 
> (e.g userA/userB user/kernel kernel/user etc schemes). 

I don't think there will be any compatibility issues. Most IP CM
active side requirements are trivially met through user space
generation of the private data.

> Also the functionality added under the bonding scheme is lost, etc.

Considering that ACM gets rid of the ND process I don't see how full
bonding functionality could have ever been maintained. That said, I
think within what Sean has designed there could be something analogous
to IP bonding within ACM - features like this are why it is important
the name resolution have control over source device selection, not
just outgoing route.

> I am asking you to have INET/TCP apps enjoy both ACM's DGID and route 
> resolution without being converted to IB/IB, simple as that. If needed 
> I'd be happy to assist in making this flow happen.

If they are 100% interoperable, and the conversion is transparent to
apps that use this new rdma_getaddrinfo thingy, why do you care at all?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping
       [not found]                                                     ` <4AE4374B.6020104-smomgflXvOZWk0Htik3J/w@public.gmane.org>
@ 2009-10-25 18:50                                                       ` Jason Gunthorpe
  0 siblings, 0 replies; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-25 18:50 UTC (permalink / raw)
  To: Or Gerlitz; +Cc: Sean Hefty, linux-rdma

On Sun, Oct 25, 2009 at 01:32:27PM +0200, Or Gerlitz wrote:
> Jason Gunthorpe wrote:
> >So why not have a more general, flexible approach? Isolating ACM from 
> >librdmacm by using AF_IB is a good idea, it keeps them separate and lets 
> >ACM and future go wherever. I hope Sean can make it work with the 
> >rdma_getaddrinfo idea, that would completely separate ACM and librdmacm

> Generally speaking, AF_IB/PS_IB sounds okay to me, even though I am not 
> clear what applications are going to use it, maybe some examples please?

Again, my hope with the rdma_getaddrinfo idea is that apps using that
API get *all* the features with no additional code changes. Things
like GID addressing, ACM, etc all just exist under the
rdma_getaddrinfo API, so the apps don't care.

So, any time a *user* wants to use native IB features - APM, user path
selection, GID addressing, IB routing, etc - or users that want to
work with native IB labels for partitioning, QOS and IB routing.

> I don't agree, the only place where librdmacm goes to ACM is to resolve 
> DGID and a route. This can be done with rdma_getdgidinfo & 
> rdma_getrouteinfo if you like (maybe you do the implementation?), or

I would not be interested in such a poor API as rdma_getdgidinfo and
rdma_getrouteinfo. Such a design ignores the hard-won lessons of
IP.

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                         ` <DC0770A17FDC4DACAC0251A3362CE87A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-28 18:11                                           ` Roland Dreier
       [not found]                                             ` <adaiqdzs81h.fsf-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org>
  2009-10-30  1:49                                           ` [PATCH v4] " Sean Hefty
  1 sibling, 1 reply; 76+ messages in thread
From: Roland Dreier @ 2009-10-28 18:11 UTC (permalink / raw)
  To: Sean Hefty
  Cc: 'Jason Gunthorpe', linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

This looks fine to me... the only thing I could wonder about is:

 > +	RDMA_OPTION_ID_TOS	= 0,
 > +
 > +	RDMA_OPTION_IB_PATH	= 1

why a blank line here? ;)

Jason and Or, does this seem ready to queue for 2.6.33?

Thanks,
  Roland
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                             ` <adaiqdzs81h.fsf-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org>
@ 2009-10-28 19:00                                               ` Sean Hefty
       [not found]                                                 ` <B82A674A574A4A239FBF3FFCBF5D6BB6-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-28 19:14                                               ` Jason Gunthorpe
  1 sibling, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-28 19:00 UTC (permalink / raw)
  To: 'Roland Dreier'
  Cc: 'Jason Gunthorpe', linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

>This looks fine to me... the only thing I could wonder about is:
>
> > +	RDMA_OPTION_ID_TOS	= 0,
> > +
> > +	RDMA_OPTION_IB_PATH	= 1
>
>why a blank line here? ;)

I'll remove this.  This is left over from a place holder for setting the
destination address, which is now going in a completely different direction.

>Jason and Or, does this seem ready to queue for 2.6.33?

I need a few more days to finish updates to the user space code before I'm back
to a point of being able to test this.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                             ` <adaiqdzs81h.fsf-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org>
  2009-10-28 19:00                                               ` Sean Hefty
@ 2009-10-28 19:14                                               ` Jason Gunthorpe
       [not found]                                                 ` <20091028191454.GL14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  1 sibling, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-28 19:14 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Sean Hefty, linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

On Wed, Oct 28, 2009 at 11:11:22AM -0700, Roland Dreier wrote:
> This looks fine to me... the only thing I could wonder about is:
> 
>  > +	RDMA_OPTION_ID_TOS	= 0,
>  > +
>  > +	RDMA_OPTION_IB_PATH	= 1
> 
> why a blank line here? ;)
 
> Jason and Or, does this seem ready to queue for 2.6.33?

I like the API as far as passing IB PRs between kernel and userspace,
I'm glad we came up with this. (Sean: I was going to suggest that the
second EINVAL should maybe be ENOSUPPORTED or something - so userspace
can tell the kernel supports the API but did not accept any of the flags
combinations)

But, I still think this API should return EINVAL if the cm_id is in
AF_INET/AF_INET6 mode. That is to say, this API only works with the
AF_IB idea we have been discussing.

I suggest this because using this API really does override the
capabilities of the AF_INET/6 in unexpected ways, as the discussion
drifted through it seemed like at least bonding, routing
and ND operations would/could be overridden.

If so then I'd say it should be part of an AF_IB patch.

Sean, what are your thoughts on applying it to AF_INET/6?

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                                 ` <20091028191454.GL14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-28 19:37                                                   ` Sean Hefty
       [not found]                                                     ` <5082D185D95A4389BC9EEA666CAEBA66-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-28 19:37 UTC (permalink / raw)
  To: 'Jason Gunthorpe', Roland Dreier
  Cc: linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

>But, I still think this API should return EINVAL if the cm_id is in
>AF_INET/AF_INET6 mode. That is to say, this API only works with the
>AF_IB idea we have been discussing.
>
>I suggest this because using this API really does override the
>capabilities of the AF_INET/6 in unexpected ways, as the discussion
>drifted through it seemed like at least bonding, routing
>and ND operations would/could be overridden.
>
>If so then I'd say it should be part of an AF_IB patch.
>
>Sean, what are your thoughts on applying it to AF_INET/6?

Even without any other kernel changes, this patch enables us to solve the
biggest scaling problem that we've measured, so I want to allow it regardless of
what the original addressing was.  Whether a path record comes from the SA, a
local cache, some wonky multicast protocol, or is made up is really independent
from how the GIDs were discovered.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                                     ` <5082D185D95A4389BC9EEA666CAEBA66-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-28 20:25                                                       ` Jason Gunthorpe
       [not found]                                                         ` <20091028202545.GM14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-28 20:25 UTC (permalink / raw)
  To: Sean Hefty; +Cc: Roland Dreier, linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

On Wed, Oct 28, 2009 at 12:37:39PM -0700, Sean Hefty wrote:
> >But, I still think this API should return EINVAL if the cm_id is in
> >AF_INET/AF_INET6 mode. That is to say, this API only works with the
> >AF_IB idea we have been discussing.
> >
> >I suggest this because using this API really does override the
> >capabilities of the AF_INET/6 in unexpected ways, as the discussion
> >drifted through it seemed like at least bonding, routing
> >and ND operations would/could be overridden.
> >
> >If so then I'd say it should be part of an AF_IB patch.
> >
> >Sean, what are your thoughts on applying it to AF_INET/6?
> 
> Even without any other kernel changes, this patch enables us to solve the
> biggest scaling problem that we've measured, so I want to allow it regardless of
> what the original addressing was.  Whether a path record comes from the SA, a
> local cache, some wonky multicast protocol, or is made up is really independent
> from how the GIDs were discovered.

OK, that makes sense.

I have no problem with this if there is a way for the kernel to choose
the DGID and pass it to user space to do path resolution to feed back
PRs through the new API - at least it is then possible to use the new
API in a way that is consistent with the IP stack.

ie this API when applied to AF_INET does not short-circuit ND. You
have to use AF_IB to remove the ND.

Does that seem consistent with what you are thinking? Does a DGID
returning API already exist? Seems simple to add if not..

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* RE: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                                         ` <20091028202545.GM14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
@ 2009-10-28 21:41                                                           ` Sean Hefty
       [not found]                                                             ` <24B14DCC9C3645FB92194300C8F5D441-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-28 21:41 UTC (permalink / raw)
  To: 'Jason Gunthorpe'
  Cc: Roland Dreier, linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

>Does a DGID returning API already exist?

yes - query_route returns the following information: SGID, DGID, pkey, source
address, destination address, and path records (max of 2).  Not all of the
information is valid, depending on the state of the rdma cm id.  The librdmacm
already invokes this after rdma_resolve_addr completes.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                                             ` <24B14DCC9C3645FB92194300C8F5D441-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-10-28 22:24                                                               ` Jason Gunthorpe
  0 siblings, 0 replies; 76+ messages in thread
From: Jason Gunthorpe @ 2009-10-28 22:24 UTC (permalink / raw)
  To: Sean Hefty; +Cc: Roland Dreier, linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

On Wed, Oct 28, 2009 at 02:41:15PM -0700, Sean Hefty wrote:
> >Does a DGID returning API already exist?
> 
> yes - query_route returns the following information: SGID, DGID, pkey, source
> address, destination address, and path records (max of 2).  Not all of the
> information is valid, depending on the state of the rdma cm id.  The librdmacm
> already invokes this after rdma_resolve_addr completes.

Great, I didn't realize that was there. No further comments from me then

Reviewed-By: Jason Gunthorpe <jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>

Jason
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* [PATCH v4] rdma/cm: support option to allow manually setting IB path
       [not found]                                         ` <DC0770A17FDC4DACAC0251A3362CE87A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  2009-10-28 18:11                                           ` Roland Dreier
@ 2009-10-30  1:49                                           ` Sean Hefty
       [not found]                                             ` <1B115D7248A5404781F001F72A7C591A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
  1 sibling, 1 reply; 76+ messages in thread
From: Sean Hefty @ 2009-10-30  1:49 UTC (permalink / raw)
  To: Hefty, Sean, 'Jason Gunthorpe',
	linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

Export rdma_set_ib_paths to user space to allow applications to
manually set the IB path used for connections.  This allows
alternative ways for a user space application or library to obtain
path record information, including retrieving path information
from cached data, avoiding direct interaction with the IB SA.
The IB SA is a single, centralized entity that can limit scaling
on large clusters running MPI applications.

Future changes to the rdma cm can expand on this framework to
support the full range of features allowed by the IB CM, such as
separate forward and reverse paths and APM.

Signed-off-by: Sean Hefty <sean.hefty-ral2JQCrhuEAvxtiuMwx3w@public.gmane.org>
Reviewed-By: Jason Gunthorpe <jgunthorpe-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
---
changes from v3: removed extra blank line

I was able to test successfully on 2.6.31.  Please merge for 2.6.33.


 drivers/infiniband/core/sa_query.c |    6 ++++
 drivers/infiniband/core/ucma.c     |   49 ++++++++++++++++++++++++++++++++++++
 include/rdma/ib_sa.h               |    6 ++++
 include/rdma/ib_user_sa.h          |   16 ++++++++++++
 include/rdma/rdma_user_cm.h        |    6 +++-
 5 files changed, 81 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
index 1865049..2e73dcc 100644
--- a/drivers/infiniband/core/sa_query.c
+++ b/drivers/infiniband/core/sa_query.c
@@ -604,6 +604,12 @@ retry:
 	return ret ? ret : id;
 }
 
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec)
+{
+	ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
 static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
 				    int status,
 				    struct ib_sa_mad *mad)
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 4346a24..23d9939 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -42,6 +42,7 @@
 #include <rdma/rdma_user_cm.h>
 #include <rdma/ib_marshall.h>
 #include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
@@ -811,6 +812,51 @@ static int ucma_set_option_id(struct ucma_context *ctx, int optname,
 	return ret;
 }
 
+static int ucma_set_ib_path(struct ucma_context *ctx,
+			    struct ib_path_rec_data *path_data, size_t optlen)
+{
+	struct ib_sa_path_rec sa_path;
+	struct rdma_cm_event event;
+	int ret;
+
+	if (optlen % sizeof(*path_data))
+		return -EINVAL;
+
+	for (; optlen; optlen -= sizeof(*path_data), path_data++) {
+		if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
+					 IB_PATH_BIDIRECTIONAL))
+			break;
+	}
+
+	if (!optlen)
+		return -EINVAL;
+
+	ib_sa_unpack_path(path_data->path_rec, &sa_path);
+	ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
+	if (ret)
+		return ret;
+
+	memset(&event, 0, sizeof event);
+	event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+	return ucma_event_handler(ctx->cm_id, &event);
+}
+
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+			      void *optval, size_t optlen)
+{
+	int ret;
+
+	switch (optname) {
+	case RDMA_OPTION_IB_PATH:
+		ret = ucma_set_ib_path(ctx, optval, optlen);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	return ret;
+}
+
 static int ucma_set_option_level(struct ucma_context *ctx, int level,
 				 int optname, void *optval, size_t optlen)
 {
@@ -820,6 +866,9 @@ static int ucma_set_option_level(struct ucma_context *ctx, int level,
 	case RDMA_OPTION_ID:
 		ret = ucma_set_option_id(ctx, optname, optval, optlen);
 		break;
+	case RDMA_OPTION_IB:
+		ret = ucma_set_option_ib(ctx, optname, optval, optlen);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h
index 3841c1a..1082afa 100644
--- a/include/rdma/ib_sa.h
+++ b/include/rdma/ib_sa.h
@@ -379,4 +379,10 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
 			 struct ib_sa_path_rec *rec,
 			 struct ib_ah_attr *ah_attr);
 
+/**
+ * ib_sa_unpack_path - Convert a path record from MAD format to struct
+ * ib_sa_path_rec.
+ */
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec);
+
 #endif /* IB_SA_H */
diff --git a/include/rdma/ib_user_sa.h b/include/rdma/ib_user_sa.h
index 6591201..cfc7c9b 100644
--- a/include/rdma/ib_user_sa.h
+++ b/include/rdma/ib_user_sa.h
@@ -35,6 +35,22 @@
 
 #include <linux/types.h>
 
+enum {
+	IB_PATH_GMP		= 1,
+	IB_PATH_PRIMARY		= (1<<1),
+	IB_PATH_ALTERNATE	= (1<<2),
+	IB_PATH_OUTBOUND	= (1<<3),
+	IB_PATH_INBOUND		= (1<<4),
+	IB_PATH_INBOUND_REVERSE = (1<<5),
+	IB_PATH_BIDIRECTIONAL	= IB_PATH_OUTBOUND | IB_PATH_INBOUND_REVERSE
+};
+
+struct ib_path_rec_data {
+	__u32	flags;
+	__u32	reserved;
+	__u32	path_rec[16];
+};
+
 struct ib_user_path_rec {
 	__u8	dgid[16];
 	__u8	sgid[16];
diff --git a/include/rdma/rdma_user_cm.h b/include/rdma/rdma_user_cm.h
index c557054..1d16502 100644
--- a/include/rdma/rdma_user_cm.h
+++ b/include/rdma/rdma_user_cm.h
@@ -215,12 +215,14 @@ struct rdma_ucm_event_resp {
 
 /* Option levels */
 enum {
-	RDMA_OPTION_ID		= 0
+	RDMA_OPTION_ID		= 0,
+	RDMA_OPTION_IB		= 1
 };
 
 /* Option details */
 enum {
-	RDMA_OPTION_ID_TOS	= 0
+	RDMA_OPTION_ID_TOS	= 0,
+	RDMA_OPTION_IB_PATH	= 1
 };
 
 struct rdma_ucm_set_option {



--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 76+ messages in thread

* Re: [PATCH v3] [RFC] rdma/cm: support option to allow manually setting IB path
       [not found]                                                 ` <B82A674A574A4A239FBF3FFCBF5D6BB6-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-11-01  6:50                                                   ` Or Gerlitz
  0 siblings, 0 replies; 76+ messages in thread
From: Or Gerlitz @ 2009-11-01  6:50 UTC (permalink / raw)
  To: Sean Hefty
  Cc: 'Roland Dreier', linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

Sean Hefty wrote:
>> Jason and Or, does this seem ready to queue for 2.6.33?
Roland, I missed your email last week; anyway, as I wrote to Sean
earlier, I'm totally fine with this patch allowing user space to set
a path record for the kernel.

Or.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* Re: [PATCH v4] rdma/cm: support option to allow manually setting IB path
       [not found]                                             ` <1B115D7248A5404781F001F72A7C591A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
@ 2009-11-01  7:10                                               ` Or Gerlitz
       [not found]                                                 ` <4AED347A.1060301-smomgflXvOZWk0Htik3J/w@public.gmane.org>
  0 siblings, 1 reply; 76+ messages in thread
From: Or Gerlitz @ 2009-11-01  7:10 UTC (permalink / raw)
  To: Sean Hefty
  Cc: 'Jason Gunthorpe', linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

Sean Hefty wrote:
> Future changes to the rdma cm can expand on this framework to support the full range of features allowed by the IB CM, such as separate forward and reverse paths and APM
Sean,

Before enhancing the rdma-cm to support the full feature set of the IB
CM, something which I personally don't see the actual need for (but I
will be happy to be educated about which applications will or can migrate
to rdma-cm once this is implemented), how about trying to allow for a
reduced QoS scheme also when the entity that resolved this path didn't
consult the SA?

IB QoS is based on the query providing the <SGID, DGID, PKEY, SID, TOS>
tuple and the SA returning a <SLID, DLID, SL, MTU, ....> QoS tuple. Now
I'd like to see how we can let the application / querying middleware
take advantage of knowing which partition it runs on and use the SL
associated with the IPv4 (e.g. AF_INET rdma-cm IDs) IPoIB broadcast
group. This way, one can still program a QoS scheme at the SA which is
based on partitions.

Looking at mckey, the user space code (e.g. ACM) could just do rdma_bind
to an IP address of an IPoIB NIC that uses this partition and then
rdma_join to an unmapped multicast address which corresponds to the
broadcast group, take the SL and leave the group. Does that make sense?

Or.

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

* QoS in local SA entity: was rdma/cm: support option to allow manually setting IB path
       [not found]                                                 ` <4AED347A.1060301-smomgflXvOZWk0Htik3J/w@public.gmane.org>
@ 2009-11-02 18:59                                                   ` Sean Hefty
  0 siblings, 0 replies; 76+ messages in thread
From: Sean Hefty @ 2009-11-02 18:59 UTC (permalink / raw)
  To: 'Or Gerlitz'
  Cc: 'Jason Gunthorpe', linux-rdma, rolandd-FYB4Gu1CFyUAvxtiuMwx3w

>Before enhancing the rdma-cm to support the full feature set of the IB
>CM, something which I personally don't see the actual need for (but I
>will be happy to get educated what applications will or can migrate to
>rdma-cm once this is implemented), how about trying to allow for reduced
>QoS scheme also when the entity that resolved this patch didn't
>consulted with the SA?

I think this really needs to be discussed wrt the implementation of the entity
providing the path records.

>IB QoS is based on the query providing the <SGID, DGID, PKEY, SID, TOS>
>tuple and the SA returning a <SLID, DLID, SL, MTU, ....> QoS tuple. Now
>I'd like to see how can we let the application / querying middleware to
>take advantage of the knowledge on what partition it runs and use the SL
>associated with the IPv4 (e.g AF_INET rdma-cm ID's) IPoIB broadcast
>group. This way, one can still program a QoS scheme at the SA which is
>based on partitions.

I think what's needed is a way for the SA to distribute QoS information to the
end nodes, so that the decisions can be made locally.  If someone wants some
sort of dynamic QoS management and is happy using a small cluster, then they can
disable any local SA entities and contact the SA directly.

In the case of ACM, the pkey is embedded in the MGID.  'Something' could tell
the SA to create ACM multicast groups using a specific SL for a given MGID or
pkey in the join request.  That SL would be distributed to the end nodes when
they joined their groups.

>Looking on mckey, the user space code (e.g ACM), could just do rdma_bind
>to an IP address of an IPoIB NIC that uses this partition and then
>rdma_join to an unmapped multicast address which correspond to the
>broadcast group, take the SL and leave the group, makes sense?

The entity that provides the path records cannot depend on calling into the
librdmacm.  The dependency needs to go the other way.

- Sean

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 76+ messages in thread

end of thread, other threads:[~2009-11-02 18:59 UTC | newest]

Thread overview: 76+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-10-05 17:43 [PATCH 1/2] rdma/cm: support option to allow manually setting IB path Sean Hefty
     [not found] ` <F0EFC2D8E6A340D48497497670C5969C-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-05 17:45   ` [PATCH 2/2] rdma/cm: allow user to specify IP to DGID mapping Sean Hefty
     [not found]     ` <F451C333D8CB45E4B4642C6BD1EDD3C3-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-06  8:00       ` Or Gerlitz
     [not found]         ` <4ACAF913.3050909-smomgflXvOZWk0Htik3J/w@public.gmane.org>
2009-10-06 19:05           ` Sean Hefty
     [not found]             ` <AA7E7C8FC2A04B9688CD69CEB7355DF8-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-06 20:07               ` Jason Gunthorpe
     [not found]                 ` <20091006200739.GP5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-06 22:53                   ` Sean Hefty
     [not found]                     ` <B266C10D3C26431E8FF5012420132452-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-06 23:17                       ` Jason Gunthorpe
     [not found]                         ` <20091006231720.GR5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-07  1:20                           ` Sean Hefty
     [not found]                             ` <3F7D26D4BA1C46F18F2F87BDD7EB7F36-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-07  5:26                               ` Jason Gunthorpe
     [not found]                                 ` <20091007052639.GB18578-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-07 19:16                                   ` Sean Hefty
     [not found]                                     ` <20ADF14BE2B24B459DC3921F69449E61-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-07 20:32                                       ` Jason Gunthorpe
     [not found]                                         ` <20091007203257.GT5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-07 21:20                                           ` Hefty, Sean
     [not found]                                             ` <CF9C39F99A89134C9CF9C4CCB68B8DDF12C180FFD1-osO9UTpF0USkrb+BlOpmy7fspsVTdybXVpNB7YpNyf8@public.gmane.org>
2009-10-07 21:45                                               ` Jason Gunthorpe
2009-10-07 22:23               ` Or Gerlitz
     [not found]                 ` <15ddcffd0910071523w4f229b14j905ad170ceb8c21f-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-10-07 23:42                   ` Sean Hefty
     [not found]                     ` <9F4DE6A2B4F644698E94F00C4FEEF30A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-08  0:38                       ` Sean Hefty
     [not found]                         ` <0A383504E0E54C949DEF84405E3AE92F-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-08  0:54                           ` Jason Gunthorpe
     [not found]                             ` <20091008005425.GW5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-08  6:46                               ` Sean Hefty
     [not found]                                 ` <3BA5B96263EC4ACA8FF3C4D8DCF47C69-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-08 17:21                                   ` Jason Gunthorpe
     [not found]                                     ` <20091008172120.GX5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-08 23:39                                       ` Or Gerlitz
2009-10-08 23:33                           ` Or Gerlitz
     [not found]                             ` <15ddcffd0910081633q20d98abfg41a9f4e781e486b1-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2009-10-09  0:24                               ` Sean Hefty
     [not found]                                 ` <859D79BFCA4741F393AABF76BBCA4F7B-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-20 10:22                                   ` Or Gerlitz
     [not found]                                     ` <4ADD8F5F.3010008-smomgflXvOZWk0Htik3J/w@public.gmane.org>
2009-10-20 18:08                                       ` Sean Hefty
     [not found]                                         ` <9F76F7CD7B9048E8821A1B05CC5FAFE8-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-20 18:31                                           ` Jason Gunthorpe
     [not found]                                             ` <20091020183132.GE14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-20 19:13                                               ` Sean Hefty
     [not found]                                                 ` <A47D2FC6B143436DB87704307B0E715D-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-20 19:18                                                   ` Jason Gunthorpe
     [not found]                                                     ` <20091020191821.GI14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-20 20:05                                                       ` Sean Hefty
     [not found]                                                         ` <8D09997BDBC5482C86EAD338F19C8030-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-20 20:29                                                           ` Jason Gunthorpe
     [not found]                                                             ` <20091020202902.GJ14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-20 20:48                                                               ` Sean Hefty
     [not found]                                                                 ` <B7BCBF813BF447B28330C2DB8F1437D6-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-20 21:30                                                                   ` Jason Gunthorpe
2009-10-22 15:41                                           ` Or Gerlitz
     [not found]                                             ` <4AE07D41.7040300-smomgflXvOZWk0Htik3J/w@public.gmane.org>
2009-10-22 16:39                                               ` Jason Gunthorpe
     [not found]                                                 ` <20091022163904.GC26003-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-25 11:32                                                   ` Or Gerlitz
     [not found]                                                     ` <4AE4374B.6020104-smomgflXvOZWk0Htik3J/w@public.gmane.org>
2009-10-25 18:50                                                       ` Jason Gunthorpe
2009-10-22 19:14                                               ` Sean Hefty
     [not found]                                                 ` <9574E625AB3C48E6A7DF1A2760882363-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-25 11:25                                                   ` Or Gerlitz
     [not found]                                                     ` <4AE435A1.6040309-smomgflXvOZWk0Htik3J/w@public.gmane.org>
2009-10-25 18:32                                                       ` Jason Gunthorpe
2009-10-09  0:33                               ` Jason Gunthorpe
2009-10-08 23:13                       ` Or Gerlitz
2009-10-05 17:56   ` [PATCH 1/2] rdma/cm: support option to allow manually setting IB path Jason Gunthorpe
     [not found]     ` <20091005175656.GK5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-05 18:08       ` Sean Hefty
     [not found]         ` <F7D418716F3A4A0DACE42CC449624298-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-05 18:15           ` Jason Gunthorpe
     [not found]             ` <20091005181525.GL5191-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-05 19:20               ` Sean Hefty
2009-10-06 15:01               ` Todd Rimmer
     [not found]                 ` <5AEC2602AE03EB46BFC16C6B9B200DA8168EFD82BA-e4KNYiSEog6Xx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2009-10-06 19:05                   ` Sean Hefty
     [not found]                     ` <D61F37041B6F49ACB0AC64FBF2DC4D00-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-07  5:30                       ` Jason Gunthorpe
2009-10-09 21:48   ` Sean Hefty
     [not found]     ` <A08104C1CF70400F8BEF492AD49C8491-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-13 13:06       ` Or Gerlitz
     [not found]         ` <4AD47B40.8070800-smomgflXvOZWk0Htik3J/w@public.gmane.org>
2009-10-20 10:23           ` Or Gerlitz
     [not found]             ` <4ADD8FAA.902-smomgflXvOZWk0Htik3J/w@public.gmane.org>
2009-10-20 15:52               ` Sean Hefty
2009-10-20 18:14       ` Jason Gunthorpe
     [not found]         ` <20091020181458.GD14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-20 18:34           ` Sean Hefty
     [not found]             ` <46770152ACA04B6C8AA9497C45AC8FD0-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-20 19:14               ` Jason Gunthorpe
     [not found]                 ` <20091020191404.GH14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-22  0:14                   ` Sean Hefty
     [not found]                     ` <9DFD8E65325F4EE990749EEBE4BC33CA-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-22  0:42                       ` Jason Gunthorpe
     [not found]                         ` <20091022004245.GV14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-22  1:07                           ` Sean Hefty
     [not found]                             ` <AE35305D45DB49F591A45DADD822209A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-22  1:35                               ` Jason Gunthorpe
     [not found]                                 ` <20091022013542.GX14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-22  8:10                                   ` [PATCH v2] [RFC] " Sean Hefty
     [not found]                                     ` <B7E97540810E4A2785FF1FC8CB96F453-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-22 16:54                                       ` Jason Gunthorpe
     [not found]                                         ` <20091022165414.GH26003-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-22 17:52                                           ` Sean Hefty
     [not found]                                             ` <1438C87E89284364A56E08A40DFE199E-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-22 18:11                                               ` Jason Gunthorpe
     [not found]                                                 ` <20091022181101.GY14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-22 18:28                                                   ` Sean Hefty
     [not found]                                                     ` <67280F81CB6F417DA6EEE22448ED5500-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-22 18:36                                                       ` Jason Gunthorpe
2009-10-22 20:26                                       ` [PATCH v3] " Sean Hefty
     [not found]                                         ` <DC0770A17FDC4DACAC0251A3362CE87A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-28 18:11                                           ` Roland Dreier
     [not found]                                             ` <adaiqdzs81h.fsf-FYB4Gu1CFyUAvxtiuMwx3w@public.gmane.org>
2009-10-28 19:00                                               ` Sean Hefty
     [not found]                                                 ` <B82A674A574A4A239FBF3FFCBF5D6BB6-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-11-01  6:50                                                   ` Or Gerlitz
2009-10-28 19:14                                               ` Jason Gunthorpe
     [not found]                                                 ` <20091028191454.GL14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-28 19:37                                                   ` Sean Hefty
     [not found]                                                     ` <5082D185D95A4389BC9EEA666CAEBA66-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-28 20:25                                                       ` Jason Gunthorpe
     [not found]                                                         ` <20091028202545.GM14520-ePGOBjL8dl3ta4EC/59zMFaTQe2KTcn/@public.gmane.org>
2009-10-28 21:41                                                           ` Sean Hefty
     [not found]                                                             ` <24B14DCC9C3645FB92194300C8F5D441-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-10-28 22:24                                                               ` Jason Gunthorpe
2009-10-30  1:49                                           ` [PATCH v4] " Sean Hefty
     [not found]                                             ` <1B115D7248A5404781F001F72A7C591A-Zpru7NauK7drdx17CPfAsdBPR1lH4CV8@public.gmane.org>
2009-11-01  7:10                                               ` Or Gerlitz
     [not found]                                                 ` <4AED347A.1060301-smomgflXvOZWk0Htik3J/w@public.gmane.org>
2009-11-02 18:59                                                   ` QoS in local SA entity: was " Sean Hefty

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.