From mboxrd@z Thu Jan 1 00:00:00 1970 From: Al Chu Subject: Re: [infiniband-diags] [UPDATED PATCH] [3/3] support --load-cache in iblinkinfo and ibqueryerrors Date: Sat, 16 Jan 2010 08:25:17 -0800 Message-ID: <1263659117.14626.12.camel@crazyclimber.llnl.gov> References: <1263579799.15172.168.camel@auk31.llnl.gov> <20100116142801.GS574@me> Reply-To: chu11-i2BcT+NCU+M@public.gmane.org Mime-Version: 1.0 Content-Type: text/plain Content-Transfer-Encoding: 7bit Return-path: In-Reply-To: <20100116142801.GS574@me> Sender: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org To: Sasha Khapyorsky Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org List-Id: linux-rdma@vger.kernel.org Hey Sasha, answers inlined below On Sat, 2010-01-16 at 16:28 +0200, Sasha Khapyorsky wrote: > On 10:23 Fri 15 Jan , Al Chu wrote: > > Hi Sasha, > > > > This adds the --load-cache options to iblinkinfo and ibqueryerrors. > > > > Al > > > > -- > > Albert Chu > > chu11-i2BcT+NCU+M@public.gmane.org > > Computer Scientist > > High Performance Systems Division > > Lawrence Livermore National Laboratory > > > From: Albert Chu > > Date: Thu, 10 Dec 2009 11:22:50 -0800 > > Subject: [PATCH] support --load-cache in iblinkinfo and ibqueryerrors > > > > > > Signed-off-by: Albert Chu > > --- > > infiniband-diags/man/iblinkinfo.8 | 11 ++++++- > > infiniband-diags/man/ibqueryerrors.8 | 10 ++++++- > > infiniband-diags/src/iblinkinfo.c | 52 +++++++++++++++++++++++++------- > > infiniband-diags/src/ibqueryerrors.c | 53 ++++++++++++++++++++++++++------- > > 4 files changed, 99 insertions(+), 27 deletions(-) > > > > diff --git a/infiniband-diags/man/iblinkinfo.8 b/infiniband-diags/man/iblinkinfo.8 > > index 0f53b00..f184edf 100644 > > --- a/infiniband-diags/man/iblinkinfo.8 > > +++ b/infiniband-diags/man/iblinkinfo.8 > > @@ -6,7 +6,7 @@ iblinkinfo \- report link info for all links in the fabric > > .SH SYNOPSIS > > .B iblinkinfo > > [-hcdl -C -P -v -S > > --D ] > > +-D \-\-load\-cache ] > > > > .SH DESCRIPTION > > .PP > > @@ -42,7 +42,14 @@ Print port capabilities (enabled and supported values) > > \fB\-P \fR use the specified ca_port for the search. > > .TP > > \fB\-R\fR (This option is obsolete and does nothing) > > - > > +.TP > > +\fB\-\-load\-cache\fR > > +Load and use the cached ibnetdiscover data stored in the specified > > +filename. May be useful for outputting and learning about other > > +fabrics or a previous state of a fabric. Cannot be used if user > > +specifies a directo route path. See > > +.B ibnetdiscover > > +for information on caching ibnetdiscover output. > > > > .SH AUTHOR > > .TP > > diff --git a/infiniband-diags/man/ibqueryerrors.8 b/infiniband-diags/man/ibqueryerrors.8 > > index 83a2b5a..56a0d67 100644 > > --- a/infiniband-diags/man/ibqueryerrors.8 > > +++ b/infiniband-diags/man/ibqueryerrors.8 > > @@ -6,7 +6,7 @@ ibqueryerrors \- query and report non-zero IB port counters > > .SH SYNOPSIS > > .B ibqueryerrors > > [-s -c -r -C -P -G > > --D -d -k -K] > > +-D -d -k -K \-\-load\-cache ] > > > > .SH DESCRIPTION > > .PP > > @@ -60,6 +60,14 @@ specified the data counters will be cleared without any printed output. > > .TP > > \fB\-\-details\fR include transmit discard details > > .TP > > +\fB\-\-load\-cache\fR > > +Load and use the cached ibnetdiscover data stored in the specified > > +filename. May be useful for outputting and learning about other > > +fabrics or a previous state of a fabric. Cannot be used if user > > +specifies a directo route path. See > > +.B ibnetdiscover > > +for information on caching ibnetdiscover output. > > +.TP > > \fB\-R\fR (This option is obsolete and does nothing) > > > > .SH COMMON OPTIONS > > diff --git a/infiniband-diags/src/iblinkinfo.c b/infiniband-diags/src/iblinkinfo.c > > index 21b31bb..10e3ad5 100644 > > --- a/infiniband-diags/src/iblinkinfo.c > > +++ b/infiniband-diags/src/iblinkinfo.c > > @@ -55,6 +55,7 @@ > > > > static char *node_name_map_file = NULL; > > static nn_map_t *node_name_map = NULL; > > +static char *load_cache_file = NULL; > > > > static uint64_t guid = 0; > > static char *guid_str = NULL; > > @@ -230,6 +231,9 @@ static int process_opt(void *context, int ch, char *optarg) > > case 1: > > node_name_map_file = strdup(optarg); > > break; > > + case 2: > > + load_cache_file = strdup(optarg); > > + break; > > case 'S': > > guid_str = optarg; > > guid = (uint64_t) strtoull(guid_str, 0, 0); > > @@ -291,6 +295,7 @@ int main(int argc, char **argv) > > "print additional switch settings (PktLifeTime, HoqLife, VLStallCount)"}, > > {"portguids", 'g', 0, NULL, > > "print port guids instead of node guids"}, > > + {"load-cache", 2, 1, "", "filename of ibnetdiscover cache to load"}, > > {"GNDN", 'R', 0, NULL, > > "(This option is obsolete and does nothing)"}, > > {0} > > @@ -317,6 +322,11 @@ int main(int argc, char **argv) > > mad_rpc_set_timeout(ibmad_port, ibd_timeout); > > > > node_name_map = open_node_name_map(node_name_map_file); > > + > > + if (dr_path && load_cache_file) { > > + fprintf(stderr, "Cannot specify cache and direct route path\n"); > > + exit(1); > > + } > > Why is this limitation needed really? I spoke to Ira about it awhile ago. I think what we decided was that while technically you can do a DR path, b/c you can load a cache from anywhere in the cluster, you won't know if the DR path is legal or correct at point A vs point B. In contrast, the node_guid input is valid anywhere you are on the cluster. I suppose we could just allow it and the user has to understand this fact?? > > > > if (dr_path) { > > /* only scan part of the fabric */ > > @@ -334,19 +344,37 @@ int main(int argc, char **argv) > > guid_str); > > } > > > > - if (resolved >= 0) > > - if ((fabric = ibnd_discover_fabric(ibmad_port, &port_id, > > - hops)) == NULL) > > - IBWARN > > - ("Single node discover failed; attempting full scan\n"); > > - > > - if (!fabric) > > - if ((fabric = > > - ibnd_discover_fabric(ibmad_port, NULL, -1)) == NULL) { > > - fprintf(stderr, "discover failed\n"); > > - rc = 1; > > - goto close_port; > > + if (resolved >= 0) { > > + if (load_cache_file) { > > + if ((fabric = ibnd_load_fabric(load_cache_file, 0)) == NULL) { > > + fprintf(stderr, "loading cached fabric failed\n"); > > + exit(1); > > + } > > + } > > + else { > > + if ((fabric = ibnd_discover_fabric(ibmad_port, &port_id, > > + hops)) == NULL) > > + IBWARN > > + ("Single node discover failed; attempting full scan\n"); > > + } > > + } > > + > > + if (!fabric) { > > + if (load_cache_file) { > > + if ((fabric = ibnd_load_fabric(load_cache_file, 0)) == NULL) { > > + fprintf(stderr, "loading cached fabric failed\n"); > > + exit(1); > > + } > > } > > + else { > > + if ((fabric = > > + ibnd_discover_fabric(ibmad_port, NULL, -1)) == NULL) { > > + fprintf(stderr, "discover failed\n"); > > + rc = 1; > > + goto close_port; > > + } > > + } > > + } > > It doesn't look so simple for me. In case when '-S' or '-D' are used the > information is requested for only specific node. So the flow of initial > discovery (above) would likely look as: > > if (load_cache_file) > load_fabric_from_cache(); > else { > if (resloved) > fabric = single_node_discovery(); > if (!fabric) > fabric = full_fabric_discovery(); > } > > .... > > Correct? Yeah, I can make it simpler. I will submit a new patch on Monday. > > > > if (!all && guid_str) { > > ibnd_node_t *sw = ibnd_find_node_guid(fabric, guid); > > diff --git a/infiniband-diags/src/ibqueryerrors.c b/infiniband-diags/src/ibqueryerrors.c > > index 47bd2af..3ed0ed1 100644 > > --- a/infiniband-diags/src/ibqueryerrors.c > > +++ b/infiniband-diags/src/ibqueryerrors.c > > @@ -58,6 +58,8 @@ > > struct ibmad_port *ibmad_port; > > static char *node_name_map_file = NULL; > > static nn_map_t *node_name_map = NULL; > > +static char *load_cache_file = NULL; > > + > > int data_counters = 0; > > int port_config = 0; > > uint64_t node_guid = 0; > > @@ -481,6 +483,9 @@ static int process_opt(void *context, int ch, char *optarg) > > case 6: > > details = 1; > > break; > > + case 7: > > + load_cache_file = strdup(optarg); > > + break; > > case 'G': > > case 'S': > > node_guid_str = optarg; > > @@ -542,6 +547,7 @@ int main(int argc, char **argv) > > "Clear error counters after read"}, > > {"clear-counts", 'K', 0, NULL, > > "Clear data counters after read"}, > > + {"load-cache", 7, 1, "", "filename of ibnetdiscover cache to load"}, > > {0} > > }; > > char usage_args[] = ""; > > @@ -568,6 +574,11 @@ int main(int argc, char **argv) > > > > node_name_map = open_node_name_map(node_name_map_file); > > > > + if (dr_path && load_cache_file) { > > + fprintf(stderr, "Cannot specify cache and direct route path\n"); > > + exit(1); > > + } > > + > > /* limit the scan the fabric around the target */ > > if (dr_path) { > > if ((resolved = > > @@ -584,19 +595,37 @@ int main(int argc, char **argv) > > node_guid_str); > > } > > > > - if (resolved >= 0) > > - if ((fabric = ibnd_discover_fabric(ibmad_port, &portid, > > - 0)) == NULL) > > - IBWARN > > - ("Single node discover failed; attempting full scan"); > > - > > - if (!fabric) /* do a full scan */ > > - if ((fabric = > > - ibnd_discover_fabric(ibmad_port, NULL, -1)) == NULL) { > > - fprintf(stderr, "discover failed\n"); > > - rc = 1; > > - goto close_port; > > + if (resolved >= 0) { > > + if (load_cache_file) { > > + if ((fabric = ibnd_load_fabric(load_cache_file, 0)) == NULL) { > > + fprintf(stderr, "loading cached fabric failed\n"); > > + exit(1); > > + } > > + } > > + else { > > + if ((fabric = ibnd_discover_fabric(ibmad_port, &portid, > > + 0)) == NULL) > > + IBWARN > > + ("Single node discover failed; attempting full scan"); > > } > > + } > > + > > + if (!fabric) { /* do a full scan */ > > + if (load_cache_file) { > > + if ((fabric = ibnd_load_fabric(load_cache_file, 0)) == NULL) { > > + fprintf(stderr, "loading cached fabric failed\n"); > > + exit(1); > > + } > > + } > > + else { > > + if ((fabric = > > + ibnd_discover_fabric(ibmad_port, NULL, -1)) == NULL) { > > + fprintf(stderr, "discover failed\n"); > > + rc = 1; > > + goto close_port; > > + } > > + } > > + } > > Ditto. Yup, I'll submit a new patch on Monday. > Sasha > > > > > report_suppressed(); > > > > -- > > 1.5.4.5 > > > -- Albert Chu chu11-i2BcT+NCU+M@public.gmane.org Computer Scientist High Performance Systems Division Lawrence Livermore National Laboratory -- To unsubscribe from this list: send the line "unsubscribe linux-rdma" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html