All of lore.kernel.org
 help / color / mirror / Atom feed
* Work completion error: "transport retry counter exceeded"
@ 2012-07-26  2:07 Ira Weiny
       [not found] ` <20120725190719.475605dc169353b775cd3463-i2BcT+NCU+M@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Ira Weiny @ 2012-07-26  2:07 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA

I am at a loss.  I am hacking some RDMA code to do an RDMA write from a server
to a client.

I have it working perfectly on a small 2 node test system.  When I move the
code to another system I am getting a "transport retry counter exceeded"
error.  I just can't figure out why an RDMA Write is timing out like this.

Which ibv_qp_init_attr fields in ibv_create_qp or ibv_qp_attr fields in
ibv_modify_qp might I have to change to account for different hardware?

NOTE: On both of these tests I am trying to xfer data from 2 nodes on the same
switch.  The hardware is different and the payload is small (<512 bytes).

Here is my init code on the server side:

rdma_create_qp...

	struct ibv_qp *qp;

	struct ibv_qp_init_attr attr = {
		.send_cq = p_sa->rdma_ctx.cq,
		.recv_cq = p_sa->rdma_ctx.cq,
		.cap     = {
			.max_send_wr  = 10,
			.max_recv_wr  = 500,
			.max_send_sge = 1,
			.max_recv_sge = 1
		},
		.qp_type = IBV_QPT_RC
	};
	
	qp = ibv_create_qp(p_sa->rdma_ctx.pd, &attr);
	if (!qp) {
		return (IB_INSUFFICIENT_RESOURCES);
	}

	{
		struct ibv_qp_attr attr = {
			.qp_state        = IBV_QPS_INIT,
			.pkey_index      = 0,
			.port_num        = rdma_ctx.device_port,
			.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE
		};

		if (ibv_modify_qp(qp, &attr,
				  IBV_QP_STATE              |
				  IBV_QP_PKEY_INDEX         |
				  IBV_QP_PORT               |
				  IBV_QP_ACCESS_FLAGS)) {
			goto DestroyQP;
		}
	}
...

rdma_modify_qp...

	/* transition it to RTR/RTS with eth_info and path record */
	struct ibv_qp_attr attr = {
		.qp_state		= IBV_QPS_RTR,
		.path_mtu		= path->mtu,
		.dest_qp_num		= eth_info->qpn,
		.rq_psn			= 1,
		.max_dest_rd_atomic	= 1,
		.min_rnr_timer		= 12,
		.ah_attr		= {
			.is_global	= 0,
			.dlid		= path->dlid,
			.sl		= path->sl,
			.src_path_bits	= 0,
			.port_num	= rdma_ctx.device_port
		}
	};

	if (ibv_modify_qp(qp, &attr,
			  IBV_QP_STATE              |
			  IBV_QP_PATH_MTU           |
			  IBV_QP_DEST_QPN           |
			  IBV_QP_RQ_PSN             |
			  IBV_QP_MAX_DEST_RD_ATOMIC |
			  IBV_QP_MIN_RNR_TIMER      |
			  IBV_QP_AV)) {
		return 1;
	}

	attr.qp_state	    = IBV_QPS_RTS;
	attr.timeout	    = 14;
	attr.retry_cnt	    = 7;
	attr.rnr_retry	    = 7;
	attr.sq_psn	    = 1;
	attr.max_rd_atomic  = 1;
	if (ibv_modify_qp(qp, &attr,
			  IBV_QP_STATE              |
			  IBV_QP_TIMEOUT            |
			  IBV_QP_RETRY_CNT          |
			  IBV_QP_RNR_RETRY          |
			  IBV_QP_SQ_PSN             |
			  IBV_QP_MAX_QP_RD_ATOMIC)) {
		return 1;
	}
...


Here is the code on the client:

rdma_create_qp...

	struct ibv_qp_init_attr attr = {
		.send_cq = rdma_ctx.cq,
		.recv_cq = rdma_ctx.cq,
		.cap     = {
			.max_send_wr  = 10,
			.max_recv_wr  = 10,
			.max_send_sge = 1,
			.max_recv_sge = 1
		},
		.qp_type = IBV_QPT_RC
	};
	
	qp = ibv_create_qp(rdma_ctx.pd, &attr);
	if (!qp) {
		return (-ENOMEM);
	}

	{
		struct ibv_qp_attr attr = {
			.qp_state        = IBV_QPS_INIT,
			.pkey_index      = 0,
			.port_num	 = rdma_ctx.device_port,
			.qp_access_flags = IBV_ACCESS_REMOTE_WRITE | IBV_ACCESS_REMOTE_READ | IBV_ACCESS_LOCAL_WRITE
		};

		if (ibv_modify_qp(qp, &attr,
				  IBV_QP_STATE              |
				  IBV_QP_PKEY_INDEX         |
				  IBV_QP_PORT               |
				  IBV_QP_ACCESS_FLAGS)) {
			return (-ENOMEM);
		}
	}

...

rdma_connect_qp...

	struct ibv_qp_attr attr = {
		.qp_state		= IBV_QPS_RTR,
		.path_mtu		= conn->path.mtu,
		.dest_qp_num		= conn->rqpn,
		.rq_psn			= 1,
		.max_dest_rd_atomic	= 1,
		.min_rnr_timer		= 12,
		.ah_attr		= {
			.is_global	= 0,
			.dlid		= conn->path.dlid,
			.sl		= conn->path.sl,
			.src_path_bits	= 0,
			.port_num	= rdma_ctx.device_port
		}
	};

	if (ibv_modify_qp(conn->qp, &attr,
			  IBV_QP_STATE              |
			  IBV_QP_PATH_MTU           |
			  IBV_QP_DEST_QPN           |
			  IBV_QP_RQ_PSN             |
			  IBV_QP_MAX_DEST_RD_ATOMIC |
			  IBV_QP_MIN_RNR_TIMER      |
			  IBV_QP_AV)) {
		return 1;
	}

	attr.qp_state	     = IBV_QPS_RTS;
	attr.timeout	     = 14;
	attr.retry_cnt	     = 7;
	attr.rnr_retry	     = 7;
	attr.sq_psn	     = 1;
	attr.max_rd_atomic   = 1;
	if (ibv_modify_qp(conn->qp, &attr,
			  IBV_QP_STATE              |
			  IBV_QP_TIMEOUT            |
			  IBV_QP_RETRY_CNT          |
			  IBV_QP_RNR_RETRY          |
			  IBV_QP_SQ_PSN             |
			  IBV_QP_MAX_QP_RD_ATOMIC)) {
		return 1;
	}
...


Thanks,
Ira

-- 
Ira Weiny
Member of Technical Staff
Lawrence Livermore National Lab
925-423-8008
weiny2-i2BcT+NCU+M@public.gmane.org
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: Work completion error: "transport retry counter exceeded"
       [not found] ` <20120725190719.475605dc169353b775cd3463-i2BcT+NCU+M@public.gmane.org>
@ 2012-07-26  7:15   ` Roland Dreier
       [not found]     ` <CAL1RGDWmpHy43b5TarBWpUk1RXOEdNitXWE4+xRCspgfpwUisQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Roland Dreier @ 2012-07-26  7:15 UTC (permalink / raw)
  To: Ira Weiny; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Wed, Jul 25, 2012 at 7:07 PM, Ira Weiny <weiny2-i2BcT+NCU+M@public.gmane.org> wrote:
>         attr.timeout         = 14;

Is this timeout sufficient to account for the round trip on
the fabric and the ack delay on the remote HCA?

I don't think there are any other attributes that would affect
getting transport retries.

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: Work completion error: "transport retry counter exceeded"
       [not found]     ` <CAL1RGDWmpHy43b5TarBWpUk1RXOEdNitXWE4+xRCspgfpwUisQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2012-07-26 12:21       ` Albert Strasheim
       [not found]         ` <CALfB72Ad4+R48Nc-kawsrk1JQo964OkJ6DE46mcR5b9pS2_hEA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Albert Strasheim @ 2012-07-26 12:21 UTC (permalink / raw)
  To: Roland Dreier; +Cc: Ira Weiny, linux-rdma-u79uwXL29TY76Z2rM5mHXA

Hello

On Thu, Jul 26, 2012 at 9:15 AM, Roland Dreier <roland-BHEL68pLQRGGvPXPguhicg@public.gmane.org> wrote:
> On Wed, Jul 25, 2012 at 7:07 PM, Ira Weiny <weiny2-i2BcT+NCU+M@public.gmane.org> wrote:
>>         attr.timeout         = 14;
> Is this timeout sufficient to account for the round trip on
> the fabric and the ack delay on the remote HCA?
> I don't think there are any other attributes that would affect
> getting transport retries.

I wonder if I might be seeing the same thing...

How does one choose a good value for this setting?

Apparently it maps to 4.096 x 2 ^ attr.timeout microseconds.

What's the maximum value one can set here?

What can go wrong if one goes for the maximum value?

Regards

Albert
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: Work completion error: "transport retry counter exceeded"
       [not found]         ` <CALfB72Ad4+R48Nc-kawsrk1JQo964OkJ6DE46mcR5b9pS2_hEA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2012-07-26 17:45           ` Roland Dreier
       [not found]             ` <CAL1RGDVwy56YL7OLxVuvap5WZRzzZsosQmKBkhWZB73uy3ysDg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Roland Dreier @ 2012-07-26 17:45 UTC (permalink / raw)
  To: Albert Strasheim; +Cc: Ira Weiny, linux-rdma-u79uwXL29TY76Z2rM5mHXA

> I wonder if I might be seeing the same thing...
>
> How does one choose a good value for this setting?
>
> Apparently it maps to 4.096 x 2 ^ attr.timeout microseconds.
>
> What's the maximum value one can set here?
>
> What can go wrong if one goes for the maximum value?

In theory you want a timeout of around 2 * max packet life in the fabric
(ie max RTT) plus max remote HCA ack time (reported in device properties).

Max value is 31, which maps to a few hours.  If you choose that, then
a single lost packet will stall your connection for many hours (if you
choose 7 retries) before reporting an error.

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: Work completion error: "transport retry counter exceeded"
       [not found]             ` <CAL1RGDVwy56YL7OLxVuvap5WZRzzZsosQmKBkhWZB73uy3ysDg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2012-07-27 16:50               ` Paul Grun
  2012-07-27 17:08                 ` Roland Dreier
  2012-07-27 17:33                 ` Albert Strasheim
  0 siblings, 2 replies; 10+ messages in thread
From: Paul Grun @ 2012-07-27 16:50 UTC (permalink / raw)
  To: 'Roland Dreier', 'Albert Strasheim'
  Cc: 'Ira Weiny', linux-rdma-u79uwXL29TY76Z2rM5mHXA

In general this is correct.  This question came up recently in an entirely
different context (it happened to be RoCE), but the failure was strikingly
similar.  For those interested, here's the view from the IB spec
perspective.

============================================================================
==========================

There are two possible issues here, normal retries and the RNR-NAK protocol.

Normal Retries-
The transport can retry two types of errors (timeouts and sequence number
errors).  There is a 3-bit counter that the transport decrements whenever it
retries a packet due to a timeout or a NAK-sequence error. If the counter
expires, the message transfer (e.g. SEND, RDMA WRITE...) is terminated and
the work request is completed and marked in error which is how the verbs are
notified of the error.  This retry counter is an attribute of the QP and is
set using the Modify QP verb.

Timeouts are due to expiration of a thing called the Transport Timer, which
has a minimum duration of 8.192uS.  The Transport Timer is used to detect
genuinely lost packets and really bad stuff happening in the fabric.  The
transport starts the timer when it initiates its first work request, and
resets it every time a valid acknowledge message is received.  If the timer
expires, it means that the requester hasn't seen an acknowledge of any sort
for a really long time.  The value of this timer is also an attribute of the
QP and is set using the Modify QP verb. Setting the timer value to zero
disables the timer.

If the Transport Timer expires, the requester signals a locally detected
error.

It is very hard to predict these retry intervals.  If the error is due to a
NAK-sequence error (which means that the responder saw an out of sequence
packet), the requester will retry it right away.  Retries due to timeouts
are virtually impossible to predict.

RNR-NAK-
There are two parameters associated with this: the number of times an
RNR-NAK can be retried, and the interval between retries.  The number of
times an RNR-NAK can be retried is negotiated by the two parties during
connection establishment.  As above, this 3-bit counter, called "RNR Retry
Count" is an attribute of the QP and is set using the Modify QP verb.  A
value of 7 (111) means infinite retry.

If the counter expires, meaning that the requester received too many
RNR-NAKs, the requester signals a locally detected error.

Whenever it generates an RNR-NAK, the Responder indicates the minimum
interval that the requester must wait before retrying the request. This
value is returned to the requester as a field in the RNR-NAK, and can range
from .01mS up to 655.36mS.  As above, this is an attribute of the QP and
is set using the Modify QP verb.

============================================================================
==========================

Note that both an "RNR-NAK retry count exceeded" and a "timeout" error are
reported in the same way, as a locally detected error.

Ira, are you by any chance sending immediate data with your RDMA Write?  

-Paul

> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Roland Dreier
> Sent: Thursday, July 26, 2012 10:45 AM
> To: Albert Strasheim
> Cc: Ira Weiny; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: Work completion error: "transport retry counter exceeded"
> 
> > I wonder if I might be seeing the same thing...
> >
> > How does one choose a good value for this setting?
> >
> > Apparently it maps to 4.096 x 2 ^ attr.timeout microseconds.
> >
> > What's the maximum value one can set here?
> >
> > What can go wrong if one goes for the maximum value?
> 
> In theory you want a timeout of around 2 * max packet life in the fabric
> (ie max RTT) plus max remote HCA ack time (reported in device properties).
> 
> Max value is 31, which maps to a few hours.  If you choose that, then a
> single lost packet will stall your connection for many hours (if you
> choose 7 retries) before reporting an error.
> 
>  - R.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: Work completion error: "transport retry counter exceeded"
  2012-07-27 16:50               ` Paul Grun
@ 2012-07-27 17:08                 ` Roland Dreier
  2012-07-27 17:33                 ` Albert Strasheim
  1 sibling, 0 replies; 10+ messages in thread
From: Roland Dreier @ 2012-07-27 17:08 UTC (permalink / raw)
  To: Paul Grun; +Cc: Albert Strasheim, Ira Weiny, linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Fri, Jul 27, 2012 at 9:50 AM, Paul Grun <pgrun-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org> wrote:
> Note that both an "RNR-NAK retry count exceeded" and a "timeout" error are
> reported in the same way, as a locally detected error.

Not quite right.  There are two different work completion statuses:

        IBV_WC_RETRY_EXC_ERR
        IBV_WC_RNR_RETRY_EXC_ERR

which libibverbs will format as

        "transport retry counter exceeded"
        "RNR retry counter exceeded"

so it is pretty easy to tell the difference between the two errors.

 - R.
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: Work completion error: "transport retry counter exceeded"
  2012-07-27 16:50               ` Paul Grun
  2012-07-27 17:08                 ` Roland Dreier
@ 2012-07-27 17:33                 ` Albert Strasheim
       [not found]                   ` <CALfB72A+ghTETXqVt63YW-cWF_ygiEDkFq9SvQos=Vuv4ZcfwQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  1 sibling, 1 reply; 10+ messages in thread
From: Albert Strasheim @ 2012-07-27 17:33 UTC (permalink / raw)
  To: Paul Grun; +Cc: Roland Dreier, Ira Weiny, linux-rdma-u79uwXL29TY76Z2rM5mHXA

Hello

On Fri, Jul 27, 2012 at 6:50 PM, Paul Grun <pgrun-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org> wrote:
> Ira, are you by any chance sending immediate data with your RDMA Write?

Out of curiosity, what would be the significance if the answer to this
question was yes?

Regards

Albert
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: Work completion error: "transport retry counter exceeded"
       [not found]                   ` <CALfB72A+ghTETXqVt63YW-cWF_ygiEDkFq9SvQos=Vuv4ZcfwQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2012-07-27 17:42                     ` Ira Weiny
       [not found]                       ` <20120727104212.293cdc3e4a14ad267988d6ee-i2BcT+NCU+M@public.gmane.org>
  2012-07-27 18:51                     ` Paul Grun
  1 sibling, 1 reply; 10+ messages in thread
From: Ira Weiny @ 2012-07-27 17:42 UTC (permalink / raw)
  To: Albert Strasheim
  Cc: Paul Grun, Roland Dreier, linux-rdma-u79uwXL29TY76Z2rM5mHXA

First, I have gotten pulled into another project so I have not been able to debug this further.

I __really__ appreciate all the responses and will report back when I have found more information.

Thanks!

On Fri, 27 Jul 2012 19:33:18 +0200
Albert Strasheim <fullung-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:

> Hello
> 
> On Fri, Jul 27, 2012 at 6:50 PM, Paul Grun <pgrun-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org> wrote:
> > Ira, are you by any chance sending immediate data with your RDMA Write?
> 
> Out of curiosity, what would be the significance if the answer to this
> question was yes?

For me the answer is "no".

Thanks again,
Ira

> 
> Regards
> 
> Albert
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


-- 
Ira Weiny
Member of Technical Staff
Lawrence Livermore National Lab
925-423-8008
weiny2-i2BcT+NCU+M@public.gmane.org
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* RE: Work completion error: "transport retry counter exceeded"
       [not found]                   ` <CALfB72A+ghTETXqVt63YW-cWF_ygiEDkFq9SvQos=Vuv4ZcfwQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2012-07-27 17:42                     ` Ira Weiny
@ 2012-07-27 18:51                     ` Paul Grun
  1 sibling, 0 replies; 10+ messages in thread
From: Paul Grun @ 2012-07-27 18:51 UTC (permalink / raw)
  To: 'Albert Strasheim'
  Cc: 'Roland Dreier', 'Ira Weiny',
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

The only way to get an RNR-NAK when doing an RDMA WRITE operation is if the
RDMA WRITE includes immediate data.  If Ira is not using immediate data,
then the cause of the local error cannot be due to an excessive number of
RNR-NAKs.

In any event though, Roland points out that the verbs can distinguish
between an RNR-NAK counter expiration (IBV_WC_RNR_RETRY_EXC_ERR) and a retry
counter expiration (IBV_WC_RETRY_EXC_ERR).  So my question is moot.

-Paul

> -----Original Message-----
> From: linux-rdma-owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org [mailto:linux-rdma-
> owner-u79uwXL29TY76Z2rM5mHXA@public.gmane.org] On Behalf Of Albert Strasheim
> Sent: Friday, July 27, 2012 10:33 AM
> To: Paul Grun
> Cc: Roland Dreier; Ira Weiny; linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> Subject: Re: Work completion error: "transport retry counter exceeded"
> 
> Hello
> 
> On Fri, Jul 27, 2012 at 6:50 PM, Paul Grun <pgrun-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org>
> wrote:
> > Ira, are you by any chance sending immediate data with your RDMA Write?
> 
> Out of curiosity, what would be the significance if the answer to this
> question was yes?
> 
> Regards
> 
> Albert
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: Work completion error: "transport retry counter exceeded"
       [not found]                       ` <20120727104212.293cdc3e4a14ad267988d6ee-i2BcT+NCU+M@public.gmane.org>
@ 2012-08-01 20:08                         ` Ira Weiny
  0 siblings, 0 replies; 10+ messages in thread
From: Ira Weiny @ 2012-08-01 20:08 UTC (permalink / raw)
  To: Ira Weiny
  Cc: Albert Strasheim, Paul Grun, Roland Dreier,
	linux-rdma-u79uwXL29TY76Z2rM5mHXA

On Fri, 27 Jul 2012 10:42:12 -0700
Ira Weiny <weiny2-i2BcT+NCU+M@public.gmane.org> wrote:

> First, I have gotten pulled into another project so I have not been able to debug this further.
> 
> I __really__ appreciate all the responses and will report back when I have found more information.

I feel really stupid admitting this... but I feel it is important to close this thread for other users who may have this issue.

The bottom line: I was sending to the wrong lid.

The details are: I had a copy-and-paste error where I incorrectly used "uint8_t" for a lid.  Therefore when run on a large system (lid > 255) this would fail where it would run fine on a 2-node test system.  :-(

<sigh>  Like I said I feel pretty stupid right now.  Thank you _very_ much for all the help.

Ira

> 
> Thanks!
> 
> On Fri, 27 Jul 2012 19:33:18 +0200
> Albert Strasheim <fullung-Re5JQEeQqe8AvxtiuMwx3w@public.gmane.org> wrote:
> 
> > Hello
> > 
> > On Fri, Jul 27, 2012 at 6:50 PM, Paul Grun <pgrun-klaOcWyJdxkshyMvu7JE4pqQE7yCjDx5@public.gmane.org> wrote:
> > > Ira, are you by any chance sending immediate data with your RDMA Write?
> > 
> > Out of curiosity, what would be the significance if the answer to this
> > question was yes?
> 
> For me the answer is "no".
> 
> Thanks again,
> Ira
> 
> > 
> > Regards
> > 
> > Albert
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 
> 
> -- 
> Ira Weiny
> Member of Technical Staff
> Lawrence Livermore National Lab
> 925-423-8008
> weiny2-i2BcT+NCU+M@public.gmane.org
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


-- 
Ira Weiny
Member of Technical Staff
Lawrence Livermore National Lab
925-423-8008
weiny2-i2BcT+NCU+M@public.gmane.org
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2012-08-01 20:08 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-26  2:07 Work completion error: "transport retry counter exceeded" Ira Weiny
     [not found] ` <20120725190719.475605dc169353b775cd3463-i2BcT+NCU+M@public.gmane.org>
2012-07-26  7:15   ` Roland Dreier
     [not found]     ` <CAL1RGDWmpHy43b5TarBWpUk1RXOEdNitXWE4+xRCspgfpwUisQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2012-07-26 12:21       ` Albert Strasheim
     [not found]         ` <CALfB72Ad4+R48Nc-kawsrk1JQo964OkJ6DE46mcR5b9pS2_hEA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2012-07-26 17:45           ` Roland Dreier
     [not found]             ` <CAL1RGDVwy56YL7OLxVuvap5WZRzzZsosQmKBkhWZB73uy3ysDg-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2012-07-27 16:50               ` Paul Grun
2012-07-27 17:08                 ` Roland Dreier
2012-07-27 17:33                 ` Albert Strasheim
     [not found]                   ` <CALfB72A+ghTETXqVt63YW-cWF_ygiEDkFq9SvQos=Vuv4ZcfwQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2012-07-27 17:42                     ` Ira Weiny
     [not found]                       ` <20120727104212.293cdc3e4a14ad267988d6ee-i2BcT+NCU+M@public.gmane.org>
2012-08-01 20:08                         ` Ira Weiny
2012-07-27 18:51                     ` Paul Grun

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.