All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] Handling BUSY responses from the SM.
@ 2010-10-26 18:33 Mike Heinz
       [not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D4675DED-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
  0 siblings, 1 reply; 5+ messages in thread
From: Mike Heinz @ 2010-10-26 18:33 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA

Resending. Didn't get any reply after sending the last time.

-----Original Message-----
From: Mike Heinz 
Sent: Monday, October 11, 2010 1:24 PM
To: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'Hefty, Sean'
Cc: Todd Rimmer
Subject: [PATCH] Handling BUSY responses from the SM.

This patch builds upon feedback received earlier this year to add a "treat BUSY as timeout" feature to ib_mad. It does NOT implement the ABI/API changes that would be needed in user space to take advantage of the new feature, but it lays the groundwork for doing so. In addition, it provides a new module parameter that allow the administrator to coerce existing code into using the new capability.

The patch builds upon the randomization/backoff patch I sent earlier today to add a random factor to timeouts to prevent synchronized storms of MAD queries. I chose to build upon the existing timeout handling because it seemed the best way to add the functionality without 

Initially, I had tried to completely separate BUSY retries from timeout handling, but that seemed difficult due to the way the timeout code is structured. As a result, true timeouts and busy handling still use the same timeout values, but I was still able to address the idea of randomizing the retry timeout if desired.

By default, the behavior of ib_mad with respect to BUSY responses is unchanged. If, however, a send work request is provided that has the new "busy_wait" parameter set, ib_mad will ignore BUSY responses to that WR, allowing it to timeout and retry as if no response had been received. 

Finally, I've added a module parameter to coerce all mad work requests to use this new feature:

parm:           treat_busy_as_timeout:When true, treat BUSY responses as if they were timeouts. (int)

As I mentioned in the past, this change solves a problem we see in the real world all the time (the SM being pounded by "unintelligent" queries) so I strongly hope this meets your concerns.

----

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 3b03f1c..9e5e566 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -60,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
 module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
 
+int mad_wait_on_busy = 0;
+module_param_named(treat_busy_as_timeout, mad_wait_on_busy, int, 0444);
+MODULE_PARM_DESC(treat_busy_as_timeout, "When true, treat BUSY responses as if they were timeouts.");
+
 int mad_randomized_wait = 0;
 module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
 MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff algorithm to control retries for timeouts.");
@@ -1120,6 +1124,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
 
 		mad_send_wr->max_retries = send_buf->retries;
 		mad_send_wr->retries_left = send_buf->retries;
+		mad_send_wr->wait_on_busy = send_buf->wait_on_busy || mad_wait_on_busy;
 		
 		send_buf->retries = 0;
 		
@@ -1819,6 +1824,8 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
 
 	/* Complete corresponding request */
 	if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+		u16 busy = __be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.status) &
+						IB_MGMT_MAD_STATUS_BUSY;
 
 		spin_lock_irqsave(&mad_agent_priv->lock, flags);
 		mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
@@ -1829,6 +1836,17 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
 			return;
 		}
 
+		printk(KERN_DEBUG PFX "Completing recv %p: busy = %d, retries_left = %d, wait_on_busy = %d\n",
+			mad_send_wr, busy, mad_send_wr->retries_left, mad_send_wr->wait_on_busy);
+		if (busy && mad_send_wr->retries_left && mad_send_wr->wait_on_busy) {
+			/* Just let the query timeout and have it requeued later */
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			ib_free_recv_mad(mad_recv_wc);
+			deref_mad_agent(mad_agent_priv);
+			printk(KERN_INFO PFX "SA/SM responded MAD_STATUS_BUSY. Allowing request to time out.\n");
+			return;
+		}
+
 		ib_mark_mad_done(mad_send_wr);
 		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 01fb7ed..1d0629e 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -135,6 +135,7 @@ struct ib_mad_send_wr_private {
 	unsigned long total_timeout;
 	int max_retries;
 	int retries_left;
+	int wait_on_busy;
 	int randomized_wait;
 	int retry;
 	int refcount;
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index c3d6efb..3da55c3 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -255,6 +255,7 @@ struct ib_mad_send_buf {
 	int			seg_count;
 	int			seg_size;
 	int			timeout_ms;
+	int			wait_on_busy;
 	int			randomized_wait;
 	int			retries;
 };
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH] Handling BUSY responses from the SM.
       [not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D4675DED-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
@ 2010-10-28 13:14   ` Hal Rosenstock
       [not found]     ` <4CC97718.5080002-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
  2010-11-04 15:12   ` Hefty, Sean
  1 sibling, 1 reply; 5+ messages in thread
From: Hal Rosenstock @ 2010-10-28 13:14 UTC (permalink / raw)
  To: Mike Heinz; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA

On 10/26/2010 2:33 PM, Mike Heinz wrote:
> Resending. Didn't get any reply after sending the last time.
>
> -----Original Message-----
> From: Mike Heinz
> Sent: Monday, October 11, 2010 1:24 PM
> To: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'Hefty, Sean'
> Cc: Todd Rimmer
> Subject: [PATCH] Handling BUSY responses from the SM.
>
> This patch builds upon feedback received earlier this year to add a "treat BUSY as timeout" feature to ib_mad. It does NOT implement the ABI/API changes that would be needed in user space to take advantage of the new feature, but it lays the groundwork for doing so. In addition, it provides a new module parameter that allow the administrator to coerce existing code into using the new capability.
 >
> The patch builds upon the randomization/backoff patch I sent earlier today to add a random factor to timeouts to prevent synchronized storms of MAD queries. I chose to build upon the existing timeout handling because it seemed the best way to add the functionality without
>
> Initially, I had tried to completely separate BUSY retries from timeout handling, but that seemed difficult due to the way the timeout code is structured. As a result, true timeouts and busy handling still use the same timeout values, but I was still able to address the idea of randomizing the retry timeout if desired.
>
> By default, the behavior of ib_mad with respect to BUSY responses is unchanged. If, however, a send work request is provided that has the new "busy_wait" parameter set, ib_mad will ignore BUSY responses to that WR, allowing it to timeout and retry as if no response had been received.

Is setting this useful without the randomization also set for this (or 
all) transaction (request/response) WRs ?

> Finally, I've added a module parameter to coerce all mad work requests to use this new feature:
>
> parm:           treat_busy_as_timeout:When true, treat BUSY responses as if they were timeouts. (int)
>
> As I mentioned in the past, this change solves a problem we see in the real world all the time (the SM being pounded by "unintelligent" queries) so I strongly hope this meets your concerns.
>
> ----
>
> diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
> index 3b03f1c..9e5e566 100644
> --- a/drivers/infiniband/core/mad.c
> +++ b/drivers/infiniband/core/mad.c
> @@ -60,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
>   module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
>   MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
>
> +int mad_wait_on_busy = 0;
> +module_param_named(treat_busy_as_timeout, mad_wait_on_busy, int, 0444);
> +MODULE_PARM_DESC(treat_busy_as_timeout, "When true, treat BUSY responses as if they were timeouts.");
> +
>   int mad_randomized_wait = 0;
>   module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
>   MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff algorithm to control retries for timeouts.");
> @@ -1120,6 +1124,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
>
>   		mad_send_wr->max_retries = send_buf->retries;
>   		mad_send_wr->retries_left = send_buf->retries;
> +		mad_send_wr->wait_on_busy = send_buf->wait_on_busy || mad_wait_on_busy;
>   		
>   		send_buf->retries = 0;
>   		
> @@ -1819,6 +1824,8 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
>
>   	/* Complete corresponding request */
>   	if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
> +		u16 busy = __be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.status)&
> +						IB_MGMT_MAD_STATUS_BUSY;

Should this just use be16_to_cpu be used here for consistency ?

Nit: the definition of IB_MGMT_MAD_STATUS_BUSY should be part of this 
patch rather than the previous one.

>
>   		spin_lock_irqsave(&mad_agent_priv->lock, flags);
>   		mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
> @@ -1829,6 +1836,17 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
>   			return;
>   		}
>
> +		printk(KERN_DEBUG PFX "Completing recv %p: busy = %d, retries_left = %d, wait_on_busy = %d\n",
> +			mad_send_wr, busy, mad_send_wr->retries_left, mad_send_wr->wait_on_busy);
> +		if (busy&&  mad_send_wr->retries_left&&  mad_send_wr->wait_on_busy) {

Nit: formatting: if (busy && mad_send_wr->retries_left && 
mad_send_wr->wait_on_busy) {

> +			/* Just let the query timeout and have it requeued later */
> +			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
> +			ib_free_recv_mad(mad_recv_wc);
> +			deref_mad_agent(mad_agent_priv);
> +			printk(KERN_INFO PFX "SA/SM responded MAD_STATUS_BUSY. Allowing request to time out.\n");

Do we need this printk ? Won't this spam the kernel log ?

-- Hal

> +			return;
> +		}
> +
>   		ib_mark_mad_done(mad_send_wr);
>   		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
>
> diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
> index 01fb7ed..1d0629e 100644
> --- a/drivers/infiniband/core/mad_priv.h
> +++ b/drivers/infiniband/core/mad_priv.h
> @@ -135,6 +135,7 @@ struct ib_mad_send_wr_private {
>   	unsigned long total_timeout;
>   	int max_retries;
>   	int retries_left;
> +	int wait_on_busy;
>   	int randomized_wait;
>   	int retry;
>   	int refcount;
> diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
> index c3d6efb..3da55c3 100644
> --- a/include/rdma/ib_mad.h
> +++ b/include/rdma/ib_mad.h
> @@ -255,6 +255,7 @@ struct ib_mad_send_buf {
>   	int			seg_count;
>   	int			seg_size;
>   	int			timeout_ms;
> +	int			wait_on_busy;
>   	int			randomized_wait;
>   	int			retries;
>   };

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH] Handling BUSY responses from the SM.
       [not found]     ` <4CC97718.5080002-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
@ 2010-10-28 13:27       ` Hal Rosenstock
  0 siblings, 0 replies; 5+ messages in thread
From: Hal Rosenstock @ 2010-10-28 13:27 UTC (permalink / raw)
  To: Mike Heinz; +Cc: linux-rdma-u79uwXL29TY76Z2rM5mHXA

On 10/28/2010 9:14 AM, Hal Rosenstock wrote:

A couple more things I missed in my previous post on this:

> On 10/26/2010 2:33 PM, Mike Heinz wrote:
>> Resending. Didn't get any reply after sending the last time.
>>
>> -----Original Message-----
>> From: Mike Heinz
>> Sent: Monday, October 11, 2010 1:24 PM
>> To: 'linux-rdma-u79uwXL29TY76Z2rM5mHXA@public.gmane.org'; 'Hefty, Sean'
>> Cc: Todd Rimmer
>> Subject: [PATCH] Handling BUSY responses from the SM.
>>
>> This patch builds upon feedback received earlier this year to add a
>> "treat BUSY as timeout" feature to ib_mad. It does NOT implement the
>> ABI/API changes that would be needed in user space to take advantage
>> of the new feature, but it lays the groundwork for doing so. In
>> addition, it provides a new module parameter that allow the
>> administrator to coerce existing code into using the new capability.
>  >
>> The patch builds upon the randomization/backoff patch I sent earlier
>> today to add a random factor to timeouts to prevent synchronized
>> storms of MAD queries. I chose to build upon the existing timeout
>> handling because it seemed the best way to add the functionality without

incomplete sentence: without what ?

>>
>> Initially, I had tried to completely separate BUSY retries from
>> timeout handling, but that seemed difficult due to the way the timeout
>> code is structured. As a result, true timeouts and busy handling still
>> use the same timeout values, but I was still able to address the idea
>> of randomizing the retry timeout if desired.
>>
>> By default, the behavior of ib_mad with respect to BUSY responses is
>> unchanged. If, however, a send work request is provided that has the
>> new "busy_wait" parameter set, ib_mad will ignore BUSY responses to
>> that WR, allowing it to timeout and retry as if no response had been
>> received.
>
> Is setting this useful without the randomization also set for this (or
> all) transaction (request/response) WRs ?
>
>> Finally, I've added a module parameter to coerce all mad work requests
>> to use this new feature:
>>
>> parm: treat_busy_as_timeout:When true, treat BUSY responses as if they
>> were timeouts. (int)
>>
>> As I mentioned in the past, this change solves a problem we see in the
>> real world all the time (the SM being pounded by "unintelligent"
>> queries) so I strongly hope this meets your concerns.
>>
>> ----
>>
>> diff --git a/drivers/infiniband/core/mad.c
>> b/drivers/infiniband/core/mad.c
>> index 3b03f1c..9e5e566 100644
>> --- a/drivers/infiniband/core/mad.c
>> +++ b/drivers/infiniband/core/mad.c
>> @@ -60,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send
>> queue in number of work requests
>> module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
>> MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of
>> work requests");
>>
>> +int mad_wait_on_busy = 0;
>> +module_param_named(treat_busy_as_timeout, mad_wait_on_busy, int, 0444);
>> +MODULE_PARM_DESC(treat_busy_as_timeout, "When true, treat BUSY
>> responses as if they were timeouts.");
>> +
>> int mad_randomized_wait = 0;
>> module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
>> MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff
>> algorithm to control retries for timeouts.");
>> @@ -1120,6 +1124,7 @@ int ib_post_send_mad(struct ib_mad_send_buf
>> *send_buf,
>>
>> mad_send_wr->max_retries = send_buf->retries;
>> mad_send_wr->retries_left = send_buf->retries;
>> + mad_send_wr->wait_on_busy = send_buf->wait_on_busy || mad_wait_on_busy;
>>
>> send_buf->retries = 0;
>>
>> @@ -1819,6 +1824,8 @@ static void ib_mad_complete_recv(struct
>> ib_mad_agent_private *mad_agent_priv,
>>
>> /* Complete corresponding request */
>> if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
>> + u16 busy = __be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.status)&
>> + IB_MGMT_MAD_STATUS_BUSY;
>
> Should this just use be16_to_cpu be used here for consistency ?
>
> Nit: the definition of IB_MGMT_MAD_STATUS_BUSY should be part of this
> patch rather than the previous one.
>
>>
>> spin_lock_irqsave(&mad_agent_priv->lock, flags);
>> mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
>> @@ -1829,6 +1836,17 @@ static void ib_mad_complete_recv(struct
>> ib_mad_agent_private *mad_agent_priv,
>> return;
>> }
>>
>> + printk(KERN_DEBUG PFX "Completing recv %p: busy = %d, retries_left =
>> %d, wait_on_busy = %d\n",
>> + mad_send_wr, busy, mad_send_wr->retries_left,
>> mad_send_wr->wait_on_busy);
>> + if (busy&& mad_send_wr->retries_left&& mad_send_wr->wait_on_busy) {

This appears to include trap represses (as determined by 
ib_response_mad). Shouldn't busy be ignored for that case ? I don't 
think that would be used (e.g. trap repress sent w/ busy) but it seems 
safer to me. I think we previously discussed this way back in June.

-- Hal

> Nit: formatting: if (busy && mad_send_wr->retries_left &&
> mad_send_wr->wait_on_busy) {
>
>> + /* Just let the query timeout and have it requeued later */
>> + spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
>> + ib_free_recv_mad(mad_recv_wc);
>> + deref_mad_agent(mad_agent_priv);
>> + printk(KERN_INFO PFX "SA/SM responded MAD_STATUS_BUSY. Allowing
>> request to time out.\n");
>
> Do we need this printk ? Won't this spam the kernel log ?
>
> -- Hal
>
>> + return;
>> + }
>> +
>> ib_mark_mad_done(mad_send_wr);
>> spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
>>
>> diff --git a/drivers/infiniband/core/mad_priv.h
>> b/drivers/infiniband/core/mad_priv.h
>> index 01fb7ed..1d0629e 100644
>> --- a/drivers/infiniband/core/mad_priv.h
>> +++ b/drivers/infiniband/core/mad_priv.h
>> @@ -135,6 +135,7 @@ struct ib_mad_send_wr_private {
>> unsigned long total_timeout;
>> int max_retries;
>> int retries_left;
>> + int wait_on_busy;
>> int randomized_wait;
>> int retry;
>> int refcount;
>> diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
>> index c3d6efb..3da55c3 100644
>> --- a/include/rdma/ib_mad.h
>> +++ b/include/rdma/ib_mad.h
>> @@ -255,6 +255,7 @@ struct ib_mad_send_buf {
>> int seg_count;
>> int seg_size;
>> int timeout_ms;
>> + int wait_on_busy;
>> int randomized_wait;
>> int retries;
>> };
>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 5+ messages in thread

* RE: [PATCH] Handling BUSY responses from the SM.
       [not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D4675DED-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
  2010-10-28 13:14   ` Hal Rosenstock
@ 2010-11-04 15:12   ` Hefty, Sean
  1 sibling, 0 replies; 5+ messages in thread
From: Hefty, Sean @ 2010-11-04 15:12 UTC (permalink / raw)
  To: Mike Heinz, linux-rdma-u79uwXL29TY76Z2rM5mHXA

> diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
> index 3b03f1c..9e5e566 100644
> --- a/drivers/infiniband/core/mad.c
> +++ b/drivers/infiniband/core/mad.c
> @@ -60,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in
> number of work requests
>  module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
>  MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work
> requests");
> 
> +int mad_wait_on_busy = 0;
> +module_param_named(treat_busy_as_timeout, mad_wait_on_busy, int, 0444);
> +MODULE_PARM_DESC(treat_busy_as_timeout, "When true, treat BUSY responses
> as if they were timeouts.");

I'd prefer to avoid adding a module parameter and just have the mad layer do what's right.

> +
>  int mad_randomized_wait = 0;
>  module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
>  MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff
> algorithm to control retries for timeouts.");
> @@ -1120,6 +1124,7 @@ int ib_post_send_mad(struct ib_mad_send_buf
> *send_buf,
> 
>  		mad_send_wr->max_retries = send_buf->retries;
>  		mad_send_wr->retries_left = send_buf->retries;
> +		mad_send_wr->wait_on_busy = send_buf->wait_on_busy ||
> mad_wait_on_busy;
> 
>  		send_buf->retries = 0;
> 
> @@ -1819,6 +1824,8 @@ static void ib_mad_complete_recv(struct
> ib_mad_agent_private *mad_agent_priv,
> 
>  	/* Complete corresponding request */
>  	if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
> +		u16 busy = __be16_to_cpu(mad_recv_wc->recv_buf.mad-
> >mad_hdr.status) &
> +						IB_MGMT_MAD_STATUS_BUSY;
> 
>  		spin_lock_irqsave(&mad_agent_priv->lock, flags);
>  		mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
> @@ -1829,6 +1836,17 @@ static void ib_mad_complete_recv(struct
> ib_mad_agent_private *mad_agent_priv,
>  			return;
>  		}
> 
> +		printk(KERN_DEBUG PFX "Completing recv %p: busy = %d,
> retries_left = %d, wait_on_busy = %d\n",
> +			mad_send_wr, busy, mad_send_wr->retries_left,
> mad_send_wr->wait_on_busy);
> +		if (busy && mad_send_wr->retries_left && mad_send_wr-
> >wait_on_busy) {
> +			/* Just let the query timeout and have it requeued later
> */
> +			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
> +			ib_free_recv_mad(mad_recv_wc);
> +			deref_mad_agent(mad_agent_priv);
> +			printk(KERN_INFO PFX "SA/SM responded MAD_STATUS_BUSY.
> Allowing request to time out.\n");
> +			return;
> +		}
> +
>  		ib_mark_mad_done(mad_send_wr);
>  		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
> 
> diff --git a/drivers/infiniband/core/mad_priv.h
> b/drivers/infiniband/core/mad_priv.h
> index 01fb7ed..1d0629e 100644
> --- a/drivers/infiniband/core/mad_priv.h
> +++ b/drivers/infiniband/core/mad_priv.h
> @@ -135,6 +135,7 @@ struct ib_mad_send_wr_private {
>  	unsigned long total_timeout;
>  	int max_retries;
>  	int retries_left;
> +	int wait_on_busy;
>  	int randomized_wait;
>  	int retry;
>  	int refcount;
> diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
> index c3d6efb..3da55c3 100644
> --- a/include/rdma/ib_mad.h
> +++ b/include/rdma/ib_mad.h
> @@ -255,6 +255,7 @@ struct ib_mad_send_buf {
>  	int			seg_count;
>  	int			seg_size;
>  	int			timeout_ms;
> +	int			wait_on_busy;

I don't see where this is ever set.

>  	int			randomized_wait;
>  	int			retries;
>  };

--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH] Handling BUSY responses from the SM.
@ 2010-10-11 17:23 Mike Heinz
  0 siblings, 0 replies; 5+ messages in thread
From: Mike Heinz @ 2010-10-11 17:23 UTC (permalink / raw)
  To: linux-rdma-u79uwXL29TY76Z2rM5mHXA, Hefty, Sean; +Cc: Todd Rimmer

This patch builds upon feedback received earlier this year to add a "treat BUSY as timeout" feature to ib_mad. It does NOT implement the ABI/API changes that would be needed in user space to take advantage of the new feature, but it lays the groundwork for doing so. In addition, it provides a new module parameter that allow the administrator to coerce existing code into using the new capability.

The patch builds upon the randomization/backoff patch I sent earlier today to add a random factor to timeouts to prevent synchronized storms of MAD queries. I chose to build upon the existing timeout handling because it seemed the best way to add the functionality without 

Initially, I had tried to completely separate BUSY retries from timeout handling, but that seemed difficult due to the way the timeout code is structured. As a result, true timeouts and busy handling still use the same timeout values, but I was still able to address the idea of randomizing the retry timeout if desired.

By default, the behavior of ib_mad with respect to BUSY responses is unchanged. If, however, a send work request is provided that has the new "busy_wait" parameter set, ib_mad will ignore BUSY responses to that WR, allowing it to timeout and retry as if no response had been received. 

Finally, I've added a module parameter to coerce all mad work requests to use this new feature:

parm:           treat_busy_as_timeout:When true, treat BUSY responses as if they were timeouts. (int)

As I mentioned in the past, this change solves a problem we see in the real world all the time (the SM being pounded by "unintelligent" queries) so I strongly hope this meets your concerns.

----

diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
index 3b03f1c..9e5e566 100644
--- a/drivers/infiniband/core/mad.c
+++ b/drivers/infiniband/core/mad.c
@@ -60,6 +60,10 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
 module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
 
+int mad_wait_on_busy = 0;
+module_param_named(treat_busy_as_timeout, mad_wait_on_busy, int, 0444);
+MODULE_PARM_DESC(treat_busy_as_timeout, "When true, treat BUSY responses as if they were timeouts.");
+
 int mad_randomized_wait = 0;
 module_param_named(randomized_wait, mad_randomized_wait, int, 0444);
 MODULE_PARM_DESC(randomized_wait, "When true, use a randomized backoff algorithm to control retries for timeouts.");
@@ -1120,6 +1124,7 @@ int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
 
 		mad_send_wr->max_retries = send_buf->retries;
 		mad_send_wr->retries_left = send_buf->retries;
+		mad_send_wr->wait_on_busy = send_buf->wait_on_busy || mad_wait_on_busy;
 		
 		send_buf->retries = 0;
 		
@@ -1819,6 +1824,8 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
 
 	/* Complete corresponding request */
 	if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+		u16 busy = __be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.status) &
+						IB_MGMT_MAD_STATUS_BUSY;
 
 		spin_lock_irqsave(&mad_agent_priv->lock, flags);
 		mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
@@ -1829,6 +1836,17 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
 			return;
 		}
 
+		printk(KERN_DEBUG PFX "Completing recv %p: busy = %d, retries_left = %d, wait_on_busy = %d\n",
+			mad_send_wr, busy, mad_send_wr->retries_left, mad_send_wr->wait_on_busy);
+		if (busy && mad_send_wr->retries_left && mad_send_wr->wait_on_busy) {
+			/* Just let the query timeout and have it requeued later */
+			spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+			ib_free_recv_mad(mad_recv_wc);
+			deref_mad_agent(mad_agent_priv);
+			printk(KERN_INFO PFX "SA/SM responded MAD_STATUS_BUSY. Allowing request to time out.\n");
+			return;
+		}
+
 		ib_mark_mad_done(mad_send_wr);
 		spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
 
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
index 01fb7ed..1d0629e 100644
--- a/drivers/infiniband/core/mad_priv.h
+++ b/drivers/infiniband/core/mad_priv.h
@@ -135,6 +135,7 @@ struct ib_mad_send_wr_private {
 	unsigned long total_timeout;
 	int max_retries;
 	int retries_left;
+	int wait_on_busy;
 	int randomized_wait;
 	int retry;
 	int refcount;
diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h
index c3d6efb..3da55c3 100644
--- a/include/rdma/ib_mad.h
+++ b/include/rdma/ib_mad.h
@@ -255,6 +255,7 @@ struct ib_mad_send_buf {
 	int			seg_count;
 	int			seg_size;
 	int			timeout_ms;
+	int			wait_on_busy;
 	int			randomized_wait;
 	int			retries;
 };
--
To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2010-11-04 15:12 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-10-26 18:33 [PATCH] Handling BUSY responses from the SM Mike Heinz
     [not found] ` <4C2744E8AD2982428C5BFE523DF8CDCB49D4675DED-amwN6d8PyQWXx9kJd3VG2h2eb7JE58TQ@public.gmane.org>
2010-10-28 13:14   ` Hal Rosenstock
     [not found]     ` <4CC97718.5080002-LDSdmyG8hGV8YrgS2mwiifqBs+8SCbDb@public.gmane.org>
2010-10-28 13:27       ` Hal Rosenstock
2010-11-04 15:12   ` Hefty, Sean
  -- strict thread matches above, loose matches on Subject: below --
2010-10-11 17:23 Mike Heinz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.