netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/2] mrf24j40: Avoid transmission while receiving a frame
@ 2013-05-09 15:19 David Hauweele
  2013-05-09 15:19 ` [PATCH 2/2] mrf24j40: Keep the interrupt line enabled David Hauweele
  2013-05-14  3:22 ` [Linux-zigbee-devel] [PATCH 1/2] mrf24j40: Avoid transmission while receiving a frame Alan Ott
  0 siblings, 2 replies; 24+ messages in thread
From: David Hauweele @ 2013-05-09 15:19 UTC (permalink / raw)
  To: Alexander Smirnov, Dmitry Eremin-Solenikov,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA, David Hauweele

The transceiver may fail under heavy traffic when a frame is transmitted
while receiving another frame.

This patch uses a mutex to separate the transmission and reception of
frames along with a secondary working queue to avoid a deadlock while
waiting for the transmission interrupt.

Signed-off-by: David Hauweele <david-1EggE+PRa6vk1uMJSBkQmQ@public.gmane.org>
---
 drivers/net/ieee802154/mrf24j40.c |   22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index ede3ce4..1e3ddf3 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -82,8 +82,10 @@ struct mrf24j40 {
 	struct ieee802154_dev *dev;
 
 	struct mutex buffer_mutex; /* only used to protect buf */
+	struct mutex txrx_mutex; /* avoid transmission while receiving a frame */
 	struct completion tx_complete;
 	struct work_struct irqwork;
+	struct work_struct rxwork;
 	u8 *buf; /* 3 bytes. Used for SPI single-register transfers. */
 };
 
@@ -353,6 +355,8 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
 	/* Set TXNACKREQ if the ACK bit is set in the packet. */
 	if (skb->data[0] & IEEE802154_FC_ACK_REQ)
 		val |= 0x4;
+
+	mutex_lock(&devrec->txrx_mutex);
 	write_short_reg(devrec, REG_TXNCON, val);
 
 	INIT_COMPLETION(devrec->tx_complete);
@@ -361,6 +365,9 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
 	ret = wait_for_completion_interruptible_timeout(
 						&devrec->tx_complete,
 						5 * HZ);
+
+	mutex_unlock(&devrec->txrx_mutex);
+
 	if (ret == -ERESTARTSYS)
 		goto err;
 	if (ret == 0) {
@@ -535,6 +542,8 @@ static int mrf24j40_handle_rx(struct mrf24j40 *devrec)
 	int ret = 0;
 	struct sk_buff *skb;
 
+	mutex_lock(&devrec->txrx_mutex);
+
 	/* Turn off reception of packets off the air. This prevents the
 	 * device from overwriting the buffer while we're reading it. */
 	ret = read_short_reg(devrec, REG_BBREG1, &val);
@@ -575,6 +584,8 @@ out:
 	val &= ~0x4; /* Clear RXDECINV */
 	write_short_reg(devrec, REG_BBREG1, val);
 
+	mutex_unlock(&devrec->txrx_mutex);
+
 	return ret;
 }
 
@@ -616,12 +627,19 @@ static void mrf24j40_isrwork(struct work_struct *work)
 
 	/* Check for Rx */
 	if (intstat & 0x8)
-		mrf24j40_handle_rx(devrec);
+		schedule_work(&devrec->rxwork);
 
 out:
 	enable_irq(devrec->spi->irq);
 }
 
+static void mrf24j40_rxwork(struct work_struct *work)
+{
+	struct mrf24j40 *devrec = container_of(work, struct mrf24j40, rxwork);
+
+	mrf24j40_handle_rx(devrec);
+}
+
 static int mrf24j40_probe(struct spi_device *spi)
 {
 	int ret = -ENOMEM;
@@ -648,8 +666,10 @@ static int mrf24j40_probe(struct spi_device *spi)
 		spi->max_speed_hz = MAX_SPI_SPEED_HZ;
 
 	mutex_init(&devrec->buffer_mutex);
+	mutex_init(&devrec->txrx_mutex);
 	init_completion(&devrec->tx_complete);
 	INIT_WORK(&devrec->irqwork, mrf24j40_isrwork);
+	INIT_WORK(&devrec->rxwork, mrf24j40_rxwork);
 	devrec->spi = spi;
 	spi_set_drvdata(spi, devrec);
 
-- 
1.7.10.4


------------------------------------------------------------------------------
Learn Graph Databases - Download FREE O'Reilly Book
"Graph Databases" is the definitive new guide to graph databases and 
their applications. This 200-page book is written by three acclaimed 
leaders in the field. The early access version is available now. 
Download your free book today! http://p.sf.net/sfu/neotech_d2d_may

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 2/2] mrf24j40: Keep the interrupt line enabled
  2013-05-09 15:19 [PATCH 1/2] mrf24j40: Avoid transmission while receiving a frame David Hauweele
@ 2013-05-09 15:19 ` David Hauweele
       [not found]   ` <1368112788-25701-2-git-send-email-david-1EggE+PRa6vk1uMJSBkQmQ@public.gmane.org>
  2013-05-14  3:22 ` [Linux-zigbee-devel] [PATCH 1/2] mrf24j40: Avoid transmission while receiving a frame Alan Ott
  1 sibling, 1 reply; 24+ messages in thread
From: David Hauweele @ 2013-05-09 15:19 UTC (permalink / raw)
  To: Alexander Smirnov, Dmitry Eremin-Solenikov, linux-zigbee-devel
  Cc: netdev, linux-kernel, David Hauweele

Disabling the interrupt line could miss an IRQ and leave the line into a
low state hence locking the driver.

Signed-off-by: David Hauweele <david@hauweele.net>
---
 drivers/net/ieee802154/mrf24j40.c |    7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index 1e3ddf3..6ef32f7 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -603,8 +603,6 @@ static irqreturn_t mrf24j40_isr(int irq, void *data)
 {
 	struct mrf24j40 *devrec = data;
 
-	disable_irq_nosync(irq);
-
 	schedule_work(&devrec->irqwork);
 
 	return IRQ_HANDLED;
@@ -619,7 +617,7 @@ static void mrf24j40_isrwork(struct work_struct *work)
 	/* Read the interrupt status */
 	ret = read_short_reg(devrec, REG_INTSTAT, &intstat);
 	if (ret)
-		goto out;
+		return;
 
 	/* Check for TX complete */
 	if (intstat & 0x1)
@@ -628,9 +626,6 @@ static void mrf24j40_isrwork(struct work_struct *work)
 	/* Check for Rx */
 	if (intstat & 0x8)
 		schedule_work(&devrec->rxwork);
-
-out:
-	enable_irq(devrec->spi->irq);
 }
 
 static void mrf24j40_rxwork(struct work_struct *work)
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [Linux-zigbee-devel] [PATCH 1/2] mrf24j40: Avoid transmission while receiving a frame
  2013-05-09 15:19 [PATCH 1/2] mrf24j40: Avoid transmission while receiving a frame David Hauweele
  2013-05-09 15:19 ` [PATCH 2/2] mrf24j40: Keep the interrupt line enabled David Hauweele
@ 2013-05-14  3:22 ` Alan Ott
       [not found]   ` <5191ADE4.8040709-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
  1 sibling, 1 reply; 24+ messages in thread
From: Alan Ott @ 2013-05-14  3:22 UTC (permalink / raw)
  To: David Hauweele
  Cc: Alexander Smirnov, Dmitry Eremin-Solenikov, linux-zigbee-devel,
	netdev, linux-kernel

On 5/9/13 11:19 AM, David Hauweele wrote:
> The transceiver may fail under heavy traffic when a frame is transmitted
> while receiving another frame.
>
> This patch uses a mutex to separate the transmission and reception of
> frames along with a secondary working queue to avoid a deadlock while
> waiting for the transmission interrupt.
>

Have you observed this failure? I have put this kind of thing in before 
(several times actually) but ultimately convinced myself each time that 
it was not necessary. I'm happy to be shown wrong.


> Signed-off-by: David Hauweele <david@hauweele.net>
> ---
>   drivers/net/ieee802154/mrf24j40.c |   22 +++++++++++++++++++++-
>   1 file changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
> index ede3ce4..1e3ddf3 100644
> --- a/drivers/net/ieee802154/mrf24j40.c
> +++ b/drivers/net/ieee802154/mrf24j40.c
> @@ -82,8 +82,10 @@ struct mrf24j40 {
>   	struct ieee802154_dev *dev;
>
>   	struct mutex buffer_mutex; /* only used to protect buf */
> +	struct mutex txrx_mutex; /* avoid transmission while receiving a frame */
>   	struct completion tx_complete;
>   	struct work_struct irqwork;
> +	struct work_struct rxwork;
>   	u8 *buf; /* 3 bytes. Used for SPI single-register transfers. */
>   };
>
> @@ -353,6 +355,8 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
>   	/* Set TXNACKREQ if the ACK bit is set in the packet. */
>   	if (skb->data[0] & IEEE802154_FC_ACK_REQ)
>   		val |= 0x4;
> +
> +	mutex_lock(&devrec->txrx_mutex);
>   	write_short_reg(devrec, REG_TXNCON, val);
>
>   	INIT_COMPLETION(devrec->tx_complete);
> @@ -361,6 +365,9 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
>   	ret = wait_for_completion_interruptible_timeout(
>   						&devrec->tx_complete,
>   						5 * HZ);
> +
> +	mutex_unlock(&devrec->txrx_mutex);
> +
>   	if (ret == -ERESTARTSYS)
>   		goto err;
>   	if (ret == 0) {
> @@ -535,6 +542,8 @@ static int mrf24j40_handle_rx(struct mrf24j40 *devrec)
>   	int ret = 0;
>   	struct sk_buff *skb;
>
> +	mutex_lock(&devrec->txrx_mutex);
> +
>   	/* Turn off reception of packets off the air. This prevents the
>   	 * device from overwriting the buffer while we're reading it. */
>   	ret = read_short_reg(devrec, REG_BBREG1, &val);
> @@ -575,6 +584,8 @@ out:
>   	val &= ~0x4; /* Clear RXDECINV */
>   	write_short_reg(devrec, REG_BBREG1, val);
>
> +	mutex_unlock(&devrec->txrx_mutex);
> +
>   	return ret;
>   }
>
> @@ -616,12 +627,19 @@ static void mrf24j40_isrwork(struct work_struct *work)
>
>   	/* Check for Rx */
>   	if (intstat & 0x8)
> -		mrf24j40_handle_rx(devrec);
> +		schedule_work(&devrec->rxwork);
>

mrf24f40_isrwork() is already called from a workqueue.

>   out:
>   	enable_irq(devrec->spi->irq);
>   }
>
> +static void mrf24j40_rxwork(struct work_struct *work)
> +{
> +	struct mrf24j40 *devrec = container_of(work, struct mrf24j40, rxwork);
> +
> +	mrf24j40_handle_rx(devrec);
> +}
> +
>   static int mrf24j40_probe(struct spi_device *spi)
>   {
>   	int ret = -ENOMEM;
> @@ -648,8 +666,10 @@ static int mrf24j40_probe(struct spi_device *spi)
>   		spi->max_speed_hz = MAX_SPI_SPEED_HZ;
>
>   	mutex_init(&devrec->buffer_mutex);
> +	mutex_init(&devrec->txrx_mutex);
>   	init_completion(&devrec->tx_complete);
>   	INIT_WORK(&devrec->irqwork, mrf24j40_isrwork);
> +	INIT_WORK(&devrec->rxwork, mrf24j40_rxwork);
>   	devrec->spi = spi;
>   	spi_set_drvdata(spi, devrec);
>
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] mrf24j40: Keep the interrupt line enabled
       [not found]   ` <1368112788-25701-2-git-send-email-david-1EggE+PRa6vk1uMJSBkQmQ@public.gmane.org>
@ 2013-05-14  3:55     ` Alan Ott
  2013-05-16 21:34       ` [Linux-zigbee-devel] " David Hauweele
  0 siblings, 1 reply; 24+ messages in thread
From: Alan Ott @ 2013-05-14  3:55 UTC (permalink / raw)
  To: David Hauweele
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

On 5/9/13 11:19 AM, David Hauweele wrote:
> Disabling the interrupt line could miss an IRQ and leave the line into a
> low state hence locking the driver.
>

Have you observed this? My understanding is that the interrupt won't be 
lost but instead delayed until enable_irq() is called.

I got this pattern from the other 802.15.4 drivers. Perhaps my 
understanding is wrong.


> Signed-off-by: David Hauweele <david-1EggE+PRa6vk1uMJSBkQmQ@public.gmane.org>
> ---
>   drivers/net/ieee802154/mrf24j40.c |    7 +------
>   1 file changed, 1 insertion(+), 6 deletions(-)
>
> diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
> index 1e3ddf3..6ef32f7 100644
> --- a/drivers/net/ieee802154/mrf24j40.c
> +++ b/drivers/net/ieee802154/mrf24j40.c
> @@ -603,8 +603,6 @@ static irqreturn_t mrf24j40_isr(int irq, void *data)
>   {
>   	struct mrf24j40 *devrec = data;
>
> -	disable_irq_nosync(irq);
> -
>   	schedule_work(&devrec->irqwork);
>
>   	return IRQ_HANDLED;
> @@ -619,7 +617,7 @@ static void mrf24j40_isrwork(struct work_struct *work)
>   	/* Read the interrupt status */
>   	ret = read_short_reg(devrec, REG_INTSTAT, &intstat);
>   	if (ret)
> -		goto out;
> +		return;
>
>   	/* Check for TX complete */
>   	if (intstat & 0x1)
> @@ -628,9 +626,6 @@ static void mrf24j40_isrwork(struct work_struct *work)
>   	/* Check for Rx */
>   	if (intstat & 0x8)
>   		schedule_work(&devrec->rxwork);
> -
> -out:
> -	enable_irq(devrec->spi->irq);
>   }
>
>   static void mrf24j40_rxwork(struct work_struct *work)
>


------------------------------------------------------------------------------
AlienVault Unified Security Management (USM) platform delivers complete
security visibility with the essential security capabilities. Easily and
efficiently configure, manage, and operate all of your security controls
from a single console and one unified framework. Download a free trial.
http://p.sf.net/sfu/alienvault_d2d

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/2] mrf24j40: Avoid transmission while receiving a frame
       [not found]   ` <5191ADE4.8040709-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
@ 2013-05-16 17:45     ` David Hauweele
       [not found]       ` <CAO+c-UVJQJbmmmCS4hxAEYHKa8gEU1Bs=y+Rv_A75UHeBZxc+w-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: David Hauweele @ 2013-05-16 17:45 UTC (permalink / raw)
  To: Alan Ott
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f


[-- Attachment #1.1: Type: text/plain, Size: 5227 bytes --]

I've observed this failure with two 'ping -f' between the transceivers.
After some time one of the transceivers will stop transmitting. I've
checked with a sniffer and it doesn't send frames on the air anymore. It
doesn't reply to ACK request neither.

I'm able to reproduce the bug but it may take several minutes to do so.
With this patch, I've been able to flood both transceivers for more than 2
hours. However it would be nice to look deeper into the problem as it
introduces more losses on the link in case of heavy traffic.

The secondary working queue is needed to avoid a lock with the mutexes.
Suppose you have a frame on the air, waiting for its TX interrupt, the
txrx_mutex is locked to avoid reception. At the same time an RX interrupt
is triggered. As everything is done within the same workqueue, the
scheduled work will wait for the end of the transmission. Hence preventing
the expected TX interrupt to be processed. The transmission is locked until
the completion times out.

I must admit that this solution is a bit twisted though and I'd be glad if
anyone comes with a better solution.

David

2013/5/14 Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>

> On 5/9/13 11:19 AM, David Hauweele wrote:
>
>> The transceiver may fail under heavy traffic when a frame is transmitted
>> while receiving another frame.
>>
>> This patch uses a mutex to separate the transmission and reception of
>> frames along with a secondary working queue to avoid a deadlock while
>> waiting for the transmission interrupt.
>>
>>
> Have you observed this failure? I have put this kind of thing in before
> (several times actually) but ultimately convinced myself each time that it
> was not necessary. I'm happy to be shown wrong.
>
>
>
>  Signed-off-by: David Hauweele <david-1EggE+PRa6vk1uMJSBkQmQ@public.gmane.org>
>> ---
>>   drivers/net/ieee802154/**mrf24j40.c |   22 +++++++++++++++++++++-
>>   1 file changed, 21 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/net/ieee802154/**mrf24j40.c
>> b/drivers/net/ieee802154/**mrf24j40.c
>> index ede3ce4..1e3ddf3 100644
>> --- a/drivers/net/ieee802154/**mrf24j40.c
>> +++ b/drivers/net/ieee802154/**mrf24j40.c
>> @@ -82,8 +82,10 @@ struct mrf24j40 {
>>         struct ieee802154_dev *dev;
>>
>>         struct mutex buffer_mutex; /* only used to protect buf */
>> +       struct mutex txrx_mutex; /* avoid transmission while receiving a
>> frame */
>>         struct completion tx_complete;
>>         struct work_struct irqwork;
>> +       struct work_struct rxwork;
>>         u8 *buf; /* 3 bytes. Used for SPI single-register transfers. */
>>   };
>>
>> @@ -353,6 +355,8 @@ static int mrf24j40_tx(struct ieee802154_dev *dev,
>> struct sk_buff *skb)
>>         /* Set TXNACKREQ if the ACK bit is set in the packet. */
>>         if (skb->data[0] & IEEE802154_FC_ACK_REQ)
>>                 val |= 0x4;
>> +
>> +       mutex_lock(&devrec->txrx_**mutex);
>>         write_short_reg(devrec, REG_TXNCON, val);
>>
>>         INIT_COMPLETION(devrec->tx_**complete);
>> @@ -361,6 +365,9 @@ static int mrf24j40_tx(struct ieee802154_dev *dev,
>> struct sk_buff *skb)
>>         ret = wait_for_completion_**interruptible_timeout(
>>                                                 &devrec->tx_complete,
>>                                                 5 * HZ);
>> +
>> +       mutex_unlock(&devrec->txrx_**mutex);
>> +
>>         if (ret == -ERESTARTSYS)
>>                 goto err;
>>         if (ret == 0) {
>> @@ -535,6 +542,8 @@ static int mrf24j40_handle_rx(struct mrf24j40 *devrec)
>>         int ret = 0;
>>         struct sk_buff *skb;
>>
>> +       mutex_lock(&devrec->txrx_**mutex);
>> +
>>         /* Turn off reception of packets off the air. This prevents the
>>          * device from overwriting the buffer while we're reading it. */
>>         ret = read_short_reg(devrec, REG_BBREG1, &val);
>> @@ -575,6 +584,8 @@ out:
>>         val &= ~0x4; /* Clear RXDECINV */
>>         write_short_reg(devrec, REG_BBREG1, val);
>>
>> +       mutex_unlock(&devrec->txrx_**mutex);
>> +
>>         return ret;
>>   }
>>
>> @@ -616,12 +627,19 @@ static void mrf24j40_isrwork(struct work_struct
>> *work)
>>
>>         /* Check for Rx */
>>         if (intstat & 0x8)
>> -               mrf24j40_handle_rx(devrec);
>> +               schedule_work(&devrec->rxwork)**;
>>
>>
> mrf24f40_isrwork() is already called from a workqueue.
>
>
>    out:
>>         enable_irq(devrec->spi->irq);
>>   }
>>
>> +static void mrf24j40_rxwork(struct work_struct *work)
>> +{
>> +       struct mrf24j40 *devrec = container_of(work, struct mrf24j40,
>> rxwork);
>> +
>> +       mrf24j40_handle_rx(devrec);
>> +}
>> +
>>   static int mrf24j40_probe(struct spi_device *spi)
>>   {
>>         int ret = -ENOMEM;
>> @@ -648,8 +666,10 @@ static int mrf24j40_probe(struct spi_device *spi)
>>                 spi->max_speed_hz = MAX_SPI_SPEED_HZ;
>>
>>         mutex_init(&devrec->buffer_**mutex);
>> +       mutex_init(&devrec->txrx_**mutex);
>>         init_completion(&devrec->tx_**complete);
>>         INIT_WORK(&devrec->irqwork, mrf24j40_isrwork);
>> +       INIT_WORK(&devrec->rxwork, mrf24j40_rxwork);
>>         devrec->spi = spi;
>>         spi_set_drvdata(spi, devrec);
>>
>>
>>
>

[-- Attachment #1.2: Type: text/html, Size: 6843 bytes --]

[-- Attachment #2: Type: text/plain, Size: 403 bytes --]

------------------------------------------------------------------------------
AlienVault Unified Security Management (USM) platform delivers complete
security visibility with the essential security capabilities. Easily and
efficiently configure, manage, and operate all of your security controls
from a single console and one unified framework. Download a free trial.
http://p.sf.net/sfu/alienvault_d2d

[-- Attachment #3: Type: text/plain, Size: 213 bytes --]

_______________________________________________
Linux-zigbee-devel mailing list
Linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f@public.gmane.org
https://lists.sourceforge.net/lists/listinfo/linux-zigbee-devel

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Linux-zigbee-devel] [PATCH 2/2] mrf24j40: Keep the interrupt line enabled
  2013-05-14  3:55     ` Alan Ott
@ 2013-05-16 21:34       ` David Hauweele
  2013-05-19 23:04         ` Alan Ott
  0 siblings, 1 reply; 24+ messages in thread
From: David Hauweele @ 2013-05-16 21:34 UTC (permalink / raw)
  To: Alan Ott
  Cc: Alexander Smirnov, Dmitry Eremin-Solenikov, linux-zigbee-devel,
	netdev, linux-kernel

I have seen the interrupt line going low forever with heavy traffic.

The at86rf230 driver has two isr, one for edge-triggered and another
for level-triggered interrupt. The latter uses
disable_irq_nosync/enable_irq which makes sense for level-triggered
interrupt. Otherwise the interrupt would be triggered again and again
until the scheduled work read the status register.

However I don't see the use of disable_irq/enable_irq with an
edge-triggered interrupt. Perhaps I'm missing something. Though I
don't see any harm using it anyway. As you said the interrupt should
be delayed until enable_irq() is called. In particular when
CONFIG_HARDIRQS_SW_RESEND is set.

David

2013/5/14 Alan Ott <alan@signal11.us>:
> On 5/9/13 11:19 AM, David Hauweele wrote:
>>
>> Disabling the interrupt line could miss an IRQ and leave the line into a
>> low state hence locking the driver.
>>
>
> Have you observed this? My understanding is that the interrupt won't be lost
> but instead delayed until enable_irq() is called.
>
> I got this pattern from the other 802.15.4 drivers. Perhaps my understanding
> is wrong.
>
>
>
>> Signed-off-by: David Hauweele <david@hauweele.net>
>> ---
>>   drivers/net/ieee802154/mrf24j40.c |    7 +------
>>   1 file changed, 1 insertion(+), 6 deletions(-)
>>
>> diff --git a/drivers/net/ieee802154/mrf24j40.c
>> b/drivers/net/ieee802154/mrf24j40.c
>> index 1e3ddf3..6ef32f7 100644
>> --- a/drivers/net/ieee802154/mrf24j40.c
>> +++ b/drivers/net/ieee802154/mrf24j40.c
>> @@ -603,8 +603,6 @@ static irqreturn_t mrf24j40_isr(int irq, void *data)
>>   {
>>         struct mrf24j40 *devrec = data;
>>
>> -       disable_irq_nosync(irq);
>> -
>>         schedule_work(&devrec->irqwork);
>>
>>         return IRQ_HANDLED;
>> @@ -619,7 +617,7 @@ static void mrf24j40_isrwork(struct work_struct *work)
>>         /* Read the interrupt status */
>>         ret = read_short_reg(devrec, REG_INTSTAT, &intstat);
>>         if (ret)
>> -               goto out;
>> +               return;
>>
>>         /* Check for TX complete */
>>         if (intstat & 0x1)
>> @@ -628,9 +626,6 @@ static void mrf24j40_isrwork(struct work_struct *work)
>>         /* Check for Rx */
>>         if (intstat & 0x8)
>>                 schedule_work(&devrec->rxwork);
>> -
>> -out:
>> -       enable_irq(devrec->spi->irq);
>>   }
>>
>>   static void mrf24j40_rxwork(struct work_struct *work)
>>
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Linux-zigbee-devel] [PATCH 2/2] mrf24j40: Keep the interrupt line enabled
  2013-05-16 21:34       ` [Linux-zigbee-devel] " David Hauweele
@ 2013-05-19 23:04         ` Alan Ott
  2013-05-21 16:17           ` David Hauweele
  0 siblings, 1 reply; 24+ messages in thread
From: Alan Ott @ 2013-05-19 23:04 UTC (permalink / raw)
  To: David Hauweele
  Cc: Alexander Smirnov, Dmitry Eremin-Solenikov, linux-zigbee-devel,
	netdev, linux-kernel

On 05/16/2013 05:34 PM, David Hauweele wrote:
> I have seen the interrupt line going low forever with heavy traffic.

Hi David,

I've been doing some testing today and I can reproduce your issue if I
ping -f both ways simultaneously (after about 3-4 minutes usually). I
think it has more to do with the tx/rx mutual exclusion that you
described in patch 1/1 than it does with the disable/enable IRQ. With
respect to enable/disable irq, I did some checking of other similar
drivers (non-ieee802154) and noticed that many of them use threaded
interrupts. I think this may be closer to the right way to do it.

Are you running a tickless kernel? What is your preemption model? What's
your hardware platform?

> The at86rf230 driver has two isr, one for edge-triggered and another
> for level-triggered interrupt. The latter uses
> disable_irq_nosync/enable_irq which makes sense for level-triggered
> interrupt. Otherwise the interrupt would be triggered again and again
> until the scheduled work read the status register.
>
> However I don't see the use of disable_irq/enable_irq with an
> edge-triggered interrupt. Perhaps I'm missing something. Though I
> don't see any harm using it anyway.

My understanding based on the datasheet is that the interrupt can be
asserted again as soon as INTSTAT is read (which is done in the
workqueue). I guess even if it happens while the workqueue is running,
it's ok.

I think I had a flawed understanding of schedule_work() before, thinking
that it would not schedule a work_struct it if the work_struct was running.

>  As you said the interrupt should
> be delayed until enable_irq() is called. In particular when
> CONFIG_HARDIRQS_SW_RESEND is set.

I think I agree with you. I'll send you a patch to try with threaded
interrupts to try.

I'm trying to determine exactly what's the cause of the interrupt line
being stuck low.

Alan.

>
> 2013/5/14 Alan Ott <alan@signal11.us>:
>> On 5/9/13 11:19 AM, David Hauweele wrote:
>>> Disabling the interrupt line could miss an IRQ and leave the line into a
>>> low state hence locking the driver.
>>>
>> Have you observed this? My understanding is that the interrupt won't be lost
>> but instead delayed until enable_irq() is called.
>>
>> I got this pattern from the other 802.15.4 drivers. Perhaps my understanding
>> is wrong.
>>
>>
>>
>>> Signed-off-by: David Hauweele <david@hauweele.net>
>>> ---
>>>   drivers/net/ieee802154/mrf24j40.c |    7 +------
>>>   1 file changed, 1 insertion(+), 6 deletions(-)
>>>
>>> diff --git a/drivers/net/ieee802154/mrf24j40.c
>>> b/drivers/net/ieee802154/mrf24j40.c
>>> index 1e3ddf3..6ef32f7 100644
>>> --- a/drivers/net/ieee802154/mrf24j40.c
>>> +++ b/drivers/net/ieee802154/mrf24j40.c
>>> @@ -603,8 +603,6 @@ static irqreturn_t mrf24j40_isr(int irq, void *data)
>>>   {
>>>         struct mrf24j40 *devrec = data;
>>>
>>> -       disable_irq_nosync(irq);
>>> -
>>>         schedule_work(&devrec->irqwork);
>>>
>>>         return IRQ_HANDLED;
>>> @@ -619,7 +617,7 @@ static void mrf24j40_isrwork(struct work_struct *work)
>>>         /* Read the interrupt status */
>>>         ret = read_short_reg(devrec, REG_INTSTAT, &intstat);
>>>         if (ret)
>>> -               goto out;
>>> +               return;
>>>
>>>         /* Check for TX complete */
>>>         if (intstat & 0x1)
>>> @@ -628,9 +626,6 @@ static void mrf24j40_isrwork(struct work_struct *work)
>>>         /* Check for Rx */
>>>         if (intstat & 0x8)
>>>                 schedule_work(&devrec->rxwork);
>>> -
>>> -out:
>>> -       enable_irq(devrec->spi->irq);
>>>   }
>>>
>>>   static void mrf24j40_rxwork(struct work_struct *work)
>>>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH testing] mrf24j40: Move INIT_COMPLETION to before the packet is sent
       [not found]       ` <CAO+c-UVJQJbmmmCS4hxAEYHKa8gEU1Bs=y+Rv_A75UHeBZxc+w-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2013-05-20  0:05         ` Alan Ott
  2013-05-22  2:01         ` [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
  1 sibling, 0 replies; 24+ messages in thread
From: Alan Ott @ 2013-05-20  0:05 UTC (permalink / raw)
  To: david-1EggE+PRa6vk1uMJSBkQmQ
  Cc: Alan Ott, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

This avoids a race condition where the tx_complete interrupt could happen
before the completion is initialized.

---

David H,

Try this out and see if it changes anything for you. I put a bunch of
schedule_timeout()s in mrf24j40_tx() to basically guarantee that a reception
will happen during mrf24j40_tx(), and it seems to work. It would not work
before with said schedule_timeout() calls.

Alan.

---
 drivers/net/ieee802154/mrf24j40.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index 556151d..0ea2a5a 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -345,6 +345,8 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
 	if (ret)
 		goto err;
 
+	INIT_COMPLETION(devrec->tx_complete);
+
 	/* Set TXNTRIG bit of TXNCON to send packet */
 	ret = read_short_reg(devrec, REG_TXNCON, &val);
 	if (ret)
@@ -355,8 +357,6 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
 		val |= 0x4;
 	write_short_reg(devrec, REG_TXNCON, val);
 
-	INIT_COMPLETION(devrec->tx_complete);
-
 	/* Wait for the device to send the TX complete interrupt. */
 	ret = wait_for_completion_interruptible_timeout(
 						&devrec->tx_complete,
-- 
1.7.11.2


------------------------------------------------------------------------------
AlienVault Unified Security Management (USM) platform delivers complete
security visibility with the essential security capabilities. Easily and
efficiently configure, manage, and operate all of your security controls
from a single console and one unified framework. Download a free trial.
http://p.sf.net/sfu/alienvault_d2d

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [Linux-zigbee-devel] [PATCH 2/2] mrf24j40: Keep the interrupt line enabled
  2013-05-19 23:04         ` Alan Ott
@ 2013-05-21 16:17           ` David Hauweele
       [not found]             ` <CAO+c-UXymPjYysvh2kM36gcOsL3P51YWq+aYhEqX8oCySwBcaQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: David Hauweele @ 2013-05-21 16:17 UTC (permalink / raw)
  To: Alan Ott
  Cc: Alexander Smirnov, Dmitry Eremin-Solenikov, linux-zigbee-devel,
	netdev, linux-kernel

2013/5/20 Alan Ott <alan@signal11.us>:
> On 05/16/2013 05:34 PM, David Hauweele wrote:
>> I have seen the interrupt line going low forever with heavy traffic.
>
> Hi David,
>
> I've been doing some testing today and I can reproduce your issue if I
> ping -f both ways simultaneously (after about 3-4 minutes usually). I
> think it has more to do with the tx/rx mutual exclusion that you
> described in patch 1/1 than it does with the disable/enable IRQ. With
> respect to enable/disable irq, I did some checking of other similar
> drivers (non-ieee802154) and noticed that many of them use threaded
> interrupts. I think this may be closer to the right way to do it.

Hi Alan,

Indeed, threaded interrupts would be more appropriate.
What about spi_async ? We could avoid using any working queue with this call.
Although the driver would need a major rewrite and I've no clue about
the benefits of doing so.
I wonder how much additional latency is introduced by the working
queue, I will try to measure this today.

>
> Are you running a tickless kernel? What is your preemption model? What's
> your hardware platform?

This is a tickless kernel with CONFIG_PREEMPT on an RPi.

>
>> The at86rf230 driver has two isr, one for edge-triggered and another
>> for level-triggered interrupt. The latter uses
>> disable_irq_nosync/enable_irq which makes sense for level-triggered
>> interrupt. Otherwise the interrupt would be triggered again and again
>> until the scheduled work read the status register.
>>
>> However I don't see the use of disable_irq/enable_irq with an
>> edge-triggered interrupt. Perhaps I'm missing something. Though I
>> don't see any harm using it anyway.
>
> My understanding based on the datasheet is that the interrupt can be
> asserted again as soon as INTSTAT is read (which is done in the
> workqueue). I guess even if it happens while the workqueue is running,
> it's ok.

You're right, as soon as INTSTAT is read, the line should be driven
high again to assert another interrupt.
So it shouldn't pose any problem if the interrupts are replayed by the
kernel. What may happen however is
an error in the spi transfer. This would result in the line staying
low since the register hasn't really been read.
This is what I observed at the hardware level.

If this is the case, then the problem should also occurs with and
without disable/enable_irq. However this bug
is a bit harder to reproduce as it may take several hours. The only
solution I see would be to regularly poll the
INTSTAT register for recovering. I know about other drivers which do
this so I will look into this and work on a fix.

>
> I think I had a flawed understanding of schedule_work() before, thinking
> that it would not schedule a work_struct it if the work_struct was running.
>
>>  As you said the interrupt should
>> be delayed until enable_irq() is called. In particular when
>> CONFIG_HARDIRQS_SW_RESEND is set.
>
> I think I agree with you. I'll send you a patch to try with threaded
> interrupts to try.
>
> I'm trying to determine exactly what's the cause of the interrupt line
> being stuck low.
>

I tried your patch for INIT_COMPLETION. It doesn't fix the problem but
I agree with you for the race condition.

David

>>
>> 2013/5/14 Alan Ott <alan@signal11.us>:
>>> On 5/9/13 11:19 AM, David Hauweele wrote:
>>>> Disabling the interrupt line could miss an IRQ and leave the line into a
>>>> low state hence locking the driver.
>>>>
>>> Have you observed this? My understanding is that the interrupt won't be lost
>>> but instead delayed until enable_irq() is called.
>>>
>>> I got this pattern from the other 802.15.4 drivers. Perhaps my understanding
>>> is wrong.
>>>
>>>
>>>
>>>> Signed-off-by: David Hauweele <david@hauweele.net>
>>>> ---
>>>>   drivers/net/ieee802154/mrf24j40.c |    7 +------
>>>>   1 file changed, 1 insertion(+), 6 deletions(-)
>>>>
>>>> diff --git a/drivers/net/ieee802154/mrf24j40.c
>>>> b/drivers/net/ieee802154/mrf24j40.c
>>>> index 1e3ddf3..6ef32f7 100644
>>>> --- a/drivers/net/ieee802154/mrf24j40.c
>>>> +++ b/drivers/net/ieee802154/mrf24j40.c
>>>> @@ -603,8 +603,6 @@ static irqreturn_t mrf24j40_isr(int irq, void *data)
>>>>   {
>>>>         struct mrf24j40 *devrec = data;
>>>>
>>>> -       disable_irq_nosync(irq);
>>>> -
>>>>         schedule_work(&devrec->irqwork);
>>>>
>>>>         return IRQ_HANDLED;
>>>> @@ -619,7 +617,7 @@ static void mrf24j40_isrwork(struct work_struct *work)
>>>>         /* Read the interrupt status */
>>>>         ret = read_short_reg(devrec, REG_INTSTAT, &intstat);
>>>>         if (ret)
>>>> -               goto out;
>>>> +               return;
>>>>
>>>>         /* Check for TX complete */
>>>>         if (intstat & 0x1)
>>>> @@ -628,9 +626,6 @@ static void mrf24j40_isrwork(struct work_struct *work)
>>>>         /* Check for Rx */
>>>>         if (intstat & 0x8)
>>>>                 schedule_work(&devrec->rxwork);
>>>> -
>>>> -out:
>>>> -       enable_irq(devrec->spi->irq);
>>>>   }
>>>>
>>>>   static void mrf24j40_rxwork(struct work_struct *work)
>>>>
>
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/2] mrf24j40: Keep the interrupt line enabled
       [not found]             ` <CAO+c-UXymPjYysvh2kM36gcOsL3P51YWq+aYhEqX8oCySwBcaQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2013-05-21 18:22               ` Alan Ott
  0 siblings, 0 replies; 24+ messages in thread
From: Alan Ott @ 2013-05-21 18:22 UTC (permalink / raw)
  To: David Hauweele
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

On 05/21/2013 12:17 PM, David Hauweele wrote:
> 2013/5/20 Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>:
>> On 05/16/2013 05:34 PM, David Hauweele wrote:
>>> I have seen the interrupt line going low forever with heavy traffic.
>> I've been doing some testing today and I can reproduce your issue if I
>> ping -f both ways simultaneously (after about 3-4 minutes usually). I
>> think it has more to do with the tx/rx mutual exclusion that you
>> described in patch 1/1 than it does with the disable/enable IRQ. With
>> respect to enable/disable irq, I did some checking of other similar
>> drivers (non-ieee802154) and noticed that many of them use threaded
>> interrupts. I think this may be closer to the right way to do it.
> Indeed, threaded interrupts would be more appropriate.
> What about spi_async ? We could avoid using any working queue with this call.
> Although the driver would need a major rewrite and I've no clue about
> the benefits of doing so.
> I wonder how much additional latency is introduced by the working
> queue, I will try to measure this today.

Async would be really difficult since we need the values that are read
in order to know what to write in later calls (like where we need to set
one bit in a register). It's hard to imagine doing this without blocking
somewhere, and we might as well use the blocking mechanisms that the
kernel gives us.

>> Are you running a tickless kernel? What is your preemption model? What's
>> your hardware platform?
> This is a tickless kernel with CONFIG_PREEMPT on an RPi.

As another data point, can you try with voluntary preemption and see if
the behavior changes?

>>> The at86rf230 driver has two isr, one for edge-triggered and another
>>> for level-triggered interrupt. The latter uses
>>> disable_irq_nosync/enable_irq which makes sense for level-triggered
>>> interrupt. Otherwise the interrupt would be triggered again and again
>>> until the scheduled work read the status register.
>>>
>>> However I don't see the use of disable_irq/enable_irq with an
>>> edge-triggered interrupt. Perhaps I'm missing something. Though I
>>> don't see any harm using it anyway.
>> My understanding based on the datasheet is that the interrupt can be
>> asserted again as soon as INTSTAT is read (which is done in the
>> workqueue). I guess even if it happens while the workqueue is running,
>> it's ok.
> You're right, as soon as INTSTAT is read, the line should be driven
> high again to assert another interrupt.

This is where I think I've found another race condition. In doing some
testing with the mrf24j40 from PIC24 firmware last night, I think I've
discovered that it's possible for the interrupt line to _not_ get reset
(de-asserted). I think what may be causing it is if an interrupt occurs
at the same time as (or right shortly after) INTSTAT is read. It's easy
to imagine the interrupt line not being able to get all the way up
before coming back down. I have a support email into Microchip asking
specifically about this case.

In my firmware, I'm able to see a case where the INTSTAT register shows
non-zero (ie: an interrupt is pending), yet the interrupt flag is not
set. It seems like it takes about 30 minutes for this to happen. On
Linux, I think this is less likely, but still possible, since there is
more latency between the hardware interrupt and when INTSTAT is read.
The extra latency means there is more time for the second interrupt to
happen, causing INTSTAT to simply show both interrupts, and have less
likelihood of the second interrupt happening exactly as INTSTAT is read.

> So it shouldn't pose any problem if the interrupts are replayed by the
> kernel. What may happen however is
> an error in the spi transfer. This would result in the line staying
> low since the register hasn't really been read.
> This is what I observed at the hardware level.

To be clear, you've observed the line staying low, not an SPI transfer
error, right?

> If this is the case, then the problem should also occurs with and
> without disable/enable_irq. However this bug
> is a bit harder to reproduce as it may take several hours. The only
> solution I see would be to regularly poll the
> INTSTAT register for recovering. I know about other drivers which do
> this so I will look into this and work on a fix.

I'd like to avoid a straight-up poll if at all possible. Worst case, we
can poll for interrupt in the case of something happening which we don't
expect to ever happen, for example on something like the TX interrupt
never showing up. It's harder of course to know when to poll for a
system that's primarily a receiver.
 
I've done something like this in my firmware:

void mrf24j40_isr(void)
{
        u8 intstat;
        int ret;

        /* Read the interrupt status */
        ret = read_short_reg(devrec, REG_INTSTAT, &intstat);
        if (ret)
                return;

        do {
                /* Check for TX complete */
                if (intstat & 0x1)
                        tx_complete = 1;

                /* Check for Rx */
                if (intstat & 0x8)
                        mrf24j40_handle_rx();

                /* Read the interrupt status */
                ret = read_short_reg(devrec, REG_INTSTAT, &intstat);
                if (ret)
                        return;

                if (intstat && !IFS1bits.INT1IF) {
                        /* Put a break here. This should never happen
but does */
                        Nop();
                        Nop();
                        Nop();
                }
        } while (intstat);
}

Basically there's an extra reading of the INTSTAT every time there's an
interrupt. One would think this would take care of the race condition,
unless there's another....

The breakpoint in the comment is shows the error I describe. The second
reading of INTSTAT shows an interrupt pending (in the case I observed,
0x8: RX), but INT1F (the interrupt flag for INT1, where I have the INT
line of the mrf24j40 connected) does not get set. In the Linux driver
(and in an ISR which doesn't do the extra check of INTSTAT, this causes
the INT line to be stuck low (asserted) with no ISR ever getting called.

We could try triggering on level (IRQF_TRIGGER_LOW) instead of edge. The
mrf24j40 datasheet specifically says it gives an edge trigger, but the
level sticks low until it INTSTAT is read, so we should be able to
detect it as a level trigger.

I'll do some testing on both of these cases.

Alan.

>> I think I had a flawed understanding of schedule_work() before, thinking
>> that it would not schedule a work_struct it if the work_struct was running.
>>
>>>  As you said the interrupt should
>>> be delayed until enable_irq() is called. In particular when
>>> CONFIG_HARDIRQS_SW_RESEND is set.
>> I think I agree with you. I'll send you a patch to try with threaded
>> interrupts to try.
>>
>> I'm trying to determine exactly what's the cause of the interrupt line
>> being stuck low.
>>
> I tried your patch for INIT_COMPLETION. It doesn't fix the problem but
> I agree with you for the race condition.
>
> David
>
>>> 2013/5/14 Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>:
>>>> On 5/9/13 11:19 AM, David Hauweele wrote:
>>>>> Disabling the interrupt line could miss an IRQ and leave the line into a
>>>>> low state hence locking the driver.
>>>>>
>>>> Have you observed this? My understanding is that the interrupt won't be lost
>>>> but instead delayed until enable_irq() is called.
>>>>
>>>> I got this pattern from the other 802.15.4 drivers. Perhaps my understanding
>>>> is wrong.
>>>>
>>>>
>>>>
>>>>> Signed-off-by: David Hauweele <david-1EggE+PRa6vk1uMJSBkQmQ@public.gmane.org>
>>>>> ---
>>>>>   drivers/net/ieee802154/mrf24j40.c |    7 +------
>>>>>   1 file changed, 1 insertion(+), 6 deletions(-)
>>>>>
>>>>> diff --git a/drivers/net/ieee802154/mrf24j40.c
>>>>> b/drivers/net/ieee802154/mrf24j40.c
>>>>> index 1e3ddf3..6ef32f7 100644
>>>>> --- a/drivers/net/ieee802154/mrf24j40.c
>>>>> +++ b/drivers/net/ieee802154/mrf24j40.c
>>>>> @@ -603,8 +603,6 @@ static irqreturn_t mrf24j40_isr(int irq, void *data)
>>>>>   {
>>>>>         struct mrf24j40 *devrec = data;
>>>>>
>>>>> -       disable_irq_nosync(irq);
>>>>> -
>>>>>         schedule_work(&devrec->irqwork);
>>>>>
>>>>>         return IRQ_HANDLED;
>>>>> @@ -619,7 +617,7 @@ static void mrf24j40_isrwork(struct work_struct *work)
>>>>>         /* Read the interrupt status */
>>>>>         ret = read_short_reg(devrec, REG_INTSTAT, &intstat);
>>>>>         if (ret)
>>>>> -               goto out;
>>>>> +               return;
>>>>>
>>>>>         /* Check for TX complete */
>>>>>         if (intstat & 0x1)
>>>>> @@ -628,9 +626,6 @@ static void mrf24j40_isrwork(struct work_struct *work)
>>>>>         /* Check for Rx */
>>>>>         if (intstat & 0x8)
>>>>>                 schedule_work(&devrec->rxwork);
>>>>> -
>>>>> -out:
>>>>> -       enable_irq(devrec->spi->irq);
>>>>>   }
>>>>>
>>>>>   static void mrf24j40_rxwork(struct work_struct *work)
>>>>>
>>


------------------------------------------------------------------------------
Try New Relic Now & We'll Send You this Cool Shirt
New Relic is the only SaaS-based application performance monitoring service 
that delivers powerful full stack analytics. Optimize and monitor your
browser, app, & servers with just a few lines of code. Try New Relic
and get this awesome Nerd Life shirt! http://p.sf.net/sfu/newrelic_d2d_may

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts
       [not found]       ` <CAO+c-UVJQJbmmmCS4hxAEYHKa8gEU1Bs=y+Rv_A75UHeBZxc+w-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  2013-05-20  0:05         ` [PATCH testing] mrf24j40: Move INIT_COMPLETION to before the packet is sent Alan Ott
@ 2013-05-22  2:01         ` Alan Ott
       [not found]           ` <1369188080-8904-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
                             ` (2 more replies)
  1 sibling, 3 replies; 24+ messages in thread
From: Alan Ott @ 2013-05-22  2:01 UTC (permalink / raw)
  To: david-1EggE+PRa6vk1uMJSBkQmQ
  Cc: Alan Ott, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

David Hauweele noticed that the mrf24j40 would hang arbitrarily after some
period of heavy traffic.  Two race conditions were discovered, and the
driver was changed to use threaded interrupts, since the enable/disable of
interrupts in the driver has recently been a lighning rod whenever issues
arise related to interrupts (costing engineering time), and since threaded
interrupts are the right way to do it.

Alan Ott (3):
  mrf24j40: Move INIT_COMPLETION() to before packet transmission
  mrf24j40: Use threaded IRQ handler
  mrf24j40: Use level-triggered interrupts

 drivers/net/ieee802154/mrf24j40.c | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

-- 
1.7.11.2


------------------------------------------------------------------------------
Try New Relic Now & We'll Send You this Cool Shirt
New Relic is the only SaaS-based application performance monitoring service 
that delivers powerful full stack analytics. Optimize and monitor your
browser, app, & servers with just a few lines of code. Try New Relic
and get this awesome Nerd Life shirt! http://p.sf.net/sfu/newrelic_d2d_may

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH beta 1] 1/3] mrf24j40: Move INIT_COMPLETION() to before packet transmission
       [not found]           ` <1369188080-8904-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
@ 2013-05-22  2:01             ` Alan Ott
  2013-05-22  2:01             ` [PATCH beta 1] 2/3] mrf24j40: Use threaded IRQ handler Alan Ott
  2013-10-06  3:52             ` [PATCH v1 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
  2 siblings, 0 replies; 24+ messages in thread
From: Alan Ott @ 2013-05-22  2:01 UTC (permalink / raw)
  To: david-1EggE+PRa6vk1uMJSBkQmQ
  Cc: Alan Ott, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

This avoids a race condition where complete(tx_complete) could be called
before tx_complete is initialized.

Signed-off-by: Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
---
 drivers/net/ieee802154/mrf24j40.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index 556151d..0ea2a5a 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -345,6 +345,8 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
 	if (ret)
 		goto err;
 
+	INIT_COMPLETION(devrec->tx_complete);
+
 	/* Set TXNTRIG bit of TXNCON to send packet */
 	ret = read_short_reg(devrec, REG_TXNCON, &val);
 	if (ret)
@@ -355,8 +357,6 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
 		val |= 0x4;
 	write_short_reg(devrec, REG_TXNCON, val);
 
-	INIT_COMPLETION(devrec->tx_complete);
-
 	/* Wait for the device to send the TX complete interrupt. */
 	ret = wait_for_completion_interruptible_timeout(
 						&devrec->tx_complete,
-- 
1.7.11.2


------------------------------------------------------------------------------
Try New Relic Now & We'll Send You this Cool Shirt
New Relic is the only SaaS-based application performance monitoring service 
that delivers powerful full stack analytics. Optimize and monitor your
browser, app, & servers with just a few lines of code. Try New Relic
and get this awesome Nerd Life shirt! http://p.sf.net/sfu/newrelic_d2d_may

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH beta 1] 2/3] mrf24j40: Use threaded IRQ handler
       [not found]           ` <1369188080-8904-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
  2013-05-22  2:01             ` [PATCH beta 1] 1/3] mrf24j40: Move INIT_COMPLETION() to before packet transmission Alan Ott
@ 2013-05-22  2:01             ` Alan Ott
  2013-10-06  3:52             ` [PATCH v1 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
  2 siblings, 0 replies; 24+ messages in thread
From: Alan Ott @ 2013-05-22  2:01 UTC (permalink / raw)
  To: david-1EggE+PRa6vk1uMJSBkQmQ
  Cc: Alan Ott, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Eliminate all the workqueue and interrupt enable/disable.

Signed-off-by: Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
---
 drivers/net/ieee802154/mrf24j40.c | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index 0ea2a5a..a55ab8d 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -83,7 +83,6 @@ struct mrf24j40 {
 
 	struct mutex buffer_mutex; /* only used to protect buf */
 	struct completion tx_complete;
-	struct work_struct irqwork;
 	u8 *buf; /* 3 bytes. Used for SPI single-register transfers. */
 };
 
@@ -591,17 +590,6 @@ static struct ieee802154_ops mrf24j40_ops = {
 static irqreturn_t mrf24j40_isr(int irq, void *data)
 {
 	struct mrf24j40 *devrec = data;
-
-	disable_irq_nosync(irq);
-
-	schedule_work(&devrec->irqwork);
-
-	return IRQ_HANDLED;
-}
-
-static void mrf24j40_isrwork(struct work_struct *work)
-{
-	struct mrf24j40 *devrec = container_of(work, struct mrf24j40, irqwork);
 	u8 intstat;
 	int ret;
 
@@ -619,7 +607,7 @@ static void mrf24j40_isrwork(struct work_struct *work)
 		mrf24j40_handle_rx(devrec);
 
 out:
-	enable_irq(devrec->spi->irq);
+	return IRQ_HANDLED;
 }
 
 static int mrf24j40_probe(struct spi_device *spi)
@@ -649,7 +637,6 @@ static int mrf24j40_probe(struct spi_device *spi)
 
 	mutex_init(&devrec->buffer_mutex);
 	init_completion(&devrec->tx_complete);
-	INIT_WORK(&devrec->irqwork, mrf24j40_isrwork);
 	devrec->spi = spi;
 	dev_set_drvdata(&spi->dev, devrec);
 
@@ -695,11 +682,12 @@ static int mrf24j40_probe(struct spi_device *spi)
 	val &= ~0x3; /* Clear RX mode (normal) */
 	write_short_reg(devrec, REG_RXMCR, val);
 
-	ret = request_irq(spi->irq,
-			  mrf24j40_isr,
-			  IRQF_TRIGGER_FALLING,
-			  dev_name(&spi->dev),
-			  devrec);
+	ret = request_threaded_irq(spi->irq,
+				   NULL,
+				   mrf24j40_isr,
+				   IRQF_TRIGGER_FALLING|IRQF_ONESHOT,
+				   dev_name(&spi->dev),
+				   devrec);
 
 	if (ret) {
 		dev_err(printdev(devrec), "Unable to get IRQ");
@@ -728,7 +716,6 @@ static int mrf24j40_remove(struct spi_device *spi)
 	dev_dbg(printdev(devrec), "remove\n");
 
 	free_irq(spi->irq, devrec);
-	flush_work(&devrec->irqwork); /* TODO: Is this the right call? */
 	ieee802154_unregister_device(devrec->dev);
 	ieee802154_free_device(devrec->dev);
 	/* TODO: Will ieee802154_free_device() wait until ->xmit() is
-- 
1.7.11.2


------------------------------------------------------------------------------
Try New Relic Now & We'll Send You this Cool Shirt
New Relic is the only SaaS-based application performance monitoring service 
that delivers powerful full stack analytics. Optimize and monitor your
browser, app, & servers with just a few lines of code. Try New Relic
and get this awesome Nerd Life shirt! http://p.sf.net/sfu/newrelic_d2d_may

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH beta 1] 3/3] mrf24j40: Use level-triggered interrupts
  2013-05-22  2:01         ` [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
       [not found]           ` <1369188080-8904-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
@ 2013-05-22  2:01           ` Alan Ott
  2013-05-22  2:03           ` [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
  2 siblings, 0 replies; 24+ messages in thread
From: Alan Ott @ 2013-05-22  2:01 UTC (permalink / raw)
  To: david; +Cc: netdev, linux-kernel, linux-zigbee-devel, Alan Ott

The mrf24j40 generates level interrupts. There are rare cases where it
appears that the interrupt line never gets de-asserted between interrupts,
causing interrupts to be lost, and causing a hung device from the driver's
perspective.  Switching the driver to interpret these interrupts as
level-triggered fixes this issue.

Signed-off-by: Alan Ott <alan@signal11.us>
---
 drivers/net/ieee802154/mrf24j40.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index a55ab8d..d59dbff 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -685,7 +685,7 @@ static int mrf24j40_probe(struct spi_device *spi)
 	ret = request_threaded_irq(spi->irq,
 				   NULL,
 				   mrf24j40_isr,
-				   IRQF_TRIGGER_FALLING|IRQF_ONESHOT,
+				   IRQF_TRIGGER_LOW|IRQF_ONESHOT,
 				   dev_name(&spi->dev),
 				   devrec);
 
-- 
1.7.11.2

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts
  2013-05-22  2:01         ` [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
       [not found]           ` <1369188080-8904-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
  2013-05-22  2:01           ` [PATCH beta 1] 3/3] mrf24j40: Use level-triggered interrupts Alan Ott
@ 2013-05-22  2:03           ` Alan Ott
  2013-05-22 20:32             ` David Hauweele
  2 siblings, 1 reply; 24+ messages in thread
From: Alan Ott @ 2013-05-22  2:03 UTC (permalink / raw)
  To: Alan Ott; +Cc: david, netdev, linux-kernel, linux-zigbee-devel

On 05/21/2013 10:01 PM, Alan Ott wrote:
> David Hauweele noticed that the mrf24j40 would hang arbitrarily after some
> period of heavy traffic.  Two race conditions were discovered, and the
> driver was changed to use threaded interrupts, since the enable/disable of
> interrupts in the driver has recently been a lighning rod whenever issues
> arise related to interrupts (costing engineering time), and since threaded
> interrupts are the right way to do it.
>
> Alan Ott (3):
>   mrf24j40: Move INIT_COMPLETION() to before packet transmission
>   mrf24j40: Use threaded IRQ handler
>   mrf24j40: Use level-triggered interrupts
>
>  drivers/net/ieee802154/mrf24j40.c | 31 +++++++++----------------------
>  1 file changed, 9 insertions(+), 22 deletions(-)

I forgot to add, I ran ping -f both ways all afternoon (6.5 hours), and
it seems solid.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts
  2013-05-22  2:03           ` [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
@ 2013-05-22 20:32             ` David Hauweele
       [not found]               ` <CAO+c-UV1bZZEE=9=7VVn3f1eLWzCpYCK9eZnnvzNwcPOKKtKLA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: David Hauweele @ 2013-05-22 20:32 UTC (permalink / raw)
  To: Alan Ott; +Cc: netdev, linux-kernel, linux-zigbee-devel

Hello,

I cannot use level-triggered interrupts with GPIO on the RPi, so I
cannot test this specific patch.
However I agree with the idea of level-triggered interrupts, that
would fix all major problems related to missed interrupts.

Beside this I'm running a ping -f since more than two hours now and it
seems to work well.

David

2013/5/22 Alan Ott <alan@signal11.us>:
> On 05/21/2013 10:01 PM, Alan Ott wrote:
>> David Hauweele noticed that the mrf24j40 would hang arbitrarily after some
>> period of heavy traffic.  Two race conditions were discovered, and the
>> driver was changed to use threaded interrupts, since the enable/disable of
>> interrupts in the driver has recently been a lighning rod whenever issues
>> arise related to interrupts (costing engineering time), and since threaded
>> interrupts are the right way to do it.
>>
>> Alan Ott (3):
>>   mrf24j40: Move INIT_COMPLETION() to before packet transmission
>>   mrf24j40: Use threaded IRQ handler
>>   mrf24j40: Use level-triggered interrupts
>>
>>  drivers/net/ieee802154/mrf24j40.c | 31 +++++++++----------------------
>>  1 file changed, 9 insertions(+), 22 deletions(-)
>
> I forgot to add, I ran ping -f both ways all afternoon (6.5 hours), and
> it seems solid.
>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts
       [not found]               ` <CAO+c-UV1bZZEE=9=7VVn3f1eLWzCpYCK9eZnnvzNwcPOKKtKLA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
@ 2013-05-23  6:36                 ` Alan Ott
       [not found]                   ` <519DB8E6.4020709-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
  0 siblings, 1 reply; 24+ messages in thread
From: Alan Ott @ 2013-05-23  6:36 UTC (permalink / raw)
  To: David Hauweele
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

On 5/22/13 4:32 PM, David Hauweele wrote:
> I cannot use level-triggered interrupts with GPIO on the RPi, so I
> cannot test this specific patch.

Is there another interrupt line you can tie into which does support 
level-trigger interrupts (INT0 or something)?

> However I agree with the idea of level-triggered interrupts, that
> would fix all major problems related to missed interrupts.
>
> Beside this I'm running a ping -f since more than two hours now and it
> seems to work well.
>

So that surprises me. I thought level-trigger interrupts were the thing 
that would fix this problem, and if you're not running with that patch, 
you just have the INIT_COMPLETION() fix (which you said didn't fix your 
issue) and the threaded interrupts patch, which I was fairly sure I had 
determined wasn't fixing any actual race-condition-related problems.

I'm glad, but surprised that you're no longer seeing issues.

Alan.

>
> 2013/5/22 Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>:
>> On 05/21/2013 10:01 PM, Alan Ott wrote:
>>> David Hauweele noticed that the mrf24j40 would hang arbitrarily after some
>>> period of heavy traffic.  Two race conditions were discovered, and the
>>> driver was changed to use threaded interrupts, since the enable/disable of
>>> interrupts in the driver has recently been a lighning rod whenever issues
>>> arise related to interrupts (costing engineering time), and since threaded
>>> interrupts are the right way to do it.
>>>
>>> Alan Ott (3):
>>>    mrf24j40: Move INIT_COMPLETION() to before packet transmission
>>>    mrf24j40: Use threaded IRQ handler
>>>    mrf24j40: Use level-triggered interrupts
>>>
>>>   drivers/net/ieee802154/mrf24j40.c | 31 +++++++++----------------------
>>>   1 file changed, 9 insertions(+), 22 deletions(-)
>>
>> I forgot to add, I ran ping -f both ways all afternoon (6.5 hours), and
>> it seems solid.
>>


------------------------------------------------------------------------------
Try New Relic Now & We'll Send You this Cool Shirt
New Relic is the only SaaS-based application performance monitoring service 
that delivers powerful full stack analytics. Optimize and monitor your
browser, app, & servers with just a few lines of code. Try New Relic
and get this awesome Nerd Life shirt! http://p.sf.net/sfu/newrelic_d2d_may

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts
       [not found]                   ` <519DB8E6.4020709-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
@ 2013-05-23 17:54                     ` David Hauweele
  2013-05-23 19:33                       ` Alan Ott
  0 siblings, 1 reply; 24+ messages in thread
From: David Hauweele @ 2013-05-23 17:54 UTC (permalink / raw)
  To: Alan Ott
  Cc: netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

2013/5/23 Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>:
> On 5/22/13 4:32 PM, David Hauweele wrote:
>>
>> I cannot use level-triggered interrupts with GPIO on the RPi, so I
>> cannot test this specific patch.
>
>
> Is there another interrupt line you can tie into which does support
> level-trigger interrupts (INT0 or something)?

According to the datasheet it should be possible but the bcm2708 port
does not support it. I've been told that we shouldn't use level-triggered
interrupts in the first place.

>
>
>> However I agree with the idea of level-triggered interrupts, that
>> would fix all major problems related to missed interrupts.
>>
>> Beside this I'm running a ping -f since more than two hours now and it
>> seems to work well.
>>
>
> So that surprises me. I thought level-trigger interrupts were the thing that
> would fix this problem, and if you're not running with that patch, you just
> have the INIT_COMPLETION() fix (which you said didn't fix your issue) and
> the threaded interrupts patch, which I was fairly sure I had determined
> wasn't fixing any actual race-condition-related problems.

I should have been more clear about this. I've tested [PATCH 1/3]
which fixes the race condition with tx_complete. That is the
INIT_COMPLETION() fix. But it is still possible to miss an interrupt,
perhaps it just took longer this time. I ran the test again today and
it failed after 30 minutes.

I did not test [PATCH 2/3], that is the threaded IRQ. Instead I
removed interrupt enable/disable from the IRQ handler and the
workqueue. Without this the driver would fail within seconds of a ping
-f. Have you observed this too ? Perhaps this problem is specific to
the bcm2708 port.

David

>
> I'm glad, but surprised that you're no longer seeing issues.
>
> Alan.
>
>
>>
>> 2013/5/22 Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>:
>>>
>>> On 05/21/2013 10:01 PM, Alan Ott wrote:
>>>>
>>>> David Hauweele noticed that the mrf24j40 would hang arbitrarily after
>>>> some
>>>> period of heavy traffic.  Two race conditions were discovered, and the
>>>> driver was changed to use threaded interrupts, since the enable/disable
>>>> of
>>>> interrupts in the driver has recently been a lighning rod whenever
>>>> issues
>>>> arise related to interrupts (costing engineering time), and since
>>>> threaded
>>>> interrupts are the right way to do it.
>>>>
>>>> Alan Ott (3):
>>>>    mrf24j40: Move INIT_COMPLETION() to before packet transmission
>>>>    mrf24j40: Use threaded IRQ handler
>>>>    mrf24j40: Use level-triggered interrupts
>>>>
>>>>   drivers/net/ieee802154/mrf24j40.c | 31 +++++++++----------------------
>>>>   1 file changed, 9 insertions(+), 22 deletions(-)
>>>
>>>
>>> I forgot to add, I ran ping -f both ways all afternoon (6.5 hours), and
>>> it seems solid.
>>>
>

------------------------------------------------------------------------------
Try New Relic Now & We'll Send You this Cool Shirt
New Relic is the only SaaS-based application performance monitoring service 
that delivers powerful full stack analytics. Optimize and monitor your
browser, app, & servers with just a few lines of code. Try New Relic
and get this awesome Nerd Life shirt! http://p.sf.net/sfu/newrelic_d2d_may

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts
  2013-05-23 17:54                     ` David Hauweele
@ 2013-05-23 19:33                       ` Alan Ott
  0 siblings, 0 replies; 24+ messages in thread
From: Alan Ott @ 2013-05-23 19:33 UTC (permalink / raw)
  To: David Hauweele; +Cc: netdev, linux-kernel, linux-zigbee-devel

On 05/23/2013 01:54 PM, David Hauweele wrote:
> 2013/5/23 Alan Ott <alan@signal11.us>:
>> On 5/22/13 4:32 PM, David Hauweele wrote:
>>>
>>> I cannot use level-triggered interrupts with GPIO on the RPi, so I
>>> cannot test this specific patch.
>>
>>
>> Is there another interrupt line you can tie into which does support
>> level-trigger interrupts (INT0 or something)?
> 
> According to the datasheet it should be possible but the bcm2708 port
> does not support it. I've been told that we shouldn't use level-triggered
> interrupts in the first place.


Who is "we," and why shouldn't we use level-triggered interrupts? The
CPU needs to detect whatever type of interrupt the attached hardware
generates.


> 
>>
>>
>>> However I agree with the idea of level-triggered interrupts, that
>>> would fix all major problems related to missed interrupts.
>>>
>>> Beside this I'm running a ping -f since more than two hours now and it
>>> seems to work well.
>>>
>>
>> So that surprises me. I thought level-trigger interrupts were the thing that
>> would fix this problem, and if you're not running with that patch, you just
>> have the INIT_COMPLETION() fix (which you said didn't fix your issue) and
>> the threaded interrupts patch, which I was fairly sure I had determined
>> wasn't fixing any actual race-condition-related problems.
> 
> I should have been more clear about this. I've tested [PATCH 1/3]
> which fixes the race condition with tx_complete. That is the
> INIT_COMPLETION() fix. But it is still possible to miss an interrupt,
> perhaps it just took longer this time. I ran the test again today and
> it failed after 30 minutes.
> 
> I did not test [PATCH 2/3], that is the threaded IRQ. Instead I
> removed interrupt enable/disable from the IRQ handler and the
> workqueue. Without this the driver would fail within seconds of a ping
> -f.


Without what? What do you mean by "without this?" Without the
enable/disable, or without the change that removes the enable/disable?


> Have you observed this too ? Perhaps this problem is specific to
> the bcm2708 port.
> 

What I observe right now is that it seems to work great (ping -f for 6.5
hours) when using the three patches in this patch set on a BeagleBone.

> 
>>
>> I'm glad, but surprised that you're no longer seeing issues.
>>
>> Alan.
>>
>>
>>>
>>> 2013/5/22 Alan Ott <alan@signal11.us>:
>>>>
>>>> On 05/21/2013 10:01 PM, Alan Ott wrote:
>>>>>
>>>>> David Hauweele noticed that the mrf24j40 would hang arbitrarily after
>>>>> some
>>>>> period of heavy traffic.  Two race conditions were discovered, and the
>>>>> driver was changed to use threaded interrupts, since the enable/disable
>>>>> of
>>>>> interrupts in the driver has recently been a lighning rod whenever
>>>>> issues
>>>>> arise related to interrupts (costing engineering time), and since
>>>>> threaded
>>>>> interrupts are the right way to do it.
>>>>>
>>>>> Alan Ott (3):
>>>>>    mrf24j40: Move INIT_COMPLETION() to before packet transmission
>>>>>    mrf24j40: Use threaded IRQ handler
>>>>>    mrf24j40: Use level-triggered interrupts
>>>>>
>>>>>   drivers/net/ieee802154/mrf24j40.c | 31 +++++++++----------------------
>>>>>   1 file changed, 9 insertions(+), 22 deletions(-)
>>>>
>>>>
>>>> I forgot to add, I ran ping -f both ways all afternoon (6.5 hours), and
>>>> it seems solid.
>>>>
>>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v1 0/3] Fix race conditions in mrf24j40 interrupts
       [not found]           ` <1369188080-8904-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
  2013-05-22  2:01             ` [PATCH beta 1] 1/3] mrf24j40: Move INIT_COMPLETION() to before packet transmission Alan Ott
  2013-05-22  2:01             ` [PATCH beta 1] 2/3] mrf24j40: Use threaded IRQ handler Alan Ott
@ 2013-10-06  3:52             ` Alan Ott
       [not found]               ` <1381031544-2960-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
  2013-10-08 19:32               ` [PATCH v1 0/3] Fix race conditions in mrf24j40 interrupts David Miller
  2 siblings, 2 replies; 24+ messages in thread
From: Alan Ott @ 2013-10-06  3:52 UTC (permalink / raw)
  To: Alexander Smirnov, Dmitry Eremin-Solenikov, David S. Miller,
	david-1EggE+PRa6vk1uMJSBkQmQ
  Cc: Alan Ott, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

After testing with the betas of this patchset, it's been rebased and is
ready for inclusion.

David Hauweele noticed that the mrf24j40 would hang arbitrarily after some
period of heavy traffic.  Two race conditions were discovered, and the
driver was changed to use threaded interrupts, since the enable/disable of
interrupts in the driver has recently been a lighning rod whenever issues
arise related to interrupts (costing engineering time), and since threaded
interrupts are the right way to do it.

Alan Ott (3):
  mrf24j40: Move INIT_COMPLETION() to before packet transmission
  mrf24j40: Use threaded IRQ handler
  mrf24j40: Use level-triggered interrupts

 drivers/net/ieee802154/mrf24j40.c | 31 +++++++++----------------------
 1 file changed, 9 insertions(+), 22 deletions(-)

-- 
1.8.1.2


------------------------------------------------------------------------------
October Webinars: Code for Performance
Free Intel webinars can help you accelerate application performance.
Explore tips for MPI, OpenMP, advanced profiling, and more. Get the most from 
the latest Intel processors and coprocessors. See abstracts and register >
http://pubads.g.doubleclick.net/gampad/clk?id=60134791&iu=/4140/ostg.clktrk

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH v1 1/3] mrf24j40: Move INIT_COMPLETION() to before packet transmission
       [not found]               ` <1381031544-2960-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
@ 2013-10-06  3:52                 ` Alan Ott
  2013-10-06  3:52                 ` [PATCH v1 2/3] mrf24j40: Use threaded IRQ handler Alan Ott
  2013-10-06  3:52                 ` [PATCH v1 3/3] mrf24j40: Use level-triggered interrupts Alan Ott
  2 siblings, 0 replies; 24+ messages in thread
From: Alan Ott @ 2013-10-06  3:52 UTC (permalink / raw)
  To: Alexander Smirnov, Dmitry Eremin-Solenikov, David S. Miller,
	david-1EggE+PRa6vk1uMJSBkQmQ
  Cc: Alan Ott, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

This avoids a race condition where complete(tx_complete) could be called
before tx_complete is initialized.

Signed-off-by: Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
---
 drivers/net/ieee802154/mrf24j40.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index 42e6dee..66bb4ce 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -344,6 +344,8 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
 	if (ret)
 		goto err;
 
+	INIT_COMPLETION(devrec->tx_complete);
+
 	/* Set TXNTRIG bit of TXNCON to send packet */
 	ret = read_short_reg(devrec, REG_TXNCON, &val);
 	if (ret)
@@ -354,8 +356,6 @@ static int mrf24j40_tx(struct ieee802154_dev *dev, struct sk_buff *skb)
 		val |= 0x4;
 	write_short_reg(devrec, REG_TXNCON, val);
 
-	INIT_COMPLETION(devrec->tx_complete);
-
 	/* Wait for the device to send the TX complete interrupt. */
 	ret = wait_for_completion_interruptible_timeout(
 						&devrec->tx_complete,
-- 
1.8.1.2


------------------------------------------------------------------------------
October Webinars: Code for Performance
Free Intel webinars can help you accelerate application performance.
Explore tips for MPI, OpenMP, advanced profiling, and more. Get the most from 
the latest Intel processors and coprocessors. See abstracts and register >
http://pubads.g.doubleclick.net/gampad/clk?id=60134791&iu=/4140/ostg.clktrk

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH v1 2/3] mrf24j40: Use threaded IRQ handler
       [not found]               ` <1381031544-2960-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
  2013-10-06  3:52                 ` [PATCH v1 1/3] mrf24j40: Move INIT_COMPLETION() to before packet transmission Alan Ott
@ 2013-10-06  3:52                 ` Alan Ott
  2013-10-06  3:52                 ` [PATCH v1 3/3] mrf24j40: Use level-triggered interrupts Alan Ott
  2 siblings, 0 replies; 24+ messages in thread
From: Alan Ott @ 2013-10-06  3:52 UTC (permalink / raw)
  To: Alexander Smirnov, Dmitry Eremin-Solenikov, David S. Miller,
	david-1EggE+PRa6vk1uMJSBkQmQ
  Cc: Alan Ott, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

Eliminate all the workqueue and interrupt enable/disable.

Signed-off-by: Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
---
 drivers/net/ieee802154/mrf24j40.c | 27 +++++++--------------------
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index 66bb4ce..c1bc688 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -82,7 +82,6 @@ struct mrf24j40 {
 
 	struct mutex buffer_mutex; /* only used to protect buf */
 	struct completion tx_complete;
-	struct work_struct irqwork;
 	u8 *buf; /* 3 bytes. Used for SPI single-register transfers. */
 };
 
@@ -590,17 +589,6 @@ static struct ieee802154_ops mrf24j40_ops = {
 static irqreturn_t mrf24j40_isr(int irq, void *data)
 {
 	struct mrf24j40 *devrec = data;
-
-	disable_irq_nosync(irq);
-
-	schedule_work(&devrec->irqwork);
-
-	return IRQ_HANDLED;
-}
-
-static void mrf24j40_isrwork(struct work_struct *work)
-{
-	struct mrf24j40 *devrec = container_of(work, struct mrf24j40, irqwork);
 	u8 intstat;
 	int ret;
 
@@ -618,7 +606,7 @@ static void mrf24j40_isrwork(struct work_struct *work)
 		mrf24j40_handle_rx(devrec);
 
 out:
-	enable_irq(devrec->spi->irq);
+	return IRQ_HANDLED;
 }
 
 static int mrf24j40_probe(struct spi_device *spi)
@@ -642,7 +630,6 @@ static int mrf24j40_probe(struct spi_device *spi)
 
 	mutex_init(&devrec->buffer_mutex);
 	init_completion(&devrec->tx_complete);
-	INIT_WORK(&devrec->irqwork, mrf24j40_isrwork);
 	devrec->spi = spi;
 	spi_set_drvdata(spi, devrec);
 
@@ -688,11 +675,12 @@ static int mrf24j40_probe(struct spi_device *spi)
 	val &= ~0x3; /* Clear RX mode (normal) */
 	write_short_reg(devrec, REG_RXMCR, val);
 
-	ret = request_irq(spi->irq,
-			  mrf24j40_isr,
-			  IRQF_TRIGGER_FALLING,
-			  dev_name(&spi->dev),
-			  devrec);
+	ret = request_threaded_irq(spi->irq,
+				   NULL,
+				   mrf24j40_isr,
+				   IRQF_TRIGGER_FALLING|IRQF_ONESHOT,
+				   dev_name(&spi->dev),
+				   devrec);
 
 	if (ret) {
 		dev_err(printdev(devrec), "Unable to get IRQ");
@@ -721,7 +709,6 @@ static int mrf24j40_remove(struct spi_device *spi)
 	dev_dbg(printdev(devrec), "remove\n");
 
 	free_irq(spi->irq, devrec);
-	flush_work(&devrec->irqwork); /* TODO: Is this the right call? */
 	ieee802154_unregister_device(devrec->dev);
 	ieee802154_free_device(devrec->dev);
 	/* TODO: Will ieee802154_free_device() wait until ->xmit() is
-- 
1.8.1.2


------------------------------------------------------------------------------
October Webinars: Code for Performance
Free Intel webinars can help you accelerate application performance.
Explore tips for MPI, OpenMP, advanced profiling, and more. Get the most from 
the latest Intel processors and coprocessors. See abstracts and register >
http://pubads.g.doubleclick.net/gampad/clk?id=60134791&iu=/4140/ostg.clktrk

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH v1 3/3] mrf24j40: Use level-triggered interrupts
       [not found]               ` <1381031544-2960-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
  2013-10-06  3:52                 ` [PATCH v1 1/3] mrf24j40: Move INIT_COMPLETION() to before packet transmission Alan Ott
  2013-10-06  3:52                 ` [PATCH v1 2/3] mrf24j40: Use threaded IRQ handler Alan Ott
@ 2013-10-06  3:52                 ` Alan Ott
  2 siblings, 0 replies; 24+ messages in thread
From: Alan Ott @ 2013-10-06  3:52 UTC (permalink / raw)
  To: Alexander Smirnov, Dmitry Eremin-Solenikov, David S. Miller,
	david-1EggE+PRa6vk1uMJSBkQmQ
  Cc: Alan Ott, netdev-u79uwXL29TY76Z2rM5mHXA,
	linux-kernel-u79uwXL29TY76Z2rM5mHXA,
	linux-zigbee-devel-5NWGOfrQmneRv+LV9MX5uipxlwaOVQ5f

The mrf24j40 generates level interrupts. There are rare cases where it
appears that the interrupt line never gets de-asserted between interrupts,
causing interrupts to be lost, and causing a hung device from the driver's
perspective.  Switching the driver to interpret these interrupts as
level-triggered fixes this issue.

Signed-off-by: Alan Ott <alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
---
 drivers/net/ieee802154/mrf24j40.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ieee802154/mrf24j40.c b/drivers/net/ieee802154/mrf24j40.c
index c1bc688..0632d34 100644
--- a/drivers/net/ieee802154/mrf24j40.c
+++ b/drivers/net/ieee802154/mrf24j40.c
@@ -678,7 +678,7 @@ static int mrf24j40_probe(struct spi_device *spi)
 	ret = request_threaded_irq(spi->irq,
 				   NULL,
 				   mrf24j40_isr,
-				   IRQF_TRIGGER_FALLING|IRQF_ONESHOT,
+				   IRQF_TRIGGER_LOW|IRQF_ONESHOT,
 				   dev_name(&spi->dev),
 				   devrec);
 
-- 
1.8.1.2


------------------------------------------------------------------------------
October Webinars: Code for Performance
Free Intel webinars can help you accelerate application performance.
Explore tips for MPI, OpenMP, advanced profiling, and more. Get the most from 
the latest Intel processors and coprocessors. See abstracts and register >
http://pubads.g.doubleclick.net/gampad/clk?id=60134791&iu=/4140/ostg.clktrk

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH v1 0/3] Fix race conditions in mrf24j40 interrupts
  2013-10-06  3:52             ` [PATCH v1 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
       [not found]               ` <1381031544-2960-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
@ 2013-10-08 19:32               ` David Miller
  1 sibling, 0 replies; 24+ messages in thread
From: David Miller @ 2013-10-08 19:32 UTC (permalink / raw)
  To: alan
  Cc: alex.bluesman.smirnov, dbaryshkov, david, linux-zigbee-devel,
	netdev, linux-kernel

From: Alan Ott <alan@signal11.us>
Date: Sat,  5 Oct 2013 23:52:21 -0400

> After testing with the betas of this patchset, it's been rebased and is
> ready for inclusion.
> 
> David Hauweele noticed that the mrf24j40 would hang arbitrarily after some
> period of heavy traffic.  Two race conditions were discovered, and the
> driver was changed to use threaded interrupts, since the enable/disable of
> interrupts in the driver has recently been a lighning rod whenever issues
> arise related to interrupts (costing engineering time), and since threaded
> interrupts are the right way to do it.

Series applied, thanks.

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2013-10-08 19:32 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-05-09 15:19 [PATCH 1/2] mrf24j40: Avoid transmission while receiving a frame David Hauweele
2013-05-09 15:19 ` [PATCH 2/2] mrf24j40: Keep the interrupt line enabled David Hauweele
     [not found]   ` <1368112788-25701-2-git-send-email-david-1EggE+PRa6vk1uMJSBkQmQ@public.gmane.org>
2013-05-14  3:55     ` Alan Ott
2013-05-16 21:34       ` [Linux-zigbee-devel] " David Hauweele
2013-05-19 23:04         ` Alan Ott
2013-05-21 16:17           ` David Hauweele
     [not found]             ` <CAO+c-UXymPjYysvh2kM36gcOsL3P51YWq+aYhEqX8oCySwBcaQ-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2013-05-21 18:22               ` Alan Ott
2013-05-14  3:22 ` [Linux-zigbee-devel] [PATCH 1/2] mrf24j40: Avoid transmission while receiving a frame Alan Ott
     [not found]   ` <5191ADE4.8040709-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
2013-05-16 17:45     ` David Hauweele
     [not found]       ` <CAO+c-UVJQJbmmmCS4hxAEYHKa8gEU1Bs=y+Rv_A75UHeBZxc+w-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2013-05-20  0:05         ` [PATCH testing] mrf24j40: Move INIT_COMPLETION to before the packet is sent Alan Ott
2013-05-22  2:01         ` [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
     [not found]           ` <1369188080-8904-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
2013-05-22  2:01             ` [PATCH beta 1] 1/3] mrf24j40: Move INIT_COMPLETION() to before packet transmission Alan Ott
2013-05-22  2:01             ` [PATCH beta 1] 2/3] mrf24j40: Use threaded IRQ handler Alan Ott
2013-10-06  3:52             ` [PATCH v1 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
     [not found]               ` <1381031544-2960-1-git-send-email-alan-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
2013-10-06  3:52                 ` [PATCH v1 1/3] mrf24j40: Move INIT_COMPLETION() to before packet transmission Alan Ott
2013-10-06  3:52                 ` [PATCH v1 2/3] mrf24j40: Use threaded IRQ handler Alan Ott
2013-10-06  3:52                 ` [PATCH v1 3/3] mrf24j40: Use level-triggered interrupts Alan Ott
2013-10-08 19:32               ` [PATCH v1 0/3] Fix race conditions in mrf24j40 interrupts David Miller
2013-05-22  2:01           ` [PATCH beta 1] 3/3] mrf24j40: Use level-triggered interrupts Alan Ott
2013-05-22  2:03           ` [PATCH beta 1] 0/3] Fix race conditions in mrf24j40 interrupts Alan Ott
2013-05-22 20:32             ` David Hauweele
     [not found]               ` <CAO+c-UV1bZZEE=9=7VVn3f1eLWzCpYCK9eZnnvzNwcPOKKtKLA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2013-05-23  6:36                 ` Alan Ott
     [not found]                   ` <519DB8E6.4020709-yzvJWuRpmD1zbRFIqnYvSA@public.gmane.org>
2013-05-23 17:54                     ` David Hauweele
2013-05-23 19:33                       ` Alan Ott

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).