All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] b43: Rewrite DMA Tx status handling sanity checks
@ 2009-11-19 21:24 Michael Buesch
  2009-11-22 17:52 ` Michael Buesch
                   ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: Michael Buesch @ 2009-11-19 21:24 UTC (permalink / raw)
  To: John W. Linville; +Cc: bcm43xx-dev, linux-wireless, Larry Finger

This rewrites the error handling policies in the TX status handler.
It tries to be error-tolerant as in "try hard to not crash the machine".
It won't recover from errors (that are bugs in the firmware or driver),
because that's impossible. However, it will return a more or less useful
error message and bail out. It also tries hard to use rate-limited messages
to not flood the syslog in case of a failure.

Signed-off-by: Michael Buesch <mb@bu3sch.de>

---

This patch also adds the skb pointer poisoning. I think it's not
strictly needed to catch firmware bugs anymore, because those should
be caught by the in-order check. However, we love defensive coding, so
we try to make the code as robust as possible.

I did not try with openfirmware, but I guess it blows up in the
in-order check there pretty quickly.
Would be cool if somebody could stress this on openfirmware.

Note that if the ordering sanity check fails that can mean three things:
- Either the report ordering on one engine is wrong.
- We missed one report on the engine.
- Or we reported the status to the wrong engine.


Index: wireless-testing/drivers/net/wireless/b43/dma.c
===================================================================
--- wireless-testing.orig/drivers/net/wireless/b43/dma.c	2009-11-18 19:21:17.000000000 +0100
+++ wireless-testing/drivers/net/wireless/b43/dma.c	2009-11-19 21:40:45.000000000 +0100
@@ -874,7 +874,7 @@ static void free_all_descbuffers(struct 
 	for (i = 0; i < ring->nr_slots; i++) {
 		desc = ring->ops->idx2desc(ring, i, &meta);
 
-		if (!meta->skb) {
+		if (!meta->skb || b43_dma_ptr_is_poisoned(meta->skb)) {
 			B43_WARN_ON(!ring->tx);
 			continue;
 		}
@@ -926,7 +926,7 @@ struct b43_dmaring *b43_setup_dmaring(st
 				      enum b43_dmatype type)
 {
 	struct b43_dmaring *ring;
-	int err;
+	int i, err;
 	dma_addr_t dma_test;
 
 	ring = kzalloc(sizeof(*ring), GFP_KERNEL);
@@ -941,6 +941,8 @@ struct b43_dmaring *b43_setup_dmaring(st
 			     GFP_KERNEL);
 	if (!ring->meta)
 		goto err_kfree_ring;
+	for (i = 0; i < ring->nr_slots; i++)
+		ring->meta->skb = B43_DMA_PTR_POISON;
 
 	ring->type = type;
 	ring->dev = dev;
@@ -1251,11 +1253,13 @@ struct b43_dmaring *parse_cookie(struct 
 	case 0x5000:
 		ring = dma->tx_ring_mcast;
 		break;
-	default:
-		B43_WARN_ON(1);
 	}
 	*slot = (cookie & 0x0FFF);
-	B43_WARN_ON(!(ring && *slot >= 0 && *slot < ring->nr_slots));
+	if (unlikely(!ring || *slot < 0 || *slot >= ring->nr_slots)) {
+		b43dbg(dev->wl, "TX-status contains "
+		       "invalid cookie: 0x%04X\n", cookie);
+		return NULL;
+	}
 
 	return ring;
 }
@@ -1494,19 +1498,40 @@ void b43_dma_handle_txstatus(struct b43_
 	struct b43_dmaring *ring;
 	struct b43_dmadesc_generic *desc;
 	struct b43_dmadesc_meta *meta;
-	int slot;
+	int slot, firstused;
 	bool frame_succeed;
 
 	ring = parse_cookie(dev, status->cookie, &slot);
 	if (unlikely(!ring))
 		return;
-
 	B43_WARN_ON(!ring->tx);
+
+	/* Sanity check: TX packets are processed in-order on one ring.
+	 * Check if the slot deduced from the cookie really is the first
+	 * used slot. */
+	firstused = ring->current_slot - ring->used_slots + 1;
+	if (firstused < 0)
+		firstused = ring->nr_slots + firstused;
+	if (unlikely(slot != firstused)) {
+		/* This possibly is a firmware bug and will result in
+		 * malfunction, memory leaks and/or stall of DMA functionality. */
+		b43dbg(dev->wl, "Out of order TX status report on DMA ring %d. "
+		       "Expected %d, but got %d\n",
+		       ring->index, firstused, slot);
+		return;
+	}
+
 	ops = ring->ops;
 	while (1) {
-		B43_WARN_ON(!(slot >= 0 && slot < ring->nr_slots));
+		B43_WARN_ON(slot < 0 || slot >= ring->nr_slots);
 		desc = ops->idx2desc(ring, slot, &meta);
 
+		if (b43_dma_ptr_is_poisoned(meta->skb)) {
+			b43dbg(dev->wl, "Poisoned TX slot %d (first=%d) "
+			       "on ring %d\n",
+			       slot, firstused, ring->index);
+			break;
+		}
 		if (meta->skb) {
 			struct b43_private_tx_info *priv_info =
 				b43_get_priv_tx_info(IEEE80211_SKB_CB(meta->skb));
@@ -1522,7 +1547,14 @@ void b43_dma_handle_txstatus(struct b43_
 		if (meta->is_last_fragment) {
 			struct ieee80211_tx_info *info;
 
-			BUG_ON(!meta->skb);
+			if (unlikely(!meta->skb)) {
+				/* This is a scatter-gather fragment of a frame, so
+				 * the skb pointer must not be NULL. */
+				b43dbg(dev->wl, "TX status unexpected NULL skb "
+				       "at slot %d (first=%d) on ring %d\n",
+				       slot, firstused, ring->index);
+				break;
+			}
 
 			info = IEEE80211_SKB_CB(meta->skb);
 
@@ -1540,20 +1572,29 @@ void b43_dma_handle_txstatus(struct b43_
 #endif /* DEBUG */
 			ieee80211_tx_status(dev->wl->hw, meta->skb);
 
-			/* skb is freed by ieee80211_tx_status() */
-			meta->skb = NULL;
+			/* skb will be freed by ieee80211_tx_status().
+			 * Poison our pointer. */
+			meta->skb = B43_DMA_PTR_POISON;
 		} else {
 			/* No need to call free_descriptor_buffer here, as
 			 * this is only the txhdr, which is not allocated.
 			 */
-			B43_WARN_ON(meta->skb);
+			if (unlikely(meta->skb)) {
+				b43dbg(dev->wl, "TX status unexpected non-NULL skb "
+				       "at slot %d (first=%d) on ring %d\n",
+				       slot, firstused, ring->index);
+				break;
+			}
 		}
 
 		/* Everything unmapped and free'd. So it's not used anymore. */
 		ring->used_slots--;
 
-		if (meta->is_last_fragment)
+		if (meta->is_last_fragment) {
+			/* This is the last scatter-gather
+			 * fragment of the frame. We are done. */
 			break;
+		}
 		slot = next_slot(ring, slot);
 	}
 	if (ring->stopped) {
Index: wireless-testing/drivers/net/wireless/b43/dma.h
===================================================================
--- wireless-testing.orig/drivers/net/wireless/b43/dma.h	2009-11-18 19:29:49.000000000 +0100
+++ wireless-testing/drivers/net/wireless/b43/dma.h	2009-11-19 21:16:54.000000000 +0100
@@ -1,7 +1,7 @@
 #ifndef B43_DMA_H_
 #define B43_DMA_H_
 
-#include <linux/ieee80211.h>
+#include <linux/err.h>
 
 #include "b43.h"
 
@@ -164,6 +164,10 @@ struct b43_dmadesc_generic {
 #define B43_RXRING_SLOTS		64
 #define B43_DMA0_RX_BUFFERSIZE		IEEE80211_MAX_FRAME_LEN
 
+/* Pointer poison */
+#define B43_DMA_PTR_POISON		((void *)ERR_PTR(-ENOMEM))
+#define b43_dma_ptr_is_poisoned(ptr)	(unlikely((ptr) == B43_DMA_PTR_POISON))
+
 
 struct sk_buff;
 struct b43_private;

-- 
Greetings, Michael.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] b43: Rewrite DMA Tx status handling sanity checks
  2009-11-19 21:24 [PATCH] b43: Rewrite DMA Tx status handling sanity checks Michael Buesch
@ 2009-11-22 17:52 ` Michael Buesch
  2009-11-22 18:11   ` Larry Finger
  2009-11-23  1:34 ` Larry Finger
  2009-11-23  4:45 ` Larry Finger
  2 siblings, 1 reply; 11+ messages in thread
From: Michael Buesch @ 2009-11-22 17:52 UTC (permalink / raw)
  To: bcm43xx-dev; +Cc: John W. Linville, linux-wireless, Larry Finger, Lorenzo Nava

On Thursday 19 November 2009 22:24:29 Michael Buesch wrote:
> This rewrites the error handling policies in the TX status handler.
> It tries to be error-tolerant as in "try hard to not crash the machine".
> It won't recover from errors (that are bugs in the firmware or driver),
> because that's impossible. However, it will return a more or less useful
> error message and bail out. It also tries hard to use rate-limited messages
> to not flood the syslog in case of a failure.
> 
> Signed-off-by: Michael Buesch <mb@bu3sch.de>

So did somebody try this with opensource firmware, yet?

-- 
Greetings, Michael.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] b43: Rewrite DMA Tx status handling sanity checks
  2009-11-22 17:52 ` Michael Buesch
@ 2009-11-22 18:11   ` Larry Finger
  2009-11-22 18:19     ` Michael Buesch
  0 siblings, 1 reply; 11+ messages in thread
From: Larry Finger @ 2009-11-22 18:11 UTC (permalink / raw)
  To: Michael Buesch; +Cc: bcm43xx-dev, linux-wireless, Francesco Gringoli

On 11/22/2009 11:52 AM, Michael Buesch wrote:
> On Thursday 19 November 2009 22:24:29 Michael Buesch wrote:
>> This rewrites the error handling policies in the TX status handler.
>> It tries to be error-tolerant as in "try hard to not crash the machine".
>> It won't recover from errors (that are bugs in the firmware or driver),
>> because that's impossible. However, it will return a more or less useful
>> error message and bail out. It also tries hard to use rate-limited messages
>> to not flood the syslog in case of a failure.
>>
>> Signed-off-by: Michael Buesch <mb@bu3sch.de>
> 
> So did somebody try this with opensource firmware, yet?

I'm testing now. So far, it has survived about 18 hours running tcpperf in one
console, and a flood ping in another. It looks really good, but I want at least
24 hours before committing.

Larry

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] b43: Rewrite DMA Tx status handling sanity checks
  2009-11-22 18:11   ` Larry Finger
@ 2009-11-22 18:19     ` Michael Buesch
  0 siblings, 0 replies; 11+ messages in thread
From: Michael Buesch @ 2009-11-22 18:19 UTC (permalink / raw)
  To: Larry Finger; +Cc: bcm43xx-dev, linux-wireless, Francesco Gringoli

On Sunday 22 November 2009 19:11:52 Larry Finger wrote:
> On 11/22/2009 11:52 AM, Michael Buesch wrote:
> > On Thursday 19 November 2009 22:24:29 Michael Buesch wrote:
> >> This rewrites the error handling policies in the TX status handler.
> >> It tries to be error-tolerant as in "try hard to not crash the machine".
> >> It won't recover from errors (that are bugs in the firmware or driver),
> >> because that's impossible. However, it will return a more or less useful
> >> error message and bail out. It also tries hard to use rate-limited messages
> >> to not flood the syslog in case of a failure.
> >>
> >> Signed-off-by: Michael Buesch <mb@bu3sch.de>
> > 
> > So did somebody try this with opensource firmware, yet?
> 
> I'm testing now. So far, it has survived about 18 hours running tcpperf in one
> console, and a flood ping in another.

Cool. Thanks for testing. I'd have expected it to blow up, though. It's a little bit
strange, because there still are reports of blowing up opensource firmware. This patch
should produce better error messages in that case (it will not fix the blown firmware).

> It looks really good, but I want at least 
> 24 hours before committing.

Well, no. Just commit it, please. If this breaks, the _firmware_ has to be fixed.
Not the patch.

-- 
Greetings, Michael.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] b43: Rewrite DMA Tx status handling sanity checks
  2009-11-19 21:24 [PATCH] b43: Rewrite DMA Tx status handling sanity checks Michael Buesch
  2009-11-22 17:52 ` Michael Buesch
@ 2009-11-23  1:34 ` Larry Finger
  2009-11-23 10:30   ` Michael Buesch
  2009-11-23  4:45 ` Larry Finger
  2 siblings, 1 reply; 11+ messages in thread
From: Larry Finger @ 2009-11-23  1:34 UTC (permalink / raw)
  To: Michael Buesch; +Cc: John W. Linville, bcm43xx-dev, linux-wireless

On 11/19/2009 03:24 PM, Michael Buesch wrote:
> This rewrites the error handling policies in the TX status handler.
> It tries to be error-tolerant as in "try hard to not crash the machine".
> It won't recover from errors (that are bugs in the firmware or driver),
> because that's impossible. However, it will return a more or less useful
> error message and bail out. It also tries hard to use rate-limited messages
> to not flood the syslog in case of a failure.
> 
> Signed-off-by: Michael Buesch <mb@bu3sch.de>
> 
> ---

Tested and ACKed by Larry Finger. Not only does this improve the error handling
for b43, but it also appears to fix the skb == NULL error that I experienced
with the open-source firmware.

John - please push this into wireless-testing. It should also go to 2.6.32, but
it is likely too large for the current stage. At least Cc it to stable.

Larry

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] b43: Rewrite DMA Tx status handling sanity checks
  2009-11-19 21:24 [PATCH] b43: Rewrite DMA Tx status handling sanity checks Michael Buesch
  2009-11-22 17:52 ` Michael Buesch
  2009-11-23  1:34 ` Larry Finger
@ 2009-11-23  4:45 ` Larry Finger
  2009-11-23 10:49   ` Michael Buesch
  2 siblings, 1 reply; 11+ messages in thread
From: Larry Finger @ 2009-11-23  4:45 UTC (permalink / raw)
  To: Michael Buesch; +Cc: John W. Linville, bcm43xx-dev, linux-wireless

On 11/19/2009 03:24 PM, Michael Buesch wrote:
> This rewrites the error handling policies in the TX status handler.
> It tries to be error-tolerant as in "try hard to not crash the machine".
> It won't recover from errors (that are bugs in the firmware or driver),
> because that's impossible. However, it will return a more or less useful
> error message and bail out. It also tries hard to use rate-limited messages
> to not flood the syslog in case of a failure.

This patch definitely helped open-source firmware, but it is not a complete fix.
Usually, a failure occurs within minutes to a few hours under heavy load. For my
system, the load is repeating tcpperf transmits in one terminal, and a flood
ping in a second. This time, the system ran for ~25 hours before failing. After
removing the header to eliminate wrap-around, the messages logged are:

b43 ssb0:0: firmware: requesting b43-open/pcm5.fw
Loading OpenSource firmware version 410.31754
Hardware crypto acceleration not supported by firmware
QoS not supported by firmware
debug: Chip initialized
debug: 32-bit DMA initialized
debug: QoS disabled
debug: Wireless interface started
debug: Adding Interface type 2
wlan0: direct probe to AP 00:14:bf:85:49:fa (try 1)
wlan0: direct probe responded
wlan0: authenticate with AP 00:14:bf:85:49:fa (try 1)
wlan0: authenticated
wlan0: associate with AP 00:14:bf:85:49:fa (try 1)
wlan0: RX ReassocResp from 00:14:bf:85:49:fa (capab=0x411 status=12 aid=0)
wlan0: AP denied association (code=12)
wlan0: associate with AP 00:14:bf:85:49:fa (try 1)
wlan0: RX ReassocResp from 00:14:bf:85:49:fa (capab=0x411 status=12 aid=0)
wlan0: AP denied association (code=12)
wlan0: deauthenticating from 00:14:bf:85:49:fa by local choice (reason=3)
wlan0: direct probe to AP 00:14:bf:85:49:fa (try 1)
wlan0: direct probe to AP 00:14:bf:85:49:fa (try 1)
wlan0: direct probe responded
wlan0: authenticate with AP 00:14:bf:85:49:fa (try 1)
wlan0: authenticated
wlan0: associate with AP 00:14:bf:85:49:fa (try 1)
wlan0: RX ReassocResp from 00:14:bf:85:49:fa (capab=0x411 status=0 aid=2)
wlan0: associated
spurious 8259A interrupt: IRQ15.
SFW2-INext-DROP-DEFLT IN=wlan0 OUT=
MAC=00:18:39:5e:90:f9:00:14:bf:85:49:f8:08:00 SRC=192.168.1.1 DST=192.168.1.107
LEN=336 TOS=0x00 PREC=0x00 TTL=64 ID=64124 PROTO=UDP SPT=67 DPT=68 LEN=316
SFW2-INext-DROP-DEFLT IN=wlan0 OUT=
MAC=00:18:39:5e:90:f9:00:14:bf:85:49:f8:08:00 SRC=192.168.1.1 DST=192.168.1.107
LEN=336 TOS=0x00 PREC=0x00 TTL=64 ID=6664 PROTO=UDP SPT=67 DPT=68 LEN=316
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 146
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 148
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 118
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 150
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 152
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 154
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 156
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 158
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 160
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 162
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 164
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 134
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 166
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 168
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 170
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 172
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 174
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 176
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 178
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 180
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 182
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 184
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 154
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 186
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 156
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 188
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 158
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 190
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 160
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 192
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 162
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 194
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 196
debug: Out of order TX status report on DMA ring 1. Expected 114, but got 198

These messages continued. Unloading and reloading the driver restored the network.

Larry


^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] b43: Rewrite DMA Tx status handling sanity checks
  2009-11-23  1:34 ` Larry Finger
@ 2009-11-23 10:30   ` Michael Buesch
  0 siblings, 0 replies; 11+ messages in thread
From: Michael Buesch @ 2009-11-23 10:30 UTC (permalink / raw)
  To: Larry Finger; +Cc: John W. Linville, bcm43xx-dev, linux-wireless

On Monday 23 November 2009 02:34:09 Larry Finger wrote:
> On 11/19/2009 03:24 PM, Michael Buesch wrote:
> > This rewrites the error handling policies in the TX status handler.
> > It tries to be error-tolerant as in "try hard to not crash the machine".
> > It won't recover from errors (that are bugs in the firmware or driver),
> > because that's impossible. However, it will return a more or less useful
> > error message and bail out. It also tries hard to use rate-limited messages
> > to not flood the syslog in case of a failure.
> > 
> > Signed-off-by: Michael Buesch <mb@bu3sch.de>
> > 
> > ---
> 
> Tested and ACKed by Larry Finger. Not only does this improve the error handling
> for b43, but it also appears to fix the skb == NULL error that I experienced
> with the open-source firmware.

I don't think there's any way it can fix this. The patch doesn't change the
code behavior. It just changes the sanity checks, that under normal circumstances
should never trigger.

> John - please push this into wireless-testing. It should also go to 2.6.32, but
> it is likely too large for the current stage. At least Cc it to stable.

Don't put it into stable. This is not a fix.
I don't think it's suitable for 2.6.32 at this stage, too.

-- 
Greetings, Michael.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] b43: Rewrite DMA Tx status handling sanity checks
  2009-11-23  4:45 ` Larry Finger
@ 2009-11-23 10:49   ` Michael Buesch
  2009-11-23 11:00     ` Francesco Gringoli
  0 siblings, 1 reply; 11+ messages in thread
From: Michael Buesch @ 2009-11-23 10:49 UTC (permalink / raw)
  To: Larry Finger; +Cc: John W. Linville, bcm43xx-dev, linux-wireless

On Monday 23 November 2009 05:45:47 Larry Finger wrote:
> On 11/19/2009 03:24 PM, Michael Buesch wrote:
> > This rewrites the error handling policies in the TX status handler.
> > It tries to be error-tolerant as in "try hard to not crash the machine".
> > It won't recover from errors (that are bugs in the firmware or driver),
> > because that's impossible. However, it will return a more or less useful
> > error message and bail out. It also tries hard to use rate-limited messages
> > to not flood the syslog in case of a failure.
> 
> This patch definitely helped open-source firmware, but it is not a complete fix.

It is no fix _at_ _all_.
The patch does not change a single line of code that wasn't either an assertion
or a machine crash before.
So it just transforms assertions into more verbose assertions and crashes into
assertions without a crash.

> debug: Out of order TX status report on DMA ring 1. Expected 114, but got 146

Ok, this is what I expected.

Let's see what's going on. Here's the ring. o is unused, * is used.

ooooooooooooooo***************************************************ooooooooooooooooooooooooooo
               ^               ^                                 ^
               114             146                               newest
               oldest

So as you can see, the firmware reported a TX status for a frame right in the middle of
the ringbuffer. The new code detects this now before getting a double free and/or silent
memory corruption (freeing of used memory).

It really is illegal to report a TX status for a frame that's not the oldest one in the ring.
The firmware is required to process all frames in-order on one ring.

So how can this failure happen? I think there basically are three ways this can happen.

- First is that the ordering within one ring really gets messed up and it loses track
  of its ring pointers. I'm not sure if this is likely. Probably not.
- It messes up the ring membership. So it reports a TX status on the wrong ring.
  Note that the "ring" kernel pointer in the TX status report handler is derived
  from the cookie (and so also the number in the message "Out of order TX status
  report on DMA ring 1" is derived from the cookie). So it's untrustworthy in case of
  broken firmware. The firmware has QoS-alike mechanisms, even if QoS is disabled. Maybe
  these mechanisms are broken.
- Third is the possibility of a driver bug. I rule that out as long as nobody is
  able to reproduce it with proprietary firmware.

-- 
Greetings, Michael.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] b43: Rewrite DMA Tx status handling sanity checks
  2009-11-23 10:49   ` Michael Buesch
@ 2009-11-23 11:00     ` Francesco Gringoli
  2009-11-23 11:05       ` Michael Buesch
  2009-11-23 15:41       ` Larry Finger
  0 siblings, 2 replies; 11+ messages in thread
From: Francesco Gringoli @ 2009-11-23 11:00 UTC (permalink / raw)
  To: Broadcom Wireless; +Cc: Larry Finger, linux-wireless, Michael Buesch

On Nov 23, 2009, at 11:49 AM, Michael Buesch wrote:

> On Monday 23 November 2009 05:45:47 Larry Finger wrote:
>> On 11/19/2009 03:24 PM, Michael Buesch wrote:
>>> This rewrites the error handling policies in the TX status handler.
>>> It tries to be error-tolerant as in "try hard to not crash the  
>>> machine".
>>> It won't recover from errors (that are bugs in the firmware or  
>>> driver),
>>> because that's impossible. However, it will return a more or less  
>>> useful
>>> error message and bail out. It also tries hard to use rate-limited  
>>> messages
>>> to not flood the syslog in case of a failure.
>>
>> This patch definitely helped open-source firmware, but it is not a  
>> complete fix.
>
> It is no fix _at_ _all_.
> The patch does not change a single line of code that wasn't either  
> an assertion
> or a machine crash before.
> So it just transforms assertions into more verbose assertions and  
> crashes into
> assertions without a crash.
>
>> debug: Out of order TX status report on DMA ring 1. Expected 114,  
>> but got 146
>
> Ok, this is what I expected.
>
> Let's see what's going on. Here's the ring. o is unused, * is used.
>
> ooooooooooooooo 
> ***************************************************ooooooooooooooooooooooooooo
>               ^               ^                                 ^
>               114             146                               newest
>               oldest
>
> So as you can see, the firmware reported a TX status for a frame  
> right in the middle of
> the ringbuffer. The new code detects this now before getting a  
> double free and/or silent
> memory corruption (freeing of used memory).
Hi Michael,

so you can observe this behavior at your site. Do you mind describing  
the exact configuration? Maybe this time I can reproduce this  
behavior, as I tried everything to make it happen. I also asked Larry  
one of his boards and put it into several PCs but had no chance to  
reproduce the crash. Could you please also report the neighboring  
stations, the AP you are connected and so on.

Many thanks,
-Francesco



Informativa sulla privacy: http://help.ing.unibs.it/privacy.php



^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] b43: Rewrite DMA Tx status handling sanity checks
  2009-11-23 11:00     ` Francesco Gringoli
@ 2009-11-23 11:05       ` Michael Buesch
  2009-11-23 15:41       ` Larry Finger
  1 sibling, 0 replies; 11+ messages in thread
From: Michael Buesch @ 2009-11-23 11:05 UTC (permalink / raw)
  To: Francesco Gringoli; +Cc: Broadcom Wireless, Larry Finger, linux-wireless

On Monday 23 November 2009 12:00:15 Francesco Gringoli wrote:
> Hi Michael,
> 
> so you can observe this behavior at your site. Do you mind describing  
> the exact configuration? Maybe this time I can reproduce this  
> behavior, as I tried everything to make it happen. I also asked Larry  
> one of his boards and put it into several PCs but had no chance to  
> reproduce the crash. Could you please also report the neighboring  
> stations, the AP you are connected and so on.

I don't reproduce this, because I never really tried running the opensource firmware.
Larry reproduced it, this time.

-- 
Greetings, Michael.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH] b43: Rewrite DMA Tx status handling sanity checks
  2009-11-23 11:00     ` Francesco Gringoli
  2009-11-23 11:05       ` Michael Buesch
@ 2009-11-23 15:41       ` Larry Finger
  1 sibling, 0 replies; 11+ messages in thread
From: Larry Finger @ 2009-11-23 15:41 UTC (permalink / raw)
  To: Francesco Gringoli; +Cc: Broadcom Wireless, linux-wireless, Michael Buesch

On 11/23/2009 05:00 AM, Francesco Gringoli wrote:
> 
> so you can observe this behavior at your site. Do you mind describing
> the exact configuration? Maybe this time I can reproduce this behavior,
> as I tried everything to make it happen. I also asked Larry one of his
> boards and put it into several PCs but had no chance to reproduce the
> crash. Could you please also report the neighboring stations, the AP you
> are connected and so on.

As Michael said, I was the one that reported the behavior. My card was a BCM4318
in a Cardbus format running V5.2 of the openfwwf. The AP is a Linksys WRT54G V5
running standard firmware v1.02.6. A list of nearby AP's with channels and
strengths are as follows:

          Cell 01 - Address: 00:14:BF:85:49:FA
                    Channel:1
                    Frequency:2.412 GHz (Channel 1)
                    Quality=70/70  Signal level=-6 dBm
                    Encryption key:on  (WPA2)
                    ESSID:"lwfdjf_rad"
          Cell 02 - Address: 00:1B:11:5C:B0:83
                    Channel:1
                    Frequency:2.412 GHz (Channel 1)
                    Quality=25/70  Signal level=-85 dBm
                    Encryption key:on   (WEP)
                    ESSID:"Browns"
          Cell 03 - Address: 00:1A:70:46:BA:B1
                    Channel:11
                    Frequency:2.462 GHz (Channel 11)
                    Quality=42/70  Signal level=-68 dBm
                    Encryption key:on    (WEP)
                    ESSID:"Larry with space"
          Cell 04 - Address: 00:23:69:81:B7:D9
                    Channel:6
                    Frequency:2.437 GHz (Channel 6)
                    Quality=34/70  Signal level=-76 dBm
                    Encryption key:on    (WEP)
                    ESSID:"Hoover"
          Cell 05 - Address: 00:14:BF:0C:7E:14
                    Channel:6
                    Frequency:2.437 GHz (Channel 6)
                    Quality=28/70  Signal level=-82 dBm
                    Encryption key:on     (WPA)
                    ESSID:"linksys"
          Cell 06 - Address: 00:22:6B:78:18:7D
                    Channel:11
                    Frequency:2.462 GHz (Channel 11)
                    Quality=26/70  Signal level=-84 dBm
                    Encryption key:on     (WPA)
                    ESSID:"Go Away"

I was connected to the AP in Cell 01. The test was my usual with a repeating
tcpperf run in one terminal and a flood ping to the same server in a second.
Please let me know if I missed any useful information.

This condition may take a long time to show up. For instance, my latest test ran
for 25 hours before failure. All other tests have failed much more quickly, but
one never knows. I have not seen this failure with standard firmware.

Larry

Larry






^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2009-11-23 15:41 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-11-19 21:24 [PATCH] b43: Rewrite DMA Tx status handling sanity checks Michael Buesch
2009-11-22 17:52 ` Michael Buesch
2009-11-22 18:11   ` Larry Finger
2009-11-22 18:19     ` Michael Buesch
2009-11-23  1:34 ` Larry Finger
2009-11-23 10:30   ` Michael Buesch
2009-11-23  4:45 ` Larry Finger
2009-11-23 10:49   ` Michael Buesch
2009-11-23 11:00     ` Francesco Gringoli
2009-11-23 11:05       ` Michael Buesch
2009-11-23 15:41       ` Larry Finger

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.