linux-wireless.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] ath9k:  Implement rx copy-break.
@ 2011-01-08 15:33 greearb
  2011-01-09  0:20 ` Felix Fietkau
  2011-01-09  8:00 ` Gabor Juhos
  0 siblings, 2 replies; 16+ messages in thread
From: greearb @ 2011-01-08 15:33 UTC (permalink / raw)
  To: linux-wireless; +Cc: ath9k-devel, Ben Greear

From: Ben Greear <greearb@candelatech.com>

This saves us constantly allocating large, multi-page
skbs.  It should fix the order-1 allocation errors reported,
and in a 60-vif scenario, this significantly decreases CPU
utilization, and latency, and increases bandwidth.

Signed-off-by: Ben Greear <greearb@candelatech.com>
---
:100644 100644 b2497b8... ea2f67c... M	drivers/net/wireless/ath/ath9k/recv.c
 drivers/net/wireless/ath/ath9k/recv.c |   92 ++++++++++++++++++++++-----------
 1 files changed, 61 insertions(+), 31 deletions(-)

diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c
index b2497b8..ea2f67c 100644
--- a/drivers/net/wireless/ath/ath9k/recv.c
+++ b/drivers/net/wireless/ath/ath9k/recv.c
@@ -16,6 +16,7 @@
 
 #include "ath9k.h"
 #include "ar9003_mac.h"
+#include <linux/pci.h>
 
 #define SKB_CB_ATHBUF(__skb)	(*((struct ath_buf **)__skb->cb))
 
@@ -1623,7 +1624,7 @@ div_comb_done:
 int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
 {
 	struct ath_buf *bf;
-	struct sk_buff *skb = NULL, *requeue_skb;
+	struct sk_buff *skb = NULL, *requeue_skb = NULL;
 	struct ieee80211_rx_status *rxs;
 	struct ath_hw *ah = sc->sc_ah;
 	struct ath_common *common = ath9k_hw_common(ah);
@@ -1634,7 +1635,8 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
 	 */
 	struct ieee80211_hw *hw = NULL;
 	struct ieee80211_hdr *hdr;
-	int retval;
+	int retval, len;
+	bool use_copybreak = true;
 	bool decrypt_error = false;
 	struct ath_rx_status rs;
 	enum ath9k_rx_qtype qtype;
@@ -1702,42 +1704,70 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
 		    unlikely(tsf_lower - rs.rs_tstamp > 0x10000000))
 			rxs->mactime += 0x100000000ULL;
 
-		/* Ensure we always have an skb to requeue once we are done
-		 * processing the current buffer's skb */
-		requeue_skb = ath_rxbuf_alloc(common, common->rx_bufsize, GFP_ATOMIC);
-
-		/* If there is no memory we ignore the current RX'd frame,
-		 * tell hardware it can give us a new frame using the old
-		 * skb and put it at the tail of the sc->rx.rxbuf list for
-		 * processing. */
-		if (!requeue_skb)
-			goto requeue;
-
-		/* Unmap the frame */
-		dma_unmap_single(sc->dev, bf->bf_buf_addr,
-				 common->rx_bufsize,
-				 dma_type);
+		len = rs.rs_datalen + ah->caps.rx_status_len;
+		if (use_copybreak) {
+			skb = netdev_alloc_skb(NULL, len);
+			if (!skb) {
+				skb = bf->bf_mpdu;
+				use_copybreak = false;
+				goto non_copybreak;
+			}
+		} else {
+non_copybreak:
+			/* Ensure we always have an skb to requeue once we are
+			 * done processing the current buffer's skb */
+			requeue_skb = ath_rxbuf_alloc(common,
+						      common->rx_bufsize,
+						      GFP_ATOMIC);
+
+			/* If there is no memory we ignore the current RX'd
+			 * frame, tell hardware it can give us a new frame
+			 * using the old skb and put it at the tail of the
+			 * sc->rx.rxbuf list for processing. */
+			if (!requeue_skb)
+				goto requeue;
+
+			/* Unmap the frame */
+			dma_unmap_single(sc->dev, bf->bf_buf_addr,
+					 common->rx_bufsize,
+					 dma_type);
+		}
 
-		skb_put(skb, rs.rs_datalen + ah->caps.rx_status_len);
+		skb_put(skb, len);
 		if (ah->caps.rx_status_len)
 			skb_pull(skb, ah->caps.rx_status_len);
 
+		if (use_copybreak) {
+			struct pci_dev *pdev = to_pci_dev(sc->dev);
+			pci_dma_sync_single_for_cpu(pdev, bf->bf_buf_addr,
+						    len, PCI_DMA_FROMDEVICE);
+			skb_copy_from_linear_data(bf->bf_mpdu, skb->data, len);
+			pci_dma_sync_single_for_device(pdev, bf->bf_buf_addr,
+						       len, PCI_DMA_FROMDEVICE);
+			memcpy(skb->cb, bf->bf_mpdu->cb, sizeof(skb->cb));
+			rxs =  IEEE80211_SKB_RXCB(skb);
+		}
+
 		ath9k_rx_skb_postprocess(common, skb, &rs,
 					 rxs, decrypt_error);
 
-		/* We will now give hardware our shiny new allocated skb */
-		bf->bf_mpdu = requeue_skb;
-		bf->bf_buf_addr = dma_map_single(sc->dev, requeue_skb->data,
-						 common->rx_bufsize,
-						 dma_type);
-		if (unlikely(dma_mapping_error(sc->dev,
-			  bf->bf_buf_addr))) {
-			dev_kfree_skb_any(requeue_skb);
-			bf->bf_mpdu = NULL;
-			bf->bf_buf_addr = 0;
-			ath_err(common, "dma_mapping_error() on RX\n");
-			ath_rx_send_to_mac80211(hw, sc, skb);
-			break;
+		if (!use_copybreak) {
+			/* We will now give hardware our shiny new allocated
+			 * skb */
+			bf->bf_mpdu = requeue_skb;
+			bf->bf_buf_addr = dma_map_single(sc->dev,
+							 requeue_skb->data,
+							 common->rx_bufsize,
+							 dma_type);
+			if (unlikely(dma_mapping_error(sc->dev,
+						       bf->bf_buf_addr))) {
+				dev_kfree_skb_any(requeue_skb);
+				bf->bf_mpdu = NULL;
+				bf->bf_buf_addr = 0;
+				ath_err(common, "dma_mapping_error() on RX\n");
+				ath_rx_send_to_mac80211(hw, sc, skb);
+				break;
+			}
 		}
 
 		/*
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-08 15:33 [PATCH] ath9k: Implement rx copy-break greearb
@ 2011-01-09  0:20 ` Felix Fietkau
  2011-01-09  0:36   ` Ben Greear
  2011-01-09  8:00 ` Gabor Juhos
  1 sibling, 1 reply; 16+ messages in thread
From: Felix Fietkau @ 2011-01-09  0:20 UTC (permalink / raw)
  To: greearb; +Cc: linux-wireless, ath9k-devel

On 2011-01-08 8:33 AM, greearb@candelatech.com wrote:
> From: Ben Greear<greearb@candelatech.com>
>
> This saves us constantly allocating large, multi-page
> skbs.  It should fix the order-1 allocation errors reported,
> and in a 60-vif scenario, this significantly decreases CPU
> utilization, and latency, and increases bandwidth.
>
> Signed-off-by: Ben Greear<greearb@candelatech.com>
> ---
> :100644 100644 b2497b8... ea2f67c... M	drivers/net/wireless/ath/ath9k/recv.c
>   drivers/net/wireless/ath/ath9k/recv.c |   92 ++++++++++++++++++++++-----------
>   1 files changed, 61 insertions(+), 31 deletions(-)
>
> diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c
> index b2497b8..ea2f67c 100644
> --- a/drivers/net/wireless/ath/ath9k/recv.c
> +++ b/drivers/net/wireless/ath/ath9k/recv.c
> @@ -1702,42 +1704,70 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
>   		    unlikely(tsf_lower - rs.rs_tstamp>  0x10000000))
>   			rxs->mactime += 0x100000000ULL;
>
> -		/* Ensure we always have an skb to requeue once we are done
> -		 * processing the current buffer's skb */
> -		requeue_skb = ath_rxbuf_alloc(common, common->rx_bufsize, GFP_ATOMIC);
> -
> -		/* If there is no memory we ignore the current RX'd frame,
> -		 * tell hardware it can give us a new frame using the old
> -		 * skb and put it at the tail of the sc->rx.rxbuf list for
> -		 * processing. */
> -		if (!requeue_skb)
> -			goto requeue;
> -
> -		/* Unmap the frame */
> -		dma_unmap_single(sc->dev, bf->bf_buf_addr,
> -				 common->rx_bufsize,
> -				 dma_type);
> +		len = rs.rs_datalen + ah->caps.rx_status_len;
> +		if (use_copybreak) {
> +			skb = netdev_alloc_skb(NULL, len);
> +			if (!skb) {
> +				skb = bf->bf_mpdu;
> +				use_copybreak = false;
> +				goto non_copybreak;
> +			}
> +		} else {
I think this should be dependent on packet size, maybe even based on the 
architecture. Especially on embedded hardware, copying large frames is 
probably quite a bit more expensive than allocating large buffers. Cache 
sizes are small, memory access takes several cycles, especially during 
concurrent DMA.
Once I'm back home, I could try a few packet size threshold to find a 
sweet spot for the typical MIPS hardware that I'm playing with. I expect 
a visible performance regression from this patch when applied as-is.

- Felix

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-09  0:20 ` Felix Fietkau
@ 2011-01-09  0:36   ` Ben Greear
  2011-01-09  0:41     ` Felix Fietkau
  2011-01-09 18:13     ` Jouni Malinen
  0 siblings, 2 replies; 16+ messages in thread
From: Ben Greear @ 2011-01-09  0:36 UTC (permalink / raw)
  To: Felix Fietkau; +Cc: linux-wireless, ath9k-devel

On 01/08/2011 04:20 PM, Felix Fietkau wrote:
> On 2011-01-08 8:33 AM, greearb@candelatech.com wrote:
>> From: Ben Greear<greearb@candelatech.com>
>>
>> This saves us constantly allocating large, multi-page
>> skbs. It should fix the order-1 allocation errors reported,
>> and in a 60-vif scenario, this significantly decreases CPU
>> utilization, and latency, and increases bandwidth.
>>
>> Signed-off-by: Ben Greear<greearb@candelatech.com>
>> ---
>> :100644 100644 b2497b8... ea2f67c... M drivers/net/wireless/ath/ath9k/recv.c
>> drivers/net/wireless/ath/ath9k/recv.c | 92 ++++++++++++++++++++++-----------
>> 1 files changed, 61 insertions(+), 31 deletions(-)
>>
>> diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c
>> index b2497b8..ea2f67c 100644
>> --- a/drivers/net/wireless/ath/ath9k/recv.c
>> +++ b/drivers/net/wireless/ath/ath9k/recv.c
>> @@ -1702,42 +1704,70 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
>> unlikely(tsf_lower - rs.rs_tstamp> 0x10000000))
>> rxs->mactime += 0x100000000ULL;
>>
>> - /* Ensure we always have an skb to requeue once we are done
>> - * processing the current buffer's skb */
>> - requeue_skb = ath_rxbuf_alloc(common, common->rx_bufsize, GFP_ATOMIC);
>> -
>> - /* If there is no memory we ignore the current RX'd frame,
>> - * tell hardware it can give us a new frame using the old
>> - * skb and put it at the tail of the sc->rx.rxbuf list for
>> - * processing. */
>> - if (!requeue_skb)
>> - goto requeue;
>> -
>> - /* Unmap the frame */
>> - dma_unmap_single(sc->dev, bf->bf_buf_addr,
>> - common->rx_bufsize,
>> - dma_type);
>> + len = rs.rs_datalen + ah->caps.rx_status_len;
>> + if (use_copybreak) {
>> + skb = netdev_alloc_skb(NULL, len);
>> + if (!skb) {
>> + skb = bf->bf_mpdu;
>> + use_copybreak = false;
>> + goto non_copybreak;
>> + }
>> + } else {
> I think this should be dependent on packet size, maybe even based on the architecture. Especially on embedded hardware, copying large frames is probably quite a
> bit more expensive than allocating large buffers. Cache sizes are small, memory access takes several cycles, especially during concurrent DMA.
> Once I'm back home, I could try a few packet size threshold to find a sweet spot for the typical MIPS hardware that I'm playing with. I expect a visible
> performance regression from this patch when applied as-is.

I see a serious performance improvement with this patch.  My current test is sending 1024 byte UDP
payloads to/from each of 60 stations at 128kbps.  Please do try it out on your system and see how
it performs there.  I'm guessing that any time you have more than 1 VIF this will be a good
improvement since mac80211 does skb_copy (and you would typically be copying a much smaller
packet with this patch).

If we do see performance differences on different platforms, this could perhaps be
something we could tune at run-time.

Thanks,
Ben

>
> - Felix


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-09  0:36   ` Ben Greear
@ 2011-01-09  0:41     ` Felix Fietkau
  2011-01-09  1:06       ` Ben Greear
  2011-01-09 18:13     ` Jouni Malinen
  1 sibling, 1 reply; 16+ messages in thread
From: Felix Fietkau @ 2011-01-09  0:41 UTC (permalink / raw)
  To: Ben Greear; +Cc: linux-wireless, ath9k-devel

On 2011-01-08 5:36 PM, Ben Greear wrote:
> On 01/08/2011 04:20 PM, Felix Fietkau wrote:
>>  I think this should be dependent on packet size, maybe even based on the architecture. Especially on embedded hardware, copying large frames is probably quite a
>>  bit more expensive than allocating large buffers. Cache sizes are small, memory access takes several cycles, especially during concurrent DMA.
>>  Once I'm back home, I could try a few packet size threshold to find a sweet spot for the typical MIPS hardware that I'm playing with. I expect a visible
>>  performance regression from this patch when applied as-is.
>
> I see a serious performance improvement with this patch.  My current test is sending 1024 byte UDP
> payloads to/from each of 60 stations at 128kbps.  Please do try it out on your system and see how
> it performs there.  I'm guessing that any time you have more than 1 VIF this will be a good
> improvement since mac80211 does skb_copy (and you would typically be copying a much smaller
> packet with this patch).
>
> If we do see performance differences on different platforms, this could perhaps be
> something we could tune at run-time.
What kind of system are you testing on? If it's a PC, then the 
performance characteristics will be completely different compared to 
embedded hardware. I've had to remove a few copybreak-like 
implementations from various ethernet drivers on similar hardware, 
because even taking the hit of unaligned load/store exceptions (which 
are already *very* expensive on MIPS) was less than copying the full 
packet data, even with packet sizes less than what you're using.

I don't have suitable test hardware with me right now, but I'll do some 
tests in a week or so.

- Felix

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-09  0:41     ` Felix Fietkau
@ 2011-01-09  1:06       ` Ben Greear
  2011-01-09 14:15         ` Björn Smedman
  0 siblings, 1 reply; 16+ messages in thread
From: Ben Greear @ 2011-01-09  1:06 UTC (permalink / raw)
  To: Felix Fietkau; +Cc: linux-wireless, ath9k-devel

On 01/08/2011 04:41 PM, Felix Fietkau wrote:
> On 2011-01-08 5:36 PM, Ben Greear wrote:
>> On 01/08/2011 04:20 PM, Felix Fietkau wrote:
>>> I think this should be dependent on packet size, maybe even based on the architecture. Especially on embedded hardware, copying large frames is probably quite a
>>> bit more expensive than allocating large buffers. Cache sizes are small, memory access takes several cycles, especially during concurrent DMA.
>>> Once I'm back home, I could try a few packet size threshold to find a sweet spot for the typical MIPS hardware that I'm playing with. I expect a visible
>>> performance regression from this patch when applied as-is.
>>
>> I see a serious performance improvement with this patch. My current test is sending 1024 byte UDP
>> payloads to/from each of 60 stations at 128kbps. Please do try it out on your system and see how
>> it performs there. I'm guessing that any time you have more than 1 VIF this will be a good
>> improvement since mac80211 does skb_copy (and you would typically be copying a much smaller
>> packet with this patch).
>>
>> If we do see performance differences on different platforms, this could perhaps be
>> something we could tune at run-time.
> What kind of system are you testing on? If it's a PC, then the performance characteristics will be completely different compared to embedded hardware. I've had
> to remove a few copybreak-like implementations from various ethernet drivers on similar hardware, because even taking the hit of unaligned load/store exceptions
> (which are already *very* expensive on MIPS) was less than copying the full packet data, even with packet sizes less than what you're using.
>
> I don't have suitable test hardware with me right now, but I'll do some tests in a week or so.

I'm on a dual-core Atom processor.  I'm interested in your MIPs results when you get them...

Thanks,
Ben

>
> - Felix


-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-08 15:33 [PATCH] ath9k: Implement rx copy-break greearb
  2011-01-09  0:20 ` Felix Fietkau
@ 2011-01-09  8:00 ` Gabor Juhos
  2011-01-09 17:49   ` Ben Greear
  1 sibling, 1 reply; 16+ messages in thread
From: Gabor Juhos @ 2011-01-09  8:00 UTC (permalink / raw)
  To: greearb; +Cc: linux-wireless, ath9k-devel

Hi Ben,

> From: Ben Greear <greearb@candelatech.com>
> 
> This saves us constantly allocating large, multi-page
> skbs.  It should fix the order-1 allocation errors reported,
> and in a 60-vif scenario, this significantly decreases CPU
> utilization, and latency, and increases bandwidth.
> 
> Signed-off-by: Ben Greear <greearb@candelatech.com>
> ---
> :100644 100644 b2497b8... ea2f67c... M	drivers/net/wireless/ath/ath9k/recv.c
>  drivers/net/wireless/ath/ath9k/recv.c |   92 ++++++++++++++++++++++-----------
>  1 files changed, 61 insertions(+), 31 deletions(-)

<...>

> +		if (use_copybreak) {
> +			struct pci_dev *pdev = to_pci_dev(sc->dev);

This would cause undefined behaviour with ath9k devices sitting on an AHB bus.

> +			pci_dma_sync_single_for_cpu(pdev, bf->bf_buf_addr,
> +						    len, PCI_DMA_FROMDEVICE);
> +			skb_copy_from_linear_data(bf->bf_mpdu, skb->data, len);
> +			pci_dma_sync_single_for_device(pdev, bf->bf_buf_addr,
> +						       len, PCI_DMA_FROMDEVICE);

Please use the bus agnostic equivalents of these DMA functions.

- Gabor

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k: Implement rx copy-break.
  2011-01-09  1:06       ` Ben Greear
@ 2011-01-09 14:15         ` Björn Smedman
  2011-01-09 14:18           ` Felix Fietkau
  0 siblings, 1 reply; 16+ messages in thread
From: Björn Smedman @ 2011-01-09 14:15 UTC (permalink / raw)
  To: Ben Greear; +Cc: Felix Fietkau, linux-wireless, ath9k-devel

On Sun, Jan 9, 2011 at 2:06 AM, Ben Greear <greearb@candelatech.com> wrote:
> On 01/08/2011 04:41 PM, Felix Fietkau wrote:
>>
>> On 2011-01-08 5:36 PM, Ben Greear wrote:
>>>
>>> On 01/08/2011 04:20 PM, Felix Fietkau wrote:
>>>>
>>>> I think this should be dependent on packet size, maybe even based on the
>>>> architecture. Especially on embedded hardware, copying large frames is
>>>> probably quite a
>>>> bit more expensive than allocating large buffers. Cache sizes are small,
>>>> memory access takes several cycles, especially during concurrent DMA.
>>>> Once I'm back home, I could try a few packet size threshold to find a
>>>> sweet spot for the typical MIPS hardware that I'm playing with. I expect a
>>>> visible
>>>> performance regression from this patch when applied as-is.
>>>
>>> I see a serious performance improvement with this patch. My current test
>>> is sending 1024 byte UDP
>>> payloads to/from each of 60 stations at 128kbps. Please do try it out on
>>> your system and see how
>>> it performs there. I'm guessing that any time you have more than 1 VIF
>>> this will be a good
>>> improvement since mac80211 does skb_copy (and you would typically be
>>> copying a much smaller
>>> packet with this patch).
>>>
>>> If we do see performance differences on different platforms, this could
>>> perhaps be
>>> something we could tune at run-time.
>>
>> What kind of system are you testing on? If it's a PC, then the performance
>> characteristics will be completely different compared to embedded hardware.
>> I've had
>> to remove a few copybreak-like implementations from various ethernet
>> drivers on similar hardware, because even taking the hit of unaligned
>> load/store exceptions
>> (which are already *very* expensive on MIPS) was less than copying the
>> full packet data, even with packet sizes less than what you're using.
>>
>> I don't have suitable test hardware with me right now, but I'll do some
>> tests in a week or so.
>
> I'm on a dual-core Atom processor.  I'm interested in your MIPs results when
> you get them...

I think we should also consider the added stability/saneness with this
patch. I for one would be willing to live with some extra cpu load if
that means the unstoppable rx dma problem can be contained (all the
time).

Perhaps make it a configuration option?

/Björn

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k: Implement rx copy-break.
  2011-01-09 14:15         ` Björn Smedman
@ 2011-01-09 14:18           ` Felix Fietkau
  2011-01-09 15:35             ` Björn Smedman
  0 siblings, 1 reply; 16+ messages in thread
From: Felix Fietkau @ 2011-01-09 14:18 UTC (permalink / raw)
  To: Björn Smedman; +Cc: Ben Greear, linux-wireless, ath9k-devel

On 2011-01-09 7:15 AM, Björn Smedman wrote:
> I think we should also consider the added stability/saneness with this
> patch. I for one would be willing to live with some extra cpu load if
> that means the unstoppable rx dma problem can be contained (all the
> time).
I don't think this patch has anything to do with the rx dma stop issue.

- Felix

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k: Implement rx copy-break.
  2011-01-09 14:18           ` Felix Fietkau
@ 2011-01-09 15:35             ` Björn Smedman
  0 siblings, 0 replies; 16+ messages in thread
From: Björn Smedman @ 2011-01-09 15:35 UTC (permalink / raw)
  To: Felix Fietkau; +Cc: Ben Greear, linux-wireless, ath9k-devel

2011/1/9 Felix Fietkau <nbd@openwrt.org>:
> On 2011-01-09 7:15 AM, Björn Smedman wrote:
>>
>> I think we should also consider the added stability/saneness with this
>> patch. I for one would be willing to live with some extra cpu load if
>> that means the unstoppable rx dma problem can be contained (all the
>> time).
>
> I don't think this patch has anything to do with the rx dma stop issue.

Ok, so if you have a machine gun that can go off at any time and is
impossible to stop you don't think it makes a difference where it's
pointed? ;) Just kidding, and call me superstitious, but I would feel
much better knowing ath9k is pointing that half-broken dma engine at
it's own data only, at least until it's 100% stable.

/Björn

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-09  8:00 ` Gabor Juhos
@ 2011-01-09 17:49   ` Ben Greear
  2011-01-10  7:14     ` Gabor Juhos
  0 siblings, 1 reply; 16+ messages in thread
From: Ben Greear @ 2011-01-09 17:49 UTC (permalink / raw)
  To: Gabor Juhos; +Cc: linux-wireless, ath9k-devel

On 01/09/2011 12:00 AM, Gabor Juhos wrote:
> Hi Ben,
>
>> From: Ben Greear<greearb@candelatech.com>
>>
>> This saves us constantly allocating large, multi-page
>> skbs.  It should fix the order-1 allocation errors reported,
>> and in a 60-vif scenario, this significantly decreases CPU
>> utilization, and latency, and increases bandwidth.
>>
>> Signed-off-by: Ben Greear<greearb@candelatech.com>
>> ---
>> :100644 100644 b2497b8... ea2f67c... M	drivers/net/wireless/ath/ath9k/recv.c
>>   drivers/net/wireless/ath/ath9k/recv.c |   92 ++++++++++++++++++++++-----------
>>   1 files changed, 61 insertions(+), 31 deletions(-)
>
> <...>
>
>> +		if (use_copybreak) {
>> +			struct pci_dev *pdev = to_pci_dev(sc->dev);
>
> This would cause undefined behaviour with ath9k devices sitting on an AHB bus.
>
>> +			pci_dma_sync_single_for_cpu(pdev, bf->bf_buf_addr,
>> +						    len, PCI_DMA_FROMDEVICE);
>> +			skb_copy_from_linear_data(bf->bf_mpdu, skb->data, len);
>> +			pci_dma_sync_single_for_device(pdev, bf->bf_buf_addr,
>> +						       len, PCI_DMA_FROMDEVICE);
>
> Please use the bus agnostic equivalents of these DMA functions.

Any idea what that might be?

Should we just disable copybreak for things on AHB bus?

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-09  0:36   ` Ben Greear
  2011-01-09  0:41     ` Felix Fietkau
@ 2011-01-09 18:13     ` Jouni Malinen
  2011-01-09 20:14       ` Christian Lamparter
  2011-01-10  4:32       ` Ben Greear
  1 sibling, 2 replies; 16+ messages in thread
From: Jouni Malinen @ 2011-01-09 18:13 UTC (permalink / raw)
  To: Ben Greear; +Cc: Felix Fietkau, linux-wireless, ath9k-devel

On Sat, Jan 08, 2011 at 04:36:23PM -0800, Ben Greear wrote:
> On 01/08/2011 04:20 PM, Felix Fietkau wrote:
> >On 2011-01-08 8:33 AM, greearb@candelatech.com wrote:
> >>From: Ben Greear<greearb@candelatech.com>
> >>This saves us constantly allocating large, multi-page
> >>skbs. It should fix the order-1 allocation errors reported,
> >>and in a 60-vif scenario, this significantly decreases CPU
> >>utilization, and latency, and increases bandwidth.

As far as CPU use is concerned, 60 VIF scenario should not be the one to
use for checking what is most efficient.. This really needs to be tested
on something that uses a single VIF on an embedded (low-power CPU)..

For the order-1 allocation issues, it would be interesting to see if
someone could take a look at using paged skbs or multiple RX descriptors
with shorter skbs (and copying only for the case where a long frame is
received so that only the A-MSDU RX case would suffer from extra
copying).

> I see a serious performance improvement with this patch.  My current test is sending 1024 byte UDP
> payloads to/from each of 60 stations at 128kbps.  Please do try it out on your system and see how
> it performs there.  I'm guessing that any time you have more than 1 VIF this will be a good
> improvement since mac80211 does skb_copy (and you would typically be copying a much smaller
> packet with this patch).

How would this patch change the number of bytes copied by skb_copy?

> If we do see performance differences on different platforms, this could perhaps be
> something we could tune at run-time.

I guess that could be looked at, but as long as that is not the case,
the test setup you used is not exactly the most common case for ath9k in
the upstream kernel and should not be used to figure out default
behavior.
 
-- 
Jouni Malinen                                            PGP id EFC895FA

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-09 18:13     ` Jouni Malinen
@ 2011-01-09 20:14       ` Christian Lamparter
  2011-01-09 20:24         ` Felix Fietkau
  2011-01-10 12:40         ` Jouni Malinen
  2011-01-10  4:32       ` Ben Greear
  1 sibling, 2 replies; 16+ messages in thread
From: Christian Lamparter @ 2011-01-09 20:14 UTC (permalink / raw)
  To: Jouni Malinen; +Cc: Ben Greear, Felix Fietkau, linux-wireless, ath9k-devel

On Sunday 09 January 2011 19:13:04 Jouni Malinen wrote:
> On Sat, Jan 08, 2011 at 04:36:23PM -0800, Ben Greear wrote:
> > On 01/08/2011 04:20 PM, Felix Fietkau wrote:
> > >On 2011-01-08 8:33 AM, greearb@candelatech.com wrote:
> > >>From: Ben Greear<greearb@candelatech.com>
> > >>This saves us constantly allocating large, multi-page
> > >>skbs. It should fix the order-1 allocation errors reported,
> > >>and in a 60-vif scenario, this significantly decreases CPU
> > >>utilization, and latency, and increases bandwidth.
> 
> As far as CPU use is concerned, 60 VIF scenario should not be the one to
> use for checking what is most efficient.. This really needs to be tested
> on something that uses a single VIF on an embedded (low-power CPU)..
> 
> For the order-1 allocation issues, it would be interesting to see if
> someone could take a look at using paged skbs or multiple RX descriptors
> with shorter skbs (and copying only for the case where a long frame is
> received so that only the A-MSDU RX case would suffer from extra
> copying).

Well, here's a shot at paged rx. It'll only work when PAGE_SIZE > buf_size
(which will be true for most system, I guess)

No idea how to handle EDMA... In fact I don't have any ath9k* solution
at the moment to test ;), so you better backup your data! 
---
diff --git a/drivers/net/wireless/ath/ath9k/ath9k.h b/drivers/net/wireless/ath/ath9k/ath9k.h
index 3681caf5..b113f44 100644
--- a/drivers/net/wireless/ath/ath9k/ath9k.h
+++ b/drivers/net/wireless/ath/ath9k/ath9k.h
@@ -227,6 +227,7 @@ struct ath_buf {
 					   an aggregate) */
 	struct ath_buf *bf_next;	/* next subframe in the aggregate */
 	struct sk_buff *bf_mpdu;	/* enclosing frame structure */
+	struct page *page;
 	void *bf_desc;			/* virtual addr of desc */
 	dma_addr_t bf_daddr;		/* physical addr of desc */
 	dma_addr_t bf_buf_addr;	/* physical addr of data buffer, for DMA */
diff --git a/drivers/net/wireless/ath/ath9k/recv.c b/drivers/net/wireless/ath/ath9k/recv.c
index b2497b8..acdf6ae 100644
--- a/drivers/net/wireless/ath/ath9k/recv.c
+++ b/drivers/net/wireless/ath/ath9k/recv.c
@@ -313,10 +313,13 @@ static void ath_edma_stop_recv(struct ath_softc *sc)
 int ath_rx_init(struct ath_softc *sc, int nbufs)
 {
 	struct ath_common *common = ath9k_hw_common(sc->sc_ah);
-	struct sk_buff *skb;
+	struct page *page;
 	struct ath_buf *bf;
 	int error = 0;
 
+	if (WARN_ON(common->rx_bufsize > PAGE_SIZE))
+		return -EIO;
+
 	spin_lock_init(&sc->sc_pcu_lock);
 	sc->sc_flags &= ~SC_OP_RXFLUSH;
 	spin_lock_init(&sc->rx.rxbuflock);
@@ -342,27 +345,26 @@ int ath_rx_init(struct ath_softc *sc, int nbufs)
 		}
 
 		list_for_each_entry(bf, &sc->rx.rxbuf, list) {
-			skb = ath_rxbuf_alloc(common, common->rx_bufsize,
-					      GFP_KERNEL);
-			if (skb == NULL) {
+			page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0);
+			if (page == NULL) {
 				error = -ENOMEM;
 				goto err;
 			}
 
-			bf->bf_mpdu = skb;
-			bf->bf_buf_addr = dma_map_single(sc->dev, skb->data,
-					common->rx_bufsize,
-					DMA_FROM_DEVICE);
+			bf->bf_mpdu = NULL;
+			bf->bf_buf_addr = dma_map_page(sc->dev, page, 0,
+						       PAGE_SIZE,
+						       DMA_FROM_DEVICE);
 			if (unlikely(dma_mapping_error(sc->dev,
 							bf->bf_buf_addr))) {
-				dev_kfree_skb_any(skb);
-				bf->bf_mpdu = NULL;
+				__free_pages(page, 0);
 				bf->bf_buf_addr = 0;
 				ath_err(common,
 					"dma_mapping_error() on RX init\n");
 				error = -ENOMEM;
 				goto err;
 			}
+			bf->page = page;
 		}
 		sc->rx.rxlink = NULL;
 	}
@@ -378,7 +380,7 @@ void ath_rx_cleanup(struct ath_softc *sc)
 {
 	struct ath_hw *ah = sc->sc_ah;
 	struct ath_common *common = ath9k_hw_common(ah);
-	struct sk_buff *skb;
+	struct page *page;
 	struct ath_buf *bf;
 
 	if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_EDMA) {
@@ -386,14 +388,15 @@ void ath_rx_cleanup(struct ath_softc *sc)
 		return;
 	} else {
 		list_for_each_entry(bf, &sc->rx.rxbuf, list) {
-			skb = bf->bf_mpdu;
-			if (skb) {
-				dma_unmap_single(sc->dev, bf->bf_buf_addr,
-						common->rx_bufsize,
-						DMA_FROM_DEVICE);
-				dev_kfree_skb(skb);
+			page = bf->page;
+			bf->page = NULL;
+
+			if (page) {
+				dma_unmap_page(sc->dev, bf->bf_buf_addr,
+					       PAGE_SIZE,
+					       DMA_FROM_DEVICE);
+				__free_pages(page, 0);
 				bf->bf_buf_addr = 0;
-				bf->bf_mpdu = NULL;
 			}
 		}
 
@@ -663,12 +666,10 @@ static void ath_rx_ps(struct ath_softc *sc, struct sk_buff *skb)
 }
 
 static void ath_rx_send_to_mac80211(struct ieee80211_hw *hw,
-				    struct ath_softc *sc, struct sk_buff *skb)
+				    struct ath_softc *sc,
+				    struct sk_buff *skb,
+				    struct ieee80211_hdr *hdr)
 {
-	struct ieee80211_hdr *hdr;
-
-	hdr = (struct ieee80211_hdr *)skb->data;
-
 	/* Send the frame to mac80211 */
 	if (is_multicast_ether_addr(hdr->addr1)) {
 		int i;
@@ -1037,21 +1038,20 @@ static int ath9k_rx_skb_preprocess(struct ath_common *common,
 }
 
 static void ath9k_rx_skb_postprocess(struct ath_common *common,
-				     struct sk_buff *skb,
+				     struct ieee80211_hdr *hdr,
+				     size_t *len,
 				     struct ath_rx_status *rx_stats,
 				     struct ieee80211_rx_status *rxs,
 				     bool decrypt_error)
 {
 	struct ath_hw *ah = common->ah;
-	struct ieee80211_hdr *hdr;
 	int hdrlen, padpos, padsize;
 	u8 keyix;
 	__le16 fc;
 
 	/* see if any padding is done by the hw and remove it */
-	hdr = (struct ieee80211_hdr *) skb->data;
-	hdrlen = ieee80211_get_hdrlen_from_skb(skb);
 	fc = hdr->frame_control;
+	hdrlen = *len >= 2 ? ieee80211_hdrlen(fc) : 0;
 	padpos = ath9k_cmn_padpos(hdr->frame_control);
 
 	/* The MAC header is padded to have 32-bit boundary if the
@@ -1063,9 +1063,9 @@ static void ath9k_rx_skb_postprocess(struct ath_common *common,
 	 * not try to remove padding from short control frames that do
 	 * not have payload. */
 	padsize = padpos & 3;
-	if (padsize && skb->len>=padpos+padsize+FCS_LEN) {
-		memmove(skb->data + padsize, skb->data, padpos);
-		skb_pull(skb, padsize);
+	if (padsize && *len >= padpos + padsize + FCS_LEN) {
+		memmove(hdr + padsize, hdr, padpos);
+		*len -= padsize;
 	}
 
 	keyix = rx_stats->rs_keyix;
@@ -1074,8 +1074,10 @@ static void ath9k_rx_skb_postprocess(struct ath_common *common,
 	    ieee80211_has_protected(fc)) {
 		rxs->flag |= RX_FLAG_DECRYPTED;
 	} else if (ieee80211_has_protected(fc)
-		   && !decrypt_error && skb->len >= hdrlen + 4) {
-		keyix = skb->data[hdrlen + 3] >> 6;
+		   && !decrypt_error && *len >= hdrlen + 4) {
+		u8 *data = (u8 *)hdr;
+
+		keyix = data[hdrlen + 3] >> 6;
 
 		if (test_bit(keyix, common->keymap))
 			rxs->flag |= RX_FLAG_DECRYPTED;
@@ -1623,7 +1625,8 @@ div_comb_done:
 int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
 {
 	struct ath_buf *bf;
-	struct sk_buff *skb = NULL, *requeue_skb;
+	struct sk_buff *skb = NULL;
+	struct page *requeue_page;
 	struct ieee80211_rx_status *rxs;
 	struct ath_hw *ah = sc->sc_ah;
 	struct ath_common *common = ath9k_hw_common(ah);
@@ -1634,6 +1637,7 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
 	 */
 	struct ieee80211_hw *hw = NULL;
 	struct ieee80211_hdr *hdr;
+	size_t data_len;
 	int retval;
 	bool decrypt_error = false;
 	struct ath_rx_status rs;
@@ -1670,9 +1674,9 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
 		if (!bf)
 			break;
 
-		skb = bf->bf_mpdu;
+		skb = dev_alloc_skb(128);
 		if (!skb)
-			continue;
+			goto requeue;
 
 		hdr = (struct ieee80211_hdr *) (skb->data + rx_status_len);
 		rxs =  IEEE80211_SKB_RXCB(skb);
@@ -1704,41 +1708,40 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
 
 		/* Ensure we always have an skb to requeue once we are done
 		 * processing the current buffer's skb */
-		requeue_skb = ath_rxbuf_alloc(common, common->rx_bufsize, GFP_ATOMIC);
+		requeue_page = alloc_pages(GFP_ATOMIC, 0);
 
 		/* If there is no memory we ignore the current RX'd frame,
 		 * tell hardware it can give us a new frame using the old
 		 * skb and put it at the tail of the sc->rx.rxbuf list for
 		 * processing. */
-		if (!requeue_skb)
+		if (!requeue_page) {
+			dev_kfree_skb_any(skb);
 			goto requeue;
+		}
 
 		/* Unmap the frame */
-		dma_unmap_single(sc->dev, bf->bf_buf_addr,
-				 common->rx_bufsize,
-				 dma_type);
+		dma_unmap_page(sc->dev, bf->bf_buf_addr, PAGE_SIZE, dma_type);
 
-		skb_put(skb, rs.rs_datalen + ah->caps.rx_status_len);
-		if (ah->caps.rx_status_len)
-			skb_pull(skb, ah->caps.rx_status_len);
+		data_len = rs.rs_datalen;
+		ath9k_rx_skb_postprocess(common, page_address(bf->page),
+					 &data_len, &rs, rxs, decrypt_error);
 
-		ath9k_rx_skb_postprocess(common, skb, &rs,
-					 rxs, decrypt_error);
+		skb_add_rx_frag(skb, 0, bf->page, ah->caps.rx_status_len,
+				data_len);
 
 		/* We will now give hardware our shiny new allocated skb */
-		bf->bf_mpdu = requeue_skb;
-		bf->bf_buf_addr = dma_map_single(sc->dev, requeue_skb->data,
-						 common->rx_bufsize,
-						 dma_type);
+		bf->bf_buf_addr = dma_map_page(sc->dev, requeue_page, 0,
+					       PAGE_SIZE, dma_type);
 		if (unlikely(dma_mapping_error(sc->dev,
 			  bf->bf_buf_addr))) {
-			dev_kfree_skb_any(requeue_skb);
-			bf->bf_mpdu = NULL;
+			__free_pages(requeue_page, 0);
 			bf->bf_buf_addr = 0;
 			ath_err(common, "dma_mapping_error() on RX\n");
-			ath_rx_send_to_mac80211(hw, sc, skb);
+			ath_rx_send_to_mac80211(hw, sc, skb,
+						page_address(bf->page));
 			break;
 		}
+		bf->page = requeue_page;
 
 		/*
 		 * change the default rx antenna if rx diversity chooses the
@@ -1763,7 +1766,7 @@ int ath_rx_tasklet(struct ath_softc *sc, int flush, bool hp)
 		if (ah->caps.hw_caps & ATH9K_HW_CAP_ANT_DIV_COMB)
 			ath_ant_comb_scan(sc, &rs);
 
-		ath_rx_send_to_mac80211(hw, sc, skb);
+		ath_rx_send_to_mac80211(hw, sc, skb, page_address(bf->page));
 
 requeue:
 		if (edma) {


^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-09 20:14       ` Christian Lamparter
@ 2011-01-09 20:24         ` Felix Fietkau
  2011-01-10 12:40         ` Jouni Malinen
  1 sibling, 0 replies; 16+ messages in thread
From: Felix Fietkau @ 2011-01-09 20:24 UTC (permalink / raw)
  To: Christian Lamparter
  Cc: Jouni Malinen, Ben Greear, linux-wireless, ath9k-devel

On 2011-01-09 1:14 PM, Christian Lamparter wrote:
> On Sunday 09 January 2011 19:13:04 Jouni Malinen wrote:
>>  On Sat, Jan 08, 2011 at 04:36:23PM -0800, Ben Greear wrote:
>>  >  On 01/08/2011 04:20 PM, Felix Fietkau wrote:
>>  >  >On 2011-01-08 8:33 AM, greearb@candelatech.com wrote:
>>  >  >>From: Ben Greear<greearb@candelatech.com>
>>  >  >>This saves us constantly allocating large, multi-page
>>  >  >>skbs. It should fix the order-1 allocation errors reported,
>>  >  >>and in a 60-vif scenario, this significantly decreases CPU
>>  >  >>utilization, and latency, and increases bandwidth.
>>
>>  As far as CPU use is concerned, 60 VIF scenario should not be the one to
>>  use for checking what is most efficient.. This really needs to be tested
>>  on something that uses a single VIF on an embedded (low-power CPU)..
>>
>>  For the order-1 allocation issues, it would be interesting to see if
>>  someone could take a look at using paged skbs or multiple RX descriptors
>>  with shorter skbs (and copying only for the case where a long frame is
>>  received so that only the A-MSDU RX case would suffer from extra
>>  copying).
>
> Well, here's a shot at paged rx. It'll only work when PAGE_SIZE>  buf_size
> (which will be true for most system, I guess)
>
> No idea how to handle EDMA... In fact I don't have any ath9k* solution
> at the moment to test ;), so you better backup your data!
I think paged rx would be quite problematic for performance on embedded 
hardware that way. If I read the networking stack correctly, this would 
trigger skb_linearize() for network drivers/devices that cannot do 
scatter/gather IO when packets are forwarded between interfaces (the 
most common use case for OpenWrt).
It might be possible to avoid this by changing the network stack to 
bypass the linearize if there is only data in the page attached to the 
skb, but I don't know how easy that is to add, nor what corner cases I'd 
need to take care of.

- Felix

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-09 18:13     ` Jouni Malinen
  2011-01-09 20:14       ` Christian Lamparter
@ 2011-01-10  4:32       ` Ben Greear
  1 sibling, 0 replies; 16+ messages in thread
From: Ben Greear @ 2011-01-10  4:32 UTC (permalink / raw)
  To: Jouni Malinen; +Cc: Felix Fietkau, linux-wireless, ath9k-devel

On 01/09/2011 10:13 AM, Jouni Malinen wrote:
> On Sat, Jan 08, 2011 at 04:36:23PM -0800, Ben Greear wrote:
>> On 01/08/2011 04:20 PM, Felix Fietkau wrote:
>>> On 2011-01-08 8:33 AM, greearb@candelatech.com wrote:
>>>> From: Ben Greear<greearb@candelatech.com>
>>>> This saves us constantly allocating large, multi-page
>>>> skbs. It should fix the order-1 allocation errors reported,
>>>> and in a 60-vif scenario, this significantly decreases CPU
>>>> utilization, and latency, and increases bandwidth.
>
> As far as CPU use is concerned, 60 VIF scenario should not be the one to
> use for checking what is most efficient.. This really needs to be tested
> on something that uses a single VIF on an embedded (low-power CPU)..
>
> For the order-1 allocation issues, it would be interesting to see if
> someone could take a look at using paged skbs or multiple RX descriptors
> with shorter skbs (and copying only for the case where a long frame is
> received so that only the A-MSDU RX case would suffer from extra
> copying).

>
>> I see a serious performance improvement with this patch.  My current test is sending 1024 byte UDP
>> payloads to/from each of 60 stations at 128kbps.  Please do try it out on your system and see how
>> it performs there.  I'm guessing that any time you have more than 1 VIF this will be a good
>> improvement since mac80211 does skb_copy (and you would typically be copying a much smaller
>> packet with this patch).
>
> How would this patch change the number of bytes copied by skb_copy?

It seems that if you allocate a 2-page SKB, as upstream ath9k does, pass that
up the stack, then if/when anything calls 'skb_copy' it allocates a new skb with
2 pages even if the actual data-length is much smaller.

This copy wouldn't be so bad for single VIF scenarios (which means probably no copying),
but you still end up exhausting the order-1 memory buffer pool with lots of big skbs
floating around the system.  Note that the original bug was not filed by me
and happened on some embedded device, though I also see memory exhaustion in my
tests with upstream code.

>
>> If we do see performance differences on different platforms, this could perhaps be
>> something we could tune at run-time.
>
> I guess that could be looked at, but as long as that is not the case,
> the test setup you used is not exactly the most common case for ath9k in
> the upstream kernel and should not be used to figure out default
> behavior.

True, but I also like the protection this should offer against stray
DMA that this chipset/driver seems capable of.

I'm curious if anyone has any stats at all as far as ath9k performance goes?

Thanks,
Ben

-- 
Ben Greear <greearb@candelatech.com>
Candela Technologies Inc  http://www.candelatech.com

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-09 17:49   ` Ben Greear
@ 2011-01-10  7:14     ` Gabor Juhos
  0 siblings, 0 replies; 16+ messages in thread
From: Gabor Juhos @ 2011-01-10  7:14 UTC (permalink / raw)
  To: Ben Greear; +Cc: linux-wireless, ath9k-devel

2011.01.09. 18:49 keltezéssel, Ben Greear írta:
> On 01/09/2011 12:00 AM, Gabor Juhos wrote:
>> Hi Ben,
>>
>>> From: Ben Greear<greearb@candelatech.com>
>>>
>>> This saves us constantly allocating large, multi-page
>>> skbs.  It should fix the order-1 allocation errors reported,
>>> and in a 60-vif scenario, this significantly decreases CPU
>>> utilization, and latency, and increases bandwidth.
>>>
>>> Signed-off-by: Ben Greear<greearb@candelatech.com>
>>> ---
>>> :100644 100644 b2497b8... ea2f67c... M    drivers/net/wireless/ath/ath9k/recv.c
>>>   drivers/net/wireless/ath/ath9k/recv.c |   92 ++++++++++++++++++++++-----------
>>>   1 files changed, 61 insertions(+), 31 deletions(-)
>>
>> <...>
>>
>>> +        if (use_copybreak) {
>>> +            struct pci_dev *pdev = to_pci_dev(sc->dev);
>>
>> This would cause undefined behaviour with ath9k devices sitting on an AHB bus.
>>
>>> +            pci_dma_sync_single_for_cpu(pdev, bf->bf_buf_addr,
>>> +                            len, PCI_DMA_FROMDEVICE);
>>> +            skb_copy_from_linear_data(bf->bf_mpdu, skb->data, len);
>>> +            pci_dma_sync_single_for_device(pdev, bf->bf_buf_addr,
>>> +                               len, PCI_DMA_FROMDEVICE);
>>
>> Please use the bus agnostic equivalents of these DMA functions.
> 
> Any idea what that might be?

Invalid/null pointer dereference probably. The problem is that sc->dev is
pointing to a device structure inside a platform_device structure when it is not
PCI device. Converting sc->dev to 'struct *pci_dev' and using the result as a
parameter for a PCI specific function is not correct in this case.

> Should we just disable copybreak for things on AHB bus?

We should not disable it, order-1 allocation failures are present there as well.

-Gabor

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH] ath9k:  Implement rx copy-break.
  2011-01-09 20:14       ` Christian Lamparter
  2011-01-09 20:24         ` Felix Fietkau
@ 2011-01-10 12:40         ` Jouni Malinen
  1 sibling, 0 replies; 16+ messages in thread
From: Jouni Malinen @ 2011-01-10 12:40 UTC (permalink / raw)
  To: Christian Lamparter
  Cc: Ben Greear, Felix Fietkau, linux-wireless, ath9k-devel

On Sun, Jan 09, 2011 at 09:14:53PM +0100, Christian Lamparter wrote:
> On Sunday 09 January 2011 19:13:04 Jouni Malinen wrote:
> > For the order-1 allocation issues, it would be interesting to see if
> > someone could take a look at using paged skbs or multiple RX descriptors
> > with shorter skbs (and copying only for the case where a long frame is
> > received so that only the A-MSDU RX case would suffer from extra
> > copying).
> 
> Well, here's a shot at paged rx. It'll only work when PAGE_SIZE > buf_size
> (which will be true for most system, I guess)

Thanks!

For the multiple RX descriptors (fragmented buffer for DMA) part, here's
a very preliminary patch to show how it could be done in ath9k for
anyone who might want to experiment more in this area. This version is
obviously only helping with large allocations (it uses half the buffer
size). The extra copying is there for large A-MSDU case. Though, I did
add some concept code to use skb frag_list to allow such a frame be
delivered to mac80211 without needing any extra allocation/copying. That
is commented out for now, if mac80211 can be made to handle A-MSDU RX
processing with fragmented data (which should be relatively simple
change, I'd hope), there should be no need for doing the extra copy in
ath9k. (And likewise, I was too lazy to handle the EDMA case for now..)

The version here is limited to at maximum two fragments. If desired,
this could be further extended to handle multiple fragments to get the
default RX skb allocation shorter than 2k (i.e., to cover the normal
1500 MTU). I'm not sure whether that would result in noticeable benefit,
but it could help in saving some memory when there are multiple skbs
queued in various places in the networking stack. Likewise, the maximum
A-MSDU receive length could be extended with this approach without
having to use even longer individual buffers for RX.

---
 drivers/net/wireless/ath/ath9k/hw.h   |    2 
 drivers/net/wireless/ath/ath9k/main.c |    5 +
 drivers/net/wireless/ath/ath9k/recv.c |  102 +++++++++++++++++++++++++++++-----
 3 files changed, 95 insertions(+), 14 deletions(-)

--- wireless-testing.orig/drivers/net/wireless/ath/ath9k/hw.h	2011-01-10 13:03:16.000000000 +0200
+++ wireless-testing/drivers/net/wireless/ath/ath9k/hw.h	2011-01-10 13:15:39.000000000 +0200
@@ -852,6 +852,8 @@ struct ath_hw {
 
 	/* Enterprise mode cap */
 	u32 ent_mode;
+
+	struct sk_buff *rx_frag;
 };
 
 static inline struct ath_common *ath9k_hw_common(struct ath_hw *ah)
--- wireless-testing.orig/drivers/net/wireless/ath/ath9k/main.c	2011-01-10 13:14:26.000000000 +0200
+++ wireless-testing/drivers/net/wireless/ath/ath9k/main.c	2011-01-10 13:15:05.000000000 +0200
@@ -1320,6 +1320,11 @@ static void ath9k_stop(struct ieee80211_
 	} else
 		sc->rx.rxlink = NULL;
 
+	if (ah->rx_frag) {
+		dev_kfree_skb_any(ah->rx_frag);
+		ah->rx_frag = NULL;
+	}
+
 	/* disable HAL and put h/w to sleep */
 	ath9k_hw_disable(ah);
 	ath9k_hw_configpcipowersave(ah, 1, 1);
--- wireless-testing.orig/drivers/net/wireless/ath/ath9k/recv.c	2011-01-10 12:24:08.000000000 +0200
+++ wireless-testing/drivers/net/wireless/ath/ath9k/recv.c	2011-01-10 14:29:29.000000000 +0200
@@ -324,8 +324,19 @@ int ath_rx_init(struct ath_softc *sc, in
 	if (sc->sc_ah->caps.hw_caps & ATH9K_HW_CAP_EDMA) {
 		return ath_rx_edma_init(sc, nbufs);
 	} else {
-		common->rx_bufsize = roundup(IEEE80211_MAX_MPDU_LEN,
+		/*
+		 * Use a buffer that allows a full frame to be received in at
+		 * most two buffers with scattering DMA. This is needed for
+		 * A-MSDU; other cases will fit in a single buffer. This will
+		 * allow the skbs to fit in a single page to avoid issues with
+		 * higher order allocation.
+		 */
+		common->rx_bufsize = roundup(IEEE80211_MAX_MPDU_LEN / 2,
 				min(common->cachelsz, (u16)64));
+#if 0 /* TESTING - force two fragments even without A-MSDU */
+		common->rx_bufsize = roundup(2800 / 2,
+				min(common->cachelsz, (u16)64));
+#endif
 
 		ath_dbg(common, ATH_DBG_CONFIG, "cachelsz %u rxbufsize %u\n",
 			common->cachelsz, common->rx_bufsize);
@@ -863,16 +874,6 @@ static bool ath9k_rx_accept(struct ath_c
 		return false;
 
 	/*
-	 * rs_more indicates chained descriptors which can be used
-	 * to link buffers together for a sort of scatter-gather
-	 * operation.
-	 * reject the frame, we don't support scatter-gather yet and
-	 * the frame is probably corrupt anyway
-	 */
-	if (rx_stats->rs_more)
-		return false;
-
-	/*
 	 * The rx_stats->rs_status will not be set until the end of the
 	 * chained descriptors so it can be ignored if rs_more is set. The
 	 * rs_more will be false at the last element of the chained
@@ -1022,6 +1023,9 @@ static int ath9k_rx_skb_preprocess(struc
 	if (!ath9k_rx_accept(common, hdr, rx_status, rx_stats, decrypt_error))
 		return -EINVAL;
 
+	if (rx_stats->rs_more)
+		return 0; /* Only use data from the last fragment */
+
 	ath9k_process_rssi(common, hw, hdr, rx_stats);
 
 	if (ath9k_process_rate(common, hw, rx_stats, rx_status))
@@ -1674,7 +1678,16 @@ int ath_rx_tasklet(struct ath_softc *sc,
 		if (!skb)
 			continue;
 
-		hdr = (struct ieee80211_hdr *) (skb->data + rx_status_len);
+		/*
+		 * Take frame header from the first fragment and RX status from
+		 * the last one.
+		 */
+		if (ah->rx_frag)
+			hdr = (struct ieee80211_hdr *)
+				(ah->rx_frag->data + rx_status_len);
+		else
+			hdr = (struct ieee80211_hdr *)
+				(skb->data + rx_status_len);
 		rxs =  IEEE80211_SKB_RXCB(skb);
 
 		hw = ath_get_virt_hw(sc, hdr);
@@ -1718,12 +1731,14 @@ int ath_rx_tasklet(struct ath_softc *sc,
 				 common->rx_bufsize,
 				 dma_type);
 
+		/* FIX: for fragmented frames, only one rx_status_len */
 		skb_put(skb, rs.rs_datalen + ah->caps.rx_status_len);
 		if (ah->caps.rx_status_len)
 			skb_pull(skb, ah->caps.rx_status_len);
 
-		ath9k_rx_skb_postprocess(common, skb, &rs,
-					 rxs, decrypt_error);
+		if (!ah->rx_frag && !rs.rs_more)
+			ath9k_rx_skb_postprocess(common, skb, &rs,
+						 rxs, decrypt_error);
 
 		/* We will now give hardware our shiny new allocated skb */
 		bf->bf_mpdu = requeue_skb;
@@ -1740,6 +1755,65 @@ int ath_rx_tasklet(struct ath_softc *sc,
 			break;
 		}
 
+		if (rs.rs_more) {
+			/*
+			 * rs_more indicates chained descriptors which can be
+			 * used to link buffers together for a sort of
+			 * scatter-gather operation.
+			 */
+			if (ah->rx_frag) {
+				printk(KERN_DEBUG "%s:dropped prev rx_frag\n",
+				       __func__);
+				dev_kfree_skb_any(ah->rx_frag);
+			}
+			ah->rx_frag = skb;
+			goto requeue;
+		}
+		if (ah->rx_frag) {
+#if 1
+			struct sk_buff *nskb;
+			/*
+			 * This is the final fragment of the frame - merge with
+			 * previously received data.
+			 */
+			nskb = skb_copy_expand(ah->rx_frag, 0, skb->len,
+					       GFP_ATOMIC);
+			dev_kfree_skb_any(ah->rx_frag);
+			ah->rx_frag = NULL;
+			if (nskb == NULL) {
+				dev_kfree_skb_any(skb);
+				goto requeue;
+			}
+			skb_copy_from_linear_data(skb, skb_put(nskb, skb->len),
+						  skb->len);
+			/* Take RX status for the last fragment */
+			memcpy(nskb->cb, skb->cb, sizeof(skb->cb));
+			dev_kfree_skb_any(skb);
+			skb = nskb;
+#else
+			/*
+			 * TODO: This should really be optimized by using
+			 * skb frag_list etc. to avoid new allocation and
+			 * copying of data here. The fragmentation case will
+			 * only happen when receiving A-MSDU aggregates and
+			 * mac80211 will end up re-allocating and splitting
+			 * those anyway. Once mac80211 is able to do this based
+			 * on multiple fragments, alloc+copy code above can
+			 * be replaced with something like following.
+			 */
+			/* Take RX status for the last fragment */
+			memcpy(ah->rx_frag->cb, skb->cb, sizeof(skb->cb));
+			/* Link the fragments together using frag_list */
+			skb_frag_list_init(ah->rx_frag);
+			skb_frag_add_head(ah->rx_frag, skb);
+			skb = ah->rx_frag;
+			ah->rx_frag = NULL;
+#endif
+			rxs = IEEE80211_SKB_RXCB(skb);
+			ath9k_rx_skb_postprocess(common, skb, &rs,
+						 rxs, decrypt_error);
+		}
+
 		/*
 		 * change the default rx antenna if rx diversity chooses the
 		 * other antenna 3 times in a row.

-- 
Jouni Malinen                                            PGP id EFC895FA

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2011-01-10 12:40 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2011-01-08 15:33 [PATCH] ath9k: Implement rx copy-break greearb
2011-01-09  0:20 ` Felix Fietkau
2011-01-09  0:36   ` Ben Greear
2011-01-09  0:41     ` Felix Fietkau
2011-01-09  1:06       ` Ben Greear
2011-01-09 14:15         ` Björn Smedman
2011-01-09 14:18           ` Felix Fietkau
2011-01-09 15:35             ` Björn Smedman
2011-01-09 18:13     ` Jouni Malinen
2011-01-09 20:14       ` Christian Lamparter
2011-01-09 20:24         ` Felix Fietkau
2011-01-10 12:40         ` Jouni Malinen
2011-01-10  4:32       ` Ben Greear
2011-01-09  8:00 ` Gabor Juhos
2011-01-09 17:49   ` Ben Greear
2011-01-10  7:14     ` Gabor Juhos

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).