linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] net: alx: use custom skb allocator
@ 2016-05-25  6:49 Feng Tang
  2016-05-25 23:53 ` David Miller
  0 siblings, 1 reply; 6+ messages in thread
From: Feng Tang @ 2016-05-25  6:49 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Johannes Berg, Jarod Wilson,
	netdev, linux-kernel
  Cc: Feng Tang

This patch follows Eric Dumazet's commit 7b70176421 for Atheros
atl1c driver to fix one exactly same bug in alx driver, that the
network link will be lost in 1-5 minutes after the device is up.

My laptop Lenovo Y580 with Atheros AR8161 ethernet device hit the
same problem with kernel 4.4, and it will be cured by Jarod Wilson's
commit c406700c for alx driver which get merged in 4.5. But there
are still some alx devices can't function well even with Jarod's
patch, while this patch could make them work fine. More details on
	https://bugzilla.kernel.org/show_bug.cgi?id=70761

The debug shows the issue is very likely to be related with the RX
DMA address, specifically 0x...f80, if RX buffer get 0x...f80 several
times, their will be RX overflow error and device will stop working.

For kernel 4.5.0 with Jarod's patch which works fine with my
AR8161/Lennov Y580, if I made some change to the
	__netdev_alloc_skb
		--> __alloc_page_frag()
to make the allocated buffer can get an address with 0x...f80,
then the same error happens. If I make it to 0x...f40 or 0x....fc0,
everything will be still fine. So I tend to believe that the
0x..f80 address cause the silicon to behave abnormally.

Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=70761
Cc: Eric Dumazet <edumazet@google.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: Jarod Wilson <jarod@redhat.com>
Signed-off-by: Feng Tang <feng.tang@intel.com>
Tested-by: Ole Lukoie <olelukoie@mail.ru>
---
 drivers/net/ethernet/atheros/alx/alx.h  |  4 +++
 drivers/net/ethernet/atheros/alx/main.c | 48 ++++++++++++++++++++++++++++++++-
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/atheros/alx/alx.h b/drivers/net/ethernet/atheros/alx/alx.h
index 8fc93c5..d02c424 100644
--- a/drivers/net/ethernet/atheros/alx/alx.h
+++ b/drivers/net/ethernet/atheros/alx/alx.h
@@ -96,6 +96,10 @@ struct alx_priv {
 	unsigned int rx_ringsz;
 	unsigned int rxbuf_size;
 
+	struct page  *rx_page;
+	unsigned int rx_page_offset;
+	unsigned int rx_frag_size;
+
 	struct napi_struct napi;
 	struct alx_tx_queue txq;
 	struct alx_rx_queue rxq;
diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
index 9fe8b5e..c98acdc 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -70,6 +70,35 @@ static void alx_free_txbuf(struct alx_priv *alx, int entry)
 	}
 }
 
+static struct sk_buff *alx_alloc_skb(struct alx_priv *alx, gfp_t gfp)
+{
+	struct sk_buff *skb;
+	struct page *page;
+
+	if (alx->rx_frag_size > PAGE_SIZE)
+		return __netdev_alloc_skb(alx->dev, alx->rxbuf_size, gfp);
+
+	page = alx->rx_page;
+	if (!page) {
+		alx->rx_page = page = alloc_page(gfp);
+		if (unlikely(!page))
+			return NULL;
+		alx->rx_page_offset = 0;
+	}
+
+	skb = build_skb(page_address(page) + alx->rx_page_offset,
+			alx->rx_frag_size);
+	if (likely(skb)) {
+		alx->rx_page_offset += alx->rx_frag_size;
+		if (alx->rx_page_offset >= PAGE_SIZE)
+			alx->rx_page = NULL;
+		else
+			get_page(page);
+	}
+	return skb;
+}
+
+
 static int alx_refill_rx_ring(struct alx_priv *alx, gfp_t gfp)
 {
 	struct alx_rx_queue *rxq = &alx->rxq;
@@ -86,7 +115,7 @@ static int alx_refill_rx_ring(struct alx_priv *alx, gfp_t gfp)
 	while (!cur_buf->skb && next != rxq->read_idx) {
 		struct alx_rfd *rfd = &rxq->rfd[cur];
 
-		skb = __netdev_alloc_skb(alx->dev, alx->rxbuf_size, gfp);
+		skb = alx_alloc_skb(alx, gfp);
 		if (!skb)
 			break;
 		dma = dma_map_single(&alx->hw.pdev->dev,
@@ -124,6 +153,7 @@ static int alx_refill_rx_ring(struct alx_priv *alx, gfp_t gfp)
 		alx_write_mem16(&alx->hw, ALX_RFD_PIDX, cur);
 	}
 
+
 	return count;
 }
 
@@ -592,6 +622,11 @@ static void alx_free_rings(struct alx_priv *alx)
 	kfree(alx->txq.bufs);
 	kfree(alx->rxq.bufs);
 
+	if (alx->rx_page) {
+		put_page(alx->rx_page);
+		alx->rx_page = NULL;
+	}
+
 	dma_free_coherent(&alx->hw.pdev->dev,
 			  alx->descmem.size,
 			  alx->descmem.virt,
@@ -646,6 +681,7 @@ static int alx_request_irq(struct alx_priv *alx)
 				  alx->dev->name, alx);
 		if (!err)
 			goto out;
+
 		/* fall back to legacy interrupt */
 		pci_disable_msi(alx->hw.pdev);
 	}
@@ -689,6 +725,7 @@ static int alx_init_sw(struct alx_priv *alx)
 	struct pci_dev *pdev = alx->hw.pdev;
 	struct alx_hw *hw = &alx->hw;
 	int err;
+	unsigned int head_size;
 
 	err = alx_identify_hw(alx);
 	if (err) {
@@ -704,7 +741,12 @@ static int alx_init_sw(struct alx_priv *alx)
 
 	hw->smb_timer = 400;
 	hw->mtu = alx->dev->mtu;
+
 	alx->rxbuf_size = ALX_MAX_FRAME_LEN(hw->mtu);
+	head_size = SKB_DATA_ALIGN(alx->rxbuf_size + NET_SKB_PAD) +
+		    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	alx->rx_frag_size = roundup_pow_of_two(head_size);
+
 	alx->tx_ringsz = 256;
 	alx->rx_ringsz = 512;
 	hw->imt = 200;
@@ -806,6 +848,7 @@ static int alx_change_mtu(struct net_device *netdev, int mtu)
 {
 	struct alx_priv *alx = netdev_priv(netdev);
 	int max_frame = ALX_MAX_FRAME_LEN(mtu);
+	unsigned int head_size;
 
 	if ((max_frame < ALX_MIN_FRAME_SIZE) ||
 	    (max_frame > ALX_MAX_FRAME_SIZE))
@@ -817,6 +860,9 @@ static int alx_change_mtu(struct net_device *netdev, int mtu)
 	netdev->mtu = mtu;
 	alx->hw.mtu = mtu;
 	alx->rxbuf_size = max(max_frame, ALX_DEF_RXBUF_SIZE);
+	head_size = SKB_DATA_ALIGN(alx->rxbuf_size + NET_SKB_PAD) +
+		    SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	alx->rx_frag_size = roundup_pow_of_two(head_size);
 	netdev_update_features(netdev);
 	if (netif_running(netdev))
 		alx_reinit(alx);
-- 
2.5.0

^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] net: alx: use custom skb allocator
  2016-05-25  6:49 [PATCH v2] net: alx: use custom skb allocator Feng Tang
@ 2016-05-25 23:53 ` David Miller
  2016-05-26  8:41   ` Feng Tang
  0 siblings, 1 reply; 6+ messages in thread
From: David Miller @ 2016-05-25 23:53 UTC (permalink / raw)
  To: feng.tang; +Cc: edumazet, johannes, jarod, netdev, linux-kernel

From: Feng Tang <feng.tang@intel.com>
Date: Wed, 25 May 2016 14:49:54 +0800

> This patch follows Eric Dumazet's commit 7b70176421 for Atheros
> atl1c driver to fix one exactly same bug in alx driver, that the
> network link will be lost in 1-5 minutes after the device is up.
> 
> My laptop Lenovo Y580 with Atheros AR8161 ethernet device hit the
> same problem with kernel 4.4, and it will be cured by Jarod Wilson's
> commit c406700c for alx driver which get merged in 4.5. But there
> are still some alx devices can't function well even with Jarod's
> patch, while this patch could make them work fine. More details on
> 	https://bugzilla.kernel.org/show_bug.cgi?id=70761
> 
> The debug shows the issue is very likely to be related with the RX
> DMA address, specifically 0x...f80, if RX buffer get 0x...f80 several
> times, their will be RX overflow error and device will stop working.
> 
> For kernel 4.5.0 with Jarod's patch which works fine with my
> AR8161/Lennov Y580, if I made some change to the
> 	__netdev_alloc_skb
> 		--> __alloc_page_frag()
> to make the allocated buffer can get an address with 0x...f80,
> then the same error happens. If I make it to 0x...f40 or 0x....fc0,
> everything will be still fine. So I tend to believe that the
> 0x..f80 address cause the silicon to behave abnormally.
> 
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=70761
> Cc: Eric Dumazet <edumazet@google.com>
> Cc: Johannes Berg <johannes@sipsolutions.net>
> Cc: Jarod Wilson <jarod@redhat.com>
> Signed-off-by: Feng Tang <feng.tang@intel.com>
> Tested-by: Ole Lukoie <olelukoie@mail.ru>

Looks good, applied, thanks.

But now that we have at least two instances of this code we really
need to put a common version somewhere. :-/

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] net: alx: use custom skb allocator
  2016-05-25 23:53 ` David Miller
@ 2016-05-26  8:41   ` Feng Tang
  2016-05-26 13:08     ` Eric Dumazet
  2016-05-26 19:31     ` David Miller
  0 siblings, 2 replies; 6+ messages in thread
From: Feng Tang @ 2016-05-26  8:41 UTC (permalink / raw)
  To: David Miller; +Cc: edumazet, johannes, jarod, netdev, linux-kernel

On Wed, May 25, 2016 at 07:53:41PM -0400, David Miller wrote:
> From: Feng Tang <feng.tang@intel.com>
> Date: Wed, 25 May 2016 14:49:54 +0800
> 
> > This patch follows Eric Dumazet's commit 7b70176421 for Atheros
> > atl1c driver to fix one exactly same bug in alx driver, that the
> > network link will be lost in 1-5 minutes after the device is up.
> > 
> > My laptop Lenovo Y580 with Atheros AR8161 ethernet device hit the
> > same problem with kernel 4.4, and it will be cured by Jarod Wilson's
> > commit c406700c for alx driver which get merged in 4.5. But there
> > are still some alx devices can't function well even with Jarod's
> > patch, while this patch could make them work fine. More details on
> > 	https://bugzilla.kernel.org/show_bug.cgi?id=70761
> > 
> > The debug shows the issue is very likely to be related with the RX
> > DMA address, specifically 0x...f80, if RX buffer get 0x...f80 several
> > times, their will be RX overflow error and device will stop working.
> > 
> > For kernel 4.5.0 with Jarod's patch which works fine with my
> > AR8161/Lennov Y580, if I made some change to the
> > 	__netdev_alloc_skb
> > 		--> __alloc_page_frag()
> > to make the allocated buffer can get an address with 0x...f80,
> > then the same error happens. If I make it to 0x...f40 or 0x....fc0,
> > everything will be still fine. So I tend to believe that the
> > 0x..f80 address cause the silicon to behave abnormally.
> > 
> > Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=70761
> > Cc: Eric Dumazet <edumazet@google.com>
> > Cc: Johannes Berg <johannes@sipsolutions.net>
> > Cc: Jarod Wilson <jarod@redhat.com>
> > Signed-off-by: Feng Tang <feng.tang@intel.com>
> > Tested-by: Ole Lukoie <olelukoie@mail.ru>
> 
> Looks good, applied, thanks.

Thanks for reviewing and taking it.

> 
> But now that we have at least two instances of this code we really
> need to put a common version somewhere. :-/

I agree, and furthermore I noticed there are some similar routines
in the 4 individual Atheros drivers atlx/alx/atl1c/atl1e, which may
be unified by a simple framework for them all. Maybe the driver
maintainer from Atheros could take a look, as they can reach all
the real HWs :)

Thanks,
Feng

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] net: alx: use custom skb allocator
  2016-05-26  8:41   ` Feng Tang
@ 2016-05-26 13:08     ` Eric Dumazet
  2016-05-30  6:57       ` Feng Tang
  2016-05-26 19:31     ` David Miller
  1 sibling, 1 reply; 6+ messages in thread
From: Eric Dumazet @ 2016-05-26 13:08 UTC (permalink / raw)
  To: Feng Tang; +Cc: David Miller, edumazet, johannes, jarod, netdev, linux-kernel

On Thu, 2016-05-26 at 16:41 +0800, Feng Tang wrote:
> On Wed, May 25, 2016 at 07:53:41PM -0400, David Miller wrote:

> > 
> > But now that we have at least two instances of this code we really
> > need to put a common version somewhere. :-/
> 
> I agree, and furthermore I noticed there are some similar routines
> in the 4 individual Atheros drivers atlx/alx/atl1c/atl1e, which may
> be unified by a simple framework for them all. Maybe the driver
> maintainer from Atheros could take a look, as they can reach all
> the real HWs :)

Note that you could also use the napi_get_frags() API that other drivers
use, and you attach page frags to the skb.

(Ie use small skb->head allocations where the stack will pull the
headers, but use your own page based allocations for the frames)

This might allow you to use the page reuse trick that some Intel drivers
use.

Look for igb_can_reuse_rx_page() for an example.

Thanks.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] net: alx: use custom skb allocator
  2016-05-26  8:41   ` Feng Tang
  2016-05-26 13:08     ` Eric Dumazet
@ 2016-05-26 19:31     ` David Miller
  1 sibling, 0 replies; 6+ messages in thread
From: David Miller @ 2016-05-26 19:31 UTC (permalink / raw)
  To: feng.tang; +Cc: edumazet, johannes, jarod, netdev, linux-kernel

From: Feng Tang <feng.tang@intel.com>
Date: Thu, 26 May 2016 16:41:55 +0800

> Maybe the driver maintainer from Atheros could take a look, as they
> can reach all the real HWs :)

Don't hold your breath.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] net: alx: use custom skb allocator
  2016-05-26 13:08     ` Eric Dumazet
@ 2016-05-30  6:57       ` Feng Tang
  0 siblings, 0 replies; 6+ messages in thread
From: Feng Tang @ 2016-05-30  6:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David Miller, edumazet, johannes, jarod, netdev, linux-kernel

Hi Eric,

On Thu, May 26, 2016 at 06:08:22AM -0700, Eric Dumazet wrote:
> On Thu, 2016-05-26 at 16:41 +0800, Feng Tang wrote:
> > On Wed, May 25, 2016 at 07:53:41PM -0400, David Miller wrote:
> 
> > > 
> > > But now that we have at least two instances of this code we really
> > > need to put a common version somewhere. :-/
> > 
> > I agree, and furthermore I noticed there are some similar routines
> > in the 4 individual Atheros drivers atlx/alx/atl1c/atl1e, which may
> > be unified by a simple framework for them all. Maybe the driver
> > maintainer from Atheros could take a look, as they can reach all
> > the real HWs :)
> 
> Note that you could also use the napi_get_frags() API that other drivers
> use, and you attach page frags to the skb.
> 
> (Ie use small skb->head allocations where the stack will pull the
> headers, but use your own page based allocations for the frames)
> 
> This might allow you to use the page reuse trick that some Intel drivers
> use.
> 
> Look for igb_can_reuse_rx_page() for an example.

Thanks for the detail hints.

When working on unifying the 2 same fixes in atl1c and alx driver, I gave
the problem a second thought. As the problem is due to a HW problem
(most likely), which is just about the DMA RX address, that if the addr is
close to the page boundary: 0x...f80(raw skb->data, and actual DMA
addr set to HW is 0x...fc0), the dma will fail with RX overflow error. And
this is really not the fault of current skb allocation APIs.

Maybe we can simply drop the skb with 0x...f80 address, and re-apply a new
one. This is a workaroun of HW issue of some Atheros devices, and it has
some advantages:
1. We don't need to create a custom skb allocator.
2. The performance effect is small. First the 0x..f80 skb only happens with
some MTU. Secondly even for where the 0x..f80 could happen, it will only get
about 1/16 chance, as the __netdev_alloc_dev will make 16 2KB skb for each
32KB batch buffer allocation

This workaround works fine on my Y580, and I will send it to bugzlla for
others' try.

Thanks,
Feng

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2016-05-30  6:55 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-05-25  6:49 [PATCH v2] net: alx: use custom skb allocator Feng Tang
2016-05-25 23:53 ` David Miller
2016-05-26  8:41   ` Feng Tang
2016-05-26 13:08     ` Eric Dumazet
2016-05-30  6:57       ` Feng Tang
2016-05-26 19:31     ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).