netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] bnx2x: Alloc 4k fragment for each rx ring buffer element
@ 2015-04-24 20:45 Gabriel Krisman Bertazi
  2015-04-24 23:24 ` Lino Sanfilippo
  0 siblings, 1 reply; 17+ messages in thread
From: Gabriel Krisman Bertazi @ 2015-04-24 20:45 UTC (permalink / raw)
  To: ariel.elior; +Cc: netdev, cascardo, cascardo, brking, Gabriel Krisman Bertazi

> This code assumes that pages are consumed by the device in order. This
> is not true.  The pages are consumed according to packet arrival
> order, which can be from different aggregations.

Thanks for your review.  Here is a new version that fixes the issue you
pointed out, as well as some other things that I think might be problems.

To fix the issue you presented, I made the memory pool local for each rx
queue, so we don't need to worry about elements coming from different
queues, and also made sure we deallocate fragments separately.

I also modified the allocation code to hold a reference for the pages
that are currently being fragmented, so we can be sure it won't be freed
before using the whole page.

If you have any other concerns regarding this fix, please, let me know.

-- >8 --
Subject: [PATCH] bnx2x: Alloc 4k fragment for each rx ring buffer element

The driver allocates one page for each buffer on the rx ring, which is
too much on architectures like ppc64 and can cause unexpected allocation
failures when the system is under stress.  Now, we keep a memory pool
for each rx queue, and if the architecture's PAGE_SIZE is greater than
4k, we fragment pages and assign each 4k segment to a ring element,
which reduces the overall memory consumption on such architectures.
This helps avoid errors like the example below:

[bnx2x_alloc_rx_sge:435(eth1)]Can't alloc sge
[c00000037ffeb900] [d000000075eddeb4] .bnx2x_alloc_rx_sge+0x44/0x200 [bnx2x]
[c00000037ffeb9b0] [d000000075ee0b34] .bnx2x_fill_frag_skb+0x1ac/0x460 [bnx2x]
[c00000037ffebac0] [d000000075ee11f0] .bnx2x_tpa_stop+0x160/0x2e8 [bnx2x]
[c00000037ffebb90] [d000000075ee1560] .bnx2x_rx_int+0x1e8/0xc30 [bnx2x]
[c00000037ffebcd0] [d000000075ee2084] .bnx2x_poll+0xdc/0x3d8 [bnx2x] (unreliable)

Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
---

 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h     | 19 ++++++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 60 +++++++++++++++++--------
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 33 ++++++++++++--
 3 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 4085c4b..d0c8ed0 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -356,6 +356,8 @@ struct sw_tx_bd {
 
 struct sw_rx_page {
 	struct page	*page;
+	int		len;
+	int		offset;
 	DEFINE_DMA_UNMAP_ADDR(mapping);
 };
 
@@ -381,9 +383,10 @@ union db_prod {
 
 #define PAGES_PER_SGE_SHIFT	0
 #define PAGES_PER_SGE		(1 << PAGES_PER_SGE_SHIFT)
-#define SGE_PAGE_SIZE		PAGE_SIZE
-#define SGE_PAGE_SHIFT		PAGE_SHIFT
-#define SGE_PAGE_ALIGN(addr)	PAGE_ALIGN((typeof(PAGE_SIZE))(addr))
+#define SGE_PAGE_SHIFT		12
+#define SGE_PAGE_SIZE		(1 << SGE_PAGE_SHIFT)
+#define SGE_PAGE_MASK		(~(SGE_PAGE_SIZE - 1))
+#define SGE_PAGE_ALIGN(addr)	(((addr) + SGE_PAGE_SIZE - 1) & SGE_PAGE_MASK)
 #define SGE_PAGES		(SGE_PAGE_SIZE * PAGES_PER_SGE)
 #define TPA_AGG_SIZE		min_t(u32, (min_t(u32, 8, MAX_SKB_FRAGS) * \
 					    SGE_PAGES), 0xffff)
@@ -525,6 +528,14 @@ enum bnx2x_tpa_mode_t {
 	TPA_MODE_GRO
 };
 
+struct bnx2x_alloc_pool {
+	struct page	*page;
+	dma_addr_t	dma;
+	int		len;
+	int		offset;
+	int		last_offset;
+};
+
 struct bnx2x_fastpath {
 	struct bnx2x		*bp; /* parent */
 
@@ -611,6 +622,8 @@ struct bnx2x_fastpath {
 	     4 (for the digits and to make it DWORD aligned) */
 #define FP_NAME_SIZE		(sizeof(((struct net_device *)0)->name) + 8)
 	char			name[FP_NAME_SIZE];
+
+	struct bnx2x_alloc_pool	page_pool;
 };
 
 #define bnx2x_fp(bp, nr, var)	((bp)->fp[(nr)].var)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 0a9faa1..2039325 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -544,30 +544,52 @@ static void bnx2x_set_gro_params(struct sk_buff *skb, u16 parsing_flags,
 static int bnx2x_alloc_rx_sge(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			      u16 index, gfp_t gfp_mask)
 {
-	struct page *page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
 	struct sw_rx_page *sw_buf = &fp->rx_page_ring[index];
 	struct eth_rx_sge *sge = &fp->rx_sge_ring[index];
+	struct bnx2x_alloc_pool *pool = &fp->page_pool;
 	dma_addr_t mapping;
 
-	if (unlikely(page == NULL)) {
-		BNX2X_ERR("Can't alloc sge\n");
-		return -ENOMEM;
-	}
+	if (!pool->page || pool->offset > pool->last_offset) {
 
-	mapping = dma_map_page(&bp->pdev->dev, page, 0,
-			       SGE_PAGES, DMA_FROM_DEVICE);
-	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
-		__free_pages(page, PAGES_PER_SGE_SHIFT);
-		BNX2X_ERR("Can't map sge\n");
-		return -ENOMEM;
+		/* put page reference used by the memory pool, since we
+		 * won't be using this page as the mempool anymore.
+		 */
+		if (pool->page)
+			put_page(pool->page);
+
+		pool->page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
+		if (unlikely(!pool->page)) {
+			BNX2X_ERR("Can't alloc sge\n");
+			return -ENOMEM;
+		}
+
+		pool->dma = dma_map_page(&bp->pdev->dev, pool->page, 0,
+					 PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(dma_mapping_error(&bp->pdev->dev,
+					       pool->dma))) {
+			__free_pages(pool->page, PAGES_PER_SGE_SHIFT);
+			BNX2X_ERR("Can't map sge\n");
+			return -ENOMEM;
+		}
+		pool->offset = 0;
+		pool->len = PAGE_SIZE;
+		pool->last_offset = ((PAGE_SIZE / SGE_PAGE_SIZE - 1)
+				     * SGE_PAGE_SIZE);
 	}
 
-	sw_buf->page = page;
+	get_page(pool->page);
+	sw_buf->page = pool->page;
+	sw_buf->len = SGE_PAGE_SIZE;
+	sw_buf->offset = pool->offset;
+
+	mapping = pool->dma + sw_buf->offset;
 	dma_unmap_addr_set(sw_buf, mapping, mapping);
 
 	sge->addr_hi = cpu_to_le32(U64_HI(mapping));
 	sge->addr_lo = cpu_to_le32(U64_LO(mapping));
 
+	pool->offset += SGE_PAGE_SIZE;
+
 	return 0;
 }
 
@@ -629,20 +651,22 @@ static int bnx2x_fill_frag_skb(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			return err;
 		}
 
-		/* Unmap the page as we're going to pass it to the stack */
-		dma_unmap_page(&bp->pdev->dev,
-			       dma_unmap_addr(&old_rx_pg, mapping),
-			       SGE_PAGES, DMA_FROM_DEVICE);
+		dma_unmap_single(&bp->pdev->dev,
+				 dma_unmap_addr(&old_rx_pg, mapping),
+				 old_rx_pg.len, DMA_FROM_DEVICE);
 		/* Add one frag and update the appropriate fields in the skb */
 		if (fp->mode == TPA_MODE_LRO)
-			skb_fill_page_desc(skb, j, old_rx_pg.page, 0, frag_len);
+			skb_fill_page_desc(skb, j, old_rx_pg.page,
+					   old_rx_pg.offset, frag_len);
 		else { /* GRO */
 			int rem;
 			int offset = 0;
 			for (rem = frag_len; rem > 0; rem -= gro_size) {
 				int len = rem > gro_size ? gro_size : rem;
 				skb_fill_page_desc(skb, frag_id++,
-						   old_rx_pg.page, offset, len);
+						   old_rx_pg.page,
+						   old_rx_pg.offset + offset,
+						   len);
 				if (offset)
 					get_page(old_rx_pg.page);
 				offset += len;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index adcacda..3842d51 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -804,9 +804,14 @@ static inline void bnx2x_free_rx_sge(struct bnx2x *bp,
 	if (!page)
 		return;
 
-	dma_unmap_page(&bp->pdev->dev, dma_unmap_addr(sw_buf, mapping),
-		       SGE_PAGES, DMA_FROM_DEVICE);
-	__free_pages(page, PAGES_PER_SGE_SHIFT);
+	/* Since many fragments can share the same page, make sure to
+	 * only unmap and free the page once.
+	 */
+	dma_unmap_single(&bp->pdev->dev,
+			 dma_unmap_addr(sw_buf, mapping),
+			 sw_buf->len, DMA_FROM_DEVICE);
+
+	put_page(page);
 
 	sw_buf->page = NULL;
 	sge->addr_hi = 0;
@@ -964,6 +969,26 @@ static inline void bnx2x_set_fw_mac_addr(__le16 *fw_hi, __le16 *fw_mid,
 	((u8 *)fw_lo)[1]  = mac[4];
 }
 
+static inline void bnx2x_free_rx_mem_pool(struct bnx2x *bp,
+					  struct bnx2x_alloc_pool *pool)
+{
+	if (!pool->page)
+		return;
+
+	/* Page was not fully fragmented.  Unmap unused space */
+	if (pool->offset <= pool->last_offset) {
+		dma_addr_t dma = pool->dma + pool->offset;
+
+		dma_unmap_single(&bp->pdev->dev, dma,
+				 pool->len - pool->offset,
+				 DMA_FROM_DEVICE);
+	}
+
+	put_page(pool->page);
+
+	pool->page = NULL;
+}
+
 static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 					   struct bnx2x_fastpath *fp, int last)
 {
@@ -974,6 +999,8 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 
 	for (i = 0; i < last; i++)
 		bnx2x_free_rx_sge(bp, fp, i);
+
+	bnx2x_free_rx_mem_pool(bp, &fp->page_pool);
 }
 
 static inline void bnx2x_set_next_page_rx_bd(struct bnx2x_fastpath *fp)
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-04-24 20:45 [PATCH v2] bnx2x: Alloc 4k fragment for each rx ring buffer element Gabriel Krisman Bertazi
@ 2015-04-24 23:24 ` Lino Sanfilippo
  2015-04-29 13:30   ` Gabriel Krisman Bertazi
  0 siblings, 1 reply; 17+ messages in thread
From: Lino Sanfilippo @ 2015-04-24 23:24 UTC (permalink / raw)
  To: Gabriel Krisman Bertazi, ariel.elior; +Cc: netdev, cascardo, cascardo, brking

Hi,

On 24.04.2015 22:45, Gabriel Krisman Bertazi wrote:
> @@ -544,30 +544,52 @@ static void bnx2x_set_gro_params(struct sk_buff *skb, u16 parsing_flags,
>  static int bnx2x_alloc_rx_sge(struct bnx2x *bp, struct bnx2x_fastpath *fp,
>  			      u16 index, gfp_t gfp_mask)
>  {
> -	struct page *page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
>  	struct sw_rx_page *sw_buf = &fp->rx_page_ring[index];
>  	struct eth_rx_sge *sge = &fp->rx_sge_ring[index];
> +	struct bnx2x_alloc_pool *pool = &fp->page_pool;
>  	dma_addr_t mapping;
>  
> -	if (unlikely(page == NULL)) {
> -		BNX2X_ERR("Can't alloc sge\n");
> -		return -ENOMEM;
> -	}
> +	if (!pool->page || pool->offset > pool->last_offset) {
>  
> -	mapping = dma_map_page(&bp->pdev->dev, page, 0,
> -			       SGE_PAGES, DMA_FROM_DEVICE);
> -	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
> -		__free_pages(page, PAGES_PER_SGE_SHIFT);
> -		BNX2X_ERR("Can't map sge\n");
> -		return -ENOMEM;
> +		/* put page reference used by the memory pool, since we
> +		 * won't be using this page as the mempool anymore.
> +		 */
> +		if (pool->page)
> +			put_page(pool->page);
> +
> +		pool->page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
> +		if (unlikely(!pool->page)) {
> +			BNX2X_ERR("Can't alloc sge\n");
> +			return -ENOMEM;
> +		}
> +
> +		pool->dma = dma_map_page(&bp->pdev->dev, pool->page, 0,
> +					 PAGE_SIZE, DMA_FROM_DEVICE);
> +		if (unlikely(dma_mapping_error(&bp->pdev->dev,
> +					       pool->dma))) {
> +			__free_pages(pool->page, PAGES_PER_SGE_SHIFT);
> +			BNX2X_ERR("Can't map sge\n");
> +			return -ENOMEM;

Looks like setting pool->page to NULL is missing in case that dma
mapping failed. Otherwise put_page is called on the freed page the next
time bnx2x_alloc_rx_sge is called.

Regards,
Lino

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-04-24 23:24 ` Lino Sanfilippo
@ 2015-04-29 13:30   ` Gabriel Krisman Bertazi
  2015-04-30 11:25     ` Yuval Mintz
  0 siblings, 1 reply; 17+ messages in thread
From: Gabriel Krisman Bertazi @ 2015-04-29 13:30 UTC (permalink / raw)
  To: Lino Sanfilippo; +Cc: ariel.elior, netdev, cascardo, cascardo, brking


Hi Lino,

Lino Sanfilippo <LinoSanfilippo@gmx.de> writes:
> Looks like setting pool->page to NULL is missing in case that dma
> mapping failed. Otherwise put_page is called on the freed page the next
> time bnx2x_alloc_rx_sge is called.

Thanks, I fixed it.  Please take this new version with the fix you
suggested.

-- >8 --
Subject: [PATCH] bnx2x: Alloc 4k fragment for each rx ring buffer element

The driver allocates one page for each buffer on the rx ring, which is
too much on architectures like ppc64 and can cause unexpected allocation
failures when the system is under stress.  Now, we keep a memory pool
per queue, and if the architecture's PAGE_SIZE is greater than 4k, we
fragment pages and assign each 4k segment to a ring element, which
reduces the overall memory consumption on such architectures.  This
helps avoid errors like the example below:

[bnx2x_alloc_rx_sge:435(eth1)]Can't alloc sge
[c00000037ffeb900] [d000000075eddeb4] .bnx2x_alloc_rx_sge+0x44/0x200 [bnx2x]
[c00000037ffeb9b0] [d000000075ee0b34] .bnx2x_fill_frag_skb+0x1ac/0x460 [bnx2x]
[c00000037ffebac0] [d000000075ee11f0] .bnx2x_tpa_stop+0x160/0x2e8 [bnx2x]
[c00000037ffebb90] [d000000075ee1560] .bnx2x_rx_int+0x1e8/0xc30 [bnx2x]
[c00000037ffebcd0] [d000000075ee2084] .bnx2x_poll+0xdc/0x3d8 [bnx2x] (unreliable)

Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h     | 19 ++++++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 61 +++++++++++++++++--------
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 33 +++++++++++--
 3 files changed, 89 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 4085c4b..d0c8ed0 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -356,6 +356,8 @@ struct sw_tx_bd {
 
 struct sw_rx_page {
 	struct page	*page;
+	int		len;
+	int		offset;
 	DEFINE_DMA_UNMAP_ADDR(mapping);
 };
 
@@ -381,9 +383,10 @@ union db_prod {
 
 #define PAGES_PER_SGE_SHIFT	0
 #define PAGES_PER_SGE		(1 << PAGES_PER_SGE_SHIFT)
-#define SGE_PAGE_SIZE		PAGE_SIZE
-#define SGE_PAGE_SHIFT		PAGE_SHIFT
-#define SGE_PAGE_ALIGN(addr)	PAGE_ALIGN((typeof(PAGE_SIZE))(addr))
+#define SGE_PAGE_SHIFT		12
+#define SGE_PAGE_SIZE		(1 << SGE_PAGE_SHIFT)
+#define SGE_PAGE_MASK		(~(SGE_PAGE_SIZE - 1))
+#define SGE_PAGE_ALIGN(addr)	(((addr) + SGE_PAGE_SIZE - 1) & SGE_PAGE_MASK)
 #define SGE_PAGES		(SGE_PAGE_SIZE * PAGES_PER_SGE)
 #define TPA_AGG_SIZE		min_t(u32, (min_t(u32, 8, MAX_SKB_FRAGS) * \
 					    SGE_PAGES), 0xffff)
@@ -525,6 +528,14 @@ enum bnx2x_tpa_mode_t {
 	TPA_MODE_GRO
 };
 
+struct bnx2x_alloc_pool {
+	struct page	*page;
+	dma_addr_t	dma;
+	int		len;
+	int		offset;
+	int		last_offset;
+};
+
 struct bnx2x_fastpath {
 	struct bnx2x		*bp; /* parent */
 
@@ -611,6 +622,8 @@ struct bnx2x_fastpath {
 	     4 (for the digits and to make it DWORD aligned) */
 #define FP_NAME_SIZE		(sizeof(((struct net_device *)0)->name) + 8)
 	char			name[FP_NAME_SIZE];
+
+	struct bnx2x_alloc_pool	page_pool;
 };
 
 #define bnx2x_fp(bp, nr, var)	((bp)->fp[(nr)].var)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 0a9faa1..52ce87f 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -544,30 +544,53 @@ static void bnx2x_set_gro_params(struct sk_buff *skb, u16 parsing_flags,
 static int bnx2x_alloc_rx_sge(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			      u16 index, gfp_t gfp_mask)
 {
-	struct page *page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
 	struct sw_rx_page *sw_buf = &fp->rx_page_ring[index];
 	struct eth_rx_sge *sge = &fp->rx_sge_ring[index];
+	struct bnx2x_alloc_pool *pool = &fp->page_pool;
 	dma_addr_t mapping;
 
-	if (unlikely(page == NULL)) {
-		BNX2X_ERR("Can't alloc sge\n");
-		return -ENOMEM;
-	}
+	if (!pool->page || pool->offset > pool->last_offset) {
 
-	mapping = dma_map_page(&bp->pdev->dev, page, 0,
-			       SGE_PAGES, DMA_FROM_DEVICE);
-	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
-		__free_pages(page, PAGES_PER_SGE_SHIFT);
-		BNX2X_ERR("Can't map sge\n");
-		return -ENOMEM;
+		/* put page reference used by the memory pool, since we
+		 * won't be using this page as the mempool anymore.
+		 */
+		if (pool->page)
+			put_page(pool->page);
+
+		pool->page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
+		if (unlikely(!pool->page)) {
+			BNX2X_ERR("Can't alloc sge\n");
+			return -ENOMEM;
+		}
+
+		pool->dma = dma_map_page(&bp->pdev->dev, pool->page, 0,
+					 PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(dma_mapping_error(&bp->pdev->dev,
+					       pool->dma))) {
+			__free_pages(pool->page, PAGES_PER_SGE_SHIFT);
+			pool->page = NULL;
+			BNX2X_ERR("Can't map sge\n");
+			return -ENOMEM;
+		}
+		pool->offset = 0;
+		pool->len = PAGE_SIZE;
+		pool->last_offset = ((PAGE_SIZE / SGE_PAGE_SIZE - 1)
+				     * SGE_PAGE_SIZE);
 	}
 
-	sw_buf->page = page;
+	get_page(pool->page);
+	sw_buf->page = pool->page;
+	sw_buf->len = SGE_PAGE_SIZE;
+	sw_buf->offset = pool->offset;
+
+	mapping = pool->dma + sw_buf->offset;
 	dma_unmap_addr_set(sw_buf, mapping, mapping);
 
 	sge->addr_hi = cpu_to_le32(U64_HI(mapping));
 	sge->addr_lo = cpu_to_le32(U64_LO(mapping));
 
+	pool->offset += SGE_PAGE_SIZE;
+
 	return 0;
 }
 
@@ -629,20 +652,22 @@ static int bnx2x_fill_frag_skb(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			return err;
 		}
 
-		/* Unmap the page as we're going to pass it to the stack */
-		dma_unmap_page(&bp->pdev->dev,
-			       dma_unmap_addr(&old_rx_pg, mapping),
-			       SGE_PAGES, DMA_FROM_DEVICE);
+		dma_unmap_single(&bp->pdev->dev,
+				 dma_unmap_addr(&old_rx_pg, mapping),
+				 old_rx_pg.len, DMA_FROM_DEVICE);
 		/* Add one frag and update the appropriate fields in the skb */
 		if (fp->mode == TPA_MODE_LRO)
-			skb_fill_page_desc(skb, j, old_rx_pg.page, 0, frag_len);
+			skb_fill_page_desc(skb, j, old_rx_pg.page,
+					   old_rx_pg.offset, frag_len);
 		else { /* GRO */
 			int rem;
 			int offset = 0;
 			for (rem = frag_len; rem > 0; rem -= gro_size) {
 				int len = rem > gro_size ? gro_size : rem;
 				skb_fill_page_desc(skb, frag_id++,
-						   old_rx_pg.page, offset, len);
+						   old_rx_pg.page,
+						   old_rx_pg.offset + offset,
+						   len);
 				if (offset)
 					get_page(old_rx_pg.page);
 				offset += len;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index adcacda..3842d51 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -804,9 +804,14 @@ static inline void bnx2x_free_rx_sge(struct bnx2x *bp,
 	if (!page)
 		return;
 
-	dma_unmap_page(&bp->pdev->dev, dma_unmap_addr(sw_buf, mapping),
-		       SGE_PAGES, DMA_FROM_DEVICE);
-	__free_pages(page, PAGES_PER_SGE_SHIFT);
+	/* Since many fragments can share the same page, make sure to
+	 * only unmap and free the page once.
+	 */
+	dma_unmap_single(&bp->pdev->dev,
+			 dma_unmap_addr(sw_buf, mapping),
+			 sw_buf->len, DMA_FROM_DEVICE);
+
+	put_page(page);
 
 	sw_buf->page = NULL;
 	sge->addr_hi = 0;
@@ -964,6 +969,26 @@ static inline void bnx2x_set_fw_mac_addr(__le16 *fw_hi, __le16 *fw_mid,
 	((u8 *)fw_lo)[1]  = mac[4];
 }
 
+static inline void bnx2x_free_rx_mem_pool(struct bnx2x *bp,
+					  struct bnx2x_alloc_pool *pool)
+{
+	if (!pool->page)
+		return;
+
+	/* Page was not fully fragmented.  Unmap unused space */
+	if (pool->offset <= pool->last_offset) {
+		dma_addr_t dma = pool->dma + pool->offset;
+
+		dma_unmap_single(&bp->pdev->dev, dma,
+				 pool->len - pool->offset,
+				 DMA_FROM_DEVICE);
+	}
+
+	put_page(pool->page);
+
+	pool->page = NULL;
+}
+
 static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 					   struct bnx2x_fastpath *fp, int last)
 {
@@ -974,6 +999,8 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 
 	for (i = 0; i < last; i++)
 		bnx2x_free_rx_sge(bp, fp, i);
+
+	bnx2x_free_rx_mem_pool(bp, &fp->page_pool);
 }
 
 static inline void bnx2x_set_next_page_rx_bd(struct bnx2x_fastpath *fp)
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* RE: [PATCH v2] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-04-29 13:30   ` Gabriel Krisman Bertazi
@ 2015-04-30 11:25     ` Yuval Mintz
  2015-04-30 20:05       ` David Miller
  0 siblings, 1 reply; 17+ messages in thread
From: Yuval Mintz @ 2015-04-30 11:25 UTC (permalink / raw)
  To: Gabriel Krisman Bertazi, Lino Sanfilippo
  Cc: Ariel Elior, netdev, cascardo, cascardo, brking

> The driver allocates one page for each buffer on the rx ring, which is too much
> on architectures like ppc64 and can cause unexpected allocation failures when
> the system is under stress.  Now, we keep a memory pool per queue, and if the
> architecture's PAGE_SIZE is greater than 4k, we fragment pages and assign each
> 4k segment to a ring element, which reduces the overall memory consumption
> on such architectures.  This helps avoiding errors like the example below:
> 
> [bnx2x_alloc_rx_sge:435(eth1)]Can't alloc sge [c00000037ffeb900]
> [d000000075eddeb4] .bnx2x_alloc_rx_sge+0x44/0x200 [bnx2x]
> [c00000037ffeb9b0] [d000000075ee0b34] .bnx2x_fill_frag_skb+0x1ac/0x460
> [bnx2x] [c00000037ffebac0] [d000000075ee11f0]
> .bnx2x_tpa_stop+0x160/0x2e8 [bnx2x] [c00000037ffebb90]
> [d000000075ee1560] .bnx2x_rx_int+0x1e8/0xc30 [bnx2x] [c00000037ffebcd0]
> [d000000075ee2084] .bnx2x_poll+0xdc/0x3d8 [bnx2x] (unreliable)
> 
> Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
> ---
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x.h     | 19 ++++++--
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 61 +++++++++++++++++-
> -------  drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 33 +++++++++++--
>  3 files changed, 89 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> index 4085c4b..d0c8ed0 100644
> --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
> @@ -356,6 +356,8 @@ struct sw_tx_bd {
> 
 ...
> +struct bnx2x_alloc_pool {
> +	struct page	*page;
> +	dma_addr_t	dma;
> +	int		len;

Isn't len always set to PAGE_SIZE? If so it can be dropped

> +	int		offset;
> +	int		last_offset;

Looks like you last_offset is used only for comparisons with offset;
Can we make it into a counter of possible allocations instead?

I.e., set it to (PAGE_SIZE / SGE_PAGE_SIZE - 1) during allocation
And decrement it each time an SGE_PAGE_SIZE chunk is taken.
That would turn (pool-> offset > pool->last_offset) into (!remain).
	

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v2] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-04-30 11:25     ` Yuval Mintz
@ 2015-04-30 20:05       ` David Miller
  2015-05-04 19:32         ` [PATCH v3] " Gabriel Krisman Bertazi
  0 siblings, 1 reply; 17+ messages in thread
From: David Miller @ 2015-04-30 20:05 UTC (permalink / raw)
  To: Yuval.Mintz
  Cc: krisman, LinoSanfilippo, Ariel.Elior, netdev, cascardo, cascardo, brking

From: Yuval Mintz <Yuval.Mintz@qlogic.com>
Date: Thu, 30 Apr 2015 11:25:31 +0000

>> +struct bnx2x_alloc_pool {
>> +	struct page	*page;
>> +	dma_addr_t	dma;
>> +	int		len;
> 
> Isn't len always set to PAGE_SIZE? If so it can be dropped

Agreed, len appears unnecessary.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v3] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-04-30 20:05       ` David Miller
@ 2015-05-04 19:32         ` Gabriel Krisman Bertazi
  2015-05-04 19:58           ` Yuval Mintz
  0 siblings, 1 reply; 17+ messages in thread
From: Gabriel Krisman Bertazi @ 2015-05-04 19:32 UTC (permalink / raw)
  To: David Miller
  Cc: Yuval.Mintz, LinoSanfilippo, Ariel.Elior, netdev, cascardo, brking

David Miller <davem@davemloft.net> writes:

> Agreed, len appears unnecessary.

Yes, I agree.

I'm sending a new version that removes both len and last_offset and uses
the frag_count that Yuval suggested.

Thanks,

-- >8 --

Subject: [PATCH] bnx2x: Alloc 4k fragment for each rx ring buffer element

The driver allocates one page for each buffer on the rx ring, which is
too much on architectures like ppc64 and can cause unexpected allocation
failures when the system is under stress.  Now, we keep a memory pool
per queue, and if the architecture's PAGE_SIZE is greater than 4k, we
fragment pages and assign each 4k segment to a ring element, which
reduces the overall memory consumption on such architectures.  This
helps avoid errors like the example below:

[bnx2x_alloc_rx_sge:435(eth1)]Can't alloc sge
[c00000037ffeb900] [d000000075eddeb4] .bnx2x_alloc_rx_sge+0x44/0x200 [bnx2x]
[c00000037ffeb9b0] [d000000075ee0b34] .bnx2x_fill_frag_skb+0x1ac/0x460 [bnx2x]
[c00000037ffebac0] [d000000075ee11f0] .bnx2x_tpa_stop+0x160/0x2e8 [bnx2x]
[c00000037ffebb90] [d000000075ee1560] .bnx2x_rx_int+0x1e8/0xc30 [bnx2x]
[c00000037ffebcd0] [d000000075ee2084] .bnx2x_poll+0xdc/0x3d8 [bnx2x] (unreliable)

Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h     | 17 +++++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 59 +++++++++++++++++--------
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 31 +++++++++++--
 3 files changed, 83 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 4085c4b..4228d8a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -356,6 +356,7 @@ struct sw_tx_bd {
 
 struct sw_rx_page {
 	struct page	*page;
+	int		offset;
 	DEFINE_DMA_UNMAP_ADDR(mapping);
 };
 
@@ -381,9 +382,10 @@ union db_prod {
 
 #define PAGES_PER_SGE_SHIFT	0
 #define PAGES_PER_SGE		(1 << PAGES_PER_SGE_SHIFT)
-#define SGE_PAGE_SIZE		PAGE_SIZE
-#define SGE_PAGE_SHIFT		PAGE_SHIFT
-#define SGE_PAGE_ALIGN(addr)	PAGE_ALIGN((typeof(PAGE_SIZE))(addr))
+#define SGE_PAGE_SHIFT		12
+#define SGE_PAGE_SIZE		(1 << SGE_PAGE_SHIFT)
+#define SGE_PAGE_MASK		(~(SGE_PAGE_SIZE - 1))
+#define SGE_PAGE_ALIGN(addr)	(((addr) + SGE_PAGE_SIZE - 1) & SGE_PAGE_MASK)
 #define SGE_PAGES		(SGE_PAGE_SIZE * PAGES_PER_SGE)
 #define TPA_AGG_SIZE		min_t(u32, (min_t(u32, 8, MAX_SKB_FRAGS) * \
 					    SGE_PAGES), 0xffff)
@@ -525,6 +527,13 @@ enum bnx2x_tpa_mode_t {
 	TPA_MODE_GRO
 };
 
+struct bnx2x_alloc_pool {
+	struct page	*page;
+	dma_addr_t	dma;
+	int		offset;
+	int		frag_count;
+};
+
 struct bnx2x_fastpath {
 	struct bnx2x		*bp; /* parent */
 
@@ -611,6 +620,8 @@ struct bnx2x_fastpath {
 	     4 (for the digits and to make it DWORD aligned) */
 #define FP_NAME_SIZE		(sizeof(((struct net_device *)0)->name) + 8)
 	char			name[FP_NAME_SIZE];
+
+	struct bnx2x_alloc_pool	page_pool;
 };
 
 #define bnx2x_fp(bp, nr, var)	((bp)->fp[(nr)].var)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 0a9faa1..eed6401 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -544,30 +544,51 @@ static void bnx2x_set_gro_params(struct sk_buff *skb, u16 parsing_flags,
 static int bnx2x_alloc_rx_sge(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			      u16 index, gfp_t gfp_mask)
 {
-	struct page *page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
 	struct sw_rx_page *sw_buf = &fp->rx_page_ring[index];
 	struct eth_rx_sge *sge = &fp->rx_sge_ring[index];
+	struct bnx2x_alloc_pool *pool = &fp->page_pool;
 	dma_addr_t mapping;
 
-	if (unlikely(page == NULL)) {
-		BNX2X_ERR("Can't alloc sge\n");
-		return -ENOMEM;
-	}
+	if (!pool->page || pool->frag_count == 0) {
 
-	mapping = dma_map_page(&bp->pdev->dev, page, 0,
-			       SGE_PAGES, DMA_FROM_DEVICE);
-	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
-		__free_pages(page, PAGES_PER_SGE_SHIFT);
-		BNX2X_ERR("Can't map sge\n");
-		return -ENOMEM;
+		/* put page reference used by the memory pool, since we
+		 * won't be using this page as the mempool anymore.
+		 */
+		if (pool->page)
+			put_page(pool->page);
+
+		pool->page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
+		if (unlikely(!pool->page)) {
+			BNX2X_ERR("Can't alloc sge\n");
+			return -ENOMEM;
+		}
+
+		pool->dma = dma_map_page(&bp->pdev->dev, pool->page, 0,
+					 PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(dma_mapping_error(&bp->pdev->dev,
+					       pool->dma))) {
+			__free_pages(pool->page, PAGES_PER_SGE_SHIFT);
+			pool->page = NULL;
+			BNX2X_ERR("Can't map sge\n");
+			return -ENOMEM;
+		}
+		pool->offset = 0;
+		pool->frag_count = (PAGE_SIZE / SGE_PAGE_SIZE);
 	}
 
-	sw_buf->page = page;
+	get_page(pool->page);
+	sw_buf->page = pool->page;
+	sw_buf->offset = pool->offset;
+
+	mapping = pool->dma + sw_buf->offset;
 	dma_unmap_addr_set(sw_buf, mapping, mapping);
 
 	sge->addr_hi = cpu_to_le32(U64_HI(mapping));
 	sge->addr_lo = cpu_to_le32(U64_LO(mapping));
 
+	pool->offset += SGE_PAGE_SIZE;
+	pool->frag_count -= 1;
+
 	return 0;
 }
 
@@ -629,20 +650,22 @@ static int bnx2x_fill_frag_skb(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			return err;
 		}
 
-		/* Unmap the page as we're going to pass it to the stack */
-		dma_unmap_page(&bp->pdev->dev,
-			       dma_unmap_addr(&old_rx_pg, mapping),
-			       SGE_PAGES, DMA_FROM_DEVICE);
+		dma_unmap_single(&bp->pdev->dev,
+				 dma_unmap_addr(&old_rx_pg, mapping),
+				 SGE_PAGE_SIZE, DMA_FROM_DEVICE);
 		/* Add one frag and update the appropriate fields in the skb */
 		if (fp->mode == TPA_MODE_LRO)
-			skb_fill_page_desc(skb, j, old_rx_pg.page, 0, frag_len);
+			skb_fill_page_desc(skb, j, old_rx_pg.page,
+					   old_rx_pg.offset, frag_len);
 		else { /* GRO */
 			int rem;
 			int offset = 0;
 			for (rem = frag_len; rem > 0; rem -= gro_size) {
 				int len = rem > gro_size ? gro_size : rem;
 				skb_fill_page_desc(skb, frag_id++,
-						   old_rx_pg.page, offset, len);
+						   old_rx_pg.page,
+						   old_rx_pg.offset + offset,
+						   len);
 				if (offset)
 					get_page(old_rx_pg.page);
 				offset += len;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index adcacda..8bc45da 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -804,9 +804,13 @@ static inline void bnx2x_free_rx_sge(struct bnx2x *bp,
 	if (!page)
 		return;
 
-	dma_unmap_page(&bp->pdev->dev, dma_unmap_addr(sw_buf, mapping),
-		       SGE_PAGES, DMA_FROM_DEVICE);
-	__free_pages(page, PAGES_PER_SGE_SHIFT);
+	/* Since many fragments can share the same page, make sure to
+	 * only unmap and free the page once.
+	 */
+	dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(sw_buf, mapping),
+			 SGE_PAGE_SIZE, DMA_FROM_DEVICE);
+
+	put_page(page);
 
 	sw_buf->page = NULL;
 	sge->addr_hi = 0;
@@ -964,6 +968,25 @@ static inline void bnx2x_set_fw_mac_addr(__le16 *fw_hi, __le16 *fw_mid,
 	((u8 *)fw_lo)[1]  = mac[4];
 }
 
+static inline void bnx2x_free_rx_mem_pool(struct bnx2x *bp,
+					  struct bnx2x_alloc_pool *pool)
+{
+	if (!pool->page)
+		return;
+
+	/* Page was not fully fragmented.  Unmap unused space */
+	if (pool->frag_count > 0) {
+		dma_addr_t dma = pool->dma + pool->offset;
+		int size = pool->frag_count * SGE_PAGE_SIZE;
+
+		dma_unmap_single(&bp->pdev->dev, dma, size, DMA_FROM_DEVICE);
+	}
+
+	put_page(pool->page);
+
+	pool->page = NULL;
+}
+
 static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 					   struct bnx2x_fastpath *fp, int last)
 {
@@ -974,6 +997,8 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 
 	for (i = 0; i < last; i++)
 		bnx2x_free_rx_sge(bp, fp, i);
+
+	bnx2x_free_rx_mem_pool(bp, &fp->page_pool);
 }
 
 static inline void bnx2x_set_next_page_rx_bd(struct bnx2x_fastpath *fp)
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* RE: [PATCH v3] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-04 19:32         ` [PATCH v3] " Gabriel Krisman Bertazi
@ 2015-05-04 19:58           ` Yuval Mintz
  2015-05-15 20:43             ` Gabriel Krisman Bertazi
  2015-05-21 13:20             ` [PATCH v4] " Gabriel Krisman Bertazi
  0 siblings, 2 replies; 17+ messages in thread
From: Yuval Mintz @ 2015-05-04 19:58 UTC (permalink / raw)
  To: Gabriel Krisman Bertazi, David Miller
  Cc: LinoSanfilippo, Ariel Elior, netdev, cascardo, brking

A couple of not really important [ mostly style] issues -

>+       int             offset;
>+       int             frag_count;
u8 seems better, as we never really expect this to be negative.
[Could have claimed the same about the offset]

...

> +       if (!pool->page || pool->frag_count == 0) {
Why the "== 0" and not simply "!"?

 ...

> +       pool->frag_count -= 1;
Why not pool->frag_count-- ?

...

> +       /* Page was not fully fragmented.  Unmap unused space */
> +       if (pool->frag_count > 0) {
Again, why not (pool->frag_count) instead?

Regardless, I'll give it a more thorough review tomorrow.
[If those are all the "problems" we'll find with it, I don't think we'll
need to re-spin this once more; that is, unless Dave insists]

Thanks,
Yuval

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v3] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-04 19:58           ` Yuval Mintz
@ 2015-05-15 20:43             ` Gabriel Krisman Bertazi
  2015-05-17  7:14               ` Yuval Mintz
  2015-05-21 13:20             ` [PATCH v4] " Gabriel Krisman Bertazi
  1 sibling, 1 reply; 17+ messages in thread
From: Gabriel Krisman Bertazi @ 2015-05-15 20:43 UTC (permalink / raw)
  To: Yuval Mintz
  Cc: David Miller, LinoSanfilippo, Ariel Elior, netdev, cascardo, brking

Yuval Mintz <Yuval.Mintz@qlogic.com> writes:

> Regardless, I'll give it a more thorough review tomorrow.
> [If those are all the "problems" we'll find with it, I don't think we'll
> need to re-spin this once more; that is, unless Dave insists]

Hi Yuval,  thanks for your suggestion.

Just pinging this patch.  Did you have a chance to make another review
on this one?  Looking forward to see this merged in.

Thanks!

-- 
Gabriel Krisman Bertazi

^ permalink raw reply	[flat|nested] 17+ messages in thread

* RE: [PATCH v3] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-15 20:43             ` Gabriel Krisman Bertazi
@ 2015-05-17  7:14               ` Yuval Mintz
  0 siblings, 0 replies; 17+ messages in thread
From: Yuval Mintz @ 2015-05-17  7:14 UTC (permalink / raw)
  To: Gabriel Krisman Bertazi
  Cc: David Miller, LinoSanfilippo, Ariel Elior, netdev, cascardo, brking

> > Regardless, I'll give it a more thorough review tomorrow.
> > [If those are all the "problems" we'll find with it, I don't think
> > we'll need to re-spin this once more; that is, unless Dave insists]
> 
> Hi Yuval,  thanks for your suggestion.
> 
> Just pinging this patch.  Did you have a chance to make another review on this
> one?  Looking forward to see this merged in.
> 
> Thanks!
> 
> --
> Gabriel Krisman Bertazi

Sorry for not responding - I've seen that Dave already moved it into
'changes required' state, so I've assumed you'd shortly send an update.

Reviewed it; Couldn't find anything inherently problematic.

Thanks for the effort.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-04 19:58           ` Yuval Mintz
  2015-05-15 20:43             ` Gabriel Krisman Bertazi
@ 2015-05-21 13:20             ` Gabriel Krisman Bertazi
  2015-05-23 10:04               ` Yuval Mintz
                                 ` (2 more replies)
  1 sibling, 3 replies; 17+ messages in thread
From: Gabriel Krisman Bertazi @ 2015-05-21 13:20 UTC (permalink / raw)
  To: Yuval Mintz
  Cc: David Miller, LinoSanfilippo, Ariel Elior, netdev, cascardo, brking

Yuval Mintz <Yuval.Mintz@qlogic.com> writes:

> Regardless, I'll give it a more thorough review tomorrow.
> [If those are all the "problems" we'll find with it, I don't think we'll
> need to re-spin this once more; that is, unless Dave insists]

As a follow up, here is a new version that fixes the style issues that Yuval
pointed out.  Are there any other concerns regarding this one?

-- >8 --
Subject: [PATCH v4] bnx2x: Alloc 4k fragment for each rx ring buffer element

The driver allocates one page for each buffer on the rx ring, which is
too much on architectures like ppc64 and can cause unexpected allocation
failures when the system is under stress.  Now, we keep a memory pool
per queue, and if the architecture's PAGE_SIZE is greater than 4k, we
fragment pages and assign each 4k segment to a ring element, which
reduces the overall memory consumption on such architectures.  This
helps avoiding errors like the example below:

[bnx2x_alloc_rx_sge:435(eth1)]Can't alloc sge
[c00000037ffeb900] [d000000075eddeb4] .bnx2x_alloc_rx_sge+0x44/0x200 [bnx2x]
[c00000037ffeb9b0] [d000000075ee0b34] .bnx2x_fill_frag_skb+0x1ac/0x460 [bnx2x]
[c00000037ffebac0] [d000000075ee11f0] .bnx2x_tpa_stop+0x160/0x2e8 [bnx2x]
[c00000037ffebb90] [d000000075ee1560] .bnx2x_rx_int+0x1e8/0xc30 [bnx2x]
[c00000037ffebcd0] [d000000075ee2084] .bnx2x_poll+0xdc/0x3d8 [bnx2x] (unreliable)

Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h     | 17 +++++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 59 +++++++++++++++++--------
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 31 +++++++++++--
 3 files changed, 83 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 4085c4b..5984d28 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -357,6 +357,7 @@ struct sw_tx_bd {
 struct sw_rx_page {
 	struct page	*page;
 	DEFINE_DMA_UNMAP_ADDR(mapping);
+	u8		offset;
 };
 
 union db_prod {
@@ -381,9 +382,10 @@ union db_prod {
 
 #define PAGES_PER_SGE_SHIFT	0
 #define PAGES_PER_SGE		(1 << PAGES_PER_SGE_SHIFT)
-#define SGE_PAGE_SIZE		PAGE_SIZE
-#define SGE_PAGE_SHIFT		PAGE_SHIFT
-#define SGE_PAGE_ALIGN(addr)	PAGE_ALIGN((typeof(PAGE_SIZE))(addr))
+#define SGE_PAGE_SHIFT		12
+#define SGE_PAGE_SIZE		(1 << SGE_PAGE_SHIFT)
+#define SGE_PAGE_MASK		(~(SGE_PAGE_SIZE - 1))
+#define SGE_PAGE_ALIGN(addr)	(((addr) + SGE_PAGE_SIZE - 1) & SGE_PAGE_MASK)
 #define SGE_PAGES		(SGE_PAGE_SIZE * PAGES_PER_SGE)
 #define TPA_AGG_SIZE		min_t(u32, (min_t(u32, 8, MAX_SKB_FRAGS) * \
 					    SGE_PAGES), 0xffff)
@@ -525,6 +527,13 @@ enum bnx2x_tpa_mode_t {
 	TPA_MODE_GRO
 };
 
+struct bnx2x_alloc_pool {
+	struct page	*page;
+	dma_addr_t	dma;
+	u8		offset;
+	u8		frag_count;
+};
+
 struct bnx2x_fastpath {
 	struct bnx2x		*bp; /* parent */
 
@@ -611,6 +620,8 @@ struct bnx2x_fastpath {
 	     4 (for the digits and to make it DWORD aligned) */
 #define FP_NAME_SIZE		(sizeof(((struct net_device *)0)->name) + 8)
 	char			name[FP_NAME_SIZE];
+
+	struct bnx2x_alloc_pool	page_pool;
 };
 
 #define bnx2x_fp(bp, nr, var)	((bp)->fp[(nr)].var)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 0a9faa1..ffdeaa8 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -544,30 +544,51 @@ static void bnx2x_set_gro_params(struct sk_buff *skb, u16 parsing_flags,
 static int bnx2x_alloc_rx_sge(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			      u16 index, gfp_t gfp_mask)
 {
-	struct page *page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
 	struct sw_rx_page *sw_buf = &fp->rx_page_ring[index];
 	struct eth_rx_sge *sge = &fp->rx_sge_ring[index];
+	struct bnx2x_alloc_pool *pool = &fp->page_pool;
 	dma_addr_t mapping;
 
-	if (unlikely(page == NULL)) {
-		BNX2X_ERR("Can't alloc sge\n");
-		return -ENOMEM;
-	}
+	if (!pool->page || !pool->frag_count) {
 
-	mapping = dma_map_page(&bp->pdev->dev, page, 0,
-			       SGE_PAGES, DMA_FROM_DEVICE);
-	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
-		__free_pages(page, PAGES_PER_SGE_SHIFT);
-		BNX2X_ERR("Can't map sge\n");
-		return -ENOMEM;
+		/* put page reference used by the memory pool, since we
+		 * won't be using this page as the mempool anymore.
+		 */
+		if (pool->page)
+			put_page(pool->page);
+
+		pool->page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
+		if (unlikely(!pool->page)) {
+			BNX2X_ERR("Can't alloc sge\n");
+			return -ENOMEM;
+		}
+
+		pool->dma = dma_map_page(&bp->pdev->dev, pool->page, 0,
+					 PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(dma_mapping_error(&bp->pdev->dev,
+					       pool->dma))) {
+			__free_pages(pool->page, PAGES_PER_SGE_SHIFT);
+			pool->page = NULL;
+			BNX2X_ERR("Can't map sge\n");
+			return -ENOMEM;
+		}
+		pool->offset = 0;
+		pool->frag_count = (PAGE_SIZE / SGE_PAGE_SIZE);
 	}
 
-	sw_buf->page = page;
+	get_page(pool->page);
+	sw_buf->page = pool->page;
+	sw_buf->offset = pool->offset;
+
+	mapping = pool->dma + sw_buf->offset;
 	dma_unmap_addr_set(sw_buf, mapping, mapping);
 
 	sge->addr_hi = cpu_to_le32(U64_HI(mapping));
 	sge->addr_lo = cpu_to_le32(U64_LO(mapping));
 
+	pool->offset += SGE_PAGE_SIZE;
+	pool->frag_count--;
+
 	return 0;
 }
 
@@ -629,20 +650,22 @@ static int bnx2x_fill_frag_skb(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			return err;
 		}
 
-		/* Unmap the page as we're going to pass it to the stack */
-		dma_unmap_page(&bp->pdev->dev,
-			       dma_unmap_addr(&old_rx_pg, mapping),
-			       SGE_PAGES, DMA_FROM_DEVICE);
+		dma_unmap_single(&bp->pdev->dev,
+				 dma_unmap_addr(&old_rx_pg, mapping),
+				 SGE_PAGE_SIZE, DMA_FROM_DEVICE);
 		/* Add one frag and update the appropriate fields in the skb */
 		if (fp->mode == TPA_MODE_LRO)
-			skb_fill_page_desc(skb, j, old_rx_pg.page, 0, frag_len);
+			skb_fill_page_desc(skb, j, old_rx_pg.page,
+					   old_rx_pg.offset, frag_len);
 		else { /* GRO */
 			int rem;
 			int offset = 0;
 			for (rem = frag_len; rem > 0; rem -= gro_size) {
 				int len = rem > gro_size ? gro_size : rem;
 				skb_fill_page_desc(skb, frag_id++,
-						   old_rx_pg.page, offset, len);
+						   old_rx_pg.page,
+						   old_rx_pg.offset + offset,
+						   len);
 				if (offset)
 					get_page(old_rx_pg.page);
 				offset += len;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index adcacda..beedb96 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -804,9 +804,13 @@ static inline void bnx2x_free_rx_sge(struct bnx2x *bp,
 	if (!page)
 		return;
 
-	dma_unmap_page(&bp->pdev->dev, dma_unmap_addr(sw_buf, mapping),
-		       SGE_PAGES, DMA_FROM_DEVICE);
-	__free_pages(page, PAGES_PER_SGE_SHIFT);
+	/* Since many fragments can share the same page, make sure to
+	 * only unmap and free the page once.
+	 */
+	dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(sw_buf, mapping),
+			 SGE_PAGE_SIZE, DMA_FROM_DEVICE);
+
+	put_page(page);
 
 	sw_buf->page = NULL;
 	sge->addr_hi = 0;
@@ -964,6 +968,25 @@ static inline void bnx2x_set_fw_mac_addr(__le16 *fw_hi, __le16 *fw_mid,
 	((u8 *)fw_lo)[1]  = mac[4];
 }
 
+static inline void bnx2x_free_rx_mem_pool(struct bnx2x *bp,
+					  struct bnx2x_alloc_pool *pool)
+{
+	if (!pool->page)
+		return;
+
+	/* Page was not fully fragmented.  Unmap unused space */
+	if (pool->frag_count) {
+		dma_addr_t dma = pool->dma + pool->offset;
+		int size = pool->frag_count * SGE_PAGE_SIZE;
+
+		dma_unmap_single(&bp->pdev->dev, dma, size, DMA_FROM_DEVICE);
+	}
+
+	put_page(pool->page);
+
+	pool->page = NULL;
+}
+
 static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 					   struct bnx2x_fastpath *fp, int last)
 {
@@ -974,6 +997,8 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 
 	for (i = 0; i < last; i++)
 		bnx2x_free_rx_sge(bp, fp, i);
+
+	bnx2x_free_rx_mem_pool(bp, &fp->page_pool);
 }
 
 static inline void bnx2x_set_next_page_rx_bd(struct bnx2x_fastpath *fp)
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* RE: [PATCH v4] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-21 13:20             ` [PATCH v4] " Gabriel Krisman Bertazi
@ 2015-05-23 10:04               ` Yuval Mintz
  2015-05-23 11:14               ` Lino Sanfilippo
  2015-05-27  8:59               ` Michal Schmidt
  2 siblings, 0 replies; 17+ messages in thread
From: Yuval Mintz @ 2015-05-23 10:04 UTC (permalink / raw)
  To: Gabriel Krisman Bertazi
  Cc: David Miller, LinoSanfilippo, Ariel Elior, netdev, cascardo, brking

> Subject: Re: [PATCH v4] bnx2x: Alloc 4k fragment for each rx ring buffer element
> 
> Yuval Mintz <Yuval.Mintz@qlogic.com> writes:
> 
> > Regardless, I'll give it a more thorough review tomorrow.
> > [If those are all the "problems" we'll find with it, I don't think
> > we'll need to re-spin this once more; that is, unless Dave insists]
> 
> As a follow up, here is a new version that fixes the style issues that Yuval pointed
> out.  Are there any other concerns regarding this one?
> 

Not from me; Thanks again for the effort of doing this.

Acked-by: Yuval Mintz <Yuval.Mintz@qlogic.com>

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-21 13:20             ` [PATCH v4] " Gabriel Krisman Bertazi
  2015-05-23 10:04               ` Yuval Mintz
@ 2015-05-23 11:14               ` Lino Sanfilippo
  2015-05-27  8:59               ` Michal Schmidt
  2 siblings, 0 replies; 17+ messages in thread
From: Lino Sanfilippo @ 2015-05-23 11:14 UTC (permalink / raw)
  To: Gabriel Krisman Bertazi, Yuval Mintz
  Cc: David Miller, Ariel Elior, netdev, cascardo, brking

On 21.05.2015 15:20, Gabriel Krisman Bertazi wrote:
> Yuval Mintz <Yuval.Mintz@qlogic.com> writes:
> 
>> Regardless, I'll give it a more thorough review tomorrow.
>> [If those are all the "problems" we'll find with it, I don't think we'll
>> need to re-spin this once more; that is, unless Dave insists]
> 
> As a follow up, here is a new version that fixes the style issues that Yuval
> pointed out.  Are there any other concerns regarding this one?
> 

Hi Gabriel,

the patch looks good concerning this issue in a former version:
http://marc.info/?l=linux-netdev&m=142991786419997&w=2

And (at least to me) it also looks good on the whole. You can
add Reviewed-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>.

Regards,
Lino

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-21 13:20             ` [PATCH v4] " Gabriel Krisman Bertazi
  2015-05-23 10:04               ` Yuval Mintz
  2015-05-23 11:14               ` Lino Sanfilippo
@ 2015-05-27  8:59               ` Michal Schmidt
  2015-05-27 16:51                 ` [PATCH] " Gabriel Krisman Bertazi
  2015-05-28  7:24                 ` [PATCH v4] " Yuval Mintz
  2 siblings, 2 replies; 17+ messages in thread
From: Michal Schmidt @ 2015-05-27  8:59 UTC (permalink / raw)
  To: Gabriel Krisman Bertazi, Yuval Mintz
  Cc: David Miller, LinoSanfilippo, Ariel Elior, netdev, cascardo, brking

On 05/21/2015 03:20 PM, Gabriel Krisman Bertazi wrote:
> +#define SGE_PAGE_SHIFT		12
> +#define SGE_PAGE_SIZE		(1 << SGE_PAGE_SHIFT)
...
> +struct bnx2x_alloc_pool {
> +	struct page	*page;
> +	dma_addr_t	dma;
> +	u8		offset;
> +	u8		frag_count;
> +};
...
>  static int bnx2x_alloc_rx_sge(struct bnx2x *bp, struct bnx2x_fastpath *fp,
>  			      u16 index, gfp_t gfp_mask)
>  {
        ...
> +	pool->offset += SGE_PAGE_SIZE;
> +	pool->frag_count--;
> +
>  	return 0;
>  }

One SGE_PAGE_SIZE is already bigger than representable by u8, so offset
will overflow.

Isn't storing both 'offset' and 'frag_count' redundant? There is a
simple linear relationship between the two.

Regards,
Michal

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-27  8:59               ` Michal Schmidt
@ 2015-05-27 16:51                 ` Gabriel Krisman Bertazi
  2015-06-01 22:57                   ` David Miller
  2015-05-28  7:24                 ` [PATCH v4] " Yuval Mintz
  1 sibling, 1 reply; 17+ messages in thread
From: Gabriel Krisman Bertazi @ 2015-05-27 16:51 UTC (permalink / raw)
  To: Michal Schmidt
  Cc: Yuval Mintz, David Miller, LinoSanfilippo, Ariel Elior, netdev,
	cascardo, brking

Michal Schmidt <mschmidt@redhat.com> writes:

> One SGE_PAGE_SIZE is already bigger than representable by u8, so offset
> will overflow.

Ouch, this bug was introduced in the last iteration, sorry.  Fixed.

> Isn't storing both 'offset' and 'frag_count' redundant? There is a
> simple linear relationship between the two.

Yes, but I felt it would make the code more readable, even though it
introduced some redundancy.  Anyway, the new version below removes
frag_count and fixes the overflow bug.

Thanks for your review!

-- >8 --
Subject: [PATCH] bnx2x: Alloc 4k fragment for each rx ring buffer element

The driver allocates one page for each buffer on the rx ring, which is
too much on architectures like ppc64 and can cause unexpected allocation
failures when the system is under stress.  Now, we keep a memory pool
per queue, and if the architecture's PAGE_SIZE is greater than 4k, we
fragment pages and assign each 4k segment to a ring element, which
reduces the overall memory consumption on such architectures.  This
helps avoiding errors like the example below:

[bnx2x_alloc_rx_sge:435(eth1)]Can't alloc sge
[c00000037ffeb900] [d000000075eddeb4] .bnx2x_alloc_rx_sge+0x44/0x200 [bnx2x]
[c00000037ffeb9b0] [d000000075ee0b34] .bnx2x_fill_frag_skb+0x1ac/0x460 [bnx2x]
[c00000037ffebac0] [d000000075ee11f0] .bnx2x_tpa_stop+0x160/0x2e8 [bnx2x]
[c00000037ffebb90] [d000000075ee1560] .bnx2x_rx_int+0x1e8/0xc30 [bnx2x]
[c00000037ffebcd0] [d000000075ee2084] .bnx2x_poll+0xdc/0x3d8 [bnx2x] (unreliable)

Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Acked-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
Reviewed-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>
---
 drivers/net/ethernet/broadcom/bnx2x/bnx2x.h     | 16 +++++--
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c | 57 +++++++++++++++++--------
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 31 ++++++++++++--
 3 files changed, 80 insertions(+), 24 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
index 4085c4b..d127b9a 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
@@ -357,6 +357,7 @@ struct sw_tx_bd {
 struct sw_rx_page {
 	struct page	*page;
 	DEFINE_DMA_UNMAP_ADDR(mapping);
+	unsigned int	offset;
 };
 
 union db_prod {
@@ -381,9 +382,10 @@ union db_prod {
 
 #define PAGES_PER_SGE_SHIFT	0
 #define PAGES_PER_SGE		(1 << PAGES_PER_SGE_SHIFT)
-#define SGE_PAGE_SIZE		PAGE_SIZE
-#define SGE_PAGE_SHIFT		PAGE_SHIFT
-#define SGE_PAGE_ALIGN(addr)	PAGE_ALIGN((typeof(PAGE_SIZE))(addr))
+#define SGE_PAGE_SHIFT		12
+#define SGE_PAGE_SIZE		(1 << SGE_PAGE_SHIFT)
+#define SGE_PAGE_MASK		(~(SGE_PAGE_SIZE - 1))
+#define SGE_PAGE_ALIGN(addr)	(((addr) + SGE_PAGE_SIZE - 1) & SGE_PAGE_MASK)
 #define SGE_PAGES		(SGE_PAGE_SIZE * PAGES_PER_SGE)
 #define TPA_AGG_SIZE		min_t(u32, (min_t(u32, 8, MAX_SKB_FRAGS) * \
 					    SGE_PAGES), 0xffff)
@@ -525,6 +527,12 @@ enum bnx2x_tpa_mode_t {
 	TPA_MODE_GRO
 };
 
+struct bnx2x_alloc_pool {
+	struct page	*page;
+	dma_addr_t	dma;
+	unsigned int	offset;
+};
+
 struct bnx2x_fastpath {
 	struct bnx2x		*bp; /* parent */
 
@@ -611,6 +619,8 @@ struct bnx2x_fastpath {
 	     4 (for the digits and to make it DWORD aligned) */
 #define FP_NAME_SIZE		(sizeof(((struct net_device *)0)->name) + 8)
 	char			name[FP_NAME_SIZE];
+
+	struct bnx2x_alloc_pool	page_pool;
 };
 
 #define bnx2x_fp(bp, nr, var)	((bp)->fp[(nr)].var)
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
index 0a9faa1..70dea69 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
@@ -544,30 +544,49 @@ static void bnx2x_set_gro_params(struct sk_buff *skb, u16 parsing_flags,
 static int bnx2x_alloc_rx_sge(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			      u16 index, gfp_t gfp_mask)
 {
-	struct page *page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
 	struct sw_rx_page *sw_buf = &fp->rx_page_ring[index];
 	struct eth_rx_sge *sge = &fp->rx_sge_ring[index];
+	struct bnx2x_alloc_pool *pool = &fp->page_pool;
 	dma_addr_t mapping;
 
-	if (unlikely(page == NULL)) {
-		BNX2X_ERR("Can't alloc sge\n");
-		return -ENOMEM;
-	}
+	if (!pool->page || (PAGE_SIZE - pool->offset) < SGE_PAGE_SIZE) {
 
-	mapping = dma_map_page(&bp->pdev->dev, page, 0,
-			       SGE_PAGES, DMA_FROM_DEVICE);
-	if (unlikely(dma_mapping_error(&bp->pdev->dev, mapping))) {
-		__free_pages(page, PAGES_PER_SGE_SHIFT);
-		BNX2X_ERR("Can't map sge\n");
-		return -ENOMEM;
+		/* put page reference used by the memory pool, since we
+		 * won't be using this page as the mempool anymore.
+		 */
+		if (pool->page)
+			put_page(pool->page);
+
+		pool->page = alloc_pages(gfp_mask, PAGES_PER_SGE_SHIFT);
+		if (unlikely(!pool->page)) {
+			BNX2X_ERR("Can't alloc sge\n");
+			return -ENOMEM;
+		}
+
+		pool->dma = dma_map_page(&bp->pdev->dev, pool->page, 0,
+					 PAGE_SIZE, DMA_FROM_DEVICE);
+		if (unlikely(dma_mapping_error(&bp->pdev->dev,
+					       pool->dma))) {
+			__free_pages(pool->page, PAGES_PER_SGE_SHIFT);
+			pool->page = NULL;
+			BNX2X_ERR("Can't map sge\n");
+			return -ENOMEM;
+		}
+		pool->offset = 0;
 	}
 
-	sw_buf->page = page;
+	get_page(pool->page);
+	sw_buf->page = pool->page;
+	sw_buf->offset = pool->offset;
+
+	mapping = pool->dma + sw_buf->offset;
 	dma_unmap_addr_set(sw_buf, mapping, mapping);
 
 	sge->addr_hi = cpu_to_le32(U64_HI(mapping));
 	sge->addr_lo = cpu_to_le32(U64_LO(mapping));
 
+	pool->offset += SGE_PAGE_SIZE;
+
 	return 0;
 }
 
@@ -629,20 +648,22 @@ static int bnx2x_fill_frag_skb(struct bnx2x *bp, struct bnx2x_fastpath *fp,
 			return err;
 		}
 
-		/* Unmap the page as we're going to pass it to the stack */
-		dma_unmap_page(&bp->pdev->dev,
-			       dma_unmap_addr(&old_rx_pg, mapping),
-			       SGE_PAGES, DMA_FROM_DEVICE);
+		dma_unmap_single(&bp->pdev->dev,
+				 dma_unmap_addr(&old_rx_pg, mapping),
+				 SGE_PAGE_SIZE, DMA_FROM_DEVICE);
 		/* Add one frag and update the appropriate fields in the skb */
 		if (fp->mode == TPA_MODE_LRO)
-			skb_fill_page_desc(skb, j, old_rx_pg.page, 0, frag_len);
+			skb_fill_page_desc(skb, j, old_rx_pg.page,
+					   old_rx_pg.offset, frag_len);
 		else { /* GRO */
 			int rem;
 			int offset = 0;
 			for (rem = frag_len; rem > 0; rem -= gro_size) {
 				int len = rem > gro_size ? gro_size : rem;
 				skb_fill_page_desc(skb, frag_id++,
-						   old_rx_pg.page, offset, len);
+						   old_rx_pg.page,
+						   old_rx_pg.offset + offset,
+						   len);
 				if (offset)
 					get_page(old_rx_pg.page);
 				offset += len;
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
index adcacda..1df9d78 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h
@@ -804,9 +804,13 @@ static inline void bnx2x_free_rx_sge(struct bnx2x *bp,
 	if (!page)
 		return;
 
-	dma_unmap_page(&bp->pdev->dev, dma_unmap_addr(sw_buf, mapping),
-		       SGE_PAGES, DMA_FROM_DEVICE);
-	__free_pages(page, PAGES_PER_SGE_SHIFT);
+	/* Since many fragments can share the same page, make sure to
+	 * only unmap and free the page once.
+	 */
+	dma_unmap_single(&bp->pdev->dev, dma_unmap_addr(sw_buf, mapping),
+			 SGE_PAGE_SIZE, DMA_FROM_DEVICE);
+
+	put_page(page);
 
 	sw_buf->page = NULL;
 	sge->addr_hi = 0;
@@ -964,6 +968,25 @@ static inline void bnx2x_set_fw_mac_addr(__le16 *fw_hi, __le16 *fw_mid,
 	((u8 *)fw_lo)[1]  = mac[4];
 }
 
+static inline void bnx2x_free_rx_mem_pool(struct bnx2x *bp,
+					  struct bnx2x_alloc_pool *pool)
+{
+	if (!pool->page)
+		return;
+
+	/* Page was not fully fragmented.  Unmap unused space */
+	if (pool->offset < PAGE_SIZE) {
+		dma_addr_t dma = pool->dma + pool->offset;
+		int size = PAGE_SIZE - pool->offset;
+
+		dma_unmap_single(&bp->pdev->dev, dma, size, DMA_FROM_DEVICE);
+	}
+
+	put_page(pool->page);
+
+	pool->page = NULL;
+}
+
 static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 					   struct bnx2x_fastpath *fp, int last)
 {
@@ -974,6 +997,8 @@ static inline void bnx2x_free_rx_sge_range(struct bnx2x *bp,
 
 	for (i = 0; i < last; i++)
 		bnx2x_free_rx_sge(bp, fp, i);
+
+	bnx2x_free_rx_mem_pool(bp, &fp->page_pool);
 }
 
 static inline void bnx2x_set_next_page_rx_bd(struct bnx2x_fastpath *fp)
-- 
2.1.0

^ permalink raw reply related	[flat|nested] 17+ messages in thread

* RE: [PATCH v4] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-27  8:59               ` Michal Schmidt
  2015-05-27 16:51                 ` [PATCH] " Gabriel Krisman Bertazi
@ 2015-05-28  7:24                 ` Yuval Mintz
  2015-05-28 13:26                   ` Gabriel Krisman Bertazi
  1 sibling, 1 reply; 17+ messages in thread
From: Yuval Mintz @ 2015-05-28  7:24 UTC (permalink / raw)
  To: Michal Schmidt, Gabriel Krisman Bertazi
  Cc: David Miller, LinoSanfilippo, Ariel Elior, netdev, cascardo, brking

>> +struct bnx2x_alloc_pool {
>> +     struct page     *page;
>> +     dma_addr_t      dma;
>> +     u8              offset;
>> +     u8              frag_count;
>> +};
>...
>>  static int bnx2x_alloc_rx_sge(struct bnx2x *bp, struct bnx2x_fastpath *fp,
>>                             u16 index, gfp_t gfp_mask)
>>  {
>        ...
>> +     pool->offset += SGE_PAGE_SIZE;
>> +     pool->frag_count--;
>> +
>>       return 0;
>>  }

> One SGE_PAGE_SIZE is already bigger than representable by u8, so offset
> will overflow.

Thanks for the catch Michal.

Actually, this upsets me greatly. We didn't see it on a system with 4KB
pages, but this means you've actually tried to 'sell' us a fastpath fix that
was never tested on machines for which it was meant as an improvement.

Dave - if possible, please wait with accepting any further fixes for this
issue until we [qlogic] manage to prepare a test environment
where we can properly test this with 64KB page size architecture.

Thanks,
Yuval

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH v4] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-28  7:24                 ` [PATCH v4] " Yuval Mintz
@ 2015-05-28 13:26                   ` Gabriel Krisman Bertazi
  0 siblings, 0 replies; 17+ messages in thread
From: Gabriel Krisman Bertazi @ 2015-05-28 13:26 UTC (permalink / raw)
  To: Yuval Mintz
  Cc: Michal Schmidt, David Miller, LinoSanfilippo, Ariel Elior,
	netdev, cascardo, brking

Yuval Mintz <Yuval.Mintz@qlogic.com> writes:

> Actually, this upsets me greatly. We didn't see it on a system with 4KB
> pages, but this means you've actually tried to 'sell' us a fastpath fix that
> was never tested on machines for which it was meant as an improvement.

The iteration that inserted this bug was such a quick fix that I didn't
bother rerunning it.  It just modified the type variable.  Take my word
it was a one time honest mistake.

The last iteration, as well as all of the previous ones were tested on
Power servers with bnx2x adapters.  The tests included stressing the
device and several iterations of hotplugging the adapter, helping us to
verify correctness of the rx queue operation as well as verifying that DMA
mapping/unmapping were correct.

-- 
Gabriel Krisman Bertazi

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH] bnx2x: Alloc 4k fragment for each rx ring buffer element
  2015-05-27 16:51                 ` [PATCH] " Gabriel Krisman Bertazi
@ 2015-06-01 22:57                   ` David Miller
  0 siblings, 0 replies; 17+ messages in thread
From: David Miller @ 2015-06-01 22:57 UTC (permalink / raw)
  To: krisman
  Cc: mschmidt, Yuval.Mintz, LinoSanfilippo, Ariel.Elior, netdev,
	cascardo, brking

From: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
Date: Wed, 27 May 2015 13:51:43 -0300

> Subject: [PATCH] bnx2x: Alloc 4k fragment for each rx ring buffer element
> 
> The driver allocates one page for each buffer on the rx ring, which is
> too much on architectures like ppc64 and can cause unexpected allocation
> failures when the system is under stress.  Now, we keep a memory pool
> per queue, and if the architecture's PAGE_SIZE is greater than 4k, we
> fragment pages and assign each 4k segment to a ring element, which
> reduces the overall memory consumption on such architectures.  This
> helps avoiding errors like the example below:
> 
> [bnx2x_alloc_rx_sge:435(eth1)]Can't alloc sge
> [c00000037ffeb900] [d000000075eddeb4] .bnx2x_alloc_rx_sge+0x44/0x200 [bnx2x]
> [c00000037ffeb9b0] [d000000075ee0b34] .bnx2x_fill_frag_skb+0x1ac/0x460 [bnx2x]
> [c00000037ffebac0] [d000000075ee11f0] .bnx2x_tpa_stop+0x160/0x2e8 [bnx2x]
> [c00000037ffebb90] [d000000075ee1560] .bnx2x_rx_int+0x1e8/0xc30 [bnx2x]
> [c00000037ffebcd0] [d000000075ee2084] .bnx2x_poll+0xdc/0x3d8 [bnx2x] (unreliable)
> 
> Signed-off-by: Gabriel Krisman Bertazi <krisman@linux.vnet.ibm.com>
> Acked-by: Yuval Mintz <Yuval.Mintz@qlogic.com>
> Reviewed-by: Lino Sanfilippo <LinoSanfilippo@gmx.de>

I think I've waited long enough on this, applied to net-next, thanks.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2015-06-01 22:57 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-04-24 20:45 [PATCH v2] bnx2x: Alloc 4k fragment for each rx ring buffer element Gabriel Krisman Bertazi
2015-04-24 23:24 ` Lino Sanfilippo
2015-04-29 13:30   ` Gabriel Krisman Bertazi
2015-04-30 11:25     ` Yuval Mintz
2015-04-30 20:05       ` David Miller
2015-05-04 19:32         ` [PATCH v3] " Gabriel Krisman Bertazi
2015-05-04 19:58           ` Yuval Mintz
2015-05-15 20:43             ` Gabriel Krisman Bertazi
2015-05-17  7:14               ` Yuval Mintz
2015-05-21 13:20             ` [PATCH v4] " Gabriel Krisman Bertazi
2015-05-23 10:04               ` Yuval Mintz
2015-05-23 11:14               ` Lino Sanfilippo
2015-05-27  8:59               ` Michal Schmidt
2015-05-27 16:51                 ` [PATCH] " Gabriel Krisman Bertazi
2015-06-01 22:57                   ` David Miller
2015-05-28  7:24                 ` [PATCH v4] " Yuval Mintz
2015-05-28 13:26                   ` Gabriel Krisman Bertazi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).