* [PATCH] mlx4_en: map entire pages to increase throughput
@ 2012-07-16 17:01 Thadeu Lima de Souza Cascardo
  2012-07-16 17:27 ` Rick Jones
  2012-07-19 17:53 ` David Miller
  0 siblings, 2 replies; 28+ messages in thread
From: Thadeu Lima de Souza Cascardo @ 2012-07-16 17:01 UTC (permalink / raw)
  To: davem
  Cc: netdev, yevgenyp, ogerlitz, amirv, brking, leitao, klebers,
	Thadeu Lima de Souza Cascardo

In its receive path, mlx4_en driver maps each page chunk that it pushes
to the hardware and unmaps it when pushing it up the stack. This limits
throughput to about 3Gbps on a Power7 8-core machine.

One solution is to map the entire allocated page at once. However, this
requires that we keep track of every page fragment we give to a
descriptor. We also need to work with the discipline that all fragments
will be released (in the sense that they will not be reused by the
driver anymore) in the order they are allocated to the driver.

This requires that we don't reuse any fragments; every single one of
them must be reallocated. We do that by releasing all the fragments
that have been processed, and only after we finish processing the
descriptors do we start the refill.

We also must somehow guarantee that we either refill all fragments in a
descriptor or none at all, without resorting to giving up a page
fragment that we would have already given. Otherwise, we would break the
discipline of only releasing the fragments in the order they were
allocated.
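
For illustration, the all-or-nothing refill can be sketched roughly as
below (pseudocode with invented helper names; the real code is in
mlx4_en_alloc_frags() in the patch):

	static int refill_desc(struct ring *ring, struct frag *ring_frags)
	{
		struct frag tmp[MAX_FRAGS];
		int i;

		/* phase 1: allocate and map every fragment into a scratch array */
		for (i = 0; i < ring->num_frags; i++)
			if (alloc_and_map(&tmp[i]))
				goto undo;	/* any failure: commit nothing */

		/* phase 2: commit all fragments to the descriptor at once */
		for (i = 0; i < ring->num_frags; i++)
			ring_frags[i] = tmp[i];
		return 0;

	undo:
		while (i--)
			unmap_and_free(&tmp[i]);	/* roll back what was mapped */
		return -ENOMEM;
	}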

This has passed page allocation fault injections (restricted to the
driver by using required-start and required-end) and device hotplug
while 16 TCP streams were able to deliver more than 9Gbps.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_rx.c   |  237 ++++++++++++++------------
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h |    3 +-
 2 files changed, 131 insertions(+), 109 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index a04cbf7..37ac073 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -41,41 +41,75 @@
 
 #include "mlx4_en.h"
 
-
-static int mlx4_en_alloc_frag(struct mlx4_en_priv *priv,
-			      struct mlx4_en_rx_desc *rx_desc,
-			      struct page_frag *skb_frags,
-			      struct mlx4_en_rx_alloc *ring_alloc,
-			      int i)
+static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
+			       struct mlx4_en_rx_desc *rx_desc,
+			       struct mlx4_en_rx_alloc *frags,
+			       struct mlx4_en_rx_alloc *ring_alloc)
 {
-	struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
-	struct mlx4_en_rx_alloc *page_alloc = &ring_alloc[i];
+	struct mlx4_en_rx_alloc page_alloc[MLX4_EN_MAX_RX_FRAGS];
+	struct mlx4_en_frag_info *frag_info;
 	struct page *page;
 	dma_addr_t dma;
+	int i;
 
-	if (page_alloc->offset == frag_info->last_offset) {
-		/* Allocate new page */
-		page = alloc_pages(GFP_ATOMIC | __GFP_COMP, MLX4_EN_ALLOC_ORDER);
-		if (!page)
-			return -ENOMEM;
-
-		skb_frags[i].page = page_alloc->page;
-		skb_frags[i].offset = page_alloc->offset;
-		page_alloc->page = page;
-		page_alloc->offset = frag_info->frag_align;
-	} else {
-		page = page_alloc->page;
-		get_page(page);
+	for (i = 0; i < priv->num_frags; i++) {
+		frag_info = &priv->frag_info[i];
+		if (ring_alloc[i].offset == frag_info->last_offset) {
+			page = alloc_pages(GFP_ATOMIC | __GFP_COMP,
+					MLX4_EN_ALLOC_ORDER);
+			if (!page)
+				goto out;
+			dma = dma_map_page(priv->ddev, page, 0,
+				MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE);
+			if (dma_mapping_error(priv->ddev, dma)) {
+				put_page(page);
+				goto out;
+			}
+			page_alloc[i].page = page;
+			page_alloc[i].dma = dma;
+			page_alloc[i].offset = frag_info->frag_align;
+		} else {
+			page_alloc[i].page = ring_alloc[i].page;
+			get_page(ring_alloc[i].page);
+			page_alloc[i].dma = ring_alloc[i].dma;
+			page_alloc[i].offset = ring_alloc[i].offset +
+						frag_info->frag_stride;
+		}
+	}
 
-		skb_frags[i].page = page;
-		skb_frags[i].offset = page_alloc->offset;
-		page_alloc->offset += frag_info->frag_stride;
+	for (i = 0; i < priv->num_frags; i++) {
+		frags[i] = ring_alloc[i];
+		dma = ring_alloc[i].dma + ring_alloc[i].offset;
+		ring_alloc[i] = page_alloc[i];
+		rx_desc->data[i].addr = cpu_to_be64(dma);
 	}
-	dma = dma_map_single(priv->ddev, page_address(skb_frags[i].page) +
-			     skb_frags[i].offset, frag_info->frag_size,
-			     PCI_DMA_FROMDEVICE);
-	rx_desc->data[i].addr = cpu_to_be64(dma);
+
 	return 0;
+
+
+out:
+	while (i--) {
+		frag_info = &priv->frag_info[i];
+		if (ring_alloc[i].offset == frag_info->last_offset)
+			dma_unmap_page(priv->ddev, page_alloc[i].dma,
+				MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE);
+		put_page(page_alloc[i].page);
+	}
+	return -ENOMEM;
+}
+
+static void mlx4_en_free_frag(struct mlx4_en_priv *priv,
+			      struct mlx4_en_rx_alloc *frags,
+			      int i)
+{
+	struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
+
+	if (frags[i].offset == frag_info->last_offset) {
+		dma_unmap_page(priv->ddev, frags[i].dma, MLX4_EN_ALLOC_SIZE,
+					 PCI_DMA_FROMDEVICE);
+	}
+	if (frags[i].page)
+		put_page(frags[i].page);
 }
 
 static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
@@ -91,6 +125,13 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
 		if (!page_alloc->page)
 			goto out;
 
+		page_alloc->dma = dma_map_page(priv->ddev, page_alloc->page, 0,
+					MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE);
+		if (dma_mapping_error(priv->ddev, page_alloc->dma)) {
+			put_page(page_alloc->page);
+			page_alloc->page = NULL;
+			goto out;
+		}
 		page_alloc->offset = priv->frag_info[i].frag_align;
 		en_dbg(DRV, priv, "Initialized allocator:%d with page:%p\n",
 		       i, page_alloc->page);
@@ -100,6 +141,8 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
 out:
 	while (i--) {
 		page_alloc = &ring->page_alloc[i];
+		dma_unmap_page(priv->ddev, page_alloc->dma,
+				MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE);
 		put_page(page_alloc->page);
 		page_alloc->page = NULL;
 	}
@@ -117,24 +160,22 @@ static void mlx4_en_destroy_allocator(struct mlx4_en_priv *priv,
 		en_dbg(DRV, priv, "Freeing allocator:%d count:%d\n",
 		       i, page_count(page_alloc->page));
 
+		dma_unmap_page(priv->ddev, page_alloc->dma,
+				MLX4_EN_ALLOC_SIZE, PCI_DMA_FROMDEVICE);
 		put_page(page_alloc->page);
 		page_alloc->page = NULL;
 	}
 }
 
-
 static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
 				 struct mlx4_en_rx_ring *ring, int index)
 {
 	struct mlx4_en_rx_desc *rx_desc = ring->buf + ring->stride * index;
-	struct skb_frag_struct *skb_frags = ring->rx_info +
-					    (index << priv->log_rx_info);
 	int possible_frags;
 	int i;
 
 	/* Set size and memtype fields */
 	for (i = 0; i < priv->num_frags; i++) {
-		skb_frag_size_set(&skb_frags[i], priv->frag_info[i].frag_size);
 		rx_desc->data[i].byte_count =
 			cpu_to_be32(priv->frag_info[i].frag_size);
 		rx_desc->data[i].lkey = cpu_to_be32(priv->mdev->mr.key);
@@ -151,29 +192,14 @@ static void mlx4_en_init_rx_desc(struct mlx4_en_priv *priv,
 	}
 }
 
-
 static int mlx4_en_prepare_rx_desc(struct mlx4_en_priv *priv,
 				   struct mlx4_en_rx_ring *ring, int index)
 {
 	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index * ring->stride);
-	struct page_frag *skb_frags = ring->rx_info +
-				      (index << priv->log_rx_info);
-	int i;
+	struct mlx4_en_rx_alloc *frags = ring->rx_info +
+					(index << priv->log_rx_info);
 
-	for (i = 0; i < priv->num_frags; i++)
-		if (mlx4_en_alloc_frag(priv, rx_desc, skb_frags, ring->page_alloc, i))
-			goto err;
-
-	return 0;
-
-err:
-	while (i--) {
-		dma_addr_t dma = be64_to_cpu(rx_desc->data[i].addr);
-		pci_unmap_single(priv->mdev->pdev, dma, skb_frags[i].size,
-				 PCI_DMA_FROMDEVICE);
-		put_page(skb_frags[i].page);
-	}
-	return -ENOMEM;
+	return mlx4_en_alloc_frags(priv, rx_desc, frags, ring->page_alloc);
 }
 
 static inline void mlx4_en_update_rx_prod_db(struct mlx4_en_rx_ring *ring)
@@ -185,20 +211,13 @@ static void mlx4_en_free_rx_desc(struct mlx4_en_priv *priv,
 				 struct mlx4_en_rx_ring *ring,
 				 int index)
 {
-	struct page_frag *skb_frags;
-	struct mlx4_en_rx_desc *rx_desc = ring->buf + (index << ring->log_stride);
-	dma_addr_t dma;
+	struct mlx4_en_rx_alloc *frags;
 	int nr;
 
-	skb_frags = ring->rx_info + (index << priv->log_rx_info);
+	frags = ring->rx_info + (index << priv->log_rx_info);
 	for (nr = 0; nr < priv->num_frags; nr++) {
 		en_dbg(DRV, priv, "Freeing fragment:%d\n", nr);
-		dma = be64_to_cpu(rx_desc->data[nr].addr);
-
-		en_dbg(DRV, priv, "Unmapping buffer at dma:0x%llx\n", (u64) dma);
-		dma_unmap_single(priv->ddev, dma, skb_frags[nr].size,
-				 PCI_DMA_FROMDEVICE);
-		put_page(skb_frags[nr].page);
+		mlx4_en_free_frag(priv, frags, nr);
 	}
 }
 
@@ -268,10 +287,9 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 			   struct mlx4_en_rx_ring *ring, u32 size, u16 stride)
 {
 	struct mlx4_en_dev *mdev = priv->mdev;
-	int err;
+	int err = -ENOMEM;
 	int tmp;
 
-
 	ring->prod = 0;
 	ring->cons = 0;
 	ring->size = size;
@@ -281,7 +299,7 @@ int mlx4_en_create_rx_ring(struct mlx4_en_priv *priv,
 	ring->buf_size = ring->size * ring->stride + TXBB_SIZE;
 
 	tmp = size * roundup_pow_of_two(MLX4_EN_MAX_RX_FRAGS *
-					sizeof(struct skb_frag_struct));
+					sizeof(struct mlx4_en_rx_alloc));
 	ring->rx_info = vmalloc(tmp);
 	if (!ring->rx_info)
 		return -ENOMEM;
@@ -338,7 +356,7 @@ int mlx4_en_activate_rx_rings(struct mlx4_en_priv *priv)
 		memset(ring->buf, 0, ring->buf_size);
 		mlx4_en_update_rx_prod_db(ring);
 
-		/* Initailize all descriptors */
+		/* Initialize all descriptors */
 		for (i = 0; i < ring->size; i++)
 			mlx4_en_init_rx_desc(priv, ring, i);
 
@@ -401,12 +419,10 @@ void mlx4_en_deactivate_rx_ring(struct mlx4_en_priv *priv,
 }
 
 
-/* Unmap a completed descriptor and free unused pages */
 static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 				    struct mlx4_en_rx_desc *rx_desc,
-				    struct page_frag *skb_frags,
+				    struct mlx4_en_rx_alloc *frags,
 				    struct sk_buff *skb,
-				    struct mlx4_en_rx_alloc *page_alloc,
 				    int length)
 {
 	struct skb_frag_struct *skb_frags_rx = skb_shinfo(skb)->frags;
@@ -414,26 +430,24 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 	int nr;
 	dma_addr_t dma;
 
-	/* Collect used fragments while replacing them in the HW descirptors */
+	/* Collect used fragments while replacing them in the HW descriptors */
 	for (nr = 0; nr < priv->num_frags; nr++) {
 		frag_info = &priv->frag_info[nr];
 		if (length <= frag_info->frag_prefix_size)
 			break;
+		if (!frags[nr].page)
+			goto fail;
 
-		/* Save page reference in skb */
-		__skb_frag_set_page(&skb_frags_rx[nr], skb_frags[nr].page);
-		skb_frag_size_set(&skb_frags_rx[nr], skb_frags[nr].size);
-		skb_frags_rx[nr].page_offset = skb_frags[nr].offset;
-		skb->truesize += frag_info->frag_stride;
 		dma = be64_to_cpu(rx_desc->data[nr].addr);
+		dma_sync_single_for_cpu(priv->ddev, dma, frag_info->frag_size,
+					DMA_FROM_DEVICE);
 
-		/* Allocate a replacement page */
-		if (mlx4_en_alloc_frag(priv, rx_desc, skb_frags, page_alloc, nr))
-			goto fail;
-
-		/* Unmap buffer */
-		dma_unmap_single(priv->ddev, dma, skb_frag_size(&skb_frags_rx[nr]),
-				 PCI_DMA_FROMDEVICE);
+		/* Save page reference in skb */
+		get_page(frags[nr].page);
+		__skb_frag_set_page(&skb_frags_rx[nr], frags[nr].page);
+		skb_frag_size_set(&skb_frags_rx[nr], frag_info->frag_size);
+		skb_frags_rx[nr].page_offset = frags[nr].offset;
+		skb->truesize += frag_info->frag_stride;
 	}
 	/* Adjust size of last fragment to match actual length */
 	if (nr > 0)
@@ -442,8 +456,6 @@ static int mlx4_en_complete_rx_desc(struct mlx4_en_priv *priv,
 	return nr;
 
 fail:
-	/* Drop all accumulated fragments (which have already been replaced in
-	 * the descriptor) of this packet; remaining fragments are reused... */
 	while (nr > 0) {
 		nr--;
 		__skb_frag_unref(&skb_frags_rx[nr]);
@@ -454,8 +466,7 @@ fail:
 
 static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
 				      struct mlx4_en_rx_desc *rx_desc,
-				      struct page_frag *skb_frags,
-				      struct mlx4_en_rx_alloc *page_alloc,
+				      struct mlx4_en_rx_alloc *frags,
 				      unsigned int length)
 {
 	struct sk_buff *skb;
@@ -473,23 +484,20 @@ static struct sk_buff *mlx4_en_rx_skb(struct mlx4_en_priv *priv,
 
 	/* Get pointer to first fragment so we could copy the headers into the
 	 * (linear part of the) skb */
-	va = page_address(skb_frags[0].page) + skb_frags[0].offset;
+	va = page_address(frags[0].page) + frags[0].offset;
 
 	if (length <= SMALL_PACKET_SIZE) {
 		/* We are copying all relevant data to the skb - temporarily
-		 * synch buffers for the copy */
+		 * sync buffers for the copy */
 		dma = be64_to_cpu(rx_desc->data[0].addr);
 		dma_sync_single_for_cpu(priv->ddev, dma, length,
 					DMA_FROM_DEVICE);
 		skb_copy_to_linear_data(skb, va, length);
-		dma_sync_single_for_device(priv->ddev, dma, length,
-					   DMA_FROM_DEVICE);
 		skb->tail += length;
 	} else {
-
 		/* Move relevant fragments to skb */
-		used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, skb_frags,
-						      skb, page_alloc, length);
+		used_frags = mlx4_en_complete_rx_desc(priv, rx_desc, frags,
+							skb, length);
 		if (unlikely(!used_frags)) {
 			kfree_skb(skb);
 			return NULL;
@@ -526,12 +534,25 @@ out_loopback:
 	dev_kfree_skb_any(skb);
 }
 
+static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
+				     struct mlx4_en_rx_ring *ring)
+{
+	int index = ring->prod & ring->size_mask;
+
+	while ((u32) (ring->prod - ring->cons) < ring->actual_size) {
+		if (mlx4_en_prepare_rx_desc(priv, ring, index))
+			break;
+		ring->prod++;
+		index = ring->prod & ring->size_mask;
+	}
+}
+
 int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
 	struct mlx4_cqe *cqe;
 	struct mlx4_en_rx_ring *ring = &priv->rx_ring[cq->ring];
-	struct page_frag *skb_frags;
+	struct mlx4_en_rx_alloc *frags;
 	struct mlx4_en_rx_desc *rx_desc;
 	struct sk_buff *skb;
 	int index;
@@ -540,6 +561,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	int polled = 0;
 	int ip_summed;
 	struct ethhdr *ethh;
+	dma_addr_t dma;
 	u64 s_mac;
 
 	if (!priv->port_up)
@@ -555,7 +577,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	while (XNOR(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK,
 		    cq->mcq.cons_index & cq->size)) {
 
-		skb_frags = ring->rx_info + (index << priv->log_rx_info);
+		frags = ring->rx_info + (index << priv->log_rx_info);
 		rx_desc = ring->buf + (index << ring->log_stride);
 
 		/*
@@ -579,8 +601,11 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 
 		/* Get pointer to first fragment since we haven't skb yet and
 		 * cast it to ethhdr struct */
-		ethh = (struct ethhdr *)(page_address(skb_frags[0].page) +
-					 skb_frags[0].offset);
+		dma = be64_to_cpu(rx_desc->data[0].addr);
+		dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
+					DMA_FROM_DEVICE);
+		ethh = (struct ethhdr *)(page_address(frags[0].page) +
+					 frags[0].offset);
 		s_mac = mlx4_en_mac_to_u64(ethh->h_source);
 
 		/* If source MAC is equal to our own MAC and not performing
@@ -612,10 +637,9 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 					if (!gro_skb)
 						goto next;
 
-					nr = mlx4_en_complete_rx_desc(
-						priv, rx_desc,
-						skb_frags, gro_skb,
-						ring->page_alloc, length);
+					nr = mlx4_en_complete_rx_desc(priv,
+						rx_desc, frags, gro_skb,
+						length);
 					if (!nr)
 						goto next;
 
@@ -651,8 +675,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 			ring->csum_none++;
 		}
 
-		skb = mlx4_en_rx_skb(priv, rx_desc, skb_frags,
-				     ring->page_alloc, length);
+		skb = mlx4_en_rx_skb(priv, rx_desc, frags, length);
 		if (!skb) {
 			priv->stats.rx_dropped++;
 			goto next;
@@ -678,6 +701,9 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 		netif_receive_skb(skb);
 
 next:
+		for (nr = 0; nr < priv->num_frags; nr++)
+			mlx4_en_free_frag(priv, frags, nr);
+
 		++cq->mcq.cons_index;
 		index = (cq->mcq.cons_index) & ring->size_mask;
 		cqe = &cq->buf[index];
@@ -693,7 +719,7 @@ out:
 	mlx4_cq_set_ci(&cq->mcq);
 	wmb(); /* ensure HW sees CQ consumer before we post new buffers */
 	ring->cons = cq->mcq.cons_index;
-	ring->prod += polled; /* Polled descriptors were realocated in place */
+	mlx4_en_refill_rx_buffers(priv, ring);
 	mlx4_en_update_rx_prod_db(ring);
 	return polled;
 }
@@ -782,7 +808,7 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
 
 	priv->num_frags = i;
 	priv->rx_skb_size = eff_mtu;
-	priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct skb_frag_struct));
+	priv->log_rx_info = ROUNDUP_LOG2(i * sizeof(struct mlx4_en_rx_alloc));
 
 	en_dbg(DRV, priv, "Rx buffer scatter-list (effective-mtu:%d "
 		  "num_frags:%d):\n", eff_mtu, priv->num_frags);
@@ -984,8 +1010,3 @@ void mlx4_en_release_rss_steer(struct mlx4_en_priv *priv)
 	}
 	mlx4_qp_release_range(mdev->dev, rss_map->base_qpn, priv->rx_ring_num);
 }
-
-
-
-
-
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index a126321..f2fc90d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -107,7 +107,7 @@ enum {
 #define MLX4_EN_MAX_TX_SIZE	8192
 #define MLX4_EN_MAX_RX_SIZE	8192
 
-/* Minimum ring size for our page-allocation sceme to work */
+/* Minimum ring size for our page-allocation scheme to work */
 #define MLX4_EN_MIN_RX_SIZE	(MLX4_EN_ALLOC_SIZE / SMP_CACHE_BYTES)
 #define MLX4_EN_MIN_TX_SIZE	(4096 / TXBB_SIZE)
 
@@ -228,6 +228,7 @@ struct mlx4_en_tx_desc {
 
 struct mlx4_en_rx_alloc {
 	struct page *page;
+	dma_addr_t dma;
 	u16 offset;
 };
 
-- 
1.7.4.4

^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 17:01 [PATCH] mlx4_en: map entire pages to increase throughput Thadeu Lima de Souza Cascardo
@ 2012-07-16 17:27 ` Rick Jones
  2012-07-16 19:06     ` Thadeu Lima de Souza Cascardo
  2012-07-17  5:29   ` David Miller
  2012-07-19 17:53 ` David Miller
  1 sibling, 2 replies; 28+ messages in thread
From: Rick Jones @ 2012-07-16 17:27 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo
  Cc: davem, netdev, yevgenyp, ogerlitz, amirv, brking, leitao, klebers

On 07/16/2012 10:01 AM, Thadeu Lima de Souza Cascardo wrote:
> In its receive path, mlx4_en driver maps each page chunk that it pushes
> to the hardware and unmaps it when pushing it up the stack. This limits
> throughput to about 3Gbps on a Power7 8-core machine.

That seems rather extraordinarily low - Power7 is supposed to be a 
rather high performance CPU.  The last time I noticed O(3Gbit/s) on 10G 
for bulk transfer was before the advent of LRO/GRO - that was in the x86 
space though.  Is mapping really that expensive with Power7?


> One solution is to map the entire allocated page at once. However, this
> requires that we keep track of every page fragment we give to a
> descriptor. We also need to work with the discipline that all fragments will
> be released (in the sense that it will not be reused by the driver
> anymore) in the order they are allocated to the driver.
>
> This requires that we don't reuse any fragments, every single one of
> them must be reallocated. We do that by releasing all the fragments that
> are processed and only after finished processing the descriptors, we
> start the refill.
>
> We also must somehow guarantee that we either refill all fragments in a
> descriptor or none at all, without resorting to giving up a page
> fragment that we would have already given. Otherwise, we would break the
> discipline of only releasing the fragments in the order they were
> allocated.
>
> This has passed page allocation fault injections (restricted to the
> driver by using required-start and required-end) and device hotplug
> while 16 TCP streams were able to deliver more than 9Gbps.

What is the effect on packet-per-second performance?  (eg aggregate, 
burst-mode netperf TCP_RR with TCP_NODELAY set or perhaps UDP_RR)

rick jones

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 17:27 ` Rick Jones
@ 2012-07-16 19:06     ` Thadeu Lima de Souza Cascardo
  2012-07-17  5:29   ` David Miller
  1 sibling, 0 replies; 28+ messages in thread
From: Thadeu Lima de Souza Cascardo @ 2012-07-16 19:06 UTC (permalink / raw)
  To: Rick Jones
  Cc: davem, netdev, yevgenyp, ogerlitz, amirv, brking, leitao,
	klebers, linuxppc-dev, anton

On Mon, Jul 16, 2012 at 10:27:57AM -0700, Rick Jones wrote:
> On 07/16/2012 10:01 AM, Thadeu Lima de Souza Cascardo wrote:
> >In its receive path, mlx4_en driver maps each page chunk that it pushes
> >to the hardware and unmaps it when pushing it up the stack. This limits
> >throughput to about 3Gbps on a Power7 8-core machine.
> 
> That seems rather extraordinarily low - Power7 is supposed to be a
> rather high performance CPU.  The last time I noticed O(3Gbit/s) on
> 10G for bulk transfer was before the advent of LRO/GRO - that was in
> the x86 space though.  Is mapping really that expensive with Power7?
> 

Copying linuxppc-dev and Anton here. But I can tell you that we have
lock contention when doing the mapping on the same adapter (map table
per device). Anton has sent some patches that improve that *a lot*.

However, for 1500 MTU, mlx4_en was doing two unmaps and two maps per
packet. The problem is not the CPU power needed to do the mappings, but
that we hit lock contention and end up with the CPUs spending more than
30% of their time on spin locking.
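
Roughly, the per-packet mapping work changes like this (an illustrative
sketch using the generic DMA API, not the exact code in the patch):

	/* before: map and unmap every fragment of every received packet */
	dma = dma_map_single(dev, page_address(page) + offset,
			     frag_size, DMA_FROM_DEVICE);
	/* ... hardware fills the buffer, stack consumes it ... */
	dma_unmap_single(dev, dma, frag_size, DMA_FROM_DEVICE);

	/* after: map the whole page once; per packet we only sync the
	 * fragment for the CPU, and the page is unmapped when its last
	 * fragment is released */
	dma = dma_map_page(dev, page, 0, MLX4_EN_ALLOC_SIZE, DMA_FROM_DEVICE);
	/* ... */
	dma_sync_single_for_cpu(dev, dma + offset, frag_size, DMA_FROM_DEVICE);
	/* ... */
	dma_unmap_page(dev, dma, MLX4_EN_ALLOC_SIZE, DMA_FROM_DEVICE);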

> 
> >One solution is to map the entire allocated page at once. However, this
> >requires that we keep track of every page fragment we give to a
> >descriptor. We also need to work with the discipline that all fragments will
> >be released (in the sense that it will not be reused by the driver
> >anymore) in the order they are allocated to the driver.
> >
> >This requires that we don't reuse any fragments, every single one of
> >them must be reallocated. We do that by releasing all the fragments that
> >are processed and only after finished processing the descriptors, we
> >start the refill.
> >
> >We also must somehow guarantee that we either refill all fragments in a
> >descriptor or none at all, without resorting to giving up a page
> >fragment that we would have already given. Otherwise, we would break the
> >discipline of only releasing the fragments in the order they were
> >allocated.
> >
> >This has passed page allocation fault injections (restricted to the
> >driver by using required-start and required-end) and device hotplug
> >while 16 TCP streams were able to deliver more than 9Gbps.
> 
> What is the effect on packet-per-second performance?  (eg aggregate,
> burst-mode netperf TCP_RR with TCP_NODELAY set or perhaps UDP_RR)
> 

I used uperf with TCP_NODELAY and 16 threads sending from another
machine 64000-sized writes for 60 seconds.

I get 5898op/s (3.02Gb/s) without the patch against 18022ops/s
(9.23Gb/s) with the patch.

Best regards.
Cascardo.


> rick jones
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 19:06     ` Thadeu Lima de Souza Cascardo
@ 2012-07-16 19:42       ` Rick Jones
  -1 siblings, 0 replies; 28+ messages in thread
From: Rick Jones @ 2012-07-16 19:42 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo
  Cc: davem, netdev, yevgenyp, ogerlitz, amirv, brking, leitao,
	klebers, linuxppc-dev, anton

On 07/16/2012 12:06 PM, Thadeu Lima de Souza Cascardo wrote:
> On Mon, Jul 16, 2012 at 10:27:57AM -0700, Rick Jones wrote:
>
>> What is the effect on packet-per-second performance?  (eg aggregate,
>> burst-mode netperf TCP_RR with TCP_NODELAY set or perhaps UDP_RR)
>>
> I used uperf with TCP_NODELAY and 16 threads sending from another
> machine 64000-sized writes for 60 seconds.
>
> I get 5898op/s (3.02Gb/s) without the patch against 18022ops/s
> (9.23Gb/s) with the patch.

I was thinking more along the lines of an additional comparison, 
explicitly using netperf TCP_RR or something like it, not just the 
packets per second from a bulk transfer test.

rick

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 19:42       ` Rick Jones
@ 2012-07-16 20:36         ` Or Gerlitz
  -1 siblings, 0 replies; 28+ messages in thread
From: Or Gerlitz @ 2012-07-16 20:36 UTC (permalink / raw)
  To: Rick Jones
  Cc: netdev, leitao, amirv, yevgenyp, klebers,
	Thadeu Lima de Souza Cascardo, brking, ogerlitz, linuxppc-dev,
	davem, anton


On Mon, Jul 16, 2012 at 10:42 PM, Rick Jones <rick.jones2@hp.com> wrote:

> I was thinking more along the lines of an additional comparison,
> explicitly using netperf TCP_RR or something like it, not just the packets
> per second from a bulk transfer test.
>

TCP_STREAM would be good to know here as well

Or.


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 19:42       ` Rick Jones
@ 2012-07-16 20:43         ` Or Gerlitz
  -1 siblings, 0 replies; 28+ messages in thread
From: Or Gerlitz @ 2012-07-16 20:43 UTC (permalink / raw)
  To: Rick Jones, Thadeu Lima de Souza Cascardo
  Cc: davem, netdev, yevgenyp, ogerlitz, amirv, brking, leitao,
	klebers, linuxppc-dev, anton

On Mon, Jul 16, 2012 at 10:42 PM, Rick Jones <rick.jones2@hp.com> wrote:

> I was thinking more along the lines of an additional comparison,
> explicitly using netperf TCP_RR or something like it, not just the packets
> per second from a bulk transfer test.


TCP_STREAM from this setup before the patch would be good to know as well

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 19:42       ` Rick Jones
@ 2012-07-16 20:47         ` Thadeu Lima de Souza Cascardo
  -1 siblings, 0 replies; 28+ messages in thread
From: Thadeu Lima de Souza Cascardo @ 2012-07-16 20:47 UTC (permalink / raw)
  To: Rick Jones
  Cc: davem, netdev, yevgenyp, ogerlitz, amirv, brking, leitao,
	klebers, linuxppc-dev, anton

On Mon, Jul 16, 2012 at 12:42:41PM -0700, Rick Jones wrote:
> On 07/16/2012 12:06 PM, Thadeu Lima de Souza Cascardo wrote:
> >On Mon, Jul 16, 2012 at 10:27:57AM -0700, Rick Jones wrote:
> >
> >>What is the effect on packet-per-second performance?  (eg aggregate,
> >>burst-mode netperf TCP_RR with TCP_NODELAY set or perhaps UDP_RR)
> >>
> >I used uperf with TCP_NODELAY and 16 threads sending from another
> >machine 64000-sized writes for 60 seconds.
> >
> >I get 5898op/s (3.02Gb/s) without the patch against 18022ops/s
> >(9.23Gb/s) with the patch.
> 
> I was thinking more along the lines of an additional comparison,
> explicitly using netperf TCP_RR or something like it, not just the
> packets per second from a bulk transfer test.
> 
> rick

I used a uperf profile that is similar to TCP_RR. It writes, then reads
some bytes. I kept the TCP_NODELAY flag.

Without the patch, I saw the following:

packet size	ops/s		Gb/s
1		337024		0.0027
90		276620		0.199
900		190455		1.37
4000		68863		2.20
9000		45638		3.29
60000		9409		4.52

With the patch:

packet size	ops/s		Gb/s
1		451738		0.0036
90		345682		0.248
900		272258		1.96
4000		127055		4.07
9000		106614		7.68
60000		30671		14.72

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 20:43         ` Or Gerlitz
@ 2012-07-16 20:57           ` Thadeu Lima de Souza Cascardo
  -1 siblings, 0 replies; 28+ messages in thread
From: Thadeu Lima de Souza Cascardo @ 2012-07-16 20:57 UTC (permalink / raw)
  To: Or Gerlitz
  Cc: Rick Jones, davem, netdev, yevgenyp, ogerlitz, amirv, brking,
	leitao, klebers, linuxppc-dev, anton

On Mon, Jul 16, 2012 at 11:43:33PM +0300, Or Gerlitz wrote:
> On Mon, Jul 16, 2012 at 10:42 PM, Rick Jones <rick.jones2@hp.com> wrote:
> 
> > I was thinking more along the lines of an additional comparison,
> > explicitly using netperf TCP_RR or something like it, not just the packets
> > per second from a bulk transfer test.
> 
> 
> TCP_STREAM from this setup before the patch would be good to know as well
> 

Hi, Or.

Does the stream test that I did with uperf using messages of 64000 bytes
fit?

TCP_NODELAY does not make a difference in this case. I get something
around 3Gbps before the patch and something around 9Gbps after the
patch.

Before the patch:

# ./uperf-1.0.3-beta/src/uperf -m tcp.xml
Starting 16 threads running profile:tcp_stream ...   0.00 seconds
Txn1          0 /1.00(s) =            0          16op/s
Txn2    20.81GB /59.26(s) =     3.02Gb/s        5914op/s
Txn3          0 /0.00(s) =            0      128295op/s
-------------------------------------------------------------------------------------------------------------------------------
Total   20.81GB /61.37(s) =     2.91Gb/s        5712op/s

Netstat statistics for this run
-------------------------------------------------------------------------------------------------------------------------------
Nic       opkts/s     ipkts/s     obits/s     ibits/s
eth6       252459       31694   3.06Gb/s  16.74Mb/s
eth0            2          18   3.87Kb/s  14.28Kb/s
-------------------------------------------------------------------------------------------------------------------------------

Run Statistics
Hostname           Time        Data   Throughput   Operations   Errors
-------------------------------------------------------------------------------------------------------------------------------
10.0.0.2         61.47s     20.81GB     2.91Gb/s       350528     0.00
master           61.37s     20.81GB     2.91Gb/s       350528     0.00
-------------------------------------------------------------------------------------------------------------------------------
Difference(%)     -0.16%      0.00%        0.16%        0.00%     0.00%


After the patch:

# ./uperf-1.0.3-beta/src/uperf -m tcp.xml
Starting 16 threads running profile:tcp_stream ...   0.00 seconds
Txn1          0 /1.00(s) =            0          16op/s
Txn2    64.50GB /60.27(s) =     9.19Gb/s       17975op/s
Txn3          0 /0.00(s) =            0
-------------------------------------------------------------------------------------------------------------------------------
Total   64.50GB /62.27(s) =     8.90Gb/s       17397op/s

Netstat statistics for this run
-------------------------------------------------------------------------------------------------------------------------------
Nic       opkts/s     ipkts/s     obits/s     ibits/s
eth6       769428       96018   9.31Gb/s  50.72Mb/s
eth0            1          15   2.48Kb/s  13.59Kb/s
-------------------------------------------------------------------------------------------------------------------------------

Run Statistics
Hostname           Time        Data   Throughput   Operations   Errors
-------------------------------------------------------------------------------------------------------------------------------
10.0.0.2         62.27s     64.36GB     8.88Gb/s      1081096     0.00
master           62.27s     64.50GB     8.90Gb/s      1083325     0.00
-------------------------------------------------------------------------------------------------------------------------------
Difference(%)     -0.00%      0.21%        0.21%        0.21%     0.00%


Profile tcp.xml:

<?xml version="1.0"?>
<profile name="TCP_STREAM">
  <group nthreads="16">
        <transaction iterations="1">
            <flowop type="connect" options="remotehost=10.0.0.2 protocol=tcp tcp_nodelay"/>
        </transaction>
        <transaction duration="60">
            <flowop type="write" options="count=160 size=64000"/>
        </transaction>
        <transaction iterations="1">
            <flowop type="disconnect" />
        </transaction>
  </group>
</profile>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 20:47         ` Thadeu Lima de Souza Cascardo
@ 2012-07-16 21:08           ` Rick Jones
  -1 siblings, 0 replies; 28+ messages in thread
From: Rick Jones @ 2012-07-16 21:08 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo
  Cc: davem, netdev, yevgenyp, ogerlitz, amirv, brking, leitao,
	klebers, linuxppc-dev, anton


>> I was thinking more along the lines of an additional comparison,
>> explicitly using netperf TCP_RR or something like it, not just the
>> packets per second from a bulk transfer test.
>>
>> rick

> I used a uperf profile that is similar to TCP_RR. It writes, then reads
> some bytes. I kept the TCP_NODELAY flag.
>
> Without the patch, I saw the following:
>
> packet size	ops/s		Gb/s
> 1		337024		0.0027
> 90		276620		0.199
> 900		190455		1.37
> 4000		68863		2.20
> 9000		45638		3.29
> 60000		9409		4.52
>
> With the patch:
>
> packet size	ops/s		Gb/s
> 1		451738		0.0036
> 90		345682		0.248
> 900		272258		1.96
> 4000		127055		4.07
> 9000		106614		7.68
> 60000		30671		14.72
>

So, on the surface it looks like it did good things for PPS, though it 
would be nice to know what the CPU utilizations/service demands were as 
a sanity check - does uperf not have that sort of functionality?

I'm guessing there were several writes at a time - the 1 byte "packet
size" (sic - that is payload, not packet, and without TCP_NODELAY not
even payload necessarily).  How many writes does it have outstanding
before it does a read?  And does it take care to build up to that number
of writes to avoid batching during slowstart, even with TCP_NODELAY set?

rick jones

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 17:27 ` Rick Jones
  2012-07-16 19:06     ` Thadeu Lima de Souza Cascardo
@ 2012-07-17  5:29   ` David Miller
  2012-07-17 12:42     ` David Laight
  2012-07-17 18:17     ` Rick Jones
  1 sibling, 2 replies; 28+ messages in thread
From: David Miller @ 2012-07-17  5:29 UTC (permalink / raw)
  To: rick.jones2
  Cc: cascardo, netdev, yevgenyp, ogerlitz, amirv, brking, leitao, klebers

From: Rick Jones <rick.jones2@hp.com>
Date: Mon, 16 Jul 2012 10:27:57 -0700

> That seems rather extraordinarily low - Power7 is supposed to be a
> rather high performance CPU.  The last time I noticed O(3Gbit/s) on
> 10G for bulk transfer was before the advent of LRO/GRO - that was in
> the x86 space though.  Is mapping really that expensive with Power7?

Unfortunately, IOMMU mappings are incredibly expensive.  I see effects
like this on Sparc too.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-17  5:29   ` David Miller
@ 2012-07-17 12:42     ` David Laight
  2012-07-17 12:50       ` David Miller
  2012-07-17 18:17     ` Rick Jones
  1 sibling, 1 reply; 28+ messages in thread
From: David Laight @ 2012-07-17 12:42 UTC (permalink / raw)
  To: David Miller, rick.jones2
  Cc: cascardo, netdev, yevgenyp, ogerlitz, amirv, brking, leitao, klebers

> > That seems rather extraordinarily low - Power7 is supposed to be a
> > rather high performance CPU.  The last time I noticed O(3Gbit/s) on
> > 10G for bulk transfer was before the advent of LRO/GRO - that was in
> > the x86 space though.  Is mapping really that expensive with Power7?
> 
> Unfortunately, IOMMU mappings are incredibly expensive.  I see effects
> like this on Sparc too.

Would there be any mileage in permanently allocating IOMMU
virtual address to the ring entries, then 'just' assigning
the correct physical address during rx/tx setup?

A long time ago it used to be much faster on sparc systems
to receive into a permanently mapped buffer area and then
do a maximally aligned copy into the actual rx buffer.

	David

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-17 12:42     ` David Laight
@ 2012-07-17 12:50       ` David Miller
  2012-07-17 13:36         ` David Laight
  2012-07-17 13:50         ` Eric Dumazet
  0 siblings, 2 replies; 28+ messages in thread
From: David Miller @ 2012-07-17 12:50 UTC (permalink / raw)
  To: David.Laight
  Cc: rick.jones2, cascardo, netdev, yevgenyp, ogerlitz, amirv, brking,
	leitao, klebers

From: "David Laight" <David.Laight@ACULAB.COM>
Date: Tue, 17 Jul 2012 13:42:04 +0100

> Would there be any mileage in permanently allocating IOMMU
> virtual address to the ring entries, then 'just' assigning
> the correct physical address during rx/tx setup?

There is not a one-to-one mapping between these two entities,
in particular on the transmit side.

A transmit packet can have multiple segments, some of which are
larger than one IOMMU page.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* RE: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-17 12:50       ` David Miller
@ 2012-07-17 13:36         ` David Laight
  2012-07-17 13:46           ` David Miller
  2012-07-17 13:50         ` Eric Dumazet
  1 sibling, 1 reply; 28+ messages in thread
From: David Laight @ 2012-07-17 13:36 UTC (permalink / raw)
  To: David Miller
  Cc: rick.jones2, cascardo, netdev, yevgenyp, ogerlitz, amirv, brking,
	leitao, klebers

> > Would there be any mileage in permanently allocating IOMMU
> > virtual address to the ring entries, then 'just' assigning
> > the correct physical address during rx/tx setup?
> 
> There is a not a one to one mapping between these two entities,
> in particular on the transmit side.
> 
> A transmit packet can have multiple segments, some of which are
> larger than one IOMMU page.

A SMOP :-) TX is probably easier than RX.
Each tx segment will already go into a separate ring entry;
page boundaries could do the same.
The driver will already have to cope with 'too many segments'
(I remember being passed a full sized frame made of a list
of 1-byte message blocks...)

Or allocate enough sequential IOMMU pages for the longest
tx segment for every ring entry - after all that is already
the 'worst case' allocation!

	David

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-17 13:36         ` David Laight
@ 2012-07-17 13:46           ` David Miller
  0 siblings, 0 replies; 28+ messages in thread
From: David Miller @ 2012-07-17 13:46 UTC (permalink / raw)
  To: David.Laight
  Cc: rick.jones2, cascardo, netdev, yevgenyp, ogerlitz, amirv, brking,
	leitao, klebers

From: "David Laight" <David.Laight@ACULAB.COM>
Date: Tue, 17 Jul 2012 14:36:11 +0100

> The driver will already have to cope with 'too many segments'
> (I remember being passed a full sized frame made of a list
> of 1-byte message blocks...)

Barring driver hardware bug workarounds, no, it does not have to cope
with that.  The code is extremely simple now.

All the driver has to do is assume that a new TX packet can never
consume more than MAX_SKB_FRAGS.

Therefore it simply stops the queue if less than MAX_SKB_FRAGS
segments remain after queueing a transmit.
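
Something along these lines - an illustrative sketch, ring_space() is a
made-up helper, but the pattern is what drivers actually do:

	/* after queueing one skb on the TX ring */
	if (ring_space(ring) < MAX_SKB_FRAGS + 1)
		netif_tx_stop_queue(netdev_get_tx_queue(dev, queue_index));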

Your suggestion will significantly complicate driver TX paths.

If you're going to suggest a solution, it has to be completely
general enough to work in the current state of affairs, and
your idea absolutely does not.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-17 12:50       ` David Miller
  2012-07-17 13:36         ` David Laight
@ 2012-07-17 13:50         ` Eric Dumazet
  1 sibling, 0 replies; 28+ messages in thread
From: Eric Dumazet @ 2012-07-17 13:50 UTC (permalink / raw)
  To: David Miller
  Cc: David.Laight, rick.jones2, cascardo, netdev, yevgenyp, ogerlitz,
	amirv, brking, leitao, klebers

On Tue, 2012-07-17 at 05:50 -0700, David Miller wrote:
> From: "David Laight" <David.Laight@ACULAB.COM>
> Date: Tue, 17 Jul 2012 13:42:04 +0100
> 
> > Would there be any mileage in permanently allocating IOMMU
> > virtual address to the ring entries, then 'just' assigning
> > the correct physical address during rx/tx setup?
> 
> There is a not a one to one mapping between these two entities,
> in particular on the transmit side.
> 
> A transmit packet can have multiple segments, some of which are
> larger than one IOMMU page.

And on the rx side, permanently allocating IOMMU mappings would mean
copying all incoming frames to newly allocated memory.

Cannot this IOMMU performance problem be solved on its side,
instead of having to shuffle things in all drivers ?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-17  5:29   ` David Miller
  2012-07-17 12:42     ` David Laight
@ 2012-07-17 18:17     ` Rick Jones
  2012-07-17 20:10       ` Brian King
  1 sibling, 1 reply; 28+ messages in thread
From: Rick Jones @ 2012-07-17 18:17 UTC (permalink / raw)
  To: David Miller
  Cc: cascardo, netdev, yevgenyp, ogerlitz, amirv, brking, leitao, klebers

On 07/16/2012 10:29 PM, David Miller wrote:
> From: Rick Jones <rick.jones2@hp.com>
> Date: Mon, 16 Jul 2012 10:27:57 -0700
>
>> That seems rather extraordinarily low - Power7 is supposed to be a
>> rather high performance CPU.  The last time I noticed O(3Gbit/s) on
>> 10G for bulk transfer was before the advent of LRO/GRO - that was in
>> the x86 space though.  Is mapping really that expensive with Power7?
>
> Unfortunately, IOMMU mappings are incredibly expensive.  I see effects
> like this on Sparc too.

OK, so that has refreshed a dim memory - the mapping ends up being akin
to, if not actually, a PIO, yes?  I recall schemes in drivers in other
stacks whereby "small" packets were copied because it was cheaper to
allocate/copy than it was to remap.
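
A minimal sketch of that "copy-break" idea (assuming a made-up threshold
and helper name, not any particular driver's code) would be:

	#define RX_COPYBREAK	256	/* illustrative threshold */

	static struct sk_buff *rx_copybreak(struct net_device *dev,
					    void *buf, unsigned int len)
	{
		struct sk_buff *skb;

		if (len > RX_COPYBREAK)
			return NULL;	/* big frame: pass the mapped buffer up as-is */

		/* Small frame: copy into a fresh skb so the DMA-mapped
		 * buffer can be reposted to the ring with no unmap/remap. */
		skb = netdev_alloc_skb_ip_align(dev, len);
		if (!skb)
			return NULL;

		memcpy(skb_put(skb, len), buf, len);
		return skb;
	}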

rick jones

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-17 18:17     ` Rick Jones
@ 2012-07-17 20:10       ` Brian King
  2012-07-17 20:20         ` David Miller
  0 siblings, 1 reply; 28+ messages in thread
From: Brian King @ 2012-07-17 20:10 UTC (permalink / raw)
  To: Rick Jones
  Cc: David Miller, cascardo, netdev, yevgenyp, ogerlitz, amirv,
	leitao, klebers

On 07/17/2012 01:17 PM, Rick Jones wrote:
> On 07/16/2012 10:29 PM, David Miller wrote:
>> From: Rick Jones <rick.jones2@hp.com> Date: Mon, 16 Jul 2012
>> 10:27:57 -0700
>> 
>>> That seems rather extraordinarily low - Power7 is supposed to be
>>> a rather high performance CPU.  The last time I noticed
>>> O(3Gbit/s) on 10G for bulk transfer was before the advent of
>>> LRO/GRO - that was in the x86 space though.  Is mapping really
>>> that expensive with Power7?
>> 
>> Unfortunately, IOMMU mappings are incredibly expensive.  I see
>> effects like this on Sparc too.
> 
> OK, so that has caused some dimm memory to get a small refresh - it
> ends up being akin to if not actually a PIO yes?  I recall schemes in
> drivers in other stacks whereby "small" packets were copied because
> it was cheaper to allocate/copy than it was to remap.

On Power it ends up being an hcall to the hypervisor
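
For context, that cost is paid on every streaming DMA mapping.  A generic
RX refill loop doing one mapping per buffer - purely illustrative, not
mlx4_en code, with 'ring', 'desc' and 'dev' (the device's struct device)
as stand-in names - looks roughly like:

	for (i = 0; i < ring->size; i++) {
		struct page *page = alloc_page(GFP_ATOMIC);
		dma_addr_t dma;

		if (!page)
			break;
		/* One IOMMU mapping per buffer: on a pseries LPAR this is
		 * where the trip into the hypervisor happens. */
		dma = dma_map_page(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (dma_mapping_error(dev, dma)) {
			__free_page(page);
			break;
		}
		ring->desc[i].addr = cpu_to_be64(dma);
	}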

-Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-17 20:10       ` Brian King
@ 2012-07-17 20:20         ` David Miller
  0 siblings, 0 replies; 28+ messages in thread
From: David Miller @ 2012-07-17 20:20 UTC (permalink / raw)
  To: brking
  Cc: rick.jones2, cascardo, netdev, yevgenyp, ogerlitz, amirv, leitao,
	klebers

From: Brian King <brking@linux.vnet.ibm.com>
Date: Tue, 17 Jul 2012 15:10:08 -0500

> On 07/17/2012 01:17 PM, Rick Jones wrote:
>> On 07/16/2012 10:29 PM, David Miller wrote:
>>> From: Rick Jones <rick.jones2@hp.com> Date: Mon, 16 Jul 2012
>>> 10:27:57 -0700
>>> 
>>>> That seems rather extraordinarily low - Power7 is supposed to be
>>>> a rather high performance CPU.  The last time I noticed
>>>> O(3Gbit/s) on 10G for bulk transfer was before the advent of
>>>> LRO/GRO - that was in the x86 space though.  Is mapping really
>>>> that expensive with Power7?
>>> 
>>> Unfortunately, IOMMU mappings are incredibly expensive.  I see
>>> effects like this on Sparc too.
>> 
>> OK, so that has caused some dimm memory to get a small refresh - it
>> ends up being akin to if not actually a PIO yes?  I recall schemes in
>> drivers in other stacks whereby "small" packets were copied because
>> it was cheaper to allocate/copy than it was to remap.
> 
> On Power it ends up being an hcall to the hypervisor

This is true on sparc64 niagara systems as well.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 20:57           ` Thadeu Lima de Souza Cascardo
@ 2012-07-18 14:59             ` Or Gerlitz
  0 siblings, 0 replies; 28+ messages in thread
From: Or Gerlitz @ 2012-07-18 14:59 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo, Yevgeny Petrilin
  Cc: Or Gerlitz, Rick Jones, davem, netdev, amirv, brking, leitao,
	klebers, linuxppc-dev, anton

On 7/16/2012 11:57 PM, Thadeu Lima de Souza Cascardo wrote:
> On Mon, Jul 16, 2012 at 11:43:33PM +0300, Or Gerlitz wrote:
>>
>>
>> TCP_STREAM from this setup before the patch would be good to know as well
>>
>
> Does the stream test that I did with uperf using messages of 64000 bytes fit?

netperf/TCP_STREAM is very common, and it would help to better compare
the numbers you get on your systems before/after the patch with runs
done here.  As for review of the patch itself and the related
discussion, Yevgeny Petrilin should be looking at your patch; he'll be
in by early next week.

Or.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
@ 2012-07-18 14:59             ` Or Gerlitz
  0 siblings, 0 replies; 28+ messages in thread
From: Or Gerlitz @ 2012-07-18 14:59 UTC (permalink / raw)
  To: Thadeu Lima de Souza Cascardo, Yevgeny Petrilin
  Cc: Or Gerlitz, netdev, Rick Jones, amirv, leitao, klebers, anton,
	brking, linuxppc-dev, davem

On 7/16/2012 11:57 PM, Thadeu Lima de Souza Cascardo wrote:
> On Mon, Jul 16, 2012 at 11:43:33PM +0300, Or Gerlitz wrote:
>>
>>
>> TCP_STREAM from this setup before the patch would be good to know as well
>>
>
> Does the stream test that I did with uperf using messages of 64000 bytes fit?

netperf/TCP_STREAM is very common, and it would help to better compare
the numbers you get on your systems before/after the patch with runs
done here.  As for review of the patch itself and the related
discussion, Yevgeny Petrilin should be looking at your patch; he'll be
in by early next week.

Or.

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH] mlx4_en: map entire pages to increase throughput
  2012-07-16 17:01 [PATCH] mlx4_en: map entire pages to increase throughput Thadeu Lima de Souza Cascardo
  2012-07-16 17:27 ` Rick Jones
@ 2012-07-19 17:53 ` David Miller
  1 sibling, 0 replies; 28+ messages in thread
From: David Miller @ 2012-07-19 17:53 UTC (permalink / raw)
  To: cascardo; +Cc: netdev, yevgenyp, ogerlitz, amirv, brking, leitao, klebers

From: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>
Date: Mon, 16 Jul 2012 14:01:53 -0300

> In its receive path, mlx4_en driver maps each page chunk that it pushes
> to the hardware and unmaps it when pushing it up the stack. This limits
> throughput to about 3Gbps on a Power7 8-core machine.
> 
> One solution is to map the entire allocated page at once. However, this
> requires that we keep track of every page fragment we give to a
> descriptor. We also need to work with the discipline that all fragments will
> be released (in the sense that it will not be reused by the driver
> anymore) in the order they are allocated to the driver.
> 
> This requires that we don't reuse any fragments, every single one of
> them must be reallocated. We do that by releasing all the fragments that
> are processed and only after finished processing the descriptors, we
> start the refill.
> 
> We also must somehow guarantee that we either refill all fragments in a
> descriptor or none at all, without resorting to giving up a page
> fragment that we would have already given. Otherwise, we would break the
> discipline of only releasing the fragments in the order they were
> allocated.
> 
> This has passed page allocation fault injections (restricted to the
> driver by using required-start and required-end) and device hotplug
> while 16 TCP streams were able to deliver more than 9Gbps.
> 
> Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@linux.vnet.ibm.com>

I have not seen any reasonable objections to this patch, so I have
applied it to net-next, thanks!

^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2012-07-19 17:53 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-07-16 17:01 [PATCH] mlx4_en: map entire pages to increase throughput Thadeu Lima de Souza Cascardo
2012-07-16 17:27 ` Rick Jones
2012-07-16 19:06   ` Thadeu Lima de Souza Cascardo
2012-07-16 19:06     ` Thadeu Lima de Souza Cascardo
2012-07-16 19:42     ` Rick Jones
2012-07-16 19:42       ` Rick Jones
2012-07-16 20:36       ` Or Gerlitz
2012-07-16 20:36         ` Or Gerlitz
2012-07-16 20:43       ` Or Gerlitz
2012-07-16 20:43         ` Or Gerlitz
2012-07-16 20:57         ` Thadeu Lima de Souza Cascardo
2012-07-16 20:57           ` Thadeu Lima de Souza Cascardo
2012-07-18 14:59           ` Or Gerlitz
2012-07-18 14:59             ` Or Gerlitz
2012-07-16 20:47       ` Thadeu Lima de Souza Cascardo
2012-07-16 20:47         ` Thadeu Lima de Souza Cascardo
2012-07-16 21:08         ` Rick Jones
2012-07-16 21:08           ` Rick Jones
2012-07-17  5:29   ` David Miller
2012-07-17 12:42     ` David Laight
2012-07-17 12:50       ` David Miller
2012-07-17 13:36         ` David Laight
2012-07-17 13:46           ` David Miller
2012-07-17 13:50         ` Eric Dumazet
2012-07-17 18:17     ` Rick Jones
2012-07-17 20:10       ` Brian King
2012-07-17 20:20         ` David Miller
2012-07-19 17:53 ` David Miller
