* [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header
@ 2013-11-12 22:21 Michael Dalton
  2013-11-12 22:21 ` [PATCH net-next 2/4] net: allow > 0 order atomic page alloc in skb_page_frag_refill Michael Dalton
                   ` (8 more replies)
  0 siblings, 9 replies; 21+ messages in thread
From: Michael Dalton @ 2013-11-12 22:21 UTC (permalink / raw)
  To: David S. Miller
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, Daniel Borkmann,
	virtualization, Eric Dumazet

Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page
frag allocators") changed the mergeable receive buffer size from PAGE_SIZE
to MTU-size. However, the merge buffer size does not take into account the
size of the virtio-net header. Consequently, packets that are MTU-size
will take two buffers instead of one (to store the virtio-net header),
substantially decreasing the throughput of MTU-size traffic due to TCP
window / SKB truesize effects.

This commit changes the mergeable buffer size to include the virtio-net
header. The buffer size is cacheline-aligned because skb_page_frag_refill
will not automatically align the requested size.
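
As a worked example (a sketch only -- it assumes a 1500-byte MTU, a 12-byte
struct virtio_net_hdr_mrg_rxbuf, and 64-byte L1 cache lines; the exact values
depend on the architecture), the resulting buffer size is:

  #define MY_ALIGN(x, a)   (((x) + (a) - 1) & ~((a) - 1))      /* hypothetical stand-in for ALIGN() */
  #define GOOD_PACKET_LEN  (14 + 4 + 1500)                     /* ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN = 1518 */
  #define MERGE_BUFFER_LEN MY_ALIGN(GOOD_PACKET_LEN + 12, 64)  /* 1530 rounded up to 1536 bytes */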

Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
cpuset, using cgroups to ensure that other processes in the system will not
be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
force MTU-sized packets on the receiver.

net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
net-next trunk (MTU 1500- packet uses two buf due to size bug): 4076.62Gb/s
net-next trunk (MTU 1480- packet fits in one buf): 6301.34Gb/s
net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s

Suggested-by: Eric Northup <digitaleric@google.com>
Signed-off-by: Michael Dalton <mwdalton@google.com>
---
 drivers/net/virtio_net.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 01f4eb5..69fb225 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -36,7 +36,10 @@ module_param(csum, bool, 0444);
 module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
-#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
+#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
+#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
+                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
+                                L1_CACHE_BYTES))
 #define GOOD_COPY_LEN	128
 
 #define VIRTNET_DRIVER_VERSION "1.0.0"
@@ -314,10 +317,10 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 			head_skb->dev->stats.rx_length_errors++;
 			return -EINVAL;
 		}
-		if (unlikely(len > MAX_PACKET_LEN)) {
+		if (unlikely(len > MERGE_BUFFER_LEN)) {
 			pr_debug("%s: rx error: merge buffer too long\n",
 				 head_skb->dev->name);
-			len = MAX_PACKET_LEN;
+			len = MERGE_BUFFER_LEN;
 		}
 		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
 			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
@@ -336,18 +339,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
-			head_skb->truesize += MAX_PACKET_LEN;
+			head_skb->truesize += MERGE_BUFFER_LEN;
 		}
 		page = virt_to_head_page(buf);
 		offset = buf - (char *)page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 			put_page(page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-					     len, MAX_PACKET_LEN);
+					     len, MERGE_BUFFER_LEN);
 		} else {
 			skb_add_rx_frag(curr_skb, num_skb_frags, page,
-					offset, len,
-					MAX_PACKET_LEN);
+					offset, len, MERGE_BUFFER_LEN);
 		}
 		--rq->num;
 	}
@@ -383,7 +385,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 		struct page *page = virt_to_head_page(buf);
 		skb = page_to_skb(rq, page,
 				  (char *)buf - (char *)page_address(page),
-				  len, MAX_PACKET_LEN);
+				  len, MERGE_BUFFER_LEN);
 		if (unlikely(!skb)) {
 			dev->stats.rx_dropped++;
 			put_page(page);
@@ -471,11 +473,11 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
 	struct skb_vnet_hdr *hdr;
 	int err;
 
-	skb = __netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN, gfp);
+	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
 	if (unlikely(!skb))
 		return -ENOMEM;
 
-	skb_put(skb, MAX_PACKET_LEN);
+	skb_put(skb, GOOD_PACKET_LEN);
 
 	hdr = skb_vnet_hdr(skb);
 	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);
@@ -542,20 +544,20 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 	int err;
 
 	if (gfp & __GFP_WAIT) {
-		if (skb_page_frag_refill(MAX_PACKET_LEN, &vi->alloc_frag,
+		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
 					 gfp)) {
 			buf = (char *)page_address(vi->alloc_frag.page) +
 			      vi->alloc_frag.offset;
 			get_page(vi->alloc_frag.page);
-			vi->alloc_frag.offset += MAX_PACKET_LEN;
+			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
 		}
 	} else {
-		buf = netdev_alloc_frag(MAX_PACKET_LEN);
+		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
 	}
 	if (!buf)
 		return -ENOMEM;
 
-	sg_init_one(rq->sg, buf, MAX_PACKET_LEN);
+	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
-- 
1.8.4.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH net-next 2/4] net: allow > 0 order atomic page alloc in skb_page_frag_refill
  2013-11-12 22:21 [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Michael Dalton
@ 2013-11-12 22:21 ` Michael Dalton
  2013-11-12 22:42   ` Eric Dumazet
  2013-11-12 22:21 ` [PATCH net-next 3/4] virtio-net: use per-receive queue page frag alloc for mergeable bufs Michael Dalton
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 21+ messages in thread
From: Michael Dalton @ 2013-11-12 22:21 UTC (permalink / raw)
  To: David S. Miller
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, Daniel Borkmann,
	virtualization, Eric Dumazet

skb_page_frag_refill currently permits only order-0 page allocs
unless GFP_WAIT is used. Change skb_page_frag_refill to attempt
higher-order page allocations whether or not GFP_WAIT is used. If
memory cannot be allocated, the allocator will fall back to
successively smaller page allocs (down to order-0 page allocs).

This change brings skb_page_frag_refill in line with the existing
page allocation strategy employed by netdev_alloc_frag, which attempts
higher-order page allocations whether or not GFP_WAIT is set, falling
back to successively lower-order page allocations on failure. Part
of migration of virtio-net to per-receive queue page frag allocators.
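
After this change, the strategy in skb_page_frag_refill is roughly the sketch
below (a simplification, not the exact net/core/sock.c code; the extra gfp
flags used for the high-order attempts are illustrative):

  int order = SKB_FRAG_PAGE_ORDER;
  do {
          gfp_t gfp = prio;

          /* High-order attempts are opportunistic: avoid retries and warnings.
           * The final order-0 attempt uses the caller's flags unchanged.
           */
          if (order)
                  gfp |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY;
          pfrag->page = alloc_pages(gfp, order);
          if (pfrag->page) {
                  pfrag->offset = 0;
                  pfrag->size = PAGE_SIZE << order;
                  return true;
          }
  } while (--order >= 0);
  return false;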

Signed-off-by: Michael Dalton <mwdalton@google.com>
---
 net/core/sock.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index ab20ed9..7383d23 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1865,9 +1865,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t prio)
 		put_page(pfrag->page);
 	}
 
-	/* We restrict high order allocations to users that can afford to wait */
-	order = (prio & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
-
+	order = SKB_FRAG_PAGE_ORDER;
 	do {
 		gfp_t gfp = prio;
 
-- 
1.8.4.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH net-next 3/4] virtio-net: use per-receive queue page frag alloc for mergeable bufs
  2013-11-12 22:21 [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Michael Dalton
  2013-11-12 22:21 ` [PATCH net-next 2/4] net: allow > 0 order atomic page alloc in skb_page_frag_refill Michael Dalton
@ 2013-11-12 22:21 ` Michael Dalton
  2013-11-12 22:43   ` Eric Dumazet
  2013-11-12 22:21 ` [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance Michael Dalton
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 21+ messages in thread
From: Michael Dalton @ 2013-11-12 22:21 UTC (permalink / raw)
  To: David S. Miller
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, Daniel Borkmann,
	virtualization, Eric Dumazet

The virtio-net driver currently uses netdev_alloc_frag() for GFP_ATOMIC
mergeable rx buffer allocations. This commit migrates virtio-net to use
per-receive queue page frags for GFP_ATOMIC allocation. This change unifies
mergeable rx buffer memory allocation, which will now use skb_page_frag_refill()
for both atomic and GFP_WAIT buffer allocations.

To address fragmentation concerns, if after buffer allocation there
is too little space left in the page frag to allocate a subsequent
buffer, the remaining space is added to the current allocated buffer
so that the remaining space can be used to store packet data.
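
As a worked example of the hole-filling logic (numbers are illustrative and
assume an order-3 page frag of 32768 bytes filled with 1536-byte merge
buffers):

  hole = alloc_frag->size - alloc_frag->offset; /* 512 bytes left after buffer 21 */
  if (hole < MERGE_BUFFER_LEN) {                /* 512 < 1536: too small for another buffer */
          len += hole;                          /* buffer 21 grows to 2048 bytes */
          alloc_frag->offset += hole;           /* the page frag is now fully consumed */
  }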

Signed-off-by: Michael Dalton <mwdalton@google.com>
---
 drivers/net/virtio_net.c | 70 +++++++++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 31 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 69fb225..0c93054 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -79,6 +79,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Page frag for GFP_ATOMIC packet buffer allocation. */
+	struct page_frag atomic_frag;
+
 	/* RX: fragments + linear part + virtio header */
 	struct scatterlist sg[MAX_SKB_FRAGS + 2];
 
@@ -128,9 +131,9 @@ struct virtnet_info {
 	struct mutex config_lock;
 
 	/* Page_frag for GFP_KERNEL packet buffer allocation when we run
-	 * low on memory.
+	 * low on memory. May sleep.
 	 */
-	struct page_frag alloc_frag;
+	struct page_frag sleep_frag;
 
 	/* Does the affinity hint is set for virtqueues? */
 	bool affinity_hint_set;
@@ -305,7 +308,7 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 	struct sk_buff *curr_skb = head_skb;
 	char *buf;
 	struct page *page;
-	int num_buf, len, offset;
+	int num_buf, len, offset, truesize;
 
 	num_buf = hdr->mhdr.num_buffers;
 	while (--num_buf) {
@@ -317,11 +320,7 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 			head_skb->dev->stats.rx_length_errors++;
 			return -EINVAL;
 		}
-		if (unlikely(len > MERGE_BUFFER_LEN)) {
-			pr_debug("%s: rx error: merge buffer too long\n",
-				 head_skb->dev->name);
-			len = MERGE_BUFFER_LEN;
-		}
+		truesize = max_t(int, len, MERGE_BUFFER_LEN);
 		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
 			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
 			if (unlikely(!nskb)) {
@@ -339,17 +338,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
-			head_skb->truesize += MERGE_BUFFER_LEN;
+			head_skb->truesize += truesize;
 		}
 		page = virt_to_head_page(buf);
 		offset = buf - (char *)page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 			put_page(page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-					     len, MERGE_BUFFER_LEN);
+					     len, truesize);
 		} else {
 			skb_add_rx_frag(curr_skb, num_skb_frags, page,
-					offset, len, MERGE_BUFFER_LEN);
+					offset, len, truesize);
 		}
 		--rq->num;
 	}
@@ -383,9 +382,10 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 		skb_trim(skb, len);
 	} else if (vi->mergeable_rx_bufs) {
 		struct page *page = virt_to_head_page(buf);
+		int truesize = max_t(int, len, MERGE_BUFFER_LEN);
 		skb = page_to_skb(rq, page,
 				  (char *)buf - (char *)page_address(page),
-				  len, MERGE_BUFFER_LEN);
+				  len, truesize);
 		if (unlikely(!skb)) {
 			dev->stats.rx_dropped++;
 			put_page(page);
@@ -540,24 +540,24 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
 	struct virtnet_info *vi = rq->vq->vdev->priv;
-	char *buf = NULL;
-	int err;
+	struct page_frag *alloc_frag;
+	char *buf;
+	int err, len, hole;
 
-	if (gfp & __GFP_WAIT) {
-		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
-					 gfp)) {
-			buf = (char *)page_address(vi->alloc_frag.page) +
-			      vi->alloc_frag.offset;
-			get_page(vi->alloc_frag.page);
-			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
-		}
-	} else {
-		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
-	}
-	if (!buf)
+	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
+	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
 		return -ENOMEM;
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	get_page(alloc_frag->page);
+	len = MERGE_BUFFER_LEN;
+	alloc_frag->offset += len;
+	hole = alloc_frag->size - alloc_frag->offset;
+	if (hole < MERGE_BUFFER_LEN) {
+		len += hole;
+		alloc_frag->offset += hole;
+	}
 
-	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
+	sg_init_one(rq->sg, buf, len);
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
@@ -1335,6 +1335,16 @@ static void free_receive_bufs(struct virtnet_info *vi)
 	}
 }
 
+static void free_receive_page_frags(struct virtnet_info *vi)
+{
+	int i;
+	for (i = 0; i < vi->max_queue_pairs; i++)
+		if (vi->rq[i].atomic_frag.page)
+			put_page(vi->rq[i].atomic_frag.page);
+	if (vi->sleep_frag.page)
+		put_page(vi->sleep_frag.page);
+}
+
 static void free_unused_bufs(struct virtnet_info *vi)
 {
 	void *buf;
@@ -1655,8 +1665,7 @@ free_recv_bufs:
 free_vqs:
 	cancel_delayed_work_sync(&vi->refill);
 	virtnet_del_vqs(vi);
-	if (vi->alloc_frag.page)
-		put_page(vi->alloc_frag.page);
+	free_receive_page_frags(vi);
 free_stats:
 	free_percpu(vi->stats);
 free:
@@ -1690,8 +1699,7 @@ static void virtnet_remove(struct virtio_device *vdev)
 	unregister_netdev(vi->dev);
 
 	remove_vq_common(vi);
-	if (vi->alloc_frag.page)
-		put_page(vi->alloc_frag.page);
+	free_receive_page_frags(vi);
 
 	flush_work(&vi->config_work);
 
-- 
1.8.4.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-12 22:21 [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Michael Dalton
                   ` (2 preceding siblings ...)
  2013-11-12 22:21 ` [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance Michael Dalton
@ 2013-11-12 22:21 ` Michael Dalton
  2013-11-13  7:10   ` Jason Wang
  2013-11-13  8:47   ` Ronen Hod
  2013-11-12 22:41 ` [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Eric Dumazet
                   ` (4 subsequent siblings)
  8 siblings, 2 replies; 21+ messages in thread
From: Michael Dalton @ 2013-11-12 22:21 UTC (permalink / raw)
  To: David S. Miller
  Cc: netdev, Eric Dumazet, Rusty Russell, Michael S. Tsirkin,
	Daniel Borkmann, Jason Wang, Eric Northup, virtualization,
	Michael Dalton

Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
allocators") changed the mergeable receive buffer size from PAGE_SIZE to
MTU-size, introducing a single-stream regression for benchmarks with large
average packet size. There is no single optimal buffer size for all workloads.
For workloads with packet size <= MTU bytes, MTU + virtio-net header-sized
buffers are preferred as larger buffers reduce the TCP window due to SKB
truesize. However, single-stream workloads with large average packet sizes
have higher throughput if larger (e.g., PAGE_SIZE) buffers are used.

This commit auto-tunes the mergeable receive buffer packet size by choosing
the packet buffer size based on an EWMA of the recent packet sizes for the
receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header
len to PAGE_SIZE. This improves throughput for large packet workloads, as
any workload with average packet size >= PAGE_SIZE will use PAGE_SIZE
buffers.
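
For reference, a rough sketch of the auto-tuning flow using the
<linux/average.h> helpers (constants are illustrative; with a weight of 64,
each sample moves the estimate by roughly 1/64 of its distance from the
current average):

  struct ewma avg;
  const size_t hdr_len = 12;      /* sizeof(struct virtio_net_hdr_mrg_rxbuf) */
  u32 buflen;

  ewma_init(&avg, 1, 64);         /* factor 1, weight RECEIVE_AVG_WEIGHT */
  ewma_add(&avg, head_skb->len);  /* fold in each received packet's length */

  /* When posting a new receive buffer: */
  buflen = hdr_len + clamp_t(u32, ewma_read(&avg),
                             GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
  buflen = ALIGN(buflen, L1_CACHE_BYTES); /* ~1536 bytes up to PAGE_SIZE */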

These optimizations interact positively with recent commit
ba275241030c ("virtio-net: coalesce rx frags when possible during rx"),
which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
optimizations benefit buffers of any size.

Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
between two QEMU VMs on a single physical machine. Each VM has two VCPUs
with all offloads & vhost enabled. All VMs and vhost threads run in a
single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
in the system will not be scheduled on the benchmark CPUs. Trunk includes
SKB rx frag coalescing.

net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
net-next trunk (MTU-size bufs):  13170.01Gb/s
net-next trunk + auto-tune: 14555.94Gb/s

Signed-off-by: Michael Dalton <mwdalton@google.com>
---
 drivers/net/virtio_net.c | 73 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 20 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0c93054..b1086e0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -27,6 +27,7 @@
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/average.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -37,10 +38,8 @@ module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
-#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
-                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
-                                L1_CACHE_BYTES))
 #define GOOD_COPY_LEN	128
+#define RECEIVE_AVG_WEIGHT 64
 
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
@@ -79,6 +78,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Average packet length for mergeable receive buffers. */
+	struct ewma mrg_avg_pkt_len;
+
 	/* Page frag for GFP_ATOMIC packet buffer allocation. */
 	struct page_frag atomic_frag;
 
@@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
 	return skb;
 }
 
-static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
+static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb,
+			     struct page *head_page)
 {
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
 	struct sk_buff *curr_skb = head_skb;
+	struct page *page = head_page;
 	char *buf;
-	struct page *page;
-	int num_buf, len, offset, truesize;
+	int num_buf, len, offset;
+	u32 est_buffer_len;
 
+	len = head_skb->len;
 	num_buf = hdr->mhdr.num_buffers;
 	while (--num_buf) {
 		int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
@@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 			head_skb->dev->stats.rx_length_errors++;
 			return -EINVAL;
 		}
-		truesize = max_t(int, len, MERGE_BUFFER_LEN);
 		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
 			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
 			if (unlikely(!nskb)) {
@@ -338,20 +342,38 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
-			head_skb->truesize += truesize;
+			head_skb->truesize += len;
 		}
 		page = virt_to_head_page(buf);
 		offset = buf - (char *)page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 			put_page(page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-					     len, truesize);
+					     len, len);
 		} else {
 			skb_add_rx_frag(curr_skb, num_skb_frags, page,
-					offset, len, truesize);
+					offset, len, len);
 		}
 		--rq->num;
 	}
+	/* All frags before the last frag are fully used -- for those frags,
+	 * truesize = len. Use the size of the most recent buffer allocation
+	 * from the last frag's page to estimate the truesize of the last frag.
+	 * EWMA with a weight of 64 makes the size adjustments quite small in
+	 * the frags allocated on one page (even a order-3 one), and truesize
+	 * doesn't need to be 100% accurate.
+	 */
+	if (page) {
+		est_buffer_len = page_private(page);
+		if (est_buffer_len > len) {
+			u32 truesize_delta = est_buffer_len - len;
+
+			curr_skb->truesize += truesize_delta;
+			if (curr_skb != head_skb)
+				head_skb->truesize += truesize_delta;
+		}
+	}
+	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
 	return 0;
 }
 
@@ -382,16 +404,21 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 		skb_trim(skb, len);
 	} else if (vi->mergeable_rx_bufs) {
 		struct page *page = virt_to_head_page(buf);
-		int truesize = max_t(int, len, MERGE_BUFFER_LEN);
+		/* Use an initial truesize of 'len' bytes for page_to_skb --
+		 * receive_mergeable will fixup the truesize of the last page
+		 * frag if the packet is non-linear (> GOOD_COPY_LEN bytes).
+		 */
 		skb = page_to_skb(rq, page,
 				  (char *)buf - (char *)page_address(page),
-				  len, truesize);
+				  len, len);
 		if (unlikely(!skb)) {
 			dev->stats.rx_dropped++;
 			put_page(page);
 			return;
 		}
-		if (receive_mergeable(rq, skb)) {
+		if (!skb_is_nonlinear(skb))
+			page = NULL;
+		if (receive_mergeable(rq, skb, page)) {
 			dev_kfree_skb(skb);
 			return;
 		}
@@ -540,24 +567,29 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
 	struct virtnet_info *vi = rq->vq->vdev->priv;
+	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	struct page_frag *alloc_frag;
 	char *buf;
-	int err, len, hole;
+	int err, hole;
+	u32 buflen;
 
+	buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
+				   GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
+	buflen = ALIGN(buflen, L1_CACHE_BYTES);
 	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
-	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
+	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, gfp)))
 		return -ENOMEM;
 	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
 	get_page(alloc_frag->page);
-	len = MERGE_BUFFER_LEN;
-	alloc_frag->offset += len;
+	alloc_frag->offset += buflen;
+	set_page_private(alloc_frag->page, buflen);
 	hole = alloc_frag->size - alloc_frag->offset;
-	if (hole < MERGE_BUFFER_LEN) {
-		len += hole;
+	if (hole < buflen) {
+		buflen += hole;
 		alloc_frag->offset += hole;
 	}
 
-	sg_init_one(rq->sg, buf, len);
+	sg_init_one(rq->sg, buf, buflen);
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
@@ -1475,6 +1507,7 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 			       napi_weight);
 
 		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
+		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
 		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
 	}
 
-- 
1.8.4.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-12 22:21 [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Michael Dalton
  2013-11-12 22:21 ` [PATCH net-next 2/4] net: allow > 0 order atomic page alloc in skb_page_frag_refill Michael Dalton
  2013-11-12 22:21 ` [PATCH net-next 3/4] virtio-net: use per-receive queue page frag alloc for mergeable bufs Michael Dalton
@ 2013-11-12 22:21 ` Michael Dalton
  2013-11-12 22:21 ` Michael Dalton
                   ` (5 subsequent siblings)
  8 siblings, 0 replies; 21+ messages in thread
From: Michael Dalton @ 2013-11-12 22:21 UTC (permalink / raw)
  To: David S. Miller
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, Daniel Borkmann,
	virtualization, Eric Dumazet

Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
allocators") changed the mergeable receive buffer size from PAGE_SIZE to
MTU-size, introducing a single-stream regression for benchmarks with large
average packet size. There is no single optimal buffer size for all workloads.
For workloads with packet size <= MTU bytes, MTU + virtio-net header-sized
buffers are preferred as larger buffers reduce the TCP window due to SKB
truesize. However, single-stream workloads with large average packet sizes
have higher throughput if larger (e.g., PAGE_SIZE) buffers are used.

This commit auto-tunes the mergeable receive buffer packet size by choosing
the packet buffer size based on an EWMA of the recent packet sizes for the
receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header
len to PAGE_SIZE. This improves throughput for large packet workloads, as
any workload with average packet size >= PAGE_SIZE will use PAGE_SIZE
buffers.

These optimizations interact positively with recent commit
ba275241030c ("virtio-net: coalesce rx frags when possible during rx"),
which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
optimizations benefit buffers of any size.

Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
between two QEMU VMs on a single physical machine. Each VM has two VCPUs
with all offloads & vhost enabled. All VMs and vhost threads run in a
single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
in the system will not be scheduled on the benchmark CPUs. Trunk includes
SKB rx frag coalescing.

net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
net-next trunk (MTU-size bufs):  13170.01Gb/s
net-next trunk + auto-tune: 14555.94Gb/s

Signed-off-by: Michael Dalton <mwdalton@google.com>
---
 drivers/net/virtio_net.c | 73 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 53 insertions(+), 20 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 0c93054..b1086e0 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -27,6 +27,7 @@
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/average.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -37,10 +38,8 @@ module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
-#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
-                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
-                                L1_CACHE_BYTES))
 #define GOOD_COPY_LEN	128
+#define RECEIVE_AVG_WEIGHT 64
 
 #define VIRTNET_DRIVER_VERSION "1.0.0"
 
@@ -79,6 +78,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Average packet length for mergeable receive buffers. */
+	struct ewma mrg_avg_pkt_len;
+
 	/* Page frag for GFP_ATOMIC packet buffer allocation. */
 	struct page_frag atomic_frag;
 
@@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
 	return skb;
 }
 
-static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
+static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb,
+			     struct page *head_page)
 {
 	struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
 	struct sk_buff *curr_skb = head_skb;
+	struct page *page = head_page;
 	char *buf;
-	struct page *page;
-	int num_buf, len, offset, truesize;
+	int num_buf, len, offset;
+	u32 est_buffer_len;
 
+	len = head_skb->len;
 	num_buf = hdr->mhdr.num_buffers;
 	while (--num_buf) {
 		int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
@@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 			head_skb->dev->stats.rx_length_errors++;
 			return -EINVAL;
 		}
-		truesize = max_t(int, len, MERGE_BUFFER_LEN);
 		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
 			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
 			if (unlikely(!nskb)) {
@@ -338,20 +342,38 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
-			head_skb->truesize += truesize;
+			head_skb->truesize += len;
 		}
 		page = virt_to_head_page(buf);
 		offset = buf - (char *)page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 			put_page(page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
-					     len, truesize);
+					     len, len);
 		} else {
 			skb_add_rx_frag(curr_skb, num_skb_frags, page,
-					offset, len, truesize);
+					offset, len, len);
 		}
 		--rq->num;
 	}
+	/* All frags before the last frag are fully used -- for those frags,
+	 * truesize = len. Use the size of the most recent buffer allocation
+	 * from the last frag's page to estimate the truesize of the last frag.
+	 * EWMA with a weight of 64 makes the size adjustments quite small in
+	 * the frags allocated on one page (even a order-3 one), and truesize
+	 * doesn't need to be 100% accurate.
+	 */
+	if (page) {
+		est_buffer_len = page_private(page);
+		if (est_buffer_len > len) {
+			u32 truesize_delta = est_buffer_len - len;
+
+			curr_skb->truesize += truesize_delta;
+			if (curr_skb != head_skb)
+				head_skb->truesize += truesize_delta;
+		}
+	}
+	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
 	return 0;
 }
 
@@ -382,16 +404,21 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
 		skb_trim(skb, len);
 	} else if (vi->mergeable_rx_bufs) {
 		struct page *page = virt_to_head_page(buf);
-		int truesize = max_t(int, len, MERGE_BUFFER_LEN);
+		/* Use an initial truesize of 'len' bytes for page_to_skb --
+		 * receive_mergeable will fixup the truesize of the last page
+		 * frag if the packet is non-linear (> GOOD_COPY_LEN bytes).
+		 */
 		skb = page_to_skb(rq, page,
 				  (char *)buf - (char *)page_address(page),
-				  len, truesize);
+				  len, len);
 		if (unlikely(!skb)) {
 			dev->stats.rx_dropped++;
 			put_page(page);
 			return;
 		}
-		if (receive_mergeable(rq, skb)) {
+		if (!skb_is_nonlinear(skb))
+			page = NULL;
+		if (receive_mergeable(rq, skb, page)) {
 			dev_kfree_skb(skb);
 			return;
 		}
@@ -540,24 +567,29 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
 static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 {
 	struct virtnet_info *vi = rq->vq->vdev->priv;
+	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
 	struct page_frag *alloc_frag;
 	char *buf;
-	int err, len, hole;
+	int err, hole;
+	u32 buflen;
 
+	buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
+				   GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
+	buflen = ALIGN(buflen, L1_CACHE_BYTES);
 	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
-	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
+	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, gfp)))
 		return -ENOMEM;
 	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
 	get_page(alloc_frag->page);
-	len = MERGE_BUFFER_LEN;
-	alloc_frag->offset += len;
+	alloc_frag->offset += buflen;
+	set_page_private(alloc_frag->page, buflen);
 	hole = alloc_frag->size - alloc_frag->offset;
-	if (hole < MERGE_BUFFER_LEN) {
-		len += hole;
+	if (hole < buflen) {
+		buflen += hole;
 		alloc_frag->offset += hole;
 	}
 
-	sg_init_one(rq->sg, buf, len);
+	sg_init_one(rq->sg, buf, buflen);
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
@@ -1475,6 +1507,7 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
 			       napi_weight);
 
 		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
+		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
 		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
 	}
 
-- 
1.8.4.1

^ permalink raw reply related	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header
  2013-11-12 22:21 [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Michael Dalton
                   ` (3 preceding siblings ...)
  2013-11-12 22:21 ` Michael Dalton
@ 2013-11-12 22:41 ` Eric Dumazet
  2013-11-13  6:53 ` Jason Wang
                   ` (3 subsequent siblings)
  8 siblings, 0 replies; 21+ messages in thread
From: Eric Dumazet @ 2013-11-12 22:41 UTC (permalink / raw)
  To: Michael Dalton
  Cc: Michael S. Tsirkin, netdev, Daniel Borkmann, virtualization,
	Eric Dumazet, David S. Miller

On Tue, 2013-11-12 at 14:21 -0800, Michael Dalton wrote:
> Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page
> frag allocators") changed the mergeable receive buffer size from PAGE_SIZE
> to MTU-size. However, the merge buffer size does not take into account the
> size of the virtio-net header. Consequently, packets that are MTU-size
> will take two buffers instead of one (to store the virtio-net header),
> substantially decreasing the throughput of MTU-size traffic due to TCP
> window / SKB truesize effects.
> 
> This commit changes the mergeable buffer size to include the virtio-net
> header. The buffer size is cacheline-aligned because skb_page_frag_refill
> will not automatically align the requested size.
> 
> Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
> vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
> cpuset, using cgroups to ensure that other processes in the system will not
> be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
> buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
> force MTU-sized packets on the receiver.
> 
> net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
> net-next trunk (MTU 1500- packet uses two buf due to size bug): 4076.62Gb/s
> net-next trunk (MTU 1480- packet fits in one buf): 6301.34Gb/s
> net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s
> 
> Suggested-by: Eric Northup <digitaleric@google.com>
> Signed-off-by: Michael Dalton <mwdalton@google.com>
> ---

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 2/4] net: allow > 0 order atomic page alloc in skb_page_frag_refill
  2013-11-12 22:21 ` [PATCH net-next 2/4] net: allow > 0 order atomic page alloc in skb_page_frag_refill Michael Dalton
@ 2013-11-12 22:42   ` Eric Dumazet
  0 siblings, 0 replies; 21+ messages in thread
From: Eric Dumazet @ 2013-11-12 22:42 UTC (permalink / raw)
  To: Michael Dalton
  Cc: Michael S. Tsirkin, netdev, Daniel Borkmann, virtualization,
	Eric Dumazet, David S. Miller

On Tue, 2013-11-12 at 14:21 -0800, Michael Dalton wrote:
> skb_page_frag_refill currently permits only order-0 page allocs
> unless GFP_WAIT is used. Change skb_page_frag_refill to attempt
> higher-order page allocations whether or not GFP_WAIT is used. If
> memory cannot be allocated, the allocator will fall back to
> successively smaller page allocs (down to order-0 page allocs).
> 
> This change brings skb_page_frag_refill in line with the existing
> page allocation strategy employed by netdev_alloc_frag, which attempts
> higher-order page allocations whether or not GFP_WAIT is set, falling
> back to successively lower-order page allocations on failure. Part
> of migration of virtio-net to per-receive queue page frag allocators.
> 
> Signed-off-by: Michael Dalton <mwdalton@google.com>
> ---
>  net/core/sock.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 3/4] virtio-net: use per-receive queue page frag alloc for mergeable bufs
  2013-11-12 22:21 ` [PATCH net-next 3/4] virtio-net: use per-receive queue page frag alloc for mergeable bufs Michael Dalton
@ 2013-11-12 22:43   ` Eric Dumazet
  0 siblings, 0 replies; 21+ messages in thread
From: Eric Dumazet @ 2013-11-12 22:43 UTC (permalink / raw)
  To: Michael Dalton
  Cc: Michael S. Tsirkin, netdev, Daniel Borkmann, virtualization,
	Eric Dumazet, David S. Miller

On Tue, 2013-11-12 at 14:21 -0800, Michael Dalton wrote:
> The virtio-net driver currently uses netdev_alloc_frag() for GFP_ATOMIC
> mergeable rx buffer allocations. This commit migrates virtio-net to use
> per-receive queue page frags for GFP_ATOMIC allocation. This change unifies
> mergeable rx buffer memory allocation, which will now use skb_page_frag_refill()
> for both atomic and GFP_WAIT buffer allocations.
> 
> To address fragmentation concerns, if after buffer allocation there
> is too little space left in the page frag to allocate a subsequent
> buffer, the remaining space is added to the current allocated buffer
> so that the remaining space can be used to store packet data.
> 
> Signed-off-by: Michael Dalton <mwdalton@google.com>
> ---

Acked-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header
  2013-11-12 22:21 [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Michael Dalton
                   ` (4 preceding siblings ...)
  2013-11-12 22:41 ` [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Eric Dumazet
@ 2013-11-13  6:53 ` Jason Wang
  2013-11-13 17:39 ` Michael S. Tsirkin
                   ` (2 subsequent siblings)
  8 siblings, 0 replies; 21+ messages in thread
From: Jason Wang @ 2013-11-13  6:53 UTC (permalink / raw)
  To: Michael Dalton, David S. Miller
  Cc: Michael S. Tsirkin, netdev, Daniel Borkmann, virtualization,
	Eric Dumazet

On 11/13/2013 06:21 AM, Michael Dalton wrote:
> Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page
> frag allocators") changed the mergeable receive buffer size from PAGE_SIZE
> to MTU-size. However, the merge buffer size does not take into account the
> size of the virtio-net header. Consequently, packets that are MTU-size
> will take two buffers instead of one (to store the virtio-net header),
> substantially decreasing the throughput of MTU-size traffic due to TCP
> window / SKB truesize effects.
>
> This commit changes the mergeable buffer size to include the virtio-net
> header. The buffer size is cacheline-aligned because skb_page_frag_refill
> will not automatically align the requested size.
>
> Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
> vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
> cpuset, using cgroups to ensure that other processes in the system will not
> be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
> buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
> force MTU-sized packets on the receiver.
>
> net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
> net-next trunk (MTU 1500- packet uses two buf due to size bug): 4076.62Gb/s
> net-next trunk (MTU 1480- packet fits in one buf): 6301.34Gb/s
> net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s
>
> Suggested-by: Eric Northup <digitaleric@google.com>
> Signed-off-by: Michael Dalton <mwdalton@google.com>
> ---
>  drivers/net/virtio_net.c | 30 ++++++++++++++++--------------
>  1 file changed, 16 insertions(+), 14 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 01f4eb5..69fb225 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -36,7 +36,10 @@ module_param(csum, bool, 0444);
>  module_param(gso, bool, 0444);
>  
>  /* FIXME: MTU in config. */
> -#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> +#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> +#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
> +                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
> +                                L1_CACHE_BYTES))
>  #define GOOD_COPY_LEN	128
>  
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
> @@ -314,10 +317,10 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
>  			head_skb->dev->stats.rx_length_errors++;
>  			return -EINVAL;
>  		}
> -		if (unlikely(len > MAX_PACKET_LEN)) {
> +		if (unlikely(len > MERGE_BUFFER_LEN)) {
>  			pr_debug("%s: rx error: merge buffer too long\n",
>  				 head_skb->dev->name);
> -			len = MAX_PACKET_LEN;
> +			len = MERGE_BUFFER_LEN;
>  		}
>  		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
>  			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
> @@ -336,18 +339,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
>  		if (curr_skb != head_skb) {
>  			head_skb->data_len += len;
>  			head_skb->len += len;
> -			head_skb->truesize += MAX_PACKET_LEN;
> +			head_skb->truesize += MERGE_BUFFER_LEN;
>  		}
>  		page = virt_to_head_page(buf);
>  		offset = buf - (char *)page_address(page);
>  		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
>  			put_page(page);
>  			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> -					     len, MAX_PACKET_LEN);
> +					     len, MERGE_BUFFER_LEN);
>  		} else {
>  			skb_add_rx_frag(curr_skb, num_skb_frags, page,
> -					offset, len,
> -					MAX_PACKET_LEN);
> +					offset, len, MERGE_BUFFER_LEN);
>  		}
>  		--rq->num;
>  	}
> @@ -383,7 +385,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
>  		struct page *page = virt_to_head_page(buf);
>  		skb = page_to_skb(rq, page,
>  				  (char *)buf - (char *)page_address(page),
> -				  len, MAX_PACKET_LEN);
> +				  len, MERGE_BUFFER_LEN);
>  		if (unlikely(!skb)) {
>  			dev->stats.rx_dropped++;
>  			put_page(page);
> @@ -471,11 +473,11 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
>  	struct skb_vnet_hdr *hdr;
>  	int err;
>  
> -	skb = __netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN, gfp);
> +	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
>  	if (unlikely(!skb))
>  		return -ENOMEM;
>  
> -	skb_put(skb, MAX_PACKET_LEN);
> +	skb_put(skb, GOOD_PACKET_LEN);
>  
>  	hdr = skb_vnet_hdr(skb);
>  	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);
> @@ -542,20 +544,20 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>  	int err;
>  
>  	if (gfp & __GFP_WAIT) {
> -		if (skb_page_frag_refill(MAX_PACKET_LEN, &vi->alloc_frag,
> +		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
>  					 gfp)) {
>  			buf = (char *)page_address(vi->alloc_frag.page) +
>  			      vi->alloc_frag.offset;
>  			get_page(vi->alloc_frag.page);
> -			vi->alloc_frag.offset += MAX_PACKET_LEN;
> +			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
>  		}
>  	} else {
> -		buf = netdev_alloc_frag(MAX_PACKET_LEN);
> +		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
>  	}
>  	if (!buf)
>  		return -ENOMEM;
>  
> -	sg_init_one(rq->sg, buf, MAX_PACKET_LEN);
> +	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
>  	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>  	if (err < 0)
>  		put_page(virt_to_head_page(buf));

Acked-by: Jason Wang <jasowang@redhat.com>

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-12 22:21 ` Michael Dalton
@ 2013-11-13  7:10   ` Jason Wang
  2013-11-13  7:40     ` Eric Dumazet
  2013-11-13 17:42     ` Michael S. Tsirkin
  2013-11-13  8:47   ` Ronen Hod
  1 sibling, 2 replies; 21+ messages in thread
From: Jason Wang @ 2013-11-13  7:10 UTC (permalink / raw)
  To: Michael Dalton, David S. Miller
  Cc: Michael S. Tsirkin, netdev, Daniel Borkmann, virtualization,
	Eric Dumazet

On 11/13/2013 06:21 AM, Michael Dalton wrote:
> Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
> allocators") changed the mergeable receive buffer size from PAGE_SIZE to
> MTU-size, introducing a single-stream regression for benchmarks with large
> average packet size. There is no single optimal buffer size for all workloads.
> For workloads with packet size <= MTU bytes, MTU + virtio-net header-sized
> buffers are preferred as larger buffers reduce the TCP window due to SKB
> truesize. However, single-stream workloads with large average packet sizes
> have higher throughput if larger (e.g., PAGE_SIZE) buffers are used.
>
> This commit auto-tunes the mergeable receive buffer packet size by choosing
> the packet buffer size based on an EWMA of the recent packet sizes for the
> receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header
> len to PAGE_SIZE. This improves throughput for large packet workloads, as
> any workload with average packet size >= PAGE_SIZE will use PAGE_SIZE
> buffers.

Hi Michael:

There's one concern with EWMA. How well does it handle multiple streams,
each with a different packet size? E.g. there may be two flows, one with
256 bytes per packet and the other with 64K. Looks like it can result in us
allocating a PAGE_SIZE buffer for the 256-byte packets (which is bad since
the payload/truesize ratio is low) or 1500+ bytes for the 64K buffers (which
is ok since we can do coalescing).
>
> These optimizations interact positively with recent commit
> ba275241030c ("virtio-net: coalesce rx frags when possible during rx"),
> which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
> optimizations benefit buffers of any size.
>
> Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> between two QEMU VMs on a single physical machine. Each VM has two VCPUs
> with all offloads & vhost enabled. All VMs and vhost threads run in a
> single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
> in the system will not be scheduled on the benchmark CPUs. Trunk includes
> SKB rx frag coalescing.
>
> net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
> net-next trunk (MTU-size bufs):  13170.01Gb/s
> net-next trunk + auto-tune: 14555.94Gb/s

Do you have perf numbers with just this patch left out? We need to know
exactly how much the EWMA helps.
>
> Signed-off-by: Michael Dalton <mwdalton@google.com>
> ---
>  drivers/net/virtio_net.c | 73 +++++++++++++++++++++++++++++++++++-------------
>  1 file changed, 53 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 0c93054..b1086e0 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -27,6 +27,7 @@
>  #include <linux/if_vlan.h>
>  #include <linux/slab.h>
>  #include <linux/cpu.h>
> +#include <linux/average.h>
>  
>  static int napi_weight = NAPI_POLL_WEIGHT;
>  module_param(napi_weight, int, 0444);
> @@ -37,10 +38,8 @@ module_param(gso, bool, 0444);
>  
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> -#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
> -                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
> -                                L1_CACHE_BYTES))
>  #define GOOD_COPY_LEN	128
> +#define RECEIVE_AVG_WEIGHT 64

Maybe we can make this a module parameter.
>  
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
>  
> @@ -79,6 +78,9 @@ struct receive_queue {
>  	/* Chain pages by the private ptr. */
>  	struct page *pages;
>  
> +	/* Average packet length for mergeable receive buffers. */
> +	struct ewma mrg_avg_pkt_len;
> +
>  	/* Page frag for GFP_ATOMIC packet buffer allocation. */
>  	struct page_frag atomic_frag;
>  
> @@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
>  	return skb;
>  }
>  
> -static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
> +static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb,
> +			     struct page *head_page)
>  {
>  	struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
>  	struct sk_buff *curr_skb = head_skb;
> +	struct page *page = head_page;
>  	char *buf;
> -	struct page *page;
> -	int num_buf, len, offset, truesize;
> +	int num_buf, len, offset;
> +	u32 est_buffer_len;
>  
> +	len = head_skb->len;
>  	num_buf = hdr->mhdr.num_buffers;
>  	while (--num_buf) {
>  		int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
> @@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
>  			head_skb->dev->stats.rx_length_errors++;
>  			return -EINVAL;
>  		}
> -		truesize = max_t(int, len, MERGE_BUFFER_LEN);
>  		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
>  			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
>  			if (unlikely(!nskb)) {
> @@ -338,20 +342,38 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
>  		if (curr_skb != head_skb) {
>  			head_skb->data_len += len;
>  			head_skb->len += len;
> -			head_skb->truesize += truesize;
> +			head_skb->truesize += len;
>  		}
>  		page = virt_to_head_page(buf);
>  		offset = buf - (char *)page_address(page);
>  		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
>  			put_page(page);
>  			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> -					     len, truesize);
> +					     len, len);
>  		} else {
>  			skb_add_rx_frag(curr_skb, num_skb_frags, page,
> -					offset, len, truesize);
> +					offset, len, len);
>  		}
>  		--rq->num;
>  	}
> +	/* All frags before the last frag are fully used -- for those frags,
> +	 * truesize = len. Use the size of the most recent buffer allocation
> +	 * from the last frag's page to estimate the truesize of the last frag.
> +	 * EWMA with a weight of 64 makes the size adjustments quite small in
> +	 * the frags allocated on one page (even a order-3 one), and truesize
> +	 * doesn't need to be 100% accurate.
> +	 */
> +	if (page) {
> +		est_buffer_len = page_private(page);
> +		if (est_buffer_len > len) {
> +			u32 truesize_delta = est_buffer_len - len;
> +
> +			curr_skb->truesize += truesize_delta;
> +			if (curr_skb != head_skb)
> +				head_skb->truesize += truesize_delta;
> +		}

Is there a chance that est_buffer_len is smaller than or equal to len?
> +	}
> +	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
>  	return 0;
>  }
>  
> @@ -382,16 +404,21 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
>  		skb_trim(skb, len);
>  	} else if (vi->mergeable_rx_bufs) {
>  		struct page *page = virt_to_head_page(buf);
> -		int truesize = max_t(int, len, MERGE_BUFFER_LEN);
> +		/* Use an initial truesize of 'len' bytes for page_to_skb --
> +		 * receive_mergeable will fixup the truesize of the last page
> +		 * frag if the packet is non-linear (> GOOD_COPY_LEN bytes).
> +		 */
>  		skb = page_to_skb(rq, page,
>  				  (char *)buf - (char *)page_address(page),
> -				  len, truesize);
> +				  len, len);
>  		if (unlikely(!skb)) {
>  			dev->stats.rx_dropped++;
>  			put_page(page);
>  			return;
>  		}
> -		if (receive_mergeable(rq, skb)) {
> +		if (!skb_is_nonlinear(skb))
> +			page = NULL;
> +		if (receive_mergeable(rq, skb, page)) {
>  			dev_kfree_skb(skb);
>  			return;
>  		}
> @@ -540,24 +567,29 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
>  static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>  {
>  	struct virtnet_info *vi = rq->vq->vdev->priv;
> +	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>  	struct page_frag *alloc_frag;
>  	char *buf;
> -	int err, len, hole;
> +	int err, hole;
> +	u32 buflen;
>  
> +	buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
> +				   GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
> +	buflen = ALIGN(buflen, L1_CACHE_BYTES);
>  	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
> -	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
> +	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, gfp)))
>  		return -ENOMEM;
>  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>  	get_page(alloc_frag->page);
> -	len = MERGE_BUFFER_LEN;
> -	alloc_frag->offset += len;
> +	alloc_frag->offset += buflen;
> +	set_page_private(alloc_frag->page, buflen);

Not sure this is accurate, since buflen may change and several frags may
share a single page. So the est_buffer_len we get in receive_mergeable()
may not be the correct value.
>  	hole = alloc_frag->size - alloc_frag->offset;
> -	if (hole < MERGE_BUFFER_LEN) {
> -		len += hole;
> +	if (hole < buflen) {
> +		buflen += hole;
>  		alloc_frag->offset += hole;
>  	}
>  
> -	sg_init_one(rq->sg, buf, len);
> +	sg_init_one(rq->sg, buf, buflen);
>  	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>  	if (err < 0)
>  		put_page(virt_to_head_page(buf));
> @@ -1475,6 +1507,7 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
>  			       napi_weight);
>  
>  		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
> +		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
>  		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
>  	}
>  

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-13  7:10   ` Jason Wang
@ 2013-11-13  7:40     ` Eric Dumazet
  2013-11-20  2:06       ` Rusty Russell
  2013-11-13 17:42     ` Michael S. Tsirkin
  1 sibling, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2013-11-13  7:40 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, Daniel Borkmann,
	virtualization, Eric Dumazet, David S. Miller

On Wed, 2013-11-13 at 15:10 +0800, Jason Wang wrote:

> There's one concern with EWMA. How well does it handle multiple streams
> each with different packet size? E.g there may be two flows, one with
> 256 bytes each packet another is 64K.  Looks like it can result we
> allocate PAGE_SIZE buffer for 256 (which is bad since the
> payload/truesize is low) bytes or 1500+ for 64K buffer (which is ok
> since we can do coalescing).

It's hard to predict the future ;)

256-byte frames consume 2.5 KB anyway on a traditional NIC.
If it was a concern, we would have it already.

If you receive a mix of big and small frames, there is no win.

> > +	if (page) {
> > +		est_buffer_len = page_private(page);
> > +		if (est_buffer_len > len) {
> > +			u32 truesize_delta = est_buffer_len - len;
> > +
> > +			curr_skb->truesize += truesize_delta;
> > +			if (curr_skb != head_skb)
> > +				head_skb->truesize += truesize_delta;
> > +		}
> 
> Is there a chance that est_buffer_len is smaller than or equal to len?

Yes, and in this case we do not really care, see below.

> > +	}
> > +	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
> >  	return 0;
> >  }
> >  
> > @@ -382,16 +404,21 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
> >  		skb_trim(skb, len);
> >  	} else if (vi->mergeable_rx_bufs) {
> >  		struct page *page = virt_to_head_page(buf);
> > -		int truesize = max_t(int, len, MERGE_BUFFER_LEN);
> > +		/* Use an initial truesize of 'len' bytes for page_to_skb --
> > +		 * receive_mergeable will fixup the truesize of the last page
> > +		 * frag if the packet is non-linear (> GOOD_COPY_LEN bytes).
> > +		 */
> >  		skb = page_to_skb(rq, page,
> >  				  (char *)buf - (char *)page_address(page),
> > -				  len, truesize);
> > +				  len, len);
> >  		if (unlikely(!skb)) {
> >  			dev->stats.rx_dropped++;
> >  			put_page(page);
> >  			return;
> >  		}
> > -		if (receive_mergeable(rq, skb)) {
> > +		if (!skb_is_nonlinear(skb))
> > +			page = NULL;
> > +		if (receive_mergeable(rq, skb, page)) {
> >  			dev_kfree_skb(skb);
> >  			return;
> >  		}
> > @@ -540,24 +567,29 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
> >  static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
> >  {
> >  	struct virtnet_info *vi = rq->vq->vdev->priv;
> > +	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> >  	struct page_frag *alloc_frag;
> >  	char *buf;
> > -	int err, len, hole;
> > +	int err, hole;
> > +	u32 buflen;
> >  
> > +	buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
> > +				   GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
> > +	buflen = ALIGN(buflen, L1_CACHE_BYTES);
> >  	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
> > -	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
> > +	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, gfp)))
> >  		return -ENOMEM;
> >  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> >  	get_page(alloc_frag->page);
> > -	len = MERGE_BUFFER_LEN;
> > -	alloc_frag->offset += len;
> > +	alloc_frag->offset += buflen;
> > +	set_page_private(alloc_frag->page, buflen);
> 
> Not sure this is accurate, since buflen may change and several frags may
> share a single page. So the est_buffer_len we get in receive_mergeable()
> may not be the correct value.

skb->truesize has to be reasonably accurate. 

For example, fast clone storage is not accounted for in TCP skbs stored
in socket write queues. That's ~256 bytes per skb of 'missing'
accounting.

This is about 10% error when TSO/GSO is off.

With this EWMA using a weight of 64, the potential error will be much
less than 10%.

Small frames tend to be consumed quite fast (ACK messages, small UDP
frames) in most cases.
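
For illustration, here is a minimal standalone sketch of the recurrence such an
EWMA follows -- avg <- (avg * (w - 1) + sample) / w -- assuming weight w = 64 and
factor 1 as in the patch. It only shows why one sample moves the average by at
most 1/64th of the distance to that sample; it is not the kernel's lib/average.c
code itself.

/* Standalone sketch: one EWMA step with weight 64 (illustrative only). */
#include <stdio.h>

int main(void)
{
	const unsigned long w = 64;	/* RECEIVE_AVG_WEIGHT in the patch */
	unsigned long avg = 1500;	/* current average packet length */
	unsigned long sample = 4096;	/* a PAGE_SIZE-sized packet arrives */

	/* One update moves the average by (sample - avg) / w, i.e. ~40
	 * bytes here, so a single outlier barely shifts the estimate.
	 */
	avg = (avg * (w - 1) + sample) / w;
	printf("avg after one 4096-byte sample: %lu\n", avg);	/* 1540 */
	return 0;
}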


* Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-12 22:21 ` Michael Dalton
  2013-11-13  7:10   ` Jason Wang
@ 2013-11-13  8:47   ` Ronen Hod
  2013-11-13 14:19     ` Eric Dumazet
  1 sibling, 1 reply; 21+ messages in thread
From: Ronen Hod @ 2013-11-13  8:47 UTC (permalink / raw)
  To: Michael Dalton, David S. Miller
  Cc: Michael S. Tsirkin, netdev, Daniel Borkmann, virtualization,
	Eric Dumazet

On 11/13/2013 12:21 AM, Michael Dalton wrote:
> Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
> allocators") changed the mergeable receive buffer size from PAGE_SIZE to
> MTU-size, introducing a single-stream regression for benchmarks with large
> average packet size. There is no single optimal buffer size for all workloads.
> For workloads with packet size <= MTU bytes, MTU + virtio-net header-sized
> buffers are preferred as larger buffers reduce the TCP window due to SKB
> truesize. However, single-stream workloads with large average packet sizes
> have higher throughput if larger (e.g., PAGE_SIZE) buffers are used.
>
> This commit auto-tunes the mergeable receive buffer packet size by choosing
> the packet buffer size based on an EWMA of the recent packet sizes for the
> receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header
> len to PAGE_SIZE. This improves throughput for large packet workloads, as
> any workload with average packet size >= PAGE_SIZE will use PAGE_SIZE
> buffers.
>
> These optimizations interact positively with recent commit
> ba275241030c ("virtio-net: coalesce rx frags when possible during rx"),
> which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
> optimizations benefit buffers of any size.
>
> Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> between two QEMU VMs on a single physical machine. Each VM has two VCPUs
> with all offloads & vhost enabled. All VMs and vhost threads run in a
> single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
> in the system will not be scheduled on the benchmark CPUs. Trunk includes
> SKB rx frag coalescing.
>
> net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
> net-next trunk (MTU-size bufs):  13170.01Gb/s
> net-next trunk + auto-tune: 14555.94Gb/s
>
> Signed-off-by: Michael Dalton <mwdalton@google.com>
> ---
>   drivers/net/virtio_net.c | 73 +++++++++++++++++++++++++++++++++++-------------
>   1 file changed, 53 insertions(+), 20 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 0c93054..b1086e0 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -27,6 +27,7 @@
>   #include <linux/if_vlan.h>
>   #include <linux/slab.h>
>   #include <linux/cpu.h>
> +#include <linux/average.h>
>   
>   static int napi_weight = NAPI_POLL_WEIGHT;
>   module_param(napi_weight, int, 0444);
> @@ -37,10 +38,8 @@ module_param(gso, bool, 0444);
>   
>   /* FIXME: MTU in config. */
>   #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> -#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
> -                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
> -                                L1_CACHE_BYTES))
>   #define GOOD_COPY_LEN	128
> +#define RECEIVE_AVG_WEIGHT 64
>   
>   #define VIRTNET_DRIVER_VERSION "1.0.0"
>   
> @@ -79,6 +78,9 @@ struct receive_queue {
>   	/* Chain pages by the private ptr. */
>   	struct page *pages;
>   
> +	/* Average packet length for mergeable receive buffers. */
> +	struct ewma mrg_avg_pkt_len;
> +
>   	/* Page frag for GFP_ATOMIC packet buffer allocation. */
>   	struct page_frag atomic_frag;
>   
> @@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
>   	return skb;
>   }
>   
> -static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
> +static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb,
> +			     struct page *head_page)
>   {
>   	struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
>   	struct sk_buff *curr_skb = head_skb;
> +	struct page *page = head_page;
>   	char *buf;
> -	struct page *page;
> -	int num_buf, len, offset, truesize;
> +	int num_buf, len, offset;
> +	u32 est_buffer_len;
>   
> +	len = head_skb->len;
>   	num_buf = hdr->mhdr.num_buffers;
>   	while (--num_buf) {
>   		int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
> @@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
>   			head_skb->dev->stats.rx_length_errors++;
>   			return -EINVAL;
>   		}
> -		truesize = max_t(int, len, MERGE_BUFFER_LEN);
>   		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
>   			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
>   			if (unlikely(!nskb)) {
> @@ -338,20 +342,38 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
>   		if (curr_skb != head_skb) {
>   			head_skb->data_len += len;
>   			head_skb->len += len;
> -			head_skb->truesize += truesize;
> +			head_skb->truesize += len;
>   		}
>   		page = virt_to_head_page(buf);
>   		offset = buf - (char *)page_address(page);
>   		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
>   			put_page(page);
>   			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> -					     len, truesize);
> +					     len, len);
>   		} else {
>   			skb_add_rx_frag(curr_skb, num_skb_frags, page,
> -					offset, len, truesize);
> +					offset, len, len);
>   		}
>   		--rq->num;
>   	}
> +	/* All frags before the last frag are fully used -- for those frags,
> +	 * truesize = len. Use the size of the most recent buffer allocation
> +	 * from the last frag's page to estimate the truesize of the last frag.
> +	 * EWMA with a weight of 64 makes the size adjustments quite small in
> +	 * the frags allocated on one page (even a order-3 one), and truesize
> +	 * doesn't need to be 100% accurate.
> +	 */
> +	if (page) {
> +		est_buffer_len = page_private(page);
> +		if (est_buffer_len > len) {
> +			u32 truesize_delta = est_buffer_len - len;
> +
> +			curr_skb->truesize += truesize_delta;
> +			if (curr_skb != head_skb)
> +				head_skb->truesize += truesize_delta;
> +		}
> +	}
> +	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);

Hi,

I looked at how ewma works, and although it is computationally efficient,
and it does what it is supposed to do, initially (at the first samples) it is strongly
biased towards the value that was added at the first ewma_add.
I suggest that you print the values of ewma_add() and ewma_read(). If you are
happy with the results, then ignore my comments. If you are not, then I can
provide a version that does better for the first samples.
Unfortunately, it will be slightly less efficient.

Ronen.

>   	return 0;
>   }
>   
> @@ -382,16 +404,21 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
>   		skb_trim(skb, len);
>   	} else if (vi->mergeable_rx_bufs) {
>   		struct page *page = virt_to_head_page(buf);
> -		int truesize = max_t(int, len, MERGE_BUFFER_LEN);
> +		/* Use an initial truesize of 'len' bytes for page_to_skb --
> +		 * receive_mergeable will fixup the truesize of the last page
> +		 * frag if the packet is non-linear (> GOOD_COPY_LEN bytes).
> +		 */
>   		skb = page_to_skb(rq, page,
>   				  (char *)buf - (char *)page_address(page),
> -				  len, truesize);
> +				  len, len);
>   		if (unlikely(!skb)) {
>   			dev->stats.rx_dropped++;
>   			put_page(page);
>   			return;
>   		}
> -		if (receive_mergeable(rq, skb)) {
> +		if (!skb_is_nonlinear(skb))
> +			page = NULL;
> +		if (receive_mergeable(rq, skb, page)) {
>   			dev_kfree_skb(skb);
>   			return;
>   		}
> @@ -540,24 +567,29 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
>   static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>   {
>   	struct virtnet_info *vi = rq->vq->vdev->priv;
> +	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
>   	struct page_frag *alloc_frag;
>   	char *buf;
> -	int err, len, hole;
> +	int err, hole;
> +	u32 buflen;
>   
> +	buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
> +				   GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
> +	buflen = ALIGN(buflen, L1_CACHE_BYTES);
>   	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
> -	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
> +	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, gfp)))
>   		return -ENOMEM;
>   	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>   	get_page(alloc_frag->page);
> -	len = MERGE_BUFFER_LEN;
> -	alloc_frag->offset += len;
> +	alloc_frag->offset += buflen;
> +	set_page_private(alloc_frag->page, buflen);
>   	hole = alloc_frag->size - alloc_frag->offset;
> -	if (hole < MERGE_BUFFER_LEN) {
> -		len += hole;
> +	if (hole < buflen) {
> +		buflen += hole;
>   		alloc_frag->offset += hole;
>   	}
>   
> -	sg_init_one(rq->sg, buf, len);
> +	sg_init_one(rq->sg, buf, buflen);
>   	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>   	if (err < 0)
>   		put_page(virt_to_head_page(buf));
> @@ -1475,6 +1507,7 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
>   			       napi_weight);
>   
>   		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
> +		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
>   		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
>   	}
>   


* Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-13  8:47   ` Ronen Hod
@ 2013-11-13 14:19     ` Eric Dumazet
  2013-11-13 16:43       ` Ronen Hod
  0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2013-11-13 14:19 UTC (permalink / raw)
  To: Ronen Hod
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, Daniel Borkmann,
	virtualization, Eric Dumazet, David S. Miller

On Wed, 2013-11-13 at 10:47 +0200, Ronen Hod wrote:

> I looked at how ewma works, and although it is computationally efficient,
> and it does what it is supposed to do, initially (at the first samples) it is strongly
> biased towards the value that was added at the first ewma_add.
> I suggest that you print the values of ewma_add() and ewma_read(). If you are
> happy with the results, then ignore my comments. If you are not, then I can
> provide a version that does better for the first samples.
> Unfortunately, it will be slightly less efficient.

Value is clamped by (GOOD_PACKET_LEN, PAGE_SIZE - hdr_len)

So initial value is conservative and not really used.

Thanks


* Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-13 14:19     ` Eric Dumazet
@ 2013-11-13 16:43       ` Ronen Hod
  2013-11-13 17:18         ` Eric Dumazet
  0 siblings, 1 reply; 21+ messages in thread
From: Ronen Hod @ 2013-11-13 16:43 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, Daniel Borkmann,
	virtualization, Eric Dumazet, David S. Miller

On 11/13/2013 04:19 PM, Eric Dumazet wrote:
> On Wed, 2013-11-13 at 10:47 +0200, Ronen Hod wrote:
>
>> I looked at how ewma works, and although it is computationally efficient,
>> and it does what it is supposed to do, initially (at the first samples) it is strongly
>> biased towards the value that was added at the first ewma_add.
>> I suggest that you print the values of ewma_add() and ewma_read(). If you are
>> happy with the results, then ignore my comments. If you are not, then I can
>> provide a version that does better for the first samples.
>> Unfortunately, it will be slightly less efficient.
> Value is clamped by (GOOD_PACKET_LEN, PAGE_SIZE - hdr_len)
>
> So initial value is conservative and not really used.

Hi Eric,

This initial value, which you do not really want to use, will slowly fade, but it
will still pretty much dominate the returned value for the first RECEIVE_AVG_WEIGHT(==64)
samples or so (most ewma implementations suffer from this bug).
Naturally, it doesn't matter much if you just keep it running forever.
However, if you want to restart the learning process more often, which might make
sense upon changes, then the auto-tuning will be very sub-optimal.
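
As a rough, standalone illustration of how long that seed lingers (assuming the
usual avg <- (avg * (w - 1) + sample) / w update with w = 64; the numbers are
plain arithmetic, not taken from the driver):

/* Standalone sketch: how slowly the initial EWMA value fades with weight 64. */
#include <stdio.h>

int main(void)
{
	double share = 1.0;	/* fraction of the average still owed to the seed */
	int n = 0;

	while (share >= 0.5) {
		share *= 63.0 / 64.0;	/* one update with weight 64 */
		n++;
	}
	printf("seed weight drops below 50%% after %d samples\n", n);	/* ~45 */

	while (n < 64) {
		share *= 63.0 / 64.0;
		n++;
	}
	printf("seed weight after 64 samples: %.2f\n", share);	/* ~0.36 */
	return 0;
}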

Ronen.

> Thanks
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


* Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-13 16:43       ` Ronen Hod
@ 2013-11-13 17:18         ` Eric Dumazet
  0 siblings, 0 replies; 21+ messages in thread
From: Eric Dumazet @ 2013-11-13 17:18 UTC (permalink / raw)
  To: Ronen Hod
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, Daniel Borkmann,
	virtualization, Eric Dumazet, David S. Miller

On Wed, 2013-11-13 at 18:43 +0200, Ronen Hod wrote:

> 
> This initial value, which you do not really want to use, will slowly fade, but it
> will still pretty much dominate the returned value for the first RECEIVE_AVG_WEIGHT(==64)
> samples or so (most ewma implementations suffer from this bug).
> Naturally, it doesn't matter much if you just keep it running forever.
> However, if you want to restart the learning process more often, which might make
> sense upon changes, then the auto-tuning will be very sub-optimal.

Note that we fill a ring buffer at open time (try_fill_recv()),
all these buffers will be of the minimal size.

By the time we have refilled the ring buffer, EWMA value will be
GOOD_PACKET_LEN.

These sizes are a hint, clamped between 1500 and PAGE_SIZE.

We do not care of very first allocated buffers, they are good enough.

We only care of the million of following allocations.

Also note the EWMA is per queue, not global to the device.

Of course, there is no 'one size' perfect for all usages.


* Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header
  2013-11-12 22:21 [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Michael Dalton
                   ` (5 preceding siblings ...)
  2013-11-13  6:53 ` Jason Wang
@ 2013-11-13 17:39 ` Michael S. Tsirkin
  2013-11-13 17:43 ` Michael S. Tsirkin
  2013-11-14  7:38 ` David Miller
  8 siblings, 0 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2013-11-13 17:39 UTC (permalink / raw)
  To: Michael Dalton
  Cc: netdev, Daniel Borkmann, virtualization, Eric Dumazet, David S. Miller

On Tue, Nov 12, 2013 at 02:21:22PM -0800, Michael Dalton wrote:
> Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page
> frag allocators") changed the mergeable receive buffer size from PAGE_SIZE
> to MTU-size. However, the merge buffer size does not take into account the
> size of the virtio-net header. Consequently, packets that are MTU-size
> will take two buffers instead of one (to store the virtio-net header),
> substantially decreasing the throughput of MTU-size traffic due to TCP
> window / SKB truesize effects.
> 
> This commit changes the mergeable buffer size to include the virtio-net
> header. The buffer size is cacheline-aligned because skb_page_frag_refill
> will not automatically align the requested size.
> 
> Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
> vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
> cpuset, using cgroups to ensure that other processes in the system will not
> be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
> buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
> force MTU-sized packets on the receiver.
> 
> net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
> net-next trunk (MTU 1500 - packet uses two buf due to size bug): 4076.62Gb/s
> net-next trunk (MTU 1480 - packet fits in one buf): 6301.34Gb/s
> net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s
> 
> Suggested-by: Eric Northup <digitaleric@google.com>
> Signed-off-by: Michael Dalton <mwdalton@google.com>

Acked-by: Michael S. Tsirkin <mst@redhat.com>

> ---
>  drivers/net/virtio_net.c | 30 ++++++++++++++++--------------
>  1 file changed, 16 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 01f4eb5..69fb225 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -36,7 +36,10 @@ module_param(csum, bool, 0444);
>  module_param(gso, bool, 0444);
>  
>  /* FIXME: MTU in config. */
> -#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> +#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> +#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
> +                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
> +                                L1_CACHE_BYTES))
>  #define GOOD_COPY_LEN	128
>  
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
> @@ -314,10 +317,10 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
>  			head_skb->dev->stats.rx_length_errors++;
>  			return -EINVAL;
>  		}
> -		if (unlikely(len > MAX_PACKET_LEN)) {
> +		if (unlikely(len > MERGE_BUFFER_LEN)) {
>  			pr_debug("%s: rx error: merge buffer too long\n",
>  				 head_skb->dev->name);
> -			len = MAX_PACKET_LEN;
> +			len = MERGE_BUFFER_LEN;
>  		}
>  		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
>  			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
> @@ -336,18 +339,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
>  		if (curr_skb != head_skb) {
>  			head_skb->data_len += len;
>  			head_skb->len += len;
> -			head_skb->truesize += MAX_PACKET_LEN;
> +			head_skb->truesize += MERGE_BUFFER_LEN;
>  		}
>  		page = virt_to_head_page(buf);
>  		offset = buf - (char *)page_address(page);
>  		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
>  			put_page(page);
>  			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> -					     len, MAX_PACKET_LEN);
> +					     len, MERGE_BUFFER_LEN);
>  		} else {
>  			skb_add_rx_frag(curr_skb, num_skb_frags, page,
> -					offset, len,
> -					MAX_PACKET_LEN);
> +					offset, len, MERGE_BUFFER_LEN);
>  		}
>  		--rq->num;
>  	}
> @@ -383,7 +385,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
>  		struct page *page = virt_to_head_page(buf);
>  		skb = page_to_skb(rq, page,
>  				  (char *)buf - (char *)page_address(page),
> -				  len, MAX_PACKET_LEN);
> +				  len, MERGE_BUFFER_LEN);
>  		if (unlikely(!skb)) {
>  			dev->stats.rx_dropped++;
>  			put_page(page);
> @@ -471,11 +473,11 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
>  	struct skb_vnet_hdr *hdr;
>  	int err;
>  
> -	skb = __netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN, gfp);
> +	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
>  	if (unlikely(!skb))
>  		return -ENOMEM;
>  
> -	skb_put(skb, MAX_PACKET_LEN);
> +	skb_put(skb, GOOD_PACKET_LEN);
>  
>  	hdr = skb_vnet_hdr(skb);
>  	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);
> @@ -542,20 +544,20 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>  	int err;
>  
>  	if (gfp & __GFP_WAIT) {
> -		if (skb_page_frag_refill(MAX_PACKET_LEN, &vi->alloc_frag,
> +		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
>  					 gfp)) {
>  			buf = (char *)page_address(vi->alloc_frag.page) +
>  			      vi->alloc_frag.offset;
>  			get_page(vi->alloc_frag.page);
> -			vi->alloc_frag.offset += MAX_PACKET_LEN;
> +			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
>  		}
>  	} else {
> -		buf = netdev_alloc_frag(MAX_PACKET_LEN);
> +		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
>  	}
>  	if (!buf)
>  		return -ENOMEM;
>  
> -	sg_init_one(rq->sg, buf, MAX_PACKET_LEN);
> +	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
>  	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>  	if (err < 0)
>  		put_page(virt_to_head_page(buf));
> -- 
> 1.8.4.1


* Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-13  7:10   ` Jason Wang
  2013-11-13  7:40     ` Eric Dumazet
@ 2013-11-13 17:42     ` Michael S. Tsirkin
  2013-11-16  9:06       ` Michael Dalton
  1 sibling, 1 reply; 21+ messages in thread
From: Michael S. Tsirkin @ 2013-11-13 17:42 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael Dalton, netdev, Daniel Borkmann, virtualization,
	Eric Dumazet, David S. Miller

On Wed, Nov 13, 2013 at 03:10:20PM +0800, Jason Wang wrote:
> On 11/13/2013 06:21 AM, Michael Dalton wrote:
> > Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page frag
> > allocators") changed the mergeable receive buffer size from PAGE_SIZE to
> > MTU-size, introducing a single-stream regression for benchmarks with large
> > average packet size. There is no single optimal buffer size for all workloads.
> > For workloads with packet size <= MTU bytes, MTU + virtio-net header-sized
> > buffers are preferred as larger buffers reduce the TCP window due to SKB
> > truesize. However, single-stream workloads with large average packet sizes
> > have higher throughput if larger (e.g., PAGE_SIZE) buffers are used.
> >
> > This commit auto-tunes the mergeable receive buffer packet size by choosing
> > the packet buffer size based on an EWMA of the recent packet sizes for the
> > receive queue. Packet buffer sizes range from MTU_SIZE + virtio-net header
> > len to PAGE_SIZE. This improves throughput for large packet workloads, as
> > any workload with average packet size >= PAGE_SIZE will use PAGE_SIZE
> > buffers.
> 
> Hi Michael:
> 
> There's one concern with EWMA. How well does it handle multiple streams,
> each with a different packet size? E.g. there may be two flows, one with
> 256 bytes per packet and the other with 64K. It looks like this can result
> in us allocating a PAGE_SIZE buffer for the 256-byte packets (which is bad
> since the payload/truesize ratio is low) or a 1500+ byte buffer for the 64K
> flow (which is OK since we can do coalescing).
> >
> > These optimizations interact positively with recent commit
> > ba275241030c ("virtio-net: coalesce rx frags when possible during rx"),
> > which coalesces adjacent RX SKB fragments in virtio_net. The coalescing
> > optimizations benefit buffers of any size.
> >
> > Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> > between two QEMU VMs on a single physical machine. Each VM has two VCPUs
> > with all offloads & vhost enabled. All VMs and vhost threads run in a
> > single 4 CPU cgroup cpuset, using cgroups to ensure that other processes
> > in the system will not be scheduled on the benchmark CPUs. Trunk includes
> > SKB rx frag coalescing.
> >
> > net-next trunk w/ virtio_net before 2613af0ed18a (PAGE_SIZE bufs): 14642.85Gb/s
> > net-next trunk (MTU-size bufs):  13170.01Gb/s
> > net-next trunk + auto-tune: 14555.94Gb/s
> 
> Do you have perf numbers just without this patch? We need to know
> how much EWMA helps exactly.

Yes I'm curious too.

> >
> > Signed-off-by: Michael Dalton <mwdalton@google.com>
> > ---
> >  drivers/net/virtio_net.c | 73 +++++++++++++++++++++++++++++++++++-------------
> >  1 file changed, 53 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 0c93054..b1086e0 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -27,6 +27,7 @@
> >  #include <linux/if_vlan.h>
> >  #include <linux/slab.h>
> >  #include <linux/cpu.h>
> > +#include <linux/average.h>
> >  
> >  static int napi_weight = NAPI_POLL_WEIGHT;
> >  module_param(napi_weight, int, 0444);
> > @@ -37,10 +38,8 @@ module_param(gso, bool, 0444);
> >  
> >  /* FIXME: MTU in config. */
> >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > -#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
> > -                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
> > -                                L1_CACHE_BYTES))
> >  #define GOOD_COPY_LEN	128
> > +#define RECEIVE_AVG_WEIGHT 64
> 
> Maybe we can make this a module parameter.

I'm not sure it's useful - no one is likely to tune it in practice.
But how about a comment explaining how the number was chosen?

> >  
> >  #define VIRTNET_DRIVER_VERSION "1.0.0"
> >  
> > @@ -79,6 +78,9 @@ struct receive_queue {
> >  	/* Chain pages by the private ptr. */
> >  	struct page *pages;
> >  
> > +	/* Average packet length for mergeable receive buffers. */
> > +	struct ewma mrg_avg_pkt_len;
> > +
> >  	/* Page frag for GFP_ATOMIC packet buffer allocation. */
> >  	struct page_frag atomic_frag;
> >  
> > @@ -302,14 +304,17 @@ static struct sk_buff *page_to_skb(struct receive_queue *rq,
> >  	return skb;
> >  }
> >  
> > -static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
> > +static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb,
> > +			     struct page *head_page)
> >  {
> >  	struct skb_vnet_hdr *hdr = skb_vnet_hdr(head_skb);
> >  	struct sk_buff *curr_skb = head_skb;
> > +	struct page *page = head_page;
> >  	char *buf;
> > -	struct page *page;
> > -	int num_buf, len, offset, truesize;
> > +	int num_buf, len, offset;
> > +	u32 est_buffer_len;
> >  
> > +	len = head_skb->len;
> >  	num_buf = hdr->mhdr.num_buffers;
> >  	while (--num_buf) {
> >  		int num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
> > @@ -320,7 +325,6 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
> >  			head_skb->dev->stats.rx_length_errors++;
> >  			return -EINVAL;
> >  		}
> > -		truesize = max_t(int, len, MERGE_BUFFER_LEN);
> >  		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
> >  			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
> >  			if (unlikely(!nskb)) {
> > @@ -338,20 +342,38 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
> >  		if (curr_skb != head_skb) {
> >  			head_skb->data_len += len;
> >  			head_skb->len += len;
> > -			head_skb->truesize += truesize;
> > +			head_skb->truesize += len;
> >  		}
> >  		page = virt_to_head_page(buf);
> >  		offset = buf - (char *)page_address(page);
> >  		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> >  			put_page(page);
> >  			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > -					     len, truesize);
> > +					     len, len);
> >  		} else {
> >  			skb_add_rx_frag(curr_skb, num_skb_frags, page,
> > -					offset, len, truesize);
> > +					offset, len, len);
> >  		}
> >  		--rq->num;
> >  	}
> > +	/* All frags before the last frag are fully used -- for those frags,
> > +	 * truesize = len. Use the size of the most recent buffer allocation
> > +	 * from the last frag's page to estimate the truesize of the last frag.
> > +	 * EWMA with a weight of 64 makes the size adjustments quite small in
> > +	 * the frags allocated on one page (even a order-3 one), and truesize
> > +	 * doesn't need to be 100% accurate.
> > +	 */
> > +	if (page) {
> > +		est_buffer_len = page_private(page);
> > +		if (est_buffer_len > len) {
> > +			u32 truesize_delta = est_buffer_len - len;
> > +
> > +			curr_skb->truesize += truesize_delta;
> > +			if (curr_skb != head_skb)
> > +				head_skb->truesize += truesize_delta;
> > +		}
> 
> Is there a chance that est_buffer_len was smaller than or equal to len?
> > +	}
> > +	ewma_add(&rq->mrg_avg_pkt_len, head_skb->len);
> >  	return 0;
> >  }
> >  
> > @@ -382,16 +404,21 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
> >  		skb_trim(skb, len);
> >  	} else if (vi->mergeable_rx_bufs) {
> >  		struct page *page = virt_to_head_page(buf);
> > -		int truesize = max_t(int, len, MERGE_BUFFER_LEN);
> > +		/* Use an initial truesize of 'len' bytes for page_to_skb --
> > +		 * receive_mergeable will fixup the truesize of the last page
> > +		 * frag if the packet is non-linear (> GOOD_COPY_LEN bytes).
> > +		 */
> >  		skb = page_to_skb(rq, page,
> >  				  (char *)buf - (char *)page_address(page),
> > -				  len, truesize);
> > +				  len, len);
> >  		if (unlikely(!skb)) {
> >  			dev->stats.rx_dropped++;
> >  			put_page(page);
> >  			return;
> >  		}
> > -		if (receive_mergeable(rq, skb)) {
> > +		if (!skb_is_nonlinear(skb))
> > +			page = NULL;
> > +		if (receive_mergeable(rq, skb, page)) {
> >  			dev_kfree_skb(skb);
> >  			return;
> >  		}
> > @@ -540,24 +567,29 @@ static int add_recvbuf_big(struct receive_queue *rq, gfp_t gfp)
> >  static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
> >  {
> >  	struct virtnet_info *vi = rq->vq->vdev->priv;
> > +	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> >  	struct page_frag *alloc_frag;
> >  	char *buf;
> > -	int err, len, hole;
> > +	int err, hole;
> > +	u32 buflen;
> >  
> > +	buflen = hdr_len + clamp_t(u32, ewma_read(&rq->mrg_avg_pkt_len),
> > +				   GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
> > +	buflen = ALIGN(buflen, L1_CACHE_BYTES);
> >  	alloc_frag = (gfp & __GFP_WAIT) ? &vi->sleep_frag : &rq->atomic_frag;
> > -	if (unlikely(!skb_page_frag_refill(MERGE_BUFFER_LEN, alloc_frag, gfp)))
> > +	if (unlikely(!skb_page_frag_refill(buflen, alloc_frag, gfp)))
> >  		return -ENOMEM;
> >  	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> >  	get_page(alloc_frag->page);
> > -	len = MERGE_BUFFER_LEN;
> > -	alloc_frag->offset += len;
> > +	alloc_frag->offset += buflen;
> > +	set_page_private(alloc_frag->page, buflen);
> 
> Not sure this is accurate, since buflen may change and several frags may
> share a single page. So the est_buffer_len we get in receive_mergeable()
> may not be the correct value.
> >  	hole = alloc_frag->size - alloc_frag->offset;
> > -	if (hole < MERGE_BUFFER_LEN) {
> > -		len += hole;
> > +	if (hole < buflen) {
> > +		buflen += hole;
> >  		alloc_frag->offset += hole;
> >  	}
> >  
> > -	sg_init_one(rq->sg, buf, len);
> > +	sg_init_one(rq->sg, buf, buflen);
> >  	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
> >  	if (err < 0)
> >  		put_page(virt_to_head_page(buf));
> > @@ -1475,6 +1507,7 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
> >  			       napi_weight);
> >  
> >  		sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
> > +		ewma_init(&vi->rq[i].mrg_avg_pkt_len, 1, RECEIVE_AVG_WEIGHT);
> >  		sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
> >  	}
> >  


* Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header
  2013-11-12 22:21 [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Michael Dalton
                   ` (6 preceding siblings ...)
  2013-11-13 17:39 ` Michael S. Tsirkin
@ 2013-11-13 17:43 ` Michael S. Tsirkin
  2013-11-14  7:38 ` David Miller
  8 siblings, 0 replies; 21+ messages in thread
From: Michael S. Tsirkin @ 2013-11-13 17:43 UTC (permalink / raw)
  To: Michael Dalton
  Cc: netdev, Daniel Borkmann, virtualization, Eric Dumazet, David S. Miller

On Tue, Nov 12, 2013 at 02:21:22PM -0800, Michael Dalton wrote:
> Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page
> frag allocators") changed the mergeable receive buffer size from PAGE_SIZE
> to MTU-size. However, the merge buffer size does not take into account the
> size of the virtio-net header. Consequently, packets that are MTU-size
> will take two buffers instead of one (to store the virtio-net header),
> substantially decreasing the throughput of MTU-size traffic due to TCP
> window / SKB truesize effects.
> 
> This commit changes the mergeable buffer size to include the virtio-net
> header. The buffer size is cacheline-aligned because skb_page_frag_refill
> will not automatically align the requested size.
> 
> Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
> vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
> cpuset, using cgroups to ensure that other processes in the system will not
> be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
> buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
> force MTU-sized packets on the receiver.
> 
> net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
> net-next trunk (MTU 1500 - packet uses two buf due to size bug): 4076.62Gb/s
> net-next trunk (MTU 1480 - packet fits in one buf): 6301.34Gb/s
> net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s
> 
> Suggested-by: Eric Northup <digitaleric@google.com>
> Signed-off-by: Michael Dalton <mwdalton@google.com>

Please note this is a bugfix - useful by itself.

> ---
>  drivers/net/virtio_net.c | 30 ++++++++++++++++--------------
>  1 file changed, 16 insertions(+), 14 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 01f4eb5..69fb225 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -36,7 +36,10 @@ module_param(csum, bool, 0444);
>  module_param(gso, bool, 0444);
>  
>  /* FIXME: MTU in config. */
> -#define MAX_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> +#define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> +#define MERGE_BUFFER_LEN (ALIGN(GOOD_PACKET_LEN + \
> +                                sizeof(struct virtio_net_hdr_mrg_rxbuf), \
> +                                L1_CACHE_BYTES))
>  #define GOOD_COPY_LEN	128
>  
>  #define VIRTNET_DRIVER_VERSION "1.0.0"
> @@ -314,10 +317,10 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
>  			head_skb->dev->stats.rx_length_errors++;
>  			return -EINVAL;
>  		}
> -		if (unlikely(len > MAX_PACKET_LEN)) {
> +		if (unlikely(len > MERGE_BUFFER_LEN)) {
>  			pr_debug("%s: rx error: merge buffer too long\n",
>  				 head_skb->dev->name);
> -			len = MAX_PACKET_LEN;
> +			len = MERGE_BUFFER_LEN;
>  		}
>  		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
>  			struct sk_buff *nskb = alloc_skb(0, GFP_ATOMIC);
> @@ -336,18 +339,17 @@ static int receive_mergeable(struct receive_queue *rq, struct sk_buff *head_skb)
>  		if (curr_skb != head_skb) {
>  			head_skb->data_len += len;
>  			head_skb->len += len;
> -			head_skb->truesize += MAX_PACKET_LEN;
> +			head_skb->truesize += MERGE_BUFFER_LEN;
>  		}
>  		page = virt_to_head_page(buf);
>  		offset = buf - (char *)page_address(page);
>  		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
>  			put_page(page);
>  			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> -					     len, MAX_PACKET_LEN);
> +					     len, MERGE_BUFFER_LEN);
>  		} else {
>  			skb_add_rx_frag(curr_skb, num_skb_frags, page,
> -					offset, len,
> -					MAX_PACKET_LEN);
> +					offset, len, MERGE_BUFFER_LEN);
>  		}
>  		--rq->num;
>  	}
> @@ -383,7 +385,7 @@ static void receive_buf(struct receive_queue *rq, void *buf, unsigned int len)
>  		struct page *page = virt_to_head_page(buf);
>  		skb = page_to_skb(rq, page,
>  				  (char *)buf - (char *)page_address(page),
> -				  len, MAX_PACKET_LEN);
> +				  len, MERGE_BUFFER_LEN);
>  		if (unlikely(!skb)) {
>  			dev->stats.rx_dropped++;
>  			put_page(page);
> @@ -471,11 +473,11 @@ static int add_recvbuf_small(struct receive_queue *rq, gfp_t gfp)
>  	struct skb_vnet_hdr *hdr;
>  	int err;
>  
> -	skb = __netdev_alloc_skb_ip_align(vi->dev, MAX_PACKET_LEN, gfp);
> +	skb = __netdev_alloc_skb_ip_align(vi->dev, GOOD_PACKET_LEN, gfp);
>  	if (unlikely(!skb))
>  		return -ENOMEM;
>  
> -	skb_put(skb, MAX_PACKET_LEN);
> +	skb_put(skb, GOOD_PACKET_LEN);
>  
>  	hdr = skb_vnet_hdr(skb);
>  	sg_set_buf(rq->sg, &hdr->hdr, sizeof hdr->hdr);
> @@ -542,20 +544,20 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>  	int err;
>  
>  	if (gfp & __GFP_WAIT) {
> -		if (skb_page_frag_refill(MAX_PACKET_LEN, &vi->alloc_frag,
> +		if (skb_page_frag_refill(MERGE_BUFFER_LEN, &vi->alloc_frag,
>  					 gfp)) {
>  			buf = (char *)page_address(vi->alloc_frag.page) +
>  			      vi->alloc_frag.offset;
>  			get_page(vi->alloc_frag.page);
> -			vi->alloc_frag.offset += MAX_PACKET_LEN;
> +			vi->alloc_frag.offset += MERGE_BUFFER_LEN;
>  		}
>  	} else {
> -		buf = netdev_alloc_frag(MAX_PACKET_LEN);
> +		buf = netdev_alloc_frag(MERGE_BUFFER_LEN);
>  	}
>  	if (!buf)
>  		return -ENOMEM;
>  
> -	sg_init_one(rq->sg, buf, MAX_PACKET_LEN);
> +	sg_init_one(rq->sg, buf, MERGE_BUFFER_LEN);
>  	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>  	if (err < 0)
>  		put_page(virt_to_head_page(buf));
> -- 
> 1.8.4.1


* Re: [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header
  2013-11-12 22:21 [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Michael Dalton
                   ` (7 preceding siblings ...)
  2013-11-13 17:43 ` Michael S. Tsirkin
@ 2013-11-14  7:38 ` David Miller
  8 siblings, 0 replies; 21+ messages in thread
From: David Miller @ 2013-11-14  7:38 UTC (permalink / raw)
  To: mwdalton; +Cc: mst, netdev, dborkman, virtualization, edumazet

From: Michael Dalton <mwdalton@google.com>
Date: Tue, 12 Nov 2013 14:21:22 -0800

> Commit 2613af0ed18a ("virtio_net: migrate mergeable rx buffers to page
> frag allocators") changed the mergeable receive buffer size from PAGE_SIZE
> to MTU-size. However, the merge buffer size does not take into account the
> size of the virtio-net header. Consequently, packets that are MTU-size
> will take two buffers instead of one (to store the virtio-net header),
> substantially decreasing the throughput of MTU-size traffic due to TCP
> window / SKB truesize effects.
> 
> This commit changes the mergeable buffer size to include the virtio-net
> header. The buffer size is cacheline-aligned because skb_page_frag_refill
> will not automatically align the requested size.
> 
> Benchmarks taken from an average of 5 netperf 30-second TCP_STREAM runs
> between two QEMU VMs on a single physical machine. Each VM has two VCPUs and
> vhost enabled. All VMs and vhost threads run in a single 4 CPU cgroup
> cpuset, using cgroups to ensure that other processes in the system will not
> be scheduled on the benchmark CPUs. Transmit offloads and mergeable receive
> buffers are enabled, but guest_tso4 / guest_csum are explicitly disabled to
> force MTU-sized packets on the receiver.
> 
> net-next trunk before 2613af0ed18a (PAGE_SIZE buf): 3861.08Gb/s
> net-next trunk (MTU 1500 - packet uses two buf due to size bug): 4076.62Gb/s
> net-next trunk (MTU 1480 - packet fits in one buf): 6301.34Gb/s
> net-next trunk w/ size fix (MTU 1500 - packet fits in one buf): 6445.44Gb/s
> 
> Suggested-by: Eric Northup <digitaleric@google.com>
> Signed-off-by: Michael Dalton <mwdalton@google.com>

Michael, please submit this separately for the 'net' tree as it is
a bug fix.

The rest of this series are optimizations and should be resubmitted
when the merge window closes and the 'net-next' tree opens back up.


* Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-13 17:42     ` Michael S. Tsirkin
@ 2013-11-16  9:06       ` Michael Dalton
  0 siblings, 0 replies; 21+ messages in thread
From: Michael Dalton @ 2013-11-16  9:06 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: netdev, Eric Dumazet, lf-virt, Daniel Borkmann, David S. Miller

Hi,

Apologies for the delay, I wanted to get answers together for all of the
open questions raised on this thread. The first patch in this patchset is
already merged, so after the merge window re-opens I'll send out new
patchsets covering the remaining 3 patches.

After reflecting on feedback from this thread, I think it makes sense to
separate out the per-receive queue page frag allocator patches from the
autotuning patch when the merge window re-opens. The per-receive queue
page frag allocator patches help deal with fragmentation (MERGE_BUFFER_LEN does
not evenly divide PAGE_SIZE), and provide benefits whether or not
auto-tuning is present. Auto-tuning can then be evaluated separately.

On Wed, 2013-11-13 at 15:10 +0800, Jason Wang wrote:
> There's one concern with EWMA. How well does it handle multiple streams,
> each with a different packet size? E.g. there may be two flows, one with
> 256 bytes per packet and the other with 64K. It looks like this can result
> in us allocating a PAGE_SIZE buffer for the 256-byte packets (which is bad
> since the payload/truesize ratio is low) or a 1500+ byte buffer for the 64K
> flow (which is OK since we can do coalescing).

If multiple streams of very different packet sizes are arriving on the
same receive queue, no single buffer size is ideal (e.g., large buffers
will cause small packets to take up too much memory, but small buffers
may reduce throughput somewhat for large packets). We don't know a
priori which packet will be delivered to a given receive queue packet
buffer, so any size we choose will not be optimal for all cases if we
have significant variance in packet sizes.

> Do you have perf numbers just without this patch? We need to know
> how much EWMA helps exactly.

Great point, I should have included that in my initial benchmarking. I ran
a benchmark in the same environment as my initial results, this time with
the first 3 patches in this patchset applied but without the autotuning
patch.  The average performance over 5 runs of 30-second netperf was
13760.85Gb/s.

> Is there a chance that est_buffer_len was smaller than or equal to len?

Yes, that is possible if the average packet length decreases.

> Not sure this is accurate, since buflen may change and several frags may
> share a single page. So the est_buffer_len we get in receive_mergeable()
> may not be the correct value.

I agree it may not be 100% accurate but we can choose a weight that will
cause the average packet size to change slowly. Even with an order-3 page
there will not be too many packet buffers allocated from a single page.
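
For a sense of scale, simple arithmetic (not driver output, and assuming an
order-3 page frag of 32 KiB with buffer sizes at the two ends of the clamp,
roughly 1536 and 4096 bytes):

/* Back-of-the-envelope: buffers carved from one order-3 page frag. */
#include <stdio.h>

int main(void)
{
	const unsigned int frag = 32 * 1024;	/* order-3 page frag */
	const unsigned int min_buf = 1536;	/* ~MTU + hdr, cacheline aligned */
	const unsigned int max_buf = 4096;	/* ~PAGE_SIZE on x86 */

	printf("one order-3 frag holds %u max-size or %u min-size buffers\n",
	       frag / max_buf, frag / min_buf);	/* 8 or 21 */
	return 0;
}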

On Wed, 2013-11-13 at 17:42 +0800, Michael S. Tsirkin wrote:
> I'm not sure it's useful - no one is likely to tune it in practice.
> But how about a comment explaining how was the number chosen?

That makes sense, I agree a comment is needed. The weight determines
how quickly we react to a change in packet size. As we attempt to fill
all free ring entries on refill (in try_fill_recv), I chose a large
weight so that a short burst of traffic with a different average packet
size will not substantially shift the packet buffer size for the entire
ring the next time try_fill_recv is called. I'll add a comment that
compares 64 to nearby values (32, 16).
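
As a rough, standalone way of comparing those weights (again assuming the
avg <- (avg * (w - 1) + sample) / w update; not measured from the driver), the
weight roughly sets how many packets it takes the average to move halfway toward
a new steady packet size:

/* Standalone sketch: EWMA "half-life" in packets for candidate weights. */
#include <stdio.h>
#include <math.h>

int main(void)
{
	const int weights[] = { 16, 32, 64 };
	int i;

	for (i = 0; i < 3; i++) {
		double w = weights[i];
		double half_life = log(0.5) / log((w - 1.0) / w);

		printf("weight %2d: ~%.0f packets to move halfway\n",
		       weights[i], half_life);	/* ~11, ~22, ~44 */
	}
	return 0;
}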

Best,

Mike


* Re: [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance
  2013-11-13  7:40     ` Eric Dumazet
@ 2013-11-20  2:06       ` Rusty Russell
  0 siblings, 0 replies; 21+ messages in thread
From: Rusty Russell @ 2013-11-20  2:06 UTC (permalink / raw)
  To: Eric Dumazet, Jason Wang
  Cc: Michael Dalton, Michael S. Tsirkin, netdev, Daniel Borkmann,
	virtualization, Eric Dumazet, David S. Miller

Eric Dumazet <eric.dumazet@gmail.com> writes:
> On Wed, 2013-11-13 at 15:10 +0800, Jason Wang wrote:
>
>> There's one concern with EWMA. How well does it handle multiple streams,
>> each with a different packet size? E.g. there may be two flows, one with
>> 256 bytes per packet and the other with 64K. It looks like this can result
>> in us allocating a PAGE_SIZE buffer for the 256-byte packets (which is bad
>> since the payload/truesize ratio is low) or a 1500+ byte buffer for the 64K
>> flow (which is OK since we can do coalescing).
>
> It's hard to predict the future ;)
>
> 256-byte frames consume 2.5 KB anyway on a traditional NIC.
> If it was a concern, we would have it already.
>
> If you receive a mix of big and small frames, there is no win.

Well, that's not quite true.  The device could optimistically look
through the queue a bit for a small buffer; it does not need to consume
in order.  We'd probably want a feature bit for this.

I look forward to your thoughts on what algorithm to use for mixing
different sizes, of course.

Meanwhile, I suspect your patch works well because of 4k pages.  80%
non-GSO packets won't drop the average len below 4k.  On PPC64 with 64k
pages, that's not true.

I wonder if "last used len" would work about as well; it really might if
we had a smarter device...

Cheers,
Rusty.


end of thread, other threads:[~2013-11-20  2:06 UTC | newest]

Thread overview: 21+ messages
2013-11-12 22:21 [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Michael Dalton
2013-11-12 22:21 ` [PATCH net-next 2/4] net: allow > 0 order atomic page alloc in skb_page_frag_refill Michael Dalton
2013-11-12 22:42   ` Eric Dumazet
2013-11-12 22:21 ` [PATCH net-next 3/4] virtio-net: use per-receive queue page frag alloc for mergeable bufs Michael Dalton
2013-11-12 22:43   ` Eric Dumazet
2013-11-12 22:21 ` [PATCH net-next 4/4] virtio-net: auto-tune mergeable rx buffer size for improved performance Michael Dalton
2013-11-12 22:21 ` Michael Dalton
2013-11-13  7:10   ` Jason Wang
2013-11-13  7:40     ` Eric Dumazet
2013-11-20  2:06       ` Rusty Russell
2013-11-13 17:42     ` Michael S. Tsirkin
2013-11-16  9:06       ` Michael Dalton
2013-11-13  8:47   ` Ronen Hod
2013-11-13 14:19     ` Eric Dumazet
2013-11-13 16:43       ` Ronen Hod
2013-11-13 17:18         ` Eric Dumazet
2013-11-12 22:41 ` [PATCH net-next 1/4] virtio-net: mergeable buffer size should include virtio-net header Eric Dumazet
2013-11-13  6:53 ` Jason Wang
2013-11-13 17:39 ` Michael S. Tsirkin
2013-11-13 17:43 ` Michael S. Tsirkin
2013-11-14  7:38 ` David Miller
