netdev.vger.kernel.org archive mirror
* [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
@ 2023-05-26  5:46 Liang Chen
  2023-05-26  5:46 ` [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance Liang Chen
                   ` (5 more replies)
  0 siblings, 6 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-26  5:46 UTC (permalink / raw)
  To: jasowang, mst
  Cc: virtualization, netdev, linux-kernel, xuanzhuo, kuba, edumazet,
	davem, pabeni, alexander.duyck, Liang Chen

"private" of buffer page is currently used for big mode to chain pages.
But in mergeable mode, that field of the page could mean something else,
e.g. when a page_pool page is used instead. So exclude mergeable mode to
avoid such a problem.
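
For context, a rough sketch of how big mode chains spare pages through
page->private (modeled on give_pages() in drivers/net/virtio_net.c;
simplified, not the exact driver code):

/* Big mode keeps a singly linked list of pages rooted at rq->pages,
 * using page->private as the "next" pointer. A page_pool page never
 * carries such a link, so following page->private there would
 * dereference unrelated data.
 */
static void example_give_pages(struct receive_queue *rq, struct page *page)
{
	struct page *end;

	/* Walk to the tail of the chain being returned. */
	for (end = page; end->private; end = (struct page *)end->private)
		;
	/* Splice the old list behind it and make "page" the new head. */
	end->private = (unsigned long)rq->pages;
	rq->pages = page;
}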

Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
---
 drivers/net/virtio_net.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 5a7f7a76b920..c5dca0d92e64 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 			return NULL;
 
 		page = (struct page *)page->private;
-		if (page)
+		if (!vi->mergeable_rx_bufs && page)
 			give_pages(rq, page);
 		goto ok;
 	}
-- 
2.31.1



* [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-26  5:46 [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Liang Chen
@ 2023-05-26  5:46 ` Liang Chen
  2023-05-26  6:50   ` Jason Wang
                     ` (4 more replies)
  2023-05-26  5:46 ` [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support Liang Chen
                   ` (4 subsequent siblings)
  5 siblings, 5 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-26  5:46 UTC (permalink / raw)
  To: jasowang, mst
  Cc: virtualization, netdev, linux-kernel, xuanzhuo, kuba, edumazet,
	davem, pabeni, alexander.duyck, Liang Chen

The implementation at the moment uses one page per packet in both the
normal and XDP path. In addition, introduce a module parameter to enable
or disable the use of page pool (disabled by default).

In single-core vm testing environments, it gives a modest performance gain
in the normal path.
  Upstream codebase: 47.5 Gbits/sec
  Upstream codebase + page_pool support: 50.2 Gbits/sec

In multi-core vm testing environments, the most significant performance
gain is observed in XDP cpumap:
  Upstream codebase: 1.38 Gbits/sec
  Upstream codebase + page_pool support: 9.74 Gbits/sec

With this foundation, we can further integrate page pool fragmentation and
DMA map/unmap support.
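
As a rough, self-contained sketch of the page_pool allocate/free pairing
this patch builds on (API from include/net/page_pool.h of this period;
the helper name, device pointer and pool size below are illustrative):

#include <net/page_pool.h>

static int example_page_pool_roundtrip(struct device *dev)
{
	struct page_pool_params params = {
		.pool_size = 256,          /* e.g. the rx ring size */
		.nid       = NUMA_NO_NODE,
		.dev       = dev,          /* device that will do DMA, if any */
	};
	struct page_pool *pool = page_pool_create(&params);
	struct page *page;

	if (IS_ERR(pool))
		return PTR_ERR(pool);

	/* Replaces alloc_page(GFP_ATOMIC) on the rx refill path. */
	page = page_pool_dev_alloc_pages(pool);
	if (page)
		/* Replaces put_page(); hands the page back to the pool. */
		page_pool_put_full_page(pool, page, true);

	page_pool_destroy(pool);
	return 0;
}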

Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
---
 drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
 1 file changed, 146 insertions(+), 42 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c5dca0d92e64..99c0ca0c1781 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
 module_param(gso, bool, 0444);
 module_param(napi_tx, bool, 0644);
 
+static bool page_pool_enabled;
+module_param(page_pool_enabled, bool, 0400);
+
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 #define GOOD_COPY_LEN	128
@@ -159,6 +162,9 @@ struct receive_queue {
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
+	/* Page pool */
+	struct page_pool *page_pool;
+
 	/* Average packet length for mergeable receive buffers. */
 	struct ewma_pkt_len mrg_avg_pkt_len;
 
@@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
 	return skb;
 }
 
+static void virtnet_put_page(struct receive_queue *rq, struct page *page)
+{
+	if (rq->page_pool)
+		page_pool_put_full_page(rq->page_pool, page, true);
+	else
+		put_page(page);
+}
+
 /* Called from bottom half context */
 static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 				   struct receive_queue *rq,
@@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	hdr = skb_vnet_hdr(skb);
 	memcpy(hdr, hdr_p, hdr_len);
 	if (page_to_free)
-		put_page(page_to_free);
+		virtnet_put_page(rq, page_to_free);
 
 	return skb;
 }
@@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
 	return ret;
 }
 
-static void put_xdp_frags(struct xdp_buff *xdp)
+static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
 {
 	struct skb_shared_info *shinfo;
 	struct page *xdp_page;
@@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
 		shinfo = xdp_get_shared_info_from_buff(xdp);
 		for (i = 0; i < shinfo->nr_frags; i++) {
 			xdp_page = skb_frag_page(&shinfo->frags[i]);
-			put_page(xdp_page);
+			virtnet_put_page(rq, xdp_page);
 		}
 	}
 }
@@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 	if (page_off + *len + tailroom > PAGE_SIZE)
 		return NULL;
 
-	page = alloc_page(GFP_ATOMIC);
+	if (rq->page_pool)
+		page = page_pool_dev_alloc_pages(rq->page_pool);
+	else
+		page = alloc_page(GFP_ATOMIC);
+
 	if (!page)
 		return NULL;
 
@@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 		 * is sending packet larger than the MTU.
 		 */
 		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
-			put_page(p);
+			virtnet_put_page(rq, p);
 			goto err_buf;
 		}
 
 		memcpy(page_address(page) + page_off,
 		       page_address(p) + off, buflen);
 		page_off += buflen;
-		put_page(p);
+		virtnet_put_page(rq, p);
 	}
 
 	/* Headroom does not contribute to packet length */
 	*len = page_off - VIRTIO_XDP_HEADROOM;
 	return page;
 err_buf:
-	__free_pages(page, 0);
+	if (rq->page_pool)
+		page_pool_put_full_page(rq->page_pool, page, true);
+	else
+		__free_pages(page, 0);
 	return NULL;
 }
 
@@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
 		}
 		stats->bytes += len;
 		page = virt_to_head_page(buf);
-		put_page(page);
+		virtnet_put_page(rq, page);
 	}
 }
 
@@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
 		cur_frag_size = truesize;
 		xdp_frags_truesz += cur_frag_size;
 		if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
-			put_page(page);
+			virtnet_put_page(rq, page);
 			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
 				 dev->name, len, (unsigned long)(truesize - room));
 			dev->stats.rx_length_errors++;
@@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
 	return 0;
 
 err:
-	put_xdp_frags(xdp);
+	put_xdp_frags(xdp, rq);
 	return -EINVAL;
 }
 
@@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 		if (*len + xdp_room > PAGE_SIZE)
 			return NULL;
 
-		xdp_page = alloc_page(GFP_ATOMIC);
+		if (rq->page_pool)
+			xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
+		else
+			xdp_page = alloc_page(GFP_ATOMIC);
 		if (!xdp_page)
 			return NULL;
 
@@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 
 	*frame_sz = PAGE_SIZE;
 
-	put_page(*page);
+	virtnet_put_page(rq, *page);
 
 	*page = xdp_page;
 
@@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
 		head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
 		if (unlikely(!head_skb))
 			break;
+		if (rq->page_pool)
+			skb_mark_for_recycle(head_skb);
 		return head_skb;
 
 	case XDP_TX:
@@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
 		break;
 	}
 
-	put_xdp_frags(&xdp);
+	put_xdp_frags(&xdp, rq);
 
 err_xdp:
-	put_page(page);
+	virtnet_put_page(rq, page);
 	mergeable_buf_free(rq, num_buf, dev, stats);
 
 	stats->xdp_drops++;
@@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
 	curr_skb = head_skb;
 
+	if (rq->page_pool)
+		skb_mark_for_recycle(curr_skb);
+
 	if (unlikely(!curr_skb))
 		goto err_skb;
 	while (--num_buf) {
@@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			curr_skb = nskb;
 			head_skb->truesize += nskb->truesize;
 			num_skb_frags = 0;
+			if (rq->page_pool)
+				skb_mark_for_recycle(curr_skb);
 		}
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
@@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 		}
 		offset = buf - page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
-			put_page(page);
+			virtnet_put_page(rq, page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
 					     len, truesize);
 		} else {
@@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	return head_skb;
 
 err_skb:
-	put_page(page);
+	virtnet_put_page(rq, page);
 	mergeable_buf_free(rq, num_buf, dev, stats);
 
 err_buf:
@@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
 	 * disabled GSO for XDP, it won't be a big issue.
 	 */
 	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
-	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
-		return -ENOMEM;
+	if (rq->page_pool) {
+		struct page *page;
 
-	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	buf += headroom; /* advance address leaving hole at front of pkt */
-	get_page(alloc_frag->page);
-	alloc_frag->offset += len + room;
-	hole = alloc_frag->size - alloc_frag->offset;
-	if (hole < len + room) {
-		/* To avoid internal fragmentation, if there is very likely not
-		 * enough space for another buffer, add the remaining space to
-		 * the current buffer.
-		 * XDP core assumes that frame_size of xdp_buff and the length
-		 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
-		 */
-		if (!headroom)
-			len += hole;
-		alloc_frag->offset += hole;
-	}
+		page = page_pool_dev_alloc_pages(rq->page_pool);
+		if (unlikely(!page))
+			return -ENOMEM;
+		buf = (char *)page_address(page);
+		buf += headroom; /* advance address leaving hole at front of pkt */
+	} else {
+		if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
+			return -ENOMEM;
 
+		buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+		buf += headroom; /* advance address leaving hole at front of pkt */
+		get_page(alloc_frag->page);
+		alloc_frag->offset += len + room;
+		hole = alloc_frag->size - alloc_frag->offset;
+		if (hole < len + room) {
+			/* To avoid internal fragmentation, if there is very likely not
+			 * enough space for another buffer, add the remaining space to
+			 * the current buffer.
+			 * XDP core assumes that frame_size of xdp_buff and the length
+			 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
+			 */
+			if (!headroom)
+				len += hole;
+			alloc_frag->offset += hole;
+		}
+	}
 	sg_init_one(rq->sg, buf, len);
 	ctx = mergeable_len_to_ctx(len + room, headroom);
 	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
 	if (err < 0)
-		put_page(virt_to_head_page(buf));
+		virtnet_put_page(rq, virt_to_head_page(buf));
 
 	return err;
 }
@@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
 	if (err < 0)
 		return err;
 
-	err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
-					 MEM_TYPE_PAGE_SHARED, NULL);
+	if (vi->rq[qp_index].page_pool)
+		err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
+						 MEM_TYPE_PAGE_POOL,
+						 vi->rq[qp_index].page_pool);
+	else
+		err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
+						 MEM_TYPE_PAGE_SHARED,
+						 NULL);
+
 	if (err < 0)
 		goto err_xdp_reg_mem_model;
 
@@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
 				ethtool_sprintf(&p, "tx_queue_%u_%s", i,
 						virtnet_sq_stats_desc[j].desc);
 		}
+		page_pool_ethtool_stats_get_strings(p);
 		break;
 	}
 }
@@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
 	switch (sset) {
 	case ETH_SS_STATS:
 		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
-					       VIRTNET_SQ_STATS_LEN);
+					       VIRTNET_SQ_STATS_LEN +
+						(page_pool_enabled && vi->mergeable_rx_bufs ?
+						 page_pool_ethtool_stats_get_count() : 0));
 	default:
 		return -EOPNOTSUPP;
 	}
 }
 
+static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
+{
+#ifdef CONFIG_PAGE_POOL_STATS
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct page_pool_stats pp_stats = {};
+	int i;
+
+	for (i = 0; i < vi->curr_queue_pairs; i++) {
+		if (!vi->rq[i].page_pool)
+			continue;
+		page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
+	}
+	page_pool_ethtool_stats_get(data, &pp_stats);
+#endif /* CONFIG_PAGE_POOL_STATS */
+}
+
 static void virtnet_get_ethtool_stats(struct net_device *dev,
 				      struct ethtool_stats *stats, u64 *data)
 {
@@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
 		} while (u64_stats_fetch_retry(&sq->stats.syncp, start));
 		idx += VIRTNET_SQ_STATS_LEN;
 	}
+
+	virtnet_get_page_pool_stats(dev, &data[idx]);
 }
 
 static void virtnet_get_channels(struct net_device *dev,
@@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		__netif_napi_del(&vi->rq[i].napi);
 		__netif_napi_del(&vi->sq[i].napi);
+		if (vi->rq[i].page_pool)
+			page_pool_destroy(vi->rq[i].page_pool);
 	}
 
 	/* We called __netif_napi_del(),
@@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
 	struct virtnet_info *vi = vq->vdev->priv;
 	int i = vq2rxq(vq);
 
-	if (vi->mergeable_rx_bufs)
-		put_page(virt_to_head_page(buf));
-	else if (vi->big_packets)
+	if (vi->mergeable_rx_bufs) {
+		if (vi->rq[i].page_pool) {
+			page_pool_put_full_page(vi->rq[i].page_pool,
+						virt_to_head_page(buf),
+						true);
+		} else {
+			put_page(virt_to_head_page(buf));
+		}
+	} else if (vi->big_packets) {
 		give_pages(&vi->rq[i], buf);
-	else
+	} else {
 		put_page(virt_to_head_page(buf));
+	}
 }
 
 static void free_unused_bufs(struct virtnet_info *vi)
@@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
 	virtnet_free_queues(vi);
 }
 
+static void virtnet_alloc_page_pool(struct receive_queue *rq)
+{
+	struct virtio_device *vdev = rq->vq->vdev;
+
+	struct page_pool_params pp_params = {
+		.order = 0,
+		.pool_size = rq->vq->num_max,
+		.nid = dev_to_node(vdev->dev.parent),
+		.dev = vdev->dev.parent,
+		.offset = 0,
+	};
+
+	rq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(rq->page_pool)) {
+		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
+			 PTR_ERR(rq->page_pool));
+		rq->page_pool = NULL;
+	}
+}
+
 /* How large should a single buffer be so a queue full of these can fit at
  * least one full packet?
  * Logic below assumes the mergeable buffer header is used.
@@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
 		vi->rq[i].vq = vqs[rxq2vq(i)];
 		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
 		vi->sq[i].vq = vqs[txq2vq(i)];
+
+		if (page_pool_enabled && vi->mergeable_rx_bufs)
+			virtnet_alloc_page_pool(&vi->rq[i]);
+		else
+			dev_warn(&vi->vdev->dev,
+				 "page pool only support mergeable mode\n");
+
 	}
 
 	/* run here: ret == 0. */
-- 
2.31.1



* [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support
  2023-05-26  5:46 [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Liang Chen
  2023-05-26  5:46 ` [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance Liang Chen
@ 2023-05-26  5:46 ` Liang Chen
  2023-05-26  8:29   ` Horatiu Vultur
                     ` (3 more replies)
  2023-05-26  5:46 ` [PATCH net-next 4/5] virtio_ring: Introduce DMA pre-handler Liang Chen
                   ` (3 subsequent siblings)
  5 siblings, 4 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-26  5:46 UTC (permalink / raw)
  To: jasowang, mst
  Cc: virtualization, netdev, linux-kernel, xuanzhuo, kuba, edumazet,
	davem, pabeni, alexander.duyck, Liang Chen

To further enhance performance, implement page pool fragmentation
support and introduce a module parameter to enable or disable it.

In single-core vm testing environments, there is an additional performance
gain observed in the normal path compared to the one-packet-per-page
approach.
  Upstream codebase: 47.5 Gbits/sec
  Upstream codebase with page pool: 50.2 Gbits/sec
  Upstream codebase with page pool fragmentation support: 52.3 Gbits/sec

There is also some performance gain for XDP cpumap.
  Upstream codebase: 1.38 Gbits/sec
  Upstream codebase with page pool: 9.74 Gbits/sec
  Upstream codebase with page pool fragmentation: 10.3 Gbits/sec
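
A short sketch of the fragment-mode allocation used when page_pool_frag
is set (page_pool_dev_alloc_frag()/PP_FLAG_PAGE_FRAG as provided by the
page_pool API of this period; the helper name is made up):

/* Requires a pool created with PP_FLAG_PAGE_FRAG (and usually order > 0,
 * e.g. SKB_FRAG_PAGE_ORDER) so that several rx buffers can share one
 * compound page instead of consuming a full page each.
 */
static void *example_alloc_frag(struct page_pool *pool, unsigned int size)
{
	unsigned int offset;
	struct page *page;

	page = page_pool_dev_alloc_frag(pool, &offset, size);
	if (!page)
		return NULL;

	/* The caller's buffer starts "offset" bytes into the shared page. */
	return page_address(page) + offset;
}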

Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
---
 drivers/net/virtio_net.c | 72 ++++++++++++++++++++++++++++++----------
 1 file changed, 55 insertions(+), 17 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 99c0ca0c1781..ac40b8c66c59 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -32,7 +32,9 @@ module_param(gso, bool, 0444);
 module_param(napi_tx, bool, 0644);
 
 static bool page_pool_enabled;
+static bool page_pool_frag;
 module_param(page_pool_enabled, bool, 0400);
+module_param(page_pool_frag, bool, 0400);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
@@ -909,23 +911,32 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 				       struct page *p,
 				       int offset,
 				       int page_off,
-				       unsigned int *len)
+				       unsigned int *len,
+					   unsigned int *pp_frag_offset)
 {
 	int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 	struct page *page;
+	unsigned int pp_frag_offset_val;
 
 	if (page_off + *len + tailroom > PAGE_SIZE)
 		return NULL;
 
 	if (rq->page_pool)
-		page = page_pool_dev_alloc_pages(rq->page_pool);
+		if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
+			page = page_pool_dev_alloc_frag(rq->page_pool, pp_frag_offset,
+							PAGE_SIZE);
+		else
+			page = page_pool_dev_alloc_pages(rq->page_pool);
 	else
 		page = alloc_page(GFP_ATOMIC);
 
 	if (!page)
 		return NULL;
 
-	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
+	pp_frag_offset_val = pp_frag_offset ? *pp_frag_offset : 0;
+
+	memcpy(page_address(page) + page_off + pp_frag_offset_val,
+	       page_address(p) + offset, *len);
 	page_off += *len;
 
 	while (--*num_buf) {
@@ -948,7 +959,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
 			goto err_buf;
 		}
 
-		memcpy(page_address(page) + page_off,
+		memcpy(page_address(page) + page_off + pp_frag_offset_val,
 		       page_address(p) + off, buflen);
 		page_off += buflen;
 		virtnet_put_page(rq, p);
@@ -1029,7 +1040,7 @@ static struct sk_buff *receive_small_xdp(struct net_device *dev,
 			SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 		xdp_page = xdp_linearize_page(rq, &num_buf, page,
 					      offset, header_offset,
-					      &tlen);
+					      &tlen, NULL);
 		if (!xdp_page)
 			goto err_xdp;
 
@@ -1323,6 +1334,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
 	struct page *xdp_page;
 	unsigned int xdp_room;
+	unsigned int page_frag_offset = 0;
 
 	/* Transient failure which in theory could occur if
 	 * in-flight packets from before XDP was enabled reach
@@ -1356,7 +1368,8 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 		xdp_page = xdp_linearize_page(rq, num_buf,
 					      *page, offset,
 					      VIRTIO_XDP_HEADROOM,
-					      len);
+					      len,
+						  &page_frag_offset);
 		if (!xdp_page)
 			return NULL;
 	} else {
@@ -1366,14 +1379,19 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 			return NULL;
 
 		if (rq->page_pool)
-			xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
+			if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
+				xdp_page = page_pool_dev_alloc_frag(rq->page_pool,
+								    &page_frag_offset, PAGE_SIZE);
+			else
+				xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
 		else
 			xdp_page = alloc_page(GFP_ATOMIC);
+
 		if (!xdp_page)
 			return NULL;
 
-		memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM,
-		       page_address(*page) + offset, *len);
+		memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM +
+				page_frag_offset, page_address(*page) + offset, *len);
 	}
 
 	*frame_sz = PAGE_SIZE;
@@ -1382,7 +1400,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
 
 	*page = xdp_page;
 
-	return page_address(*page) + VIRTIO_XDP_HEADROOM;
+	return page_address(*page) + VIRTIO_XDP_HEADROOM + page_frag_offset;
 }
 
 static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
@@ -1762,6 +1780,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
 	void *ctx;
 	int err;
 	unsigned int len, hole;
+	unsigned int pp_frag_offset;
 
 	/* Extra tailroom is needed to satisfy XDP's assumption. This
 	 * means rx frags coalescing won't work, but consider we've
@@ -1769,13 +1788,29 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
 	 */
 	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
 	if (rq->page_pool) {
-		struct page *page;
+		if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG) {
+			if (unlikely(!page_pool_dev_alloc_frag(rq->page_pool,
+							       &pp_frag_offset, len + room)))
+				return -ENOMEM;
+			buf = (char *)page_address(rq->page_pool->frag_page) +
+				pp_frag_offset;
+			buf += headroom; /* advance address leaving hole at front of pkt */
+			hole = (PAGE_SIZE << rq->page_pool->p.order)
+				- rq->page_pool->frag_offset;
+			if (hole < len + room) {
+				if (!headroom)
+					len += hole;
+				rq->page_pool->frag_offset += hole;
+			}
+		} else {
+			struct page *page;
 
-		page = page_pool_dev_alloc_pages(rq->page_pool);
-		if (unlikely(!page))
-			return -ENOMEM;
-		buf = (char *)page_address(page);
-		buf += headroom; /* advance address leaving hole at front of pkt */
+			page = page_pool_dev_alloc_pages(rq->page_pool);
+			if (unlikely(!page))
+				return -ENOMEM;
+			buf = (char *)page_address(page);
+			buf += headroom; /* advance address leaving hole at front of pkt */
+		}
 	} else {
 		if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
 			return -ENOMEM;
@@ -3800,13 +3835,16 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
 	struct virtio_device *vdev = rq->vq->vdev;
 
 	struct page_pool_params pp_params = {
-		.order = 0,
+		.order = page_pool_frag ? SKB_FRAG_PAGE_ORDER : 0,
 		.pool_size = rq->vq->num_max,
 		.nid = dev_to_node(vdev->dev.parent),
 		.dev = vdev->dev.parent,
 		.offset = 0,
 	};
 
+	if (page_pool_frag)
+		pp_params.flags |= PP_FLAG_PAGE_FRAG;
+
 	rq->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rq->page_pool)) {
 		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
-- 
2.31.1



* [PATCH net-next 4/5] virtio_ring: Introduce DMA pre-handler
  2023-05-26  5:46 [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Liang Chen
  2023-05-26  5:46 ` [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance Liang Chen
  2023-05-26  5:46 ` [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support Liang Chen
@ 2023-05-26  5:46 ` Liang Chen
  2023-05-26  6:57   ` Jason Wang
  2023-05-26  5:46 ` [PATCH net-next 5/5] virtio_net: Implement " Liang Chen
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 56+ messages in thread
From: Liang Chen @ 2023-05-26  5:46 UTC (permalink / raw)
  To: jasowang, mst
  Cc: virtualization, netdev, linux-kernel, xuanzhuo, kuba, edumazet,
	davem, pabeni, alexander.duyck, Liang Chen

Currently, DMA operations on virtio devices' data buffers are encapsulated
within the underlying virtqueue implementation. DMA map/unmap operations
are performed for each data buffer attached to/detached from the virtqueue,
which is transparent and invisible to the higher-level virtio device
drivers. This encapsulation makes it infeasible for device drivers to
introduce certain mechanisms, such as page pool, that require explicit
management of DMA map/unmap. Therefore, by inserting a pre-handler before
the generic DMA map/unmap operations, virtio device drivers get the
opportunity to participate in DMA operations.
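
From a driver's point of view, a minimal sketch of how the hook is meant
to be consumed (the callback names and bodies are placeholders; patch 5
supplies the real page_pool-backed implementation):

static dma_addr_t drv_pre_map_page(struct device *dev, struct page *page,
				   unsigned long offset, size_t size,
				   enum dma_data_direction dir,
				   unsigned long attrs)
{
	/* Return a DMA address the driver already holds, or 0 to let the
	 * virtqueue fall back to the generic dma_map_page() path.
	 */
	return 0;
}

static bool drv_pre_unmap_page(struct device *dev, dma_addr_t dma_handle,
			       size_t size, enum dma_data_direction dir,
			       unsigned long attrs)
{
	/* Return true if the unmap was handled here, false to fall back. */
	return false;
}

static struct virtqueue_pre_dma_ops drv_pre_dma_ops = {
	.map_page   = drv_pre_map_page,
	.unmap_page = drv_pre_unmap_page,
};

/* Registered once per virtqueue, e.g. at queue setup time:
 *	virtqueue_register_pre_dma_ops(vq, &drv_pre_dma_ops);
 */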

Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
---
 drivers/virtio/virtio_ring.c | 73 +++++++++++++++++++++++++++++++++---
 include/linux/virtio.h       | 18 +++++++++
 2 files changed, 85 insertions(+), 6 deletions(-)

diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c
index c5310eaf8b46..a99641260555 100644
--- a/drivers/virtio/virtio_ring.c
+++ b/drivers/virtio/virtio_ring.c
@@ -213,6 +213,9 @@ struct vring_virtqueue {
 	bool last_add_time_valid;
 	ktime_t last_add_time;
 #endif
+
+	/* DMA mapping Pre-handler for virtio device driver */
+	struct virtqueue_pre_dma_ops *pre_dma_ops;
 };
 
 static struct virtqueue *__vring_new_virtqueue(unsigned int index,
@@ -369,6 +372,19 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
 		return (dma_addr_t)sg_phys(sg);
 	}
 
+	/* Allow virtio drivers to perform customized mapping operation, and
+	 * fallback to the generic path if it fails to handle the mapping.
+	 */
+	if (vq->pre_dma_ops && vq->pre_dma_ops->map_page) {
+		dma_addr_t addr;
+
+		addr = vq->pre_dma_ops->map_page(vring_dma_dev(vq),
+				sg_page(sg), sg->offset, sg->length,
+				direction, 0);
+		if (addr)
+			return addr;
+	}
+
 	/*
 	 * We can't use dma_map_sg, because we don't use scatterlists in
 	 * the way it expects (we don't guarantee that the scatterlist
@@ -432,6 +448,15 @@ static void vring_unmap_one_split_indirect(const struct vring_virtqueue *vq,
 
 	flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
 
+	if (vq->pre_dma_ops && vq->pre_dma_ops->unmap_page) {
+		if (vq->pre_dma_ops->unmap_page(vring_dma_dev(vq),
+					virtio64_to_cpu(vq->vq.vdev, desc->addr),
+					virtio32_to_cpu(vq->vq.vdev, desc->len),
+					(flags & VRING_DESC_F_WRITE) ?
+					DMA_FROM_DEVICE : DMA_TO_DEVICE, 0))
+			return;
+	}
+
 	dma_unmap_page(vring_dma_dev(vq),
 		       virtio64_to_cpu(vq->vq.vdev, desc->addr),
 		       virtio32_to_cpu(vq->vq.vdev, desc->len),
@@ -456,14 +481,22 @@ static unsigned int vring_unmap_one_split(const struct vring_virtqueue *vq,
 				 extra[i].len,
 				 (flags & VRING_DESC_F_WRITE) ?
 				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
-	} else {
-		dma_unmap_page(vring_dma_dev(vq),
-			       extra[i].addr,
-			       extra[i].len,
-			       (flags & VRING_DESC_F_WRITE) ?
-			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+		goto out;
+	} else if (vq->pre_dma_ops && vq->pre_dma_ops->unmap_page) {
+		if (vq->pre_dma_ops->unmap_page(vring_dma_dev(vq),
+					extra[i].addr,
+					extra[i].len,
+					(flags & VRING_DESC_F_WRITE) ?
+					DMA_FROM_DEVICE : DMA_TO_DEVICE, 0))
+			goto out;
 	}
 
+	dma_unmap_page(vring_dma_dev(vq),
+			extra[i].addr,
+			extra[i].len,
+			(flags & VRING_DESC_F_WRITE) ?
+			DMA_FROM_DEVICE : DMA_TO_DEVICE);
+
 out:
 	return extra[i].next;
 }
@@ -1206,10 +1239,19 @@ static void vring_unmap_extra_packed(const struct vring_virtqueue *vq,
 				 (flags & VRING_DESC_F_WRITE) ?
 				 DMA_FROM_DEVICE : DMA_TO_DEVICE);
 	} else {
+		if (vq->pre_dma_ops && vq->pre_dma_ops->unmap_page) {
+			if (vq->pre_dma_ops->unmap_page(vring_dma_dev(vq),
+						extra->addr,
+						extra->len,
+						(flags & VRING_DESC_F_WRITE) ?
+						DMA_FROM_DEVICE : DMA_TO_DEVICE, 0))
+				return;
+		}
 		dma_unmap_page(vring_dma_dev(vq),
 			       extra->addr, extra->len,
 			       (flags & VRING_DESC_F_WRITE) ?
 			       DMA_FROM_DEVICE : DMA_TO_DEVICE);
+
 	}
 }
 
@@ -1223,6 +1265,15 @@ static void vring_unmap_desc_packed(const struct vring_virtqueue *vq,
 
 	flags = le16_to_cpu(desc->flags);
 
+	if (vq->pre_dma_ops && vq->pre_dma_ops->unmap_page) {
+		if (vq->pre_dma_ops->unmap_page(vring_dma_dev(vq),
+					le64_to_cpu(desc->addr),
+					le32_to_cpu(desc->len),
+					(flags & VRING_DESC_F_WRITE) ?
+					DMA_FROM_DEVICE : DMA_TO_DEVICE, 0))
+			return;
+	}
+
 	dma_unmap_page(vring_dma_dev(vq),
 		       le64_to_cpu(desc->addr),
 		       le32_to_cpu(desc->len),
@@ -2052,6 +2103,7 @@ static struct virtqueue *vring_create_virtqueue_packed(
 	vq->packed_ring = true;
 	vq->dma_dev = dma_dev;
 	vq->use_dma_api = vring_use_dma_api(vdev);
+	vq->pre_dma_ops = NULL;
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
@@ -2541,6 +2593,7 @@ static struct virtqueue *__vring_new_virtqueue(unsigned int index,
 #endif
 	vq->dma_dev = dma_dev;
 	vq->use_dma_api = vring_use_dma_api(vdev);
+	vq->pre_dma_ops = NULL;
 
 	vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) &&
 		!context;
@@ -2945,4 +2998,12 @@ const struct vring *virtqueue_get_vring(const struct virtqueue *vq)
 }
 EXPORT_SYMBOL_GPL(virtqueue_get_vring);
 
+/* The virtio device driver can register its own DMA map/unmap pre-handler. */
+void virtqueue_register_pre_dma_ops(struct virtqueue *vq,
+		struct virtqueue_pre_dma_ops *pre_dma_ops)
+{
+	to_vvq(vq)->pre_dma_ops = pre_dma_ops;
+}
+EXPORT_SYMBOL_GPL(virtqueue_register_pre_dma_ops);
+
 MODULE_LICENSE("GPL");
diff --git a/include/linux/virtio.h b/include/linux/virtio.h
index b93238db94e3..1d5755b5e03f 100644
--- a/include/linux/virtio.h
+++ b/include/linux/virtio.h
@@ -9,6 +9,7 @@
 #include <linux/device.h>
 #include <linux/mod_devicetable.h>
 #include <linux/gfp.h>
+#include <linux/dma-map-ops.h>
 
 /**
  * struct virtqueue - a queue to register buffers for sending or receiving.
@@ -203,4 +204,21 @@ void unregister_virtio_driver(struct virtio_driver *drv);
 #define module_virtio_driver(__virtio_driver) \
 	module_driver(__virtio_driver, register_virtio_driver, \
 			unregister_virtio_driver)
+/**
+ * struct virtqueue_pre_dma_ops - DMA pre-handler for virtio device driver
+ * @map_page: map a single page of memory for DMA
+ * @unmap_page: unmap a single page of memory for DMA
+ */
+struct virtqueue_pre_dma_ops {
+	dma_addr_t (*map_page)(struct device *dev, struct page *page,
+			unsigned long offset, size_t size,
+			enum dma_data_direction dir, unsigned long attrs);
+	bool (*unmap_page)(struct device *dev, dma_addr_t dma_handle,
+			size_t size, enum dma_data_direction dir,
+			unsigned long attrs);
+};
+
+void virtqueue_register_pre_dma_ops(struct virtqueue *vq,
+		struct virtqueue_pre_dma_ops *pre_dma_ops);
+
 #endif /* _LINUX_VIRTIO_H */
-- 
2.31.1



* [PATCH net-next 5/5] virtio_net: Implement DMA pre-handler
  2023-05-26  5:46 [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Liang Chen
                   ` (2 preceding siblings ...)
  2023-05-26  5:46 ` [PATCH net-next 4/5] virtio_ring: Introduce DMA pre-handler Liang Chen
@ 2023-05-26  5:46 ` Liang Chen
  2023-05-26  7:06   ` Jason Wang
  2023-05-26 17:34   ` kernel test robot
  2023-05-26  6:38 ` [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Jason Wang
  2023-05-28  6:16 ` Michael S. Tsirkin
  5 siblings, 2 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-26  5:46 UTC (permalink / raw)
  To: jasowang, mst
  Cc: virtualization, netdev, linux-kernel, xuanzhuo, kuba, edumazet,
	davem, pabeni, alexander.duyck, Liang Chen

Add a DMA pre-handler that uses the page pool to manage DMA mappings.
When an IOMMU is enabled, turning on the page_pool_dma_map module
parameter selects the page pool for DMA mapping management and gives a
significant reduction in the overhead caused by DMA mappings.

In testing environments with a single-core vm and a QEMU-emulated IOMMU,
significant performance improvements can be observed:
  Upstream codebase: 1.76 Gbits/sec
  Upstream codebase with page pool fragmentation support: 1.81 Gbits/sec
  Upstream codebase with page pool fragmentation and DMA support: 19.3
  Gbits/sec
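
A condensed sketch of the page_pool DMA handling the pre-handler relies
on (PP_FLAG_DMA_MAP, PP_FLAG_DMA_SYNC_DEV and page_pool_get_dma_addr()
from the page_pool API; pool size is illustrative and cleanup is elided):

static dma_addr_t example_pp_dma_addr(struct device *dma_dev)
{
	struct page_pool_params params = {
		.flags     = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV,
		.dma_dir   = DMA_FROM_DEVICE,
		.max_len   = PAGE_SIZE,   /* region kept device-synced on recycle */
		.dev       = dma_dev,     /* e.g. vdev->dev.parent */
		.pool_size = 256,
	};
	struct page_pool *pool = page_pool_create(&params);
	struct page *page;

	if (IS_ERR(pool))
		return 0;

	/* The pool maps the page once, when it first enters the pool ... */
	page = page_pool_dev_alloc_pages(pool);
	if (!page)
		return 0;

	/* ... so the driver reuses the cached mapping instead of paying for
	 * a dma_map_page()/dma_unmap_page() pair on every buffer.
	 */
	return page_pool_get_dma_addr(page);
}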

Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
---
 drivers/net/virtio_net.c | 55 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index ac40b8c66c59..73cc4f9fe4fa 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -22,6 +22,7 @@
 #include <net/route.h>
 #include <net/xdp.h>
 #include <net/net_failover.h>
+#include <linux/iommu.h>
 
 static int napi_weight = NAPI_POLL_WEIGHT;
 module_param(napi_weight, int, 0444);
@@ -33,8 +34,10 @@ module_param(napi_tx, bool, 0644);
 
 static bool page_pool_enabled;
 static bool page_pool_frag;
+static bool page_pool_dma_map;
 module_param(page_pool_enabled, bool, 0400);
 module_param(page_pool_frag, bool, 0400);
+module_param(page_pool_dma_map, bool, 0400);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
@@ -3830,6 +3833,49 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
 	virtnet_free_queues(vi);
 }
 
+static dma_addr_t virtnet_pp_dma_map_page(struct device *dev, struct page *page,
+					  unsigned long offset, size_t size,
+					  enum dma_data_direction dir, unsigned long attrs)
+{
+	struct page *head_page;
+
+	if (dir != DMA_FROM_DEVICE)
+		return 0;
+
+	head_page = compound_head(page);
+	return page_pool_get_dma_addr(head_page)
+		+ (page - head_page) * PAGE_SIZE
+		+ offset;
+}
+
+static bool virtnet_pp_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
+				      size_t size, enum dma_data_direction dir,
+				      unsigned long attrs)
+{
+	phys_addr_t phys;
+
+	/* Handle only the RX direction, and sync the DMA memory only if it's not
+	 * a DMA coherent architecture.
+	 */
+	if (dir != DMA_FROM_DEVICE)
+		return false;
+
+	if (dev_is_dma_coherent(dev))
+		return true;
+
+	phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
+	if (WARN_ON(!phys))
+		return false;
+
+	arch_sync_dma_for_cpu(phys, size, dir);
+	return true;
+}
+
+static struct virtqueue_pre_dma_ops virtnet_pp_pre_dma_ops = {
+	.map_page = virtnet_pp_dma_map_page,
+	.unmap_page = virtnet_pp_dma_unmap_page,
+};
+
 static void virtnet_alloc_page_pool(struct receive_queue *rq)
 {
 	struct virtio_device *vdev = rq->vq->vdev;
@@ -3845,6 +3891,15 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
 	if (page_pool_frag)
 		pp_params.flags |= PP_FLAG_PAGE_FRAG;
 
+	/* Consider using page pool DMA support only when DMA API is used. */
+	if (virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM) &&
+	    page_pool_dma_map) {
+		pp_params.flags |= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
+		pp_params.dma_dir = DMA_FROM_DEVICE;
+		pp_params.max_len = PAGE_SIZE << pp_params.order;
+		virtqueue_register_pre_dma_ops(rq->vq, &virtnet_pp_pre_dma_ops);
+	}
+
 	rq->page_pool = page_pool_create(&pp_params);
 	if (IS_ERR(rq->page_pool)) {
 		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
-- 
2.31.1



* Re: [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
  2023-05-26  5:46 [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Liang Chen
                   ` (3 preceding siblings ...)
  2023-05-26  5:46 ` [PATCH net-next 5/5] virtio_net: Implement " Liang Chen
@ 2023-05-26  6:38 ` Jason Wang
  2023-05-27 12:33   ` Liang Chen
  2023-05-28  6:29   ` Michael S. Tsirkin
  2023-05-28  6:16 ` Michael S. Tsirkin
  5 siblings, 2 replies; 56+ messages in thread
From: Jason Wang @ 2023-05-26  6:38 UTC (permalink / raw)
  To: Liang Chen
  Cc: mst, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
>
> "private" of buffer page is currently used for big mode to chain pages.
> But in mergeable mode, that field of the page could mean something else,
> e.g. when a page_pool page is used instead. So exclude mergeable mode to
> avoid such a problem.

If this issue happens only in the case of page_pool, it would be
better to squash it there.

Thanks

>
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> ---
>  drivers/net/virtio_net.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 5a7f7a76b920..c5dca0d92e64 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>                         return NULL;
>
>                 page = (struct page *)page->private;
> -               if (page)
> +               if (!vi->mergeable_rx_bufs && page)
>                         give_pages(rq, page);
>                 goto ok;
>         }
> --
> 2.31.1
>



* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-26  5:46 ` [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance Liang Chen
@ 2023-05-26  6:50   ` Jason Wang
  2023-05-27 12:35     ` Liang Chen
  2023-05-26 16:11   ` kernel test robot
                     ` (3 subsequent siblings)
  4 siblings, 1 reply; 56+ messages in thread
From: Jason Wang @ 2023-05-26  6:50 UTC (permalink / raw)
  To: Liang Chen
  Cc: mst, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
>
> The implementation at the moment uses one page per packet in both the
> normal and XDP path.

It's better to explain why we need a page pool and how it can help
performance.

> In addition, introduce a module parameter to enable
> or disable the use of page pool (disabled by default).

If page pool wins for most of the cases, any reason to disable it by default?

>
> In single-core vm testing environments, it gives a modest performance gain
> in the normal path.
>   Upstream codebase: 47.5 Gbits/sec
>   Upstream codebase + page_pool support: 50.2 Gbits/sec
>
> In multi-core vm testing environments, the most significant performance
> gain is observed in XDP cpumap:
>   Upstream codebase: 1.38 Gbits/sec
>   Upstream codebase + page_pool support: 9.74 Gbits/sec

Please show more details on the test, e.g. which kinds of tests have
you measured?

Btw, it would be better to measure PPS as well.

>
> With this foundation, we can further integrate page pool fragmentation and
> DMA map/unmap support.
>
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> ---
>  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------

I believe we should make virtio-net select CONFIG_PAGE_POOL or do
the ifdef tricks at least.
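
For illustration, one way the ifdef variant could look (a hypothetical
sketch, not code from the series; the Kconfig alternative would be a
"select PAGE_POOL" under the VIRTIO_NET entry in drivers/net/Kconfig):

#if IS_ENABLED(CONFIG_PAGE_POOL)
static void virtnet_alloc_page_pool(struct receive_queue *rq)
{
	/* page_pool_create() setup as in the posted patch */
}
#else
static void virtnet_alloc_page_pool(struct receive_queue *rq)
{
	/* Core page_pool support not built in: run without a pool. */
	rq->page_pool = NULL;
}
#endif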

>  1 file changed, 146 insertions(+), 42 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c5dca0d92e64..99c0ca0c1781 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
>  module_param(gso, bool, 0444);
>  module_param(napi_tx, bool, 0644);
>
> +static bool page_pool_enabled;
> +module_param(page_pool_enabled, bool, 0400);
> +
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>  #define GOOD_COPY_LEN  128
> @@ -159,6 +162,9 @@ struct receive_queue {
>         /* Chain pages by the private ptr. */
>         struct page *pages;
>
> +       /* Page pool */
> +       struct page_pool *page_pool;
> +
>         /* Average packet length for mergeable receive buffers. */
>         struct ewma_pkt_len mrg_avg_pkt_len;
>
> @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
>         return skb;
>  }
>
> +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> +{
> +       if (rq->page_pool)
> +               page_pool_put_full_page(rq->page_pool, page, true);
> +       else
> +               put_page(page);
> +}
> +
>  /* Called from bottom half context */
>  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>                                    struct receive_queue *rq,
> @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>         hdr = skb_vnet_hdr(skb);
>         memcpy(hdr, hdr_p, hdr_len);
>         if (page_to_free)
> -               put_page(page_to_free);
> +               virtnet_put_page(rq, page_to_free);
>
>         return skb;
>  }
> @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
>         return ret;
>  }
>
> -static void put_xdp_frags(struct xdp_buff *xdp)
> +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
>  {

rq could be fetched from xdp_rxq_info?

>         struct skb_shared_info *shinfo;
>         struct page *xdp_page;
> @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
>                 shinfo = xdp_get_shared_info_from_buff(xdp);
>                 for (i = 0; i < shinfo->nr_frags; i++) {
>                         xdp_page = skb_frag_page(&shinfo->frags[i]);
> -                       put_page(xdp_page);
> +                       virtnet_put_page(rq, xdp_page);
>                 }
>         }
>  }
> @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>         if (page_off + *len + tailroom > PAGE_SIZE)
>                 return NULL;
>
> -       page = alloc_page(GFP_ATOMIC);
> +       if (rq->page_pool)
> +               page = page_pool_dev_alloc_pages(rq->page_pool);
> +       else
> +               page = alloc_page(GFP_ATOMIC);
> +
>         if (!page)
>                 return NULL;
>
> @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>                  * is sending packet larger than the MTU.
>                  */
>                 if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> -                       put_page(p);
> +                       virtnet_put_page(rq, p);
>                         goto err_buf;
>                 }
>
>                 memcpy(page_address(page) + page_off,
>                        page_address(p) + off, buflen);
>                 page_off += buflen;
> -               put_page(p);
> +               virtnet_put_page(rq, p);
>         }
>
>         /* Headroom does not contribute to packet length */
>         *len = page_off - VIRTIO_XDP_HEADROOM;
>         return page;
>  err_buf:
> -       __free_pages(page, 0);
> +       if (rq->page_pool)
> +               page_pool_put_full_page(rq->page_pool, page, true);
> +       else
> +               __free_pages(page, 0);
>         return NULL;
>  }
>
> @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
>                 }
>                 stats->bytes += len;
>                 page = virt_to_head_page(buf);
> -               put_page(page);
> +               virtnet_put_page(rq, page);
>         }
>  }
>
> @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>                 cur_frag_size = truesize;
>                 xdp_frags_truesz += cur_frag_size;
>                 if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> -                       put_page(page);
> +                       virtnet_put_page(rq, page);
>                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
>                                  dev->name, len, (unsigned long)(truesize - room));
>                         dev->stats.rx_length_errors++;
> @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>         return 0;
>
>  err:
> -       put_xdp_frags(xdp);
> +       put_xdp_frags(xdp, rq);
>         return -EINVAL;
>  }
>
> @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>                 if (*len + xdp_room > PAGE_SIZE)
>                         return NULL;
>
> -               xdp_page = alloc_page(GFP_ATOMIC);
> +               if (rq->page_pool)
> +                       xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> +               else
> +                       xdp_page = alloc_page(GFP_ATOMIC);
>                 if (!xdp_page)
>                         return NULL;
>
> @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>
>         *frame_sz = PAGE_SIZE;
>
> -       put_page(*page);
> +       virtnet_put_page(rq, *page);
>
>         *page = xdp_page;
>
> @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
>                 head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
>                 if (unlikely(!head_skb))
>                         break;
> +               if (rq->page_pool)
> +                       skb_mark_for_recycle(head_skb);
>                 return head_skb;
>
>         case XDP_TX:
> @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
>                 break;
>         }
>
> -       put_xdp_frags(&xdp);
> +       put_xdp_frags(&xdp, rq);
>
>  err_xdp:
> -       put_page(page);
> +       virtnet_put_page(rq, page);
>         mergeable_buf_free(rq, num_buf, dev, stats);
>
>         stats->xdp_drops++;
> @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>         head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
>         curr_skb = head_skb;
>
> +       if (rq->page_pool)
> +               skb_mark_for_recycle(curr_skb);
> +
>         if (unlikely(!curr_skb))
>                 goto err_skb;
>         while (--num_buf) {
> @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>                         curr_skb = nskb;
>                         head_skb->truesize += nskb->truesize;
>                         num_skb_frags = 0;
> +                       if (rq->page_pool)
> +                               skb_mark_for_recycle(curr_skb);
>                 }
>                 if (curr_skb != head_skb) {
>                         head_skb->data_len += len;
> @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>                 }
>                 offset = buf - page_address(page);
>                 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> -                       put_page(page);
> +                       virtnet_put_page(rq, page);

I wonder why we can't do this during buffer allocation like other drivers?

>                         skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
>                                              len, truesize);
>                 } else {
> @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>         return head_skb;
>
>  err_skb:
> -       put_page(page);
> +       virtnet_put_page(rq, page);
>         mergeable_buf_free(rq, num_buf, dev, stats);
>
>  err_buf:
> @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>          * disabled GSO for XDP, it won't be a big issue.
>          */
>         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> -       if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> -               return -ENOMEM;
> +       if (rq->page_pool) {
> +               struct page *page;
>
> -       buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> -       buf += headroom; /* advance address leaving hole at front of pkt */
> -       get_page(alloc_frag->page);
> -       alloc_frag->offset += len + room;
> -       hole = alloc_frag->size - alloc_frag->offset;
> -       if (hole < len + room) {
> -               /* To avoid internal fragmentation, if there is very likely not
> -                * enough space for another buffer, add the remaining space to
> -                * the current buffer.
> -                * XDP core assumes that frame_size of xdp_buff and the length
> -                * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> -                */
> -               if (!headroom)
> -                       len += hole;
> -               alloc_frag->offset += hole;
> -       }
> +               page = page_pool_dev_alloc_pages(rq->page_pool);
> +               if (unlikely(!page))
> +                       return -ENOMEM;
> +               buf = (char *)page_address(page);
> +               buf += headroom; /* advance address leaving hole at front of pkt */
> +       } else {
> +               if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))

Why not simply use a helper like virtnet_page_frag_refill() and add
the page_pool allocation logic there? It helps to reduce the
changeset.
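
To make the suggestion concrete, a hypothetical shape such a helper
could take (name and signature are made up; it only folds the page_pool
branch and the existing frag-refill bookkeeping from this hunk into one
place):

static void *virtnet_rx_alloc_buf(struct receive_queue *rq,
				  unsigned int *len, unsigned int room,
				  unsigned int headroom, gfp_t gfp)
{
	struct page_frag *alloc_frag = &rq->alloc_frag;
	unsigned int hole;
	char *buf;

	if (rq->page_pool) {
		struct page *page = page_pool_dev_alloc_pages(rq->page_pool);

		if (unlikely(!page))
			return NULL;
		return (char *)page_address(page) + headroom;
	}

	if (unlikely(!skb_page_frag_refill(*len + room, alloc_frag, gfp)))
		return NULL;

	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
	get_page(alloc_frag->page);
	alloc_frag->offset += *len + room;
	hole = alloc_frag->size - alloc_frag->offset;
	if (hole < *len + room) {
		/* Same anti-fragmentation logic as in the posted code. */
		if (!headroom)
			*len += hole;
		alloc_frag->offset += hole;
	}
	return buf + headroom;
}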

> +                       return -ENOMEM;
>
> +               buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +               buf += headroom; /* advance address leaving hole at front of pkt */
> +               get_page(alloc_frag->page);
> +               alloc_frag->offset += len + room;
> +               hole = alloc_frag->size - alloc_frag->offset;
> +               if (hole < len + room) {
> +                       /* To avoid internal fragmentation, if there is very likely not
> +                        * enough space for another buffer, add the remaining space to
> +                        * the current buffer.
> +                        * XDP core assumes that frame_size of xdp_buff and the length
> +                        * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> +                        */
> +                       if (!headroom)
> +                               len += hole;
> +                       alloc_frag->offset += hole;
> +               }
> +       }
>         sg_init_one(rq->sg, buf, len);
>         ctx = mergeable_len_to_ctx(len + room, headroom);
>         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
>         if (err < 0)
> -               put_page(virt_to_head_page(buf));
> +               virtnet_put_page(rq, virt_to_head_page(buf));
>
>         return err;
>  }
> @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
>         if (err < 0)
>                 return err;
>
> -       err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> -                                        MEM_TYPE_PAGE_SHARED, NULL);
> +       if (vi->rq[qp_index].page_pool)
> +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> +                                                MEM_TYPE_PAGE_POOL,
> +                                                vi->rq[qp_index].page_pool);
> +       else
> +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> +                                                MEM_TYPE_PAGE_SHARED,
> +                                                NULL);
> +
>         if (err < 0)
>                 goto err_xdp_reg_mem_model;
>
> @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
>                                 ethtool_sprintf(&p, "tx_queue_%u_%s", i,
>                                                 virtnet_sq_stats_desc[j].desc);
>                 }
> +               page_pool_ethtool_stats_get_strings(p);
>                 break;
>         }
>  }
> @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
>         switch (sset) {
>         case ETH_SS_STATS:
>                 return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> -                                              VIRTNET_SQ_STATS_LEN);
> +                                              VIRTNET_SQ_STATS_LEN +
> +                                               (page_pool_enabled && vi->mergeable_rx_bufs ?
> +                                                page_pool_ethtool_stats_get_count() : 0));
>         default:
>                 return -EOPNOTSUPP;
>         }
>  }
>
> +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> +{
> +#ifdef CONFIG_PAGE_POOL_STATS
> +       struct virtnet_info *vi = netdev_priv(dev);
> +       struct page_pool_stats pp_stats = {};
> +       int i;
> +
> +       for (i = 0; i < vi->curr_queue_pairs; i++) {
> +               if (!vi->rq[i].page_pool)
> +                       continue;
> +               page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> +       }
> +       page_pool_ethtool_stats_get(data, &pp_stats);
> +#endif /* CONFIG_PAGE_POOL_STATS */
> +}
> +
>  static void virtnet_get_ethtool_stats(struct net_device *dev,
>                                       struct ethtool_stats *stats, u64 *data)
>  {
> @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
>                 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
>                 idx += VIRTNET_SQ_STATS_LEN;
>         }
> +
> +       virtnet_get_page_pool_stats(dev, &data[idx]);
>  }
>
>  static void virtnet_get_channels(struct net_device *dev,
> @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
>         for (i = 0; i < vi->max_queue_pairs; i++) {
>                 __netif_napi_del(&vi->rq[i].napi);
>                 __netif_napi_del(&vi->sq[i].napi);
> +               if (vi->rq[i].page_pool)
> +                       page_pool_destroy(vi->rq[i].page_pool);
>         }
>
>         /* We called __netif_napi_del(),
> @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
>         struct virtnet_info *vi = vq->vdev->priv;
>         int i = vq2rxq(vq);
>
> -       if (vi->mergeable_rx_bufs)
> -               put_page(virt_to_head_page(buf));
> -       else if (vi->big_packets)
> +       if (vi->mergeable_rx_bufs) {
> +               if (vi->rq[i].page_pool) {
> +                       page_pool_put_full_page(vi->rq[i].page_pool,
> +                                               virt_to_head_page(buf),
> +                                               true);
> +               } else {
> +                       put_page(virt_to_head_page(buf));
> +               }
> +       } else if (vi->big_packets) {
>                 give_pages(&vi->rq[i], buf);

Any reason only mergeable was modified but not small and big?

Thanks

> -       else
> +       } else {
>                 put_page(virt_to_head_page(buf));
> +       }
>  }
>
>  static void free_unused_bufs(struct virtnet_info *vi)
> @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
>         virtnet_free_queues(vi);
>  }
>
> +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> +{
> +       struct virtio_device *vdev = rq->vq->vdev;
> +
> +       struct page_pool_params pp_params = {
> +               .order = 0,
> +               .pool_size = rq->vq->num_max,
> +               .nid = dev_to_node(vdev->dev.parent),
> +               .dev = vdev->dev.parent,
> +               .offset = 0,
> +       };
> +
> +       rq->page_pool = page_pool_create(&pp_params);
> +       if (IS_ERR(rq->page_pool)) {
> +               dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> +                        PTR_ERR(rq->page_pool));
> +               rq->page_pool = NULL;
> +       }
> +}
> +
>  /* How large should a single buffer be so a queue full of these can fit at
>   * least one full packet?
>   * Logic below assumes the mergeable buffer header is used.
> @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
>                 vi->rq[i].vq = vqs[rxq2vq(i)];
>                 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
>                 vi->sq[i].vq = vqs[txq2vq(i)];
> +
> +               if (page_pool_enabled && vi->mergeable_rx_bufs)
> +                       virtnet_alloc_page_pool(&vi->rq[i]);
> +               else
> +                       dev_warn(&vi->vdev->dev,
> +                                "page pool only support mergeable mode\n");
> +
>         }
>
>         /* run here: ret == 0. */
> --
> 2.31.1
>



* Re: [PATCH net-next 4/5] virtio_ring: Introduce DMA pre-handler
  2023-05-26  5:46 ` [PATCH net-next 4/5] virtio_ring: Introduce DMA pre-handler Liang Chen
@ 2023-05-26  6:57   ` Jason Wang
  0 siblings, 0 replies; 56+ messages in thread
From: Jason Wang @ 2023-05-26  6:57 UTC (permalink / raw)
  To: Liang Chen
  Cc: mst, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 1:47 PM Liang Chen <liangchen.linux@gmail.com> wrote:
>
> Currently, DMA operations of virtio devices' data buffer are encapsulated
> within the underlying virtqueue implementation. DMA map/unmap operations
> are performed for each data buffer attached to/detached from the virtqueue,
> which is transparent and invisible to the higher-level virtio device
> drivers. This encapsulation makes it not viable for device drivers to
> introduce certain mechanisms, such as page pool, that require explicit
> management of DMA map/unmap. Therefore, by inserting a pre-handler before
> the generic DMA map/unmap operations, virtio device drivers have the
> opportunity to participate in DMA operations.
>
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>

So Xuan is doing AF_XDP for the virtio-net that allows the DMA to be
mapped at least by the virtio-net.

That looks like a way to allow virtio-net to map and unmap the DMA
buffers by itself, whereas this patch goes a different way and seems to
query the address from the virtio core.

Personally, I think having the virtio-net driver do the map and sync itself seems cleaner.
But we can see.

Thanks


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 5/5] virtio_net: Implement DMA pre-handler
  2023-05-26  5:46 ` [PATCH net-next 5/5] virtio_net: Implement " Liang Chen
@ 2023-05-26  7:06   ` Jason Wang
  2023-05-27 12:35     ` Liang Chen
  2023-05-26 17:34   ` kernel test robot
  1 sibling, 1 reply; 56+ messages in thread
From: Jason Wang @ 2023-05-26  7:06 UTC (permalink / raw)
  To: Liang Chen
  Cc: mst, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 1:47 PM Liang Chen <liangchen.linux@gmail.com> wrote:
>
> Adding a DMA pre-handler that utilizes page pool for managing DMA mappings.
> When IOMMU is enabled, turning on the page_pool_dma_map module parameter to
> select page pool for DMA mapping management gives a significant reduction
> in the overhead caused by DMA mappings.
>
> In testing environments with a single core vm and qemu emulated IOMMU,
> significant performance improvements can be observed:
>   Upstream codebase: 1.76 Gbits/sec
>   Upstream codebase with page pool fragmentation support: 1.81 Gbits/sec
>   Upstream codebase with page pool fragmentation and DMA support: 19.3
>   Gbits/sec
>
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> ---
>  drivers/net/virtio_net.c | 55 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 55 insertions(+)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index ac40b8c66c59..73cc4f9fe4fa 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -22,6 +22,7 @@
>  #include <net/route.h>
>  #include <net/xdp.h>
>  #include <net/net_failover.h>
> +#include <linux/iommu.h>
>
>  static int napi_weight = NAPI_POLL_WEIGHT;
>  module_param(napi_weight, int, 0444);
> @@ -33,8 +34,10 @@ module_param(napi_tx, bool, 0644);
>
>  static bool page_pool_enabled;
>  static bool page_pool_frag;
> +static bool page_pool_dma_map;
>  module_param(page_pool_enabled, bool, 0400);
>  module_param(page_pool_frag, bool, 0400);
> +module_param(page_pool_dma_map, bool, 0400);
>
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> @@ -3830,6 +3833,49 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
>         virtnet_free_queues(vi);
>  }
>
> +static dma_addr_t virtnet_pp_dma_map_page(struct device *dev, struct page *page,
> +                                         unsigned long offset, size_t size,
> +                                         enum dma_data_direction dir, unsigned long attrs)
> +{
> +       struct page *head_page;
> +
> +       if (dir != DMA_FROM_DEVICE)
> +               return 0;
> +
> +       head_page = compound_head(page);
> +       return page_pool_get_dma_addr(head_page)
> +               + (page - head_page) * PAGE_SIZE
> +               + offset;

So it's not really a map; it just queries the DMA address from the pool.

> +}
> +
> +static bool virtnet_pp_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
> +                                     size_t size, enum dma_data_direction dir,
> +                                     unsigned long attrs)
> +{
> +       phys_addr_t phys;
> +
> +       /* Handle only the RX direction, and sync the DMA memory only if it's not
> +        * a DMA coherent architecture.
> +        */
> +       if (dir != DMA_FROM_DEVICE)
> +               return false;
> +
> +       if (dev_is_dma_coherent(dev))
> +               return true;
> +
> +       phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);

This would be somewhat slow. If we tracked the mapping in the driver, it
would be much faster.

More could be seen here:

https://lists.linuxfoundation.org/pipermail/virtualization/2023-May/066778.html

Thanks

> +       if (WARN_ON(!phys))
> +               return false;
> +
> +       arch_sync_dma_for_cpu(phys, size, dir);
> +       return true;
> +}
> +
> +static struct virtqueue_pre_dma_ops virtnet_pp_pre_dma_ops = {
> +       .map_page = virtnet_pp_dma_map_page,
> +       .unmap_page = virtnet_pp_dma_unmap_page,
> +};
> +
>  static void virtnet_alloc_page_pool(struct receive_queue *rq)
>  {
>         struct virtio_device *vdev = rq->vq->vdev;
> @@ -3845,6 +3891,15 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
>         if (page_pool_frag)
>                 pp_params.flags |= PP_FLAG_PAGE_FRAG;
>
> +       /* Consider using page pool DMA support only when DMA API is used. */
> +       if (virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM) &&
> +           page_pool_dma_map) {
> +               pp_params.flags |= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> +               pp_params.dma_dir = DMA_FROM_DEVICE;
> +               pp_params.max_len = PAGE_SIZE << pp_params.order;
> +               virtqueue_register_pre_dma_ops(rq->vq, &virtnet_pp_pre_dma_ops);
> +       }
> +
>         rq->page_pool = page_pool_create(&pp_params);
>         if (IS_ERR(rq->page_pool)) {
>                 dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> --
> 2.31.1
>


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support
  2023-05-26  5:46 ` [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support Liang Chen
@ 2023-05-26  8:29   ` Horatiu Vultur
  2023-05-27 12:36     ` Liang Chen
  2023-05-26 17:44   ` kernel test robot
                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 56+ messages in thread
From: Horatiu Vultur @ 2023-05-26  8:29 UTC (permalink / raw)
  To: Liang Chen
  Cc: jasowang, mst, virtualization, netdev, linux-kernel, xuanzhuo,
	kuba, edumazet, davem, pabeni, alexander.duyck

The 05/26/2023 13:46, Liang Chen wrote:

Hi Liang,

> 
> To further enhance performance, implement page pool fragmentation
> support and introduce a module parameter to enable or disable it.
> 
> In single-core vm testing environments, there is an additional performance
> gain observed in the normal path compared to the one packet per page
> approach.
>   Upstream codebase: 47.5 Gbits/sec
>   Upstream codebase with page pool: 50.2 Gbits/sec
>   Upstream codebase with page pool fragmentation support: 52.3 Gbits/sec
> 
> There is also some performance gain for XDP cpumap.
>   Upstream codebase: 1.38 Gbits/sec
>   Upstream codebase with page pool: 9.74 Gbits/sec
>   Upstream codebase with page pool fragmentation: 10.3 Gbits/sec
> 
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> ---
>  drivers/net/virtio_net.c | 72 ++++++++++++++++++++++++++++++----------
>  1 file changed, 55 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 99c0ca0c1781..ac40b8c66c59 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -32,7 +32,9 @@ module_param(gso, bool, 0444);
>  module_param(napi_tx, bool, 0644);
> 
>  static bool page_pool_enabled;
> +static bool page_pool_frag;
>  module_param(page_pool_enabled, bool, 0400);
> +module_param(page_pool_frag, bool, 0400);
> 
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> @@ -909,23 +911,32 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>                                        struct page *p,
>                                        int offset,
>                                        int page_off,
> -                                      unsigned int *len)
> +                                      unsigned int *len,
> +                                          unsigned int *pp_frag_offset)

The 'unsigned int *pp_frag_offset' line seems to be misaligned.

>  {
>         int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>         struct page *page;
> +       unsigned int pp_frag_offset_val;

Please use reverse christmas tree notation here. The pp_frag_offset_val
needs to be declared before page;
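
For reference, reverse x-mas tree ordering simply sorts the local
declarations from the longest line to the shortest, so with the variables
in this hunk it would look roughly like:

        int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
        unsigned int pp_frag_offset_val;
        struct page *page;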

> 
>         if (page_off + *len + tailroom > PAGE_SIZE)
>                 return NULL;
> 
>         if (rq->page_pool)
> -               page = page_pool_dev_alloc_pages(rq->page_pool);
> +               if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
> +                       page = page_pool_dev_alloc_frag(rq->page_pool, pp_frag_offset,
> +                                                       PAGE_SIZE);

Don't you need to check whether pp_frag_offset is NULL? You call it once
with NULL.
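
A guard along these lines would cover the caller that passes NULL (just a
sketch reusing the names from this hunk, not a tested change):

        if (rq->page_pool) {
                if ((rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG) && pp_frag_offset)
                        page = page_pool_dev_alloc_frag(rq->page_pool,
                                                        pp_frag_offset,
                                                        PAGE_SIZE);
                else
                        page = page_pool_dev_alloc_pages(rq->page_pool);
        } else {
                page = alloc_page(GFP_ATOMIC);
        }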

> +               else
> +                       page = page_pool_dev_alloc_pages(rq->page_pool);
>         else
>                 page = alloc_page(GFP_ATOMIC);
> 
>         if (!page)
>                 return NULL;
> 
> -       memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
> +       pp_frag_offset_val = pp_frag_offset ? *pp_frag_offset : 0;
> +
> +       memcpy(page_address(page) + page_off + pp_frag_offset_val,
> +              page_address(p) + offset, *len);
>         page_off += *len;
> 
>         while (--*num_buf) {
> @@ -948,7 +959,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>                         goto err_buf;
>                 }
> 
> -               memcpy(page_address(page) + page_off,
> +               memcpy(page_address(page) + page_off + pp_frag_offset_val,
>                        page_address(p) + off, buflen);
>                 page_off += buflen;
>                 virtnet_put_page(rq, p);
> @@ -1029,7 +1040,7 @@ static struct sk_buff *receive_small_xdp(struct net_device *dev,
>                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>                 xdp_page = xdp_linearize_page(rq, &num_buf, page,
>                                               offset, header_offset,
> -                                             &tlen);
> +                                             &tlen, NULL);
>                 if (!xdp_page)
>                         goto err_xdp;
> 
> @@ -1323,6 +1334,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>         unsigned int headroom = mergeable_ctx_to_headroom(ctx);
>         struct page *xdp_page;
>         unsigned int xdp_room;
> +       unsigned int page_frag_offset = 0;

Please use reverse x-mas tree notation.

> 
>         /* Transient failure which in theory could occur if
>          * in-flight packets from before XDP was enabled reach
> @@ -1356,7 +1368,8 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>                 xdp_page = xdp_linearize_page(rq, num_buf,
>                                               *page, offset,
>                                               VIRTIO_XDP_HEADROOM,
> -                                             len);
> +                                             len,
> +                                                 &page_frag_offset);

There is also some misalignment here with regard to page_frag_offset.

>                 if (!xdp_page)
>                         return NULL;
>         } else {
> @@ -1366,14 +1379,19 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>                         return NULL;
> 
>                 if (rq->page_pool)
> -                       xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> +                       if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
> +                               xdp_page = page_pool_dev_alloc_frag(rq->page_pool,
> +                                                                   &page_frag_offset, PAGE_SIZE);
> +                       else
> +                               xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
>                 else
>                         xdp_page = alloc_page(GFP_ATOMIC);
> +
>                 if (!xdp_page)
>                         return NULL;
> 
> -               memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM,
> -                      page_address(*page) + offset, *len);
> +               memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM +
> +                               page_frag_offset, page_address(*page) + offset, *len);
>         }
> 
>         *frame_sz = PAGE_SIZE;
> @@ -1382,7 +1400,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> 
>         *page = xdp_page;
> 
> -       return page_address(*page) + VIRTIO_XDP_HEADROOM;
> +       return page_address(*page) + VIRTIO_XDP_HEADROOM + page_frag_offset;
>  }
> 
>  static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> @@ -1762,6 +1780,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>         void *ctx;
>         int err;
>         unsigned int len, hole;
> +       unsigned int pp_frag_offset;

Same here.

> 
>         /* Extra tailroom is needed to satisfy XDP's assumption. This
>          * means rx frags coalescing won't work, but consider we've
> @@ -1769,13 +1788,29 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>          */
>         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
>         if (rq->page_pool) {
> -               struct page *page;
> +               if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG) {
> +                       if (unlikely(!page_pool_dev_alloc_frag(rq->page_pool,
> +                                                              &pp_frag_offset, len + room)))
> +                               return -ENOMEM;
> +                       buf = (char *)page_address(rq->page_pool->frag_page) +
> +                               pp_frag_offset;
> +                       buf += headroom; /* advance address leaving hole at front of pkt */
> +                       hole = (PAGE_SIZE << rq->page_pool->p.order)
> +                               - rq->page_pool->frag_offset;
> +                       if (hole < len + room) {
> +                               if (!headroom)
> +                                       len += hole;
> +                               rq->page_pool->frag_offset += hole;
> +                       }
> +               } else {
> +                       struct page *page;
> 
> -               page = page_pool_dev_alloc_pages(rq->page_pool);
> -               if (unlikely(!page))
> -                       return -ENOMEM;
> -               buf = (char *)page_address(page);
> -               buf += headroom; /* advance address leaving hole at front of pkt */
> +                       page = page_pool_dev_alloc_pages(rq->page_pool);
> +                       if (unlikely(!page))
> +                               return -ENOMEM;
> +                       buf = (char *)page_address(page);
> +                       buf += headroom; /* advance address leaving hole at front of pkt */
> +               }
>         } else {
>                 if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
>                         return -ENOMEM;
> @@ -3800,13 +3835,16 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
>         struct virtio_device *vdev = rq->vq->vdev;
> 
>         struct page_pool_params pp_params = {
> -               .order = 0,
> +               .order = page_pool_frag ? SKB_FRAG_PAGE_ORDER : 0,
>                 .pool_size = rq->vq->num_max,
>                 .nid = dev_to_node(vdev->dev.parent),
>                 .dev = vdev->dev.parent,
>                 .offset = 0,
>         };
> 
> +       if (page_pool_frag)
> +               pp_params.flags |= PP_FLAG_PAGE_FRAG;
> +
>         rq->page_pool = page_pool_create(&pp_params);
>         if (IS_ERR(rq->page_pool)) {
>                 dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> --
> 2.31.1
> 
> 

-- 
/Horatiu

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-26  5:46 ` [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance Liang Chen
  2023-05-26  6:50   ` Jason Wang
@ 2023-05-26 16:11   ` kernel test robot
  2023-05-28  6:27     ` Michael S. Tsirkin
  2023-05-28  6:20   ` Michael S. Tsirkin
                     ` (2 subsequent siblings)
  4 siblings, 1 reply; 56+ messages in thread
From: kernel test robot @ 2023-05-26 16:11 UTC (permalink / raw)
  To: Liang Chen, jasowang, mst
  Cc: oe-kbuild-all, virtualization, netdev, linux-kernel, xuanzhuo,
	kuba, edumazet, davem, pabeni, alexander.duyck, Liang Chen

Hi Liang,

kernel test robot noticed the following build errors:

[auto build test ERROR on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
base:   net-next/main
patch link:    https://lore.kernel.org/r/20230526054621.18371-2-liangchen.linux%40gmail.com
patch subject: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20230526/202305262334.GiFQ3wpG-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/bfba563f43bba37181d8502cb2e566c32f96ec9e
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
        git checkout bfba563f43bba37181d8502cb2e566c32f96ec9e
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 olddefconfig
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305262334.GiFQ3wpG-lkp@intel.com/

All errors (new ones prefixed by >>):

   ld: vmlinux.o: in function `virtnet_find_vqs':
>> virtio_net.c:(.text+0x901fb5): undefined reference to `page_pool_create'
   ld: vmlinux.o: in function `add_recvbuf_mergeable.isra.0':
>> virtio_net.c:(.text+0x905618): undefined reference to `page_pool_alloc_pages'
   ld: vmlinux.o: in function `xdp_linearize_page':
   virtio_net.c:(.text+0x906b6b): undefined reference to `page_pool_alloc_pages'
   ld: vmlinux.o: in function `mergeable_xdp_get_buf.isra.0':
   virtio_net.c:(.text+0x90728f): undefined reference to `page_pool_alloc_pages'

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 5/5] virtio_net: Implement DMA pre-handler
  2023-05-26  5:46 ` [PATCH net-next 5/5] virtio_net: Implement " Liang Chen
  2023-05-26  7:06   ` Jason Wang
@ 2023-05-26 17:34   ` kernel test robot
  1 sibling, 0 replies; 56+ messages in thread
From: kernel test robot @ 2023-05-26 17:34 UTC (permalink / raw)
  To: Liang Chen, jasowang, mst
  Cc: oe-kbuild-all, virtualization, netdev, linux-kernel, xuanzhuo,
	kuba, edumazet, davem, pabeni, alexander.duyck, Liang Chen

Hi Liang,

kernel test robot noticed the following build errors:

[auto build test ERROR on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
base:   net-next/main
patch link:    https://lore.kernel.org/r/20230526054621.18371-5-liangchen.linux%40gmail.com
patch subject: [PATCH net-next 5/5] virtio_net: Implement DMA pre-handler
config: m68k-allmodconfig (https://download.01.org/0day-ci/archive/20230527/202305270110.TbNSDh0Z-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 12.1.0
reproduce (this is a W=1 build):
        mkdir -p ~/bin
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # https://github.com/intel-lab-lkp/linux/commit/e968bb5cacd30b672d0ccf705a24f1a792ff45aa
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
        git checkout e968bb5cacd30b672d0ccf705a24f1a792ff45aa
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 ~/bin/make.cross W=1 O=build_dir ARCH=m68k olddefconfig
        COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-12.1.0 ~/bin/make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305270110.TbNSDh0Z-lkp@intel.com/

All errors (new ones prefixed by >>, old ones prefixed by <<):

>> ERROR: modpost: "iommu_get_dma_domain" [drivers/net/virtio_net.ko] undefined!

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support
  2023-05-26  5:46 ` [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support Liang Chen
  2023-05-26  8:29   ` Horatiu Vultur
@ 2023-05-26 17:44   ` kernel test robot
  2023-05-28  6:25   ` Michael S. Tsirkin
  2023-05-29  1:33   ` Yunsheng Lin
  3 siblings, 0 replies; 56+ messages in thread
From: kernel test robot @ 2023-05-26 17:44 UTC (permalink / raw)
  To: Liang Chen, jasowang, mst
  Cc: oe-kbuild-all, virtualization, netdev, linux-kernel, xuanzhuo,
	kuba, edumazet, davem, pabeni, alexander.duyck, Liang Chen

Hi Liang,

kernel test robot noticed the following build errors:

[auto build test ERROR on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
base:   net-next/main
patch link:    https://lore.kernel.org/r/20230526054621.18371-3-liangchen.linux%40gmail.com
patch subject: [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support
config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20230527/202305270116.TJ31IjNL-lkp@intel.com/config)
compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
reproduce (this is a W=1 build):
        # https://github.com/intel-lab-lkp/linux/commit/dda0469e059354b61192e1d25b77c57351346282
        git remote add linux-review https://github.com/intel-lab-lkp/linux
        git fetch --no-tags linux-review Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
        git checkout dda0469e059354b61192e1d25b77c57351346282
        # save the config file
        mkdir build_dir && cp config build_dir/.config
        make W=1 O=build_dir ARCH=x86_64 olddefconfig
        make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash

If you fix the issue, kindly add following tag where applicable
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202305270116.TJ31IjNL-lkp@intel.com/

All errors (new ones prefixed by >>):

   ld: vmlinux.o: in function `virtnet_find_vqs':
   virtio_net.c:(.text+0x901fd2): undefined reference to `page_pool_create'
   ld: vmlinux.o: in function `add_recvbuf_mergeable.isra.0':
   virtio_net.c:(.text+0x905662): undefined reference to `page_pool_alloc_pages'
>> ld: virtio_net.c:(.text+0x905715): undefined reference to `page_pool_alloc_frag'
   ld: vmlinux.o: in function `xdp_linearize_page':
   virtio_net.c:(.text+0x906c50): undefined reference to `page_pool_alloc_pages'
   ld: virtio_net.c:(.text+0x906e33): undefined reference to `page_pool_alloc_frag'
   ld: vmlinux.o: in function `mergeable_xdp_get_buf.isra.0':
>> virtio_net.c:(.text+0x90740e): undefined reference to `page_pool_alloc_frag'
>> ld: virtio_net.c:(.text+0x90750b): undefined reference to `page_pool_alloc_pages'

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
  2023-05-26  6:38 ` [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Jason Wang
@ 2023-05-27 12:33   ` Liang Chen
  2023-05-28  6:29   ` Michael S. Tsirkin
  1 sibling, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-27 12:33 UTC (permalink / raw)
  To: Jason Wang
  Cc: mst, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 2:39 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> >
> > "private" of buffer page is currently used for big mode to chain pages.
> > But in mergeable mode, that offset of page could mean something else,
> > e.g. when page_pool page is used instead. So excluding mergeable mode to
> > avoid such a problem.
>
> If this issue happens only in the case of page_pool, it would be
> better to squash it there.
>
> Thanks

Sure, thanks!


>
> >
> > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > ---
> >  drivers/net/virtio_net.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 5a7f7a76b920..c5dca0d92e64 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >                         return NULL;
> >
> >                 page = (struct page *)page->private;
> > -               if (page)
> > +               if (!vi->mergeable_rx_bufs && page)
> >                         give_pages(rq, page);
> >                 goto ok;
> >         }
> > --
> > 2.31.1
> >
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-26  6:50   ` Jason Wang
@ 2023-05-27 12:35     ` Liang Chen
  2023-05-28  6:40       ` Michael S. Tsirkin
  0 siblings, 1 reply; 56+ messages in thread
From: Liang Chen @ 2023-05-27 12:35 UTC (permalink / raw)
  To: Jason Wang
  Cc: mst, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> >
> > The implementation at the moment uses one page per packet in both the
> > normal and XDP path.
>
> It's better to explain why we need a page pool and how it helps
> performance.
>

Sure, I will include that on v2.
> > In addition, introducing a module parameter to enable
> > or disable the usage of page pool (disabled by default).
>
> If page pool wins for most of the cases, any reason to disable it by default?
>

Thank you for raising the point. It does make sense to enable it by default.
> >
> > In single-core vm testing environments, it gives a modest performance gain
> > in the normal path.
> >   Upstream codebase: 47.5 Gbits/sec
> >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> >
> > In multi-core vm testing environments, The most significant performance
> > gain is observed in XDP cpumap:
> >   Upstream codebase: 1.38 Gbits/sec
> >   Upstream codebase + page_pool support: 9.74 Gbits/sec
>
> Please show more details on the test, e.g. which kinds of tests you
> measured.
>
> Btw, it would be better to measure PPS as well.
>

Sure. It will be added on v2.
> >
> > With this foundation, we can further integrate page pool fragmentation and
> > DMA map/unmap support.
> >
> > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > ---
> >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
>
> I believe we should make virtio-net select CONFIG_PAGE_POOL, or at
> least do the ifdef tricks.
>

Sure. It will be done on v2.
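
Something like this in drivers/net/Kconfig should be enough (sketch only;
the surrounding entry is abbreviated), and it would also take care of the
page_pool_create/page_pool_alloc_pages link errors the kernel test robot
reported against x86_64-defconfig:

        config VIRTIO_NET
                tristate "Virtio network driver"
                depends on VIRTIO
                select NET_FAILOVER
                select PAGE_POOL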
> >  1 file changed, 146 insertions(+), 42 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index c5dca0d92e64..99c0ca0c1781 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> >  module_param(gso, bool, 0444);
> >  module_param(napi_tx, bool, 0644);
> >
> > +static bool page_pool_enabled;
> > +module_param(page_pool_enabled, bool, 0400);
> > +
> >  /* FIXME: MTU in config. */
> >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> >  #define GOOD_COPY_LEN  128
> > @@ -159,6 +162,9 @@ struct receive_queue {
> >         /* Chain pages by the private ptr. */
> >         struct page *pages;
> >
> > +       /* Page pool */
> > +       struct page_pool *page_pool;
> > +
> >         /* Average packet length for mergeable receive buffers. */
> >         struct ewma_pkt_len mrg_avg_pkt_len;
> >
> > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> >         return skb;
> >  }
> >
> > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > +{
> > +       if (rq->page_pool)
> > +               page_pool_put_full_page(rq->page_pool, page, true);
> > +       else
> > +               put_page(page);
> > +}
> > +
> >  /* Called from bottom half context */
> >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >                                    struct receive_queue *rq,
> > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >         hdr = skb_vnet_hdr(skb);
> >         memcpy(hdr, hdr_p, hdr_len);
> >         if (page_to_free)
> > -               put_page(page_to_free);
> > +               virtnet_put_page(rq, page_to_free);
> >
> >         return skb;
> >  }
> > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> >         return ret;
> >  }
> >
> > -static void put_xdp_frags(struct xdp_buff *xdp)
> > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> >  {
>
> rq could be fetched from xdp_rxq_info?

Yeah, it has the queue_index there.
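
Something like this should work, since the buff already carries its rxq
info (untested sketch):

        static void put_xdp_frags(struct xdp_buff *xdp)
        {
                struct virtnet_info *vi = netdev_priv(xdp->rxq->dev);
                struct receive_queue *rq = &vi->rq[xdp->rxq->queue_index];
                struct skb_shared_info *shinfo;
                int i;

                if (!xdp_buff_has_frags(xdp))
                        return;

                shinfo = xdp_get_shared_info_from_buff(xdp);
                for (i = 0; i < shinfo->nr_frags; i++)
                        virtnet_put_page(rq, skb_frag_page(&shinfo->frags[i]));
        }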
>
> >         struct skb_shared_info *shinfo;
> >         struct page *xdp_page;
> > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> >                 shinfo = xdp_get_shared_info_from_buff(xdp);
> >                 for (i = 0; i < shinfo->nr_frags; i++) {
> >                         xdp_page = skb_frag_page(&shinfo->frags[i]);
> > -                       put_page(xdp_page);
> > +                       virtnet_put_page(rq, xdp_page);
> >                 }
> >         }
> >  }
> > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >         if (page_off + *len + tailroom > PAGE_SIZE)
> >                 return NULL;
> >
> > -       page = alloc_page(GFP_ATOMIC);
> > +       if (rq->page_pool)
> > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > +       else
> > +               page = alloc_page(GFP_ATOMIC);
> > +
> >         if (!page)
> >                 return NULL;
> >
> > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >                  * is sending packet larger than the MTU.
> >                  */
> >                 if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > -                       put_page(p);
> > +                       virtnet_put_page(rq, p);
> >                         goto err_buf;
> >                 }
> >
> >                 memcpy(page_address(page) + page_off,
> >                        page_address(p) + off, buflen);
> >                 page_off += buflen;
> > -               put_page(p);
> > +               virtnet_put_page(rq, p);
> >         }
> >
> >         /* Headroom does not contribute to packet length */
> >         *len = page_off - VIRTIO_XDP_HEADROOM;
> >         return page;
> >  err_buf:
> > -       __free_pages(page, 0);
> > +       if (rq->page_pool)
> > +               page_pool_put_full_page(rq->page_pool, page, true);
> > +       else
> > +               __free_pages(page, 0);
> >         return NULL;
> >  }
> >
> > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> >                 }
> >                 stats->bytes += len;
> >                 page = virt_to_head_page(buf);
> > -               put_page(page);
> > +               virtnet_put_page(rq, page);
> >         }
> >  }
> >
> > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> >                 cur_frag_size = truesize;
> >                 xdp_frags_truesz += cur_frag_size;
> >                 if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > -                       put_page(page);
> > +                       virtnet_put_page(rq, page);
> >                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> >                                  dev->name, len, (unsigned long)(truesize - room));
> >                         dev->stats.rx_length_errors++;
> > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> >         return 0;
> >
> >  err:
> > -       put_xdp_frags(xdp);
> > +       put_xdp_frags(xdp, rq);
> >         return -EINVAL;
> >  }
> >
> > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >                 if (*len + xdp_room > PAGE_SIZE)
> >                         return NULL;
> >
> > -               xdp_page = alloc_page(GFP_ATOMIC);
> > +               if (rq->page_pool)
> > +                       xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > +               else
> > +                       xdp_page = alloc_page(GFP_ATOMIC);
> >                 if (!xdp_page)
> >                         return NULL;
> >
> > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >
> >         *frame_sz = PAGE_SIZE;
> >
> > -       put_page(*page);
> > +       virtnet_put_page(rq, *page);
> >
> >         *page = xdp_page;
> >
> > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> >                 head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> >                 if (unlikely(!head_skb))
> >                         break;
> > +               if (rq->page_pool)
> > +                       skb_mark_for_recycle(head_skb);
> >                 return head_skb;
> >
> >         case XDP_TX:
> > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> >                 break;
> >         }
> >
> > -       put_xdp_frags(&xdp);
> > +       put_xdp_frags(&xdp, rq);
> >
> >  err_xdp:
> > -       put_page(page);
> > +       virtnet_put_page(rq, page);
> >         mergeable_buf_free(rq, num_buf, dev, stats);
> >
> >         stats->xdp_drops++;
> > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >         head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> >         curr_skb = head_skb;
> >
> > +       if (rq->page_pool)
> > +               skb_mark_for_recycle(curr_skb);
> > +
> >         if (unlikely(!curr_skb))
> >                 goto err_skb;
> >         while (--num_buf) {
> > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >                         curr_skb = nskb;
> >                         head_skb->truesize += nskb->truesize;
> >                         num_skb_frags = 0;
> > +                       if (rq->page_pool)
> > +                               skb_mark_for_recycle(curr_skb);
> >                 }
> >                 if (curr_skb != head_skb) {
> >                         head_skb->data_len += len;
> > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >                 }
> >                 offset = buf - page_address(page);
> >                 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > -                       put_page(page);
> > +                       virtnet_put_page(rq, page);
>
> I wonder why we can't do this during buffer allocation like other drivers?
>

Sorry, I don't quite understand the point here. Would you please
elaborate a bit more?
> >                         skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> >                                              len, truesize);
> >                 } else {
> > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >         return head_skb;
> >
> >  err_skb:
> > -       put_page(page);
> > +       virtnet_put_page(rq, page);
> >         mergeable_buf_free(rq, num_buf, dev, stats);
> >
> >  err_buf:
> > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >          * disabled GSO for XDP, it won't be a big issue.
> >          */
> >         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > -       if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > -               return -ENOMEM;
> > +       if (rq->page_pool) {
> > +               struct page *page;
> >
> > -       buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > -       buf += headroom; /* advance address leaving hole at front of pkt */
> > -       get_page(alloc_frag->page);
> > -       alloc_frag->offset += len + room;
> > -       hole = alloc_frag->size - alloc_frag->offset;
> > -       if (hole < len + room) {
> > -               /* To avoid internal fragmentation, if there is very likely not
> > -                * enough space for another buffer, add the remaining space to
> > -                * the current buffer.
> > -                * XDP core assumes that frame_size of xdp_buff and the length
> > -                * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > -                */
> > -               if (!headroom)
> > -                       len += hole;
> > -               alloc_frag->offset += hole;
> > -       }
> > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > +               if (unlikely(!page))
> > +                       return -ENOMEM;
> > +               buf = (char *)page_address(page);
> > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > +       } else {
> > +               if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
>
> Why not simply use a helper like virtnet_page_frag_refill() and add
> the page_pool allocation logic there? It helps to reduce the
> changeset.
>

Sure. Will do that on v2.
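
A rough sketch of what such a helper could look like (the name follows
the suggestion above and is purely illustrative; offset advancement and
the hole-coalescing logic of the page_frag path are left to the caller to
keep the sketch short):

        static void *virtnet_page_frag_refill(struct receive_queue *rq,
                                              unsigned int len, gfp_t gfp)
        {
                struct page_frag *alloc_frag = &rq->alloc_frag;
                struct page *page;

                if (rq->page_pool) {
                        page = page_pool_dev_alloc_pages(rq->page_pool);
                        return page ? page_address(page) : NULL;
                }

                if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
                        return NULL;

                get_page(alloc_frag->page);
                return (char *)page_address(alloc_frag->page) + alloc_frag->offset;
        }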
> > +                       return -ENOMEM;
> >
> > +               buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > +               get_page(alloc_frag->page);
> > +               alloc_frag->offset += len + room;
> > +               hole = alloc_frag->size - alloc_frag->offset;
> > +               if (hole < len + room) {
> > +                       /* To avoid internal fragmentation, if there is very likely not
> > +                        * enough space for another buffer, add the remaining space to
> > +                        * the current buffer.
> > +                        * XDP core assumes that frame_size of xdp_buff and the length
> > +                        * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > +                        */
> > +                       if (!headroom)
> > +                               len += hole;
> > +                       alloc_frag->offset += hole;
> > +               }
> > +       }
> >         sg_init_one(rq->sg, buf, len);
> >         ctx = mergeable_len_to_ctx(len + room, headroom);
> >         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> >         if (err < 0)
> > -               put_page(virt_to_head_page(buf));
> > +               virtnet_put_page(rq, virt_to_head_page(buf));
> >
> >         return err;
> >  }
> > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> >         if (err < 0)
> >                 return err;
> >
> > -       err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > -                                        MEM_TYPE_PAGE_SHARED, NULL);
> > +       if (vi->rq[qp_index].page_pool)
> > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > +                                                MEM_TYPE_PAGE_POOL,
> > +                                                vi->rq[qp_index].page_pool);
> > +       else
> > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > +                                                MEM_TYPE_PAGE_SHARED,
> > +                                                NULL);
> > +
> >         if (err < 0)
> >                 goto err_xdp_reg_mem_model;
> >
> > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> >                                 ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> >                                                 virtnet_sq_stats_desc[j].desc);
> >                 }
> > +               page_pool_ethtool_stats_get_strings(p);
> >                 break;
> >         }
> >  }
> > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> >         switch (sset) {
> >         case ETH_SS_STATS:
> >                 return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > -                                              VIRTNET_SQ_STATS_LEN);
> > +                                              VIRTNET_SQ_STATS_LEN +
> > +                                               (page_pool_enabled && vi->mergeable_rx_bufs ?
> > +                                                page_pool_ethtool_stats_get_count() : 0));
> >         default:
> >                 return -EOPNOTSUPP;
> >         }
> >  }
> >
> > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > +{
> > +#ifdef CONFIG_PAGE_POOL_STATS
> > +       struct virtnet_info *vi = netdev_priv(dev);
> > +       struct page_pool_stats pp_stats = {};
> > +       int i;
> > +
> > +       for (i = 0; i < vi->curr_queue_pairs; i++) {
> > +               if (!vi->rq[i].page_pool)
> > +                       continue;
> > +               page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > +       }
> > +       page_pool_ethtool_stats_get(data, &pp_stats);
> > +#endif /* CONFIG_PAGE_POOL_STATS */
> > +}
> > +
> >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> >                                       struct ethtool_stats *stats, u64 *data)
> >  {
> > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> >                 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> >                 idx += VIRTNET_SQ_STATS_LEN;
> >         }
> > +
> > +       virtnet_get_page_pool_stats(dev, &data[idx]);
> >  }
> >
> >  static void virtnet_get_channels(struct net_device *dev,
> > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> >         for (i = 0; i < vi->max_queue_pairs; i++) {
> >                 __netif_napi_del(&vi->rq[i].napi);
> >                 __netif_napi_del(&vi->sq[i].napi);
> > +               if (vi->rq[i].page_pool)
> > +                       page_pool_destroy(vi->rq[i].page_pool);
> >         }
> >
> >         /* We called __netif_napi_del(),
> > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> >         struct virtnet_info *vi = vq->vdev->priv;
> >         int i = vq2rxq(vq);
> >
> > -       if (vi->mergeable_rx_bufs)
> > -               put_page(virt_to_head_page(buf));
> > -       else if (vi->big_packets)
> > +       if (vi->mergeable_rx_bufs) {
> > +               if (vi->rq[i].page_pool) {
> > +                       page_pool_put_full_page(vi->rq[i].page_pool,
> > +                                               virt_to_head_page(buf),
> > +                                               true);
> > +               } else {
> > +                       put_page(virt_to_head_page(buf));
> > +               }
> > +       } else if (vi->big_packets) {
> >                 give_pages(&vi->rq[i], buf);
>
> Any reason only mergeable mode was modified, but not small and big?
>
> Thanks
>

Big mode uses the page chain to recycle pages, hence the use of the
buffer page's "private" field. I will take a further look to see whether
it is better to use page pool in those cases as well. Thanks!
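
For reference, the big-mode chaining in question looks roughly like this
in the current driver (approximate reproduction of give_pages()); the
chain lives in page->private, which is why a page_pool page must not be
fed through this path:

        static void give_pages(struct receive_queue *rq, struct page *page)
        {
                struct page *end;

                /* Find end of list, sew whole thing into rq->pages. */
                for (end = page; end->private; end = (struct page *)end->private);
                end->private = (unsigned long)rq->pages;
                rq->pages = page;
        }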



> > -       else
> > +       } else {
> >                 put_page(virt_to_head_page(buf));
> > +       }
> >  }
> >
> >  static void free_unused_bufs(struct virtnet_info *vi)
> > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> >         virtnet_free_queues(vi);
> >  }
> >
> > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > +{
> > +       struct virtio_device *vdev = rq->vq->vdev;
> > +
> > +       struct page_pool_params pp_params = {
> > +               .order = 0,
> > +               .pool_size = rq->vq->num_max,
> > +               .nid = dev_to_node(vdev->dev.parent),
> > +               .dev = vdev->dev.parent,
> > +               .offset = 0,
> > +       };
> > +
> > +       rq->page_pool = page_pool_create(&pp_params);
> > +       if (IS_ERR(rq->page_pool)) {
> > +               dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > +                        PTR_ERR(rq->page_pool));
> > +               rq->page_pool = NULL;
> > +       }
> > +}
> > +
> >  /* How large should a single buffer be so a queue full of these can fit at
> >   * least one full packet?
> >   * Logic below assumes the mergeable buffer header is used.
> > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> >                 vi->rq[i].vq = vqs[rxq2vq(i)];
> >                 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> >                 vi->sq[i].vq = vqs[txq2vq(i)];
> > +
> > +               if (page_pool_enabled && vi->mergeable_rx_bufs)
> > +                       virtnet_alloc_page_pool(&vi->rq[i]);
> > +               else
> > +                       dev_warn(&vi->vdev->dev,
> > +                                "page pool only support mergeable mode\n");
> > +
> >         }
> >
> >         /* run here: ret == 0. */
> > --
> > 2.31.1
> >
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 5/5] virtio_net: Implement DMA pre-handler
  2023-05-26  7:06   ` Jason Wang
@ 2023-05-27 12:35     ` Liang Chen
  0 siblings, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-27 12:35 UTC (permalink / raw)
  To: Jason Wang
  Cc: mst, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 3:06 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Fri, May 26, 2023 at 1:47 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> >
> > Adding a DMA pre-handler that utilizes page pool for managing DMA mappings.
> > When IOMMU is enabled, turning on the page_pool_dma_map module parameter to
> > select page pool for DMA mapping management gives a significant reduction
> > in the overhead caused by DMA mappings.
> >
> > In testing environments with a single core vm and qemu emulated IOMMU,
> > significant performance improvements can be observed:
> >   Upstream codebase: 1.76 Gbits/sec
> >   Upstream codebase with page pool fragmentation support: 1.81 Gbits/sec
> >   Upstream codebase with page pool fragmentation and DMA support: 19.3
> >   Gbits/sec
> >
> > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > ---
> >  drivers/net/virtio_net.c | 55 ++++++++++++++++++++++++++++++++++++++++
> >  1 file changed, 55 insertions(+)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index ac40b8c66c59..73cc4f9fe4fa 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -22,6 +22,7 @@
> >  #include <net/route.h>
> >  #include <net/xdp.h>
> >  #include <net/net_failover.h>
> > +#include <linux/iommu.h>
> >
> >  static int napi_weight = NAPI_POLL_WEIGHT;
> >  module_param(napi_weight, int, 0444);
> > @@ -33,8 +34,10 @@ module_param(napi_tx, bool, 0644);
> >
> >  static bool page_pool_enabled;
> >  static bool page_pool_frag;
> > +static bool page_pool_dma_map;
> >  module_param(page_pool_enabled, bool, 0400);
> >  module_param(page_pool_frag, bool, 0400);
> > +module_param(page_pool_dma_map, bool, 0400);
> >
> >  /* FIXME: MTU in config. */
> >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > @@ -3830,6 +3833,49 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> >         virtnet_free_queues(vi);
> >  }
> >
> > +static dma_addr_t virtnet_pp_dma_map_page(struct device *dev, struct page *page,
> > +                                         unsigned long offset, size_t size,
> > +                                         enum dma_data_direction dir, unsigned long attrs)
> > +{
> > +       struct page *head_page;
> > +
> > +       if (dir != DMA_FROM_DEVICE)
> > +               return 0;
> > +
> > +       head_page = compound_head(page);
> > +       return page_pool_get_dma_addr(head_page)
> > +               + (page - head_page) * PAGE_SIZE
> > +               + offset;
>
> So it's not really a map; it just queries the DMA address from the pool.
>
> > +}
> > +
> > +static bool virtnet_pp_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
> > +                                     size_t size, enum dma_data_direction dir,
> > +                                     unsigned long attrs)
> > +{
> > +       phys_addr_t phys;
> > +
> > +       /* Handle only the RX direction, and sync the DMA memory only if it's not
> > +        * a DMA coherent architecture.
> > +        */
> > +       if (dir != DMA_FROM_DEVICE)
> > +               return false;
> > +
> > +       if (dev_is_dma_coherent(dev))
> > +               return true;
> > +
> > +       phys = iommu_iova_to_phys(iommu_get_dma_domain(dev), dma_handle);
>
> This would be somewhat slow. If we tracked the mapping in the driver,
> it would be much faster.
>
> More could be seen here:
>
> https://lists.linuxfoundation.org/pipermail/virtualization/2023-May/066778.html
>
> Thanks
>

Thanks for the information. I agree with your suggestion, and I will
drop the last two patches on v2 and wait for Xuan's patch to land for
dma mapping management.




> > +       if (WARN_ON(!phys))
> > +               return false;
> > +
> > +       arch_sync_dma_for_cpu(phys, size, dir);
> > +       return true;
> > +}
> > +
> > +static struct virtqueue_pre_dma_ops virtnet_pp_pre_dma_ops = {
> > +       .map_page = virtnet_pp_dma_map_page,
> > +       .unmap_page = virtnet_pp_dma_unmap_page,
> > +};
> > +
> >  static void virtnet_alloc_page_pool(struct receive_queue *rq)
> >  {
> >         struct virtio_device *vdev = rq->vq->vdev;
> > @@ -3845,6 +3891,15 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
> >         if (page_pool_frag)
> >                 pp_params.flags |= PP_FLAG_PAGE_FRAG;
> >
> > +       /* Consider using page pool DMA support only when DMA API is used. */
> > +       if (virtio_has_feature(vdev, VIRTIO_F_ACCESS_PLATFORM) &&
> > +           page_pool_dma_map) {
> > +               pp_params.flags |= PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV;
> > +               pp_params.dma_dir = DMA_FROM_DEVICE;
> > +               pp_params.max_len = PAGE_SIZE << pp_params.order;
> > +               virtqueue_register_pre_dma_ops(rq->vq, &virtnet_pp_pre_dma_ops);
> > +       }
> > +
> >         rq->page_pool = page_pool_create(&pp_params);
> >         if (IS_ERR(rq->page_pool)) {
> >                 dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > --
> > 2.31.1
> >
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support
  2023-05-26  8:29   ` Horatiu Vultur
@ 2023-05-27 12:36     ` Liang Chen
  0 siblings, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-27 12:36 UTC (permalink / raw)
  To: Horatiu Vultur
  Cc: jasowang, mst, virtualization, netdev, linux-kernel, xuanzhuo,
	kuba, edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 4:29 PM Horatiu Vultur
<horatiu.vultur@microchip.com> wrote:
>
> The 05/26/2023 13:46, Liang Chen wrote:
>
> Hi Liang,
>
> >
> > To further enhance performance, implement page pool fragmentation
> > support and introduce a module parameter to enable or disable it.
> >
> > In single-core vm testing environments, there is an additional performance
> > gain observed in the normal path compared to the one packet per page
> > approach.
> >   Upstream codebase: 47.5 Gbits/sec
> >   Upstream codebase with page pool: 50.2 Gbits/sec
> >   Upstream codebase with page pool fragmentation support: 52.3 Gbits/sec
> >
> > There is also some performance gain for XDP cpumap.
> >   Upstream codebase: 1.38 Gbits/sec
> >   Upstream codebase with page pool: 9.74 Gbits/sec
> >   Upstream codebase with page pool fragmentation: 10.3 Gbits/sec
> >
> > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > ---
> >  drivers/net/virtio_net.c | 72 ++++++++++++++++++++++++++++++----------
> >  1 file changed, 55 insertions(+), 17 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 99c0ca0c1781..ac40b8c66c59 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -32,7 +32,9 @@ module_param(gso, bool, 0444);
> >  module_param(napi_tx, bool, 0644);
> >
> >  static bool page_pool_enabled;
> > +static bool page_pool_frag;
> >  module_param(page_pool_enabled, bool, 0400);
> > +module_param(page_pool_frag, bool, 0400);
> >
> >  /* FIXME: MTU in config. */
> >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > @@ -909,23 +911,32 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >                                        struct page *p,
> >                                        int offset,
> >                                        int page_off,
> > -                                      unsigned int *len)
> > +                                      unsigned int *len,
> > +                                          unsigned int *pp_frag_offset)
>
> The 'unsigned int *pp_frag_offset' line seems to be misaligned.
>

Sure, Thanks!
> >  {
> >         int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> >         struct page *page;
> > +       unsigned int pp_frag_offset_val;
>
> Please use reverse christmas tree notation here. The pp_frag_offset_val
> needs to be declared before page;
>

Sure. Will do on v2.
> >
> >         if (page_off + *len + tailroom > PAGE_SIZE)
> >                 return NULL;
> >
> >         if (rq->page_pool)
> > -               page = page_pool_dev_alloc_pages(rq->page_pool);
> > +               if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
> > +                       page = page_pool_dev_alloc_frag(rq->page_pool, pp_frag_offset,
> > +                                                       PAGE_SIZE);
>
> Don't you need to check whether pp_frag_offset is NULL? You call it
> once with NULL.
>

At the moment, page_pool is enabled only for mergeable mode, and the
path leading to a call with NULL pp_frag_offset is from small mode.
But I will evaluate again whether it is beneficial to support
page_pool for small mode on v2. Thanks.
> > +               else
> > +                       page = page_pool_dev_alloc_pages(rq->page_pool);
> >         else
> >                 page = alloc_page(GFP_ATOMIC);
> >
> >         if (!page)
> >                 return NULL;
> >
> > -       memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
> > +       pp_frag_offset_val = pp_frag_offset ? *pp_frag_offset : 0;
> > +
> > +       memcpy(page_address(page) + page_off + pp_frag_offset_val,
> > +              page_address(p) + offset, *len);
> >         page_off += *len;
> >
> >         while (--*num_buf) {
> > @@ -948,7 +959,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >                         goto err_buf;
> >                 }
> >
> > -               memcpy(page_address(page) + page_off,
> > +               memcpy(page_address(page) + page_off + pp_frag_offset_val,
> >                        page_address(p) + off, buflen);
> >                 page_off += buflen;
> >                 virtnet_put_page(rq, p);
> > @@ -1029,7 +1040,7 @@ static struct sk_buff *receive_small_xdp(struct net_device *dev,
> >                         SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> >                 xdp_page = xdp_linearize_page(rq, &num_buf, page,
> >                                               offset, header_offset,
> > -                                             &tlen);
> > +                                             &tlen, NULL);
> >                 if (!xdp_page)
> >                         goto err_xdp;
> >
> > @@ -1323,6 +1334,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >         unsigned int headroom = mergeable_ctx_to_headroom(ctx);
> >         struct page *xdp_page;
> >         unsigned int xdp_room;
> > +       unsigned int page_frag_offset = 0;
>
> Please use reverse x-mas tree notation.
>

Sure. Will do on v2.
> >
> >         /* Transient failure which in theory could occur if
> >          * in-flight packets from before XDP was enabled reach
> > @@ -1356,7 +1368,8 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >                 xdp_page = xdp_linearize_page(rq, num_buf,
> >                                               *page, offset,
> >                                               VIRTIO_XDP_HEADROOM,
> > -                                             len);
> > +                                             len,
> > +                                                 &page_frag_offset);
>
> You have also here some misalignment with regards to page_frag_offset.
>

Sure, Thanks!
> >                 if (!xdp_page)
> >                         return NULL;
> >         } else {
> > @@ -1366,14 +1379,19 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >                         return NULL;
> >
> >                 if (rq->page_pool)
> > -                       xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > +                       if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
> > +                               xdp_page = page_pool_dev_alloc_frag(rq->page_pool,
> > +                                                                   &page_frag_offset, PAGE_SIZE);
> > +                       else
> > +                               xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> >                 else
> >                         xdp_page = alloc_page(GFP_ATOMIC);
> > +
> >                 if (!xdp_page)
> >                         return NULL;
> >
> > -               memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM,
> > -                      page_address(*page) + offset, *len);
> > +               memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM +
> > +                               page_frag_offset, page_address(*page) + offset, *len);
> >         }
> >
> >         *frame_sz = PAGE_SIZE;
> > @@ -1382,7 +1400,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >
> >         *page = xdp_page;
> >
> > -       return page_address(*page) + VIRTIO_XDP_HEADROOM;
> > +       return page_address(*page) + VIRTIO_XDP_HEADROOM + page_frag_offset;
> >  }
> >
> >  static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > @@ -1762,6 +1780,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >         void *ctx;
> >         int err;
> >         unsigned int len, hole;
> > +       unsigned int pp_frag_offset;
>
> There same here.
>

Sure, Thanks!

> >
> >         /* Extra tailroom is needed to satisfy XDP's assumption. This
> >          * means rx frags coalescing won't work, but consider we've
> > @@ -1769,13 +1788,29 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >          */
> >         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> >         if (rq->page_pool) {
> > -               struct page *page;
> > +               if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG) {
> > +                       if (unlikely(!page_pool_dev_alloc_frag(rq->page_pool,
> > +                                                              &pp_frag_offset, len + room)))
> > +                               return -ENOMEM;
> > +                       buf = (char *)page_address(rq->page_pool->frag_page) +
> > +                               pp_frag_offset;
> > +                       buf += headroom; /* advance address leaving hole at front of pkt */
> > +                       hole = (PAGE_SIZE << rq->page_pool->p.order)
> > +                               - rq->page_pool->frag_offset;
> > +                       if (hole < len + room) {
> > +                               if (!headroom)
> > +                                       len += hole;
> > +                               rq->page_pool->frag_offset += hole;
> > +                       }
> > +               } else {
> > +                       struct page *page;
> >
> > -               page = page_pool_dev_alloc_pages(rq->page_pool);
> > -               if (unlikely(!page))
> > -                       return -ENOMEM;
> > -               buf = (char *)page_address(page);
> > -               buf += headroom; /* advance address leaving hole at front of pkt */
> > +                       page = page_pool_dev_alloc_pages(rq->page_pool);
> > +                       if (unlikely(!page))
> > +                               return -ENOMEM;
> > +                       buf = (char *)page_address(page);
> > +                       buf += headroom; /* advance address leaving hole at front of pkt */
> > +               }
> >         } else {
> >                 if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> >                         return -ENOMEM;
> > @@ -3800,13 +3835,16 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
> >         struct virtio_device *vdev = rq->vq->vdev;
> >
> >         struct page_pool_params pp_params = {
> > -               .order = 0,
> > +               .order = page_pool_frag ? SKB_FRAG_PAGE_ORDER : 0,
> >                 .pool_size = rq->vq->num_max,
> >                 .nid = dev_to_node(vdev->dev.parent),
> >                 .dev = vdev->dev.parent,
> >                 .offset = 0,
> >         };
> >
> > +       if (page_pool_frag)
> > +               pp_params.flags |= PP_FLAG_PAGE_FRAG;
> > +
> >         rq->page_pool = page_pool_create(&pp_params);
> >         if (IS_ERR(rq->page_pool)) {
> >                 dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > --
> > 2.31.1
> >
> >
>
> --
> /Horatiu

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
  2023-05-26  5:46 [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Liang Chen
                   ` (4 preceding siblings ...)
  2023-05-26  6:38 ` [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Jason Wang
@ 2023-05-28  6:16 ` Michael S. Tsirkin
  2023-05-29  7:25   ` Liang Chen
  5 siblings, 1 reply; 56+ messages in thread
From: Michael S. Tsirkin @ 2023-05-28  6:16 UTC (permalink / raw)
  To: Liang Chen
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 01:46:17PM +0800, Liang Chen wrote:
> "private" of buffer page is currently used for big mode to chain pages.
> But in mergeable mode, that offset of page could mean something else,
> e.g. when page_pool page is used instead. So excluding mergeable mode to
> avoid such a problem.
> 
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>

Ugh, the subject makes it look like the current code has a problem,
but I don't think so, because I don't think anything besides
big packets uses page->private.

The reason the patch is needed is that the follow-up patches
use page_pool.
Please adjust the commit log and subject to make all this clear.


> ---
>  drivers/net/virtio_net.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 5a7f7a76b920..c5dca0d92e64 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  			return NULL;
>  
>  		page = (struct page *)page->private;
> -		if (page)
> +		if (!vi->mergeable_rx_bufs && page)

To be safe let's limit to big packets too:

	if (!vi->mergeable_rx_bufs && vi->big_packets && page)



>  			give_pages(rq, page);
>  		goto ok;
>  	}
> -- 
> 2.31.1


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-26  5:46 ` [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance Liang Chen
  2023-05-26  6:50   ` Jason Wang
  2023-05-26 16:11   ` kernel test robot
@ 2023-05-28  6:20   ` Michael S. Tsirkin
  2023-05-29  7:27     ` Liang Chen
  2023-11-29 14:50   ` Zhu Yanjun
  2023-11-30  7:17   ` Zhu Yanjun
  4 siblings, 1 reply; 56+ messages in thread
From: Michael S. Tsirkin @ 2023-05-28  6:20 UTC (permalink / raw)
  To: Liang Chen
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> The implementation at the moment uses one page per packet in both the
> normal and XDP path. In addition, introducing a module parameter to enable
> or disable the usage of page pool (disabled by default).
> 
> In single-core vm testing environments, it gives a modest performance gain
> in the normal path.
>   Upstream codebase: 47.5 Gbits/sec
>   Upstream codebase + page_pool support: 50.2 Gbits/sec
> 
> In multi-core vm testing environments, The most significant performance
> gain is observed in XDP cpumap:
>   Upstream codebase: 1.38 Gbits/sec
>   Upstream codebase + page_pool support: 9.74 Gbits/sec
> 
> With this foundation, we can further integrate page pool fragmentation and
> DMA map/unmap support.
> 
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>

Why off by default?
I am guessing it sometimes has performance costs too?


What happens if we use page pool for big mode too?
The fewer modes we have, the better...


> ---
>  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
>  1 file changed, 146 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c5dca0d92e64..99c0ca0c1781 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
>  module_param(gso, bool, 0444);
>  module_param(napi_tx, bool, 0644);
>  
> +static bool page_pool_enabled;
> +module_param(page_pool_enabled, bool, 0400);
> +
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>  #define GOOD_COPY_LEN	128
> @@ -159,6 +162,9 @@ struct receive_queue {
>  	/* Chain pages by the private ptr. */
>  	struct page *pages;
>  
> +	/* Page pool */
> +	struct page_pool *page_pool;
> +
>  	/* Average packet length for mergeable receive buffers. */
>  	struct ewma_pkt_len mrg_avg_pkt_len;
>  
> @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
>  	return skb;
>  }
>  
> +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> +{
> +	if (rq->page_pool)
> +		page_pool_put_full_page(rq->page_pool, page, true);
> +	else
> +		put_page(page);
> +}
> +
>  /* Called from bottom half context */
>  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  				   struct receive_queue *rq,
> @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  	hdr = skb_vnet_hdr(skb);
>  	memcpy(hdr, hdr_p, hdr_len);
>  	if (page_to_free)
> -		put_page(page_to_free);
> +		virtnet_put_page(rq, page_to_free);
>  
>  	return skb;
>  }
> @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
>  	return ret;
>  }
>  
> -static void put_xdp_frags(struct xdp_buff *xdp)
> +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
>  {
>  	struct skb_shared_info *shinfo;
>  	struct page *xdp_page;
> @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
>  		shinfo = xdp_get_shared_info_from_buff(xdp);
>  		for (i = 0; i < shinfo->nr_frags; i++) {
>  			xdp_page = skb_frag_page(&shinfo->frags[i]);
> -			put_page(xdp_page);
> +			virtnet_put_page(rq, xdp_page);
>  		}
>  	}
>  }
> @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>  	if (page_off + *len + tailroom > PAGE_SIZE)
>  		return NULL;
>  
> -	page = alloc_page(GFP_ATOMIC);
> +	if (rq->page_pool)
> +		page = page_pool_dev_alloc_pages(rq->page_pool);
> +	else
> +		page = alloc_page(GFP_ATOMIC);
> +
>  	if (!page)
>  		return NULL;
>  
> @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>  		 * is sending packet larger than the MTU.
>  		 */
>  		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> -			put_page(p);
> +			virtnet_put_page(rq, p);
>  			goto err_buf;
>  		}
>  
>  		memcpy(page_address(page) + page_off,
>  		       page_address(p) + off, buflen);
>  		page_off += buflen;
> -		put_page(p);
> +		virtnet_put_page(rq, p);
>  	}
>  
>  	/* Headroom does not contribute to packet length */
>  	*len = page_off - VIRTIO_XDP_HEADROOM;
>  	return page;
>  err_buf:
> -	__free_pages(page, 0);
> +	if (rq->page_pool)
> +		page_pool_put_full_page(rq->page_pool, page, true);
> +	else
> +		__free_pages(page, 0);
>  	return NULL;
>  }
>  
> @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
>  		}
>  		stats->bytes += len;
>  		page = virt_to_head_page(buf);
> -		put_page(page);
> +		virtnet_put_page(rq, page);
>  	}
>  }
>  
> @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>  		cur_frag_size = truesize;
>  		xdp_frags_truesz += cur_frag_size;
>  		if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> -			put_page(page);
> +			virtnet_put_page(rq, page);
>  			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
>  				 dev->name, len, (unsigned long)(truesize - room));
>  			dev->stats.rx_length_errors++;
> @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>  	return 0;
>  
>  err:
> -	put_xdp_frags(xdp);
> +	put_xdp_frags(xdp, rq);
>  	return -EINVAL;
>  }
>  
> @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>  		if (*len + xdp_room > PAGE_SIZE)
>  			return NULL;
>  
> -		xdp_page = alloc_page(GFP_ATOMIC);
> +		if (rq->page_pool)
> +			xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> +		else
> +			xdp_page = alloc_page(GFP_ATOMIC);
>  		if (!xdp_page)
>  			return NULL;
>  
> @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>  
>  	*frame_sz = PAGE_SIZE;
>  
> -	put_page(*page);
> +	virtnet_put_page(rq, *page);
>  
>  	*page = xdp_page;
>  
> @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
>  		head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
>  		if (unlikely(!head_skb))
>  			break;
> +		if (rq->page_pool)
> +			skb_mark_for_recycle(head_skb);
>  		return head_skb;
>  
>  	case XDP_TX:
> @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
>  		break;
>  	}
>  
> -	put_xdp_frags(&xdp);
> +	put_xdp_frags(&xdp, rq);
>  
>  err_xdp:
> -	put_page(page);
> +	virtnet_put_page(rq, page);
>  	mergeable_buf_free(rq, num_buf, dev, stats);
>  
>  	stats->xdp_drops++;
> @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
>  	curr_skb = head_skb;
>  
> +	if (rq->page_pool)
> +		skb_mark_for_recycle(curr_skb);
> +
>  	if (unlikely(!curr_skb))
>  		goto err_skb;
>  	while (--num_buf) {
> @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  			curr_skb = nskb;
>  			head_skb->truesize += nskb->truesize;
>  			num_skb_frags = 0;
> +			if (rq->page_pool)
> +				skb_mark_for_recycle(curr_skb);
>  		}
>  		if (curr_skb != head_skb) {
>  			head_skb->data_len += len;
> @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  		}
>  		offset = buf - page_address(page);
>  		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> -			put_page(page);
> +			virtnet_put_page(rq, page);
>  			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
>  					     len, truesize);
>  		} else {
> @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  	return head_skb;
>  
>  err_skb:
> -	put_page(page);
> +	virtnet_put_page(rq, page);
>  	mergeable_buf_free(rq, num_buf, dev, stats);
>  
>  err_buf:
> @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>  	 * disabled GSO for XDP, it won't be a big issue.
>  	 */
>  	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> -	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> -		return -ENOMEM;
> +	if (rq->page_pool) {
> +		struct page *page;
>  
> -	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> -	buf += headroom; /* advance address leaving hole at front of pkt */
> -	get_page(alloc_frag->page);
> -	alloc_frag->offset += len + room;
> -	hole = alloc_frag->size - alloc_frag->offset;
> -	if (hole < len + room) {
> -		/* To avoid internal fragmentation, if there is very likely not
> -		 * enough space for another buffer, add the remaining space to
> -		 * the current buffer.
> -		 * XDP core assumes that frame_size of xdp_buff and the length
> -		 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> -		 */
> -		if (!headroom)
> -			len += hole;
> -		alloc_frag->offset += hole;
> -	}
> +		page = page_pool_dev_alloc_pages(rq->page_pool);
> +		if (unlikely(!page))
> +			return -ENOMEM;
> +		buf = (char *)page_address(page);
> +		buf += headroom; /* advance address leaving hole at front of pkt */
> +	} else {
> +		if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> +			return -ENOMEM;
>  
> +		buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +		buf += headroom; /* advance address leaving hole at front of pkt */
> +		get_page(alloc_frag->page);
> +		alloc_frag->offset += len + room;
> +		hole = alloc_frag->size - alloc_frag->offset;
> +		if (hole < len + room) {
> +			/* To avoid internal fragmentation, if there is very likely not
> +			 * enough space for another buffer, add the remaining space to
> +			 * the current buffer.
> +			 * XDP core assumes that frame_size of xdp_buff and the length
> +			 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> +			 */
> +			if (!headroom)
> +				len += hole;
> +			alloc_frag->offset += hole;
> +		}
> +	}
>  	sg_init_one(rq->sg, buf, len);
>  	ctx = mergeable_len_to_ctx(len + room, headroom);
>  	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
>  	if (err < 0)
> -		put_page(virt_to_head_page(buf));
> +		virtnet_put_page(rq, virt_to_head_page(buf));
>  
>  	return err;
>  }
> @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
>  	if (err < 0)
>  		return err;
>  
> -	err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> -					 MEM_TYPE_PAGE_SHARED, NULL);
> +	if (vi->rq[qp_index].page_pool)
> +		err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> +						 MEM_TYPE_PAGE_POOL,
> +						 vi->rq[qp_index].page_pool);
> +	else
> +		err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> +						 MEM_TYPE_PAGE_SHARED,
> +						 NULL);
> +
>  	if (err < 0)
>  		goto err_xdp_reg_mem_model;
>  
> @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
>  				ethtool_sprintf(&p, "tx_queue_%u_%s", i,
>  						virtnet_sq_stats_desc[j].desc);
>  		}
> +		page_pool_ethtool_stats_get_strings(p);
>  		break;
>  	}
>  }
> @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
>  	switch (sset) {
>  	case ETH_SS_STATS:
>  		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> -					       VIRTNET_SQ_STATS_LEN);
> +					       VIRTNET_SQ_STATS_LEN +
> +						(page_pool_enabled && vi->mergeable_rx_bufs ?
> +						 page_pool_ethtool_stats_get_count() : 0));
>  	default:
>  		return -EOPNOTSUPP;
>  	}
>  }
>  
> +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> +{
> +#ifdef CONFIG_PAGE_POOL_STATS
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	struct page_pool_stats pp_stats = {};
> +	int i;
> +
> +	for (i = 0; i < vi->curr_queue_pairs; i++) {
> +		if (!vi->rq[i].page_pool)
> +			continue;
> +		page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> +	}
> +	page_pool_ethtool_stats_get(data, &pp_stats);
> +#endif /* CONFIG_PAGE_POOL_STATS */
> +}
> +
>  static void virtnet_get_ethtool_stats(struct net_device *dev,
>  				      struct ethtool_stats *stats, u64 *data)
>  {
> @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
>  		} while (u64_stats_fetch_retry(&sq->stats.syncp, start));
>  		idx += VIRTNET_SQ_STATS_LEN;
>  	}
> +
> +	virtnet_get_page_pool_stats(dev, &data[idx]);
>  }
>  
>  static void virtnet_get_channels(struct net_device *dev,
> @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
>  	for (i = 0; i < vi->max_queue_pairs; i++) {
>  		__netif_napi_del(&vi->rq[i].napi);
>  		__netif_napi_del(&vi->sq[i].napi);
> +		if (vi->rq[i].page_pool)
> +			page_pool_destroy(vi->rq[i].page_pool);
>  	}
>  
>  	/* We called __netif_napi_del(),
> @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
>  	struct virtnet_info *vi = vq->vdev->priv;
>  	int i = vq2rxq(vq);
>  
> -	if (vi->mergeable_rx_bufs)
> -		put_page(virt_to_head_page(buf));
> -	else if (vi->big_packets)
> +	if (vi->mergeable_rx_bufs) {
> +		if (vi->rq[i].page_pool) {
> +			page_pool_put_full_page(vi->rq[i].page_pool,
> +						virt_to_head_page(buf),
> +						true);
> +		} else {
> +			put_page(virt_to_head_page(buf));
> +		}
> +	} else if (vi->big_packets) {
>  		give_pages(&vi->rq[i], buf);
> -	else
> +	} else {
>  		put_page(virt_to_head_page(buf));
> +	}
>  }
>  
>  static void free_unused_bufs(struct virtnet_info *vi)
> @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
>  	virtnet_free_queues(vi);
>  }
>  
> +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> +{
> +	struct virtio_device *vdev = rq->vq->vdev;
> +
> +	struct page_pool_params pp_params = {
> +		.order = 0,
> +		.pool_size = rq->vq->num_max,
> +		.nid = dev_to_node(vdev->dev.parent),
> +		.dev = vdev->dev.parent,
> +		.offset = 0,
> +	};
> +
> +	rq->page_pool = page_pool_create(&pp_params);
> +	if (IS_ERR(rq->page_pool)) {
> +		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> +			 PTR_ERR(rq->page_pool));
> +		rq->page_pool = NULL;
> +	}
> +}
> +
>  /* How large should a single buffer be so a queue full of these can fit at
>   * least one full packet?
>   * Logic below assumes the mergeable buffer header is used.
> @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
>  		vi->rq[i].vq = vqs[rxq2vq(i)];
>  		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
>  		vi->sq[i].vq = vqs[txq2vq(i)];
> +
> +		if (page_pool_enabled && vi->mergeable_rx_bufs)
> +			virtnet_alloc_page_pool(&vi->rq[i]);
> +		else
> +			dev_warn(&vi->vdev->dev,
> +				 "page pool only support mergeable mode\n");
> +
>  	}
>  
>  	/* run here: ret == 0. */
> -- 
> 2.31.1


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support
  2023-05-26  5:46 ` [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support Liang Chen
  2023-05-26  8:29   ` Horatiu Vultur
  2023-05-26 17:44   ` kernel test robot
@ 2023-05-28  6:25   ` Michael S. Tsirkin
  2023-05-29  7:29     ` Liang Chen
  2023-05-29  1:33   ` Yunsheng Lin
  3 siblings, 1 reply; 56+ messages in thread
From: Michael S. Tsirkin @ 2023-05-28  6:25 UTC (permalink / raw)
  To: Liang Chen
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 01:46:19PM +0800, Liang Chen wrote:
> To further enhance performance, implement page pool fragmentation
> support and introduce a module parameter to enable or disable it.
> 
> In single-core vm testing environments, there is an additional performance
> gain observed in the normal path compared to the one packet per page
> approach.
>   Upstream codebase: 47.5 Gbits/sec
>   Upstream codebase with page pool: 50.2 Gbits/sec
>   Upstream codebase with page pool fragmentation support: 52.3 Gbits/sec
> 
> There is also some performance gain for XDP cpumap.
>   Upstream codebase: 1.38 Gbits/sec
>   Upstream codebase with page pool: 9.74 Gbits/sec
>   Upstream codebase with page pool fragmentation: 10.3 Gbits/sec
> 
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>

I think it's called fragmenting not fragmentation.


> ---
>  drivers/net/virtio_net.c | 72 ++++++++++++++++++++++++++++++----------
>  1 file changed, 55 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 99c0ca0c1781..ac40b8c66c59 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -32,7 +32,9 @@ module_param(gso, bool, 0444);
>  module_param(napi_tx, bool, 0644);
>  
>  static bool page_pool_enabled;
> +static bool page_pool_frag;
>  module_param(page_pool_enabled, bool, 0400);
> +module_param(page_pool_frag, bool, 0400);
>  
>  /* FIXME: MTU in config. */
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)

So here again, the same questions:

- when is this a net perf gain, and when does it have no effect?
- can it be on by default?
- can we get rid of the extra modes?


> @@ -909,23 +911,32 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>  				       struct page *p,
>  				       int offset,
>  				       int page_off,
> -				       unsigned int *len)
> +				       unsigned int *len,
> +					   unsigned int *pp_frag_offset)
>  {
>  	int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>  	struct page *page;
> +	unsigned int pp_frag_offset_val;
>  
>  	if (page_off + *len + tailroom > PAGE_SIZE)
>  		return NULL;
>  
>  	if (rq->page_pool)
> -		page = page_pool_dev_alloc_pages(rq->page_pool);
> +		if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
> +			page = page_pool_dev_alloc_frag(rq->page_pool, pp_frag_offset,
> +							PAGE_SIZE);
> +		else
> +			page = page_pool_dev_alloc_pages(rq->page_pool);
>  	else
>  		page = alloc_page(GFP_ATOMIC);
>  
>  	if (!page)
>  		return NULL;
>  
> -	memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
> +	pp_frag_offset_val = pp_frag_offset ? *pp_frag_offset : 0;
> +
> +	memcpy(page_address(page) + page_off + pp_frag_offset_val,
> +	       page_address(p) + offset, *len);
>  	page_off += *len;
>  
>  	while (--*num_buf) {
> @@ -948,7 +959,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>  			goto err_buf;
>  		}
>  
> -		memcpy(page_address(page) + page_off,
> +		memcpy(page_address(page) + page_off + pp_frag_offset_val,
>  		       page_address(p) + off, buflen);
>  		page_off += buflen;
>  		virtnet_put_page(rq, p);
> @@ -1029,7 +1040,7 @@ static struct sk_buff *receive_small_xdp(struct net_device *dev,
>  			SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>  		xdp_page = xdp_linearize_page(rq, &num_buf, page,
>  					      offset, header_offset,
> -					      &tlen);
> +					      &tlen, NULL);
>  		if (!xdp_page)
>  			goto err_xdp;
>  
> @@ -1323,6 +1334,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>  	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
>  	struct page *xdp_page;
>  	unsigned int xdp_room;
> +	unsigned int page_frag_offset = 0;
>  
>  	/* Transient failure which in theory could occur if
>  	 * in-flight packets from before XDP was enabled reach
> @@ -1356,7 +1368,8 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>  		xdp_page = xdp_linearize_page(rq, num_buf,
>  					      *page, offset,
>  					      VIRTIO_XDP_HEADROOM,
> -					      len);
> +					      len,
> +						  &page_frag_offset);
>  		if (!xdp_page)
>  			return NULL;
>  	} else {
> @@ -1366,14 +1379,19 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>  			return NULL;
>  
>  		if (rq->page_pool)
> -			xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> +			if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
> +				xdp_page = page_pool_dev_alloc_frag(rq->page_pool,
> +								    &page_frag_offset, PAGE_SIZE);
> +			else
> +				xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
>  		else
>  			xdp_page = alloc_page(GFP_ATOMIC);
> +
>  		if (!xdp_page)
>  			return NULL;
>  
> -		memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM,
> -		       page_address(*page) + offset, *len);
> +		memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM +
> +				page_frag_offset, page_address(*page) + offset, *len);
>  	}
>  
>  	*frame_sz = PAGE_SIZE;
> @@ -1382,7 +1400,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>  
>  	*page = xdp_page;
>  
> -	return page_address(*page) + VIRTIO_XDP_HEADROOM;
> +	return page_address(*page) + VIRTIO_XDP_HEADROOM + page_frag_offset;
>  }
>  
>  static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> @@ -1762,6 +1780,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>  	void *ctx;
>  	int err;
>  	unsigned int len, hole;
> +	unsigned int pp_frag_offset;
>  
>  	/* Extra tailroom is needed to satisfy XDP's assumption. This
>  	 * means rx frags coalescing won't work, but consider we've
> @@ -1769,13 +1788,29 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>  	 */
>  	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
>  	if (rq->page_pool) {
> -		struct page *page;
> +		if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG) {
> +			if (unlikely(!page_pool_dev_alloc_frag(rq->page_pool,
> +							       &pp_frag_offset, len + room)))
> +				return -ENOMEM;
> +			buf = (char *)page_address(rq->page_pool->frag_page) +
> +				pp_frag_offset;
> +			buf += headroom; /* advance address leaving hole at front of pkt */
> +			hole = (PAGE_SIZE << rq->page_pool->p.order)
> +				- rq->page_pool->frag_offset;
> +			if (hole < len + room) {
> +				if (!headroom)
> +					len += hole;
> +				rq->page_pool->frag_offset += hole;
> +			}
> +		} else {
> +			struct page *page;
>  
> -		page = page_pool_dev_alloc_pages(rq->page_pool);
> -		if (unlikely(!page))
> -			return -ENOMEM;
> -		buf = (char *)page_address(page);
> -		buf += headroom; /* advance address leaving hole at front of pkt */
> +			page = page_pool_dev_alloc_pages(rq->page_pool);
> +			if (unlikely(!page))
> +				return -ENOMEM;
> +			buf = (char *)page_address(page);
> +			buf += headroom; /* advance address leaving hole at front of pkt */
> +		}
>  	} else {
>  		if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
>  			return -ENOMEM;
> @@ -3800,13 +3835,16 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
>  	struct virtio_device *vdev = rq->vq->vdev;
>  
>  	struct page_pool_params pp_params = {
> -		.order = 0,
> +		.order = page_pool_frag ? SKB_FRAG_PAGE_ORDER : 0,
>  		.pool_size = rq->vq->num_max,
>  		.nid = dev_to_node(vdev->dev.parent),
>  		.dev = vdev->dev.parent,
>  		.offset = 0,
>  	};
>  
> +	if (page_pool_frag)
> +		pp_params.flags |= PP_FLAG_PAGE_FRAG;
> +
>  	rq->page_pool = page_pool_create(&pp_params);
>  	if (IS_ERR(rq->page_pool)) {
>  		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> -- 
> 2.31.1


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-26 16:11   ` kernel test robot
@ 2023-05-28  6:27     ` Michael S. Tsirkin
  2023-05-29  7:28       ` Liang Chen
  0 siblings, 1 reply; 56+ messages in thread
From: Michael S. Tsirkin @ 2023-05-28  6:27 UTC (permalink / raw)
  To: kernel test robot
  Cc: Liang Chen, jasowang, oe-kbuild-all, virtualization, netdev,
	linux-kernel, xuanzhuo, kuba, edumazet, davem, pabeni,
	alexander.duyck

On Sat, May 27, 2023 at 12:11:25AM +0800, kernel test robot wrote:
> Hi Liang,
> 
> kernel test robot noticed the following build errors:
> 
> [auto build test ERROR on net-next/main]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
> base:   net-next/main
> patch link:    https://lore.kernel.org/r/20230526054621.18371-2-liangchen.linux%40gmail.com
> patch subject: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
> config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20230526/202305262334.GiFQ3wpG-lkp@intel.com/config)
> compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
> reproduce (this is a W=1 build):
>         # https://github.com/intel-lab-lkp/linux/commit/bfba563f43bba37181d8502cb2e566c32f96ec9e
>         git remote add linux-review https://github.com/intel-lab-lkp/linux
>         git fetch --no-tags linux-review Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
>         git checkout bfba563f43bba37181d8502cb2e566c32f96ec9e
>         # save the config file
>         mkdir build_dir && cp config build_dir/.config
>         make W=1 O=build_dir ARCH=x86_64 olddefconfig
>         make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash
> 
> If you fix the issue, kindly add following tag where applicable
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202305262334.GiFQ3wpG-lkp@intel.com/
> 
> All errors (new ones prefixed by >>):
> 
>    ld: vmlinux.o: in function `virtnet_find_vqs':
> >> virtio_net.c:(.text+0x901fb5): undefined reference to `page_pool_create'
>    ld: vmlinux.o: in function `add_recvbuf_mergeable.isra.0':
> >> virtio_net.c:(.text+0x905618): undefined reference to `page_pool_alloc_pages'
>    ld: vmlinux.o: in function `xdp_linearize_page':
>    virtio_net.c:(.text+0x906b6b): undefined reference to `page_pool_alloc_pages'
>    ld: vmlinux.o: in function `mergeable_xdp_get_buf.isra.0':
>    virtio_net.c:(.text+0x90728f): undefined reference to `page_pool_alloc_pages'


You need to tweak Kconfig to select PAGE_POOL, I think.
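
Roughly like this (sketch only; the depends/select lines of the
existing VIRTIO_NET entry in drivers/net/Kconfig are from memory):

config VIRTIO_NET
	tristate "Virtio network driver"
	depends on VIRTIO
	select NET_FAILOVER
	# new: pulls in page_pool_create()/page_pool_alloc_pages()
	select PAGE_POOL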

> -- 
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
  2023-05-26  6:38 ` [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Jason Wang
  2023-05-27 12:33   ` Liang Chen
@ 2023-05-28  6:29   ` Michael S. Tsirkin
  2023-05-29  7:25     ` Liang Chen
  1 sibling, 1 reply; 56+ messages in thread
From: Michael S. Tsirkin @ 2023-05-28  6:29 UTC (permalink / raw)
  To: Jason Wang
  Cc: Liang Chen, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Fri, May 26, 2023 at 02:38:54PM +0800, Jason Wang wrote:
> On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> >
> > "private" of buffer page is currently used for big mode to chain pages.
> > But in mergeable mode, that offset of page could mean something else,
> > e.g. when page_pool page is used instead. So excluding mergeable mode to
> > avoid such a problem.
> 
> If this issue happens only in the case of page_pool, it would be
> better to squash it there.
> 
> Thanks


This is a tiny patch so I don't care. Generally it's OK
to first rework code and then change functionality.
In this case what Jason says is right, especially because
you then do not need to explain that the current code is OK.

> >
> > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > ---
> >  drivers/net/virtio_net.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 5a7f7a76b920..c5dca0d92e64 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >                         return NULL;
> >
> >                 page = (struct page *)page->private;
> > -               if (page)
> > +               if (!vi->mergeable_rx_bufs && page)
> >                         give_pages(rq, page);
> >                 goto ok;
> >         }
> > --
> > 2.31.1
> >


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-27 12:35     ` Liang Chen
@ 2023-05-28  6:40       ` Michael S. Tsirkin
  2023-05-29  7:28         ` Liang Chen
  0 siblings, 1 reply; 56+ messages in thread
From: Michael S. Tsirkin @ 2023-05-28  6:40 UTC (permalink / raw)
  To: Liang Chen
  Cc: Jason Wang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Sat, May 27, 2023 at 08:35:01PM +0800, Liang Chen wrote:
> On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> > >
> > > The implementation at the moment uses one page per packet in both the
> > > normal and XDP path.
> >
> > It's better to explain why we need a page pool and how it can help the
> > performance.
> >
> 
> Sure, I will include that on v2.
> > > In addition, introducing a module parameter to enable
> > > or disable the usage of page pool (disabled by default).
> >
> > If page pool wins for most of the cases, any reason to disable it by default?
> >
> 
> Thank you for raising the point. It does make sense to enable it by default.

I'd like to see more benchmarks then, please, with a variety of packet
sizes, UDP and TCP.
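
E.g. something like the following (hypothetical netperf runs, assuming
a netperf server in the peer VM; $peer is a placeholder):

	# TCP, small / MTU-sized / large writes
	netperf -H $peer -t TCP_STREAM -l 30 -- -m 64
	netperf -H $peer -t TCP_STREAM -l 30 -- -m 1400
	netperf -H $peer -t TCP_STREAM -l 30 -- -m 64000
	# UDP, small and MTU-sized datagrams
	netperf -H $peer -t UDP_STREAM -l 30 -- -m 64
	netperf -H $peer -t UDP_STREAM -l 30 -- -m 1400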

> > >
> > > In single-core vm testing environments, it gives a modest performance gain
> > > in the normal path.
> > >   Upstream codebase: 47.5 Gbits/sec
> > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > >
> > > In multi-core vm testing environments, The most significant performance
> > > gain is observed in XDP cpumap:
> > >   Upstream codebase: 1.38 Gbits/sec
> > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> >
> > Please show more details on the test. E.g which kinds of tests have
> > you measured?
> >
> > Btw, it would be better to measure PPS as well.
> >
> 
> Sure. It will be added on v2.
> > >
> > > With this foundation, we can further integrate page pool fragmentation and
> > > DMA map/unmap support.
> > >
> > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > ---
> > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> >
> > I believe we should make virtio-net select CONFIG_PAGE_POOL, or do
> > the ifdef tricks at least.
> >
> 
> Sure. It will be done on v2.
> > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index c5dca0d92e64..99c0ca0c1781 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > >  module_param(gso, bool, 0444);
> > >  module_param(napi_tx, bool, 0644);
> > >
> > > +static bool page_pool_enabled;
> > > +module_param(page_pool_enabled, bool, 0400);
> > > +
> > >  /* FIXME: MTU in config. */
> > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > >  #define GOOD_COPY_LEN  128
> > > @@ -159,6 +162,9 @@ struct receive_queue {
> > >         /* Chain pages by the private ptr. */
> > >         struct page *pages;
> > >
> > > +       /* Page pool */
> > > +       struct page_pool *page_pool;
> > > +
> > >         /* Average packet length for mergeable receive buffers. */
> > >         struct ewma_pkt_len mrg_avg_pkt_len;
> > >
> > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > >         return skb;
> > >  }
> > >
> > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > +{
> > > +       if (rq->page_pool)
> > > +               page_pool_put_full_page(rq->page_pool, page, true);
> > > +       else
> > > +               put_page(page);
> > > +}
> > > +
> > >  /* Called from bottom half context */
> > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > >                                    struct receive_queue *rq,
> > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > >         hdr = skb_vnet_hdr(skb);
> > >         memcpy(hdr, hdr_p, hdr_len);
> > >         if (page_to_free)
> > > -               put_page(page_to_free);
> > > +               virtnet_put_page(rq, page_to_free);
> > >
> > >         return skb;
> > >  }
> > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > >         return ret;
> > >  }
> > >
> > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > >  {
> >
> > rq could be fetched from xdp_rxq_info?
> 
> Yeah, it has the queue_index there.
> >
> > >         struct skb_shared_info *shinfo;
> > >         struct page *xdp_page;
> > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > >                 shinfo = xdp_get_shared_info_from_buff(xdp);
> > >                 for (i = 0; i < shinfo->nr_frags; i++) {
> > >                         xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > -                       put_page(xdp_page);
> > > +                       virtnet_put_page(rq, xdp_page);
> > >                 }
> > >         }
> > >  }
> > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > >         if (page_off + *len + tailroom > PAGE_SIZE)
> > >                 return NULL;
> > >
> > > -       page = alloc_page(GFP_ATOMIC);
> > > +       if (rq->page_pool)
> > > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > > +       else
> > > +               page = alloc_page(GFP_ATOMIC);
> > > +
> > >         if (!page)
> > >                 return NULL;
> > >
> > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > >                  * is sending packet larger than the MTU.
> > >                  */
> > >                 if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > -                       put_page(p);
> > > +                       virtnet_put_page(rq, p);
> > >                         goto err_buf;
> > >                 }
> > >
> > >                 memcpy(page_address(page) + page_off,
> > >                        page_address(p) + off, buflen);
> > >                 page_off += buflen;
> > > -               put_page(p);
> > > +               virtnet_put_page(rq, p);
> > >         }
> > >
> > >         /* Headroom does not contribute to packet length */
> > >         *len = page_off - VIRTIO_XDP_HEADROOM;
> > >         return page;
> > >  err_buf:
> > > -       __free_pages(page, 0);
> > > +       if (rq->page_pool)
> > > +               page_pool_put_full_page(rq->page_pool, page, true);
> > > +       else
> > > +               __free_pages(page, 0);
> > >         return NULL;
> > >  }
> > >
> > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > >                 }
> > >                 stats->bytes += len;
> > >                 page = virt_to_head_page(buf);
> > > -               put_page(page);
> > > +               virtnet_put_page(rq, page);
> > >         }
> > >  }
> > >
> > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > >                 cur_frag_size = truesize;
> > >                 xdp_frags_truesz += cur_frag_size;
> > >                 if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > -                       put_page(page);
> > > +                       virtnet_put_page(rq, page);
> > >                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > >                                  dev->name, len, (unsigned long)(truesize - room));
> > >                         dev->stats.rx_length_errors++;
> > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > >         return 0;
> > >
> > >  err:
> > > -       put_xdp_frags(xdp);
> > > +       put_xdp_frags(xdp, rq);
> > >         return -EINVAL;
> > >  }
> > >
> > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > >                 if (*len + xdp_room > PAGE_SIZE)
> > >                         return NULL;
> > >
> > > -               xdp_page = alloc_page(GFP_ATOMIC);
> > > +               if (rq->page_pool)
> > > +                       xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > +               else
> > > +                       xdp_page = alloc_page(GFP_ATOMIC);
> > >                 if (!xdp_page)
> > >                         return NULL;
> > >
> > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > >
> > >         *frame_sz = PAGE_SIZE;
> > >
> > > -       put_page(*page);
> > > +       virtnet_put_page(rq, *page);
> > >
> > >         *page = xdp_page;
> > >
> > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > >                 head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > >                 if (unlikely(!head_skb))
> > >                         break;
> > > +               if (rq->page_pool)
> > > +                       skb_mark_for_recycle(head_skb);
> > >                 return head_skb;
> > >
> > >         case XDP_TX:
> > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > >                 break;
> > >         }
> > >
> > > -       put_xdp_frags(&xdp);
> > > +       put_xdp_frags(&xdp, rq);
> > >
> > >  err_xdp:
> > > -       put_page(page);
> > > +       virtnet_put_page(rq, page);
> > >         mergeable_buf_free(rq, num_buf, dev, stats);
> > >
> > >         stats->xdp_drops++;
> > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >         head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > >         curr_skb = head_skb;
> > >
> > > +       if (rq->page_pool)
> > > +               skb_mark_for_recycle(curr_skb);
> > > +
> > >         if (unlikely(!curr_skb))
> > >                 goto err_skb;
> > >         while (--num_buf) {
> > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >                         curr_skb = nskb;
> > >                         head_skb->truesize += nskb->truesize;
> > >                         num_skb_frags = 0;
> > > +                       if (rq->page_pool)
> > > +                               skb_mark_for_recycle(curr_skb);
> > >                 }
> > >                 if (curr_skb != head_skb) {
> > >                         head_skb->data_len += len;
> > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >                 }
> > >                 offset = buf - page_address(page);
> > >                 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > -                       put_page(page);
> > > +                       virtnet_put_page(rq, page);
> >
> > I wonder why we can't do this during buffer allocation like other drivers?
> >
> 
> Sorry, I don't quite understand the point here. Would you please
> elaborate a bit more?
> > >                         skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > >                                              len, truesize);
> > >                 } else {
> > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >         return head_skb;
> > >
> > >  err_skb:
> > > -       put_page(page);
> > > +       virtnet_put_page(rq, page);
> > >         mergeable_buf_free(rq, num_buf, dev, stats);
> > >
> > >  err_buf:
> > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > >          * disabled GSO for XDP, it won't be a big issue.
> > >          */
> > >         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > -       if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > -               return -ENOMEM;
> > > +       if (rq->page_pool) {
> > > +               struct page *page;
> > >
> > > -       buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > -       buf += headroom; /* advance address leaving hole at front of pkt */
> > > -       get_page(alloc_frag->page);
> > > -       alloc_frag->offset += len + room;
> > > -       hole = alloc_frag->size - alloc_frag->offset;
> > > -       if (hole < len + room) {
> > > -               /* To avoid internal fragmentation, if there is very likely not
> > > -                * enough space for another buffer, add the remaining space to
> > > -                * the current buffer.
> > > -                * XDP core assumes that frame_size of xdp_buff and the length
> > > -                * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > -                */
> > > -               if (!headroom)
> > > -                       len += hole;
> > > -               alloc_frag->offset += hole;
> > > -       }
> > > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > > +               if (unlikely(!page))
> > > +                       return -ENOMEM;
> > > +               buf = (char *)page_address(page);
> > > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > > +       } else {
> > > +               if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> >
> > Why not simply use a helper like virtnet_page_frag_refill() and add
> > the page_pool allocation logic there? It helps to reduce the
> > changeset.
> >
> 
> Sure. Will do that on v2.
> > > +                       return -ENOMEM;
> > >
> > > +               buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > > +               get_page(alloc_frag->page);
> > > +               alloc_frag->offset += len + room;
> > > +               hole = alloc_frag->size - alloc_frag->offset;
> > > +               if (hole < len + room) {
> > > +                       /* To avoid internal fragmentation, if there is very likely not
> > > +                        * enough space for another buffer, add the remaining space to
> > > +                        * the current buffer.
> > > +                        * XDP core assumes that frame_size of xdp_buff and the length
> > > +                        * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > +                        */
> > > +                       if (!headroom)
> > > +                               len += hole;
> > > +                       alloc_frag->offset += hole;
> > > +               }
> > > +       }
> > >         sg_init_one(rq->sg, buf, len);
> > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > >         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > >         if (err < 0)
> > > -               put_page(virt_to_head_page(buf));
> > > +               virtnet_put_page(rq, virt_to_head_page(buf));
> > >
> > >         return err;
> > >  }
> > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > >         if (err < 0)
> > >                 return err;
> > >
> > > -       err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > -                                        MEM_TYPE_PAGE_SHARED, NULL);
> > > +       if (vi->rq[qp_index].page_pool)
> > > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > +                                                MEM_TYPE_PAGE_POOL,
> > > +                                                vi->rq[qp_index].page_pool);
> > > +       else
> > > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > +                                                MEM_TYPE_PAGE_SHARED,
> > > +                                                NULL);
> > > +
> > >         if (err < 0)
> > >                 goto err_xdp_reg_mem_model;
> > >
> > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > >                                 ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > >                                                 virtnet_sq_stats_desc[j].desc);
> > >                 }
> > > +               page_pool_ethtool_stats_get_strings(p);
> > >                 break;
> > >         }
> > >  }
> > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > >         switch (sset) {
> > >         case ETH_SS_STATS:
> > >                 return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > -                                              VIRTNET_SQ_STATS_LEN);
> > > +                                              VIRTNET_SQ_STATS_LEN +
> > > +                                               (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > +                                                page_pool_ethtool_stats_get_count() : 0));
> > >         default:
> > >                 return -EOPNOTSUPP;
> > >         }
> > >  }
> > >
> > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > +{
> > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > +       struct virtnet_info *vi = netdev_priv(dev);
> > > +       struct page_pool_stats pp_stats = {};
> > > +       int i;
> > > +
> > > +       for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > +               if (!vi->rq[i].page_pool)
> > > +                       continue;
> > > +               page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > +       }
> > > +       page_pool_ethtool_stats_get(data, &pp_stats);
> > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > +}
> > > +
> > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > >                                       struct ethtool_stats *stats, u64 *data)
> > >  {
> > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > >                 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > >                 idx += VIRTNET_SQ_STATS_LEN;
> > >         }
> > > +
> > > +       virtnet_get_page_pool_stats(dev, &data[idx]);
> > >  }
> > >
> > >  static void virtnet_get_channels(struct net_device *dev,
> > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > >                 __netif_napi_del(&vi->rq[i].napi);
> > >                 __netif_napi_del(&vi->sq[i].napi);
> > > +               if (vi->rq[i].page_pool)
> > > +                       page_pool_destroy(vi->rq[i].page_pool);
> > >         }
> > >
> > >         /* We called __netif_napi_del(),
> > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > >         struct virtnet_info *vi = vq->vdev->priv;
> > >         int i = vq2rxq(vq);
> > >
> > > -       if (vi->mergeable_rx_bufs)
> > > -               put_page(virt_to_head_page(buf));
> > > -       else if (vi->big_packets)
> > > +       if (vi->mergeable_rx_bufs) {
> > > +               if (vi->rq[i].page_pool) {
> > > +                       page_pool_put_full_page(vi->rq[i].page_pool,
> > > +                                               virt_to_head_page(buf),
> > > +                                               true);
> > > +               } else {
> > > +                       put_page(virt_to_head_page(buf));
> > > +               }
> > > +       } else if (vi->big_packets) {
> > >                 give_pages(&vi->rq[i], buf);
> >
> > Any reason only mergeable mode was modified but not small and big?
> >
> > Thanks
> >
> 
> Big mode uses the page chain to recycle pages, hence the use of
> "private" in the buffer page. I will take a further look into that to
> see whether it is better to use the page pool in these cases as well. Thanks!
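
(For context, this is roughly how big mode recycles pages through the
per-queue chain, which is why it owns "private"; paraphrased from the
upstream give_pages()/get_a_page() helpers, so treat it as a sketch
rather than the exact source.)

static void give_pages(struct receive_queue *rq, struct page *page)
{
        struct page *end;

        /* Find the end of the chain and splice it onto rq->pages. */
        for (end = page; end->private; end = (struct page *)end->private);
        end->private = (unsigned long)rq->pages;
        rq->pages = page;
}

static struct page *get_a_page(struct receive_queue *rq, gfp_t gfp_mask)
{
        struct page *p = rq->pages;

        if (p) {
                rq->pages = (struct page *)p->private;
                /* "private" is only used to chain pages, clear it */
                p->private = 0;
        } else {
                p = alloc_page(gfp_mask);
        }
        return p;
}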
> 
> 
> 
> > > -       else
> > > +       } else {
> > >                 put_page(virt_to_head_page(buf));
> > > +       }
> > >  }
> > >
> > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > >         virtnet_free_queues(vi);
> > >  }
> > >
> > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > +{
> > > +       struct virtio_device *vdev = rq->vq->vdev;
> > > +
> > > +       struct page_pool_params pp_params = {
> > > +               .order = 0,
> > > +               .pool_size = rq->vq->num_max,
> > > +               .nid = dev_to_node(vdev->dev.parent),
> > > +               .dev = vdev->dev.parent,
> > > +               .offset = 0,
> > > +       };
> > > +
> > > +       rq->page_pool = page_pool_create(&pp_params);
> > > +       if (IS_ERR(rq->page_pool)) {
> > > +               dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > +                        PTR_ERR(rq->page_pool));
> > > +               rq->page_pool = NULL;
> > > +       }
> > > +}
> > > +
> > >  /* How large should a single buffer be so a queue full of these can fit at
> > >   * least one full packet?
> > >   * Logic below assumes the mergeable buffer header is used.
> > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > >                 vi->rq[i].vq = vqs[rxq2vq(i)];
> > >                 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > >                 vi->sq[i].vq = vqs[txq2vq(i)];
> > > +
> > > +               if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > +                       virtnet_alloc_page_pool(&vi->rq[i]);
> > > +               else
> > > +                       dev_warn(&vi->vdev->dev,
> > > +                                "page pool only support mergeable mode\n");
> > > +
> > >         }
> > >
> > >         /* run here: ret == 0. */
> > > --
> > > 2.31.1
> > >
> >


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support
  2023-05-26  5:46 ` [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support Liang Chen
                     ` (2 preceding siblings ...)
  2023-05-28  6:25   ` Michael S. Tsirkin
@ 2023-05-29  1:33   ` Yunsheng Lin
  2023-05-29  7:30     ` Liang Chen
  3 siblings, 1 reply; 56+ messages in thread
From: Yunsheng Lin @ 2023-05-29  1:33 UTC (permalink / raw)
  To: Liang Chen, jasowang, mst
  Cc: virtualization, netdev, linux-kernel, xuanzhuo, kuba, edumazet,
	davem, pabeni, alexander.duyck

On 2023/5/26 13:46, Liang Chen wrote:

...

> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 99c0ca0c1781..ac40b8c66c59 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -32,7 +32,9 @@ module_param(gso, bool, 0444);
>  module_param(napi_tx, bool, 0644);
>  
>  static bool page_pool_enabled;
> +static bool page_pool_frag;
>  module_param(page_pool_enabled, bool, 0400);
> +module_param(page_pool_frag, bool, 0400);

The patchset below unifies the frag and non-frag page handling behind
the page_pool_alloc_frag() API; perhaps it would simplify the
driver's page pool support.

https://patchwork.kernel.org/project/netdevbpf/cover/20230526092616.40355-1-linyunsheng@huawei.com/

>  

...

> @@ -1769,13 +1788,29 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>  	 */
>  	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
>  	if (rq->page_pool) {
> -		struct page *page;
> +		if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG) {
> +			if (unlikely(!page_pool_dev_alloc_frag(rq->page_pool,
> +							       &pp_frag_offset, len + room)))
> +				return -ENOMEM;
> +			buf = (char *)page_address(rq->page_pool->frag_page) +
> +				pp_frag_offset;
> +			buf += headroom; /* advance address leaving hole at front of pkt */
> +			hole = (PAGE_SIZE << rq->page_pool->p.order)
> +				- rq->page_pool->frag_offset;
> +			if (hole < len + room) {
> +				if (!headroom)
> +					len += hole;
> +				rq->page_pool->frag_offset += hole;

Is there any reason why the driver needs to be aware of page_pool->frag_offset?
Won't page_pool_dev_alloc_frag() drain the last page for you when it is
called with the size being 'len + room' later?
One case I can think of that needs this is getting an accurate truesize report
for the skb, but I am not sure it matters that much, as the 'struct page_frag_cache'
and 'page_frag' implementations both have a similar problem.
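
(A minimal sketch of the allocation path being suggested here, where the
driver only consumes the returned page and offset and never touches the
pool's frag_page/frag_offset; 'rq', 'len' and 'room' are assumed to come
from add_recvbuf_mergeable(), and headroom/error handling is simplified.)

static void *virtnet_pp_frag_alloc(struct receive_queue *rq,
                                   unsigned int len, unsigned int room)
{
        unsigned int offset;
        struct page *page;

        /* The pool internally retires a page that cannot fit len + room
         * and switches to a fresh one, so no frag_offset fixup is needed
         * in the driver.
         */
        page = page_pool_dev_alloc_frag(rq->page_pool, &offset, len + room);
        if (unlikely(!page))
                return NULL;

        return (char *)page_address(page) + offset;
}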

> +			}
> +		} else {
> +			struct page *page;
>  
> -		page = page_pool_dev_alloc_pages(rq->page_pool);
> -		if (unlikely(!page))
> -			return -ENOMEM;
> -		buf = (char *)page_address(page);
> -		buf += headroom; /* advance address leaving hole at front of pkt */
> +			page = page_pool_dev_alloc_pages(rq->page_pool);
> +			if (unlikely(!page))
> +				return -ENOMEM;
> +			buf = (char *)page_address(page);
> +			buf += headroom; /* advance address leaving hole at front of pkt */
> +		}
>  	} else {
>  		if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
>  			return -ENOMEM;
> @@ -3800,13 +3835,16 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
>  	struct virtio_device *vdev = rq->vq->vdev;
>  
>  	struct page_pool_params pp_params = {
> -		.order = 0,
> +		.order = page_pool_frag ? SKB_FRAG_PAGE_ORDER : 0,
>  		.pool_size = rq->vq->num_max,

If it is using order SKB_FRAG_PAGE_ORDER pages, perhaps pool_size does
not have to be rq->vq->num_max? Even for order-0 pages, perhaps the
pool_size does not need to be as big as rq->vq->num_max?

>  		.nid = dev_to_node(vdev->dev.parent),
>  		.dev = vdev->dev.parent,
>  		.offset = 0,
>  	};
>  
> +	if (page_pool_frag)
> +		pp_params.flags |= PP_FLAG_PAGE_FRAG;
> +
>  	rq->page_pool = page_pool_create(&pp_params);
>  	if (IS_ERR(rq->page_pool)) {
>  		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> 

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
  2023-05-28  6:16 ` Michael S. Tsirkin
@ 2023-05-29  7:25   ` Liang Chen
  0 siblings, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-29  7:25 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Sun, May 28, 2023 at 2:16 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, May 26, 2023 at 01:46:17PM +0800, Liang Chen wrote:
> > "private" of buffer page is currently used for big mode to chain pages.
> > But in mergeable mode, that offset of page could mean something else,
> > e.g. when page_pool page is used instead. So excluding mergeable mode to
> > avoid such a problem.
> >
> > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
>
> Ugh, the subject makes it look like the current code has a problem,
> but I don't think so, because I don't think anything besides
> big packets uses page->private.
>
> The reason the patch is needed is that the follow-up patches
> use page_pool.
> Please adjust the commit log and subject to make all this clear.
>
>
> > ---
> >  drivers/net/virtio_net.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 5a7f7a76b920..c5dca0d92e64 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >                       return NULL;
> >
> >               page = (struct page *)page->private;
> > -             if (page)
> > +             if (!vi->mergeable_rx_bufs && page)
>
> To be safe let's limit to big packets too:
>
>         if (!vi->mergeable_rx_bufs && vi->big_packets && page)
>
>
>

Sure, thanks!

> >                       give_pages(rq, page);
> >               goto ok;
> >       }
> > --
> > 2.31.1
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain
  2023-05-28  6:29   ` Michael S. Tsirkin
@ 2023-05-29  7:25     ` Liang Chen
  0 siblings, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-29  7:25 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Sun, May 28, 2023 at 2:29 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, May 26, 2023 at 02:38:54PM +0800, Jason Wang wrote:
> > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> > >
> > > "private" of buffer page is currently used for big mode to chain pages.
> > > But in mergeable mode, that offset of page could mean something else,
> > > e.g. when page_pool page is used instead. So excluding mergeable mode to
> > > avoid such a problem.
> >
> > If this issue happens only in the case of page_pool, it would be
> > better to squash it there.
> >
> > Thanks
>
>
> This is a tiny patch so I don't care. Generally it's ok
> to first rework code and then change functionality.
> In this case what Jason says is right, especially because
> you then do not need to explain that the current code is ok.
>

Sure. It will be squashed into the page pool enablement patch. Thanks!

> > >
> > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > ---
> > >  drivers/net/virtio_net.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index 5a7f7a76b920..c5dca0d92e64 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -497,7 +497,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > >                         return NULL;
> > >
> > >                 page = (struct page *)page->private;
> > > -               if (page)
> > > +               if (!vi->mergeable_rx_bufs && page)
> > >                         give_pages(rq, page);
> > >                 goto ok;
> > >         }
> > > --
> > > 2.31.1
> > >
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-28  6:20   ` Michael S. Tsirkin
@ 2023-05-29  7:27     ` Liang Chen
  2023-05-29  9:55       ` Michael S. Tsirkin
  0 siblings, 1 reply; 56+ messages in thread
From: Liang Chen @ 2023-05-29  7:27 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > The implementation at the moment uses one page per packet in both the
> > normal and XDP path. In addition, introducing a module parameter to enable
> > or disable the usage of page pool (disabled by default).
> >
> > In single-core vm testing environments, it gives a modest performance gain
> > in the normal path.
> >   Upstream codebase: 47.5 Gbits/sec
> >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> >
> > In multi-core vm testing environments, The most significant performance
> > gain is observed in XDP cpumap:
> >   Upstream codebase: 1.38 Gbits/sec
> >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> >
> > With this foundation, we can further integrate page pool fragmentation and
> > DMA map/unmap support.
> >
> > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
>
> Why off by default?
> I am guessing it sometimes has performance costs too?
>
>
> What happens if we use page pool for big mode too?
> The fewer modes we have, the better...
>
>

Sure, now I believe it makes sense to enable it by default. When the
packet size is very small, it reduces the likelihood of skb
coalescing, but such cases are rare.
The use of the page pool for big mode is being evaluated now. Thanks!

> > ---
> >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> >  1 file changed, 146 insertions(+), 42 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index c5dca0d92e64..99c0ca0c1781 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> >  module_param(gso, bool, 0444);
> >  module_param(napi_tx, bool, 0644);
> >
> > +static bool page_pool_enabled;
> > +module_param(page_pool_enabled, bool, 0400);
> > +
> >  /* FIXME: MTU in config. */
> >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> >  #define GOOD_COPY_LEN        128
> > @@ -159,6 +162,9 @@ struct receive_queue {
> >       /* Chain pages by the private ptr. */
> >       struct page *pages;
> >
> > +     /* Page pool */
> > +     struct page_pool *page_pool;
> > +
> >       /* Average packet length for mergeable receive buffers. */
> >       struct ewma_pkt_len mrg_avg_pkt_len;
> >
> > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> >       return skb;
> >  }
> >
> > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > +{
> > +     if (rq->page_pool)
> > +             page_pool_put_full_page(rq->page_pool, page, true);
> > +     else
> > +             put_page(page);
> > +}
> > +
> >  /* Called from bottom half context */
> >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >                                  struct receive_queue *rq,
> > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> >       hdr = skb_vnet_hdr(skb);
> >       memcpy(hdr, hdr_p, hdr_len);
> >       if (page_to_free)
> > -             put_page(page_to_free);
> > +             virtnet_put_page(rq, page_to_free);
> >
> >       return skb;
> >  }
> > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> >       return ret;
> >  }
> >
> > -static void put_xdp_frags(struct xdp_buff *xdp)
> > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> >  {
> >       struct skb_shared_info *shinfo;
> >       struct page *xdp_page;
> > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> >               shinfo = xdp_get_shared_info_from_buff(xdp);
> >               for (i = 0; i < shinfo->nr_frags; i++) {
> >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > -                     put_page(xdp_page);
> > +                     virtnet_put_page(rq, xdp_page);
> >               }
> >       }
> >  }
> > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >       if (page_off + *len + tailroom > PAGE_SIZE)
> >               return NULL;
> >
> > -     page = alloc_page(GFP_ATOMIC);
> > +     if (rq->page_pool)
> > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > +     else
> > +             page = alloc_page(GFP_ATOMIC);
> > +
> >       if (!page)
> >               return NULL;
> >
> > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >                * is sending packet larger than the MTU.
> >                */
> >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > -                     put_page(p);
> > +                     virtnet_put_page(rq, p);
> >                       goto err_buf;
> >               }
> >
> >               memcpy(page_address(page) + page_off,
> >                      page_address(p) + off, buflen);
> >               page_off += buflen;
> > -             put_page(p);
> > +             virtnet_put_page(rq, p);
> >       }
> >
> >       /* Headroom does not contribute to packet length */
> >       *len = page_off - VIRTIO_XDP_HEADROOM;
> >       return page;
> >  err_buf:
> > -     __free_pages(page, 0);
> > +     if (rq->page_pool)
> > +             page_pool_put_full_page(rq->page_pool, page, true);
> > +     else
> > +             __free_pages(page, 0);
> >       return NULL;
> >  }
> >
> > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> >               }
> >               stats->bytes += len;
> >               page = virt_to_head_page(buf);
> > -             put_page(page);
> > +             virtnet_put_page(rq, page);
> >       }
> >  }
> >
> > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> >               cur_frag_size = truesize;
> >               xdp_frags_truesz += cur_frag_size;
> >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > -                     put_page(page);
> > +                     virtnet_put_page(rq, page);
> >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> >                                dev->name, len, (unsigned long)(truesize - room));
> >                       dev->stats.rx_length_errors++;
> > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> >       return 0;
> >
> >  err:
> > -     put_xdp_frags(xdp);
> > +     put_xdp_frags(xdp, rq);
> >       return -EINVAL;
> >  }
> >
> > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >               if (*len + xdp_room > PAGE_SIZE)
> >                       return NULL;
> >
> > -             xdp_page = alloc_page(GFP_ATOMIC);
> > +             if (rq->page_pool)
> > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > +             else
> > +                     xdp_page = alloc_page(GFP_ATOMIC);
> >               if (!xdp_page)
> >                       return NULL;
> >
> > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >
> >       *frame_sz = PAGE_SIZE;
> >
> > -     put_page(*page);
> > +     virtnet_put_page(rq, *page);
> >
> >       *page = xdp_page;
> >
> > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> >               if (unlikely(!head_skb))
> >                       break;
> > +             if (rq->page_pool)
> > +                     skb_mark_for_recycle(head_skb);
> >               return head_skb;
> >
> >       case XDP_TX:
> > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> >               break;
> >       }
> >
> > -     put_xdp_frags(&xdp);
> > +     put_xdp_frags(&xdp, rq);
> >
> >  err_xdp:
> > -     put_page(page);
> > +     virtnet_put_page(rq, page);
> >       mergeable_buf_free(rq, num_buf, dev, stats);
> >
> >       stats->xdp_drops++;
> > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> >       curr_skb = head_skb;
> >
> > +     if (rq->page_pool)
> > +             skb_mark_for_recycle(curr_skb);
> > +
> >       if (unlikely(!curr_skb))
> >               goto err_skb;
> >       while (--num_buf) {
> > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >                       curr_skb = nskb;
> >                       head_skb->truesize += nskb->truesize;
> >                       num_skb_frags = 0;
> > +                     if (rq->page_pool)
> > +                             skb_mark_for_recycle(curr_skb);
> >               }
> >               if (curr_skb != head_skb) {
> >                       head_skb->data_len += len;
> > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >               }
> >               offset = buf - page_address(page);
> >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > -                     put_page(page);
> > +                     virtnet_put_page(rq, page);
> >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> >                                            len, truesize);
> >               } else {
> > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> >       return head_skb;
> >
> >  err_skb:
> > -     put_page(page);
> > +     virtnet_put_page(rq, page);
> >       mergeable_buf_free(rq, num_buf, dev, stats);
> >
> >  err_buf:
> > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >        * disabled GSO for XDP, it won't be a big issue.
> >        */
> >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > -             return -ENOMEM;
> > +     if (rq->page_pool) {
> > +             struct page *page;
> >
> > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > -     get_page(alloc_frag->page);
> > -     alloc_frag->offset += len + room;
> > -     hole = alloc_frag->size - alloc_frag->offset;
> > -     if (hole < len + room) {
> > -             /* To avoid internal fragmentation, if there is very likely not
> > -              * enough space for another buffer, add the remaining space to
> > -              * the current buffer.
> > -              * XDP core assumes that frame_size of xdp_buff and the length
> > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > -              */
> > -             if (!headroom)
> > -                     len += hole;
> > -             alloc_frag->offset += hole;
> > -     }
> > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > +             if (unlikely(!page))
> > +                     return -ENOMEM;
> > +             buf = (char *)page_address(page);
> > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > +     } else {
> > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > +                     return -ENOMEM;
> >
> > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > +             get_page(alloc_frag->page);
> > +             alloc_frag->offset += len + room;
> > +             hole = alloc_frag->size - alloc_frag->offset;
> > +             if (hole < len + room) {
> > +                     /* To avoid internal fragmentation, if there is very likely not
> > +                      * enough space for another buffer, add the remaining space to
> > +                      * the current buffer.
> > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > +                      */
> > +                     if (!headroom)
> > +                             len += hole;
> > +                     alloc_frag->offset += hole;
> > +             }
> > +     }
> >       sg_init_one(rq->sg, buf, len);
> >       ctx = mergeable_len_to_ctx(len + room, headroom);
> >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> >       if (err < 0)
> > -             put_page(virt_to_head_page(buf));
> > +             virtnet_put_page(rq, virt_to_head_page(buf));
> >
> >       return err;
> >  }
> > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> >       if (err < 0)
> >               return err;
> >
> > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > +     if (vi->rq[qp_index].page_pool)
> > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > +                                              MEM_TYPE_PAGE_POOL,
> > +                                              vi->rq[qp_index].page_pool);
> > +     else
> > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > +                                              MEM_TYPE_PAGE_SHARED,
> > +                                              NULL);
> > +
> >       if (err < 0)
> >               goto err_xdp_reg_mem_model;
> >
> > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> >                                               virtnet_sq_stats_desc[j].desc);
> >               }
> > +             page_pool_ethtool_stats_get_strings(p);
> >               break;
> >       }
> >  }
> > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> >       switch (sset) {
> >       case ETH_SS_STATS:
> >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > -                                            VIRTNET_SQ_STATS_LEN);
> > +                                            VIRTNET_SQ_STATS_LEN +
> > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > +                                              page_pool_ethtool_stats_get_count() : 0));
> >       default:
> >               return -EOPNOTSUPP;
> >       }
> >  }
> >
> > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > +{
> > +#ifdef CONFIG_PAGE_POOL_STATS
> > +     struct virtnet_info *vi = netdev_priv(dev);
> > +     struct page_pool_stats pp_stats = {};
> > +     int i;
> > +
> > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > +             if (!vi->rq[i].page_pool)
> > +                     continue;
> > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > +     }
> > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > +#endif /* CONFIG_PAGE_POOL_STATS */
> > +}
> > +
> >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> >                                     struct ethtool_stats *stats, u64 *data)
> >  {
> > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> >               idx += VIRTNET_SQ_STATS_LEN;
> >       }
> > +
> > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> >  }
> >
> >  static void virtnet_get_channels(struct net_device *dev,
> > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> >       for (i = 0; i < vi->max_queue_pairs; i++) {
> >               __netif_napi_del(&vi->rq[i].napi);
> >               __netif_napi_del(&vi->sq[i].napi);
> > +             if (vi->rq[i].page_pool)
> > +                     page_pool_destroy(vi->rq[i].page_pool);
> >       }
> >
> >       /* We called __netif_napi_del(),
> > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> >       struct virtnet_info *vi = vq->vdev->priv;
> >       int i = vq2rxq(vq);
> >
> > -     if (vi->mergeable_rx_bufs)
> > -             put_page(virt_to_head_page(buf));
> > -     else if (vi->big_packets)
> > +     if (vi->mergeable_rx_bufs) {
> > +             if (vi->rq[i].page_pool) {
> > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > +                                             virt_to_head_page(buf),
> > +                                             true);
> > +             } else {
> > +                     put_page(virt_to_head_page(buf));
> > +             }
> > +     } else if (vi->big_packets) {
> >               give_pages(&vi->rq[i], buf);
> > -     else
> > +     } else {
> >               put_page(virt_to_head_page(buf));
> > +     }
> >  }
> >
> >  static void free_unused_bufs(struct virtnet_info *vi)
> > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> >       virtnet_free_queues(vi);
> >  }
> >
> > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > +{
> > +     struct virtio_device *vdev = rq->vq->vdev;
> > +
> > +     struct page_pool_params pp_params = {
> > +             .order = 0,
> > +             .pool_size = rq->vq->num_max,
> > +             .nid = dev_to_node(vdev->dev.parent),
> > +             .dev = vdev->dev.parent,
> > +             .offset = 0,
> > +     };
> > +
> > +     rq->page_pool = page_pool_create(&pp_params);
> > +     if (IS_ERR(rq->page_pool)) {
> > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > +                      PTR_ERR(rq->page_pool));
> > +             rq->page_pool = NULL;
> > +     }
> > +}
> > +
> >  /* How large should a single buffer be so a queue full of these can fit at
> >   * least one full packet?
> >   * Logic below assumes the mergeable buffer header is used.
> > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> >               vi->rq[i].vq = vqs[rxq2vq(i)];
> >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> >               vi->sq[i].vq = vqs[txq2vq(i)];
> > +
> > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > +             else
> > +                     dev_warn(&vi->vdev->dev,
> > +                              "page pool only support mergeable mode\n");
> > +
> >       }
> >
> >       /* run here: ret == 0. */
> > --
> > 2.31.1
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-28  6:27     ` Michael S. Tsirkin
@ 2023-05-29  7:28       ` Liang Chen
  0 siblings, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-29  7:28 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: kernel test robot, jasowang, oe-kbuild-all, virtualization,
	netdev, linux-kernel, xuanzhuo, kuba, edumazet, davem, pabeni,
	alexander.duyck

On Sun, May 28, 2023 at 2:27 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Sat, May 27, 2023 at 12:11:25AM +0800, kernel test robot wrote:
> > Hi Liang,
> >
> > kernel test robot noticed the following build errors:
> >
> > [auto build test ERROR on net-next/main]
> >
> > url:    https://github.com/intel-lab-lkp/linux/commits/Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
> > base:   net-next/main
> > patch link:    https://lore.kernel.org/r/20230526054621.18371-2-liangchen.linux%40gmail.com
> > patch subject: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
> > config: x86_64-defconfig (https://download.01.org/0day-ci/archive/20230526/202305262334.GiFQ3wpG-lkp@intel.com/config)
> > compiler: gcc-11 (Debian 11.3.0-12) 11.3.0
> > reproduce (this is a W=1 build):
> >         # https://github.com/intel-lab-lkp/linux/commit/bfba563f43bba37181d8502cb2e566c32f96ec9e
> >         git remote add linux-review https://github.com/intel-lab-lkp/linux
> >         git fetch --no-tags linux-review Liang-Chen/virtio_net-Add-page_pool-support-to-improve-performance/20230526-135805
> >         git checkout bfba563f43bba37181d8502cb2e566c32f96ec9e
> >         # save the config file
> >         mkdir build_dir && cp config build_dir/.config
> >         make W=1 O=build_dir ARCH=x86_64 olddefconfig
> >         make W=1 O=build_dir ARCH=x86_64 SHELL=/bin/bash
> >
> > If you fix the issue, kindly add following tag where applicable
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202305262334.GiFQ3wpG-lkp@intel.com/
> >
> > All errors (new ones prefixed by >>):
> >
> >    ld: vmlinux.o: in function `virtnet_find_vqs':
> > >> virtio_net.c:(.text+0x901fb5): undefined reference to `page_pool_create'
> >    ld: vmlinux.o: in function `add_recvbuf_mergeable.isra.0':
> > >> virtio_net.c:(.text+0x905618): undefined reference to `page_pool_alloc_pages'
> >    ld: vmlinux.o: in function `xdp_linearize_page':
> >    virtio_net.c:(.text+0x906b6b): undefined reference to `page_pool_alloc_pages'
> >    ld: vmlinux.o: in function `mergeable_xdp_get_buf.isra.0':
> >    virtio_net.c:(.text+0x90728f): undefined reference to `page_pool_alloc_pages'
>
>
> you need to tweak Kconfig to select PAGE_POOL I think.
>

Sure, thanks!
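
(For reference, the link failure goes away once the driver selects
PAGE_POOL; a sketch of the Kconfig change, with the surrounding lines of
the VIRTIO_NET entry in drivers/net/Kconfig approximated from memory:)

--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
 config VIRTIO_NET
 	tristate "Virtio network driver"
 	depends on VIRTIO
 	select NET_FAILOVER
+	select PAGE_POOL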


> > --
> > 0-DAY CI Kernel Test Service
> > https://github.com/intel/lkp-tests/wiki
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-28  6:40       ` Michael S. Tsirkin
@ 2023-05-29  7:28         ` Liang Chen
  2023-05-31  3:10           ` Xuan Zhuo
  0 siblings, 1 reply; 56+ messages in thread
From: Liang Chen @ 2023-05-29  7:28 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Jason Wang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Sun, May 28, 2023 at 2:40 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Sat, May 27, 2023 at 08:35:01PM +0800, Liang Chen wrote:
> > On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> > > >
> > > > The implementation at the moment uses one page per packet in both the
> > > > normal and XDP path.
> > >
> > > It's better to explain why we need a page pool and how it can help
> > > performance.
> > >
> >
> > Sure, I will include that on v2.
> > > > In addition, introducing a module parameter to enable
> > > > or disable the usage of page pool (disabled by default).
> > >
> > > If page pool wins for most of the cases, any reason to disable it by default?
> > >
> >
> > Thank you for raising the point. It does make sense to enable it by default.
>
> I'd like to see more benchmarks pls then, with a variety of packet
> sizes, udp and tcp.
>

Sure, more benchmarks will be provided. Thanks.


> > > >
> > > > In single-core vm testing environments, it gives a modest performance gain
> > > > in the normal path.
> > > >   Upstream codebase: 47.5 Gbits/sec
> > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > >
> > > > In multi-core vm testing environments, The most significant performance
> > > > gain is observed in XDP cpumap:
> > > >   Upstream codebase: 1.38 Gbits/sec
> > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > >
> > > Please show more details on the test. E.g which kinds of tests have
> > > you measured?
> > >
> > > Btw, it would be better to measure PPS as well.
> > >
> >
> > Sure. It will be added on v2.
> > > >
> > > > With this foundation, we can further integrate page pool fragmentation and
> > > > DMA map/unmap support.
> > > >
> > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > ---
> > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > >
> > > I believe we should make virtio-net to select CONFIG_PAGE_POOL or do
> > > the ifdef tricks at least.
> > >
> >
> > Sure. It will be done on v2.
> > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > >  module_param(gso, bool, 0444);
> > > >  module_param(napi_tx, bool, 0644);
> > > >
> > > > +static bool page_pool_enabled;
> > > > +module_param(page_pool_enabled, bool, 0400);
> > > > +
> > > >  /* FIXME: MTU in config. */
> > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > >  #define GOOD_COPY_LEN  128
> > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > >         /* Chain pages by the private ptr. */
> > > >         struct page *pages;
> > > >
> > > > +       /* Page pool */
> > > > +       struct page_pool *page_pool;
> > > > +
> > > >         /* Average packet length for mergeable receive buffers. */
> > > >         struct ewma_pkt_len mrg_avg_pkt_len;
> > > >
> > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > >         return skb;
> > > >  }
> > > >
> > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > +{
> > > > +       if (rq->page_pool)
> > > > +               page_pool_put_full_page(rq->page_pool, page, true);
> > > > +       else
> > > > +               put_page(page);
> > > > +}
> > > > +
> > > >  /* Called from bottom half context */
> > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > >                                    struct receive_queue *rq,
> > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > >         hdr = skb_vnet_hdr(skb);
> > > >         memcpy(hdr, hdr_p, hdr_len);
> > > >         if (page_to_free)
> > > > -               put_page(page_to_free);
> > > > +               virtnet_put_page(rq, page_to_free);
> > > >
> > > >         return skb;
> > > >  }
> > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > >         return ret;
> > > >  }
> > > >
> > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > >  {
> > >
> > > rq could be fetched from xdp_rxq_info?
> >
> > Yeah, it has the queue_index there.
> > >
> > > >         struct skb_shared_info *shinfo;
> > > >         struct page *xdp_page;
> > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > >                 shinfo = xdp_get_shared_info_from_buff(xdp);
> > > >                 for (i = 0; i < shinfo->nr_frags; i++) {
> > > >                         xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > -                       put_page(xdp_page);
> > > > +                       virtnet_put_page(rq, xdp_page);
> > > >                 }
> > > >         }
> > > >  }
> > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > >         if (page_off + *len + tailroom > PAGE_SIZE)
> > > >                 return NULL;
> > > >
> > > > -       page = alloc_page(GFP_ATOMIC);
> > > > +       if (rq->page_pool)
> > > > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > +       else
> > > > +               page = alloc_page(GFP_ATOMIC);
> > > > +
> > > >         if (!page)
> > > >                 return NULL;
> > > >
> > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > >                  * is sending packet larger than the MTU.
> > > >                  */
> > > >                 if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > -                       put_page(p);
> > > > +                       virtnet_put_page(rq, p);
> > > >                         goto err_buf;
> > > >                 }
> > > >
> > > >                 memcpy(page_address(page) + page_off,
> > > >                        page_address(p) + off, buflen);
> > > >                 page_off += buflen;
> > > > -               put_page(p);
> > > > +               virtnet_put_page(rq, p);
> > > >         }
> > > >
> > > >         /* Headroom does not contribute to packet length */
> > > >         *len = page_off - VIRTIO_XDP_HEADROOM;
> > > >         return page;
> > > >  err_buf:
> > > > -       __free_pages(page, 0);
> > > > +       if (rq->page_pool)
> > > > +               page_pool_put_full_page(rq->page_pool, page, true);
> > > > +       else
> > > > +               __free_pages(page, 0);
> > > >         return NULL;
> > > >  }
> > > >
> > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > >                 }
> > > >                 stats->bytes += len;
> > > >                 page = virt_to_head_page(buf);
> > > > -               put_page(page);
> > > > +               virtnet_put_page(rq, page);
> > > >         }
> > > >  }
> > > >
> > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > >                 cur_frag_size = truesize;
> > > >                 xdp_frags_truesz += cur_frag_size;
> > > >                 if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > -                       put_page(page);
> > > > +                       virtnet_put_page(rq, page);
> > > >                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > >                                  dev->name, len, (unsigned long)(truesize - room));
> > > >                         dev->stats.rx_length_errors++;
> > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > >         return 0;
> > > >
> > > >  err:
> > > > -       put_xdp_frags(xdp);
> > > > +       put_xdp_frags(xdp, rq);
> > > >         return -EINVAL;
> > > >  }
> > > >
> > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > >                 if (*len + xdp_room > PAGE_SIZE)
> > > >                         return NULL;
> > > >
> > > > -               xdp_page = alloc_page(GFP_ATOMIC);
> > > > +               if (rq->page_pool)
> > > > +                       xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > +               else
> > > > +                       xdp_page = alloc_page(GFP_ATOMIC);
> > > >                 if (!xdp_page)
> > > >                         return NULL;
> > > >
> > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > >
> > > >         *frame_sz = PAGE_SIZE;
> > > >
> > > > -       put_page(*page);
> > > > +       virtnet_put_page(rq, *page);
> > > >
> > > >         *page = xdp_page;
> > > >
> > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > >                 head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > >                 if (unlikely(!head_skb))
> > > >                         break;
> > > > +               if (rq->page_pool)
> > > > +                       skb_mark_for_recycle(head_skb);
> > > >                 return head_skb;
> > > >
> > > >         case XDP_TX:
> > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > >                 break;
> > > >         }
> > > >
> > > > -       put_xdp_frags(&xdp);
> > > > +       put_xdp_frags(&xdp, rq);
> > > >
> > > >  err_xdp:
> > > > -       put_page(page);
> > > > +       virtnet_put_page(rq, page);
> > > >         mergeable_buf_free(rq, num_buf, dev, stats);
> > > >
> > > >         stats->xdp_drops++;
> > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >         head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > >         curr_skb = head_skb;
> > > >
> > > > +       if (rq->page_pool)
> > > > +               skb_mark_for_recycle(curr_skb);
> > > > +
> > > >         if (unlikely(!curr_skb))
> > > >                 goto err_skb;
> > > >         while (--num_buf) {
> > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >                         curr_skb = nskb;
> > > >                         head_skb->truesize += nskb->truesize;
> > > >                         num_skb_frags = 0;
> > > > +                       if (rq->page_pool)
> > > > +                               skb_mark_for_recycle(curr_skb);
> > > >                 }
> > > >                 if (curr_skb != head_skb) {
> > > >                         head_skb->data_len += len;
> > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >                 }
> > > >                 offset = buf - page_address(page);
> > > >                 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > -                       put_page(page);
> > > > +                       virtnet_put_page(rq, page);
> > >
> > > I wonder why we can't do this during buffer allocation like other drivers?
> > >
> >
> > Sorry, I don't quite understand the point here. Would you please
> > elaborate a bit more?
> > > >                         skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > >                                              len, truesize);
> > > >                 } else {
> > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >         return head_skb;
> > > >
> > > >  err_skb:
> > > > -       put_page(page);
> > > > +       virtnet_put_page(rq, page);
> > > >         mergeable_buf_free(rq, num_buf, dev, stats);
> > > >
> > > >  err_buf:
> > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > >          * disabled GSO for XDP, it won't be a big issue.
> > > >          */
> > > >         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > -       if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > -               return -ENOMEM;
> > > > +       if (rq->page_pool) {
> > > > +               struct page *page;
> > > >
> > > > -       buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > -       buf += headroom; /* advance address leaving hole at front of pkt */
> > > > -       get_page(alloc_frag->page);
> > > > -       alloc_frag->offset += len + room;
> > > > -       hole = alloc_frag->size - alloc_frag->offset;
> > > > -       if (hole < len + room) {
> > > > -               /* To avoid internal fragmentation, if there is very likely not
> > > > -                * enough space for another buffer, add the remaining space to
> > > > -                * the current buffer.
> > > > -                * XDP core assumes that frame_size of xdp_buff and the length
> > > > -                * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > -                */
> > > > -               if (!headroom)
> > > > -                       len += hole;
> > > > -               alloc_frag->offset += hole;
> > > > -       }
> > > > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > +               if (unlikely(!page))
> > > > +                       return -ENOMEM;
> > > > +               buf = (char *)page_address(page);
> > > > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > > > +       } else {
> > > > +               if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > >
> > > Why not simply use a helper like virtnet_page_frag_refill() and add
> > > the page_pool allocation logic there? It helps to reduce the
> > > changeset.
> > >
> >
> > Sure. Will do that on v2.
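
(A rough sketch of the kind of helper being suggested: one function that
yields a receive buffer either from the page pool or from the page_frag
allocator. The function name is mine, and the headroom/hole handling from
add_recvbuf_mergeable() is omitted, so this is illustrative only.)

static void *virtnet_rq_alloc_buf(struct receive_queue *rq,
                                  struct page_frag *alloc_frag,
                                  unsigned int len, unsigned int room,
                                  gfp_t gfp)
{
        char *buf;

        if (rq->page_pool) {
                struct page *page = page_pool_dev_alloc_pages(rq->page_pool);

                if (unlikely(!page))
                        return NULL;
                /* One whole page per buffer; headroom applied by the caller. */
                return page_address(page);
        }

        if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
                return NULL;

        buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
        get_page(alloc_frag->page);
        alloc_frag->offset += len + room;
        /* Hole handling from add_recvbuf_mergeable() omitted here. */
        return buf;
}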
> > > > +                       return -ENOMEM;
> > > >
> > > > +               buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > > > +               get_page(alloc_frag->page);
> > > > +               alloc_frag->offset += len + room;
> > > > +               hole = alloc_frag->size - alloc_frag->offset;
> > > > +               if (hole < len + room) {
> > > > +                       /* To avoid internal fragmentation, if there is very likely not
> > > > +                        * enough space for another buffer, add the remaining space to
> > > > +                        * the current buffer.
> > > > +                        * XDP core assumes that frame_size of xdp_buff and the length
> > > > +                        * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > +                        */
> > > > +                       if (!headroom)
> > > > +                               len += hole;
> > > > +                       alloc_frag->offset += hole;
> > > > +               }
> > > > +       }
> > > >         sg_init_one(rq->sg, buf, len);
> > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > >         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > >         if (err < 0)
> > > > -               put_page(virt_to_head_page(buf));
> > > > +               virtnet_put_page(rq, virt_to_head_page(buf));
> > > >
> > > >         return err;
> > > >  }
> > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > >         if (err < 0)
> > > >                 return err;
> > > >
> > > > -       err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > -                                        MEM_TYPE_PAGE_SHARED, NULL);
> > > > +       if (vi->rq[qp_index].page_pool)
> > > > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > +                                                MEM_TYPE_PAGE_POOL,
> > > > +                                                vi->rq[qp_index].page_pool);
> > > > +       else
> > > > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > +                                                MEM_TYPE_PAGE_SHARED,
> > > > +                                                NULL);
> > > > +
> > > >         if (err < 0)
> > > >                 goto err_xdp_reg_mem_model;
> > > >
> > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > >                                 ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > >                                                 virtnet_sq_stats_desc[j].desc);
> > > >                 }
> > > > +               page_pool_ethtool_stats_get_strings(p);
> > > >                 break;
> > > >         }
> > > >  }
> > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > >         switch (sset) {
> > > >         case ETH_SS_STATS:
> > > >                 return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > -                                              VIRTNET_SQ_STATS_LEN);
> > > > +                                              VIRTNET_SQ_STATS_LEN +
> > > > +                                               (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > +                                                page_pool_ethtool_stats_get_count() : 0));
> > > >         default:
> > > >                 return -EOPNOTSUPP;
> > > >         }
> > > >  }
> > > >
> > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > +{
> > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > +       struct virtnet_info *vi = netdev_priv(dev);
> > > > +       struct page_pool_stats pp_stats = {};
> > > > +       int i;
> > > > +
> > > > +       for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > +               if (!vi->rq[i].page_pool)
> > > > +                       continue;
> > > > +               page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > +       }
> > > > +       page_pool_ethtool_stats_get(data, &pp_stats);
> > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > +}
> > > > +
> > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > >                                       struct ethtool_stats *stats, u64 *data)
> > > >  {
> > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > >                 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > >                 idx += VIRTNET_SQ_STATS_LEN;
> > > >         }
> > > > +
> > > > +       virtnet_get_page_pool_stats(dev, &data[idx]);
> > > >  }
> > > >
> > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > +               if (vi->rq[i].page_pool)
> > > > +                       page_pool_destroy(vi->rq[i].page_pool);
> > > >         }
> > > >
> > > >         /* We called __netif_napi_del(),
> > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > >         struct virtnet_info *vi = vq->vdev->priv;
> > > >         int i = vq2rxq(vq);
> > > >
> > > > -       if (vi->mergeable_rx_bufs)
> > > > -               put_page(virt_to_head_page(buf));
> > > > -       else if (vi->big_packets)
> > > > +       if (vi->mergeable_rx_bufs) {
> > > > +               if (vi->rq[i].page_pool) {
> > > > +                       page_pool_put_full_page(vi->rq[i].page_pool,
> > > > +                                               virt_to_head_page(buf),
> > > > +                                               true);
> > > > +               } else {
> > > > +                       put_page(virt_to_head_page(buf));
> > > > +               }
> > > > +       } else if (vi->big_packets) {
> > > >                 give_pages(&vi->rq[i], buf);
> > >
> > > Any reason only mergeable mode was modified but not small and big?
> > >
> > > Thanks
> > >
> >
> > Big mode uses the page chain to recycle pages, hence the use of
> > "private" in the buffer page. I will take a further look into that to
> > see whether it is better to use the page pool in these cases as well. Thanks!
> >
> >
> >
> > > > -       else
> > > > +       } else {
> > > >                 put_page(virt_to_head_page(buf));
> > > > +       }
> > > >  }
> > > >
> > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > >         virtnet_free_queues(vi);
> > > >  }
> > > >
> > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > +{
> > > > +       struct virtio_device *vdev = rq->vq->vdev;
> > > > +
> > > > +       struct page_pool_params pp_params = {
> > > > +               .order = 0,
> > > > +               .pool_size = rq->vq->num_max,
> > > > +               .nid = dev_to_node(vdev->dev.parent),
> > > > +               .dev = vdev->dev.parent,
> > > > +               .offset = 0,
> > > > +       };
> > > > +
> > > > +       rq->page_pool = page_pool_create(&pp_params);
> > > > +       if (IS_ERR(rq->page_pool)) {
> > > > +               dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > +                        PTR_ERR(rq->page_pool));
> > > > +               rq->page_pool = NULL;
> > > > +       }
> > > > +}
> > > > +
> > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > >   * least one full packet?
> > > >   * Logic below assumes the mergeable buffer header is used.
> > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > >                 vi->rq[i].vq = vqs[rxq2vq(i)];
> > > >                 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > >                 vi->sq[i].vq = vqs[txq2vq(i)];
> > > > +
> > > > +               if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > +                       virtnet_alloc_page_pool(&vi->rq[i]);
> > > > +               else
> > > > +                       dev_warn(&vi->vdev->dev,
> > > > +                                "page pool only support mergeable mode\n");
> > > > +
> > > >         }
> > > >
> > > >         /* run here: ret == 0. */
> > > > --
> > > > 2.31.1
> > > >
> > >
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support
  2023-05-28  6:25   ` Michael S. Tsirkin
@ 2023-05-29  7:29     ` Liang Chen
  0 siblings, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-29  7:29 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Sun, May 28, 2023 at 2:25 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Fri, May 26, 2023 at 01:46:19PM +0800, Liang Chen wrote:
> > To further enhance performance, implement page pool fragmentation
> > support and introduce a module parameter to enable or disable it.
> >
> > In single-core vm testing environments, there is an additional performance
> > gain observed in the normal path compared to the one packet per page
> > approach.
> >   Upstream codebase: 47.5 Gbits/sec
> >   Upstream codebase with page pool: 50.2 Gbits/sec
> >   Upstream codebase with page pool fragmentation support: 52.3 Gbits/sec
> >
> > There is also some performance gain for XDP cpumap.
> >   Upstream codebase: 1.38 Gbits/sec
> >   Upstream codebase with page pool: 9.74 Gbits/sec
> >   Upstream codebase with page pool fragmentation: 10.3 Gbits/sec
> >
> > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
>
> I think it's called fragmenting not fragmentation.
>
>

Sure, thanks!

> > ---
> >  drivers/net/virtio_net.c | 72 ++++++++++++++++++++++++++++++----------
> >  1 file changed, 55 insertions(+), 17 deletions(-)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 99c0ca0c1781..ac40b8c66c59 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -32,7 +32,9 @@ module_param(gso, bool, 0444);
> >  module_param(napi_tx, bool, 0644);
> >
> >  static bool page_pool_enabled;
> > +static bool page_pool_frag;
> >  module_param(page_pool_enabled, bool, 0400);
> > +module_param(page_pool_frag, bool, 0400);
> >
> >  /* FIXME: MTU in config. */
> >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>
> So here again same questions.
>
> -when is this a net perf gain when does it have no effect?
> -can be on by default
> - can we get rid of the extra modes?
>
>

Yeah, now I believe it makes sense to enable it by default to avoid
the extra modes. Thanks.
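A minimal sketch of what that could look like, assuming fragmenting is
simply always enabled whenever a pool is created (this is an assumption
about a possible v2, not the posted patch):

static void virtnet_alloc_page_pool(struct receive_queue *rq)
{
	struct virtio_device *vdev = rq->vq->vdev;

	/* Frag mode unconditionally, no page_pool_frag module parameter. */
	struct page_pool_params pp_params = {
		.order = SKB_FRAG_PAGE_ORDER,
		.flags = PP_FLAG_PAGE_FRAG,
		.pool_size = rq->vq->num_max,
		.nid = dev_to_node(vdev->dev.parent),
		.dev = vdev->dev.parent,
		.offset = 0,
	};

	rq->page_pool = page_pool_create(&pp_params);
	if (IS_ERR(rq->page_pool)) {
		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
			 PTR_ERR(rq->page_pool));
		rq->page_pool = NULL;
	}
}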


> > @@ -909,23 +911,32 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >                                      struct page *p,
> >                                      int offset,
> >                                      int page_off,
> > -                                    unsigned int *len)
> > +                                    unsigned int *len,
> > +                                        unsigned int *pp_frag_offset)
> >  {
> >       int tailroom = SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> >       struct page *page;
> > +     unsigned int pp_frag_offset_val;
> >
> >       if (page_off + *len + tailroom > PAGE_SIZE)
> >               return NULL;
> >
> >       if (rq->page_pool)
> > -             page = page_pool_dev_alloc_pages(rq->page_pool);
> > +             if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
> > +                     page = page_pool_dev_alloc_frag(rq->page_pool, pp_frag_offset,
> > +                                                     PAGE_SIZE);
> > +             else
> > +                     page = page_pool_dev_alloc_pages(rq->page_pool);
> >       else
> >               page = alloc_page(GFP_ATOMIC);
> >
> >       if (!page)
> >               return NULL;
> >
> > -     memcpy(page_address(page) + page_off, page_address(p) + offset, *len);
> > +     pp_frag_offset_val = pp_frag_offset ? *pp_frag_offset : 0;
> > +
> > +     memcpy(page_address(page) + page_off + pp_frag_offset_val,
> > +            page_address(p) + offset, *len);
> >       page_off += *len;
> >
> >       while (--*num_buf) {
> > @@ -948,7 +959,7 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> >                       goto err_buf;
> >               }
> >
> > -             memcpy(page_address(page) + page_off,
> > +             memcpy(page_address(page) + page_off + pp_frag_offset_val,
> >                      page_address(p) + off, buflen);
> >               page_off += buflen;
> >               virtnet_put_page(rq, p);
> > @@ -1029,7 +1040,7 @@ static struct sk_buff *receive_small_xdp(struct net_device *dev,
> >                       SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> >               xdp_page = xdp_linearize_page(rq, &num_buf, page,
> >                                             offset, header_offset,
> > -                                           &tlen);
> > +                                           &tlen, NULL);
> >               if (!xdp_page)
> >                       goto err_xdp;
> >
> > @@ -1323,6 +1334,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >       unsigned int headroom = mergeable_ctx_to_headroom(ctx);
> >       struct page *xdp_page;
> >       unsigned int xdp_room;
> > +     unsigned int page_frag_offset = 0;
> >
> >       /* Transient failure which in theory could occur if
> >        * in-flight packets from before XDP was enabled reach
> > @@ -1356,7 +1368,8 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >               xdp_page = xdp_linearize_page(rq, num_buf,
> >                                             *page, offset,
> >                                             VIRTIO_XDP_HEADROOM,
> > -                                           len);
> > +                                           len,
> > +                                               &page_frag_offset);
> >               if (!xdp_page)
> >                       return NULL;
> >       } else {
> > @@ -1366,14 +1379,19 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >                       return NULL;
> >
> >               if (rq->page_pool)
> > -                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > +                     if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG)
> > +                             xdp_page = page_pool_dev_alloc_frag(rq->page_pool,
> > +                                                                 &page_frag_offset, PAGE_SIZE);
> > +                     else
> > +                             xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> >               else
> >                       xdp_page = alloc_page(GFP_ATOMIC);
> > +
> >               if (!xdp_page)
> >                       return NULL;
> >
> > -             memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM,
> > -                    page_address(*page) + offset, *len);
> > +             memcpy(page_address(xdp_page) + VIRTIO_XDP_HEADROOM +
> > +                             page_frag_offset, page_address(*page) + offset, *len);
> >       }
> >
> >       *frame_sz = PAGE_SIZE;
> > @@ -1382,7 +1400,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> >
> >       *page = xdp_page;
> >
> > -     return page_address(*page) + VIRTIO_XDP_HEADROOM;
> > +     return page_address(*page) + VIRTIO_XDP_HEADROOM + page_frag_offset;
> >  }
> >
> >  static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > @@ -1762,6 +1780,7 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >       void *ctx;
> >       int err;
> >       unsigned int len, hole;
> > +     unsigned int pp_frag_offset;
> >
> >       /* Extra tailroom is needed to satisfy XDP's assumption. This
> >        * means rx frags coalescing won't work, but consider we've
> > @@ -1769,13 +1788,29 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >        */
> >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> >       if (rq->page_pool) {
> > -             struct page *page;
> > +             if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG) {
> > +                     if (unlikely(!page_pool_dev_alloc_frag(rq->page_pool,
> > +                                                            &pp_frag_offset, len + room)))
> > +                             return -ENOMEM;
> > +                     buf = (char *)page_address(rq->page_pool->frag_page) +
> > +                             pp_frag_offset;
> > +                     buf += headroom; /* advance address leaving hole at front of pkt */
> > +                     hole = (PAGE_SIZE << rq->page_pool->p.order)
> > +                             - rq->page_pool->frag_offset;
> > +                     if (hole < len + room) {
> > +                             if (!headroom)
> > +                                     len += hole;
> > +                             rq->page_pool->frag_offset += hole;
> > +                     }
> > +             } else {
> > +                     struct page *page;
> >
> > -             page = page_pool_dev_alloc_pages(rq->page_pool);
> > -             if (unlikely(!page))
> > -                     return -ENOMEM;
> > -             buf = (char *)page_address(page);
> > -             buf += headroom; /* advance address leaving hole at front of pkt */
> > +                     page = page_pool_dev_alloc_pages(rq->page_pool);
> > +                     if (unlikely(!page))
> > +                             return -ENOMEM;
> > +                     buf = (char *)page_address(page);
> > +                     buf += headroom; /* advance address leaving hole at front of pkt */
> > +             }
> >       } else {
> >               if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> >                       return -ENOMEM;
> > @@ -3800,13 +3835,16 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
> >       struct virtio_device *vdev = rq->vq->vdev;
> >
> >       struct page_pool_params pp_params = {
> > -             .order = 0,
> > +             .order = page_pool_frag ? SKB_FRAG_PAGE_ORDER : 0,
> >               .pool_size = rq->vq->num_max,
> >               .nid = dev_to_node(vdev->dev.parent),
> >               .dev = vdev->dev.parent,
> >               .offset = 0,
> >       };
> >
> > +     if (page_pool_frag)
> > +             pp_params.flags |= PP_FLAG_PAGE_FRAG;
> > +
> >       rq->page_pool = page_pool_create(&pp_params);
> >       if (IS_ERR(rq->page_pool)) {
> >               dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > --
> > 2.31.1
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support
  2023-05-29  1:33   ` Yunsheng Lin
@ 2023-05-29  7:30     ` Liang Chen
  0 siblings, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-05-29  7:30 UTC (permalink / raw)
  To: Yunsheng Lin
  Cc: jasowang, mst, virtualization, netdev, linux-kernel, xuanzhuo,
	kuba, edumazet, davem, pabeni, alexander.duyck

On Mon, May 29, 2023 at 9:33 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2023/5/26 13:46, Liang Chen wrote:
>
> ...
>
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 99c0ca0c1781..ac40b8c66c59 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -32,7 +32,9 @@ module_param(gso, bool, 0444);
> >  module_param(napi_tx, bool, 0644);
> >
> >  static bool page_pool_enabled;
> > +static bool page_pool_frag;
> >  module_param(page_pool_enabled, bool, 0400);
> > +module_param(page_pool_frag, bool, 0400);
>
> The below patchset unifies the frag and non-frag page for
> page_pool_alloc_frag() API, perhaps it would simplify the
> driver's support of page pool.
>
> https://patchwork.kernel.org/project/netdevbpf/cover/20230526092616.40355-1-linyunsheng@huawei.com/
>

Thanks for the information and for the work on making page pool support
easier for drivers. I will rebase accordingly after it lands.

> >
>
> ...
>
> > @@ -1769,13 +1788,29 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> >        */
> >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> >       if (rq->page_pool) {
> > -             struct page *page;
> > +             if (rq->page_pool->p.flags & PP_FLAG_PAGE_FRAG) {
> > +                     if (unlikely(!page_pool_dev_alloc_frag(rq->page_pool,
> > +                                                            &pp_frag_offset, len + room)))
> > +                             return -ENOMEM;
> > +                     buf = (char *)page_address(rq->page_pool->frag_page) +
> > +                             pp_frag_offset;
> > +                     buf += headroom; /* advance address leaving hole at front of pkt */
> > +                     hole = (PAGE_SIZE << rq->page_pool->p.order)
> > +                             - rq->page_pool->frag_offset;
> > +                     if (hole < len + room) {
> > +                             if (!headroom)
> > +                                     len += hole;
> > +                             rq->page_pool->frag_offset += hole;
>
> Is there any reason why the driver need to be aware of page_pool->frag_offset?
> Isn't the page_pool_dev_alloc_frag() will drain the last page for you when
> page_pool_dev_alloc_frag() is called with size being 'len + room' later?
> One case I can think of needing this is to have an accurate truesize report
> for skb, but I am not sure it matters that much as 'struct page_frag_cache'
> and 'page_frag' implementation both have a similar problem.
>

Yeah, as you pointed out, page_pool_dev_alloc_frag will drain the page
itself, as does skb_page_frag_refill. This is trying to keep the logic
consistent with the non-page-pool case, where the hole is skipped and
included in the buffer length.
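In other words, the frag path keeps the same hole handling as before; a
simplified sketch of that logic, with variable names following the
quoted hunk:

	/* Space left in the current frag page after this allocation. */
	hole = (PAGE_SIZE << rq->page_pool->p.order) -
		rq->page_pool->frag_offset;
	if (hole < len + room) {
		/* Very likely no room for another buffer, so fold the
		 * leftover into this buffer rather than leaving an
		 * unusable tail. Skip the length adjustment when XDP
		 * headroom is reserved, since the XDP core assumes
		 * frame_size is PAGE_SIZE.
		 */
		if (!headroom)
			len += hole;
		rq->page_pool->frag_offset += hole;
	}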

> > +                     }
> > +             } else {
> > +                     struct page *page;
> >
> > -             page = page_pool_dev_alloc_pages(rq->page_pool);
> > -             if (unlikely(!page))
> > -                     return -ENOMEM;
> > -             buf = (char *)page_address(page);
> > -             buf += headroom; /* advance address leaving hole at front of pkt */
> > +                     page = page_pool_dev_alloc_pages(rq->page_pool);
> > +                     if (unlikely(!page))
> > +                             return -ENOMEM;
> > +                     buf = (char *)page_address(page);
> > +                     buf += headroom; /* advance address leaving hole at front of pkt */
> > +             }
> >       } else {
> >               if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> >                       return -ENOMEM;
> > @@ -3800,13 +3835,16 @@ static void virtnet_alloc_page_pool(struct receive_queue *rq)
> >       struct virtio_device *vdev = rq->vq->vdev;
> >
> >       struct page_pool_params pp_params = {
> > -             .order = 0,
> > +             .order = page_pool_frag ? SKB_FRAG_PAGE_ORDER : 0,
> >               .pool_size = rq->vq->num_max,
>
> If it using order SKB_FRAG_PAGE_ORDER page, perhaps pool_size does
> not have to be rq->vq->num_max? Even for order 0 page, perhaps the
> pool_size does not need to be as big as rq->vq->num_max?
>

Thanks for pointing this out! pool_size will be lowered to a more
appropriate value in v2.
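For what it is worth, pool_size only sizes the pool's internal recycle
ring, so it does not have to track the virtqueue size at all. One
possible shape, where the 256 cap is purely a placeholder and not a
value discussed in this thread:

	pp_params.pool_size = min_t(u32, rq->vq->num_max, 256);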


> >               .nid = dev_to_node(vdev->dev.parent),
> >               .dev = vdev->dev.parent,
> >               .offset = 0,
> >       };
> >
> > +     if (page_pool_frag)
> > +             pp_params.flags |= PP_FLAG_PAGE_FRAG;
> > +
> >       rq->page_pool = page_pool_create(&pp_params);
> >       if (IS_ERR(rq->page_pool)) {
> >               dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> >

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-29  7:27     ` Liang Chen
@ 2023-05-29  9:55       ` Michael S. Tsirkin
  2023-05-30  1:19         ` Liang Chen
  0 siblings, 1 reply; 56+ messages in thread
From: Michael S. Tsirkin @ 2023-05-29  9:55 UTC (permalink / raw)
  To: Liang Chen
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > The implementation at the moment uses one page per packet in both the
> > > normal and XDP path. In addition, introducing a module parameter to enable
> > > or disable the usage of page pool (disabled by default).
> > >
> > > In single-core vm testing environments, it gives a modest performance gain
> > > in the normal path.
> > >   Upstream codebase: 47.5 Gbits/sec
> > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > >
> > > In multi-core vm testing environments, The most significant performance
> > > gain is observed in XDP cpumap:
> > >   Upstream codebase: 1.38 Gbits/sec
> > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > >
> > > With this foundation, we can further integrate page pool fragmentation and
> > > DMA map/unmap support.
> > >
> > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> >
> > Why off by default?
> > I am guessing it sometimes has performance costs too?
> >
> >
> > What happens if we use page pool for big mode too?
> > The less modes we have the better...
> >
> >
> 
> Sure, now I believe it makes sense to enable it by default. When the
> packet size is very small, it reduces the likelihood of skb
> coalescing. But such cases are rare.

small packets are rare? These workloads are easy to create actually.
Pls try and include benchmark with small packet size.

> The usage of page pool for big mode is being evaluated now. Thanks!
> 
> > > ---
> > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > >
> > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > index c5dca0d92e64..99c0ca0c1781 100644
> > > --- a/drivers/net/virtio_net.c
> > > +++ b/drivers/net/virtio_net.c
> > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > >  module_param(gso, bool, 0444);
> > >  module_param(napi_tx, bool, 0644);
> > >
> > > +static bool page_pool_enabled;
> > > +module_param(page_pool_enabled, bool, 0400);
> > > +
> > >  /* FIXME: MTU in config. */
> > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > >  #define GOOD_COPY_LEN        128
> > > @@ -159,6 +162,9 @@ struct receive_queue {
> > >       /* Chain pages by the private ptr. */
> > >       struct page *pages;
> > >
> > > +     /* Page pool */
> > > +     struct page_pool *page_pool;
> > > +
> > >       /* Average packet length for mergeable receive buffers. */
> > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > >
> > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > >       return skb;
> > >  }
> > >
> > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > +{
> > > +     if (rq->page_pool)
> > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > +     else
> > > +             put_page(page);
> > > +}
> > > +
> > >  /* Called from bottom half context */
> > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > >                                  struct receive_queue *rq,
> > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > >       hdr = skb_vnet_hdr(skb);
> > >       memcpy(hdr, hdr_p, hdr_len);
> > >       if (page_to_free)
> > > -             put_page(page_to_free);
> > > +             virtnet_put_page(rq, page_to_free);
> > >
> > >       return skb;
> > >  }
> > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > >       return ret;
> > >  }
> > >
> > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > >  {
> > >       struct skb_shared_info *shinfo;
> > >       struct page *xdp_page;
> > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > -                     put_page(xdp_page);
> > > +                     virtnet_put_page(rq, xdp_page);
> > >               }
> > >       }
> > >  }
> > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > >               return NULL;
> > >
> > > -     page = alloc_page(GFP_ATOMIC);
> > > +     if (rq->page_pool)
> > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > +     else
> > > +             page = alloc_page(GFP_ATOMIC);
> > > +
> > >       if (!page)
> > >               return NULL;
> > >
> > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > >                * is sending packet larger than the MTU.
> > >                */
> > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > -                     put_page(p);
> > > +                     virtnet_put_page(rq, p);
> > >                       goto err_buf;
> > >               }
> > >
> > >               memcpy(page_address(page) + page_off,
> > >                      page_address(p) + off, buflen);
> > >               page_off += buflen;
> > > -             put_page(p);
> > > +             virtnet_put_page(rq, p);
> > >       }
> > >
> > >       /* Headroom does not contribute to packet length */
> > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > >       return page;
> > >  err_buf:
> > > -     __free_pages(page, 0);
> > > +     if (rq->page_pool)
> > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > +     else
> > > +             __free_pages(page, 0);
> > >       return NULL;
> > >  }
> > >
> > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > >               }
> > >               stats->bytes += len;
> > >               page = virt_to_head_page(buf);
> > > -             put_page(page);
> > > +             virtnet_put_page(rq, page);
> > >       }
> > >  }
> > >
> > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > >               cur_frag_size = truesize;
> > >               xdp_frags_truesz += cur_frag_size;
> > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > -                     put_page(page);
> > > +                     virtnet_put_page(rq, page);
> > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > >                                dev->name, len, (unsigned long)(truesize - room));
> > >                       dev->stats.rx_length_errors++;
> > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > >       return 0;
> > >
> > >  err:
> > > -     put_xdp_frags(xdp);
> > > +     put_xdp_frags(xdp, rq);
> > >       return -EINVAL;
> > >  }
> > >
> > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > >               if (*len + xdp_room > PAGE_SIZE)
> > >                       return NULL;
> > >
> > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > +             if (rq->page_pool)
> > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > +             else
> > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > >               if (!xdp_page)
> > >                       return NULL;
> > >
> > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > >
> > >       *frame_sz = PAGE_SIZE;
> > >
> > > -     put_page(*page);
> > > +     virtnet_put_page(rq, *page);
> > >
> > >       *page = xdp_page;
> > >
> > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > >               if (unlikely(!head_skb))
> > >                       break;
> > > +             if (rq->page_pool)
> > > +                     skb_mark_for_recycle(head_skb);
> > >               return head_skb;
> > >
> > >       case XDP_TX:
> > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > >               break;
> > >       }
> > >
> > > -     put_xdp_frags(&xdp);
> > > +     put_xdp_frags(&xdp, rq);
> > >
> > >  err_xdp:
> > > -     put_page(page);
> > > +     virtnet_put_page(rq, page);
> > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > >
> > >       stats->xdp_drops++;
> > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > >       curr_skb = head_skb;
> > >
> > > +     if (rq->page_pool)
> > > +             skb_mark_for_recycle(curr_skb);
> > > +
> > >       if (unlikely(!curr_skb))
> > >               goto err_skb;
> > >       while (--num_buf) {
> > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >                       curr_skb = nskb;
> > >                       head_skb->truesize += nskb->truesize;
> > >                       num_skb_frags = 0;
> > > +                     if (rq->page_pool)
> > > +                             skb_mark_for_recycle(curr_skb);
> > >               }
> > >               if (curr_skb != head_skb) {
> > >                       head_skb->data_len += len;
> > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >               }
> > >               offset = buf - page_address(page);
> > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > -                     put_page(page);
> > > +                     virtnet_put_page(rq, page);
> > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > >                                            len, truesize);
> > >               } else {
> > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > >       return head_skb;
> > >
> > >  err_skb:
> > > -     put_page(page);
> > > +     virtnet_put_page(rq, page);
> > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > >
> > >  err_buf:
> > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > >        * disabled GSO for XDP, it won't be a big issue.
> > >        */
> > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > -             return -ENOMEM;
> > > +     if (rq->page_pool) {
> > > +             struct page *page;
> > >
> > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > -     get_page(alloc_frag->page);
> > > -     alloc_frag->offset += len + room;
> > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > -     if (hole < len + room) {
> > > -             /* To avoid internal fragmentation, if there is very likely not
> > > -              * enough space for another buffer, add the remaining space to
> > > -              * the current buffer.
> > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > -              */
> > > -             if (!headroom)
> > > -                     len += hole;
> > > -             alloc_frag->offset += hole;
> > > -     }
> > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > +             if (unlikely(!page))
> > > +                     return -ENOMEM;
> > > +             buf = (char *)page_address(page);
> > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > +     } else {
> > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > +                     return -ENOMEM;
> > >
> > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > +             get_page(alloc_frag->page);
> > > +             alloc_frag->offset += len + room;
> > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > +             if (hole < len + room) {
> > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > +                      * enough space for another buffer, add the remaining space to
> > > +                      * the current buffer.
> > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > +                      */
> > > +                     if (!headroom)
> > > +                             len += hole;
> > > +                     alloc_frag->offset += hole;
> > > +             }
> > > +     }
> > >       sg_init_one(rq->sg, buf, len);
> > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > >       if (err < 0)
> > > -             put_page(virt_to_head_page(buf));
> > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > >
> > >       return err;
> > >  }
> > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > >       if (err < 0)
> > >               return err;
> > >
> > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > +     if (vi->rq[qp_index].page_pool)
> > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > +                                              MEM_TYPE_PAGE_POOL,
> > > +                                              vi->rq[qp_index].page_pool);
> > > +     else
> > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > +                                              MEM_TYPE_PAGE_SHARED,
> > > +                                              NULL);
> > > +
> > >       if (err < 0)
> > >               goto err_xdp_reg_mem_model;
> > >
> > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > >                                               virtnet_sq_stats_desc[j].desc);
> > >               }
> > > +             page_pool_ethtool_stats_get_strings(p);
> > >               break;
> > >       }
> > >  }
> > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > >       switch (sset) {
> > >       case ETH_SS_STATS:
> > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > -                                            VIRTNET_SQ_STATS_LEN);
> > > +                                            VIRTNET_SQ_STATS_LEN +
> > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > >       default:
> > >               return -EOPNOTSUPP;
> > >       }
> > >  }
> > >
> > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > +{
> > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > +     struct page_pool_stats pp_stats = {};
> > > +     int i;
> > > +
> > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > +             if (!vi->rq[i].page_pool)
> > > +                     continue;
> > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > +     }
> > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > +}
> > > +
> > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > >                                     struct ethtool_stats *stats, u64 *data)
> > >  {
> > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > >               idx += VIRTNET_SQ_STATS_LEN;
> > >       }
> > > +
> > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > >  }
> > >
> > >  static void virtnet_get_channels(struct net_device *dev,
> > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > >               __netif_napi_del(&vi->rq[i].napi);
> > >               __netif_napi_del(&vi->sq[i].napi);
> > > +             if (vi->rq[i].page_pool)
> > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > >       }
> > >
> > >       /* We called __netif_napi_del(),
> > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > >       struct virtnet_info *vi = vq->vdev->priv;
> > >       int i = vq2rxq(vq);
> > >
> > > -     if (vi->mergeable_rx_bufs)
> > > -             put_page(virt_to_head_page(buf));
> > > -     else if (vi->big_packets)
> > > +     if (vi->mergeable_rx_bufs) {
> > > +             if (vi->rq[i].page_pool) {
> > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > +                                             virt_to_head_page(buf),
> > > +                                             true);
> > > +             } else {
> > > +                     put_page(virt_to_head_page(buf));
> > > +             }
> > > +     } else if (vi->big_packets) {
> > >               give_pages(&vi->rq[i], buf);
> > > -     else
> > > +     } else {
> > >               put_page(virt_to_head_page(buf));
> > > +     }
> > >  }
> > >
> > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > >       virtnet_free_queues(vi);
> > >  }
> > >
> > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > +{
> > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > +
> > > +     struct page_pool_params pp_params = {
> > > +             .order = 0,
> > > +             .pool_size = rq->vq->num_max,
> > > +             .nid = dev_to_node(vdev->dev.parent),
> > > +             .dev = vdev->dev.parent,
> > > +             .offset = 0,
> > > +     };
> > > +
> > > +     rq->page_pool = page_pool_create(&pp_params);
> > > +     if (IS_ERR(rq->page_pool)) {
> > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > +                      PTR_ERR(rq->page_pool));
> > > +             rq->page_pool = NULL;
> > > +     }
> > > +}
> > > +
> > >  /* How large should a single buffer be so a queue full of these can fit at
> > >   * least one full packet?
> > >   * Logic below assumes the mergeable buffer header is used.
> > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > +
> > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > +             else
> > > +                     dev_warn(&vi->vdev->dev,
> > > +                              "page pool only support mergeable mode\n");
> > > +
> > >       }
> > >
> > >       /* run here: ret == 0. */
> > > --
> > > 2.31.1
> >


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-29  9:55       ` Michael S. Tsirkin
@ 2023-05-30  1:19         ` Liang Chen
  2023-06-07  9:08           ` Liang Chen
  0 siblings, 1 reply; 56+ messages in thread
From: Liang Chen @ 2023-05-30  1:19 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > The implementation at the moment uses one page per packet in both the
> > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > or disable the usage of page pool (disabled by default).
> > > >
> > > > In single-core vm testing environments, it gives a modest performance gain
> > > > in the normal path.
> > > >   Upstream codebase: 47.5 Gbits/sec
> > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > >
> > > > In multi-core vm testing environments, The most significant performance
> > > > gain is observed in XDP cpumap:
> > > >   Upstream codebase: 1.38 Gbits/sec
> > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > >
> > > > With this foundation, we can further integrate page pool fragmentation and
> > > > DMA map/unmap support.
> > > >
> > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > >
> > > Why off by default?
> > > I am guessing it sometimes has performance costs too?
> > >
> > >
> > > What happens if we use page pool for big mode too?
> > > The less modes we have the better...
> > >
> > >
> >
> > Sure, now I believe it makes sense to enable it by default. When the
> > packet size is very small, it reduces the likelihood of skb
> > coalescing. But such cases are rare.
>
> small packets are rare? These workloads are easy to create actually.
> Pls try and include benchmark with small packet size.
>

Sure, Thanks!
> > The usage of page pool for big mode is being evaluated now. Thanks!
> >
> > > > ---
> > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > >
> > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > --- a/drivers/net/virtio_net.c
> > > > +++ b/drivers/net/virtio_net.c
> > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > >  module_param(gso, bool, 0444);
> > > >  module_param(napi_tx, bool, 0644);
> > > >
> > > > +static bool page_pool_enabled;
> > > > +module_param(page_pool_enabled, bool, 0400);
> > > > +
> > > >  /* FIXME: MTU in config. */
> > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > >  #define GOOD_COPY_LEN        128
> > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > >       /* Chain pages by the private ptr. */
> > > >       struct page *pages;
> > > >
> > > > +     /* Page pool */
> > > > +     struct page_pool *page_pool;
> > > > +
> > > >       /* Average packet length for mergeable receive buffers. */
> > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > >
> > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > >       return skb;
> > > >  }
> > > >
> > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > +{
> > > > +     if (rq->page_pool)
> > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > +     else
> > > > +             put_page(page);
> > > > +}
> > > > +
> > > >  /* Called from bottom half context */
> > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > >                                  struct receive_queue *rq,
> > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > >       hdr = skb_vnet_hdr(skb);
> > > >       memcpy(hdr, hdr_p, hdr_len);
> > > >       if (page_to_free)
> > > > -             put_page(page_to_free);
> > > > +             virtnet_put_page(rq, page_to_free);
> > > >
> > > >       return skb;
> > > >  }
> > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > >       return ret;
> > > >  }
> > > >
> > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > >  {
> > > >       struct skb_shared_info *shinfo;
> > > >       struct page *xdp_page;
> > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > -                     put_page(xdp_page);
> > > > +                     virtnet_put_page(rq, xdp_page);
> > > >               }
> > > >       }
> > > >  }
> > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > >               return NULL;
> > > >
> > > > -     page = alloc_page(GFP_ATOMIC);
> > > > +     if (rq->page_pool)
> > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > +     else
> > > > +             page = alloc_page(GFP_ATOMIC);
> > > > +
> > > >       if (!page)
> > > >               return NULL;
> > > >
> > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > >                * is sending packet larger than the MTU.
> > > >                */
> > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > -                     put_page(p);
> > > > +                     virtnet_put_page(rq, p);
> > > >                       goto err_buf;
> > > >               }
> > > >
> > > >               memcpy(page_address(page) + page_off,
> > > >                      page_address(p) + off, buflen);
> > > >               page_off += buflen;
> > > > -             put_page(p);
> > > > +             virtnet_put_page(rq, p);
> > > >       }
> > > >
> > > >       /* Headroom does not contribute to packet length */
> > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > >       return page;
> > > >  err_buf:
> > > > -     __free_pages(page, 0);
> > > > +     if (rq->page_pool)
> > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > +     else
> > > > +             __free_pages(page, 0);
> > > >       return NULL;
> > > >  }
> > > >
> > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > >               }
> > > >               stats->bytes += len;
> > > >               page = virt_to_head_page(buf);
> > > > -             put_page(page);
> > > > +             virtnet_put_page(rq, page);
> > > >       }
> > > >  }
> > > >
> > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > >               cur_frag_size = truesize;
> > > >               xdp_frags_truesz += cur_frag_size;
> > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > -                     put_page(page);
> > > > +                     virtnet_put_page(rq, page);
> > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > >                       dev->stats.rx_length_errors++;
> > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > >       return 0;
> > > >
> > > >  err:
> > > > -     put_xdp_frags(xdp);
> > > > +     put_xdp_frags(xdp, rq);
> > > >       return -EINVAL;
> > > >  }
> > > >
> > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > >               if (*len + xdp_room > PAGE_SIZE)
> > > >                       return NULL;
> > > >
> > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > +             if (rq->page_pool)
> > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > +             else
> > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > >               if (!xdp_page)
> > > >                       return NULL;
> > > >
> > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > >
> > > >       *frame_sz = PAGE_SIZE;
> > > >
> > > > -     put_page(*page);
> > > > +     virtnet_put_page(rq, *page);
> > > >
> > > >       *page = xdp_page;
> > > >
> > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > >               if (unlikely(!head_skb))
> > > >                       break;
> > > > +             if (rq->page_pool)
> > > > +                     skb_mark_for_recycle(head_skb);
> > > >               return head_skb;
> > > >
> > > >       case XDP_TX:
> > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > >               break;
> > > >       }
> > > >
> > > > -     put_xdp_frags(&xdp);
> > > > +     put_xdp_frags(&xdp, rq);
> > > >
> > > >  err_xdp:
> > > > -     put_page(page);
> > > > +     virtnet_put_page(rq, page);
> > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > >
> > > >       stats->xdp_drops++;
> > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > >       curr_skb = head_skb;
> > > >
> > > > +     if (rq->page_pool)
> > > > +             skb_mark_for_recycle(curr_skb);
> > > > +
> > > >       if (unlikely(!curr_skb))
> > > >               goto err_skb;
> > > >       while (--num_buf) {
> > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >                       curr_skb = nskb;
> > > >                       head_skb->truesize += nskb->truesize;
> > > >                       num_skb_frags = 0;
> > > > +                     if (rq->page_pool)
> > > > +                             skb_mark_for_recycle(curr_skb);
> > > >               }
> > > >               if (curr_skb != head_skb) {
> > > >                       head_skb->data_len += len;
> > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >               }
> > > >               offset = buf - page_address(page);
> > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > -                     put_page(page);
> > > > +                     virtnet_put_page(rq, page);
> > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > >                                            len, truesize);
> > > >               } else {
> > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > >       return head_skb;
> > > >
> > > >  err_skb:
> > > > -     put_page(page);
> > > > +     virtnet_put_page(rq, page);
> > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > >
> > > >  err_buf:
> > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > >        * disabled GSO for XDP, it won't be a big issue.
> > > >        */
> > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > -             return -ENOMEM;
> > > > +     if (rq->page_pool) {
> > > > +             struct page *page;
> > > >
> > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > -     get_page(alloc_frag->page);
> > > > -     alloc_frag->offset += len + room;
> > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > -     if (hole < len + room) {
> > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > -              * enough space for another buffer, add the remaining space to
> > > > -              * the current buffer.
> > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > -              */
> > > > -             if (!headroom)
> > > > -                     len += hole;
> > > > -             alloc_frag->offset += hole;
> > > > -     }
> > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > +             if (unlikely(!page))
> > > > +                     return -ENOMEM;
> > > > +             buf = (char *)page_address(page);
> > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > +     } else {
> > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > +                     return -ENOMEM;
> > > >
> > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > +             get_page(alloc_frag->page);
> > > > +             alloc_frag->offset += len + room;
> > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > +             if (hole < len + room) {
> > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > +                      * enough space for another buffer, add the remaining space to
> > > > +                      * the current buffer.
> > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > +                      */
> > > > +                     if (!headroom)
> > > > +                             len += hole;
> > > > +                     alloc_frag->offset += hole;
> > > > +             }
> > > > +     }
> > > >       sg_init_one(rq->sg, buf, len);
> > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > >       if (err < 0)
> > > > -             put_page(virt_to_head_page(buf));
> > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > >
> > > >       return err;
> > > >  }
> > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > >       if (err < 0)
> > > >               return err;
> > > >
> > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > +     if (vi->rq[qp_index].page_pool)
> > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > +                                              vi->rq[qp_index].page_pool);
> > > > +     else
> > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > +                                              NULL);
> > > > +
> > > >       if (err < 0)
> > > >               goto err_xdp_reg_mem_model;
> > > >
> > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > >                                               virtnet_sq_stats_desc[j].desc);
> > > >               }
> > > > +             page_pool_ethtool_stats_get_strings(p);
> > > >               break;
> > > >       }
> > > >  }
> > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > >       switch (sset) {
> > > >       case ETH_SS_STATS:
> > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > >       default:
> > > >               return -EOPNOTSUPP;
> > > >       }
> > > >  }
> > > >
> > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > +{
> > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > +     struct page_pool_stats pp_stats = {};
> > > > +     int i;
> > > > +
> > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > +             if (!vi->rq[i].page_pool)
> > > > +                     continue;
> > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > +     }
> > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > +}
> > > > +
> > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > >                                     struct ethtool_stats *stats, u64 *data)
> > > >  {
> > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > >       }
> > > > +
> > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > >  }
> > > >
> > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > >               __netif_napi_del(&vi->rq[i].napi);
> > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > +             if (vi->rq[i].page_pool)
> > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > >       }
> > > >
> > > >       /* We called __netif_napi_del(),
> > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > >       int i = vq2rxq(vq);
> > > >
> > > > -     if (vi->mergeable_rx_bufs)
> > > > -             put_page(virt_to_head_page(buf));
> > > > -     else if (vi->big_packets)
> > > > +     if (vi->mergeable_rx_bufs) {
> > > > +             if (vi->rq[i].page_pool) {
> > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > +                                             virt_to_head_page(buf),
> > > > +                                             true);
> > > > +             } else {
> > > > +                     put_page(virt_to_head_page(buf));
> > > > +             }
> > > > +     } else if (vi->big_packets) {
> > > >               give_pages(&vi->rq[i], buf);
> > > > -     else
> > > > +     } else {
> > > >               put_page(virt_to_head_page(buf));
> > > > +     }
> > > >  }
> > > >
> > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > >       virtnet_free_queues(vi);
> > > >  }
> > > >
> > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > +{
> > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > +
> > > > +     struct page_pool_params pp_params = {
> > > > +             .order = 0,
> > > > +             .pool_size = rq->vq->num_max,
> > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > +             .dev = vdev->dev.parent,
> > > > +             .offset = 0,
> > > > +     };
> > > > +
> > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > +     if (IS_ERR(rq->page_pool)) {
> > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > +                      PTR_ERR(rq->page_pool));
> > > > +             rq->page_pool = NULL;
> > > > +     }
> > > > +}
> > > > +
> > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > >   * least one full packet?
> > > >   * Logic below assumes the mergeable buffer header is used.
> > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > +
> > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > +             else
> > > > +                     dev_warn(&vi->vdev->dev,
> > > > +                              "page pool only support mergeable mode\n");
> > > > +
> > > >       }
> > > >
> > > >       /* run here: ret == 0. */
> > > > --
> > > > 2.31.1
> > >
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-29  7:28         ` Liang Chen
@ 2023-05-31  3:10           ` Xuan Zhuo
  2023-06-07  9:11             ` Liang Chen
  0 siblings, 1 reply; 56+ messages in thread
From: Xuan Zhuo @ 2023-05-31  3:10 UTC (permalink / raw)
  To: Liang Chen
  Cc: Jason Wang, virtualization, netdev, linux-kernel, kuba, edumazet,
	davem, pabeni, alexander.duyck, Michael S. Tsirkin

On Mon, 29 May 2023 15:28:17 +0800, Liang Chen <liangchen.linux@gmail.com> wrote:
> On Sun, May 28, 2023 at 2:40 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Sat, May 27, 2023 at 08:35:01PM +0800, Liang Chen wrote:
> > > On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> > > > >
> > > > > The implementation at the moment uses one page per packet in both the
> > > > > normal and XDP path.
> > > >
> > > > It's better to explain why we need a page pool and how it can help the
> > > > performance.
> > > >
> > >
> > > Sure, I will include that on v2.
> > > > > In addition, introducing a module parameter to enable
> > > > > or disable the usage of page pool (disabled by default).
> > > >
> > > > If page pool wins for most of the cases, any reason to disable it by default?
> > > >
> > >
> > > Thank you for raising the point. It does make sense to enable it by default.
> >
> > I'd like to see more benchmarks pls then, with a variety of packet
> > sizes, udp and tcp.
> >
>
> Sure, more benchmarks will be provided. Thanks.


I think so.

I did this, but I did not find any improvement. So I gave up on it.

Thanks.


>
>
> > > > >
> > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > in the normal path.
> > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > >
> > > > > In multi-core vm testing environments, The most significant performance
> > > > > gain is observed in XDP cpumap:
> > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > >
> > > > Please show more details on the test. E.g which kinds of tests have
> > > > you measured?
> > > >
> > > > Btw, it would be better to measure PPS as well.
> > > >
> > >
> > > Sure. It will be added on v2.
> > > > >
> > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > DMA map/unmap support.
> > > > >
> > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > ---
> > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > >
> > > > I believe we should make virtio-net to select CONFIG_PAGE_POOL or do
> > > > the ifdef tricks at least.
> > > >
> > >
> > > Sure. it will be done on v2.
> > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > --- a/drivers/net/virtio_net.c
> > > > > +++ b/drivers/net/virtio_net.c
> > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > >  module_param(gso, bool, 0444);
> > > > >  module_param(napi_tx, bool, 0644);
> > > > >
> > > > > +static bool page_pool_enabled;
> > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > +
> > > > >  /* FIXME: MTU in config. */
> > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > >  #define GOOD_COPY_LEN  128
> > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > >         /* Chain pages by the private ptr. */
> > > > >         struct page *pages;
> > > > >
> > > > > +       /* Page pool */
> > > > > +       struct page_pool *page_pool;
> > > > > +
> > > > >         /* Average packet length for mergeable receive buffers. */
> > > > >         struct ewma_pkt_len mrg_avg_pkt_len;
> > > > >
> > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > >         return skb;
> > > > >  }
> > > > >
> > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > +{
> > > > > +       if (rq->page_pool)
> > > > > +               page_pool_put_full_page(rq->page_pool, page, true);
> > > > > +       else
> > > > > +               put_page(page);
> > > > > +}
> > > > > +
> > > > >  /* Called from bottom half context */
> > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > >                                    struct receive_queue *rq,
> > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > >         hdr = skb_vnet_hdr(skb);
> > > > >         memcpy(hdr, hdr_p, hdr_len);
> > > > >         if (page_to_free)
> > > > > -               put_page(page_to_free);
> > > > > +               virtnet_put_page(rq, page_to_free);
> > > > >
> > > > >         return skb;
> > > > >  }
> > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > >         return ret;
> > > > >  }
> > > > >
> > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > >  {
> > > >
> > > > rq could be fetched from xdp_rxq_info?
> > >
> > > Yeah, it has the queue_index there.
> > > >
> > > > >         struct skb_shared_info *shinfo;
> > > > >         struct page *xdp_page;
> > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > >                 shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > >                 for (i = 0; i < shinfo->nr_frags; i++) {
> > > > >                         xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > -                       put_page(xdp_page);
> > > > > +                       virtnet_put_page(rq, xdp_page);
> > > > >                 }
> > > > >         }
> > > > >  }
> > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > >         if (page_off + *len + tailroom > PAGE_SIZE)
> > > > >                 return NULL;
> > > > >
> > > > > -       page = alloc_page(GFP_ATOMIC);
> > > > > +       if (rq->page_pool)
> > > > > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > +       else
> > > > > +               page = alloc_page(GFP_ATOMIC);
> > > > > +
> > > > >         if (!page)
> > > > >                 return NULL;
> > > > >
> > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > >                  * is sending packet larger than the MTU.
> > > > >                  */
> > > > >                 if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > -                       put_page(p);
> > > > > +                       virtnet_put_page(rq, p);
> > > > >                         goto err_buf;
> > > > >                 }
> > > > >
> > > > >                 memcpy(page_address(page) + page_off,
> > > > >                        page_address(p) + off, buflen);
> > > > >                 page_off += buflen;
> > > > > -               put_page(p);
> > > > > +               virtnet_put_page(rq, p);
> > > > >         }
> > > > >
> > > > >         /* Headroom does not contribute to packet length */
> > > > >         *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > >         return page;
> > > > >  err_buf:
> > > > > -       __free_pages(page, 0);
> > > > > +       if (rq->page_pool)
> > > > > +               page_pool_put_full_page(rq->page_pool, page, true);
> > > > > +       else
> > > > > +               __free_pages(page, 0);
> > > > >         return NULL;
> > > > >  }
> > > > >
> > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > >                 }
> > > > >                 stats->bytes += len;
> > > > >                 page = virt_to_head_page(buf);
> > > > > -               put_page(page);
> > > > > +               virtnet_put_page(rq, page);
> > > > >         }
> > > > >  }
> > > > >
> > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > >                 cur_frag_size = truesize;
> > > > >                 xdp_frags_truesz += cur_frag_size;
> > > > >                 if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > -                       put_page(page);
> > > > > +                       virtnet_put_page(rq, page);
> > > > >                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > >                                  dev->name, len, (unsigned long)(truesize - room));
> > > > >                         dev->stats.rx_length_errors++;
> > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > >         return 0;
> > > > >
> > > > >  err:
> > > > > -       put_xdp_frags(xdp);
> > > > > +       put_xdp_frags(xdp, rq);
> > > > >         return -EINVAL;
> > > > >  }
> > > > >
> > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > >                 if (*len + xdp_room > PAGE_SIZE)
> > > > >                         return NULL;
> > > > >
> > > > > -               xdp_page = alloc_page(GFP_ATOMIC);
> > > > > +               if (rq->page_pool)
> > > > > +                       xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > +               else
> > > > > +                       xdp_page = alloc_page(GFP_ATOMIC);
> > > > >                 if (!xdp_page)
> > > > >                         return NULL;
> > > > >
> > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > >
> > > > >         *frame_sz = PAGE_SIZE;
> > > > >
> > > > > -       put_page(*page);
> > > > > +       virtnet_put_page(rq, *page);
> > > > >
> > > > >         *page = xdp_page;
> > > > >
> > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > >                 head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > >                 if (unlikely(!head_skb))
> > > > >                         break;
> > > > > +               if (rq->page_pool)
> > > > > +                       skb_mark_for_recycle(head_skb);
> > > > >                 return head_skb;
> > > > >
> > > > >         case XDP_TX:
> > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > >                 break;
> > > > >         }
> > > > >
> > > > > -       put_xdp_frags(&xdp);
> > > > > +       put_xdp_frags(&xdp, rq);
> > > > >
> > > > >  err_xdp:
> > > > > -       put_page(page);
> > > > > +       virtnet_put_page(rq, page);
> > > > >         mergeable_buf_free(rq, num_buf, dev, stats);
> > > > >
> > > > >         stats->xdp_drops++;
> > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >         head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > >         curr_skb = head_skb;
> > > > >
> > > > > +       if (rq->page_pool)
> > > > > +               skb_mark_for_recycle(curr_skb);
> > > > > +
> > > > >         if (unlikely(!curr_skb))
> > > > >                 goto err_skb;
> > > > >         while (--num_buf) {
> > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >                         curr_skb = nskb;
> > > > >                         head_skb->truesize += nskb->truesize;
> > > > >                         num_skb_frags = 0;
> > > > > +                       if (rq->page_pool)
> > > > > +                               skb_mark_for_recycle(curr_skb);
> > > > >                 }
> > > > >                 if (curr_skb != head_skb) {
> > > > >                         head_skb->data_len += len;
> > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >                 }
> > > > >                 offset = buf - page_address(page);
> > > > >                 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > -                       put_page(page);
> > > > > +                       virtnet_put_page(rq, page);
> > > >
> > > > I wonder why we can't do this during buffer allocation like other drivers?
> > > >
> > >
> > > Sorry, I don't quite understand the point here. Would you please
> > > elaborate a bit more?
> > > > >                         skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > >                                              len, truesize);
> > > > >                 } else {
> > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >         return head_skb;
> > > > >
> > > > >  err_skb:
> > > > > -       put_page(page);
> > > > > +       virtnet_put_page(rq, page);
> > > > >         mergeable_buf_free(rq, num_buf, dev, stats);
> > > > >
> > > > >  err_buf:
> > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > >          * disabled GSO for XDP, it won't be a big issue.
> > > > >          */
> > > > >         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > -       if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > -               return -ENOMEM;
> > > > > +       if (rq->page_pool) {
> > > > > +               struct page *page;
> > > > >
> > > > > -       buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > -       buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > -       get_page(alloc_frag->page);
> > > > > -       alloc_frag->offset += len + room;
> > > > > -       hole = alloc_frag->size - alloc_frag->offset;
> > > > > -       if (hole < len + room) {
> > > > > -               /* To avoid internal fragmentation, if there is very likely not
> > > > > -                * enough space for another buffer, add the remaining space to
> > > > > -                * the current buffer.
> > > > > -                * XDP core assumes that frame_size of xdp_buff and the length
> > > > > -                * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > -                */
> > > > > -               if (!headroom)
> > > > > -                       len += hole;
> > > > > -               alloc_frag->offset += hole;
> > > > > -       }
> > > > > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > +               if (unlikely(!page))
> > > > > +                       return -ENOMEM;
> > > > > +               buf = (char *)page_address(page);
> > > > > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > +       } else {
> > > > > +               if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > >
> > > > Why not simply use a helper like virtnet_page_frag_refill() and add
> > > > the page_pool allocation logic there? It helps to reduce the
> > > > changeset.
> > > >
> > >
> > > Sure. Will do that on v2.
> > > > > +                       return -ENOMEM;
> > > > >
> > > > > +               buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > +               get_page(alloc_frag->page);
> > > > > +               alloc_frag->offset += len + room;
> > > > > +               hole = alloc_frag->size - alloc_frag->offset;
> > > > > +               if (hole < len + room) {
> > > > > +                       /* To avoid internal fragmentation, if there is very likely not
> > > > > +                        * enough space for another buffer, add the remaining space to
> > > > > +                        * the current buffer.
> > > > > +                        * XDP core assumes that frame_size of xdp_buff and the length
> > > > > +                        * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > +                        */
> > > > > +                       if (!headroom)
> > > > > +                               len += hole;
> > > > > +                       alloc_frag->offset += hole;
> > > > > +               }
> > > > > +       }
> > > > >         sg_init_one(rq->sg, buf, len);
> > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > >         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > >         if (err < 0)
> > > > > -               put_page(virt_to_head_page(buf));
> > > > > +               virtnet_put_page(rq, virt_to_head_page(buf));
> > > > >
> > > > >         return err;
> > > > >  }
> > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > >         if (err < 0)
> > > > >                 return err;
> > > > >
> > > > > -       err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > -                                        MEM_TYPE_PAGE_SHARED, NULL);
> > > > > +       if (vi->rq[qp_index].page_pool)
> > > > > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > +                                                MEM_TYPE_PAGE_POOL,
> > > > > +                                                vi->rq[qp_index].page_pool);
> > > > > +       else
> > > > > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > +                                                MEM_TYPE_PAGE_SHARED,
> > > > > +                                                NULL);
> > > > > +
> > > > >         if (err < 0)
> > > > >                 goto err_xdp_reg_mem_model;
> > > > >
> > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > >                                 ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > >                                                 virtnet_sq_stats_desc[j].desc);
> > > > >                 }
> > > > > +               page_pool_ethtool_stats_get_strings(p);
> > > > >                 break;
> > > > >         }
> > > > >  }
> > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > >         switch (sset) {
> > > > >         case ETH_SS_STATS:
> > > > >                 return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > -                                              VIRTNET_SQ_STATS_LEN);
> > > > > +                                              VIRTNET_SQ_STATS_LEN +
> > > > > +                                               (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > +                                                page_pool_ethtool_stats_get_count() : 0));
> > > > >         default:
> > > > >                 return -EOPNOTSUPP;
> > > > >         }
> > > > >  }
> > > > >
> > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > +{
> > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > +       struct virtnet_info *vi = netdev_priv(dev);
> > > > > +       struct page_pool_stats pp_stats = {};
> > > > > +       int i;
> > > > > +
> > > > > +       for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > +               if (!vi->rq[i].page_pool)
> > > > > +                       continue;
> > > > > +               page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > +       }
> > > > > +       page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > +}
> > > > > +
> > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > >                                       struct ethtool_stats *stats, u64 *data)
> > > > >  {
> > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > >                 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > >                 idx += VIRTNET_SQ_STATS_LEN;
> > > > >         }
> > > > > +
> > > > > +       virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > >  }
> > > > >
> > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > +               if (vi->rq[i].page_pool)
> > > > > +                       page_pool_destroy(vi->rq[i].page_pool);
> > > > >         }
> > > > >
> > > > >         /* We called __netif_napi_del(),
> > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > >         struct virtnet_info *vi = vq->vdev->priv;
> > > > >         int i = vq2rxq(vq);
> > > > >
> > > > > -       if (vi->mergeable_rx_bufs)
> > > > > -               put_page(virt_to_head_page(buf));
> > > > > -       else if (vi->big_packets)
> > > > > +       if (vi->mergeable_rx_bufs) {
> > > > > +               if (vi->rq[i].page_pool) {
> > > > > +                       page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > +                                               virt_to_head_page(buf),
> > > > > +                                               true);
> > > > > +               } else {
> > > > > +                       put_page(virt_to_head_page(buf));
> > > > > +               }
> > > > > +       } else if (vi->big_packets) {
> > > > >                 give_pages(&vi->rq[i], buf);
> > > >
> > > > Any reason why only mergeable mode was modified but not small and big?
> > > >
> > > > Thanks
> > > >
> > >
> > > Big mode uses the page chain to recycle pages, hence the use of the
> > > buffer page's "private" field. I will take a further look into that to
> > > see whether it is better to use the page pool in those cases as well. Thanks!
> > >
> > >
> > >
> > > > > -       else
> > > > > +       } else {
> > > > >                 put_page(virt_to_head_page(buf));
> > > > > +       }
> > > > >  }
> > > > >
> > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > >         virtnet_free_queues(vi);
> > > > >  }
> > > > >
> > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > +{
> > > > > +       struct virtio_device *vdev = rq->vq->vdev;
> > > > > +
> > > > > +       struct page_pool_params pp_params = {
> > > > > +               .order = 0,
> > > > > +               .pool_size = rq->vq->num_max,
> > > > > +               .nid = dev_to_node(vdev->dev.parent),
> > > > > +               .dev = vdev->dev.parent,
> > > > > +               .offset = 0,
> > > > > +       };
> > > > > +
> > > > > +       rq->page_pool = page_pool_create(&pp_params);
> > > > > +       if (IS_ERR(rq->page_pool)) {
> > > > > +               dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > +                        PTR_ERR(rq->page_pool));
> > > > > +               rq->page_pool = NULL;
> > > > > +       }
> > > > > +}
> > > > > +
> > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > >   * least one full packet?
> > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > >                 vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > >                 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > >                 vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > +
> > > > > +               if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > +                       virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > +               else
> > > > > +                       dev_warn(&vi->vdev->dev,
> > > > > +                                "page pool only support mergeable mode\n");
> > > > > +
> > > > >         }
> > > > >
> > > > >         /* run here: ret == 0. */
> > > > > --
> > > > > 2.31.1
> > > > >
> > > >
> >

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-30  1:19         ` Liang Chen
@ 2023-06-07  9:08           ` Liang Chen
  2023-06-07  9:35             ` Xuan Zhuo
  2023-06-07 20:17             ` Michael S. Tsirkin
  0 siblings, 2 replies; 56+ messages in thread
From: Liang Chen @ 2023-06-07  9:08 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
>
> On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > The implementation at the moment uses one page per packet in both the
> > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > or disable the usage of page pool (disabled by default).
> > > > >
> > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > in the normal path.
> > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > >
> > > > > In multi-core vm testing environments, The most significant performance
> > > > > gain is observed in XDP cpumap:
> > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > >
> > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > DMA map/unmap support.
> > > > >
> > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > >
> > > > Why off by default?
> > > > I am guessing it sometimes has performance costs too?
> > > >
> > > >
> > > > What happens if we use page pool for big mode too?
> > > > The fewer modes we have the better...
> > > >
> > > >
> > >
> > > Sure, now I believe it makes sense to enable it by default. When the
> > > packet size is very small, the page pool reduces the likelihood of skb
> > > coalescing. But such cases are rare.
> >
> > small packets are rare? These workloads are easy to create actually.
> > Pls try and include benchmark with small packet size.
> >
>
> Sure, Thanks!

Before going ahead and posting the v2 patch, I would like to hear more
advice on the small-packet case. I have done more performance
benchmarking with small packets since then. Here is a list of iperf
output:

With PP and PP fragmenting:
256:     [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
1K:      [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
2K:      [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
4K:      [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
8K:      [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
128K:   [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
256K:   [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes

Without PP:
256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes


The major factor contributing to the performance drop is the reduction
in skb coalescing. Additionally, without the page pool, small packets
can still benefit from the allocation of 8 contiguous pages that get
broken down into smaller pieces. This effectively reduces the
frequency of page allocations from the buddy system. For instance, the
arrival of 32 1K packets triggers only one alloc_page call. Therefore,
the benefit of using a page pool is limited in such cases. In fact,
without page pool fragmentation enabled, it can even hinder performance
from this perspective.
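
To make that arithmetic concrete, here is a rough userspace sketch
(illustration only, not driver code) comparing the two schemes for a
burst of small packets. It assumes the order-3 (32 KB) compound page
that skb_page_frag_refill() tries first and a 1 KB buffer size picked
purely for the example; real buffers also carry headroom and the
virtio-net header.

/* Illustration only: buffers packed into a 32 KB page-frag chunk vs.
 * one full 4 KB page per packet from an order-0 page pool.
 */
#include <stdio.h>

#define PAGE_SIZE      4096UL
#define FRAG_CACHE_SZ  (8 * PAGE_SIZE)  /* order-3 compound page */

int main(void)
{
        unsigned long pkts = 32, buf_sz = 1024;

        /* page-frag path: buffers are carved back to back out of one chunk */
        unsigned long frag_allocs = (pkts * buf_sz + FRAG_CACHE_SZ - 1) / FRAG_CACHE_SZ;
        /* order-0 page pool path: every buffer occupies a whole page */
        unsigned long pool_pages = pkts;

        printf("%lu packets of %lu bytes:\n", pkts, buf_sz);
        printf("  page_frag refill : %lu compound page allocation(s)\n", frag_allocs);
        printf("  order-0 page pool: %lu full page(s)\n", pool_pages);
        return 0;
}

With these numbers the page-frag path touches the buddy allocator once
for the whole burst, while the order-0 pool hands out 32 separate pages
(recycled later, but each pinned for the lifetime of its skb).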

Upon further consideration, I tend to believe that making the page pool
the default option may not be appropriate. As you pointed out, we cannot
simply ignore the performance impact on small packets. Any comments on
this will be much appreciated.
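
For reference, the "PP fragmenting" runs above rely on the page pool
frag API, so that several small buffers can share one page instead of
each pinning a full page. A minimal sketch of that allocation path,
assuming a pool created with PP_FLAG_PAGE_FRAG and the existing
page_pool_dev_alloc_frag() helper (the real code is in patch 3/5 of
this series and may differ in detail):

/* hypothetical helper, for illustration only */
static void *virtnet_pp_alloc_frag(struct receive_queue *rq,
                                   unsigned int len, unsigned int headroom)
{
        unsigned int offset;
        struct page *page;

        /* carve len + headroom bytes out of a page shared with other buffers */
        page = page_pool_dev_alloc_frag(rq->page_pool, &offset, len + headroom);
        if (!page)
                return NULL;

        /* advance past the headroom, like the non-frag path does */
        return page_address(page) + offset + headroom;
}

Even so, the small-packet numbers above still trail the non-page-pool
path, since the loss of skb coalescing dominates there.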


Thanks,
Liang


> > > The usage of page pool for big mode is being evaluated now. Thanks!
> > >
> > > > > ---
> > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > --- a/drivers/net/virtio_net.c
> > > > > +++ b/drivers/net/virtio_net.c
> > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > >  module_param(gso, bool, 0444);
> > > > >  module_param(napi_tx, bool, 0644);
> > > > >
> > > > > +static bool page_pool_enabled;
> > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > +
> > > > >  /* FIXME: MTU in config. */
> > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > >  #define GOOD_COPY_LEN        128
> > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > >       /* Chain pages by the private ptr. */
> > > > >       struct page *pages;
> > > > >
> > > > > +     /* Page pool */
> > > > > +     struct page_pool *page_pool;
> > > > > +
> > > > >       /* Average packet length for mergeable receive buffers. */
> > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > >
> > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > >       return skb;
> > > > >  }
> > > > >
> > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > +{
> > > > > +     if (rq->page_pool)
> > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > +     else
> > > > > +             put_page(page);
> > > > > +}
> > > > > +
> > > > >  /* Called from bottom half context */
> > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > >                                  struct receive_queue *rq,
> > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > >       hdr = skb_vnet_hdr(skb);
> > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > >       if (page_to_free)
> > > > > -             put_page(page_to_free);
> > > > > +             virtnet_put_page(rq, page_to_free);
> > > > >
> > > > >       return skb;
> > > > >  }
> > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > >       return ret;
> > > > >  }
> > > > >
> > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > >  {
> > > > >       struct skb_shared_info *shinfo;
> > > > >       struct page *xdp_page;
> > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > -                     put_page(xdp_page);
> > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > >               }
> > > > >       }
> > > > >  }
> > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > >               return NULL;
> > > > >
> > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > +     if (rq->page_pool)
> > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > +     else
> > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > +
> > > > >       if (!page)
> > > > >               return NULL;
> > > > >
> > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > >                * is sending packet larger than the MTU.
> > > > >                */
> > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > -                     put_page(p);
> > > > > +                     virtnet_put_page(rq, p);
> > > > >                       goto err_buf;
> > > > >               }
> > > > >
> > > > >               memcpy(page_address(page) + page_off,
> > > > >                      page_address(p) + off, buflen);
> > > > >               page_off += buflen;
> > > > > -             put_page(p);
> > > > > +             virtnet_put_page(rq, p);
> > > > >       }
> > > > >
> > > > >       /* Headroom does not contribute to packet length */
> > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > >       return page;
> > > > >  err_buf:
> > > > > -     __free_pages(page, 0);
> > > > > +     if (rq->page_pool)
> > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > +     else
> > > > > +             __free_pages(page, 0);
> > > > >       return NULL;
> > > > >  }
> > > > >
> > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > >               }
> > > > >               stats->bytes += len;
> > > > >               page = virt_to_head_page(buf);
> > > > > -             put_page(page);
> > > > > +             virtnet_put_page(rq, page);
> > > > >       }
> > > > >  }
> > > > >
> > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > >               cur_frag_size = truesize;
> > > > >               xdp_frags_truesz += cur_frag_size;
> > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > -                     put_page(page);
> > > > > +                     virtnet_put_page(rq, page);
> > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > >                       dev->stats.rx_length_errors++;
> > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > >       return 0;
> > > > >
> > > > >  err:
> > > > > -     put_xdp_frags(xdp);
> > > > > +     put_xdp_frags(xdp, rq);
> > > > >       return -EINVAL;
> > > > >  }
> > > > >
> > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > >                       return NULL;
> > > > >
> > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > +             if (rq->page_pool)
> > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > +             else
> > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > >               if (!xdp_page)
> > > > >                       return NULL;
> > > > >
> > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > >
> > > > >       *frame_sz = PAGE_SIZE;
> > > > >
> > > > > -     put_page(*page);
> > > > > +     virtnet_put_page(rq, *page);
> > > > >
> > > > >       *page = xdp_page;
> > > > >
> > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > >               if (unlikely(!head_skb))
> > > > >                       break;
> > > > > +             if (rq->page_pool)
> > > > > +                     skb_mark_for_recycle(head_skb);
> > > > >               return head_skb;
> > > > >
> > > > >       case XDP_TX:
> > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > >               break;
> > > > >       }
> > > > >
> > > > > -     put_xdp_frags(&xdp);
> > > > > +     put_xdp_frags(&xdp, rq);
> > > > >
> > > > >  err_xdp:
> > > > > -     put_page(page);
> > > > > +     virtnet_put_page(rq, page);
> > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > >
> > > > >       stats->xdp_drops++;
> > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > >       curr_skb = head_skb;
> > > > >
> > > > > +     if (rq->page_pool)
> > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > +
> > > > >       if (unlikely(!curr_skb))
> > > > >               goto err_skb;
> > > > >       while (--num_buf) {
> > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >                       curr_skb = nskb;
> > > > >                       head_skb->truesize += nskb->truesize;
> > > > >                       num_skb_frags = 0;
> > > > > +                     if (rq->page_pool)
> > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > >               }
> > > > >               if (curr_skb != head_skb) {
> > > > >                       head_skb->data_len += len;
> > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >               }
> > > > >               offset = buf - page_address(page);
> > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > -                     put_page(page);
> > > > > +                     virtnet_put_page(rq, page);
> > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > >                                            len, truesize);
> > > > >               } else {
> > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > >       return head_skb;
> > > > >
> > > > >  err_skb:
> > > > > -     put_page(page);
> > > > > +     virtnet_put_page(rq, page);
> > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > >
> > > > >  err_buf:
> > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > >        */
> > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > -             return -ENOMEM;
> > > > > +     if (rq->page_pool) {
> > > > > +             struct page *page;
> > > > >
> > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > -     get_page(alloc_frag->page);
> > > > > -     alloc_frag->offset += len + room;
> > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > -     if (hole < len + room) {
> > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > -              * enough space for another buffer, add the remaining space to
> > > > > -              * the current buffer.
> > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > -              */
> > > > > -             if (!headroom)
> > > > > -                     len += hole;
> > > > > -             alloc_frag->offset += hole;
> > > > > -     }
> > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > +             if (unlikely(!page))
> > > > > +                     return -ENOMEM;
> > > > > +             buf = (char *)page_address(page);
> > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > +     } else {
> > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > +                     return -ENOMEM;
> > > > >
> > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > +             get_page(alloc_frag->page);
> > > > > +             alloc_frag->offset += len + room;
> > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > +             if (hole < len + room) {
> > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > +                      * the current buffer.
> > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > +                      */
> > > > > +                     if (!headroom)
> > > > > +                             len += hole;
> > > > > +                     alloc_frag->offset += hole;
> > > > > +             }
> > > > > +     }
> > > > >       sg_init_one(rq->sg, buf, len);
> > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > >       if (err < 0)
> > > > > -             put_page(virt_to_head_page(buf));
> > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > >
> > > > >       return err;
> > > > >  }
> > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > >       if (err < 0)
> > > > >               return err;
> > > > >
> > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > +     else
> > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > +                                              NULL);
> > > > > +
> > > > >       if (err < 0)
> > > > >               goto err_xdp_reg_mem_model;
> > > > >
> > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > >               }
> > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > >               break;
> > > > >       }
> > > > >  }
> > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > >       switch (sset) {
> > > > >       case ETH_SS_STATS:
> > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > >       default:
> > > > >               return -EOPNOTSUPP;
> > > > >       }
> > > > >  }
> > > > >
> > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > +{
> > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > +     struct page_pool_stats pp_stats = {};
> > > > > +     int i;
> > > > > +
> > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > +             if (!vi->rq[i].page_pool)
> > > > > +                     continue;
> > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > +     }
> > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > +}
> > > > > +
> > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > >  {
> > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > >       }
> > > > > +
> > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > >  }
> > > > >
> > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > +             if (vi->rq[i].page_pool)
> > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > >       }
> > > > >
> > > > >       /* We called __netif_napi_del(),
> > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > >       int i = vq2rxq(vq);
> > > > >
> > > > > -     if (vi->mergeable_rx_bufs)
> > > > > -             put_page(virt_to_head_page(buf));
> > > > > -     else if (vi->big_packets)
> > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > +             if (vi->rq[i].page_pool) {
> > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > +                                             virt_to_head_page(buf),
> > > > > +                                             true);
> > > > > +             } else {
> > > > > +                     put_page(virt_to_head_page(buf));
> > > > > +             }
> > > > > +     } else if (vi->big_packets) {
> > > > >               give_pages(&vi->rq[i], buf);
> > > > > -     else
> > > > > +     } else {
> > > > >               put_page(virt_to_head_page(buf));
> > > > > +     }
> > > > >  }
> > > > >
> > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > >       virtnet_free_queues(vi);
> > > > >  }
> > > > >
> > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > +{
> > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > +
> > > > > +     struct page_pool_params pp_params = {
> > > > > +             .order = 0,
> > > > > +             .pool_size = rq->vq->num_max,
> > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > +             .dev = vdev->dev.parent,
> > > > > +             .offset = 0,
> > > > > +     };
> > > > > +
> > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > +                      PTR_ERR(rq->page_pool));
> > > > > +             rq->page_pool = NULL;
> > > > > +     }
> > > > > +}
> > > > > +
> > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > >   * least one full packet?
> > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > +
> > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > +             else
> > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > +                              "page pool only support mergeable mode\n");
> > > > > +
> > > > >       }
> > > > >
> > > > >       /* run here: ret == 0. */
> > > > > --
> > > > > 2.31.1
> > > >
> >

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-31  3:10           ` Xuan Zhuo
@ 2023-06-07  9:11             ` Liang Chen
  2023-06-07  9:33               ` Xuan Zhuo
  0 siblings, 1 reply; 56+ messages in thread
From: Liang Chen @ 2023-06-07  9:11 UTC (permalink / raw)
  To: Xuan Zhuo
  Cc: Jason Wang, virtualization, netdev, linux-kernel, kuba, edumazet,
	davem, pabeni, alexander.duyck, Michael S. Tsirkin

On Wed, May 31, 2023 at 11:12 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Mon, 29 May 2023 15:28:17 +0800, Liang Chen <liangchen.linux@gmail.com> wrote:
> > On Sun, May 28, 2023 at 2:40 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Sat, May 27, 2023 at 08:35:01PM +0800, Liang Chen wrote:
> > > > On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > >
> > > > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> > > > > >
> > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > normal and XDP path.
> > > > >
> > > > > It's better to explain why we need a page pool and how it can help the
> > > > > performance.
> > > > >
> > > >
> > > > Sure, I will include that on v2.
> > > > > > In addition, introducing a module parameter to enable
> > > > > > or disable the usage of page pool (disabled by default).
> > > > >
> > > > > If page pool wins for most of the cases, any reason to disable it by default?
> > > > >
> > > >
> > > > Thank you for raising the point. It does make sense to enable it by default.
> > >
> > > I'd like to see more benchmarks pls then, with a variety of packet
> > > sizes, udp and tcp.
> > >
> >
> > Sure, more benchmarks will be provided. Thanks.
>
>
> I think so.
>
> I did this, but I did not find any improvement. So I gave up on it.
>
> Thanks.
>
>

Our UDP benchmark shows a steady 0.8 percent change in PPS
measurement. However, when conducting iperf TCP stream performance
testing, the results vary depending on the packet size and testing
setup. With small packet sizes, the performance actually drops
slightly due to the reasons I explained in the previous email. On the
other hand, with large packets, we need to ensure that the sender side
doesn't become the bottleneck. To achieve this, our setup uses a
single-core vm to keep the receiver busy, which allows us to identify
performance differences in the receiving path.


Thanks,
Liang




> >
> >
> > > > > >
> > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > in the normal path.
> > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > >
> > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > gain is observed in XDP cpumap:
> > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > >
> > > > > Please show more details on the test. E.g which kinds of tests have
> > > > > you measured?
> > > > >
> > > > > Btw, it would be better to measure PPS as well.
> > > > >
> > > >
> > > > Sure. It will be added on v2.
> > > > > >
> > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > DMA map/unmap support.
> > > > > >
> > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > > ---
> > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > >
> > > > > I believe we should make virtio-net to select CONFIG_PAGE_POOL or do
> > > > > the ifdef tricks at least.
> > > > >
> > > >
> > > > Sure. it will be done on v2.
> > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > --- a/drivers/net/virtio_net.c
> > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > >  module_param(gso, bool, 0444);
> > > > > >  module_param(napi_tx, bool, 0644);
> > > > > >
> > > > > > +static bool page_pool_enabled;
> > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > +
> > > > > >  /* FIXME: MTU in config. */
> > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > >  #define GOOD_COPY_LEN  128
> > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > >         /* Chain pages by the private ptr. */
> > > > > >         struct page *pages;
> > > > > >
> > > > > > +       /* Page pool */
> > > > > > +       struct page_pool *page_pool;
> > > > > > +
> > > > > >         /* Average packet length for mergeable receive buffers. */
> > > > > >         struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > >
> > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > >         return skb;
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > +{
> > > > > > +       if (rq->page_pool)
> > > > > > +               page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > +       else
> > > > > > +               put_page(page);
> > > > > > +}
> > > > > > +
> > > > > >  /* Called from bottom half context */
> > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > >                                    struct receive_queue *rq,
> > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > >         hdr = skb_vnet_hdr(skb);
> > > > > >         memcpy(hdr, hdr_p, hdr_len);
> > > > > >         if (page_to_free)
> > > > > > -               put_page(page_to_free);
> > > > > > +               virtnet_put_page(rq, page_to_free);
> > > > > >
> > > > > >         return skb;
> > > > > >  }
> > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > >         return ret;
> > > > > >  }
> > > > > >
> > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > >  {
> > > > >
> > > > > rq could be fetched from xdp_rxq_info?
> > > >
> > > > Yeah, it has the queue_index there.
> > > > >
> > > > > >         struct skb_shared_info *shinfo;
> > > > > >         struct page *xdp_page;
> > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > >                 shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > >                 for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > >                         xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > -                       put_page(xdp_page);
> > > > > > +                       virtnet_put_page(rq, xdp_page);
> > > > > >                 }
> > > > > >         }
> > > > > >  }
> > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > >         if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > >                 return NULL;
> > > > > >
> > > > > > -       page = alloc_page(GFP_ATOMIC);
> > > > > > +       if (rq->page_pool)
> > > > > > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > +       else
> > > > > > +               page = alloc_page(GFP_ATOMIC);
> > > > > > +
> > > > > >         if (!page)
> > > > > >                 return NULL;
> > > > > >
> > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > >                  * is sending packet larger than the MTU.
> > > > > >                  */
> > > > > >                 if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > -                       put_page(p);
> > > > > > +                       virtnet_put_page(rq, p);
> > > > > >                         goto err_buf;
> > > > > >                 }
> > > > > >
> > > > > >                 memcpy(page_address(page) + page_off,
> > > > > >                        page_address(p) + off, buflen);
> > > > > >                 page_off += buflen;
> > > > > > -               put_page(p);
> > > > > > +               virtnet_put_page(rq, p);
> > > > > >         }
> > > > > >
> > > > > >         /* Headroom does not contribute to packet length */
> > > > > >         *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > >         return page;
> > > > > >  err_buf:
> > > > > > -       __free_pages(page, 0);
> > > > > > +       if (rq->page_pool)
> > > > > > +               page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > +       else
> > > > > > +               __free_pages(page, 0);
> > > > > >         return NULL;
> > > > > >  }
> > > > > >
> > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > >                 }
> > > > > >                 stats->bytes += len;
> > > > > >                 page = virt_to_head_page(buf);
> > > > > > -               put_page(page);
> > > > > > +               virtnet_put_page(rq, page);
> > > > > >         }
> > > > > >  }
> > > > > >
> > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > >                 cur_frag_size = truesize;
> > > > > >                 xdp_frags_truesz += cur_frag_size;
> > > > > >                 if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > -                       put_page(page);
> > > > > > +                       virtnet_put_page(rq, page);
> > > > > >                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > >                                  dev->name, len, (unsigned long)(truesize - room));
> > > > > >                         dev->stats.rx_length_errors++;
> > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > >         return 0;
> > > > > >
> > > > > >  err:
> > > > > > -       put_xdp_frags(xdp);
> > > > > > +       put_xdp_frags(xdp, rq);
> > > > > >         return -EINVAL;
> > > > > >  }
> > > > > >
> > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > >                 if (*len + xdp_room > PAGE_SIZE)
> > > > > >                         return NULL;
> > > > > >
> > > > > > -               xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > +               if (rq->page_pool)
> > > > > > +                       xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > +               else
> > > > > > +                       xdp_page = alloc_page(GFP_ATOMIC);
> > > > > >                 if (!xdp_page)
> > > > > >                         return NULL;
> > > > > >
> > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > >
> > > > > >         *frame_sz = PAGE_SIZE;
> > > > > >
> > > > > > -       put_page(*page);
> > > > > > +       virtnet_put_page(rq, *page);
> > > > > >
> > > > > >         *page = xdp_page;
> > > > > >
> > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > >                 head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > >                 if (unlikely(!head_skb))
> > > > > >                         break;
> > > > > > +               if (rq->page_pool)
> > > > > > +                       skb_mark_for_recycle(head_skb);
> > > > > >                 return head_skb;
> > > > > >
> > > > > >         case XDP_TX:
> > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > >                 break;
> > > > > >         }
> > > > > >
> > > > > > -       put_xdp_frags(&xdp);
> > > > > > +       put_xdp_frags(&xdp, rq);
> > > > > >
> > > > > >  err_xdp:
> > > > > > -       put_page(page);
> > > > > > +       virtnet_put_page(rq, page);
> > > > > >         mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > >
> > > > > >         stats->xdp_drops++;
> > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >         head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > >         curr_skb = head_skb;
> > > > > >
> > > > > > +       if (rq->page_pool)
> > > > > > +               skb_mark_for_recycle(curr_skb);
> > > > > > +
> > > > > >         if (unlikely(!curr_skb))
> > > > > >                 goto err_skb;
> > > > > >         while (--num_buf) {
> > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >                         curr_skb = nskb;
> > > > > >                         head_skb->truesize += nskb->truesize;
> > > > > >                         num_skb_frags = 0;
> > > > > > +                       if (rq->page_pool)
> > > > > > +                               skb_mark_for_recycle(curr_skb);
> > > > > >                 }
> > > > > >                 if (curr_skb != head_skb) {
> > > > > >                         head_skb->data_len += len;
> > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >                 }
> > > > > >                 offset = buf - page_address(page);
> > > > > >                 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > -                       put_page(page);
> > > > > > +                       virtnet_put_page(rq, page);
> > > > >
> > > > > I wonder why we can't do this during buffer allocation like other drivers?
> > > > >
> > > >
> > > > Sorry, I don't quite understand the point here. Would you please
> > > > elaborate a bit more?
> > > > > >                         skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > >                                              len, truesize);
> > > > > >                 } else {
> > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >         return head_skb;
> > > > > >
> > > > > >  err_skb:
> > > > > > -       put_page(page);
> > > > > > +       virtnet_put_page(rq, page);
> > > > > >         mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > >
> > > > > >  err_buf:
> > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > >          * disabled GSO for XDP, it won't be a big issue.
> > > > > >          */
> > > > > >         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > -       if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > -               return -ENOMEM;
> > > > > > +       if (rq->page_pool) {
> > > > > > +               struct page *page;
> > > > > >
> > > > > > -       buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > -       buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > -       get_page(alloc_frag->page);
> > > > > > -       alloc_frag->offset += len + room;
> > > > > > -       hole = alloc_frag->size - alloc_frag->offset;
> > > > > > -       if (hole < len + room) {
> > > > > > -               /* To avoid internal fragmentation, if there is very likely not
> > > > > > -                * enough space for another buffer, add the remaining space to
> > > > > > -                * the current buffer.
> > > > > > -                * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > -                * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > -                */
> > > > > > -               if (!headroom)
> > > > > > -                       len += hole;
> > > > > > -               alloc_frag->offset += hole;
> > > > > > -       }
> > > > > > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > +               if (unlikely(!page))
> > > > > > +                       return -ENOMEM;
> > > > > > +               buf = (char *)page_address(page);
> > > > > > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > +       } else {
> > > > > > +               if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > >
> > > > > Why not simply use a helper like virtnet_page_frag_refill() and add
> > > > > the page_pool allocation logic there? It helps to reduce the
> > > > > changeset.
> > > > >
> > > >
> > > > Sure. Will do that on v2.
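> > > > As a rough sketch of one possible shape for such a helper (the name and
> > > > the exact split of responsibilities are illustrative; the headroom
> > > > advance and the hole-filling logic stay in the caller, and page pool
> > > > buffers are whole pages so no frag accounting is needed for them):
> > > >
> > > > static void *virtnet_rq_alloc_buf(struct receive_queue *rq,
> > > >                                   struct page_frag *alloc_frag,
> > > >                                   unsigned int len, unsigned int room,
> > > >                                   gfp_t gfp)
> > > > {
> > > >         char *buf;
> > > >
> > > >         if (rq->page_pool) {
> > > >                 struct page *page = page_pool_dev_alloc_pages(rq->page_pool);
> > > >
> > > >                 return page ? page_address(page) : NULL;
> > > >         }
> > > >
> > > >         if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > >                 return NULL;
> > > >
> > > >         buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > >         get_page(alloc_frag->page);
> > > >         alloc_frag->offset += len + room;
> > > >         return buf;
> > > > }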
> > > > > > +                       return -ENOMEM;
> > > > > >
> > > > > > +               buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > +               get_page(alloc_frag->page);
> > > > > > +               alloc_frag->offset += len + room;
> > > > > > +               hole = alloc_frag->size - alloc_frag->offset;
> > > > > > +               if (hole < len + room) {
> > > > > > +                       /* To avoid internal fragmentation, if there is very likely not
> > > > > > +                        * enough space for another buffer, add the remaining space to
> > > > > > +                        * the current buffer.
> > > > > > +                        * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > +                        * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > +                        */
> > > > > > +                       if (!headroom)
> > > > > > +                               len += hole;
> > > > > > +                       alloc_frag->offset += hole;
> > > > > > +               }
> > > > > > +       }
> > > > > >         sg_init_one(rq->sg, buf, len);
> > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > >         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > >         if (err < 0)
> > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > +               virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > >
> > > > > >         return err;
> > > > > >  }
> > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > >         if (err < 0)
> > > > > >                 return err;
> > > > > >
> > > > > > -       err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > -                                        MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > +       if (vi->rq[qp_index].page_pool)
> > > > > > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > +                                                MEM_TYPE_PAGE_POOL,
> > > > > > +                                                vi->rq[qp_index].page_pool);
> > > > > > +       else
> > > > > > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > +                                                MEM_TYPE_PAGE_SHARED,
> > > > > > +                                                NULL);
> > > > > > +
> > > > > >         if (err < 0)
> > > > > >                 goto err_xdp_reg_mem_model;
> > > > > >
> > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > >                                 ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > >                                                 virtnet_sq_stats_desc[j].desc);
> > > > > >                 }
> > > > > > +               page_pool_ethtool_stats_get_strings(p);
> > > > > >                 break;
> > > > > >         }
> > > > > >  }
> > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > >         switch (sset) {
> > > > > >         case ETH_SS_STATS:
> > > > > >                 return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > -                                              VIRTNET_SQ_STATS_LEN);
> > > > > > +                                              VIRTNET_SQ_STATS_LEN +
> > > > > > +                                               (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > +                                                page_pool_ethtool_stats_get_count() : 0));
> > > > > >         default:
> > > > > >                 return -EOPNOTSUPP;
> > > > > >         }
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > +{
> > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > +       struct virtnet_info *vi = netdev_priv(dev);
> > > > > > +       struct page_pool_stats pp_stats = {};
> > > > > > +       int i;
> > > > > > +
> > > > > > +       for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > +               if (!vi->rq[i].page_pool)
> > > > > > +                       continue;
> > > > > > +               page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > +       }
> > > > > > +       page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > +}
> > > > > > +
> > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > >                                       struct ethtool_stats *stats, u64 *data)
> > > > > >  {
> > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > >                 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > >                 idx += VIRTNET_SQ_STATS_LEN;
> > > > > >         }
> > > > > > +
> > > > > > +       virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > >  }
> > > > > >
> > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > +               if (vi->rq[i].page_pool)
> > > > > > +                       page_pool_destroy(vi->rq[i].page_pool);
> > > > > >         }
> > > > > >
> > > > > >         /* We called __netif_napi_del(),
> > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > >         struct virtnet_info *vi = vq->vdev->priv;
> > > > > >         int i = vq2rxq(vq);
> > > > > >
> > > > > > -       if (vi->mergeable_rx_bufs)
> > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > -       else if (vi->big_packets)
> > > > > > +       if (vi->mergeable_rx_bufs) {
> > > > > > +               if (vi->rq[i].page_pool) {
> > > > > > +                       page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > +                                               virt_to_head_page(buf),
> > > > > > +                                               true);
> > > > > > +               } else {
> > > > > > +                       put_page(virt_to_head_page(buf));
> > > > > > +               }
> > > > > > +       } else if (vi->big_packets) {
> > > > > >                 give_pages(&vi->rq[i], buf);
> > > > >
> > > > > Any reason why only mergeable mode was modified but not small and big?
> > > > >
> > > > > Thanks
> > > > >
> > > >
> > > > Big mode uses the page chain to recycle pages, hence the use of the
> > > > "private" field of the buffer page. I will take a further look into that
> > > > to see whether it is better to use the page pool in these cases as well. Thanks!
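> > > >
> > > > For reference, the chaining works roughly like the existing give_pages()
> > > > in the driver, which links buffer pages through page->private and splices
> > > > them onto the queue's free list:
> > > >
> > > > static void give_pages(struct receive_queue *rq, struct page *page)
> > > > {
> > > >         struct page *end;
> > > >
> > > >         /* Find the end of the chain being returned ... */
> > > >         for (end = page; end->private; end = (struct page *)end->private);
> > > >         /* ... and splice it in front of the pages already queued. */
> > > >         end->private = (unsigned long)rq->pages;
> > > >         rq->pages = page;
> > > > }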
> > > >
> > > >
> > > >
> > > > > > -       else
> > > > > > +       } else {
> > > > > >                 put_page(virt_to_head_page(buf));
> > > > > > +       }
> > > > > >  }
> > > > > >
> > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > >         virtnet_free_queues(vi);
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > +{
> > > > > > +       struct virtio_device *vdev = rq->vq->vdev;
> > > > > > +
> > > > > > +       struct page_pool_params pp_params = {
> > > > > > +               .order = 0,
> > > > > > +               .pool_size = rq->vq->num_max,
> > > > > > +               .nid = dev_to_node(vdev->dev.parent),
> > > > > > +               .dev = vdev->dev.parent,
> > > > > > +               .offset = 0,
> > > > > > +       };
> > > > > > +
> > > > > > +       rq->page_pool = page_pool_create(&pp_params);
> > > > > > +       if (IS_ERR(rq->page_pool)) {
> > > > > > +               dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > +                        PTR_ERR(rq->page_pool));
> > > > > > +               rq->page_pool = NULL;
> > > > > > +       }
> > > > > > +}
> > > > > > +
> > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > >   * least one full packet?
> > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > >                 vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > >                 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > >                 vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > +
> > > > > > +               if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > +                       virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > +               else
> > > > > > +                       dev_warn(&vi->vdev->dev,
> > > > > > +                                "page pool only support mergeable mode\n");
> > > > > > +
> > > > > >         }
> > > > > >
> > > > > >         /* run here: ret == 0. */
> > > > > > --
> > > > > > 2.31.1
> > > > > >
> > > > >
> > >

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-06-07  9:11             ` Liang Chen
@ 2023-06-07  9:33               ` Xuan Zhuo
  0 siblings, 0 replies; 56+ messages in thread
From: Xuan Zhuo @ 2023-06-07  9:33 UTC (permalink / raw)
  To: Liang Chen
  Cc: Jason Wang, virtualization, netdev, linux-kernel, kuba, edumazet,
	davem, pabeni, alexander.duyck, Michael S. Tsirkin

On Wed, 7 Jun 2023 17:11:44 +0800, Liang Chen <liangchen.linux@gmail.com> wrote:
> On Wed, May 31, 2023 at 11:12 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
> >
> > On Mon, 29 May 2023 15:28:17 +0800, Liang Chen <liangchen.linux@gmail.com> wrote:
> > > On Sun, May 28, 2023 at 2:40 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Sat, May 27, 2023 at 08:35:01PM +0800, Liang Chen wrote:
> > > > > On Fri, May 26, 2023 at 2:51 PM Jason Wang <jasowang@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, May 26, 2023 at 1:46 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> > > > > > >
> > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > normal and XDP path.
> > > > > >
> > > > > > It's better to explain why we need a page pool and how it can help the
> > > > > > performance.
> > > > > >
> > > > >
> > > > > Sure, I will include that on v2.
> > > > > > > In addition, introducing a module parameter to enable
> > > > > > > or disable the usage of page pool (disabled by default).
> > > > > >
> > > > > > If page pool wins for most of the cases, any reason to disable it by default?
> > > > > >
> > > > >
> > > > > Thank you for raising the point. It does make sense to enable it by default.
> > > >
> > > > I'd like to see more benchmarks pls then, with a variety of packet
> > > > sizes, udp and tcp.
> > > >
> > >
> > > Sure, more benchmarks will be provided. Thanks.
> >
> >
> > I think so.
> >
> > I tried this, but I did not find any improvement. So I gave up on it.
> >
> > Thanks.
> >
> >
>
> Our UDP benchmark shows a steady 0.8 percent change in PPS
> measurement. However, when conducting iperf TCP stream performance
> testing, the results vary depending on the packet size and testing
> setup. With small packet sizes, the performance actually drops
> slightly due to the reasons I explained in the previous email. On the
> other hand, with large packets, we need to ensure that the sender side
> doesn't become the bottleneck. To achieve this, our setup uses a
> single-core vm to keep the receiver busy, which allows us to identify
> performance differences in the receiving path.

Could you show some numbers?

Thanks.


>
>
> Thanks,
> Liang
>
>
>
>
> > >
> > >
> > > > > > >
> > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > in the normal path.
> > > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > >
> > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > gain is observed in XDP cpumap:
> > > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > >
> > > > > > Please show more details on the test. E.g which kinds of tests have
> > > > > > you measured?
> > > > > >
> > > > > > Btw, it would be better to measure PPS as well.
> > > > > >
> > > > >
> > > > > Sure. It will be added on v2.
> > > > > > >
> > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > DMA map/unmap support.
> > > > > > >
> > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > > > ---
> > > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > >
> > > > > > I believe we should make virtio-net to select CONFIG_PAGE_POOL or do
> > > > > > the ifdef tricks at least.
> > > > > >
> > > > >
> > > > > Sure. it will be done on v2.
> > > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > > >  module_param(gso, bool, 0444);
> > > > > > >  module_param(napi_tx, bool, 0644);
> > > > > > >
> > > > > > > +static bool page_pool_enabled;
> > > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > > +
> > > > > > >  /* FIXME: MTU in config. */
> > > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > > >  #define GOOD_COPY_LEN  128
> > > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > > >         /* Chain pages by the private ptr. */
> > > > > > >         struct page *pages;
> > > > > > >
> > > > > > > +       /* Page pool */
> > > > > > > +       struct page_pool *page_pool;
> > > > > > > +
> > > > > > >         /* Average packet length for mergeable receive buffers. */
> > > > > > >         struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > > >
> > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > > >         return skb;
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > > +{
> > > > > > > +       if (rq->page_pool)
> > > > > > > +               page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > +       else
> > > > > > > +               put_page(page);
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Called from bottom half context */
> > > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > >                                    struct receive_queue *rq,
> > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > >         hdr = skb_vnet_hdr(skb);
> > > > > > >         memcpy(hdr, hdr_p, hdr_len);
> > > > > > >         if (page_to_free)
> > > > > > > -               put_page(page_to_free);
> > > > > > > +               virtnet_put_page(rq, page_to_free);
> > > > > > >
> > > > > > >         return skb;
> > > > > > >  }
> > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > > >         return ret;
> > > > > > >  }
> > > > > > >
> > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > > >  {
> > > > > >
> > > > > > rq could be fetched from xdp_rxq_info?
> > > > >
> > > > > Yeah, it has the queue_index there.
> > > > > >
> > > > > > >         struct skb_shared_info *shinfo;
> > > > > > >         struct page *xdp_page;
> > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > >                 shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > > >                 for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > > >                         xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > > -                       put_page(xdp_page);
> > > > > > > +                       virtnet_put_page(rq, xdp_page);
> > > > > > >                 }
> > > > > > >         }
> > > > > > >  }
> > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > >         if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > > >                 return NULL;
> > > > > > >
> > > > > > > -       page = alloc_page(GFP_ATOMIC);
> > > > > > > +       if (rq->page_pool)
> > > > > > > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +       else
> > > > > > > +               page = alloc_page(GFP_ATOMIC);
> > > > > > > +
> > > > > > >         if (!page)
> > > > > > >                 return NULL;
> > > > > > >
> > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > >                  * is sending packet larger than the MTU.
> > > > > > >                  */
> > > > > > >                 if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > > -                       put_page(p);
> > > > > > > +                       virtnet_put_page(rq, p);
> > > > > > >                         goto err_buf;
> > > > > > >                 }
> > > > > > >
> > > > > > >                 memcpy(page_address(page) + page_off,
> > > > > > >                        page_address(p) + off, buflen);
> > > > > > >                 page_off += buflen;
> > > > > > > -               put_page(p);
> > > > > > > +               virtnet_put_page(rq, p);
> > > > > > >         }
> > > > > > >
> > > > > > >         /* Headroom does not contribute to packet length */
> > > > > > >         *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > > >         return page;
> > > > > > >  err_buf:
> > > > > > > -       __free_pages(page, 0);
> > > > > > > +       if (rq->page_pool)
> > > > > > > +               page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > +       else
> > > > > > > +               __free_pages(page, 0);
> > > > > > >         return NULL;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > > >                 }
> > > > > > >                 stats->bytes += len;
> > > > > > >                 page = virt_to_head_page(buf);
> > > > > > > -               put_page(page);
> > > > > > > +               virtnet_put_page(rq, page);
> > > > > > >         }
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > >                 cur_frag_size = truesize;
> > > > > > >                 xdp_frags_truesz += cur_frag_size;
> > > > > > >                 if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > > -                       put_page(page);
> > > > > > > +                       virtnet_put_page(rq, page);
> > > > > > >                         pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > > >                                  dev->name, len, (unsigned long)(truesize - room));
> > > > > > >                         dev->stats.rx_length_errors++;
> > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > >         return 0;
> > > > > > >
> > > > > > >  err:
> > > > > > > -       put_xdp_frags(xdp);
> > > > > > > +       put_xdp_frags(xdp, rq);
> > > > > > >         return -EINVAL;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > >                 if (*len + xdp_room > PAGE_SIZE)
> > > > > > >                         return NULL;
> > > > > > >
> > > > > > > -               xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > +               if (rq->page_pool)
> > > > > > > +                       xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +               else
> > > > > > > +                       xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > >                 if (!xdp_page)
> > > > > > >                         return NULL;
> > > > > > >
> > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > >
> > > > > > >         *frame_sz = PAGE_SIZE;
> > > > > > >
> > > > > > > -       put_page(*page);
> > > > > > > +       virtnet_put_page(rq, *page);
> > > > > > >
> > > > > > >         *page = xdp_page;
> > > > > > >
> > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > >                 head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > > >                 if (unlikely(!head_skb))
> > > > > > >                         break;
> > > > > > > +               if (rq->page_pool)
> > > > > > > +                       skb_mark_for_recycle(head_skb);
> > > > > > >                 return head_skb;
> > > > > > >
> > > > > > >         case XDP_TX:
> > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > >                 break;
> > > > > > >         }
> > > > > > >
> > > > > > > -       put_xdp_frags(&xdp);
> > > > > > > +       put_xdp_frags(&xdp, rq);
> > > > > > >
> > > > > > >  err_xdp:
> > > > > > > -       put_page(page);
> > > > > > > +       virtnet_put_page(rq, page);
> > > > > > >         mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > >
> > > > > > >         stats->xdp_drops++;
> > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >         head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > > >         curr_skb = head_skb;
> > > > > > >
> > > > > > > +       if (rq->page_pool)
> > > > > > > +               skb_mark_for_recycle(curr_skb);
> > > > > > > +
> > > > > > >         if (unlikely(!curr_skb))
> > > > > > >                 goto err_skb;
> > > > > > >         while (--num_buf) {
> > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >                         curr_skb = nskb;
> > > > > > >                         head_skb->truesize += nskb->truesize;
> > > > > > >                         num_skb_frags = 0;
> > > > > > > +                       if (rq->page_pool)
> > > > > > > +                               skb_mark_for_recycle(curr_skb);
> > > > > > >                 }
> > > > > > >                 if (curr_skb != head_skb) {
> > > > > > >                         head_skb->data_len += len;
> > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >                 }
> > > > > > >                 offset = buf - page_address(page);
> > > > > > >                 if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > > -                       put_page(page);
> > > > > > > +                       virtnet_put_page(rq, page);
> > > > > >
> > > > > > I wonder why we can't do this during buffer allocation like other drivers?
> > > > > >
> > > > >
> > > > > Sorry, I don't quite understand the point here. Would you please
> > > > > elaborate a bit more?
> > > > > > >                         skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > > >                                              len, truesize);
> > > > > > >                 } else {
> > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >         return head_skb;
> > > > > > >
> > > > > > >  err_skb:
> > > > > > > -       put_page(page);
> > > > > > > +       virtnet_put_page(rq, page);
> > > > > > >         mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > >
> > > > > > >  err_buf:
> > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > >          * disabled GSO for XDP, it won't be a big issue.
> > > > > > >          */
> > > > > > >         len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > > -       if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > -               return -ENOMEM;
> > > > > > > +       if (rq->page_pool) {
> > > > > > > +               struct page *page;
> > > > > > >
> > > > > > > -       buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > -       buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > -       get_page(alloc_frag->page);
> > > > > > > -       alloc_frag->offset += len + room;
> > > > > > > -       hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > -       if (hole < len + room) {
> > > > > > > -               /* To avoid internal fragmentation, if there is very likely not
> > > > > > > -                * enough space for another buffer, add the remaining space to
> > > > > > > -                * the current buffer.
> > > > > > > -                * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > -                * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > -                */
> > > > > > > -               if (!headroom)
> > > > > > > -                       len += hole;
> > > > > > > -               alloc_frag->offset += hole;
> > > > > > > -       }
> > > > > > > +               page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +               if (unlikely(!page))
> > > > > > > +                       return -ENOMEM;
> > > > > > > +               buf = (char *)page_address(page);
> > > > > > > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > +       } else {
> > > > > > > +               if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > >
> > > > > > Why not simply use a helper like virtnet_page_frag_refill() and add
> > > > > > the page_pool allocation logic there? It helps to reduce the
> > > > > > changeset.
> > > > > >
> > > > >
> > > > > Sure. Will do that on v2.
> > > > > > > +                       return -ENOMEM;
> > > > > > >
> > > > > > > +               buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > +               buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > +               get_page(alloc_frag->page);
> > > > > > > +               alloc_frag->offset += len + room;
> > > > > > > +               hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > +               if (hole < len + room) {
> > > > > > > +                       /* To avoid internal fragmentation, if there is very likely not
> > > > > > > +                        * enough space for another buffer, add the remaining space to
> > > > > > > +                        * the current buffer.
> > > > > > > +                        * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > +                        * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > +                        */
> > > > > > > +                       if (!headroom)
> > > > > > > +                               len += hole;
> > > > > > > +                       alloc_frag->offset += hole;
> > > > > > > +               }
> > > > > > > +       }
> > > > > > >         sg_init_one(rq->sg, buf, len);
> > > > > > >         ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > >         err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > >         if (err < 0)
> > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > +               virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > > >
> > > > > > >         return err;
> > > > > > >  }
> > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > >         if (err < 0)
> > > > > > >                 return err;
> > > > > > >
> > > > > > > -       err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > -                                        MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > > +       if (vi->rq[qp_index].page_pool)
> > > > > > > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > +                                                MEM_TYPE_PAGE_POOL,
> > > > > > > +                                                vi->rq[qp_index].page_pool);
> > > > > > > +       else
> > > > > > > +               err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > +                                                MEM_TYPE_PAGE_SHARED,
> > > > > > > +                                                NULL);
> > > > > > > +
> > > > > > >         if (err < 0)
> > > > > > >                 goto err_xdp_reg_mem_model;
> > > > > > >
> > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > > >                                 ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > > >                                                 virtnet_sq_stats_desc[j].desc);
> > > > > > >                 }
> > > > > > > +               page_pool_ethtool_stats_get_strings(p);
> > > > > > >                 break;
> > > > > > >         }
> > > > > > >  }
> > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > > >         switch (sset) {
> > > > > > >         case ETH_SS_STATS:
> > > > > > >                 return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > > -                                              VIRTNET_SQ_STATS_LEN);
> > > > > > > +                                              VIRTNET_SQ_STATS_LEN +
> > > > > > > +                                               (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > > +                                                page_pool_ethtool_stats_get_count() : 0));
> > > > > > >         default:
> > > > > > >                 return -EOPNOTSUPP;
> > > > > > >         }
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > > +{
> > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > > +       struct virtnet_info *vi = netdev_priv(dev);
> > > > > > > +       struct page_pool_stats pp_stats = {};
> > > > > > > +       int i;
> > > > > > > +
> > > > > > > +       for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > > +               if (!vi->rq[i].page_pool)
> > > > > > > +                       continue;
> > > > > > > +               page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > > +       }
> > > > > > > +       page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > > +}
> > > > > > > +
> > > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > >                                       struct ethtool_stats *stats, u64 *data)
> > > > > > >  {
> > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > >                 } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > > >                 idx += VIRTNET_SQ_STATS_LEN;
> > > > > > >         }
> > > > > > > +
> > > > > > > +       virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > > >  }
> > > > > > >
> > > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > >         for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > >                 __netif_napi_del(&vi->rq[i].napi);
> > > > > > >                 __netif_napi_del(&vi->sq[i].napi);
> > > > > > > +               if (vi->rq[i].page_pool)
> > > > > > > +                       page_pool_destroy(vi->rq[i].page_pool);
> > > > > > >         }
> > > > > > >
> > > > > > >         /* We called __netif_napi_del(),
> > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > > >         struct virtnet_info *vi = vq->vdev->priv;
> > > > > > >         int i = vq2rxq(vq);
> > > > > > >
> > > > > > > -       if (vi->mergeable_rx_bufs)
> > > > > > > -               put_page(virt_to_head_page(buf));
> > > > > > > -       else if (vi->big_packets)
> > > > > > > +       if (vi->mergeable_rx_bufs) {
> > > > > > > +               if (vi->rq[i].page_pool) {
> > > > > > > +                       page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > > +                                               virt_to_head_page(buf),
> > > > > > > +                                               true);
> > > > > > > +               } else {
> > > > > > > +                       put_page(virt_to_head_page(buf));
> > > > > > > +               }
> > > > > > > +       } else if (vi->big_packets) {
> > > > > > >                 give_pages(&vi->rq[i], buf);
> > > > > >
> > > > > > Any reason why only mergeable mode was modified but not small and big?
> > > > > >
> > > > > > Thanks
> > > > > >
> > > > >
> > > > > Big mode uses the page chain to recycle pages, hence the use of the
> > > > > "private" field of the buffer page. I will take a further look into that
> > > > > to see whether it is better to use the page pool in these cases as well. Thanks!
> > > > >
> > > > >
> > > > >
> > > > > > > -       else
> > > > > > > +       } else {
> > > > > > >                 put_page(virt_to_head_page(buf));
> > > > > > > +       }
> > > > > > >  }
> > > > > > >
> > > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > > >         virtnet_free_queues(vi);
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > > +{
> > > > > > > +       struct virtio_device *vdev = rq->vq->vdev;
> > > > > > > +
> > > > > > > +       struct page_pool_params pp_params = {
> > > > > > > +               .order = 0,
> > > > > > > +               .pool_size = rq->vq->num_max,
> > > > > > > +               .nid = dev_to_node(vdev->dev.parent),
> > > > > > > +               .dev = vdev->dev.parent,
> > > > > > > +               .offset = 0,
> > > > > > > +       };
> > > > > > > +
> > > > > > > +       rq->page_pool = page_pool_create(&pp_params);
> > > > > > > +       if (IS_ERR(rq->page_pool)) {
> > > > > > > +               dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > > +                        PTR_ERR(rq->page_pool));
> > > > > > > +               rq->page_pool = NULL;
> > > > > > > +       }
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > > >   * least one full packet?
> > > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > > >                 vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > > >                 vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > > >                 vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > > +
> > > > > > > +               if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > > +                       virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > > +               else
> > > > > > > +                       dev_warn(&vi->vdev->dev,
> > > > > > > +                                "page pool only support mergeable mode\n");
> > > > > > > +
> > > > > > >         }
> > > > > > >
> > > > > > >         /* run here: ret == 0. */
> > > > > > > --
> > > > > > > 2.31.1
> > > > > > >
> > > > > >
> > > >

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-06-07  9:08           ` Liang Chen
@ 2023-06-07  9:35             ` Xuan Zhuo
  2023-06-07 13:58               ` Liang Chen
  2023-06-07 20:17             ` Michael S. Tsirkin
  1 sibling, 1 reply; 56+ messages in thread
From: Xuan Zhuo @ 2023-06-07  9:35 UTC (permalink / raw)
  To: Liang Chen
  Cc: jasowang, virtualization, netdev, linux-kernel, kuba, edumazet,
	davem, pabeni, alexander.duyck, Michael S. Tsirkin

On Wed, 7 Jun 2023 17:08:59 +0800, Liang Chen <liangchen.linux@gmail.com> wrote:
> On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> >
> > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > or disable the usage of page pool (disabled by default).
> > > > > >
> > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > in the normal path.
> > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > >
> > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > gain is observed in XDP cpumap:
> > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > >
> > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > DMA map/unmap support.
> > > > > >
> > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > >
> > > > > Why off by default?
> > > > > I am guessing it sometimes has performance costs too?
> > > > >
> > > > >
> > > > > What happens if we use page pool for big mode too?
> > > > > The less modes we have the better...
> > > > >
> > > > >
> > > >
> > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > packet size is very small, using the page pool reduces the likelihood of
> > > > skb coalescing. But such cases are rare.
> > >
> > > small packets are rare? These workloads are easy to create actually.
> > > Pls try and include a benchmark with small packet sizes.
> > >
> >
> > Sure, Thanks!
>
> Before going ahead and posting the v2 patch, I would like to hear more
> advice on the small-packet cases. I have done more performance
> benchmarking with small packets since then. Here is a list of the iperf
> output:

Could you show the command line?

Thanks


>
> With PP and PP fragmenting:
> 256:     [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
> 1K:       [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
> 2K:       [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
> 4K:       [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
> 8K:       [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
> 16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
> 32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
> 64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
> 128K:   [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
> 256K:   [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes
>
> Without PP:
> 256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
> 1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
> 2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
> 4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
> 8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
> 16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
> 32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
> 64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
> 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
> 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes
>
>
> The major factor contributing to the performance drop is the reduction
> in skb coalescing. Additionally, without the page pool, small packets
> can still benefit from a single allocation of 8 contiguous pages that
> is then broken down into smaller pieces. This effectively reduces the
> frequency of page allocation from the buddy system. For instance, the
> arrival of 32 1K packets triggers only one page allocation. Therefore,
> the benefits of using a page pool are limited in such cases. In fact,
> without page pool fragmenting enabled, it can even hinder performance
> from this perspective.
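>
> For reference, coalescing only kicks in when the new buffer is contiguous
> with the previous fragment, which is roughly what skb_can_coalesce()
> checks in the current kernel; with one full page-pool page per buffer,
> consecutive small buffers come from different pages, so that condition
> is not met:
>
> static inline bool skb_can_coalesce(struct sk_buff *skb, int i,
>                                     const struct page *page, int off)
> {
>         if (skb_zcopy(skb))
>                 return false;
>         if (i) {
>                 const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
>
>                 /* Same page and the new data starts right after the
>                  * previous fragment ends.
>                  */
>                 return page == skb_frag_page(frag) &&
>                        off == skb_frag_off(frag) + skb_frag_size(frag);
>         }
>         return false;
> }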
>
> Upon further consideration, I tend to believe making page pool the
> default option may not be appropriate. As you pointed out, we cannot
> simply ignore the performance impact on small packets. Any comments on
> this will be much appreciated.
>
>
> Thanks,
> Liang
>
>
> > > > The usage of page pool for big mode is being evaluated now. Thanks!
> > > >
> > > > > > ---
> > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > --- a/drivers/net/virtio_net.c
> > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > >  module_param(gso, bool, 0444);
> > > > > >  module_param(napi_tx, bool, 0644);
> > > > > >
> > > > > > +static bool page_pool_enabled;
> > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > +
> > > > > >  /* FIXME: MTU in config. */
> > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > >  #define GOOD_COPY_LEN        128
> > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > >       /* Chain pages by the private ptr. */
> > > > > >       struct page *pages;
> > > > > >
> > > > > > +     /* Page pool */
> > > > > > +     struct page_pool *page_pool;
> > > > > > +
> > > > > >       /* Average packet length for mergeable receive buffers. */
> > > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > >
> > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > >       return skb;
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > +{
> > > > > > +     if (rq->page_pool)
> > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > +     else
> > > > > > +             put_page(page);
> > > > > > +}
> > > > > > +
> > > > > >  /* Called from bottom half context */
> > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > >                                  struct receive_queue *rq,
> > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > >       hdr = skb_vnet_hdr(skb);
> > > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > > >       if (page_to_free)
> > > > > > -             put_page(page_to_free);
> > > > > > +             virtnet_put_page(rq, page_to_free);
> > > > > >
> > > > > >       return skb;
> > > > > >  }
> > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > >       return ret;
> > > > > >  }
> > > > > >
> > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > >  {
> > > > > >       struct skb_shared_info *shinfo;
> > > > > >       struct page *xdp_page;
> > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > -                     put_page(xdp_page);
> > > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > > >               }
> > > > > >       }
> > > > > >  }
> > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > >               return NULL;
> > > > > >
> > > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > > +     if (rq->page_pool)
> > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > +     else
> > > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > > +
> > > > > >       if (!page)
> > > > > >               return NULL;
> > > > > >
> > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > >                * is sending packet larger than the MTU.
> > > > > >                */
> > > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > -                     put_page(p);
> > > > > > +                     virtnet_put_page(rq, p);
> > > > > >                       goto err_buf;
> > > > > >               }
> > > > > >
> > > > > >               memcpy(page_address(page) + page_off,
> > > > > >                      page_address(p) + off, buflen);
> > > > > >               page_off += buflen;
> > > > > > -             put_page(p);
> > > > > > +             virtnet_put_page(rq, p);
> > > > > >       }
> > > > > >
> > > > > >       /* Headroom does not contribute to packet length */
> > > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > >       return page;
> > > > > >  err_buf:
> > > > > > -     __free_pages(page, 0);
> > > > > > +     if (rq->page_pool)
> > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > +     else
> > > > > > +             __free_pages(page, 0);
> > > > > >       return NULL;
> > > > > >  }
> > > > > >
> > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > >               }
> > > > > >               stats->bytes += len;
> > > > > >               page = virt_to_head_page(buf);
> > > > > > -             put_page(page);
> > > > > > +             virtnet_put_page(rq, page);
> > > > > >       }
> > > > > >  }
> > > > > >
> > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > >               cur_frag_size = truesize;
> > > > > >               xdp_frags_truesz += cur_frag_size;
> > > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > -                     put_page(page);
> > > > > > +                     virtnet_put_page(rq, page);
> > > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > > >                       dev->stats.rx_length_errors++;
> > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > >       return 0;
> > > > > >
> > > > > >  err:
> > > > > > -     put_xdp_frags(xdp);
> > > > > > +     put_xdp_frags(xdp, rq);
> > > > > >       return -EINVAL;
> > > > > >  }
> > > > > >
> > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > > >                       return NULL;
> > > > > >
> > > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > +             if (rq->page_pool)
> > > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > +             else
> > > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > > >               if (!xdp_page)
> > > > > >                       return NULL;
> > > > > >
> > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > >
> > > > > >       *frame_sz = PAGE_SIZE;
> > > > > >
> > > > > > -     put_page(*page);
> > > > > > +     virtnet_put_page(rq, *page);
> > > > > >
> > > > > >       *page = xdp_page;
> > > > > >
> > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > >               if (unlikely(!head_skb))
> > > > > >                       break;
> > > > > > +             if (rq->page_pool)
> > > > > > +                     skb_mark_for_recycle(head_skb);
> > > > > >               return head_skb;
> > > > > >
> > > > > >       case XDP_TX:
> > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > >               break;
> > > > > >       }
> > > > > >
> > > > > > -     put_xdp_frags(&xdp);
> > > > > > +     put_xdp_frags(&xdp, rq);
> > > > > >
> > > > > >  err_xdp:
> > > > > > -     put_page(page);
> > > > > > +     virtnet_put_page(rq, page);
> > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > >
> > > > > >       stats->xdp_drops++;
> > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > >       curr_skb = head_skb;
> > > > > >
> > > > > > +     if (rq->page_pool)
> > > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > > +
> > > > > >       if (unlikely(!curr_skb))
> > > > > >               goto err_skb;
> > > > > >       while (--num_buf) {
> > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >                       curr_skb = nskb;
> > > > > >                       head_skb->truesize += nskb->truesize;
> > > > > >                       num_skb_frags = 0;
> > > > > > +                     if (rq->page_pool)
> > > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > > >               }
> > > > > >               if (curr_skb != head_skb) {
> > > > > >                       head_skb->data_len += len;
> > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >               }
> > > > > >               offset = buf - page_address(page);
> > > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > -                     put_page(page);
> > > > > > +                     virtnet_put_page(rq, page);
> > > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > >                                            len, truesize);
> > > > > >               } else {
> > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >       return head_skb;
> > > > > >
> > > > > >  err_skb:
> > > > > > -     put_page(page);
> > > > > > +     virtnet_put_page(rq, page);
> > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > >
> > > > > >  err_buf:
> > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > > >        */
> > > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > -             return -ENOMEM;
> > > > > > +     if (rq->page_pool) {
> > > > > > +             struct page *page;
> > > > > >
> > > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > -     get_page(alloc_frag->page);
> > > > > > -     alloc_frag->offset += len + room;
> > > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > > -     if (hole < len + room) {
> > > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > > -              * enough space for another buffer, add the remaining space to
> > > > > > -              * the current buffer.
> > > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > -              */
> > > > > > -             if (!headroom)
> > > > > > -                     len += hole;
> > > > > > -             alloc_frag->offset += hole;
> > > > > > -     }
> > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > +             if (unlikely(!page))
> > > > > > +                     return -ENOMEM;
> > > > > > +             buf = (char *)page_address(page);
> > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > +     } else {
> > > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > +                     return -ENOMEM;
> > > > > >
> > > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > +             get_page(alloc_frag->page);
> > > > > > +             alloc_frag->offset += len + room;
> > > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > > +             if (hole < len + room) {
> > > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > > +                      * the current buffer.
> > > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > +                      */
> > > > > > +                     if (!headroom)
> > > > > > +                             len += hole;
> > > > > > +                     alloc_frag->offset += hole;
> > > > > > +             }
> > > > > > +     }
> > > > > >       sg_init_one(rq->sg, buf, len);
> > > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > >       if (err < 0)
> > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > >
> > > > > >       return err;
> > > > > >  }
> > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > >       if (err < 0)
> > > > > >               return err;
> > > > > >
> > > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > > +     else
> > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > > +                                              NULL);
> > > > > > +
> > > > > >       if (err < 0)
> > > > > >               goto err_xdp_reg_mem_model;
> > > > > >
> > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > > >               }
> > > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > > >               break;
> > > > > >       }
> > > > > >  }
> > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > >       switch (sset) {
> > > > > >       case ETH_SS_STATS:
> > > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > > >       default:
> > > > > >               return -EOPNOTSUPP;
> > > > > >       }
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > +{
> > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > > +     struct page_pool_stats pp_stats = {};
> > > > > > +     int i;
> > > > > > +
> > > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > +             if (!vi->rq[i].page_pool)
> > > > > > +                     continue;
> > > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > +     }
> > > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > +}
> > > > > > +
> > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > > >  {
> > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > > >       }
> > > > > > +
> > > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > >  }
> > > > > >
> > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > > +             if (vi->rq[i].page_pool)
> > > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > > >       }
> > > > > >
> > > > > >       /* We called __netif_napi_del(),
> > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > > >       int i = vq2rxq(vq);
> > > > > >
> > > > > > -     if (vi->mergeable_rx_bufs)
> > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > -     else if (vi->big_packets)
> > > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > > +             if (vi->rq[i].page_pool) {
> > > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > +                                             virt_to_head_page(buf),
> > > > > > +                                             true);
> > > > > > +             } else {
> > > > > > +                     put_page(virt_to_head_page(buf));
> > > > > > +             }
> > > > > > +     } else if (vi->big_packets) {
> > > > > >               give_pages(&vi->rq[i], buf);
> > > > > > -     else
> > > > > > +     } else {
> > > > > >               put_page(virt_to_head_page(buf));
> > > > > > +     }
> > > > > >  }
> > > > > >
> > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > >       virtnet_free_queues(vi);
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > +{
> > > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > > +
> > > > > > +     struct page_pool_params pp_params = {
> > > > > > +             .order = 0,
> > > > > > +             .pool_size = rq->vq->num_max,
> > > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > > +             .dev = vdev->dev.parent,
> > > > > > +             .offset = 0,
> > > > > > +     };
> > > > > > +
> > > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > +                      PTR_ERR(rq->page_pool));
> > > > > > +             rq->page_pool = NULL;
> > > > > > +     }
> > > > > > +}
> > > > > > +
> > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > >   * least one full packet?
> > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > +
> > > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > +             else
> > > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > > +                              "page pool only support mergeable mode\n");
> > > > > > +
> > > > > >       }
> > > > > >
> > > > > >       /* run here: ret == 0. */
> > > > > > --
> > > > > > 2.31.1
> > > > >
> > >

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-06-07  9:35             ` Xuan Zhuo
@ 2023-06-07 13:58               ` Liang Chen
  0 siblings, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-06-07 13:58 UTC (permalink / raw)
  To: Xuan Zhuo
  Cc: jasowang, virtualization, netdev, linux-kernel, kuba, edumazet,
	davem, pabeni, alexander.duyck, Michael S. Tsirkin

On Wed, Jun 7, 2023 at 5:36 PM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> On Wed, 7 Jun 2023 17:08:59 +0800, Liang Chen <liangchen.linux@gmail.com> wrote:
> > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > >
> > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > >
> > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > in the normal path.
> > > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > >
> > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > gain is observed in XDP cpumap:
> > > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > >
> > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > DMA map/unmap support.
> > > > > > >
> > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > >
> > > > > > Why off by default?
> > > > > > I am guessing it sometimes has performance costs too?
> > > > > >
> > > > > >
> > > > > > What happens if we use page pool for big mode too?
> > > > > > The less modes we have the better...
> > > > > >
> > > > > >
> > > > >
> > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > packet size is very small, it reduces the likelihood of skb
> > > > > coalescing. But such cases are rare.
> > > >
> > > > small packets are rare? These workloads are easy to create actually.
> > > > Pls try and include benchmark with small packet size.
> > > >
> > >
> > > Sure, Thanks!
> >
> > Before going ahead and posting v2 patch, I would like to hear more
> > advice for the cases of small packets. I have done more performance
> > benchmark with small packets since then. Here is a list of iperf
> > output,
>
> Could you show the commnad line?
>
> Thanks
>
>

Sure.   iperf3 -c <IP> -i 5 -f g -t 0 -l <packet size>

> >
> > With PP and PP fragmenting:
> > 256:     [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
> > 1K:      [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
> > 2K:      [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
> > 4K:      [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
> > 8K:      [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
> > 16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
> > 32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
> > 64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
> > 128K:   [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
> > 256K:   [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes
> >
> > Without PP:
> > 256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
> > 1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
> > 2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
> > 4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
> > 8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
> > 16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
> > 32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
> > 64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
> > 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
> > 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes
> >
> >
> > The major factor contributing to the performance drop is the reduction
> > of skb coalescing. Additionally, without the page pool, small packets
> > can still benefit from the allocation of 8 continuous pages by
> > breaking them down into smaller pieces. This effectively reduces the
> > frequency of page allocation from the buddy system. For instance, the
> > arrival of 32 1K packets only triggers one alloc_page call. Therefore,
> > the benefits of using a page pool are limited in such cases. In fact,
> > without page pool fragmenting enabled, it can even hinder performance
> > from this perspective.
> >
> > Upon further consideration, I tend to believe making page pool the
> > default option may not be appropriate. As you pointed out, we cannot
> > simply ignore the performance impact on small packets. Any comments on
> > this will be much appreciated.
> >
> >
> > Thanks,
> > Liang
> >
> >
> > > > > The usage of page pool for big mode is being evaluated now. Thanks!
> > > > >
> > > > > > > ---
> > > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > > >  module_param(gso, bool, 0444);
> > > > > > >  module_param(napi_tx, bool, 0644);
> > > > > > >
> > > > > > > +static bool page_pool_enabled;
> > > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > > +
> > > > > > >  /* FIXME: MTU in config. */
> > > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > > >  #define GOOD_COPY_LEN        128
> > > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > > >       /* Chain pages by the private ptr. */
> > > > > > >       struct page *pages;
> > > > > > >
> > > > > > > +     /* Page pool */
> > > > > > > +     struct page_pool *page_pool;
> > > > > > > +
> > > > > > >       /* Average packet length for mergeable receive buffers. */
> > > > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > > >
> > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > > >       return skb;
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > > +{
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > +     else
> > > > > > > +             put_page(page);
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Called from bottom half context */
> > > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > >                                  struct receive_queue *rq,
> > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > >       hdr = skb_vnet_hdr(skb);
> > > > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > > > >       if (page_to_free)
> > > > > > > -             put_page(page_to_free);
> > > > > > > +             virtnet_put_page(rq, page_to_free);
> > > > > > >
> > > > > > >       return skb;
> > > > > > >  }
> > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > > >       return ret;
> > > > > > >  }
> > > > > > >
> > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > > >  {
> > > > > > >       struct skb_shared_info *shinfo;
> > > > > > >       struct page *xdp_page;
> > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > > -                     put_page(xdp_page);
> > > > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > > > >               }
> > > > > > >       }
> > > > > > >  }
> > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > > >               return NULL;
> > > > > > >
> > > > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +     else
> > > > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > > > +
> > > > > > >       if (!page)
> > > > > > >               return NULL;
> > > > > > >
> > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > >                * is sending packet larger than the MTU.
> > > > > > >                */
> > > > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > > -                     put_page(p);
> > > > > > > +                     virtnet_put_page(rq, p);
> > > > > > >                       goto err_buf;
> > > > > > >               }
> > > > > > >
> > > > > > >               memcpy(page_address(page) + page_off,
> > > > > > >                      page_address(p) + off, buflen);
> > > > > > >               page_off += buflen;
> > > > > > > -             put_page(p);
> > > > > > > +             virtnet_put_page(rq, p);
> > > > > > >       }
> > > > > > >
> > > > > > >       /* Headroom does not contribute to packet length */
> > > > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > > >       return page;
> > > > > > >  err_buf:
> > > > > > > -     __free_pages(page, 0);
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > +     else
> > > > > > > +             __free_pages(page, 0);
> > > > > > >       return NULL;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > > >               }
> > > > > > >               stats->bytes += len;
> > > > > > >               page = virt_to_head_page(buf);
> > > > > > > -             put_page(page);
> > > > > > > +             virtnet_put_page(rq, page);
> > > > > > >       }
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > >               cur_frag_size = truesize;
> > > > > > >               xdp_frags_truesz += cur_frag_size;
> > > > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > > -                     put_page(page);
> > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > > > >                       dev->stats.rx_length_errors++;
> > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > >       return 0;
> > > > > > >
> > > > > > >  err:
> > > > > > > -     put_xdp_frags(xdp);
> > > > > > > +     put_xdp_frags(xdp, rq);
> > > > > > >       return -EINVAL;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > > > >                       return NULL;
> > > > > > >
> > > > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > +             if (rq->page_pool)
> > > > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +             else
> > > > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > >               if (!xdp_page)
> > > > > > >                       return NULL;
> > > > > > >
> > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > >
> > > > > > >       *frame_sz = PAGE_SIZE;
> > > > > > >
> > > > > > > -     put_page(*page);
> > > > > > > +     virtnet_put_page(rq, *page);
> > > > > > >
> > > > > > >       *page = xdp_page;
> > > > > > >
> > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > > >               if (unlikely(!head_skb))
> > > > > > >                       break;
> > > > > > > +             if (rq->page_pool)
> > > > > > > +                     skb_mark_for_recycle(head_skb);
> > > > > > >               return head_skb;
> > > > > > >
> > > > > > >       case XDP_TX:
> > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > >               break;
> > > > > > >       }
> > > > > > >
> > > > > > > -     put_xdp_frags(&xdp);
> > > > > > > +     put_xdp_frags(&xdp, rq);
> > > > > > >
> > > > > > >  err_xdp:
> > > > > > > -     put_page(page);
> > > > > > > +     virtnet_put_page(rq, page);
> > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > >
> > > > > > >       stats->xdp_drops++;
> > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > > >       curr_skb = head_skb;
> > > > > > >
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > > > +
> > > > > > >       if (unlikely(!curr_skb))
> > > > > > >               goto err_skb;
> > > > > > >       while (--num_buf) {
> > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >                       curr_skb = nskb;
> > > > > > >                       head_skb->truesize += nskb->truesize;
> > > > > > >                       num_skb_frags = 0;
> > > > > > > +                     if (rq->page_pool)
> > > > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > > > >               }
> > > > > > >               if (curr_skb != head_skb) {
> > > > > > >                       head_skb->data_len += len;
> > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >               }
> > > > > > >               offset = buf - page_address(page);
> > > > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > > -                     put_page(page);
> > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > > >                                            len, truesize);
> > > > > > >               } else {
> > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >       return head_skb;
> > > > > > >
> > > > > > >  err_skb:
> > > > > > > -     put_page(page);
> > > > > > > +     virtnet_put_page(rq, page);
> > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > >
> > > > > > >  err_buf:
> > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > > > >        */
> > > > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > -             return -ENOMEM;
> > > > > > > +     if (rq->page_pool) {
> > > > > > > +             struct page *page;
> > > > > > >
> > > > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > -     get_page(alloc_frag->page);
> > > > > > > -     alloc_frag->offset += len + room;
> > > > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > -     if (hole < len + room) {
> > > > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > > > -              * enough space for another buffer, add the remaining space to
> > > > > > > -              * the current buffer.
> > > > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > -              */
> > > > > > > -             if (!headroom)
> > > > > > > -                     len += hole;
> > > > > > > -             alloc_frag->offset += hole;
> > > > > > > -     }
> > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +             if (unlikely(!page))
> > > > > > > +                     return -ENOMEM;
> > > > > > > +             buf = (char *)page_address(page);
> > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > +     } else {
> > > > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > +                     return -ENOMEM;
> > > > > > >
> > > > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > +             get_page(alloc_frag->page);
> > > > > > > +             alloc_frag->offset += len + room;
> > > > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > +             if (hole < len + room) {
> > > > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > > > +                      * the current buffer.
> > > > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > +                      */
> > > > > > > +                     if (!headroom)
> > > > > > > +                             len += hole;
> > > > > > > +                     alloc_frag->offset += hole;
> > > > > > > +             }
> > > > > > > +     }
> > > > > > >       sg_init_one(rq->sg, buf, len);
> > > > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > >       if (err < 0)
> > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > > >
> > > > > > >       return err;
> > > > > > >  }
> > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > >       if (err < 0)
> > > > > > >               return err;
> > > > > > >
> > > > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > > > +     else
> > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > > > +                                              NULL);
> > > > > > > +
> > > > > > >       if (err < 0)
> > > > > > >               goto err_xdp_reg_mem_model;
> > > > > > >
> > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > > > >               }
> > > > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > > > >               break;
> > > > > > >       }
> > > > > > >  }
> > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > > >       switch (sset) {
> > > > > > >       case ETH_SS_STATS:
> > > > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > > > >       default:
> > > > > > >               return -EOPNOTSUPP;
> > > > > > >       }
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > > +{
> > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > > > +     struct page_pool_stats pp_stats = {};
> > > > > > > +     int i;
> > > > > > > +
> > > > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > > +             if (!vi->rq[i].page_pool)
> > > > > > > +                     continue;
> > > > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > > +     }
> > > > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > > +}
> > > > > > > +
> > > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > > > >  {
> > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > > > >       }
> > > > > > > +
> > > > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > > >  }
> > > > > > >
> > > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > > > +             if (vi->rq[i].page_pool)
> > > > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > > > >       }
> > > > > > >
> > > > > > >       /* We called __netif_napi_del(),
> > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > > > >       int i = vq2rxq(vq);
> > > > > > >
> > > > > > > -     if (vi->mergeable_rx_bufs)
> > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > -     else if (vi->big_packets)
> > > > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > > > +             if (vi->rq[i].page_pool) {
> > > > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > > +                                             virt_to_head_page(buf),
> > > > > > > +                                             true);
> > > > > > > +             } else {
> > > > > > > +                     put_page(virt_to_head_page(buf));
> > > > > > > +             }
> > > > > > > +     } else if (vi->big_packets) {
> > > > > > >               give_pages(&vi->rq[i], buf);
> > > > > > > -     else
> > > > > > > +     } else {
> > > > > > >               put_page(virt_to_head_page(buf));
> > > > > > > +     }
> > > > > > >  }
> > > > > > >
> > > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > > >       virtnet_free_queues(vi);
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > > +{
> > > > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > > > +
> > > > > > > +     struct page_pool_params pp_params = {
> > > > > > > +             .order = 0,
> > > > > > > +             .pool_size = rq->vq->num_max,
> > > > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > > > +             .dev = vdev->dev.parent,
> > > > > > > +             .offset = 0,
> > > > > > > +     };
> > > > > > > +
> > > > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > > +                      PTR_ERR(rq->page_pool));
> > > > > > > +             rq->page_pool = NULL;
> > > > > > > +     }
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > > >   * least one full packet?
> > > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > > +
> > > > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > > +             else
> > > > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > > > +                              "page pool only support mergeable mode\n");
> > > > > > > +
> > > > > > >       }
> > > > > > >
> > > > > > >       /* run here: ret == 0. */
> > > > > > > --
> > > > > > > 2.31.1
> > > > > >
> > > >

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-06-07  9:08           ` Liang Chen
  2023-06-07  9:35             ` Xuan Zhuo
@ 2023-06-07 20:17             ` Michael S. Tsirkin
  2023-06-08  0:38               ` Jason Wang
  2023-06-09  2:57               ` Liang Chen
  1 sibling, 2 replies; 56+ messages in thread
From: Michael S. Tsirkin @ 2023-06-07 20:17 UTC (permalink / raw)
  To: Liang Chen
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote:
> On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> >
> > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > or disable the usage of page pool (disabled by default).
> > > > > >
> > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > in the normal path.
> > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > >
> > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > gain is observed in XDP cpumap:
> > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > >
> > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > DMA map/unmap support.
> > > > > >
> > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > >
> > > > > Why off by default?
> > > > > I am guessing it sometimes has performance costs too?
> > > > >
> > > > >
> > > > > What happens if we use page pool for big mode too?
> > > > > The less modes we have the better...
> > > > >
> > > > >
> > > >
> > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > packet size is very small, it reduces the likelihood of skb
> > > > coalescing. But such cases are rare.
> > >
> > > small packets are rare? These workloads are easy to create actually.
> > > Pls try and include benchmark with small packet size.
> > >
> >
> > Sure, Thanks!
> 
> Before going ahead and posting v2 patch, I would like to hear more
> advice for the cases of small packets. I have done more performance
> benchmark with small packets since then. Here is a list of iperf
> output,
> 
> With PP and PP fragmenting:
> 256:     [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
> 1K:      [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
> 2K:      [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
> 4K:      [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
> 8K:      [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
> 16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
> 32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
> 64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
> 128K:   [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
> 256K:   [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes
> 
> Without PP:
> 256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
> 1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
> 2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
> 4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
> 8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
> 16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
> 32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
> 64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
> 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
> 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes
> 
> 
> The major factor contributing to the performance drop is the reduction
> of skb coalescing. Additionally, without the page pool, small packets
> can still benefit from the allocation of 8 continuous pages by
> breaking them down into smaller pieces. This effectively reduces the
> frequency of page allocation from the buddy system. For instance, the
> arrival of 32 1K packets only triggers one alloc_page call. Therefore,
> the benefits of using a page pool are limited in such cases. In fact,
> without page pool fragmenting enabled, it can even hinder performance
> from this perspective.
> 
> Upon further consideration, I tend to believe making page pool the
> default option may not be appropriate. As you pointed out, we cannot
> simply ignore the performance impact on small packets. Any comments on
> this will be much appreciated.
> 
> 
> Thanks,
> Liang


So, let's only use page pool for XDP then?
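
For illustration, a minimal sketch (not part of the submitted patch) of one
way that could look: create the per-queue pool only while an XDP program is
attached, e.g. from virtnet_xdp_set() under rtnl, reusing the rq->page_pool
field and the virtnet_alloc_page_pool() helper this patch adds. Draining
buffers already posted to the vq from a pool being torn down is glossed
over here.

static void virtnet_xdp_update_page_pools(struct virtnet_info *vi, bool xdp_on)
{
        int i;

        if (!vi->mergeable_rx_bufs)
                return;

        for (i = 0; i < vi->max_queue_pairs; i++) {
                struct receive_queue *rq = &vi->rq[i];

                if (xdp_on && !rq->page_pool) {
                        /* helper added by this patch */
                        virtnet_alloc_page_pool(rq);
                } else if (!xdp_on && rq->page_pool) {
                        /* assumes buffers from this pool were drained first */
                        page_pool_destroy(rq->page_pool);
                        rq->page_pool = NULL;
                }
        }
}

That would keep the regular (non-XDP) datapath on the existing page_frag
scheme, which is where the small-packet regression above shows up.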

> 
> > > > The usage of page pool for big mode is being evaluated now. Thanks!
> > > >
> > > > > > ---
> > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > >
> > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > --- a/drivers/net/virtio_net.c
> > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > >  module_param(gso, bool, 0444);
> > > > > >  module_param(napi_tx, bool, 0644);
> > > > > >
> > > > > > +static bool page_pool_enabled;
> > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > +
> > > > > >  /* FIXME: MTU in config. */
> > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > >  #define GOOD_COPY_LEN        128
> > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > >       /* Chain pages by the private ptr. */
> > > > > >       struct page *pages;
> > > > > >
> > > > > > +     /* Page pool */
> > > > > > +     struct page_pool *page_pool;
> > > > > > +
> > > > > >       /* Average packet length for mergeable receive buffers. */
> > > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > >
> > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > >       return skb;
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > +{
> > > > > > +     if (rq->page_pool)
> > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > +     else
> > > > > > +             put_page(page);
> > > > > > +}
> > > > > > +
> > > > > >  /* Called from bottom half context */
> > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > >                                  struct receive_queue *rq,
> > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > >       hdr = skb_vnet_hdr(skb);
> > > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > > >       if (page_to_free)
> > > > > > -             put_page(page_to_free);
> > > > > > +             virtnet_put_page(rq, page_to_free);
> > > > > >
> > > > > >       return skb;
> > > > > >  }
> > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > >       return ret;
> > > > > >  }
> > > > > >
> > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > >  {
> > > > > >       struct skb_shared_info *shinfo;
> > > > > >       struct page *xdp_page;
> > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > -                     put_page(xdp_page);
> > > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > > >               }
> > > > > >       }
> > > > > >  }
> > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > >               return NULL;
> > > > > >
> > > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > > +     if (rq->page_pool)
> > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > +     else
> > > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > > +
> > > > > >       if (!page)
> > > > > >               return NULL;
> > > > > >
> > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > >                * is sending packet larger than the MTU.
> > > > > >                */
> > > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > -                     put_page(p);
> > > > > > +                     virtnet_put_page(rq, p);
> > > > > >                       goto err_buf;
> > > > > >               }
> > > > > >
> > > > > >               memcpy(page_address(page) + page_off,
> > > > > >                      page_address(p) + off, buflen);
> > > > > >               page_off += buflen;
> > > > > > -             put_page(p);
> > > > > > +             virtnet_put_page(rq, p);
> > > > > >       }
> > > > > >
> > > > > >       /* Headroom does not contribute to packet length */
> > > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > >       return page;
> > > > > >  err_buf:
> > > > > > -     __free_pages(page, 0);
> > > > > > +     if (rq->page_pool)
> > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > +     else
> > > > > > +             __free_pages(page, 0);
> > > > > >       return NULL;
> > > > > >  }
> > > > > >
> > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > >               }
> > > > > >               stats->bytes += len;
> > > > > >               page = virt_to_head_page(buf);
> > > > > > -             put_page(page);
> > > > > > +             virtnet_put_page(rq, page);
> > > > > >       }
> > > > > >  }
> > > > > >
> > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > >               cur_frag_size = truesize;
> > > > > >               xdp_frags_truesz += cur_frag_size;
> > > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > -                     put_page(page);
> > > > > > +                     virtnet_put_page(rq, page);
> > > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > > >                       dev->stats.rx_length_errors++;
> > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > >       return 0;
> > > > > >
> > > > > >  err:
> > > > > > -     put_xdp_frags(xdp);
> > > > > > +     put_xdp_frags(xdp, rq);
> > > > > >       return -EINVAL;
> > > > > >  }
> > > > > >
> > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > > >                       return NULL;
> > > > > >
> > > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > +             if (rq->page_pool)
> > > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > +             else
> > > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > > >               if (!xdp_page)
> > > > > >                       return NULL;
> > > > > >
> > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > >
> > > > > >       *frame_sz = PAGE_SIZE;
> > > > > >
> > > > > > -     put_page(*page);
> > > > > > +     virtnet_put_page(rq, *page);
> > > > > >
> > > > > >       *page = xdp_page;
> > > > > >
> > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > >               if (unlikely(!head_skb))
> > > > > >                       break;
> > > > > > +             if (rq->page_pool)
> > > > > > +                     skb_mark_for_recycle(head_skb);
> > > > > >               return head_skb;
> > > > > >
> > > > > >       case XDP_TX:
> > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > >               break;
> > > > > >       }
> > > > > >
> > > > > > -     put_xdp_frags(&xdp);
> > > > > > +     put_xdp_frags(&xdp, rq);
> > > > > >
> > > > > >  err_xdp:
> > > > > > -     put_page(page);
> > > > > > +     virtnet_put_page(rq, page);
> > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > >
> > > > > >       stats->xdp_drops++;
> > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > >       curr_skb = head_skb;
> > > > > >
> > > > > > +     if (rq->page_pool)
> > > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > > +
> > > > > >       if (unlikely(!curr_skb))
> > > > > >               goto err_skb;
> > > > > >       while (--num_buf) {
> > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >                       curr_skb = nskb;
> > > > > >                       head_skb->truesize += nskb->truesize;
> > > > > >                       num_skb_frags = 0;
> > > > > > +                     if (rq->page_pool)
> > > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > > >               }
> > > > > >               if (curr_skb != head_skb) {
> > > > > >                       head_skb->data_len += len;
> > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >               }
> > > > > >               offset = buf - page_address(page);
> > > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > -                     put_page(page);
> > > > > > +                     virtnet_put_page(rq, page);
> > > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > >                                            len, truesize);
> > > > > >               } else {
> > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > >       return head_skb;
> > > > > >
> > > > > >  err_skb:
> > > > > > -     put_page(page);
> > > > > > +     virtnet_put_page(rq, page);
> > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > >
> > > > > >  err_buf:
> > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > > >        */
> > > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > -             return -ENOMEM;
> > > > > > +     if (rq->page_pool) {
> > > > > > +             struct page *page;
> > > > > >
> > > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > -     get_page(alloc_frag->page);
> > > > > > -     alloc_frag->offset += len + room;
> > > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > > -     if (hole < len + room) {
> > > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > > -              * enough space for another buffer, add the remaining space to
> > > > > > -              * the current buffer.
> > > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > -              */
> > > > > > -             if (!headroom)
> > > > > > -                     len += hole;
> > > > > > -             alloc_frag->offset += hole;
> > > > > > -     }
> > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > +             if (unlikely(!page))
> > > > > > +                     return -ENOMEM;
> > > > > > +             buf = (char *)page_address(page);
> > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > +     } else {
> > > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > +                     return -ENOMEM;
> > > > > >
> > > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > +             get_page(alloc_frag->page);
> > > > > > +             alloc_frag->offset += len + room;
> > > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > > +             if (hole < len + room) {
> > > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > > +                      * the current buffer.
> > > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > +                      */
> > > > > > +                     if (!headroom)
> > > > > > +                             len += hole;
> > > > > > +                     alloc_frag->offset += hole;
> > > > > > +             }
> > > > > > +     }
> > > > > >       sg_init_one(rq->sg, buf, len);
> > > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > >       if (err < 0)
> > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > >
> > > > > >       return err;
> > > > > >  }
> > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > >       if (err < 0)
> > > > > >               return err;
> > > > > >
> > > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > > +     else
> > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > > +                                              NULL);
> > > > > > +
> > > > > >       if (err < 0)
> > > > > >               goto err_xdp_reg_mem_model;
> > > > > >
> > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > > >               }
> > > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > > >               break;
> > > > > >       }
> > > > > >  }
> > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > >       switch (sset) {
> > > > > >       case ETH_SS_STATS:
> > > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > > >       default:
> > > > > >               return -EOPNOTSUPP;
> > > > > >       }
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > +{
> > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > > +     struct page_pool_stats pp_stats = {};
> > > > > > +     int i;
> > > > > > +
> > > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > +             if (!vi->rq[i].page_pool)
> > > > > > +                     continue;
> > > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > +     }
> > > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > +}
> > > > > > +
> > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > > >  {
> > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > > >       }
> > > > > > +
> > > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > >  }
> > > > > >
> > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > > +             if (vi->rq[i].page_pool)
> > > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > > >       }
> > > > > >
> > > > > >       /* We called __netif_napi_del(),
> > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > > >       int i = vq2rxq(vq);
> > > > > >
> > > > > > -     if (vi->mergeable_rx_bufs)
> > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > -     else if (vi->big_packets)
> > > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > > +             if (vi->rq[i].page_pool) {
> > > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > +                                             virt_to_head_page(buf),
> > > > > > +                                             true);
> > > > > > +             } else {
> > > > > > +                     put_page(virt_to_head_page(buf));
> > > > > > +             }
> > > > > > +     } else if (vi->big_packets) {
> > > > > >               give_pages(&vi->rq[i], buf);
> > > > > > -     else
> > > > > > +     } else {
> > > > > >               put_page(virt_to_head_page(buf));
> > > > > > +     }
> > > > > >  }
> > > > > >
> > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > >       virtnet_free_queues(vi);
> > > > > >  }
> > > > > >
> > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > +{
> > > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > > +
> > > > > > +     struct page_pool_params pp_params = {
> > > > > > +             .order = 0,
> > > > > > +             .pool_size = rq->vq->num_max,
> > > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > > +             .dev = vdev->dev.parent,
> > > > > > +             .offset = 0,
> > > > > > +     };
> > > > > > +
> > > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > +                      PTR_ERR(rq->page_pool));
> > > > > > +             rq->page_pool = NULL;
> > > > > > +     }
> > > > > > +}
> > > > > > +
> > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > >   * least one full packet?
> > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > +
> > > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > +             else
> > > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > > +                              "page pool only support mergeable mode\n");
> > > > > > +
> > > > > >       }
> > > > > >
> > > > > >       /* run here: ret == 0. */
> > > > > > --
> > > > > > 2.31.1
> > > > >
> > >


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-06-07 20:17             ` Michael S. Tsirkin
@ 2023-06-08  0:38               ` Jason Wang
  2023-06-08  3:54                 ` Xuan Zhuo
  2023-06-09  2:57                 ` Liang Chen
  2023-06-09  2:57               ` Liang Chen
  1 sibling, 2 replies; 56+ messages in thread
From: Jason Wang @ 2023-06-08  0:38 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Liang Chen, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote:
> > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > >
> > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > >
> > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > in the normal path.
> > > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > >
> > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > gain is observed in XDP cpumap:
> > > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > >
> > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > DMA map/unmap support.
> > > > > > >
> > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > >
> > > > > > Why off by default?
> > > > > > I am guessing it sometimes has performance costs too?
> > > > > >
> > > > > >
> > > > > > What happens if we use page pool for big mode too?
> > > > > > The less modes we have the better...
> > > > > >
> > > > > >
> > > > >
> > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > packet size is very small, it reduces the likelihood of skb
> > > > > coalescing. But such cases are rare.
> > > >
> > > > small packets are rare? These workloads are easy to create actually.
> > > > Pls try and include benchmark with small packet size.
> > > >
> > >
> > > Sure, Thanks!
> >
> > Before going ahead and posting a v2 patch, I would like to hear more
> > advice on the case of small packets. I have done more performance
> > benchmarking with small packets since then. Here is the iperf
> > output:
> >
> > With PP and PP fragmenting:
> > 256:     [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
> > 1K:      [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
> > 2K:      [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
> > 4K:      [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
> > 8K:      [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
> > 16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
> > 32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
> > 64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
> > 128K:    [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
> > 256K:    [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes

Note that the virtio-net driver lacks things like BQL, so it might
suffer from bufferbloat in TCP benchmarks. Would you mind measuring
with e.g. testpmd on the vhost side to see the rx PPS?

> >
> > Without PP:
> > 256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
> > 1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
> > 2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
> > 4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
> > 8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
> > 16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
> > 32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
> > 64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
> > 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
> > 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes
> >
> >
> > The major factor contributing to the performance drop is the reduction
> > of skb coalescing. Additionally, without the page pool, small packets
> > can still benefit from the allocation of 8 contiguous pages, which are
> > then broken down into smaller pieces. This effectively reduces the
> > frequency of page allocation from the buddy system. For instance, the
> > arrival of 32 1K packets triggers only one alloc_page call. Therefore,
> > the benefits of using a page pool are limited in such cases.

I wonder if we can improve page pool in this case anyhow.
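
For reference, the amortization described above can be modelled outside
the kernel. The snippet below is only a rough userspace sketch of the
frag-refill accounting (a 32 KiB backing area carved into 1 KiB buffers)
versus taking one order-0 page per packet; it is not driver code, and the
sizes are assumptions taken from the 1K case above.

#include <stdio.h>

#define FRAG_SIZE  (32 * 1024)  /* order-3 backing allocation used by the frag refill path */
#define BUF_SIZE   (1 * 1024)   /* small receive buffer */
#define NPACKETS   32

int main(void)
{
	unsigned int offset = FRAG_SIZE;  /* force a refill on the first packet */
	unsigned int frag_allocs = 0, page_allocs = 0;
	int i;

	for (i = 0; i < NPACKETS; i++) {
		/* frag path: a new backing allocation only when the frag is used up */
		if (offset + BUF_SIZE > FRAG_SIZE) {
			frag_allocs++;
			offset = 0;
		}
		offset += BUF_SIZE;

		/* page-per-packet path (order-0 page pool in this series) */
		page_allocs++;
	}

	/* prints: frag refills: 1, page allocations: 32 */
	printf("frag refills: %u, page allocations: %u\n", frag_allocs, page_allocs);
	return 0;
}

So for 1 KiB packets the frag path touches the buddy allocator once per
32 packets, while a page-per-packet pool still has to produce 32 pages
(recycled or not), which matches the explanation above.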

> In fact,
> > without page pool fragmenting enabled, it can even hinder performance
> > from this perspective.
> >
> > Upon further consideration, I tend to believe making page pool the
> > default option may not be appropriate. As you pointed out, we cannot
> > simply ignore the performance impact on small packets. Any comments on
> > this will be much appreciated.
> >
> >
> > Thanks,
> > Liang
>
>
> So, let's only use page pool for XDP then?

+1

We can start from this.

Thanks

>
> >
> > > > > The usage of page pool for big mode is being evaluated now. Thanks!
> > > > >
> > > > > > > ---
> > > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > > >  module_param(gso, bool, 0444);
> > > > > > >  module_param(napi_tx, bool, 0644);
> > > > > > >
> > > > > > > +static bool page_pool_enabled;
> > > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > > +
> > > > > > >  /* FIXME: MTU in config. */
> > > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > > >  #define GOOD_COPY_LEN        128
> > > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > > >       /* Chain pages by the private ptr. */
> > > > > > >       struct page *pages;
> > > > > > >
> > > > > > > +     /* Page pool */
> > > > > > > +     struct page_pool *page_pool;
> > > > > > > +
> > > > > > >       /* Average packet length for mergeable receive buffers. */
> > > > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > > >
> > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > > >       return skb;
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > > +{
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > +     else
> > > > > > > +             put_page(page);
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Called from bottom half context */
> > > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > >                                  struct receive_queue *rq,
> > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > >       hdr = skb_vnet_hdr(skb);
> > > > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > > > >       if (page_to_free)
> > > > > > > -             put_page(page_to_free);
> > > > > > > +             virtnet_put_page(rq, page_to_free);
> > > > > > >
> > > > > > >       return skb;
> > > > > > >  }
> > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > > >       return ret;
> > > > > > >  }
> > > > > > >
> > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > > >  {
> > > > > > >       struct skb_shared_info *shinfo;
> > > > > > >       struct page *xdp_page;
> > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > > -                     put_page(xdp_page);
> > > > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > > > >               }
> > > > > > >       }
> > > > > > >  }
> > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > > >               return NULL;
> > > > > > >
> > > > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +     else
> > > > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > > > +
> > > > > > >       if (!page)
> > > > > > >               return NULL;
> > > > > > >
> > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > >                * is sending packet larger than the MTU.
> > > > > > >                */
> > > > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > > -                     put_page(p);
> > > > > > > +                     virtnet_put_page(rq, p);
> > > > > > >                       goto err_buf;
> > > > > > >               }
> > > > > > >
> > > > > > >               memcpy(page_address(page) + page_off,
> > > > > > >                      page_address(p) + off, buflen);
> > > > > > >               page_off += buflen;
> > > > > > > -             put_page(p);
> > > > > > > +             virtnet_put_page(rq, p);
> > > > > > >       }
> > > > > > >
> > > > > > >       /* Headroom does not contribute to packet length */
> > > > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > > >       return page;
> > > > > > >  err_buf:
> > > > > > > -     __free_pages(page, 0);
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > +     else
> > > > > > > +             __free_pages(page, 0);
> > > > > > >       return NULL;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > > >               }
> > > > > > >               stats->bytes += len;
> > > > > > >               page = virt_to_head_page(buf);
> > > > > > > -             put_page(page);
> > > > > > > +             virtnet_put_page(rq, page);
> > > > > > >       }
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > >               cur_frag_size = truesize;
> > > > > > >               xdp_frags_truesz += cur_frag_size;
> > > > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > > -                     put_page(page);
> > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > > > >                       dev->stats.rx_length_errors++;
> > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > >       return 0;
> > > > > > >
> > > > > > >  err:
> > > > > > > -     put_xdp_frags(xdp);
> > > > > > > +     put_xdp_frags(xdp, rq);
> > > > > > >       return -EINVAL;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > > > >                       return NULL;
> > > > > > >
> > > > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > +             if (rq->page_pool)
> > > > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +             else
> > > > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > >               if (!xdp_page)
> > > > > > >                       return NULL;
> > > > > > >
> > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > >
> > > > > > >       *frame_sz = PAGE_SIZE;
> > > > > > >
> > > > > > > -     put_page(*page);
> > > > > > > +     virtnet_put_page(rq, *page);
> > > > > > >
> > > > > > >       *page = xdp_page;
> > > > > > >
> > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > > >               if (unlikely(!head_skb))
> > > > > > >                       break;
> > > > > > > +             if (rq->page_pool)
> > > > > > > +                     skb_mark_for_recycle(head_skb);
> > > > > > >               return head_skb;
> > > > > > >
> > > > > > >       case XDP_TX:
> > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > >               break;
> > > > > > >       }
> > > > > > >
> > > > > > > -     put_xdp_frags(&xdp);
> > > > > > > +     put_xdp_frags(&xdp, rq);
> > > > > > >
> > > > > > >  err_xdp:
> > > > > > > -     put_page(page);
> > > > > > > +     virtnet_put_page(rq, page);
> > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > >
> > > > > > >       stats->xdp_drops++;
> > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > > >       curr_skb = head_skb;
> > > > > > >
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > > > +
> > > > > > >       if (unlikely(!curr_skb))
> > > > > > >               goto err_skb;
> > > > > > >       while (--num_buf) {
> > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >                       curr_skb = nskb;
> > > > > > >                       head_skb->truesize += nskb->truesize;
> > > > > > >                       num_skb_frags = 0;
> > > > > > > +                     if (rq->page_pool)
> > > > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > > > >               }
> > > > > > >               if (curr_skb != head_skb) {
> > > > > > >                       head_skb->data_len += len;
> > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >               }
> > > > > > >               offset = buf - page_address(page);
> > > > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > > -                     put_page(page);
> > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > > >                                            len, truesize);
> > > > > > >               } else {
> > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >       return head_skb;
> > > > > > >
> > > > > > >  err_skb:
> > > > > > > -     put_page(page);
> > > > > > > +     virtnet_put_page(rq, page);
> > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > >
> > > > > > >  err_buf:
> > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > > > >        */
> > > > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > -             return -ENOMEM;
> > > > > > > +     if (rq->page_pool) {
> > > > > > > +             struct page *page;
> > > > > > >
> > > > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > -     get_page(alloc_frag->page);
> > > > > > > -     alloc_frag->offset += len + room;
> > > > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > -     if (hole < len + room) {
> > > > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > > > -              * enough space for another buffer, add the remaining space to
> > > > > > > -              * the current buffer.
> > > > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > -              */
> > > > > > > -             if (!headroom)
> > > > > > > -                     len += hole;
> > > > > > > -             alloc_frag->offset += hole;
> > > > > > > -     }
> > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +             if (unlikely(!page))
> > > > > > > +                     return -ENOMEM;
> > > > > > > +             buf = (char *)page_address(page);
> > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > +     } else {
> > > > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > +                     return -ENOMEM;
> > > > > > >
> > > > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > +             get_page(alloc_frag->page);
> > > > > > > +             alloc_frag->offset += len + room;
> > > > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > +             if (hole < len + room) {
> > > > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > > > +                      * the current buffer.
> > > > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > +                      */
> > > > > > > +                     if (!headroom)
> > > > > > > +                             len += hole;
> > > > > > > +                     alloc_frag->offset += hole;
> > > > > > > +             }
> > > > > > > +     }
> > > > > > >       sg_init_one(rq->sg, buf, len);
> > > > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > >       if (err < 0)
> > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > > >
> > > > > > >       return err;
> > > > > > >  }
> > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > >       if (err < 0)
> > > > > > >               return err;
> > > > > > >
> > > > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > > > +     else
> > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > > > +                                              NULL);
> > > > > > > +
> > > > > > >       if (err < 0)
> > > > > > >               goto err_xdp_reg_mem_model;
> > > > > > >
> > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > > > >               }
> > > > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > > > >               break;
> > > > > > >       }
> > > > > > >  }
> > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > > >       switch (sset) {
> > > > > > >       case ETH_SS_STATS:
> > > > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > > > >       default:
> > > > > > >               return -EOPNOTSUPP;
> > > > > > >       }
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > > +{
> > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > > > +     struct page_pool_stats pp_stats = {};
> > > > > > > +     int i;
> > > > > > > +
> > > > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > > +             if (!vi->rq[i].page_pool)
> > > > > > > +                     continue;
> > > > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > > +     }
> > > > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > > +}
> > > > > > > +
> > > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > > > >  {
> > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > > > >       }
> > > > > > > +
> > > > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > > >  }
> > > > > > >
> > > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > > > +             if (vi->rq[i].page_pool)
> > > > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > > > >       }
> > > > > > >
> > > > > > >       /* We called __netif_napi_del(),
> > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > > > >       int i = vq2rxq(vq);
> > > > > > >
> > > > > > > -     if (vi->mergeable_rx_bufs)
> > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > -     else if (vi->big_packets)
> > > > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > > > +             if (vi->rq[i].page_pool) {
> > > > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > > +                                             virt_to_head_page(buf),
> > > > > > > +                                             true);
> > > > > > > +             } else {
> > > > > > > +                     put_page(virt_to_head_page(buf));
> > > > > > > +             }
> > > > > > > +     } else if (vi->big_packets) {
> > > > > > >               give_pages(&vi->rq[i], buf);
> > > > > > > -     else
> > > > > > > +     } else {
> > > > > > >               put_page(virt_to_head_page(buf));
> > > > > > > +     }
> > > > > > >  }
> > > > > > >
> > > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > > >       virtnet_free_queues(vi);
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > > +{
> > > > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > > > +
> > > > > > > +     struct page_pool_params pp_params = {
> > > > > > > +             .order = 0,
> > > > > > > +             .pool_size = rq->vq->num_max,
> > > > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > > > +             .dev = vdev->dev.parent,
> > > > > > > +             .offset = 0,
> > > > > > > +     };
> > > > > > > +
> > > > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > > +                      PTR_ERR(rq->page_pool));
> > > > > > > +             rq->page_pool = NULL;
> > > > > > > +     }
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > > >   * least one full packet?
> > > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > > +
> > > > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > > +             else
> > > > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > > > +                              "page pool only support mergeable mode\n");
> > > > > > > +
> > > > > > >       }
> > > > > > >
> > > > > > >       /* run here: ret == 0. */
> > > > > > > --
> > > > > > > 2.31.1
> > > > > >
> > > >
>


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-06-08  0:38               ` Jason Wang
@ 2023-06-08  3:54                 ` Xuan Zhuo
  2023-06-09  2:57                 ` Liang Chen
  1 sibling, 0 replies; 56+ messages in thread
From: Xuan Zhuo @ 2023-06-08  3:54 UTC (permalink / raw)
  To: Jason Wang
  Cc: Liang Chen, virtualization, netdev, linux-kernel, kuba, edumazet,
	davem, pabeni, alexander.duyck, Michael S. Tsirkin

On Thu, 8 Jun 2023 08:38:14 +0800, Jason Wang <jasowang@redhat.com> wrote:
> On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote:
> > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > > >
> > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > > >
> > > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > > in the normal path.
> > > > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > > >
> > > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > > gain is observed in XDP cpumap:
> > > > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > > >
> > > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > > DMA map/unmap support.
> > > > > > > >
> > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > > >
> > > > > > > Why off by default?
> > > > > > > I am guessing it sometimes has performance costs too?
> > > > > > >
> > > > > > >
> > > > > > > What happens if we use page pool for big mode too?
> > > > > > > The less modes we have the better...
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > > packet size is very small, it reduces the likelihood of skb
> > > > > > coalescing. But such cases are rare.
> > > > >
> > > > > small packets are rare? These workloads are easy to create actually.
> > > > > Pls try and include benchmark with small packet size.
> > > > >
> > > >
> > > > Sure, Thanks!
> > >
> > > Before going ahead and posting a v2 patch, I would like to hear more
> > > advice on the case of small packets. I have done more performance
> > > benchmarking with small packets since then. Here is the iperf
> > > output:
> > >
> > > With PP and PP fragmenting:
> > > 256:     [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
> > > 1K:      [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
> > > 2K:      [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
> > > 4K:      [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
> > > 8K:      [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
> > > 16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
> > > 32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
> > > 64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
> > > 128K:    [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
> > > 256K:    [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes
>
> Note that the virtio-net driver lacks things like BQL, so it might
> suffer from bufferbloat in TCP benchmarks. Would you mind measuring
> with e.g. testpmd on the vhost side to see the rx PPS?
>
> > >
> > > Without PP:
> > > 256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
> > > 1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
> > > 2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
> > > 4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
> > > 8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
> > > 16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
> > > 32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
> > > 64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
> > > 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
> > > 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes
> > >
> > >
> > > The major factor contributing to the performance drop is the reduction
> > > of skb coalescing. Additionally, without the page pool, small packets
> > > can still benefit from the allocation of 8 contiguous pages, which are
> > > then broken down into smaller pieces. This effectively reduces the
> > > frequency of page allocation from the buddy system. For instance, the
> > > arrival of 32 1K packets triggers only one alloc_page call. Therefore,
> > > the benefits of using a page pool are limited in such cases.
>
> I wonder if we can improve page pool in this case anyhow.
>
> > In fact,
> > > without page pool fragmenting enabled, it can even hinder performance
> > > from this perspective.
> > >
> > > Upon further consideration, I tend to believe making page pool the
> > > default option may not be appropriate. As you pointed out, we cannot
> > > simply ignore the performance impact on small packets. Any comments on
> > > this will be much appreciated.
> > >
> > >
> > > Thanks,
> > > Liang
> >
> >
> > So, let's only use page pool for XDP then?
>
> +1

+1

Thanks.

>
> We can start from this.
>
> Thanks
>
> >
> > >
> > > > > > The usage of page pool for big mode is being evaluated now. Thanks!
> > > > > >
> > > > > > > > ---
> > > > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > > > >  module_param(gso, bool, 0444);
> > > > > > > >  module_param(napi_tx, bool, 0644);
> > > > > > > >
> > > > > > > > +static bool page_pool_enabled;
> > > > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > > > +
> > > > > > > >  /* FIXME: MTU in config. */
> > > > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > > > >  #define GOOD_COPY_LEN        128
> > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > > > >       /* Chain pages by the private ptr. */
> > > > > > > >       struct page *pages;
> > > > > > > >
> > > > > > > > +     /* Page pool */
> > > > > > > > +     struct page_pool *page_pool;
> > > > > > > > +
> > > > > > > >       /* Average packet length for mergeable receive buffers. */
> > > > > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > > > >
> > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > > > >       return skb;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > > > +{
> > > > > > > > +     if (rq->page_pool)
> > > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > > +     else
> > > > > > > > +             put_page(page);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /* Called from bottom half context */
> > > > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > >                                  struct receive_queue *rq,
> > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > >       hdr = skb_vnet_hdr(skb);
> > > > > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > > > > >       if (page_to_free)
> > > > > > > > -             put_page(page_to_free);
> > > > > > > > +             virtnet_put_page(rq, page_to_free);
> > > > > > > >
> > > > > > > >       return skb;
> > > > > > > >  }
> > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > > > >       return ret;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > > > >  {
> > > > > > > >       struct skb_shared_info *shinfo;
> > > > > > > >       struct page *xdp_page;
> > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > > > -                     put_page(xdp_page);
> > > > > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > > > > >               }
> > > > > > > >       }
> > > > > > > >  }
> > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > > > >               return NULL;
> > > > > > > >
> > > > > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > > > > +     if (rq->page_pool)
> > > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > +     else
> > > > > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > > > > +
> > > > > > > >       if (!page)
> > > > > > > >               return NULL;
> > > > > > > >
> > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > >                * is sending packet larger than the MTU.
> > > > > > > >                */
> > > > > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > > > -                     put_page(p);
> > > > > > > > +                     virtnet_put_page(rq, p);
> > > > > > > >                       goto err_buf;
> > > > > > > >               }
> > > > > > > >
> > > > > > > >               memcpy(page_address(page) + page_off,
> > > > > > > >                      page_address(p) + off, buflen);
> > > > > > > >               page_off += buflen;
> > > > > > > > -             put_page(p);
> > > > > > > > +             virtnet_put_page(rq, p);
> > > > > > > >       }
> > > > > > > >
> > > > > > > >       /* Headroom does not contribute to packet length */
> > > > > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > > > >       return page;
> > > > > > > >  err_buf:
> > > > > > > > -     __free_pages(page, 0);
> > > > > > > > +     if (rq->page_pool)
> > > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > > +     else
> > > > > > > > +             __free_pages(page, 0);
> > > > > > > >       return NULL;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > > > >               }
> > > > > > > >               stats->bytes += len;
> > > > > > > >               page = virt_to_head_page(buf);
> > > > > > > > -             put_page(page);
> > > > > > > > +             virtnet_put_page(rq, page);
> > > > > > > >       }
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > >               cur_frag_size = truesize;
> > > > > > > >               xdp_frags_truesz += cur_frag_size;
> > > > > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > > > -                     put_page(page);
> > > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > > > > >                       dev->stats.rx_length_errors++;
> > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > >       return 0;
> > > > > > > >
> > > > > > > >  err:
> > > > > > > > -     put_xdp_frags(xdp);
> > > > > > > > +     put_xdp_frags(xdp, rq);
> > > > > > > >       return -EINVAL;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > > > > >                       return NULL;
> > > > > > > >
> > > > > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > > +             if (rq->page_pool)
> > > > > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > +             else
> > > > > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > >               if (!xdp_page)
> > > > > > > >                       return NULL;
> > > > > > > >
> > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > > >
> > > > > > > >       *frame_sz = PAGE_SIZE;
> > > > > > > >
> > > > > > > > -     put_page(*page);
> > > > > > > > +     virtnet_put_page(rq, *page);
> > > > > > > >
> > > > > > > >       *page = xdp_page;
> > > > > > > >
> > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > > > >               if (unlikely(!head_skb))
> > > > > > > >                       break;
> > > > > > > > +             if (rq->page_pool)
> > > > > > > > +                     skb_mark_for_recycle(head_skb);
> > > > > > > >               return head_skb;
> > > > > > > >
> > > > > > > >       case XDP_TX:
> > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > > >               break;
> > > > > > > >       }
> > > > > > > >
> > > > > > > > -     put_xdp_frags(&xdp);
> > > > > > > > +     put_xdp_frags(&xdp, rq);
> > > > > > > >
> > > > > > > >  err_xdp:
> > > > > > > > -     put_page(page);
> > > > > > > > +     virtnet_put_page(rq, page);
> > > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > > >
> > > > > > > >       stats->xdp_drops++;
> > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > > > >       curr_skb = head_skb;
> > > > > > > >
> > > > > > > > +     if (rq->page_pool)
> > > > > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > > > > +
> > > > > > > >       if (unlikely(!curr_skb))
> > > > > > > >               goto err_skb;
> > > > > > > >       while (--num_buf) {
> > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > >                       curr_skb = nskb;
> > > > > > > >                       head_skb->truesize += nskb->truesize;
> > > > > > > >                       num_skb_frags = 0;
> > > > > > > > +                     if (rq->page_pool)
> > > > > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > > > > >               }
> > > > > > > >               if (curr_skb != head_skb) {
> > > > > > > >                       head_skb->data_len += len;
> > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > >               }
> > > > > > > >               offset = buf - page_address(page);
> > > > > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > > > -                     put_page(page);
> > > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > > > >                                            len, truesize);
> > > > > > > >               } else {
> > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > >       return head_skb;
> > > > > > > >
> > > > > > > >  err_skb:
> > > > > > > > -     put_page(page);
> > > > > > > > +     virtnet_put_page(rq, page);
> > > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > > >
> > > > > > > >  err_buf:
> > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > > > > >        */
> > > > > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > > -             return -ENOMEM;
> > > > > > > > +     if (rq->page_pool) {
> > > > > > > > +             struct page *page;
> > > > > > > >
> > > > > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > -     get_page(alloc_frag->page);
> > > > > > > > -     alloc_frag->offset += len + room;
> > > > > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > > -     if (hole < len + room) {
> > > > > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > > > > -              * enough space for another buffer, add the remaining space to
> > > > > > > > -              * the current buffer.
> > > > > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > > -              */
> > > > > > > > -             if (!headroom)
> > > > > > > > -                     len += hole;
> > > > > > > > -             alloc_frag->offset += hole;
> > > > > > > > -     }
> > > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > +             if (unlikely(!page))
> > > > > > > > +                     return -ENOMEM;
> > > > > > > > +             buf = (char *)page_address(page);
> > > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > +     } else {
> > > > > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > > +                     return -ENOMEM;
> > > > > > > >
> > > > > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > +             get_page(alloc_frag->page);
> > > > > > > > +             alloc_frag->offset += len + room;
> > > > > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > > +             if (hole < len + room) {
> > > > > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > > > > +                      * the current buffer.
> > > > > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > > +                      */
> > > > > > > > +                     if (!headroom)
> > > > > > > > +                             len += hole;
> > > > > > > > +                     alloc_frag->offset += hole;
> > > > > > > > +             }
> > > > > > > > +     }
> > > > > > > >       sg_init_one(rq->sg, buf, len);
> > > > > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > >       if (err < 0)
> > > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > > > >
> > > > > > > >       return err;
> > > > > > > >  }
> > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > > >       if (err < 0)
> > > > > > > >               return err;
> > > > > > > >
> > > > > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > > > > +     else
> > > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > > > > +                                              NULL);
> > > > > > > > +
> > > > > > > >       if (err < 0)
> > > > > > > >               goto err_xdp_reg_mem_model;
> > > > > > > >
> > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > > > > >               }
> > > > > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > > > > >               break;
> > > > > > > >       }
> > > > > > > >  }
> > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > > > >       switch (sset) {
> > > > > > > >       case ETH_SS_STATS:
> > > > > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > > > > >       default:
> > > > > > > >               return -EOPNOTSUPP;
> > > > > > > >       }
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > > > +{
> > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > > > > +     struct page_pool_stats pp_stats = {};
> > > > > > > > +     int i;
> > > > > > > > +
> > > > > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > > > +             if (!vi->rq[i].page_pool)
> > > > > > > > +                     continue;
> > > > > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > > > +     }
> > > > > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > > > > >  {
> > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > > > > >       }
> > > > > > > > +
> > > > > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > +             if (vi->rq[i].page_pool)
> > > > > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > > > > >       }
> > > > > > > >
> > > > > > > >       /* We called __netif_napi_del(),
> > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > > > > >       int i = vq2rxq(vq);
> > > > > > > >
> > > > > > > > -     if (vi->mergeable_rx_bufs)
> > > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > > -     else if (vi->big_packets)
> > > > > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > > > > +             if (vi->rq[i].page_pool) {
> > > > > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > > > +                                             virt_to_head_page(buf),
> > > > > > > > +                                             true);
> > > > > > > > +             } else {
> > > > > > > > +                     put_page(virt_to_head_page(buf));
> > > > > > > > +             }
> > > > > > > > +     } else if (vi->big_packets) {
> > > > > > > >               give_pages(&vi->rq[i], buf);
> > > > > > > > -     else
> > > > > > > > +     } else {
> > > > > > > >               put_page(virt_to_head_page(buf));
> > > > > > > > +     }
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > > > >       virtnet_free_queues(vi);
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > > > +{
> > > > > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > > > > +
> > > > > > > > +     struct page_pool_params pp_params = {
> > > > > > > > +             .order = 0,
> > > > > > > > +             .pool_size = rq->vq->num_max,
> > > > > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > > > > +             .dev = vdev->dev.parent,
> > > > > > > > +             .offset = 0,
> > > > > > > > +     };
> > > > > > > > +
> > > > > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > > > +                      PTR_ERR(rq->page_pool));
> > > > > > > > +             rq->page_pool = NULL;
> > > > > > > > +     }
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > > > >   * least one full packet?
> > > > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > > > +
> > > > > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > > > +             else
> > > > > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > > > > +                              "page pool only support mergeable mode\n");
> > > > > > > > +
> > > > > > > >       }
> > > > > > > >
> > > > > > > >       /* run here: ret == 0. */
> > > > > > > > --
> > > > > > > > 2.31.1
> > > > > > >
> > > > >
> >
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-06-07 20:17             ` Michael S. Tsirkin
  2023-06-08  0:38               ` Jason Wang
@ 2023-06-09  2:57               ` Liang Chen
  1 sibling, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-06-09  2:57 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: jasowang, virtualization, netdev, linux-kernel, xuanzhuo, kuba,
	edumazet, davem, pabeni, alexander.duyck

On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote:
>
> On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote:
> > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > >
> > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > >
> > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > in the normal path.
> > > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > >
> > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > gain is observed in XDP cpumap:
> > > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > >
> > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > DMA map/unmap support.
> > > > > > >
> > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > >
> > > > > > Why off by default?
> > > > > > I am guessing it sometimes has performance costs too?
> > > > > >
> > > > > >
> > > > > > What happens if we use page pool for big mode too?
> > > > > > The less modes we have the better...
> > > > > >
> > > > > >
> > > > >
> > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > packet size is very small, it reduces the likelihood of skb
> > > > > coalescing. But such cases are rare.
> > > >
> > > > small packets are rare? These workloads are easy to create actually.
> > > > Pls try and include benchmark with small packet size.
> > > >
> > >
> > > Sure, Thanks!
> >
> > Before going ahead and posting v2 patch, I would like to hear more
> > advice for the cases of small packets. I have done more performance
> > benchmark with small packets since then. Here is a list of iperf
> > output,
> >
> > With PP and PP fragmenting:
> > 256:     [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
> > 1K:      [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
> > 2K:      [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
> > 4K:      [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
> > 8K:      [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
> > 16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
> > 32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
> > 64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
> > 128K:   [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
> > 256K:   [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes
> >
> > Without PP:
> > 256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
> > 1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
> > 2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
> > 4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
> > 8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
> > 16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
> > 32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
> > 64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
> > 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
> > 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes
> >
> >
> > The major factor contributing to the performance drop is the reduction
> > of skb coalescing. Additionally, without the page pool, small packets
> > can still benefit from the allocation of 8 continuous pages by
> > breaking them down into smaller pieces. This effectively reduces the
> > frequency of page allocation from the buddy system. For instance, the
> > arrival of 32 1K packets only triggers one alloc_page call. Therefore,
> > the benefits of using a page pool are limited in such cases. In fact,
> > without page pool fragmenting enabled, it can even hinder performance
> > from this perspective.
> >
> > Upon further consideration, I tend to believe making page pool the
> > default option may not be appropriate. As you pointed out, we cannot
> > simply ignore the performance impact on small packets. Any comments on
> > this will be much appreciated.
> >
> >
> > Thanks,
> > Liang
>
>
> So, let's only use page pool for XDP then?
>

Sure. We will prepare v2 for XDP only, and come back later for the
normal path when skb coalescing works better with page pool.
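
A very rough sketch of what that gating could look like -- purely
illustrative, not the actual v2 code. virtnet_setup_xdp_page_pools()
is a made-up helper that reuses virtnet_alloc_page_pool() from this
patch and would be called from virtnet_xdp_set() while attaching a
program:

/* Illustrative only: create the per-queue pools when an XDP program
 * is being attached, so the normal (non-XDP) path keeps the current
 * page_frag allocator untouched.
 */
static void virtnet_setup_xdp_page_pools(struct virtnet_info *vi,
                                         struct bpf_prog *prog)
{
        int i;

        if (!prog || !vi->mergeable_rx_bufs)
                return;

        for (i = 0; i < vi->max_queue_pairs; i++)
                if (!vi->rq[i].page_pool)
                        virtnet_alloc_page_pool(&vi->rq[i]);
}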


> >
> > > > > The usage of page pool for big mode is being evaluated now. Thanks!
> > > > >
> > > > > > > ---
> > > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > > >
> > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > > >  module_param(gso, bool, 0444);
> > > > > > >  module_param(napi_tx, bool, 0644);
> > > > > > >
> > > > > > > +static bool page_pool_enabled;
> > > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > > +
> > > > > > >  /* FIXME: MTU in config. */
> > > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > > >  #define GOOD_COPY_LEN        128
> > > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > > >       /* Chain pages by the private ptr. */
> > > > > > >       struct page *pages;
> > > > > > >
> > > > > > > +     /* Page pool */
> > > > > > > +     struct page_pool *page_pool;
> > > > > > > +
> > > > > > >       /* Average packet length for mergeable receive buffers. */
> > > > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > > >
> > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > > >       return skb;
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > > +{
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > +     else
> > > > > > > +             put_page(page);
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* Called from bottom half context */
> > > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > >                                  struct receive_queue *rq,
> > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > >       hdr = skb_vnet_hdr(skb);
> > > > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > > > >       if (page_to_free)
> > > > > > > -             put_page(page_to_free);
> > > > > > > +             virtnet_put_page(rq, page_to_free);
> > > > > > >
> > > > > > >       return skb;
> > > > > > >  }
> > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > > >       return ret;
> > > > > > >  }
> > > > > > >
> > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > > >  {
> > > > > > >       struct skb_shared_info *shinfo;
> > > > > > >       struct page *xdp_page;
> > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > > -                     put_page(xdp_page);
> > > > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > > > >               }
> > > > > > >       }
> > > > > > >  }
> > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > > >               return NULL;
> > > > > > >
> > > > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +     else
> > > > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > > > +
> > > > > > >       if (!page)
> > > > > > >               return NULL;
> > > > > > >
> > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > >                * is sending packet larger than the MTU.
> > > > > > >                */
> > > > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > > -                     put_page(p);
> > > > > > > +                     virtnet_put_page(rq, p);
> > > > > > >                       goto err_buf;
> > > > > > >               }
> > > > > > >
> > > > > > >               memcpy(page_address(page) + page_off,
> > > > > > >                      page_address(p) + off, buflen);
> > > > > > >               page_off += buflen;
> > > > > > > -             put_page(p);
> > > > > > > +             virtnet_put_page(rq, p);
> > > > > > >       }
> > > > > > >
> > > > > > >       /* Headroom does not contribute to packet length */
> > > > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > > >       return page;
> > > > > > >  err_buf:
> > > > > > > -     __free_pages(page, 0);
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > +     else
> > > > > > > +             __free_pages(page, 0);
> > > > > > >       return NULL;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > > >               }
> > > > > > >               stats->bytes += len;
> > > > > > >               page = virt_to_head_page(buf);
> > > > > > > -             put_page(page);
> > > > > > > +             virtnet_put_page(rq, page);
> > > > > > >       }
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > >               cur_frag_size = truesize;
> > > > > > >               xdp_frags_truesz += cur_frag_size;
> > > > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > > -                     put_page(page);
> > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > > > >                       dev->stats.rx_length_errors++;
> > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > >       return 0;
> > > > > > >
> > > > > > >  err:
> > > > > > > -     put_xdp_frags(xdp);
> > > > > > > +     put_xdp_frags(xdp, rq);
> > > > > > >       return -EINVAL;
> > > > > > >  }
> > > > > > >
> > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > > > >                       return NULL;
> > > > > > >
> > > > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > +             if (rq->page_pool)
> > > > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +             else
> > > > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > >               if (!xdp_page)
> > > > > > >                       return NULL;
> > > > > > >
> > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > >
> > > > > > >       *frame_sz = PAGE_SIZE;
> > > > > > >
> > > > > > > -     put_page(*page);
> > > > > > > +     virtnet_put_page(rq, *page);
> > > > > > >
> > > > > > >       *page = xdp_page;
> > > > > > >
> > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > > >               if (unlikely(!head_skb))
> > > > > > >                       break;
> > > > > > > +             if (rq->page_pool)
> > > > > > > +                     skb_mark_for_recycle(head_skb);
> > > > > > >               return head_skb;
> > > > > > >
> > > > > > >       case XDP_TX:
> > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > >               break;
> > > > > > >       }
> > > > > > >
> > > > > > > -     put_xdp_frags(&xdp);
> > > > > > > +     put_xdp_frags(&xdp, rq);
> > > > > > >
> > > > > > >  err_xdp:
> > > > > > > -     put_page(page);
> > > > > > > +     virtnet_put_page(rq, page);
> > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > >
> > > > > > >       stats->xdp_drops++;
> > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > > >       curr_skb = head_skb;
> > > > > > >
> > > > > > > +     if (rq->page_pool)
> > > > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > > > +
> > > > > > >       if (unlikely(!curr_skb))
> > > > > > >               goto err_skb;
> > > > > > >       while (--num_buf) {
> > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >                       curr_skb = nskb;
> > > > > > >                       head_skb->truesize += nskb->truesize;
> > > > > > >                       num_skb_frags = 0;
> > > > > > > +                     if (rq->page_pool)
> > > > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > > > >               }
> > > > > > >               if (curr_skb != head_skb) {
> > > > > > >                       head_skb->data_len += len;
> > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >               }
> > > > > > >               offset = buf - page_address(page);
> > > > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > > -                     put_page(page);
> > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > > >                                            len, truesize);
> > > > > > >               } else {
> > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > >       return head_skb;
> > > > > > >
> > > > > > >  err_skb:
> > > > > > > -     put_page(page);
> > > > > > > +     virtnet_put_page(rq, page);
> > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > >
> > > > > > >  err_buf:
> > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > > > >        */
> > > > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > -             return -ENOMEM;
> > > > > > > +     if (rq->page_pool) {
> > > > > > > +             struct page *page;
> > > > > > >
> > > > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > -     get_page(alloc_frag->page);
> > > > > > > -     alloc_frag->offset += len + room;
> > > > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > -     if (hole < len + room) {
> > > > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > > > -              * enough space for another buffer, add the remaining space to
> > > > > > > -              * the current buffer.
> > > > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > -              */
> > > > > > > -             if (!headroom)
> > > > > > > -                     len += hole;
> > > > > > > -             alloc_frag->offset += hole;
> > > > > > > -     }
> > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > +             if (unlikely(!page))
> > > > > > > +                     return -ENOMEM;
> > > > > > > +             buf = (char *)page_address(page);
> > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > +     } else {
> > > > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > +                     return -ENOMEM;
> > > > > > >
> > > > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > +             get_page(alloc_frag->page);
> > > > > > > +             alloc_frag->offset += len + room;
> > > > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > +             if (hole < len + room) {
> > > > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > > > +                      * the current buffer.
> > > > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > +                      */
> > > > > > > +                     if (!headroom)
> > > > > > > +                             len += hole;
> > > > > > > +                     alloc_frag->offset += hole;
> > > > > > > +             }
> > > > > > > +     }
> > > > > > >       sg_init_one(rq->sg, buf, len);
> > > > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > >       if (err < 0)
> > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > > >
> > > > > > >       return err;
> > > > > > >  }
> > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > >       if (err < 0)
> > > > > > >               return err;
> > > > > > >
> > > > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > > > +     else
> > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > > > +                                              NULL);
> > > > > > > +
> > > > > > >       if (err < 0)
> > > > > > >               goto err_xdp_reg_mem_model;
> > > > > > >
> > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > > > >               }
> > > > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > > > >               break;
> > > > > > >       }
> > > > > > >  }
> > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > > >       switch (sset) {
> > > > > > >       case ETH_SS_STATS:
> > > > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > > > >       default:
> > > > > > >               return -EOPNOTSUPP;
> > > > > > >       }
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > > +{
> > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > > > +     struct page_pool_stats pp_stats = {};
> > > > > > > +     int i;
> > > > > > > +
> > > > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > > +             if (!vi->rq[i].page_pool)
> > > > > > > +                     continue;
> > > > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > > +     }
> > > > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > > +}
> > > > > > > +
> > > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > > > >  {
> > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > > > >       }
> > > > > > > +
> > > > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > > >  }
> > > > > > >
> > > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > > > +             if (vi->rq[i].page_pool)
> > > > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > > > >       }
> > > > > > >
> > > > > > >       /* We called __netif_napi_del(),
> > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > > > >       int i = vq2rxq(vq);
> > > > > > >
> > > > > > > -     if (vi->mergeable_rx_bufs)
> > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > -     else if (vi->big_packets)
> > > > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > > > +             if (vi->rq[i].page_pool) {
> > > > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > > +                                             virt_to_head_page(buf),
> > > > > > > +                                             true);
> > > > > > > +             } else {
> > > > > > > +                     put_page(virt_to_head_page(buf));
> > > > > > > +             }
> > > > > > > +     } else if (vi->big_packets) {
> > > > > > >               give_pages(&vi->rq[i], buf);
> > > > > > > -     else
> > > > > > > +     } else {
> > > > > > >               put_page(virt_to_head_page(buf));
> > > > > > > +     }
> > > > > > >  }
> > > > > > >
> > > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > > >       virtnet_free_queues(vi);
> > > > > > >  }
> > > > > > >
> > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > > +{
> > > > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > > > +
> > > > > > > +     struct page_pool_params pp_params = {
> > > > > > > +             .order = 0,
> > > > > > > +             .pool_size = rq->vq->num_max,
> > > > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > > > +             .dev = vdev->dev.parent,
> > > > > > > +             .offset = 0,
> > > > > > > +     };
> > > > > > > +
> > > > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > > +                      PTR_ERR(rq->page_pool));
> > > > > > > +             rq->page_pool = NULL;
> > > > > > > +     }
> > > > > > > +}
> > > > > > > +
> > > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > > >   * least one full packet?
> > > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > > +
> > > > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > > +             else
> > > > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > > > +                              "page pool only support mergeable mode\n");
> > > > > > > +
> > > > > > >       }
> > > > > > >
> > > > > > >       /* run here: ret == 0. */
> > > > > > > --
> > > > > > > 2.31.1
> > > > > >
> > > >
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-06-08  0:38               ` Jason Wang
  2023-06-08  3:54                 ` Xuan Zhuo
@ 2023-06-09  2:57                 ` Liang Chen
  2023-07-05  5:41                   ` Liang Chen
  1 sibling, 1 reply; 56+ messages in thread
From: Liang Chen @ 2023-06-09  2:57 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, netdev, linux-kernel,
	xuanzhuo, kuba, edumazet, davem, pabeni, alexander.duyck

On Thu, Jun 8, 2023 at 8:38 AM Jason Wang <jasowang@redhat.com> wrote:
>
> On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> >
> > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote:
> > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > > >
> > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > > >
> > > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > > in the normal path.
> > > > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > > >
> > > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > > gain is observed in XDP cpumap:
> > > > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > > >
> > > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > > DMA map/unmap support.
> > > > > > > >
> > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > > >
> > > > > > > Why off by default?
> > > > > > > I am guessing it sometimes has performance costs too?
> > > > > > >
> > > > > > >
> > > > > > > What happens if we use page pool for big mode too?
> > > > > > > The less modes we have the better...
> > > > > > >
> > > > > > >
> > > > > >
> > > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > > packet size is very small, it reduces the likelihood of skb
> > > > > > coalescing. But such cases are rare.
> > > > >
> > > > > small packets are rare? These workloads are easy to create actually.
> > > > > Pls try and include benchmark with small packet size.
> > > > >
> > > >
> > > > Sure, Thanks!
> > >
> > > Before going ahead and posting v2 patch, I would like to hear more
> > > advice for the cases of small packets. I have done more performance
> > > benchmark with small packets since then. Here is a list of iperf
> > > output,
> > >
> > > With PP and PP fragmenting:
> > > 256:     [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
> > > 1K:      [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
> > > 2K:      [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
> > > 4K:      [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
> > > 8K:      [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
> > > 16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
> > > 32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
> > > 64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
> > > 128K:   [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
> > > 256K:   [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes
>
> Note that virtio-net driver is lacking things like BQL and others, so
> it might suffer from buffer bloat for TCP performance. Would you mind
> to measure with e.g using testpmd on the vhost to see the rx PPS?
>

No problem. Before we proceed to measure with testpmd, could you
please take a look at the PPS measurements we obtained previously and
see if they are sufficient? Note that v2 will only use page pool for
XDP.
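
For the testpmd measurement later on, the vhost-side rx PPS setup
would presumably be something along these lines (core list, socket
path and queue count below are placeholders, not what we actually
ran):

  dpdk-testpmd -l 0-1 -n 4 --no-pci \
        --vdev 'net_vhost0,iface=/tmp/vhost-net.sock,queues=1' -- -i
  testpmd> set fwd rxonly
  testpmd> start
  testpmd> show port stats all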

netperf -H 192.168.124.197 -p 4444 -t UDP_STREAM -l 0 -- -m $((1))

with page pool:
1.
Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
Average:       enp8s0 655092.27      0.35  27508.77      0.03      0.00      0.00      0.00      0.00
2.
Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
Average:       enp8s0 654749.87      0.63  27494.42      0.05      0.00      0.00      0.00      0.00
3.
Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
Average:       enp8s0 654230.40      0.10  27472.57      0.01      0.00      0.00      0.00      0.00
4.
Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
Average:       enp8s0 656661.33      0.15  27574.65      0.01      0.00      0.00      0.00      0.00


without page pool:
1.
Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
Average:       enp8s0 646515.20      0.47  27148.60      0.04      0.00      0.00      0.00      0.00
2.
Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
Average:       enp8s0 653874.13      0.18  27457.61      0.02      0.00      0.00      0.00      0.00
3.
Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
Average:       enp8s0 647246.93      0.15  27179.32      0.01      0.00      0.00      0.00      0.00
4.
Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
Average:       enp8s0 650625.07      0.27  27321.18      0.02      0.00      0.00      0.00      0.00


(655092 + 654749 + 654230 + 656661) / (646515 + 653874 + 647246 + 650625) = 1.00865
On average it gives roughly a 0.9% increase in PPS, and this figure
can be reproduced consistently.

> > >
> > > Without PP:
> > > 256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
> > > 1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
> > > 2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
> > > 4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
> > > 8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
> > > 16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
> > > 32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
> > > 64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
> > > 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
> > > 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes
> > >
> > >
> > > The major factor contributing to the performance drop is the reduction
> > > of skb coalescing. Additionally, without the page pool, small packets
> > > can still benefit from the allocation of 8 continuous pages by
> > > breaking them down into smaller pieces. This effectively reduces the
> > > frequency of page allocation from the buddy system. For instance, the
> > > arrival of 32 1K packets only triggers one alloc_page call. Therefore,
> > > the benefits of using a page pool are limited in such cases.
>
> I wonder if we can improve page pool in this case anyhow.
>

We would like to make the effort to enhance skb coalescing to be more
friendly with page pool buffers, but that involves modifications to
some core data structures of mm.
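
For reference, the check that currently stops coalescing is
essentially skb_can_coalesce(); the sketch below mirrors that helper
from include/linux/skbuff.h (simplified, zerocopy test dropped):

/* A new rx chunk is merged into the previous frag only if it lives
 * on the same page and starts exactly where the previous frag ends.
 */
static bool coalesce_possible(struct sk_buff *skb, int i,
                              const struct page *page, int off)
{
        const skb_frag_t *frag;

        if (!i)
                return false;

        frag = &skb_shinfo(skb)->frags[i - 1];
        return page == skb_frag_page(frag) &&
               off == skb_frag_off(frag) + skb_frag_size(frag);
}

With skb_page_frag_refill() consecutive mergeable buffers usually come
out of the same high-order page, so both conditions often hold; with
one full page_pool page per packet the page comparison almost never
matches, which is where the small-packet drop comes from.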


> > In fact,
> > > without page pool fragmenting enabled, it can even hinder performance
> > > from this perspective.
> > >
> > > Upon further consideration, I tend to believe making page pool the
> > > default option may not be appropriate. As you pointed out, we cannot
> > > simply ignore the performance impact on small packets. Any comments on
> > > this will be much appreciated.
> > >
> > >
> > > Thanks,
> > > Liang
> >
> >
> > So, let's only use page pool for XDP then?
>
> +1
>
> We can start from this.
>
> Thanks
>
> >
> > >
> > > > > > The usage of page pool for big mode is being evaluated now. Thanks!
> > > > > >
> > > > > > > > ---
> > > > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > > > >
> > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > > > >  module_param(gso, bool, 0444);
> > > > > > > >  module_param(napi_tx, bool, 0644);
> > > > > > > >
> > > > > > > > +static bool page_pool_enabled;
> > > > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > > > +
> > > > > > > >  /* FIXME: MTU in config. */
> > > > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > > > >  #define GOOD_COPY_LEN        128
> > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > > > >       /* Chain pages by the private ptr. */
> > > > > > > >       struct page *pages;
> > > > > > > >
> > > > > > > > +     /* Page pool */
> > > > > > > > +     struct page_pool *page_pool;
> > > > > > > > +
> > > > > > > >       /* Average packet length for mergeable receive buffers. */
> > > > > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > > > >
> > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > > > >       return skb;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > > > +{
> > > > > > > > +     if (rq->page_pool)
> > > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > > +     else
> > > > > > > > +             put_page(page);
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /* Called from bottom half context */
> > > > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > >                                  struct receive_queue *rq,
> > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > >       hdr = skb_vnet_hdr(skb);
> > > > > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > > > > >       if (page_to_free)
> > > > > > > > -             put_page(page_to_free);
> > > > > > > > +             virtnet_put_page(rq, page_to_free);
> > > > > > > >
> > > > > > > >       return skb;
> > > > > > > >  }
> > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > > > >       return ret;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > > > >  {
> > > > > > > >       struct skb_shared_info *shinfo;
> > > > > > > >       struct page *xdp_page;
> > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > > > -                     put_page(xdp_page);
> > > > > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > > > > >               }
> > > > > > > >       }
> > > > > > > >  }
> > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > > > >               return NULL;
> > > > > > > >
> > > > > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > > > > +     if (rq->page_pool)
> > > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > +     else
> > > > > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > > > > +
> > > > > > > >       if (!page)
> > > > > > > >               return NULL;
> > > > > > > >
> > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > >                * is sending packet larger than the MTU.
> > > > > > > >                */
> > > > > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > > > -                     put_page(p);
> > > > > > > > +                     virtnet_put_page(rq, p);
> > > > > > > >                       goto err_buf;
> > > > > > > >               }
> > > > > > > >
> > > > > > > >               memcpy(page_address(page) + page_off,
> > > > > > > >                      page_address(p) + off, buflen);
> > > > > > > >               page_off += buflen;
> > > > > > > > -             put_page(p);
> > > > > > > > +             virtnet_put_page(rq, p);
> > > > > > > >       }
> > > > > > > >
> > > > > > > >       /* Headroom does not contribute to packet length */
> > > > > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > > > >       return page;
> > > > > > > >  err_buf:
> > > > > > > > -     __free_pages(page, 0);
> > > > > > > > +     if (rq->page_pool)
> > > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > > +     else
> > > > > > > > +             __free_pages(page, 0);
> > > > > > > >       return NULL;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > > > >               }
> > > > > > > >               stats->bytes += len;
> > > > > > > >               page = virt_to_head_page(buf);
> > > > > > > > -             put_page(page);
> > > > > > > > +             virtnet_put_page(rq, page);
> > > > > > > >       }
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > >               cur_frag_size = truesize;
> > > > > > > >               xdp_frags_truesz += cur_frag_size;
> > > > > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > > > -                     put_page(page);
> > > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > > > > >                       dev->stats.rx_length_errors++;
> > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > >       return 0;
> > > > > > > >
> > > > > > > >  err:
> > > > > > > > -     put_xdp_frags(xdp);
> > > > > > > > +     put_xdp_frags(xdp, rq);
> > > > > > > >       return -EINVAL;
> > > > > > > >  }
> > > > > > > >
> > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > > > > >                       return NULL;
> > > > > > > >
> > > > > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > > +             if (rq->page_pool)
> > > > > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > +             else
> > > > > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > >               if (!xdp_page)
> > > > > > > >                       return NULL;
> > > > > > > >
> > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > > >
> > > > > > > >       *frame_sz = PAGE_SIZE;
> > > > > > > >
> > > > > > > > -     put_page(*page);
> > > > > > > > +     virtnet_put_page(rq, *page);
> > > > > > > >
> > > > > > > >       *page = xdp_page;
> > > > > > > >
> > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > > > >               if (unlikely(!head_skb))
> > > > > > > >                       break;
> > > > > > > > +             if (rq->page_pool)
> > > > > > > > +                     skb_mark_for_recycle(head_skb);
> > > > > > > >               return head_skb;
> > > > > > > >
> > > > > > > >       case XDP_TX:
> > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > > >               break;
> > > > > > > >       }
> > > > > > > >
> > > > > > > > -     put_xdp_frags(&xdp);
> > > > > > > > +     put_xdp_frags(&xdp, rq);
> > > > > > > >
> > > > > > > >  err_xdp:
> > > > > > > > -     put_page(page);
> > > > > > > > +     virtnet_put_page(rq, page);
> > > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > > >
> > > > > > > >       stats->xdp_drops++;
> > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > > > >       curr_skb = head_skb;
> > > > > > > >
> > > > > > > > +     if (rq->page_pool)
> > > > > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > > > > +
> > > > > > > >       if (unlikely(!curr_skb))
> > > > > > > >               goto err_skb;
> > > > > > > >       while (--num_buf) {
> > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > >                       curr_skb = nskb;
> > > > > > > >                       head_skb->truesize += nskb->truesize;
> > > > > > > >                       num_skb_frags = 0;
> > > > > > > > +                     if (rq->page_pool)
> > > > > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > > > > >               }
> > > > > > > >               if (curr_skb != head_skb) {
> > > > > > > >                       head_skb->data_len += len;
> > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > >               }
> > > > > > > >               offset = buf - page_address(page);
> > > > > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > > > -                     put_page(page);
> > > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > > > >                                            len, truesize);
> > > > > > > >               } else {
> > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > >       return head_skb;
> > > > > > > >
> > > > > > > >  err_skb:
> > > > > > > > -     put_page(page);
> > > > > > > > +     virtnet_put_page(rq, page);
> > > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > > >
> > > > > > > >  err_buf:
> > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > > > > >        */
> > > > > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > > -             return -ENOMEM;
> > > > > > > > +     if (rq->page_pool) {
> > > > > > > > +             struct page *page;
> > > > > > > >
> > > > > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > -     get_page(alloc_frag->page);
> > > > > > > > -     alloc_frag->offset += len + room;
> > > > > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > > -     if (hole < len + room) {
> > > > > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > > > > -              * enough space for another buffer, add the remaining space to
> > > > > > > > -              * the current buffer.
> > > > > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > > -              */
> > > > > > > > -             if (!headroom)
> > > > > > > > -                     len += hole;
> > > > > > > > -             alloc_frag->offset += hole;
> > > > > > > > -     }
> > > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > +             if (unlikely(!page))
> > > > > > > > +                     return -ENOMEM;
> > > > > > > > +             buf = (char *)page_address(page);
> > > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > +     } else {
> > > > > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > > +                     return -ENOMEM;
> > > > > > > >
> > > > > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > +             get_page(alloc_frag->page);
> > > > > > > > +             alloc_frag->offset += len + room;
> > > > > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > > +             if (hole < len + room) {
> > > > > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > > > > +                      * the current buffer.
> > > > > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > > +                      */
> > > > > > > > +                     if (!headroom)
> > > > > > > > +                             len += hole;
> > > > > > > > +                     alloc_frag->offset += hole;
> > > > > > > > +             }
> > > > > > > > +     }
> > > > > > > >       sg_init_one(rq->sg, buf, len);
> > > > > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > >       if (err < 0)
> > > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > > > >
> > > > > > > >       return err;
> > > > > > > >  }
> > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > > >       if (err < 0)
> > > > > > > >               return err;
> > > > > > > >
> > > > > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > > > > +     else
> > > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > > > > +                                              NULL);
> > > > > > > > +
> > > > > > > >       if (err < 0)
> > > > > > > >               goto err_xdp_reg_mem_model;
> > > > > > > >
> > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > > > > >               }
> > > > > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > > > > >               break;
> > > > > > > >       }
> > > > > > > >  }
> > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > > > >       switch (sset) {
> > > > > > > >       case ETH_SS_STATS:
> > > > > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > > > > >       default:
> > > > > > > >               return -EOPNOTSUPP;
> > > > > > > >       }
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > > > +{
> > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > > > > +     struct page_pool_stats pp_stats = {};
> > > > > > > > +     int i;
> > > > > > > > +
> > > > > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > > > +             if (!vi->rq[i].page_pool)
> > > > > > > > +                     continue;
> > > > > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > > > +     }
> > > > > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > > > > >  {
> > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > > > > >       }
> > > > > > > > +
> > > > > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > +             if (vi->rq[i].page_pool)
> > > > > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > > > > >       }
> > > > > > > >
> > > > > > > >       /* We called __netif_napi_del(),
> > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > > > > >       int i = vq2rxq(vq);
> > > > > > > >
> > > > > > > > -     if (vi->mergeable_rx_bufs)
> > > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > > -     else if (vi->big_packets)
> > > > > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > > > > +             if (vi->rq[i].page_pool) {
> > > > > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > > > +                                             virt_to_head_page(buf),
> > > > > > > > +                                             true);
> > > > > > > > +             } else {
> > > > > > > > +                     put_page(virt_to_head_page(buf));
> > > > > > > > +             }
> > > > > > > > +     } else if (vi->big_packets) {
> > > > > > > >               give_pages(&vi->rq[i], buf);
> > > > > > > > -     else
> > > > > > > > +     } else {
> > > > > > > >               put_page(virt_to_head_page(buf));
> > > > > > > > +     }
> > > > > > > >  }
> > > > > > > >
> > > > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > > > >       virtnet_free_queues(vi);
> > > > > > > >  }
> > > > > > > >
> > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > > > +{
> > > > > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > > > > +
> > > > > > > > +     struct page_pool_params pp_params = {
> > > > > > > > +             .order = 0,
> > > > > > > > +             .pool_size = rq->vq->num_max,
> > > > > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > > > > +             .dev = vdev->dev.parent,
> > > > > > > > +             .offset = 0,
> > > > > > > > +     };
> > > > > > > > +
> > > > > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > > > +                      PTR_ERR(rq->page_pool));
> > > > > > > > +             rq->page_pool = NULL;
> > > > > > > > +     }
> > > > > > > > +}
> > > > > > > > +
> > > > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > > > >   * least one full packet?
> > > > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > > > +
> > > > > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > > > +             else
> > > > > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > > > > +                              "page pool only support mergeable mode\n");
> > > > > > > > +
> > > > > > > >       }
> > > > > > > >
> > > > > > > >       /* run here: ret == 0. */
> > > > > > > > --
> > > > > > > > 2.31.1
> > > > > > >
> > > > >
> >
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-06-09  2:57                 ` Liang Chen
@ 2023-07-05  5:41                   ` Liang Chen
  2023-07-05  6:04                     ` Jason Wang
  0 siblings, 1 reply; 56+ messages in thread
From: Liang Chen @ 2023-07-05  5:41 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, netdev, linux-kernel,
	xuanzhuo, kuba, edumazet, davem, pabeni, alexander.duyck

On Fri, Jun 9, 2023 at 10:57 AM Liang Chen <liangchen.linux@gmail.com> wrote:
>
> On Thu, Jun 8, 2023 at 8:38 AM Jason Wang <jasowang@redhat.com> wrote:
> >
> > On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > >
> > > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote:
> > > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > > > >
> > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > >
> > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > > > >
> > > > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > > > in the normal path.
> > > > > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > > > >
> > > > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > > > gain is observed in XDP cpumap:
> > > > > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > > > >
> > > > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > > > DMA map/unmap support.
> > > > > > > > >
> > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > > > >
> > > > > > > > Why off by default?
> > > > > > > > I am guessing it sometimes has performance costs too?
> > > > > > > >
> > > > > > > >
> > > > > > > > What happens if we use page pool for big mode too?
> > > > > > > > The less modes we have the better...
> > > > > > > >
> > > > > > > >
> > > > > > >
> > > > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > > > packet size is very small, it reduces the likelihood of skb
> > > > > > > coalescing. But such cases are rare.
> > > > > >
> > > > > > small packets are rare? These workloads are easy to create actually.
> > > > > > Pls try and include benchmark with small packet size.
> > > > > >
> > > > >
> > > > > Sure, Thanks!
> > > >
> > > > Before going ahead and posting v2 patch, I would like to hear more
> > > > advice for the cases of small packets. I have done more performance
> > > > benchmark with small packets since then. Here is a list of iperf
> > > > output,
> > > >
> > > > With PP and PP fragmenting:
> > > > 256K:   [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
> > > > 1K:       [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
> > > > 2K:       [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
> > > > 4K:       [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
> > > > 8K:       [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
> > > > 16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
> > > > 32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
> > > > 64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
> > > > 128K:   [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
> > > > 256K:   [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes
> >
> > Note that virtio-net driver is lacking things like BQL and others, so
> > it might suffer from buffer bloat for TCP performance. Would you mind
> > to measure with e.g using testpmd on the vhost to see the rx PPS?
> >
>
> No problem. Before we proceed to measure with testpmd, could you
> please take a look at the PPS measurements we obtained previously and
> see if they are sufficient? Though we will only utilize page pool for
> xdp on v2.
>
> netperf -H 192.168.124.197 -p 4444 -t UDP_STREAM -l 0 -- -m $((1))
>
> with page pool:
> 1.
> Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> Average:       enp8s0 655092.27      0.35  27508.77      0.03      0.00      0.00      0.00      0.00
> 2.
> Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> Average:       enp8s0 654749.87      0.63  27494.42      0.05      0.00      0.00      0.00      0.00
> 3.
> Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> Average:       enp8s0 654230.40      0.10  27472.57      0.01      0.00      0.00      0.00      0.00
> 4.
> Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> Average:       enp8s0 656661.33      0.15  27574.65      0.01      0.00      0.00      0.00      0.00
>
>
> without page pool:
> 1.
> Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> Average:       enp8s0 646515.20      0.47  27148.60      0.04      0.00      0.00      0.00      0.00
> 2.
> Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> Average:       enp8s0 653874.13      0.18  27457.61      0.02      0.00      0.00      0.00      0.00
> 3.
> Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> Average:       enp8s0 647246.93      0.15  27179.32      0.01      0.00      0.00      0.00      0.00
> 4.
> Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> Average:       enp8s0 650625.07      0.27  27321.18      0.02      0.00      0.00      0.00      0.00
>
>
> (655092+654749+654230+656661)/(646515+653874+647246+650625) =
> 1.00864886500966031113
> On average it gives around 0.8% increase in PPS, and this figure can
> be reproduced consistently.
>
> > > >
> > > > Without PP:
> > > > 256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
> > > > 1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
> > > > 2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
> > > > 4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
> > > > 8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
> > > > 16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
> > > > 32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
> > > > 64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
> > > > 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
> > > > 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes
> > > >
> > > >
> > > > The major factor contributing to the performance drop is the reduction
> > > > of skb coalescing. Additionally, without the page pool, small packets
> > > > can still benefit from the allocation of 8 contiguous pages by
> > > > breaking them down into smaller pieces. This effectively reduces the
> > > > frequency of page allocation from the buddy system. For instance, the
> > > > arrival of 32 1K packets only triggers one alloc_page call. Therefore,
> > > > the benefits of using a page pool are limited in such cases.
> >
> > I wonder if we can improve page pool in this case anyhow.
> >
>
> We would like to make the effort to enhance skb coalescing to be more
> friendly with page pool buffers. But that involves modifications to
> some core data structure of mm.
>
>

Just to give an update on the status of v2: we have identified the
specific situation where skb coalescing is affected by page pool and
small packets. In summary, in our environment (Fedora 37 default
network setup), NetworkManager creates a SOCK_DGRAM socket, which
eventually results in skbs being cloned, and cloned skbs cause skb
coalescing to fail.
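
To illustrate the kind of check involved (a rough sketch only -- the
authoritative logic is skb_try_coalesce() in net/core/skbuff.c, and the
helper name below is made up for this example):

#include <linux/skbuff.h>

/* Once a tap (e.g. a packet socket) has cloned an skb, the clone still
 * references the same data and shared info, so the stack can neither
 * steal the head nor safely extend the frags to merge buffers, and
 * coalescing is skipped.
 */
static bool coalescing_possible(struct sk_buff *to, struct sk_buff *from)
{
        if (skb_cloned(to))             /* shared skb_shared_info */
                return false;
        if (skb_headlen(from) && skb_cloned(from))
                return false;           /* head cannot be stolen */
        if (skb_shinfo(to)->nr_frags +
            skb_shinfo(from)->nr_frags >= MAX_SKB_FRAGS)
                return false;
        return true;
}

So while such a tap-style socket is open on the host, each small packet
keeps its own receive page instead of being merged into the previous
skb, which is where the page pool numbers suffer.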

While it affects small packet performance in our environment, it
doesn't necessarily have the same impact on other users. Nevertheless,
we are working on an optimization to allow skb coalescing in that
situation. It may take some time, as we have been advised to wait for
some relevant prior patches to land first.
(https://patchwork.kernel.org/project/netdevbpf/patch/20230628121150.47778-1-liangchen.linux@gmail.com/)

In addition, if small packet performance is not dropping anymore,
perhaps we can enable page pool for the normal path as well.


Thanks,
Liang



> > > In fact,
> > > > without page pool fragmenting enabled, it can even hinder performance
> > > > from this perspective.
> > > >
> > > > Upon further consideration, I tend to believe making page pool the
> > > > default option may not be appropriate. As you pointed out, we cannot
> > > > simply ignore the performance impact on small packets. Any comments on
> > > > this will be much appreciated.
> > > >
> > > >
> > > > Thanks,
> > > > Liang
> > >
> > >
> > > So, let's only use page pool for XDP then?
> >
> > +1
> >
> > We can start from this.
> >
> > Thanks
> >
> > >
> > > >
> > > > > > > The usage of page pool for big mode is being evaluated now. Thanks!
> > > > > > >
> > > > > > > > > ---
> > > > > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > > > > >
> > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > > > > >  module_param(gso, bool, 0444);
> > > > > > > > >  module_param(napi_tx, bool, 0644);
> > > > > > > > >
> > > > > > > > > +static bool page_pool_enabled;
> > > > > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > > > > +
> > > > > > > > >  /* FIXME: MTU in config. */
> > > > > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > > > > >  #define GOOD_COPY_LEN        128
> > > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > > > > >       /* Chain pages by the private ptr. */
> > > > > > > > >       struct page *pages;
> > > > > > > > >
> > > > > > > > > +     /* Page pool */
> > > > > > > > > +     struct page_pool *page_pool;
> > > > > > > > > +
> > > > > > > > >       /* Average packet length for mergeable receive buffers. */
> > > > > > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > > > > >
> > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > > > > >       return skb;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > > > > +{
> > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > > > +     else
> > > > > > > > > +             put_page(page);
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  /* Called from bottom half context */
> > > > > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > >                                  struct receive_queue *rq,
> > > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > >       hdr = skb_vnet_hdr(skb);
> > > > > > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > > > > > >       if (page_to_free)
> > > > > > > > > -             put_page(page_to_free);
> > > > > > > > > +             virtnet_put_page(rq, page_to_free);
> > > > > > > > >
> > > > > > > > >       return skb;
> > > > > > > > >  }
> > > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > > > > >       return ret;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > > > > >  {
> > > > > > > > >       struct skb_shared_info *shinfo;
> > > > > > > > >       struct page *xdp_page;
> > > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > > > > -                     put_page(xdp_page);
> > > > > > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > > > > > >               }
> > > > > > > > >       }
> > > > > > > > >  }
> > > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > > > > >               return NULL;
> > > > > > > > >
> > > > > > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > > +     else
> > > > > > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > > > > > +
> > > > > > > > >       if (!page)
> > > > > > > > >               return NULL;
> > > > > > > > >
> > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > >                * is sending packet larger than the MTU.
> > > > > > > > >                */
> > > > > > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > > > > -                     put_page(p);
> > > > > > > > > +                     virtnet_put_page(rq, p);
> > > > > > > > >                       goto err_buf;
> > > > > > > > >               }
> > > > > > > > >
> > > > > > > > >               memcpy(page_address(page) + page_off,
> > > > > > > > >                      page_address(p) + off, buflen);
> > > > > > > > >               page_off += buflen;
> > > > > > > > > -             put_page(p);
> > > > > > > > > +             virtnet_put_page(rq, p);
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > >       /* Headroom does not contribute to packet length */
> > > > > > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > > > > >       return page;
> > > > > > > > >  err_buf:
> > > > > > > > > -     __free_pages(page, 0);
> > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > > > +     else
> > > > > > > > > +             __free_pages(page, 0);
> > > > > > > > >       return NULL;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > > > > >               }
> > > > > > > > >               stats->bytes += len;
> > > > > > > > >               page = virt_to_head_page(buf);
> > > > > > > > > -             put_page(page);
> > > > > > > > > +             virtnet_put_page(rq, page);
> > > > > > > > >       }
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > >               cur_frag_size = truesize;
> > > > > > > > >               xdp_frags_truesz += cur_frag_size;
> > > > > > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > > > > -                     put_page(page);
> > > > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > > > > > >                       dev->stats.rx_length_errors++;
> > > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > >       return 0;
> > > > > > > > >
> > > > > > > > >  err:
> > > > > > > > > -     put_xdp_frags(xdp);
> > > > > > > > > +     put_xdp_frags(xdp, rq);
> > > > > > > > >       return -EINVAL;
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > > > > > >                       return NULL;
> > > > > > > > >
> > > > > > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > > > +             if (rq->page_pool)
> > > > > > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > > +             else
> > > > > > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > > >               if (!xdp_page)
> > > > > > > > >                       return NULL;
> > > > > > > > >
> > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > > > >
> > > > > > > > >       *frame_sz = PAGE_SIZE;
> > > > > > > > >
> > > > > > > > > -     put_page(*page);
> > > > > > > > > +     virtnet_put_page(rq, *page);
> > > > > > > > >
> > > > > > > > >       *page = xdp_page;
> > > > > > > > >
> > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > > > > >               if (unlikely(!head_skb))
> > > > > > > > >                       break;
> > > > > > > > > +             if (rq->page_pool)
> > > > > > > > > +                     skb_mark_for_recycle(head_skb);
> > > > > > > > >               return head_skb;
> > > > > > > > >
> > > > > > > > >       case XDP_TX:
> > > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > > > >               break;
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > > -     put_xdp_frags(&xdp);
> > > > > > > > > +     put_xdp_frags(&xdp, rq);
> > > > > > > > >
> > > > > > > > >  err_xdp:
> > > > > > > > > -     put_page(page);
> > > > > > > > > +     virtnet_put_page(rq, page);
> > > > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > > > >
> > > > > > > > >       stats->xdp_drops++;
> > > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > > > > >       curr_skb = head_skb;
> > > > > > > > >
> > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > > > > > +
> > > > > > > > >       if (unlikely(!curr_skb))
> > > > > > > > >               goto err_skb;
> > > > > > > > >       while (--num_buf) {
> > > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > >                       curr_skb = nskb;
> > > > > > > > >                       head_skb->truesize += nskb->truesize;
> > > > > > > > >                       num_skb_frags = 0;
> > > > > > > > > +                     if (rq->page_pool)
> > > > > > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > > > > > >               }
> > > > > > > > >               if (curr_skb != head_skb) {
> > > > > > > > >                       head_skb->data_len += len;
> > > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > >               }
> > > > > > > > >               offset = buf - page_address(page);
> > > > > > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > > > > -                     put_page(page);
> > > > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > > > > >                                            len, truesize);
> > > > > > > > >               } else {
> > > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > >       return head_skb;
> > > > > > > > >
> > > > > > > > >  err_skb:
> > > > > > > > > -     put_page(page);
> > > > > > > > > +     virtnet_put_page(rq, page);
> > > > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > > > >
> > > > > > > > >  err_buf:
> > > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > > > > > >        */
> > > > > > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > > > -             return -ENOMEM;
> > > > > > > > > +     if (rq->page_pool) {
> > > > > > > > > +             struct page *page;
> > > > > > > > >
> > > > > > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > > -     get_page(alloc_frag->page);
> > > > > > > > > -     alloc_frag->offset += len + room;
> > > > > > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > > > -     if (hole < len + room) {
> > > > > > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > > > > > -              * enough space for another buffer, add the remaining space to
> > > > > > > > > -              * the current buffer.
> > > > > > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > > > -              */
> > > > > > > > > -             if (!headroom)
> > > > > > > > > -                     len += hole;
> > > > > > > > > -             alloc_frag->offset += hole;
> > > > > > > > > -     }
> > > > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > > +             if (unlikely(!page))
> > > > > > > > > +                     return -ENOMEM;
> > > > > > > > > +             buf = (char *)page_address(page);
> > > > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > > +     } else {
> > > > > > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > > > +                     return -ENOMEM;
> > > > > > > > >
> > > > > > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > > +             get_page(alloc_frag->page);
> > > > > > > > > +             alloc_frag->offset += len + room;
> > > > > > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > > > +             if (hole < len + room) {
> > > > > > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > > > > > +                      * the current buffer.
> > > > > > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > > > +                      */
> > > > > > > > > +                     if (!headroom)
> > > > > > > > > +                             len += hole;
> > > > > > > > > +                     alloc_frag->offset += hole;
> > > > > > > > > +             }
> > > > > > > > > +     }
> > > > > > > > >       sg_init_one(rq->sg, buf, len);
> > > > > > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > >       if (err < 0)
> > > > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > > > > >
> > > > > > > > >       return err;
> > > > > > > > >  }
> > > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > > > >       if (err < 0)
> > > > > > > > >               return err;
> > > > > > > > >
> > > > > > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > > > > > +     else
> > > > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > > > > > +                                              NULL);
> > > > > > > > > +
> > > > > > > > >       if (err < 0)
> > > > > > > > >               goto err_xdp_reg_mem_model;
> > > > > > > > >
> > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > > > > > >               }
> > > > > > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > > > > > >               break;
> > > > > > > > >       }
> > > > > > > > >  }
> > > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > > > > >       switch (sset) {
> > > > > > > > >       case ETH_SS_STATS:
> > > > > > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > > > > > >       default:
> > > > > > > > >               return -EOPNOTSUPP;
> > > > > > > > >       }
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > > > > +{
> > > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > > > > > +     struct page_pool_stats pp_stats = {};
> > > > > > > > > +     int i;
> > > > > > > > > +
> > > > > > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > > > > +             if (!vi->rq[i].page_pool)
> > > > > > > > > +                     continue;
> > > > > > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > > > > +     }
> > > > > > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > > > > > >  {
> > > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > > > > > >       }
> > > > > > > > > +
> > > > > > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > +             if (vi->rq[i].page_pool)
> > > > > > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > >       /* We called __netif_napi_del(),
> > > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > > > > > >       int i = vq2rxq(vq);
> > > > > > > > >
> > > > > > > > > -     if (vi->mergeable_rx_bufs)
> > > > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > > > -     else if (vi->big_packets)
> > > > > > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > > > > > +             if (vi->rq[i].page_pool) {
> > > > > > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > > > > +                                             virt_to_head_page(buf),
> > > > > > > > > +                                             true);
> > > > > > > > > +             } else {
> > > > > > > > > +                     put_page(virt_to_head_page(buf));
> > > > > > > > > +             }
> > > > > > > > > +     } else if (vi->big_packets) {
> > > > > > > > >               give_pages(&vi->rq[i], buf);
> > > > > > > > > -     else
> > > > > > > > > +     } else {
> > > > > > > > >               put_page(virt_to_head_page(buf));
> > > > > > > > > +     }
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > > > > >       virtnet_free_queues(vi);
> > > > > > > > >  }
> > > > > > > > >
> > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > > > > +{
> > > > > > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > > > > > +
> > > > > > > > > +     struct page_pool_params pp_params = {
> > > > > > > > > +             .order = 0,
> > > > > > > > > +             .pool_size = rq->vq->num_max,
> > > > > > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > > > > > +             .dev = vdev->dev.parent,
> > > > > > > > > +             .offset = 0,
> > > > > > > > > +     };
> > > > > > > > > +
> > > > > > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > > > > +                      PTR_ERR(rq->page_pool));
> > > > > > > > > +             rq->page_pool = NULL;
> > > > > > > > > +     }
> > > > > > > > > +}
> > > > > > > > > +
> > > > > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > > > > >   * least one full packet?
> > > > > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > > > > +
> > > > > > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > > > > +             else
> > > > > > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > > > > > +                              "page pool only support mergeable mode\n");
> > > > > > > > > +
> > > > > > > > >       }
> > > > > > > > >
> > > > > > > > >       /* run here: ret == 0. */
> > > > > > > > > --
> > > > > > > > > 2.31.1
> > > > > > > >
> > > > > >
> > >
> >

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-07-05  5:41                   ` Liang Chen
@ 2023-07-05  6:04                     ` Jason Wang
  2023-07-06  1:01                       ` Liang Chen
  0 siblings, 1 reply; 56+ messages in thread
From: Jason Wang @ 2023-07-05  6:04 UTC (permalink / raw)
  To: Liang Chen
  Cc: Michael S. Tsirkin, virtualization, netdev, linux-kernel,
	xuanzhuo, kuba, edumazet, davem, pabeni, alexander.duyck

On Wed, Jul 5, 2023 at 1:41 PM Liang Chen <liangchen.linux@gmail.com> wrote:
>
> On Fri, Jun 9, 2023 at 10:57 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> >
> > On Thu, Jun 8, 2023 at 8:38 AM Jason Wang <jasowang@redhat.com> wrote:
> > >
> > > On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > >
> > > > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote:
> > > > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > > > > >
> > > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > >
> > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > >
> > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > > > > >
> > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > > > > in the normal path.
> > > > > > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > > > > >
> > > > > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > > > > gain is observed in XDP cpumap:
> > > > > > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > > > > >
> > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > > > > DMA map/unmap support.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > > > > >
> > > > > > > > > Why off by default?
> > > > > > > > > I am guessing it sometimes has performance costs too?
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > What happens if we use page pool for big mode too?
> > > > > > > > > The less modes we have the better...
> > > > > > > > >
> > > > > > > > >
> > > > > > > >
> > > > > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > > > > packet size is very small, it reduces the likelihood of skb
> > > > > > > > coalescing. But such cases are rare.
> > > > > > >
> > > > > > > small packets are rare? These workloads are easy to create actually.
> > > > > > > Pls try and include benchmark with small packet size.
> > > > > > >
> > > > > >
> > > > > > Sure, Thanks!
> > > > >
> > > > > Before going ahead and posting v2 patch, I would like to hear more
> > > > > advice for the cases of small packets. I have done more performance
> > > > > benchmark with small packets since then. Here is a list of iperf
> > > > > output,
> > > > >
> > > > > With PP and PP fragmenting:
> > > > > 256K:   [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
> > > > > 1K:       [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
> > > > > 2K:       [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
> > > > > 4K:       [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
> > > > > 8K:       [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
> > > > > 16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
> > > > > 32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
> > > > > 64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
> > > > > 128K:   [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
> > > > > 256K:   [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes
> > >
> > > Note that virtio-net driver is lacking things like BQL and others, so
> > > it might suffer from buffer bloat for TCP performance. Would you mind
> > > to measure with e.g using testpmd on the vhost to see the rx PPS?
> > >
> >
> > No problem. Before we proceed to measure with testpmd, could you
> > please take a look at the PPS measurements we obtained previously and
> > see if they are sufficient? Though we will only utilize page pool for
> > xdp on v2.
> >
> > netperf -H 192.168.124.197 -p 4444 -t UDP_STREAM -l 0 -- -m $((1))
> >
> > with page pool:
> > 1.
> > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > Average:       enp8s0 655092.27      0.35  27508.77      0.03      0.00      0.00      0.00      0.00
> > 2.
> > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > Average:       enp8s0 654749.87      0.63  27494.42      0.05      0.00      0.00      0.00      0.00
> > 3.
> > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > Average:       enp8s0 654230.40      0.10  27472.57      0.01      0.00      0.00      0.00      0.00
> > 4.
> > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > Average:       enp8s0 656661.33      0.15  27574.65      0.01      0.00      0.00      0.00      0.00
> >
> >
> > without page pool:
> > 1.
> > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > Average:       enp8s0 646515.20      0.47  27148.60      0.04      0.00      0.00      0.00      0.00
> > 2.
> > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > Average:       enp8s0 653874.13      0.18  27457.61      0.02      0.00      0.00      0.00      0.00
> > 3.
> > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > Average:       enp8s0 647246.93      0.15  27179.32      0.01      0.00      0.00      0.00      0.00
> > 4.
> > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > Average:       enp8s0 650625.07      0.27  27321.18      0.02      0.00      0.00      0.00      0.00
> >
> >
> > (655092+654749+654230+656661)/(646515+653874+647246+650625) =
> > 1.00864886500966031113
> > On average it gives around 0.8% increase in PPS, and this figure can
> > be reproduced consistently.
> >
> > > > >
> > > > > Without PP:
> > > > > 256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
> > > > > 1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
> > > > > 2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
> > > > > 4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
> > > > > 8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
> > > > > 16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
> > > > > 32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
> > > > > 64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
> > > > > 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
> > > > > 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes
> > > > >
> > > > >
> > > > > The major factor contributing to the performance drop is the reduction
> > > > > of skb coalescing. Additionally, without the page pool, small packets
> > > > > can still benefit from the allocation of 8 contiguous pages by
> > > > > breaking them down into smaller pieces. This effectively reduces the
> > > > > frequency of page allocation from the buddy system. For instance, the
> > > > > arrival of 32 1K packets only triggers one alloc_page call. Therefore,
> > > > > the benefits of using a page pool are limited in such cases.
> > >
> > > I wonder if we can improve page pool in this case anyhow.
> > >
> >
> > We would like to make the effort to enhance skb coalescing to be more
> > friendly with page pool buffers. But that involves modifications to
> > some core data structure of mm.
> >
> >
>
> Just to give an update on the status of v2: we have identified the
> specific situation where skb coalescing is affected by page pool and
> small packets. In summary, in our environment (Fedora 37 default
> network setup), NetworkManager creates a SOCK_DGRAM socket, which
> eventually results in skbs being cloned, and cloned skbs cause skb
> coalescing to fail.
>
> While it affects small packet performance in our environment, it
> doesn't necessarily have the same impact on other users. Nevertheless,
> we are working on an optimization to allow skb coalescing in that
> situation. It may take some time, as we have been advised to wait for
> some relevant prior patches to land first.
> (https://patchwork.kernel.org/project/netdevbpf/patch/20230628121150.47778-1-liangchen.linux@gmail.com/)

If it's not too late, I would like to be copied on the next version of
this (since it seems to have been suggested by me).

>
> In addition, if small packet performance is not dropping anymore,
> perhaps we can enable page pool for the normal path as well.

It would be simpler if we start with XDP and add the normal path on
top. But it's your call anyway.
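
To make that a bit more concrete (purely a hypothetical sketch from my
side, not the posted patch): the rx side could take pool pages only
while an XDP program is attached, e.g.

/* Hypothetical sketch: use the page pool only when XDP is attached;
 * the normal path keeps the existing skb_page_frag_refill() allocator
 * for now.
 */
static bool virtnet_rq_use_page_pool(struct virtnet_info *vi,
                                     struct receive_queue *rq)
{
        return rq->page_pool && READ_ONCE(vi->xdp_enabled);
}

and the normal path would only switch over later, once the small packet
numbers are sorted out.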

Thanks

>
>
> Thanks,
> Liang
>
>
>
> > > > In fact,
> > > > > without page pool fragmenting enabled, it can even hinder performance
> > > > > from this perspective.
> > > > >
> > > > > Upon further consideration, I tend to believe making page pool the
> > > > > default option may not be appropriate. As you pointed out, we cannot
> > > > > simply ignore the performance impact on small packets. Any comments on
> > > > > this will be much appreciated.
> > > > >
> > > > >
> > > > > Thanks,
> > > > > Liang
> > > >
> > > >
> > > > So, let's only use page pool for XDP then?
> > >
> > > +1
> > >
> > > We can start from this.
> > >
> > > Thanks
> > >
> > > >
> > > > >
> > > > > > > > The usage of page pool for big mode is being evaluated now. Thanks!
> > > > > > > >
> > > > > > > > > > ---
> > > > > > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > > > > > >  module_param(gso, bool, 0444);
> > > > > > > > > >  module_param(napi_tx, bool, 0644);
> > > > > > > > > >
> > > > > > > > > > +static bool page_pool_enabled;
> > > > > > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > > > > > +
> > > > > > > > > >  /* FIXME: MTU in config. */
> > > > > > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > > > > > >  #define GOOD_COPY_LEN        128
> > > > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > > > > > >       /* Chain pages by the private ptr. */
> > > > > > > > > >       struct page *pages;
> > > > > > > > > >
> > > > > > > > > > +     /* Page pool */
> > > > > > > > > > +     struct page_pool *page_pool;
> > > > > > > > > > +
> > > > > > > > > >       /* Average packet length for mergeable receive buffers. */
> > > > > > > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > > > > > >
> > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > > > > > >       return skb;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > > > > > +{
> > > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > > > > +     else
> > > > > > > > > > +             put_page(page);
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >  /* Called from bottom half context */
> > > > > > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > >                                  struct receive_queue *rq,
> > > > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > >       hdr = skb_vnet_hdr(skb);
> > > > > > > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > > > > > > >       if (page_to_free)
> > > > > > > > > > -             put_page(page_to_free);
> > > > > > > > > > +             virtnet_put_page(rq, page_to_free);
> > > > > > > > > >
> > > > > > > > > >       return skb;
> > > > > > > > > >  }
> > > > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > > > > > >       return ret;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > > > > > >  {
> > > > > > > > > >       struct skb_shared_info *shinfo;
> > > > > > > > > >       struct page *xdp_page;
> > > > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > > > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > > > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > > > > > -                     put_page(xdp_page);
> > > > > > > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > > > > > > >               }
> > > > > > > > > >       }
> > > > > > > > > >  }
> > > > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > > > > > >               return NULL;
> > > > > > > > > >
> > > > > > > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > > > +     else
> > > > > > > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > > > > > > +
> > > > > > > > > >       if (!page)
> > > > > > > > > >               return NULL;
> > > > > > > > > >
> > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > >                * is sending packet larger than the MTU.
> > > > > > > > > >                */
> > > > > > > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > > > > > -                     put_page(p);
> > > > > > > > > > +                     virtnet_put_page(rq, p);
> > > > > > > > > >                       goto err_buf;
> > > > > > > > > >               }
> > > > > > > > > >
> > > > > > > > > >               memcpy(page_address(page) + page_off,
> > > > > > > > > >                      page_address(p) + off, buflen);
> > > > > > > > > >               page_off += buflen;
> > > > > > > > > > -             put_page(p);
> > > > > > > > > > +             virtnet_put_page(rq, p);
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > >       /* Headroom does not contribute to packet length */
> > > > > > > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > > > > > >       return page;
> > > > > > > > > >  err_buf:
> > > > > > > > > > -     __free_pages(page, 0);
> > > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > > > > +     else
> > > > > > > > > > +             __free_pages(page, 0);
> > > > > > > > > >       return NULL;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > > > > > >               }
> > > > > > > > > >               stats->bytes += len;
> > > > > > > > > >               page = virt_to_head_page(buf);
> > > > > > > > > > -             put_page(page);
> > > > > > > > > > +             virtnet_put_page(rq, page);
> > > > > > > > > >       }
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > >               cur_frag_size = truesize;
> > > > > > > > > >               xdp_frags_truesz += cur_frag_size;
> > > > > > > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > > > > > -                     put_page(page);
> > > > > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > > > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > > > > > > >                       dev->stats.rx_length_errors++;
> > > > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > >       return 0;
> > > > > > > > > >
> > > > > > > > > >  err:
> > > > > > > > > > -     put_xdp_frags(xdp);
> > > > > > > > > > +     put_xdp_frags(xdp, rq);
> > > > > > > > > >       return -EINVAL;
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > > > > > > >                       return NULL;
> > > > > > > > > >
> > > > > > > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > > > > +             if (rq->page_pool)
> > > > > > > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > > > +             else
> > > > > > > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > > > >               if (!xdp_page)
> > > > > > > > > >                       return NULL;
> > > > > > > > > >
> > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > > > > >
> > > > > > > > > >       *frame_sz = PAGE_SIZE;
> > > > > > > > > >
> > > > > > > > > > -     put_page(*page);
> > > > > > > > > > +     virtnet_put_page(rq, *page);
> > > > > > > > > >
> > > > > > > > > >       *page = xdp_page;
> > > > > > > > > >
> > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > > > > > >               if (unlikely(!head_skb))
> > > > > > > > > >                       break;
> > > > > > > > > > +             if (rq->page_pool)
> > > > > > > > > > +                     skb_mark_for_recycle(head_skb);
> > > > > > > > > >               return head_skb;
> > > > > > > > > >
> > > > > > > > > >       case XDP_TX:
> > > > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > > > > >               break;
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > > -     put_xdp_frags(&xdp);
> > > > > > > > > > +     put_xdp_frags(&xdp, rq);
> > > > > > > > > >
> > > > > > > > > >  err_xdp:
> > > > > > > > > > -     put_page(page);
> > > > > > > > > > +     virtnet_put_page(rq, page);
> > > > > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > > > > >
> > > > > > > > > >       stats->xdp_drops++;
> > > > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > > > > > >       curr_skb = head_skb;
> > > > > > > > > >
> > > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > > > > > > +
> > > > > > > > > >       if (unlikely(!curr_skb))
> > > > > > > > > >               goto err_skb;
> > > > > > > > > >       while (--num_buf) {
> > > > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > >                       curr_skb = nskb;
> > > > > > > > > >                       head_skb->truesize += nskb->truesize;
> > > > > > > > > >                       num_skb_frags = 0;
> > > > > > > > > > +                     if (rq->page_pool)
> > > > > > > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > > > > > > >               }
> > > > > > > > > >               if (curr_skb != head_skb) {
> > > > > > > > > >                       head_skb->data_len += len;
> > > > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > >               }
> > > > > > > > > >               offset = buf - page_address(page);
> > > > > > > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > > > > > -                     put_page(page);
> > > > > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > > > > > >                                            len, truesize);
> > > > > > > > > >               } else {
> > > > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > >       return head_skb;
> > > > > > > > > >
> > > > > > > > > >  err_skb:
> > > > > > > > > > -     put_page(page);
> > > > > > > > > > +     virtnet_put_page(rq, page);
> > > > > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > > > > >
> > > > > > > > > >  err_buf:
> > > > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > > > > > > >        */
> > > > > > > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > > > > -             return -ENOMEM;
> > > > > > > > > > +     if (rq->page_pool) {
> > > > > > > > > > +             struct page *page;
> > > > > > > > > >
> > > > > > > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > > > -     get_page(alloc_frag->page);
> > > > > > > > > > -     alloc_frag->offset += len + room;
> > > > > > > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > > > > -     if (hole < len + room) {
> > > > > > > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > > > > > > -              * enough space for another buffer, add the remaining space to
> > > > > > > > > > -              * the current buffer.
> > > > > > > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > > > > -              */
> > > > > > > > > > -             if (!headroom)
> > > > > > > > > > -                     len += hole;
> > > > > > > > > > -             alloc_frag->offset += hole;
> > > > > > > > > > -     }
> > > > > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > > > +             if (unlikely(!page))
> > > > > > > > > > +                     return -ENOMEM;
> > > > > > > > > > +             buf = (char *)page_address(page);
> > > > > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > > > +     } else {
> > > > > > > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > > > > +                     return -ENOMEM;
> > > > > > > > > >
> > > > > > > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > > > +             get_page(alloc_frag->page);
> > > > > > > > > > +             alloc_frag->offset += len + room;
> > > > > > > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > > > > +             if (hole < len + room) {
> > > > > > > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > > > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > > > > > > +                      * the current buffer.
> > > > > > > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > > > > +                      */
> > > > > > > > > > +                     if (!headroom)
> > > > > > > > > > +                             len += hole;
> > > > > > > > > > +                     alloc_frag->offset += hole;
> > > > > > > > > > +             }
> > > > > > > > > > +     }
> > > > > > > > > >       sg_init_one(rq->sg, buf, len);
> > > > > > > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > >       if (err < 0)
> > > > > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > > > > > >
> > > > > > > > > >       return err;
> > > > > > > > > >  }
> > > > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > > > > >       if (err < 0)
> > > > > > > > > >               return err;
> > > > > > > > > >
> > > > > > > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > > > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > > > > > > +     else
> > > > > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > > > > > > +                                              NULL);
> > > > > > > > > > +
> > > > > > > > > >       if (err < 0)
> > > > > > > > > >               goto err_xdp_reg_mem_model;
> > > > > > > > > >
> > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > > > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > > > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > > > > > > >               }
> > > > > > > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > > > > > > >               break;
> > > > > > > > > >       }
> > > > > > > > > >  }
> > > > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > > > > > >       switch (sset) {
> > > > > > > > > >       case ETH_SS_STATS:
> > > > > > > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > > > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > > > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > > > > > > >       default:
> > > > > > > > > >               return -EOPNOTSUPP;
> > > > > > > > > >       }
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > > > > > +{
> > > > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > > > > > > +     struct page_pool_stats pp_stats = {};
> > > > > > > > > > +     int i;
> > > > > > > > > > +
> > > > > > > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > > > > > +             if (!vi->rq[i].page_pool)
> > > > > > > > > > +                     continue;
> > > > > > > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > > > > > +     }
> > > > > > > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > > > > > > >  {
> > > > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > > > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > > > > > > >       }
> > > > > > > > > > +
> > > > > > > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > > +             if (vi->rq[i].page_pool)
> > > > > > > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > >       /* We called __netif_napi_del(),
> > > > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > > > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > > > > > > >       int i = vq2rxq(vq);
> > > > > > > > > >
> > > > > > > > > > -     if (vi->mergeable_rx_bufs)
> > > > > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > > > > -     else if (vi->big_packets)
> > > > > > > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > > > > > > +             if (vi->rq[i].page_pool) {
> > > > > > > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > > > > > +                                             virt_to_head_page(buf),
> > > > > > > > > > +                                             true);
> > > > > > > > > > +             } else {
> > > > > > > > > > +                     put_page(virt_to_head_page(buf));
> > > > > > > > > > +             }
> > > > > > > > > > +     } else if (vi->big_packets) {
> > > > > > > > > >               give_pages(&vi->rq[i], buf);
> > > > > > > > > > -     else
> > > > > > > > > > +     } else {
> > > > > > > > > >               put_page(virt_to_head_page(buf));
> > > > > > > > > > +     }
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > > > > > >       virtnet_free_queues(vi);
> > > > > > > > > >  }
> > > > > > > > > >
> > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > > > > > +{
> > > > > > > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > > > > > > +
> > > > > > > > > > +     struct page_pool_params pp_params = {
> > > > > > > > > > +             .order = 0,
> > > > > > > > > > +             .pool_size = rq->vq->num_max,
> > > > > > > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > > > > > > +             .dev = vdev->dev.parent,
> > > > > > > > > > +             .offset = 0,
> > > > > > > > > > +     };
> > > > > > > > > > +
> > > > > > > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > > > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > > > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > > > > > +                      PTR_ERR(rq->page_pool));
> > > > > > > > > > +             rq->page_pool = NULL;
> > > > > > > > > > +     }
> > > > > > > > > > +}
> > > > > > > > > > +
> > > > > > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > > > > > >   * least one full packet?
> > > > > > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > > > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > > > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > > > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > > > > > +
> > > > > > > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > > > > > +             else
> > > > > > > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > > > > > > +                              "page pool only support mergeable mode\n");
> > > > > > > > > > +
> > > > > > > > > >       }
> > > > > > > > > >
> > > > > > > > > >       /* run here: ret == 0. */
> > > > > > > > > > --
> > > > > > > > > > 2.31.1
> > > > > > > > >
> > > > > > >
> > > >
> > >
>


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-07-05  6:04                     ` Jason Wang
@ 2023-07-06  1:01                       ` Liang Chen
  0 siblings, 0 replies; 56+ messages in thread
From: Liang Chen @ 2023-07-06  1:01 UTC (permalink / raw)
  To: Jason Wang
  Cc: Michael S. Tsirkin, virtualization, netdev, linux-kernel,
	xuanzhuo, kuba, edumazet, davem, pabeni, alexander.duyck

On Wed, Jul 5, 2023 at 2:05 PM Jason Wang <jasowang@redhat.com> wrote:
>
> On Wed, Jul 5, 2023 at 1:41 PM Liang Chen <liangchen.linux@gmail.com> wrote:
> >
> > On Fri, Jun 9, 2023 at 10:57 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > >
> > > On Thu, Jun 8, 2023 at 8:38 AM Jason Wang <jasowang@redhat.com> wrote:
> > > >
> > > > On Thu, Jun 8, 2023 at 4:17 AM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > >
> > > > > On Wed, Jun 07, 2023 at 05:08:59PM +0800, Liang Chen wrote:
> > > > > > On Tue, May 30, 2023 at 9:19 AM Liang Chen <liangchen.linux@gmail.com> wrote:
> > > > > > >
> > > > > > > On Mon, May 29, 2023 at 5:55 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > >
> > > > > > > > On Mon, May 29, 2023 at 03:27:56PM +0800, Liang Chen wrote:
> > > > > > > > > On Sun, May 28, 2023 at 2:20 PM Michael S. Tsirkin <mst@redhat.com> wrote:
> > > > > > > > > >
> > > > > > > > > > On Fri, May 26, 2023 at 01:46:18PM +0800, Liang Chen wrote:
> > > > > > > > > > > The implementation at the moment uses one page per packet in both the
> > > > > > > > > > > normal and XDP path. In addition, introducing a module parameter to enable
> > > > > > > > > > > or disable the usage of page pool (disabled by default).
> > > > > > > > > > >
> > > > > > > > > > > In single-core vm testing environments, it gives a modest performance gain
> > > > > > > > > > > in the normal path.
> > > > > > > > > > >   Upstream codebase: 47.5 Gbits/sec
> > > > > > > > > > >   Upstream codebase + page_pool support: 50.2 Gbits/sec
> > > > > > > > > > >
> > > > > > > > > > > In multi-core vm testing environments, The most significant performance
> > > > > > > > > > > gain is observed in XDP cpumap:
> > > > > > > > > > >   Upstream codebase: 1.38 Gbits/sec
> > > > > > > > > > >   Upstream codebase + page_pool support: 9.74 Gbits/sec
> > > > > > > > > > >
> > > > > > > > > > > With this foundation, we can further integrate page pool fragmentation and
> > > > > > > > > > > DMA map/unmap support.
> > > > > > > > > > >
> > > > > > > > > > > Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> > > > > > > > > >
> > > > > > > > > > Why off by default?
> > > > > > > > > > I am guessing it sometimes has performance costs too?
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > > > What happens if we use page pool for big mode too?
> > > > > > > > > > The less modes we have the better...
> > > > > > > > > >
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > Sure, now I believe it makes sense to enable it by default. When the
> > > > > > > > > packet size is very small, it reduces the likelihood of skb
> > > > > > > > > coalescing. But such cases are rare.
> > > > > > > >
> > > > > > > > small packets are rare? These workloads are easy to create actually.
> > > > > > > > Pls try and include benchmark with small packet size.
> > > > > > > >
> > > > > > >
> > > > > > > Sure, Thanks!
> > > > > >
> > > > > > Before going ahead and posting v2 patch, I would like to hear more
> > > > > > advice for the cases of small packets. I have done more performance
> > > > > > benchmark with small packets since then. Here is a list of iperf
> > > > > > output,
> > > > > >
> > > > > > With PP and PP fragmenting:
> > > > > > 256K:   [  5] 505.00-510.00 sec  1.34 GBytes  2.31 Gbits/sec    0    144 KBytes
> > > > > > 1K:       [  5]  30.00-35.00  sec  4.63 GBytes  7.95 Gbits/sec    0    223 KBytes
> > > > > > 2K:       [  5]  65.00-70.00  sec  8.33 GBytes  14.3 Gbits/sec    0    324 KBytes
> > > > > > 4K:       [  5]  30.00-35.00  sec  13.3 GBytes  22.8 Gbits/sec    0   1.08 MBytes
> > > > > > 8K:       [  5]  50.00-55.00  sec  18.9 GBytes  32.4 Gbits/sec    0    744 KBytes
> > > > > > 16K:     [  5]  25.00-30.00  sec  24.6 GBytes  42.3 Gbits/sec    0    963 KBytes
> > > > > > 32K:     [  5]  45.00-50.00  sec  29.8 GBytes  51.2 Gbits/sec    0   1.25 MBytes
> > > > > > 64K:     [  5]  35.00-40.00  sec  34.0 GBytes  58.4 Gbits/sec    0   1.70 MBytes
> > > > > > 128K:   [  5]  45.00-50.00  sec  36.7 GBytes  63.1 Gbits/sec    0   4.26 MBytes
> > > > > > 256K:   [  5]  30.00-35.00  sec  40.0 GBytes  68.8 Gbits/sec    0   3.20 MBytes
> > > >
> > > > Note that the virtio-net driver lacks BQL and similar mechanisms, so
> > > > it might suffer from bufferbloat in TCP performance. Would you mind
> > > > measuring with, e.g., testpmd on the vhost side to see the rx PPS?
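For readers unfamiliar with the acronym: BQL is byte queue limits,
which a driver opts into by telling the stack how many bytes it has
queued and completed on each tx queue, so the stack can bound the
data in flight and reduce bufferbloat. A minimal sketch of that
accounting in a generic driver follows; virtio_net did not have this
at the time of the thread, and the local names here are illustrative:

	/* xmit path, after queueing the skb on tx queue qnum */
	struct netdev_queue *txq = netdev_get_tx_queue(dev, qnum);

	netdev_tx_sent_queue(txq, skb->len);

	/* tx completion path, after reclaiming used descriptors */
	netdev_tx_completed_queue(txq, packets_completed, bytes_completed);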
> > > >
> > >
> > > No problem. Before we proceed to measure with testpmd, could you
> > > please take a look at the PPS measurements we obtained previously and
> > > see if they are sufficient? Though we will only utilize page pool for
> > > xdp on v2.
> > >
> > > netperf -H 192.168.124.197 -p 4444 -t UDP_STREAM -l 0 -- -m $((1))
> > >
> > > with page pool:
> > > 1.
> > > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > > Average:       enp8s0 655092.27      0.35  27508.77      0.03      0.00      0.00      0.00      0.00
> > > 2.
> > > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > > Average:       enp8s0 654749.87      0.63  27494.42      0.05      0.00      0.00      0.00      0.00
> > > 3.
> > > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > > Average:       enp8s0 654230.40      0.10  27472.57      0.01      0.00      0.00      0.00      0.00
> > > 4.
> > > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > > Average:       enp8s0 656661.33      0.15  27574.65      0.01      0.00      0.00      0.00      0.00
> > >
> > >
> > > without page pool:
> > > 1.
> > > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > > Average:       enp8s0 646515.20      0.47  27148.60      0.04      0.00      0.00      0.00      0.00
> > > 2.
> > > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > > Average:       enp8s0 653874.13      0.18  27457.61      0.02      0.00      0.00      0.00      0.00
> > > 3.
> > > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > > Average:       enp8s0 647246.93      0.15  27179.32      0.01      0.00      0.00      0.00      0.00
> > > 4.
> > > Average:        IFACE   rxpck/s   txpck/s    rxkB/s    txkB/s   rxcmp/s   txcmp/s  rxmcst/s   %ifutil
> > > Average:       enp8s0 650625.07      0.27  27321.18      0.02      0.00      0.00      0.00      0.00
> > >
> > >
> > > (655092+654749+654230+656661)/(646515+653874+647246+650625) =
> > > 1.00864886500966031113
> > > On average it gives around 0.8% increase in PPS, and this figure can
> > > be reproduced consistently.
> > >
> > > > > >
> > > > > > Without PP:
> > > > > > 256:     [  5] 680.00-685.00 sec  1.57 GBytes  2.69 Gbits/sec    0    359 KBytes
> > > > > > 1K:      [  5]  75.00-80.00  sec  5.47 GBytes  9.40 Gbits/sec    0    730 KBytes
> > > > > > 2K:      [  5]  65.00-70.00  sec  9.46 GBytes  16.2 Gbits/sec    0   1.99 MBytes
> > > > > > 4K:      [  5]  30.00-35.00  sec  14.5 GBytes  25.0 Gbits/sec    0   1.20 MBytes
> > > > > > 8K:      [  5]  45.00-50.00  sec  19.9 GBytes  34.1 Gbits/sec    0   1.72 MBytes
> > > > > > 16K:    [  5]   5.00-10.00  sec  23.8 GBytes  40.9 Gbits/sec    0   2.90 MBytes
> > > > > > 32K:    [  5]  15.00-20.00  sec  28.0 GBytes  48.1 Gbits/sec    0   3.03 MBytes
> > > > > > 64K:    [  5]  60.00-65.00  sec  31.8 GBytes  54.6 Gbits/sec    0   3.05 MBytes
> > > > > > 128K:  [  5]  45.00-50.00  sec  33.0 GBytes  56.6 Gbits/sec    1   3.03 MBytes
> > > > > > 256K:  [  5]  25.00-30.00  sec  34.7 GBytes  59.6 Gbits/sec    0   3.11 MBytes
> > > > > >
> > > > > >
> > > > > > The major factor contributing to the performance drop is the reduction
> > > > > > of skb coalescing. Additionally, without the page pool, small packets
> > > > > > can still benefit from the allocation of 8 continuous pages by
> > > > > > breaking them down into smaller pieces. This effectively reduces the
> > > > > > frequency of page allocation from the buddy system. For instance, the
> > > > > > arrival of 32 1K packets only triggers one alloc_page call. Therefore,
> > > > > > the benefits of using a page pool are limited in such cases.
> > > >
> > > > I wonder if we can improve page pool in this case anyhow.
> > > >
> > >
> > > We would like to make the effort to enhance skb coalescing to be more
> > > friendly with page pool buffers. But that involves modifications to
> > > some core mm data structures.
> > >
> > >
> >
> > Just to give an update on the status of v2 progress. We have
> > identified the specific situation where skb coalescing is affected by
> > page pool and small packets. In summary, it was because, in our
> > environment (Fedora 37 default network setup), NetworkManager creates
> > a SOCK_DGRAM socket, which eventually results in skbs being cloned,
> > thus causing the failure of skb coalescing.
> >
> > While it affects small packet performance in our environment, it
> > doesn't necessarily have the same impact for other users.
> > Nevertheless, we are trying to make an optimization to allow skb
> > coalescing in that situation. It may take some time as we are advised
> > to wait for some relevant prior patches to land first.
> > (https://patchwork.kernel.org/project/netdevbpf/patch/20230628121150.47778-1-liangchen.linux@gmail.com/)
>
> If it's not too late, I would like to be copied in the next version of
> this. (Since it seems to be suggested by me).
>

Sure, no problem. It indeed was suggested by you.

Thanks,
Liang

> >
> > In addition, if small packet performance is not dropping anymore,
> > perhaps we can enable page pool for the normal path as well.
>
> It would be simpler if we start from XDP and add the normal path on
> top. But it's your call anyway.
>
> Thanks
>
> >
> >
> > Thanks,
> > Liang
> >
> >
> >
> > > > > In fact,
> > > > > > without page pool fragmenting enabled, it can even hinder performance
> > > > > > from this perspective.
> > > > > >
> > > > > > Upon further consideration, I tend to believe making page pool the
> > > > > > default option may not be appropriate. As you pointed out, we cannot
> > > > > > simply ignore the performance impact on small packets. Any comments on
> > > > > > this will be much appreciated.
> > > > > >
> > > > > >
> > > > > > Thanks,
> > > > > > Liang
> > > > >
> > > > >
> > > > > So, let's only use page pool for XDP then?
> > > >
> > > > +1
> > > >
> > > > We can start from this.
> > > >
> > > > Thanks
> > > >
> > > > >
> > > > > >
> > > > > > > > > The usage of page pool for big mode is being evaluated now. Thanks!
> > > > > > > > >
> > > > > > > > > > > ---
> > > > > > > > > > >  drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
> > > > > > > > > > >  1 file changed, 146 insertions(+), 42 deletions(-)
> > > > > > > > > > >
> > > > > > > > > > > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > > > > > > > > > index c5dca0d92e64..99c0ca0c1781 100644
> > > > > > > > > > > --- a/drivers/net/virtio_net.c
> > > > > > > > > > > +++ b/drivers/net/virtio_net.c
> > > > > > > > > > > @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
> > > > > > > > > > >  module_param(gso, bool, 0444);
> > > > > > > > > > >  module_param(napi_tx, bool, 0644);
> > > > > > > > > > >
> > > > > > > > > > > +static bool page_pool_enabled;
> > > > > > > > > > > +module_param(page_pool_enabled, bool, 0400);
> > > > > > > > > > > +
> > > > > > > > > > >  /* FIXME: MTU in config. */
> > > > > > > > > > >  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
> > > > > > > > > > >  #define GOOD_COPY_LEN        128
> > > > > > > > > > > @@ -159,6 +162,9 @@ struct receive_queue {
> > > > > > > > > > >       /* Chain pages by the private ptr. */
> > > > > > > > > > >       struct page *pages;
> > > > > > > > > > >
> > > > > > > > > > > +     /* Page pool */
> > > > > > > > > > > +     struct page_pool *page_pool;
> > > > > > > > > > > +
> > > > > > > > > > >       /* Average packet length for mergeable receive buffers. */
> > > > > > > > > > >       struct ewma_pkt_len mrg_avg_pkt_len;
> > > > > > > > > > >
> > > > > > > > > > > @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
> > > > > > > > > > >       return skb;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> > > > > > > > > > > +{
> > > > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > > > > > +     else
> > > > > > > > > > > +             put_page(page);
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  /* Called from bottom half context */
> > > > > > > > > > >  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > > >                                  struct receive_queue *rq,
> > > > > > > > > > > @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
> > > > > > > > > > >       hdr = skb_vnet_hdr(skb);
> > > > > > > > > > >       memcpy(hdr, hdr_p, hdr_len);
> > > > > > > > > > >       if (page_to_free)
> > > > > > > > > > > -             put_page(page_to_free);
> > > > > > > > > > > +             virtnet_put_page(rq, page_to_free);
> > > > > > > > > > >
> > > > > > > > > > >       return skb;
> > > > > > > > > > >  }
> > > > > > > > > > > @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
> > > > > > > > > > >       return ret;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > -static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > > > > > +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
> > > > > > > > > > >  {
> > > > > > > > > > >       struct skb_shared_info *shinfo;
> > > > > > > > > > >       struct page *xdp_page;
> > > > > > > > > > > @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
> > > > > > > > > > >               shinfo = xdp_get_shared_info_from_buff(xdp);
> > > > > > > > > > >               for (i = 0; i < shinfo->nr_frags; i++) {
> > > > > > > > > > >                       xdp_page = skb_frag_page(&shinfo->frags[i]);
> > > > > > > > > > > -                     put_page(xdp_page);
> > > > > > > > > > > +                     virtnet_put_page(rq, xdp_page);
> > > > > > > > > > >               }
> > > > > > > > > > >       }
> > > > > > > > > > >  }
> > > > > > > > > > > @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > > >       if (page_off + *len + tailroom > PAGE_SIZE)
> > > > > > > > > > >               return NULL;
> > > > > > > > > > >
> > > > > > > > > > > -     page = alloc_page(GFP_ATOMIC);
> > > > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > > > > +     else
> > > > > > > > > > > +             page = alloc_page(GFP_ATOMIC);
> > > > > > > > > > > +
> > > > > > > > > > >       if (!page)
> > > > > > > > > > >               return NULL;
> > > > > > > > > > >
> > > > > > > > > > > @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
> > > > > > > > > > >                * is sending packet larger than the MTU.
> > > > > > > > > > >                */
> > > > > > > > > > >               if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> > > > > > > > > > > -                     put_page(p);
> > > > > > > > > > > +                     virtnet_put_page(rq, p);
> > > > > > > > > > >                       goto err_buf;
> > > > > > > > > > >               }
> > > > > > > > > > >
> > > > > > > > > > >               memcpy(page_address(page) + page_off,
> > > > > > > > > > >                      page_address(p) + off, buflen);
> > > > > > > > > > >               page_off += buflen;
> > > > > > > > > > > -             put_page(p);
> > > > > > > > > > > +             virtnet_put_page(rq, p);
> > > > > > > > > > >       }
> > > > > > > > > > >
> > > > > > > > > > >       /* Headroom does not contribute to packet length */
> > > > > > > > > > >       *len = page_off - VIRTIO_XDP_HEADROOM;
> > > > > > > > > > >       return page;
> > > > > > > > > > >  err_buf:
> > > > > > > > > > > -     __free_pages(page, 0);
> > > > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > > > +             page_pool_put_full_page(rq->page_pool, page, true);
> > > > > > > > > > > +     else
> > > > > > > > > > > +             __free_pages(page, 0);
> > > > > > > > > > >       return NULL;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
> > > > > > > > > > >               }
> > > > > > > > > > >               stats->bytes += len;
> > > > > > > > > > >               page = virt_to_head_page(buf);
> > > > > > > > > > > -             put_page(page);
> > > > > > > > > > > +             virtnet_put_page(rq, page);
> > > > > > > > > > >       }
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > > >               cur_frag_size = truesize;
> > > > > > > > > > >               xdp_frags_truesz += cur_frag_size;
> > > > > > > > > > >               if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> > > > > > > > > > > -                     put_page(page);
> > > > > > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > > > > > >                       pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
> > > > > > > > > > >                                dev->name, len, (unsigned long)(truesize - room));
> > > > > > > > > > >                       dev->stats.rx_length_errors++;
> > > > > > > > > > > @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
> > > > > > > > > > >       return 0;
> > > > > > > > > > >
> > > > > > > > > > >  err:
> > > > > > > > > > > -     put_xdp_frags(xdp);
> > > > > > > > > > > +     put_xdp_frags(xdp, rq);
> > > > > > > > > > >       return -EINVAL;
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > > > > > >               if (*len + xdp_room > PAGE_SIZE)
> > > > > > > > > > >                       return NULL;
> > > > > > > > > > >
> > > > > > > > > > > -             xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > > > > > +             if (rq->page_pool)
> > > > > > > > > > > +                     xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > > > > +             else
> > > > > > > > > > > +                     xdp_page = alloc_page(GFP_ATOMIC);
> > > > > > > > > > >               if (!xdp_page)
> > > > > > > > > > >                       return NULL;
> > > > > > > > > > >
> > > > > > > > > > > @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
> > > > > > > > > > >
> > > > > > > > > > >       *frame_sz = PAGE_SIZE;
> > > > > > > > > > >
> > > > > > > > > > > -     put_page(*page);
> > > > > > > > > > > +     virtnet_put_page(rq, *page);
> > > > > > > > > > >
> > > > > > > > > > >       *page = xdp_page;
> > > > > > > > > > >
> > > > > > > > > > > @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > > > > > >               head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
> > > > > > > > > > >               if (unlikely(!head_skb))
> > > > > > > > > > >                       break;
> > > > > > > > > > > +             if (rq->page_pool)
> > > > > > > > > > > +                     skb_mark_for_recycle(head_skb);
> > > > > > > > > > >               return head_skb;
> > > > > > > > > > >
> > > > > > > > > > >       case XDP_TX:
> > > > > > > > > > > @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
> > > > > > > > > > >               break;
> > > > > > > > > > >       }
> > > > > > > > > > >
> > > > > > > > > > > -     put_xdp_frags(&xdp);
> > > > > > > > > > > +     put_xdp_frags(&xdp, rq);
> > > > > > > > > > >
> > > > > > > > > > >  err_xdp:
> > > > > > > > > > > -     put_page(page);
> > > > > > > > > > > +     virtnet_put_page(rq, page);
> > > > > > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > > > > > >
> > > > > > > > > > >       stats->xdp_drops++;
> > > > > > > > > > > @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > >       head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
> > > > > > > > > > >       curr_skb = head_skb;
> > > > > > > > > > >
> > > > > > > > > > > +     if (rq->page_pool)
> > > > > > > > > > > +             skb_mark_for_recycle(curr_skb);
> > > > > > > > > > > +
> > > > > > > > > > >       if (unlikely(!curr_skb))
> > > > > > > > > > >               goto err_skb;
> > > > > > > > > > >       while (--num_buf) {
> > > > > > > > > > > @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > >                       curr_skb = nskb;
> > > > > > > > > > >                       head_skb->truesize += nskb->truesize;
> > > > > > > > > > >                       num_skb_frags = 0;
> > > > > > > > > > > +                     if (rq->page_pool)
> > > > > > > > > > > +                             skb_mark_for_recycle(curr_skb);
> > > > > > > > > > >               }
> > > > > > > > > > >               if (curr_skb != head_skb) {
> > > > > > > > > > >                       head_skb->data_len += len;
> > > > > > > > > > > @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > >               }
> > > > > > > > > > >               offset = buf - page_address(page);
> > > > > > > > > > >               if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> > > > > > > > > > > -                     put_page(page);
> > > > > > > > > > > +                     virtnet_put_page(rq, page);
> > > > > > > > > > >                       skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
> > > > > > > > > > >                                            len, truesize);
> > > > > > > > > > >               } else {
> > > > > > > > > > > @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
> > > > > > > > > > >       return head_skb;
> > > > > > > > > > >
> > > > > > > > > > >  err_skb:
> > > > > > > > > > > -     put_page(page);
> > > > > > > > > > > +     virtnet_put_page(rq, page);
> > > > > > > > > > >       mergeable_buf_free(rq, num_buf, dev, stats);
> > > > > > > > > > >
> > > > > > > > > > >  err_buf:
> > > > > > > > > > > @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
> > > > > > > > > > >        * disabled GSO for XDP, it won't be a big issue.
> > > > > > > > > > >        */
> > > > > > > > > > >       len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> > > > > > > > > > > -     if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > > > > > -             return -ENOMEM;
> > > > > > > > > > > +     if (rq->page_pool) {
> > > > > > > > > > > +             struct page *page;
> > > > > > > > > > >
> > > > > > > > > > > -     buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > > -     buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > > > > -     get_page(alloc_frag->page);
> > > > > > > > > > > -     alloc_frag->offset += len + room;
> > > > > > > > > > > -     hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > > > > > -     if (hole < len + room) {
> > > > > > > > > > > -             /* To avoid internal fragmentation, if there is very likely not
> > > > > > > > > > > -              * enough space for another buffer, add the remaining space to
> > > > > > > > > > > -              * the current buffer.
> > > > > > > > > > > -              * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > > > > > -              * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > > > > > -              */
> > > > > > > > > > > -             if (!headroom)
> > > > > > > > > > > -                     len += hole;
> > > > > > > > > > > -             alloc_frag->offset += hole;
> > > > > > > > > > > -     }
> > > > > > > > > > > +             page = page_pool_dev_alloc_pages(rq->page_pool);
> > > > > > > > > > > +             if (unlikely(!page))
> > > > > > > > > > > +                     return -ENOMEM;
> > > > > > > > > > > +             buf = (char *)page_address(page);
> > > > > > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > > > > +     } else {
> > > > > > > > > > > +             if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> > > > > > > > > > > +                     return -ENOMEM;
> > > > > > > > > > >
> > > > > > > > > > > +             buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> > > > > > > > > > > +             buf += headroom; /* advance address leaving hole at front of pkt */
> > > > > > > > > > > +             get_page(alloc_frag->page);
> > > > > > > > > > > +             alloc_frag->offset += len + room;
> > > > > > > > > > > +             hole = alloc_frag->size - alloc_frag->offset;
> > > > > > > > > > > +             if (hole < len + room) {
> > > > > > > > > > > +                     /* To avoid internal fragmentation, if there is very likely not
> > > > > > > > > > > +                      * enough space for another buffer, add the remaining space to
> > > > > > > > > > > +                      * the current buffer.
> > > > > > > > > > > +                      * XDP core assumes that frame_size of xdp_buff and the length
> > > > > > > > > > > +                      * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> > > > > > > > > > > +                      */
> > > > > > > > > > > +                     if (!headroom)
> > > > > > > > > > > +                             len += hole;
> > > > > > > > > > > +                     alloc_frag->offset += hole;
> > > > > > > > > > > +             }
> > > > > > > > > > > +     }
> > > > > > > > > > >       sg_init_one(rq->sg, buf, len);
> > > > > > > > > > >       ctx = mergeable_len_to_ctx(len + room, headroom);
> > > > > > > > > > >       err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
> > > > > > > > > > >       if (err < 0)
> > > > > > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > > > > > +             virtnet_put_page(rq, virt_to_head_page(buf));
> > > > > > > > > > >
> > > > > > > > > > >       return err;
> > > > > > > > > > >  }
> > > > > > > > > > > @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
> > > > > > > > > > >       if (err < 0)
> > > > > > > > > > >               return err;
> > > > > > > > > > >
> > > > > > > > > > > -     err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > > > > -                                      MEM_TYPE_PAGE_SHARED, NULL);
> > > > > > > > > > > +     if (vi->rq[qp_index].page_pool)
> > > > > > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > > > > +                                              MEM_TYPE_PAGE_POOL,
> > > > > > > > > > > +                                              vi->rq[qp_index].page_pool);
> > > > > > > > > > > +     else
> > > > > > > > > > > +             err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> > > > > > > > > > > +                                              MEM_TYPE_PAGE_SHARED,
> > > > > > > > > > > +                                              NULL);
> > > > > > > > > > > +
> > > > > > > > > > >       if (err < 0)
> > > > > > > > > > >               goto err_xdp_reg_mem_model;
> > > > > > > > > > >
> > > > > > > > > > > @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
> > > > > > > > > > >                               ethtool_sprintf(&p, "tx_queue_%u_%s", i,
> > > > > > > > > > >                                               virtnet_sq_stats_desc[j].desc);
> > > > > > > > > > >               }
> > > > > > > > > > > +             page_pool_ethtool_stats_get_strings(p);
> > > > > > > > > > >               break;
> > > > > > > > > > >       }
> > > > > > > > > > >  }
> > > > > > > > > > > @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
> > > > > > > > > > >       switch (sset) {
> > > > > > > > > > >       case ETH_SS_STATS:
> > > > > > > > > > >               return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> > > > > > > > > > > -                                            VIRTNET_SQ_STATS_LEN);
> > > > > > > > > > > +                                            VIRTNET_SQ_STATS_LEN +
> > > > > > > > > > > +                                             (page_pool_enabled && vi->mergeable_rx_bufs ?
> > > > > > > > > > > +                                              page_pool_ethtool_stats_get_count() : 0));
> > > > > > > > > > >       default:
> > > > > > > > > > >               return -EOPNOTSUPP;
> > > > > > > > > > >       }
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> > > > > > > > > > > +{
> > > > > > > > > > > +#ifdef CONFIG_PAGE_POOL_STATS
> > > > > > > > > > > +     struct virtnet_info *vi = netdev_priv(dev);
> > > > > > > > > > > +     struct page_pool_stats pp_stats = {};
> > > > > > > > > > > +     int i;
> > > > > > > > > > > +
> > > > > > > > > > > +     for (i = 0; i < vi->curr_queue_pairs; i++) {
> > > > > > > > > > > +             if (!vi->rq[i].page_pool)
> > > > > > > > > > > +                     continue;
> > > > > > > > > > > +             page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> > > > > > > > > > > +     }
> > > > > > > > > > > +     page_pool_ethtool_stats_get(data, &pp_stats);
> > > > > > > > > > > +#endif /* CONFIG_PAGE_POOL_STATS */
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > > > > > >                                     struct ethtool_stats *stats, u64 *data)
> > > > > > > > > > >  {
> > > > > > > > > > > @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
> > > > > > > > > > >               } while (u64_stats_fetch_retry(&sq->stats.syncp, start));
> > > > > > > > > > >               idx += VIRTNET_SQ_STATS_LEN;
> > > > > > > > > > >       }
> > > > > > > > > > > +
> > > > > > > > > > > +     virtnet_get_page_pool_stats(dev, &data[idx]);
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > >  static void virtnet_get_channels(struct net_device *dev,
> > > > > > > > > > > @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
> > > > > > > > > > >       for (i = 0; i < vi->max_queue_pairs; i++) {
> > > > > > > > > > >               __netif_napi_del(&vi->rq[i].napi);
> > > > > > > > > > >               __netif_napi_del(&vi->sq[i].napi);
> > > > > > > > > > > +             if (vi->rq[i].page_pool)
> > > > > > > > > > > +                     page_pool_destroy(vi->rq[i].page_pool);
> > > > > > > > > > >       }
> > > > > > > > > > >
> > > > > > > > > > >       /* We called __netif_napi_del(),
> > > > > > > > > > > @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
> > > > > > > > > > >       struct virtnet_info *vi = vq->vdev->priv;
> > > > > > > > > > >       int i = vq2rxq(vq);
> > > > > > > > > > >
> > > > > > > > > > > -     if (vi->mergeable_rx_bufs)
> > > > > > > > > > > -             put_page(virt_to_head_page(buf));
> > > > > > > > > > > -     else if (vi->big_packets)
> > > > > > > > > > > +     if (vi->mergeable_rx_bufs) {
> > > > > > > > > > > +             if (vi->rq[i].page_pool) {
> > > > > > > > > > > +                     page_pool_put_full_page(vi->rq[i].page_pool,
> > > > > > > > > > > +                                             virt_to_head_page(buf),
> > > > > > > > > > > +                                             true);
> > > > > > > > > > > +             } else {
> > > > > > > > > > > +                     put_page(virt_to_head_page(buf));
> > > > > > > > > > > +             }
> > > > > > > > > > > +     } else if (vi->big_packets) {
> > > > > > > > > > >               give_pages(&vi->rq[i], buf);
> > > > > > > > > > > -     else
> > > > > > > > > > > +     } else {
> > > > > > > > > > >               put_page(virt_to_head_page(buf));
> > > > > > > > > > > +     }
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > >  static void free_unused_bufs(struct virtnet_info *vi)
> > > > > > > > > > > @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
> > > > > > > > > > >       virtnet_free_queues(vi);
> > > > > > > > > > >  }
> > > > > > > > > > >
> > > > > > > > > > > +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> > > > > > > > > > > +{
> > > > > > > > > > > +     struct virtio_device *vdev = rq->vq->vdev;
> > > > > > > > > > > +
> > > > > > > > > > > +     struct page_pool_params pp_params = {
> > > > > > > > > > > +             .order = 0,
> > > > > > > > > > > +             .pool_size = rq->vq->num_max,
> > > > > > > > > > > +             .nid = dev_to_node(vdev->dev.parent),
> > > > > > > > > > > +             .dev = vdev->dev.parent,
> > > > > > > > > > > +             .offset = 0,
> > > > > > > > > > > +     };
> > > > > > > > > > > +
> > > > > > > > > > > +     rq->page_pool = page_pool_create(&pp_params);
> > > > > > > > > > > +     if (IS_ERR(rq->page_pool)) {
> > > > > > > > > > > +             dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> > > > > > > > > > > +                      PTR_ERR(rq->page_pool));
> > > > > > > > > > > +             rq->page_pool = NULL;
> > > > > > > > > > > +     }
> > > > > > > > > > > +}
> > > > > > > > > > > +
> > > > > > > > > > >  /* How large should a single buffer be so a queue full of these can fit at
> > > > > > > > > > >   * least one full packet?
> > > > > > > > > > >   * Logic below assumes the mergeable buffer header is used.
> > > > > > > > > > > @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
> > > > > > > > > > >               vi->rq[i].vq = vqs[rxq2vq(i)];
> > > > > > > > > > >               vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
> > > > > > > > > > >               vi->sq[i].vq = vqs[txq2vq(i)];
> > > > > > > > > > > +
> > > > > > > > > > > +             if (page_pool_enabled && vi->mergeable_rx_bufs)
> > > > > > > > > > > +                     virtnet_alloc_page_pool(&vi->rq[i]);
> > > > > > > > > > > +             else
> > > > > > > > > > > +                     dev_warn(&vi->vdev->dev,
> > > > > > > > > > > +                              "page pool only support mergeable mode\n");
> > > > > > > > > > > +
> > > > > > > > > > >       }
> > > > > > > > > > >
> > > > > > > > > > >       /* run here: ret == 0. */
> > > > > > > > > > > --
> > > > > > > > > > > 2.31.1
> > > > > > > > > >
> > > > > > > >
> > > > >
> > > >
> >
>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-26  5:46 ` [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance Liang Chen
                     ` (2 preceding siblings ...)
  2023-05-28  6:20   ` Michael S. Tsirkin
@ 2023-11-29 14:50   ` Zhu Yanjun
  2023-11-29 14:59     ` Michael S. Tsirkin
  2023-11-30  7:17   ` Zhu Yanjun
  4 siblings, 1 reply; 56+ messages in thread
From: Zhu Yanjun @ 2023-11-29 14:50 UTC (permalink / raw)
  To: Liang Chen, jasowang, mst
  Cc: virtualization, netdev, linux-kernel, xuanzhuo, kuba, edumazet,
	davem, pabeni, alexander.duyck

On 2023/5/26 13:46, Liang Chen wrote:
> The implementation at the moment uses one page per packet in both the
> normal and XDP path. In addition, introducing a module parameter to enable
> or disable the usage of page pool (disabled by default).
> 
> In single-core vm testing environments, it gives a modest performance gain
> in the normal path.
>    Upstream codebase: 47.5 Gbits/sec
>    Upstream codebase + page_pool support: 50.2 Gbits/sec
> 
> In multi-core vm testing environments, The most significant performance
> gain is observed in XDP cpumap:
>    Upstream codebase: 1.38 Gbits/sec
>    Upstream codebase + page_pool support: 9.74 Gbits/sec
> 
> With this foundation, we can further integrate page pool fragmentation and
> DMA map/unmap support.
> 
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> ---
>   drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
>   1 file changed, 146 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c5dca0d92e64..99c0ca0c1781 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
>   module_param(gso, bool, 0444);
>   module_param(napi_tx, bool, 0644);
>   
> +static bool page_pool_enabled;
> +module_param(page_pool_enabled, bool, 0400);
> +
>   /* FIXME: MTU in config. */
>   #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>   #define GOOD_COPY_LEN	128
> @@ -159,6 +162,9 @@ struct receive_queue {
>   	/* Chain pages by the private ptr. */
>   	struct page *pages;
>   
> +	/* Page pool */
> +	struct page_pool *page_pool;
> +
>   	/* Average packet length for mergeable receive buffers. */
>   	struct ewma_pkt_len mrg_avg_pkt_len;
>   
> @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
>   	return skb;
>   }
>   
> +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> +{
> +	if (rq->page_pool)
> +		page_pool_put_full_page(rq->page_pool, page, true);
> +	else
> +		put_page(page);
> +}
> +
>   /* Called from bottom half context */
>   static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>   				   struct receive_queue *rq,
> @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>   	hdr = skb_vnet_hdr(skb);
>   	memcpy(hdr, hdr_p, hdr_len);
>   	if (page_to_free)
> -		put_page(page_to_free);
> +		virtnet_put_page(rq, page_to_free);
>   
>   	return skb;
>   }
> @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
>   	return ret;
>   }
>   
> -static void put_xdp_frags(struct xdp_buff *xdp)
> +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
>   {
>   	struct skb_shared_info *shinfo;
>   	struct page *xdp_page;
> @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
>   		shinfo = xdp_get_shared_info_from_buff(xdp);
>   		for (i = 0; i < shinfo->nr_frags; i++) {
>   			xdp_page = skb_frag_page(&shinfo->frags[i]);
> -			put_page(xdp_page);
> +			virtnet_put_page(rq, xdp_page);
>   		}
>   	}
>   }
> @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>   	if (page_off + *len + tailroom > PAGE_SIZE)
>   		return NULL;
>   
> -	page = alloc_page(GFP_ATOMIC);
> +	if (rq->page_pool)
> +		page = page_pool_dev_alloc_pages(rq->page_pool);
> +	else
> +		page = alloc_page(GFP_ATOMIC);
> +
>   	if (!page)
>   		return NULL;
>   
> @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>   		 * is sending packet larger than the MTU.
>   		 */
>   		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> -			put_page(p);
> +			virtnet_put_page(rq, p);
>   			goto err_buf;
>   		}
>   
>   		memcpy(page_address(page) + page_off,
>   		       page_address(p) + off, buflen);
>   		page_off += buflen;
> -		put_page(p);
> +		virtnet_put_page(rq, p);
>   	}
>   
>   	/* Headroom does not contribute to packet length */
>   	*len = page_off - VIRTIO_XDP_HEADROOM;
>   	return page;
>   err_buf:
> -	__free_pages(page, 0);
> +	if (rq->page_pool)
> +		page_pool_put_full_page(rq->page_pool, page, true);
> +	else
> +		__free_pages(page, 0);
>   	return NULL;
>   }
>   
> @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
>   		}
>   		stats->bytes += len;
>   		page = virt_to_head_page(buf);
> -		put_page(page);
> +		virtnet_put_page(rq, page);
>   	}
>   }
>   
> @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>   		cur_frag_size = truesize;
>   		xdp_frags_truesz += cur_frag_size;
>   		if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> -			put_page(page);
> +			virtnet_put_page(rq, page);
>   			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
>   				 dev->name, len, (unsigned long)(truesize - room));
>   			dev->stats.rx_length_errors++;
> @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>   	return 0;
>   
>   err:
> -	put_xdp_frags(xdp);
> +	put_xdp_frags(xdp, rq);
>   	return -EINVAL;
>   }
>   
> @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>   		if (*len + xdp_room > PAGE_SIZE)
>   			return NULL;
>   
> -		xdp_page = alloc_page(GFP_ATOMIC);
> +		if (rq->page_pool)
> +			xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> +		else
> +			xdp_page = alloc_page(GFP_ATOMIC);
>   		if (!xdp_page)
>   			return NULL;
>   
> @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>   
>   	*frame_sz = PAGE_SIZE;
>   
> -	put_page(*page);
> +	virtnet_put_page(rq, *page);
>   
>   	*page = xdp_page;
>   
> @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
>   		head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
>   		if (unlikely(!head_skb))
>   			break;
> +		if (rq->page_pool)
> +			skb_mark_for_recycle(head_skb);
>   		return head_skb;
>   
>   	case XDP_TX:
> @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
>   		break;
>   	}
>   
> -	put_xdp_frags(&xdp);
> +	put_xdp_frags(&xdp, rq);
>   
>   err_xdp:
> -	put_page(page);
> +	virtnet_put_page(rq, page);
>   	mergeable_buf_free(rq, num_buf, dev, stats);
>   
>   	stats->xdp_drops++;
> @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
>   	curr_skb = head_skb;
>   
> +	if (rq->page_pool)
> +		skb_mark_for_recycle(curr_skb);
> +
>   	if (unlikely(!curr_skb))
>   		goto err_skb;
>   	while (--num_buf) {
> @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   			curr_skb = nskb;
>   			head_skb->truesize += nskb->truesize;
>   			num_skb_frags = 0;
> +			if (rq->page_pool)
> +				skb_mark_for_recycle(curr_skb);
>   		}
>   		if (curr_skb != head_skb) {
>   			head_skb->data_len += len;
> @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   		}
>   		offset = buf - page_address(page);
>   		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> -			put_page(page);
> +			virtnet_put_page(rq, page);
>   			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
>   					     len, truesize);
>   		} else {
> @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   	return head_skb;
>   
>   err_skb:
> -	put_page(page);
> +	virtnet_put_page(rq, page);
>   	mergeable_buf_free(rq, num_buf, dev, stats);
>   
>   err_buf:
> @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>   	 * disabled GSO for XDP, it won't be a big issue.
>   	 */
>   	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> -	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> -		return -ENOMEM;
> +	if (rq->page_pool) {
> +		struct page *page;
>   
> -	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> -	buf += headroom; /* advance address leaving hole at front of pkt */
> -	get_page(alloc_frag->page);
> -	alloc_frag->offset += len + room;
> -	hole = alloc_frag->size - alloc_frag->offset;
> -	if (hole < len + room) {
> -		/* To avoid internal fragmentation, if there is very likely not
> -		 * enough space for another buffer, add the remaining space to
> -		 * the current buffer.
> -		 * XDP core assumes that frame_size of xdp_buff and the length
> -		 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> -		 */
> -		if (!headroom)
> -			len += hole;
> -		alloc_frag->offset += hole;
> -	}
> +		page = page_pool_dev_alloc_pages(rq->page_pool);
> +		if (unlikely(!page))
> +			return -ENOMEM;
> +		buf = (char *)page_address(page);
> +		buf += headroom; /* advance address leaving hole at front of pkt */
> +	} else {
> +		if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> +			return -ENOMEM;
>   
> +		buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +		buf += headroom; /* advance address leaving hole at front of pkt */
> +		get_page(alloc_frag->page);
> +		alloc_frag->offset += len + room;
> +		hole = alloc_frag->size - alloc_frag->offset;
> +		if (hole < len + room) {
> +			/* To avoid internal fragmentation, if there is very likely not
> +			 * enough space for another buffer, add the remaining space to
> +			 * the current buffer.
> +			 * XDP core assumes that frame_size of xdp_buff and the length
> +			 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> +			 */
> +			if (!headroom)
> +				len += hole;
> +			alloc_frag->offset += hole;
> +		}
> +	}
>   	sg_init_one(rq->sg, buf, len);
>   	ctx = mergeable_len_to_ctx(len + room, headroom);
>   	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
>   	if (err < 0)
> -		put_page(virt_to_head_page(buf));
> +		virtnet_put_page(rq, virt_to_head_page(buf));
>   
>   	return err;
>   }
> @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
>   	if (err < 0)
>   		return err;
>   
> -	err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> -					 MEM_TYPE_PAGE_SHARED, NULL);
> +	if (vi->rq[qp_index].page_pool)
> +		err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> +						 MEM_TYPE_PAGE_POOL,
> +						 vi->rq[qp_index].page_pool);
> +	else
> +		err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> +						 MEM_TYPE_PAGE_SHARED,
> +						 NULL);
> +
>   	if (err < 0)
>   		goto err_xdp_reg_mem_model;
>   
> @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
>   				ethtool_sprintf(&p, "tx_queue_%u_%s", i,
>   						virtnet_sq_stats_desc[j].desc);
>   		}
> +		page_pool_ethtool_stats_get_strings(p);
>   		break;
>   	}
>   }
> @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
>   	switch (sset) {
>   	case ETH_SS_STATS:
>   		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> -					       VIRTNET_SQ_STATS_LEN);
> +					       VIRTNET_SQ_STATS_LEN +
> +						(page_pool_enabled && vi->mergeable_rx_bufs ?
> +						 page_pool_ethtool_stats_get_count() : 0));

vi->curr_queue_pairs should not be multiplied by
page_pool_ethtool_stats_get_count(). The page_pool strings and counters
are emitted only once, after the per-queue stats, so their count should
be added outside the multiplication, as in the sketch below.
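
A rough sketch of that idea (illustrative only, not a tested change to
this patch):

	case ETH_SS_STATS:
		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
					       VIRTNET_SQ_STATS_LEN) +
		       (page_pool_enabled && vi->mergeable_rx_bufs ?
			page_pool_ethtool_stats_get_count() : 0);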

Zhu Yanjun
>   	default:
>   		return -EOPNOTSUPP;
>   	}
>   }
>   
> +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> +{
> +#ifdef CONFIG_PAGE_POOL_STATS
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	struct page_pool_stats pp_stats = {};
> +	int i;
> +
> +	for (i = 0; i < vi->curr_queue_pairs; i++) {
> +		if (!vi->rq[i].page_pool)
> +			continue;
> +		page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> +	}
> +	page_pool_ethtool_stats_get(data, &pp_stats);
> +#endif /* CONFIG_PAGE_POOL_STATS */
> +}
> +
>   static void virtnet_get_ethtool_stats(struct net_device *dev,
>   				      struct ethtool_stats *stats, u64 *data)
>   {
> @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
>   		} while (u64_stats_fetch_retry(&sq->stats.syncp, start));
>   		idx += VIRTNET_SQ_STATS_LEN;
>   	}
> +
> +	virtnet_get_page_pool_stats(dev, &data[idx]);
>   }
>   
>   static void virtnet_get_channels(struct net_device *dev,
> @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
>   	for (i = 0; i < vi->max_queue_pairs; i++) {
>   		__netif_napi_del(&vi->rq[i].napi);
>   		__netif_napi_del(&vi->sq[i].napi);
> +		if (vi->rq[i].page_pool)
> +			page_pool_destroy(vi->rq[i].page_pool);
>   	}
>   
>   	/* We called __netif_napi_del(),
> @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
>   	struct virtnet_info *vi = vq->vdev->priv;
>   	int i = vq2rxq(vq);
>   
> -	if (vi->mergeable_rx_bufs)
> -		put_page(virt_to_head_page(buf));
> -	else if (vi->big_packets)
> +	if (vi->mergeable_rx_bufs) {
> +		if (vi->rq[i].page_pool) {
> +			page_pool_put_full_page(vi->rq[i].page_pool,
> +						virt_to_head_page(buf),
> +						true);
> +		} else {
> +			put_page(virt_to_head_page(buf));
> +		}
> +	} else if (vi->big_packets) {
>   		give_pages(&vi->rq[i], buf);
> -	else
> +	} else {
>   		put_page(virt_to_head_page(buf));
> +	}
>   }
>   
>   static void free_unused_bufs(struct virtnet_info *vi)
> @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
>   	virtnet_free_queues(vi);
>   }
>   
> +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> +{
> +	struct virtio_device *vdev = rq->vq->vdev;
> +
> +	struct page_pool_params pp_params = {
> +		.order = 0,
> +		.pool_size = rq->vq->num_max,
> +		.nid = dev_to_node(vdev->dev.parent),
> +		.dev = vdev->dev.parent,
> +		.offset = 0,
> +	};
> +
> +	rq->page_pool = page_pool_create(&pp_params);
> +	if (IS_ERR(rq->page_pool)) {
> +		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> +			 PTR_ERR(rq->page_pool));
> +		rq->page_pool = NULL;
> +	}
> +}
> +
>   /* How large should a single buffer be so a queue full of these can fit at
>    * least one full packet?
>    * Logic below assumes the mergeable buffer header is used.
> @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
>   		vi->rq[i].vq = vqs[rxq2vq(i)];
>   		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
>   		vi->sq[i].vq = vqs[txq2vq(i)];
> +
> +		if (page_pool_enabled && vi->mergeable_rx_bufs)
> +			virtnet_alloc_page_pool(&vi->rq[i]);
> +		else
> +			dev_warn(&vi->vdev->dev,
> +				 "page pool only support mergeable mode\n");
> +
>   	}
>   
>   	/* run here: ret == 0. */


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-11-29 14:50   ` Zhu Yanjun
@ 2023-11-29 14:59     ` Michael S. Tsirkin
  2023-11-29 15:22       ` Zhu Yanjun
  0 siblings, 1 reply; 56+ messages in thread
From: Michael S. Tsirkin @ 2023-11-29 14:59 UTC (permalink / raw)
  To: Zhu Yanjun
  Cc: Liang Chen, jasowang, virtualization, netdev, linux-kernel,
	xuanzhuo, kuba, edumazet, davem, pabeni, alexander.duyck

On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote:
> On 2023/5/26 13:46, Liang Chen wrote:


what made you respond to a patch from May, now?

-- 
MST


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-11-29 14:59     ` Michael S. Tsirkin
@ 2023-11-29 15:22       ` Zhu Yanjun
  2023-11-29 15:29         ` Zhu Yanjun
  0 siblings, 1 reply; 56+ messages in thread
From: Zhu Yanjun @ 2023-11-29 15:22 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Liang Chen, jasowang, virtualization, netdev, linux-kernel,
	xuanzhuo, kuba, edumazet, davem, pabeni, alexander.duyck


On 2023/11/29 22:59, Michael S. Tsirkin wrote:
> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote:
>> On 2023/5/26 13:46, Liang Chen wrote:
>
> what made you respond to a patch from May, now?

I want to apply page_pool to our virtio_net. This virtio_net works on 
our device.

I want to verify whether page_pool on virtio_net with our device can 
improve the performance or not.

And I found that ethtool is wrong.

I use virtio_net on our device. I found that page member variable in rq 
is not used in recv path.

When virtio_net is modprobe, I checked page member variable in rq with 
kprobe or crash tool.  page member variable in rq is always NULL.

But sg in recv path is used.

So how to use page member variable in rq? If page member variable in rq 
is always NULL, can we remove it?

BTW, I use ping and iperf tool to make tests with virtio_net. In the 
tests, page member variable in rq is always NULL.

It is interesting.

Zhu Yanjun

>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-11-29 15:22       ` Zhu Yanjun
@ 2023-11-29 15:29         ` Zhu Yanjun
  2023-11-30  2:34           ` Xuan Zhuo
  0 siblings, 1 reply; 56+ messages in thread
From: Zhu Yanjun @ 2023-11-29 15:29 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Liang Chen, jasowang, virtualization, netdev, linux-kernel,
	xuanzhuo, kuba, edumazet, davem, pabeni, alexander.duyck


On 2023/11/29 23:22, Zhu Yanjun wrote:
>
> On 2023/11/29 22:59, Michael S. Tsirkin wrote:
>> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote:
>>> On 2023/5/26 13:46, Liang Chen wrote:
>>
>> what made you respond to a patch from May, now?
>
> I want to apply page_pool to our virtio_net. This virtio_net works on 
> our device.
>
> I want to verify whether page_pool on virtio_net with our device can 
> improve the performance or not.
>
> And I found that ethtool is wrong.
>
> I use virtio_net on our device. I found that page member variable in 
> rq is not used in recv path.
>
> When virtio_net is modprobe, I checked page member variable in rq with 
> kprobe or crash tool.  page member variable in rq is always NULL.
>
> But sg in recv path is used.
>
> So how to use page member variable in rq? If page member variable in 
> rq is always NULL, can we remove it?
>
> BTW, I use ping and iperf tool to make tests with virtio_net. In the 
> tests, page member variable in rq is always NULL.


And I replaced page member variable in rq with page_pool, but the 
statistics of page_pool are always 0.

It is interesting that page_pool member variable in rq is not used in 
ping and iperf tests.

I am not sure what tests can make page member variable not NULL. ^_^

Best Regards,

Zhu Yanjun


>
> It is interesting.
>
> Zhu Yanjun
>
>>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-11-29 15:29         ` Zhu Yanjun
@ 2023-11-30  2:34           ` Xuan Zhuo
  2023-11-30  5:30             ` Zhu Yanjun
  0 siblings, 1 reply; 56+ messages in thread
From: Xuan Zhuo @ 2023-11-30  2:34 UTC (permalink / raw)
  To: Zhu Yanjun
  Cc: Liang Chen, jasowang, virtualization, netdev, linux-kernel, kuba,
	edumazet, davem, pabeni, alexander.duyck, Michael S. Tsirkin

On Wed, 29 Nov 2023 23:29:10 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote:
>
> On 2023/11/29 23:22, Zhu Yanjun wrote:
> >
> > On 2023/11/29 22:59, Michael S. Tsirkin wrote:
> >> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote:
> >>> On 2023/5/26 13:46, Liang Chen wrote:
> >>
> >> what made you respond to a patch from May, now?
> >
> > I want to apply page_pool to our virtio_net. This virtio_net works on
> > our device.
> >
> > I want to verify whether page_pool on virtio_net with our device can
> > improve the performance or not.
> >
> > And I found that ethtool is wrong.
> >
> > I use virtio_net on our device. I found that page member variable in
> > rq is not used in recv path.
> >
> > When virtio_net is modprobe, I checked page member variable in rq with
> > kprobe or crash tool.  page member variable in rq is always NULL.
> >
> > But sg in recv path is used.
> >
> > So how to use page member variable in rq? If page member variable in
> > rq is always NULL, can we remove it?
> >
> > BTW, I use ping and iperf tool to make tests with virtio_net. In the
> > tests, page member variable in rq is always NULL.
>
>
> And I replaced page member variable in rq with page_pool, but the
> statistics of page_pool are always 0.
>
> It is interesting that page_pool member variable in rq is not used in
> ping and iperf tests.
>
> I am not sure what tests can make page member variable not NULL. ^_^

Do you mean rq->pages?

That is for big mode.
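
For reference, rq->pages is only filled on the big-mode path; roughly (a
sketch based on the in-tree give_pages() helper, shown here only for
illustration):

	/* Chain the page onto rq->pages via page->private (big mode only). */
	static void give_pages(struct receive_queue *rq, struct page *page)
	{
		struct page *end;

		/* Find end of list, sew the whole thing into rq->pages. */
		for (end = page; end->private; end = (struct page *)end->private);
		end->private = (unsigned long)rq->pages;
		rq->pages = page;
	}

Mergeable mode never takes that path on receive, so there the field
simply stays NULL.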

Thanks.


>
> Best Regards,
>
> Zhu Yanjun
>
>
> >
> > It is interesting.
> >
> > Zhu Yanjun
> >
> >>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-11-30  2:34           ` Xuan Zhuo
@ 2023-11-30  5:30             ` Zhu Yanjun
  2023-12-01  1:38               ` Xuan Zhuo
  0 siblings, 1 reply; 56+ messages in thread
From: Zhu Yanjun @ 2023-11-30  5:30 UTC (permalink / raw)
  To: Xuan Zhuo
  Cc: Liang Chen, jasowang, virtualization, netdev, linux-kernel, kuba,
	edumazet, davem, pabeni, alexander.duyck, Michael S. Tsirkin


On 2023/11/30 10:34, Xuan Zhuo wrote:
> On Wed, 29 Nov 2023 23:29:10 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote:
>> On 2023/11/29 23:22, Zhu Yanjun wrote:
>>> On 2023/11/29 22:59, Michael S. Tsirkin wrote:
>>>> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote:
>>>>> On 2023/5/26 13:46, Liang Chen wrote:
>>>> what made you respond to a patch from May, now?
>>> I want to apply page_pool to our virtio_net. This virtio_net works on
>>> our device.
>>>
>>> I want to verify whether page_pool on virtio_net with our device can
>>> improve the performance or not.
>>>
>>> And I found that ethtool is wrong.
>>>
>>> I use virtio_net on our device. I found that page member variable in
>>> rq is not used in recv path.
>>>
>>> When virtio_net is modprobe, I checked page member variable in rq with
>>> kprobe or crash tool.  page member variable in rq is always NULL.
>>>
>>> But sg in recv path is used.
>>>
>>> So how to use page member variable in rq? If page member variable in
>>> rq is always NULL, can we remove it?
>>>
>>> BTW, I use ping and iperf tool to make tests with virtio_net. In the
>>> tests, page member variable in rq is always NULL.
>>
>> And I replaced page member variable in rq with page_pool, but the
>> statistics of page_pool are always 0.
>>
>> It is interesting that page_pool member variable in rq is not used in
>> ping and iperf tests.
>>
>> I am not sure what tests can make page member variable not NULL. ^_^
> Do you mean rq->pages?
>
> That is for big mode.

Hi, Xuan

Got it. What is big mode? Do you mean big packet size? I run iperf with 
the packet size 2^23.

The rq->pages is still NULL.

It is interesting.

Zhu Yanjun


>
> Thanks.
>
>
>> Best Regards,
>>
>> Zhu Yanjun
>>
>>
>>> It is interesting.
>>>
>>> Zhu Yanjun
>>>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-05-26  5:46 ` [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance Liang Chen
                     ` (3 preceding siblings ...)
  2023-11-29 14:50   ` Zhu Yanjun
@ 2023-11-30  7:17   ` Zhu Yanjun
  4 siblings, 0 replies; 56+ messages in thread
From: Zhu Yanjun @ 2023-11-30  7:17 UTC (permalink / raw)
  To: Liang Chen, jasowang, mst
  Cc: virtualization, netdev, linux-kernel, xuanzhuo, kuba, edumazet,
	davem, pabeni, alexander.duyck

On 2023/5/26 13:46, Liang Chen wrote:
> The implementation at the moment uses one page per packet in both the
> normal and XDP path. In addition, introducing a module parameter to enable
> or disable the usage of page pool (disabled by default).
> 
> In single-core vm testing environments, it gives a modest performance gain
> in the normal path.
>    Upstream codebase: 47.5 Gbits/sec
>    Upstream codebase + page_pool support: 50.2 Gbits/sec
> 
> In multi-core vm testing environments, The most significant performance
> gain is observed in XDP cpumap:
>    Upstream codebase: 1.38 Gbits/sec
>    Upstream codebase + page_pool support: 9.74 Gbits/sec
> 
> With this foundation, we can further integrate page pool fragmentation and
> DMA map/unmap support.
> 
> Signed-off-by: Liang Chen <liangchen.linux@gmail.com>
> ---
>   drivers/net/virtio_net.c | 188 ++++++++++++++++++++++++++++++---------
>   1 file changed, 146 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index c5dca0d92e64..99c0ca0c1781 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -31,6 +31,9 @@ module_param(csum, bool, 0444);
>   module_param(gso, bool, 0444);
>   module_param(napi_tx, bool, 0644);
>   
> +static bool page_pool_enabled;
> +module_param(page_pool_enabled, bool, 0400);
> +
>   /* FIXME: MTU in config. */
>   #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>   #define GOOD_COPY_LEN	128
> @@ -159,6 +162,9 @@ struct receive_queue {
>   	/* Chain pages by the private ptr. */
>   	struct page *pages;
>   
> +	/* Page pool */
> +	struct page_pool *page_pool;
> +
>   	/* Average packet length for mergeable receive buffers. */
>   	struct ewma_pkt_len mrg_avg_pkt_len;
>   
> @@ -459,6 +465,14 @@ static struct sk_buff *virtnet_build_skb(void *buf, unsigned int buflen,
>   	return skb;
>   }
>   
> +static void virtnet_put_page(struct receive_queue *rq, struct page *page)
> +{
> +	if (rq->page_pool)
> +		page_pool_put_full_page(rq->page_pool, page, true);
> +	else
> +		put_page(page);
> +}
> +
>   /* Called from bottom half context */
>   static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>   				   struct receive_queue *rq,
> @@ -555,7 +569,7 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>   	hdr = skb_vnet_hdr(skb);
>   	memcpy(hdr, hdr_p, hdr_len);
>   	if (page_to_free)
> -		put_page(page_to_free);
> +		virtnet_put_page(rq, page_to_free);
>   
>   	return skb;
>   }
> @@ -802,7 +816,7 @@ static int virtnet_xdp_xmit(struct net_device *dev,
>   	return ret;
>   }
>   
> -static void put_xdp_frags(struct xdp_buff *xdp)
> +static void put_xdp_frags(struct xdp_buff *xdp, struct receive_queue *rq)
>   {
>   	struct skb_shared_info *shinfo;
>   	struct page *xdp_page;
> @@ -812,7 +826,7 @@ static void put_xdp_frags(struct xdp_buff *xdp)
>   		shinfo = xdp_get_shared_info_from_buff(xdp);
>   		for (i = 0; i < shinfo->nr_frags; i++) {
>   			xdp_page = skb_frag_page(&shinfo->frags[i]);
> -			put_page(xdp_page);
> +			virtnet_put_page(rq, xdp_page);
>   		}
>   	}
>   }
> @@ -903,7 +917,11 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>   	if (page_off + *len + tailroom > PAGE_SIZE)
>   		return NULL;
>   
> -	page = alloc_page(GFP_ATOMIC);
> +	if (rq->page_pool)
> +		page = page_pool_dev_alloc_pages(rq->page_pool);
> +	else
> +		page = alloc_page(GFP_ATOMIC);
> +
>   	if (!page)
>   		return NULL;
>   
> @@ -926,21 +944,24 @@ static struct page *xdp_linearize_page(struct receive_queue *rq,
>   		 * is sending packet larger than the MTU.
>   		 */
>   		if ((page_off + buflen + tailroom) > PAGE_SIZE) {
> -			put_page(p);
> +			virtnet_put_page(rq, p);
>   			goto err_buf;
>   		}
>   
>   		memcpy(page_address(page) + page_off,
>   		       page_address(p) + off, buflen);
>   		page_off += buflen;
> -		put_page(p);
> +		virtnet_put_page(rq, p);
>   	}
>   
>   	/* Headroom does not contribute to packet length */
>   	*len = page_off - VIRTIO_XDP_HEADROOM;
>   	return page;
>   err_buf:
> -	__free_pages(page, 0);
> +	if (rq->page_pool)
> +		page_pool_put_full_page(rq->page_pool, page, true);
> +	else
> +		__free_pages(page, 0);
>   	return NULL;
>   }
>   
> @@ -1144,7 +1165,7 @@ static void mergeable_buf_free(struct receive_queue *rq, int num_buf,
>   		}
>   		stats->bytes += len;
>   		page = virt_to_head_page(buf);
> -		put_page(page);
> +		virtnet_put_page(rq, page);
>   	}
>   }
>   
> @@ -1264,7 +1285,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>   		cur_frag_size = truesize;
>   		xdp_frags_truesz += cur_frag_size;
>   		if (unlikely(len > truesize - room || cur_frag_size > PAGE_SIZE)) {
> -			put_page(page);
> +			virtnet_put_page(rq, page);
>   			pr_debug("%s: rx error: len %u exceeds truesize %lu\n",
>   				 dev->name, len, (unsigned long)(truesize - room));
>   			dev->stats.rx_length_errors++;
> @@ -1283,7 +1304,7 @@ static int virtnet_build_xdp_buff_mrg(struct net_device *dev,
>   	return 0;
>   
>   err:
> -	put_xdp_frags(xdp);
> +	put_xdp_frags(xdp, rq);
>   	return -EINVAL;
>   }
>   
> @@ -1344,7 +1365,10 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>   		if (*len + xdp_room > PAGE_SIZE)
>   			return NULL;
>   
> -		xdp_page = alloc_page(GFP_ATOMIC);
> +		if (rq->page_pool)
> +			xdp_page = page_pool_dev_alloc_pages(rq->page_pool);
> +		else
> +			xdp_page = alloc_page(GFP_ATOMIC);
>   		if (!xdp_page)
>   			return NULL;
>   
> @@ -1354,7 +1378,7 @@ static void *mergeable_xdp_get_buf(struct virtnet_info *vi,
>   
>   	*frame_sz = PAGE_SIZE;
>   
> -	put_page(*page);
> +	virtnet_put_page(rq, *page);
>   
>   	*page = xdp_page;
>   
> @@ -1400,6 +1424,8 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
>   		head_skb = build_skb_from_xdp_buff(dev, vi, &xdp, xdp_frags_truesz);
>   		if (unlikely(!head_skb))
>   			break;
> +		if (rq->page_pool)
> +			skb_mark_for_recycle(head_skb);
>   		return head_skb;
>   
>   	case XDP_TX:
> @@ -1410,10 +1436,10 @@ static struct sk_buff *receive_mergeable_xdp(struct net_device *dev,
>   		break;
>   	}
>   
> -	put_xdp_frags(&xdp);
> +	put_xdp_frags(&xdp, rq);
>   
>   err_xdp:
> -	put_page(page);
> +	virtnet_put_page(rq, page);
>   	mergeable_buf_free(rq, num_buf, dev, stats);
>   
>   	stats->xdp_drops++;
> @@ -1467,6 +1493,9 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   	head_skb = page_to_skb(vi, rq, page, offset, len, truesize, headroom);
>   	curr_skb = head_skb;
>   
> +	if (rq->page_pool)
> +		skb_mark_for_recycle(curr_skb);
> +
>   	if (unlikely(!curr_skb))
>   		goto err_skb;
>   	while (--num_buf) {
> @@ -1509,6 +1538,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   			curr_skb = nskb;
>   			head_skb->truesize += nskb->truesize;
>   			num_skb_frags = 0;
> +			if (rq->page_pool)
> +				skb_mark_for_recycle(curr_skb);
>   		}
>   		if (curr_skb != head_skb) {
>   			head_skb->data_len += len;
> @@ -1517,7 +1548,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   		}
>   		offset = buf - page_address(page);
>   		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
> -			put_page(page);
> +			virtnet_put_page(rq, page);
>   			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
>   					     len, truesize);
>   		} else {
> @@ -1530,7 +1561,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>   	return head_skb;
>   
>   err_skb:
> -	put_page(page);
> +	virtnet_put_page(rq, page);
>   	mergeable_buf_free(rq, num_buf, dev, stats);
>   
>   err_buf:
> @@ -1737,31 +1768,40 @@ static int add_recvbuf_mergeable(struct virtnet_info *vi,
>   	 * disabled GSO for XDP, it won't be a big issue.
>   	 */
>   	len = get_mergeable_buf_len(rq, &rq->mrg_avg_pkt_len, room);
> -	if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> -		return -ENOMEM;
> +	if (rq->page_pool) {
> +		struct page *page;
>   
> -	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> -	buf += headroom; /* advance address leaving hole at front of pkt */
> -	get_page(alloc_frag->page);
> -	alloc_frag->offset += len + room;
> -	hole = alloc_frag->size - alloc_frag->offset;
> -	if (hole < len + room) {
> -		/* To avoid internal fragmentation, if there is very likely not
> -		 * enough space for another buffer, add the remaining space to
> -		 * the current buffer.
> -		 * XDP core assumes that frame_size of xdp_buff and the length
> -		 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> -		 */
> -		if (!headroom)
> -			len += hole;
> -		alloc_frag->offset += hole;
> -	}
> +		page = page_pool_dev_alloc_pages(rq->page_pool);
> +		if (unlikely(!page))
> +			return -ENOMEM;
> +		buf = (char *)page_address(page);
> +		buf += headroom; /* advance address leaving hole at front of pkt */
> +	} else {
> +		if (unlikely(!skb_page_frag_refill(len + room, alloc_frag, gfp)))
> +			return -ENOMEM;
>   
> +		buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +		buf += headroom; /* advance address leaving hole at front of pkt */
> +		get_page(alloc_frag->page);
> +		alloc_frag->offset += len + room;
> +		hole = alloc_frag->size - alloc_frag->offset;
> +		if (hole < len + room) {
> +			/* To avoid internal fragmentation, if there is very likely not
> +			 * enough space for another buffer, add the remaining space to
> +			 * the current buffer.
> +			 * XDP core assumes that frame_size of xdp_buff and the length
> +			 * of the frag are PAGE_SIZE, so we disable the hole mechanism.
> +			 */
> +			if (!headroom)
> +				len += hole;
> +			alloc_frag->offset += hole;
> +		}
> +	}
>   	sg_init_one(rq->sg, buf, len);
>   	ctx = mergeable_len_to_ctx(len + room, headroom);
>   	err = virtqueue_add_inbuf_ctx(rq->vq, rq->sg, 1, buf, ctx, gfp);
>   	if (err < 0)
> -		put_page(virt_to_head_page(buf));
> +		virtnet_put_page(rq, virt_to_head_page(buf));
>   
>   	return err;
>   }
> @@ -1994,8 +2034,15 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
>   	if (err < 0)
>   		return err;
>   
> -	err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> -					 MEM_TYPE_PAGE_SHARED, NULL);
> +	if (vi->rq[qp_index].page_pool)
> +		err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> +						 MEM_TYPE_PAGE_POOL,
> +						 vi->rq[qp_index].page_pool);
> +	else
> +		err = xdp_rxq_info_reg_mem_model(&vi->rq[qp_index].xdp_rxq,
> +						 MEM_TYPE_PAGE_SHARED,
> +						 NULL);
> +
>   	if (err < 0)
>   		goto err_xdp_reg_mem_model;
>   
> @@ -2951,6 +2998,7 @@ static void virtnet_get_strings(struct net_device *dev, u32 stringset, u8 *data)
>   				ethtool_sprintf(&p, "tx_queue_%u_%s", i,
>   						virtnet_sq_stats_desc[j].desc);
>   		}
> +		page_pool_ethtool_stats_get_strings(p);
>   		break;
>   	}
>   }
> @@ -2962,12 +3010,30 @@ static int virtnet_get_sset_count(struct net_device *dev, int sset)
>   	switch (sset) {
>   	case ETH_SS_STATS:
>   		return vi->curr_queue_pairs * (VIRTNET_RQ_STATS_LEN +
> -					       VIRTNET_SQ_STATS_LEN);
> +					       VIRTNET_SQ_STATS_LEN +
> +						(page_pool_enabled && vi->mergeable_rx_bufs ?
> +						 page_pool_ethtool_stats_get_count() : 0));
>   	default:
>   		return -EOPNOTSUPP;
>   	}
>   }
>   
> +static void virtnet_get_page_pool_stats(struct net_device *dev, u64 *data)
> +{
> +#ifdef CONFIG_PAGE_POOL_STATS
> +	struct virtnet_info *vi = netdev_priv(dev);
> +	struct page_pool_stats pp_stats = {};
> +	int i;
> +
> +	for (i = 0; i < vi->curr_queue_pairs; i++) {
> +		if (!vi->rq[i].page_pool)
> +			continue;
> +		page_pool_get_stats(vi->rq[i].page_pool, &pp_stats);
> +	}
> +	page_pool_ethtool_stats_get(data, &pp_stats);
> +#endif /* CONFIG_PAGE_POOL_STATS */
> +}
> +
>   static void virtnet_get_ethtool_stats(struct net_device *dev,
>   				      struct ethtool_stats *stats, u64 *data)
>   {
> @@ -3003,6 +3069,8 @@ static void virtnet_get_ethtool_stats(struct net_device *dev,
>   		} while (u64_stats_fetch_retry(&sq->stats.syncp, start));
>   		idx += VIRTNET_SQ_STATS_LEN;
>   	}
> +
> +	virtnet_get_page_pool_stats(dev, &data[idx]);

The call virtnet_get_page_pool_stats(dev, &data[idx]) should be moved to
between the SQ and RQ stats.

Combined with the corrected page_pool_ethtool_stats_get_count(), ethtool
can then retrieve the page_pool statistics. But all of the page_pool
values are zero, which is interesting.

I will delve into this and find out why.

Zhu Yanjun

>   }
>   
>   static void virtnet_get_channels(struct net_device *dev,
> @@ -3623,6 +3691,8 @@ static void virtnet_free_queues(struct virtnet_info *vi)
>   	for (i = 0; i < vi->max_queue_pairs; i++) {
>   		__netif_napi_del(&vi->rq[i].napi);
>   		__netif_napi_del(&vi->sq[i].napi);
> +		if (vi->rq[i].page_pool)
> +			page_pool_destroy(vi->rq[i].page_pool);
>   	}
>   
>   	/* We called __netif_napi_del(),
> @@ -3679,12 +3749,19 @@ static void virtnet_rq_free_unused_buf(struct virtqueue *vq, void *buf)
>   	struct virtnet_info *vi = vq->vdev->priv;
>   	int i = vq2rxq(vq);
>   
> -	if (vi->mergeable_rx_bufs)
> -		put_page(virt_to_head_page(buf));
> -	else if (vi->big_packets)
> +	if (vi->mergeable_rx_bufs) {
> +		if (vi->rq[i].page_pool) {
> +			page_pool_put_full_page(vi->rq[i].page_pool,
> +						virt_to_head_page(buf),
> +						true);
> +		} else {
> +			put_page(virt_to_head_page(buf));
> +		}
> +	} else if (vi->big_packets) {
>   		give_pages(&vi->rq[i], buf);
> -	else
> +	} else {
>   		put_page(virt_to_head_page(buf));
> +	}
>   }
>   
>   static void free_unused_bufs(struct virtnet_info *vi)
> @@ -3718,6 +3795,26 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
>   	virtnet_free_queues(vi);
>   }
>   
> +static void virtnet_alloc_page_pool(struct receive_queue *rq)
> +{
> +	struct virtio_device *vdev = rq->vq->vdev;
> +
> +	struct page_pool_params pp_params = {
> +		.order = 0,
> +		.pool_size = rq->vq->num_max,
> +		.nid = dev_to_node(vdev->dev.parent),
> +		.dev = vdev->dev.parent,
> +		.offset = 0,
> +	};
> +
> +	rq->page_pool = page_pool_create(&pp_params);
> +	if (IS_ERR(rq->page_pool)) {
> +		dev_warn(&vdev->dev, "page pool creation failed: %ld\n",
> +			 PTR_ERR(rq->page_pool));
> +		rq->page_pool = NULL;
> +	}
> +}
> +
>   /* How large should a single buffer be so a queue full of these can fit at
>    * least one full packet?
>    * Logic below assumes the mergeable buffer header is used.
> @@ -3801,6 +3898,13 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
>   		vi->rq[i].vq = vqs[rxq2vq(i)];
>   		vi->rq[i].min_buf_len = mergeable_min_buf_len(vi, vi->rq[i].vq);
>   		vi->sq[i].vq = vqs[txq2vq(i)];
> +
> +		if (page_pool_enabled && vi->mergeable_rx_bufs)
> +			virtnet_alloc_page_pool(&vi->rq[i]);
> +		else
> +			dev_warn(&vi->vdev->dev,
> +				 "page pool only support mergeable mode\n");
> +
>   	}
>   
>   	/* run here: ret == 0. */


^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-11-30  5:30             ` Zhu Yanjun
@ 2023-12-01  1:38               ` Xuan Zhuo
  2023-12-04  5:24                 ` Zhu Yanjun
  0 siblings, 1 reply; 56+ messages in thread
From: Xuan Zhuo @ 2023-12-01  1:38 UTC (permalink / raw)
  To: Zhu Yanjun
  Cc: Liang Chen, jasowang, virtualization, netdev, linux-kernel, kuba,
	edumazet, davem, pabeni, alexander.duyck, Michael S. Tsirkin

On Thu, 30 Nov 2023 13:30:40 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote:
>
> On 2023/11/30 10:34, Xuan Zhuo wrote:
> > On Wed, 29 Nov 2023 23:29:10 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote:
> >> On 2023/11/29 23:22, Zhu Yanjun wrote:
> >>> On 2023/11/29 22:59, Michael S. Tsirkin wrote:
> >>>> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote:
> >>>>> On 2023/5/26 13:46, Liang Chen wrote:
> >>>> what made you respond to a patch from May, now?
> >>> I want to apply page_pool to our virtio_net. This virtio_net works on
> >>> our device.
> >>>
> >>> I want to verify whether page_pool on virtio_net with our device can
> >>> improve the performance or not.
> >>>
> >>> And I found that ethtool is wrong.
> >>>
> >>> I use virtio_net on our device. I found that page member variable in
> >>> rq is not used in recv path.
> >>>
> >>> When virtio_net is modprobe, I checked page member variable in rq with
> >>> kprobe or crash tool.  page member variable in rq is always NULL.
> >>>
> >>> But sg in recv path is used.
> >>>
> >>> So how to use page member variable in rq? If page member variable in
> >>> rq is always NULL, can we remove it?
> >>>
> >>> BTW, I use ping and iperf tool to make tests with virtio_net. In the
> >>> tests, page member variable in rq is always NULL.
> >>
> >> And I replaced page member variable in rq with page_pool, but the
> >> statistics of page_pool are always 0.
> >>
> >> It is interesting that page_pool member variable in rq is not used in
> >> ping and iperf tests.
> >>
> >> I am not sure what tests can make page member variable not NULL. ^_^
> > Do you mean rq->pages?
> >
> > That is for big mode.
>
> Hi, Xuan
>
> Got it. What is big mode? Do you mean big packet size? I run iperf with
> the packet size 2^23.
>
> The rq->pages is still NULL.
>
> It is interesting.

You may need to check the code of virtnet_probe().

Thanks.


>
> Zhu Yanjun
>
>
> >
> > Thanks.
> >
> >
> >> Best Regards,
> >>
> >> Zhu Yanjun
> >>
> >>
> >>> It is interesting.
> >>>
> >>> Zhu Yanjun
> >>>

^ permalink raw reply	[flat|nested] 56+ messages in thread

* Re: [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance
  2023-12-01  1:38               ` Xuan Zhuo
@ 2023-12-04  5:24                 ` Zhu Yanjun
  0 siblings, 0 replies; 56+ messages in thread
From: Zhu Yanjun @ 2023-12-04  5:24 UTC (permalink / raw)
  To: Xuan Zhuo
  Cc: Liang Chen, jasowang, virtualization, netdev, linux-kernel, kuba,
	edumazet, davem, pabeni, alexander.duyck, Michael S. Tsirkin

On 2023/12/1 9:38, Xuan Zhuo wrote:
> On Thu, 30 Nov 2023 13:30:40 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote:
>>
>> On 2023/11/30 10:34, Xuan Zhuo wrote:
>>> On Wed, 29 Nov 2023 23:29:10 +0800, Zhu Yanjun <yanjun.zhu@linux.dev> wrote:
>>>> On 2023/11/29 23:22, Zhu Yanjun wrote:
>>>>> On 2023/11/29 22:59, Michael S. Tsirkin wrote:
>>>>>> On Wed, Nov 29, 2023 at 10:50:57PM +0800, Zhu Yanjun wrote:
>>>>>>> On 2023/5/26 13:46, Liang Chen wrote:
>>>>>> what made you respond to a patch from May, now?
>>>>> I want to apply page_pool to our virtio_net. This virtio_net works on
>>>>> our device.
>>>>>
>>>>> I want to verify whether page_pool on virtio_net with our device can
>>>>> improve the performance or not.
>>>>>
>>>>> And I found that ethtool is wrong.
>>>>>
>>>>> I use virtio_net on our device. I found that page member variable in
>>>>> rq is not used in recv path.
>>>>>
>>>>> When virtio_net is modprobe, I checked page member variable in rq with
>>>>> kprobe or crash tool.  page member variable in rq is always NULL.
>>>>>
>>>>> But sg in recv path is used.
>>>>>
>>>>> So how to use page member variable in rq? If page member variable in
>>>>> rq is always NULL, can we remove it?
>>>>>
>>>>> BTW, I use ping and iperf tool to make tests with virtio_net. In the
>>>>> tests, page member variable in rq is always NULL.
>>>>
>>>> And I replaced page member variable in rq with page_pool, but the
>>>> statistics of page_pool are always 0.
>>>>
>>>> It is interesting that page_pool member variable in rq is not used in
>>>> ping and iperf tests.
>>>>
>>>> I am not sure what tests can make page member variable not NULL. ^_^
>>> Do you mean rq->pages?
>>>
>>> That is for big mode.
>>
>> Hi, Xuan
>>
>> Got it. What is big mode? Do you mean big packet size? I run iperf with
>> the packet size 2^23.
>>
>> The rq->pages is still NULL.
>>
>> It is interesting.
> 
> You may need to check the code of virtnet_probe().

Thanks a lot. From virtnet_probe, the big mode and mergeable mode selection can be found.
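
Roughly, the mode selection there looks like this (a simplified sketch of
the in-tree probe logic, not part of this patch set):

	/* big mode: whole pages chained via rq->pages; used on receive only
	 * when the device offers guest GSO without mergeable buffers
	 */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_ECN) ||
	    virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_UFO))
		vi->big_packets = true;

	/* mergeable mode: the receive path this series adds page_pool to */
	if (virtio_has_feature(vdev, VIRTIO_NET_F_MRG_RXBUF))
		vi->mergeable_rx_bufs = true;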

Zhu Yanjun

> 
> Thanks.
> 
> 
>>
>> Zhu Yanjun
>>
>>
>>>
>>> Thanks.
>>>
>>>
>>>> Best Regards,
>>>>
>>>> Zhu Yanjun
>>>>
>>>>
>>>>> It is interesting.
>>>>>
>>>>> Zhu Yanjun
>>>>>


^ permalink raw reply	[flat|nested] 56+ messages in thread

end of thread, other threads:[~2023-12-04  5:24 UTC | newest]

Thread overview: 56+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-05-26  5:46 [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Liang Chen
2023-05-26  5:46 ` [PATCH net-next 2/5] virtio_net: Add page_pool support to improve performance Liang Chen
2023-05-26  6:50   ` Jason Wang
2023-05-27 12:35     ` Liang Chen
2023-05-28  6:40       ` Michael S. Tsirkin
2023-05-29  7:28         ` Liang Chen
2023-05-31  3:10           ` Xuan Zhuo
2023-06-07  9:11             ` Liang Chen
2023-06-07  9:33               ` Xuan Zhuo
2023-05-26 16:11   ` kernel test robot
2023-05-28  6:27     ` Michael S. Tsirkin
2023-05-29  7:28       ` Liang Chen
2023-05-28  6:20   ` Michael S. Tsirkin
2023-05-29  7:27     ` Liang Chen
2023-05-29  9:55       ` Michael S. Tsirkin
2023-05-30  1:19         ` Liang Chen
2023-06-07  9:08           ` Liang Chen
2023-06-07  9:35             ` Xuan Zhuo
2023-06-07 13:58               ` Liang Chen
2023-06-07 20:17             ` Michael S. Tsirkin
2023-06-08  0:38               ` Jason Wang
2023-06-08  3:54                 ` Xuan Zhuo
2023-06-09  2:57                 ` Liang Chen
2023-07-05  5:41                   ` Liang Chen
2023-07-05  6:04                     ` Jason Wang
2023-07-06  1:01                       ` Liang Chen
2023-06-09  2:57               ` Liang Chen
2023-11-29 14:50   ` Zhu Yanjun
2023-11-29 14:59     ` Michael S. Tsirkin
2023-11-29 15:22       ` Zhu Yanjun
2023-11-29 15:29         ` Zhu Yanjun
2023-11-30  2:34           ` Xuan Zhuo
2023-11-30  5:30             ` Zhu Yanjun
2023-12-01  1:38               ` Xuan Zhuo
2023-12-04  5:24                 ` Zhu Yanjun
2023-11-30  7:17   ` Zhu Yanjun
2023-05-26  5:46 ` [PATCH net-next 3/5] virtio_net: Add page pool fragmentation support Liang Chen
2023-05-26  8:29   ` Horatiu Vultur
2023-05-27 12:36     ` Liang Chen
2023-05-26 17:44   ` kernel test robot
2023-05-28  6:25   ` Michael S. Tsirkin
2023-05-29  7:29     ` Liang Chen
2023-05-29  1:33   ` Yunsheng Lin
2023-05-29  7:30     ` Liang Chen
2023-05-26  5:46 ` [PATCH net-next 4/5] virtio_ring: Introduce DMA pre-handler Liang Chen
2023-05-26  6:57   ` Jason Wang
2023-05-26  5:46 ` [PATCH net-next 5/5] virtio_net: Implement " Liang Chen
2023-05-26  7:06   ` Jason Wang
2023-05-27 12:35     ` Liang Chen
2023-05-26 17:34   ` kernel test robot
2023-05-26  6:38 ` [PATCH net-next 1/5] virtio_net: Fix an unsafe reference to the page chain Jason Wang
2023-05-27 12:33   ` Liang Chen
2023-05-28  6:29   ` Michael S. Tsirkin
2023-05-29  7:25     ` Liang Chen
2023-05-28  6:16 ` Michael S. Tsirkin
2023-05-29  7:25   ` Liang Chen
