linux-nvme.lists.infradead.org archive mirror
* [PATCH net-next 02/12] mm: Provide a page_frag_cache allocator cleanup function
       [not found] <20230524153311.3625329-1-dhowells@redhat.com>
@ 2023-05-24 15:33 ` David Howells
  2023-05-24 15:33 ` [PATCH net-next 03/12] mm: Make the page_frag_cache allocator alignment param a pow-of-2 David Howells
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 19+ messages in thread
From: David Howells @ 2023-05-24 15:33 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Willem de Bruijn, David Ahern, Matthew Wilcox,
	Jens Axboe, linux-mm, linux-kernel, Jeroen de Borst,
	Catherine Sullivan, Shailend Chand, Felix Fietkau, John Crispin,
	Sean Wang, Mark Lee, Lorenzo Bianconi, Matthias Brugger,
	AngeloGioacchino Del Regno, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Chaitanya Kulkarni,
	Andrew Morton, linux-arm-kernel, linux-mediatek, linux-nvme

Provide a function to clean up a page_frag_cache allocator rather than
doing it manually each time.
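
For illustration, a driver's teardown path then reduces to something like the
sketch below (foo_queue and foo_free_queue are hypothetical names, not taken
from any driver in this series):

	#include <linux/gfp.h>
	#include <linux/mm_types.h>

	struct foo_queue {
		struct page_frag_cache pf_cache;
		/* ... other per-queue state ... */
	};

	static void foo_free_queue(struct foo_queue *q)
	{
		/* Replaces the open-coded virt_to_page() /
		 * __page_frag_cache_drain() / "nc->va = NULL" sequence.
		 */
		page_frag_cache_clear(&q->pf_cache);
	}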

Signed-off-by: David Howells <dhowells@redhat.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Jeroen de Borst <jeroendb@google.com>
cc: Catherine Sullivan <csully@google.com>
cc: Shailend Chand <shailend@google.com>
cc: Felix Fietkau <nbd@nbd.name>
cc: John Crispin <john@phrozen.org>
cc: Sean Wang <sean.wang@mediatek.com>
cc: Mark Lee <Mark-MC.Lee@mediatek.com>
cc: Lorenzo Bianconi <lorenzo@kernel.org>
cc: Matthias Brugger <matthias.bgg@gmail.com>
cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
cc: Keith Busch <kbusch@kernel.org>
cc: Jens Axboe <axboe@fb.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Sagi Grimberg <sagi@grimberg.me>
cc: Chaitanya Kulkarni <kch@nvidia.com>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
cc: linux-arm-kernel@lists.infradead.org
cc: linux-mediatek@lists.infradead.org
cc: linux-nvme@lists.infradead.org
cc: linux-mm@kvack.org
---
 drivers/net/ethernet/google/gve/gve_main.c | 11 ++---------
 drivers/net/ethernet/mediatek/mtk_wed_wo.c | 17 ++---------------
 drivers/nvme/host/tcp.c                    |  8 +-------
 drivers/nvme/target/tcp.c                  |  5 +----
 include/linux/gfp.h                        |  2 ++
 mm/page_frag_alloc.c                       | 17 +++++++++++++++++
 6 files changed, 25 insertions(+), 35 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 8fb70db63b8b..55feab29bed9 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1251,17 +1251,10 @@ static void gve_unreg_xdp_info(struct gve_priv *priv)
 
 static void gve_drain_page_cache(struct gve_priv *priv)
 {
-	struct page_frag_cache *nc;
 	int i;
 
-	for (i = 0; i < priv->rx_cfg.num_queues; i++) {
-		nc = &priv->rx[i].page_cache;
-		if (nc->va) {
-			__page_frag_cache_drain(virt_to_page(nc->va),
-						nc->pagecnt_bias);
-			nc->va = NULL;
-		}
-	}
+	for (i = 0; i < priv->rx_cfg.num_queues; i++)
+		page_frag_cache_clear(&priv->rx[i].page_cache);
 }
 
 static int gve_open(struct net_device *dev)
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index 69fba29055e9..d90fea2c7d04 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -286,7 +286,6 @@ mtk_wed_wo_queue_free(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 static void
 mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-	struct page *page;
 	int i;
 
 	for (i = 0; i < q->n_desc; i++) {
@@ -298,19 +297,12 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 		entry->buf = NULL;
 	}
 
-	if (!q->cache.va)
-		return;
-
-	page = virt_to_page(q->cache.va);
-	__page_frag_cache_drain(page, q->cache.pagecnt_bias);
-	memset(&q->cache, 0, sizeof(q->cache));
+	page_frag_cache_clear(&q->cache);
 }
 
 static void
 mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 {
-	struct page *page;
-
 	for (;;) {
 		void *buf = mtk_wed_wo_dequeue(wo, q, NULL, true);
 
@@ -320,12 +312,7 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 		skb_free_frag(buf);
 	}
 
-	if (!q->cache.va)
-		return;
-
-	page = virt_to_page(q->cache.va);
-	__page_frag_cache_drain(page, q->cache.pagecnt_bias);
-	memset(&q->cache, 0, sizeof(q->cache));
+	page_frag_cache_clear(&q->cache);
 }
 
 static void
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bf0230442d57..dcc35f6bff8c 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1315,7 +1315,6 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 
 static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 {
-	struct page *page;
 	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
 	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
 	unsigned int noreclaim_flag;
@@ -1326,12 +1325,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	if (queue->hdr_digest || queue->data_digest)
 		nvme_tcp_free_crypto(queue);
 
-	if (queue->pf_cache.va) {
-		page = virt_to_head_page(queue->pf_cache.va);
-		__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
-		queue->pf_cache.va = NULL;
-	}
-
+	page_frag_cache_clear(&queue->pf_cache);
 	noreclaim_flag = memalloc_noreclaim_save();
 	sock_release(queue->sock);
 	memalloc_noreclaim_restore(noreclaim_flag);
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index ed98df72c76b..984e6ce85dcd 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1464,7 +1464,6 @@ static void nvmet_tcp_free_cmd_data_in_buffers(struct nvmet_tcp_queue *queue)
 
 static void nvmet_tcp_release_queue_work(struct work_struct *w)
 {
-	struct page *page;
 	struct nvmet_tcp_queue *queue =
 		container_of(w, struct nvmet_tcp_queue, release_work);
 
@@ -1486,9 +1485,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
 	if (queue->hdr_digest || queue->data_digest)
 		nvmet_tcp_free_crypto(queue);
 	ida_free(&nvmet_tcp_queue_ida, queue->idx);
-
-	page = virt_to_head_page(queue->pf_cache.va);
-	__page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias);
+	page_frag_cache_clear(&queue->pf_cache);
 	kfree(queue);
 }
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index ed8cb537c6a7..03504beb51e4 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -314,6 +314,8 @@ static inline void *page_frag_alloc(struct page_frag_cache *nc,
 	return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
 }
 
+void page_frag_cache_clear(struct page_frag_cache *nc);
+
 extern void page_frag_free(void *addr);
 
 #define __free_page(page) __free_pages((page), 0)
diff --git a/mm/page_frag_alloc.c b/mm/page_frag_alloc.c
index bee95824ef8f..e02b81d68dc4 100644
--- a/mm/page_frag_alloc.c
+++ b/mm/page_frag_alloc.c
@@ -46,6 +46,23 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
+/**
+ * page_frag_cache_clear - Clear out a page fragment cache
+ * @nc: The cache to clear
+ *
+ * Discard any pages still cached in a page fragment cache.
+ */
+void page_frag_cache_clear(struct page_frag_cache *nc)
+{
+	if (nc->va) {
+		struct page *page = virt_to_head_page(nc->va);
+
+		__page_frag_cache_drain(page, nc->pagecnt_bias);
+		nc->va = NULL;
+	}
+}
+EXPORT_SYMBOL(page_frag_cache_clear);
+
 void *page_frag_alloc_align(struct page_frag_cache *nc,
 		      unsigned int fragsz, gfp_t gfp_mask,
 		      unsigned int align_mask)




* [PATCH net-next 03/12] mm: Make the page_frag_cache allocator alignment param a pow-of-2
       [not found] <20230524153311.3625329-1-dhowells@redhat.com>
  2023-05-24 15:33 ` [PATCH net-next 02/12] mm: Provide a page_frag_cache allocator cleanup function David Howells
@ 2023-05-24 15:33 ` David Howells
  2023-05-27 15:54   ` Alexander H Duyck
  2023-06-16 15:28   ` David Howells
  2023-05-24 15:33 ` [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios David Howells
                   ` (2 subsequent siblings)
  4 siblings, 2 replies; 19+ messages in thread
From: David Howells @ 2023-05-24 15:33 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Willem de Bruijn, David Ahern, Matthew Wilcox,
	Jens Axboe, linux-mm, linux-kernel, Jeroen de Borst,
	Catherine Sullivan, Shailend Chand, Felix Fietkau, John Crispin,
	Sean Wang, Mark Lee, Lorenzo Bianconi, Matthias Brugger,
	AngeloGioacchino Del Regno, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Chaitanya Kulkarni,
	Andrew Morton, linux-arm-kernel, linux-mediatek, linux-nvme

Make the page_frag_cache allocator's alignment parameter a power of 2
rather than a mask and give a warning if it isn't.

This means that it's consistent with {napi,netdev}_alloc_frag_align() and
allows __{napi,netdev}_alloc_frag_align() to be removed.
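
As an illustrative sketch (these call sites are hypothetical, not taken from
this series), callers now pass the power-of-2 alignment itself rather than a
negated mask:

	/* Before: the caller supplied a mask, typically -align. */
	p = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC,
				  ~(SMP_CACHE_BYTES - 1));

	/* After: pass the alignment directly; a value that isn't a power
	 * of 2 now triggers a one-shot warning inside the allocator.
	 */
	p = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, SMP_CACHE_BYTES);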

Signed-off-by: David Howells <dhowells@redhat.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Jeroen de Borst <jeroendb@google.com>
cc: Catherine Sullivan <csully@google.com>
cc: Shailend Chand <shailend@google.com>
cc: Felix Fietkau <nbd@nbd.name>
cc: John Crispin <john@phrozen.org>
cc: Sean Wang <sean.wang@mediatek.com>
cc: Mark Lee <Mark-MC.Lee@mediatek.com>
cc: Lorenzo Bianconi <lorenzo@kernel.org>
cc: Matthias Brugger <matthias.bgg@gmail.com>
cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
cc: Keith Busch <kbusch@kernel.org>
cc: Jens Axboe <axboe@fb.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Sagi Grimberg <sagi@grimberg.me>
cc: Chaitanya Kulkarni <kch@nvidia.com>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
cc: linux-arm-kernel@lists.infradead.org
cc: linux-mediatek@lists.infradead.org
cc: linux-nvme@lists.infradead.org
cc: linux-mm@kvack.org
---
 include/linux/gfp.h    |  4 ++--
 include/linux/skbuff.h | 22 ++++------------------
 mm/page_frag_alloc.c   |  8 +++++---
 net/core/skbuff.c      | 14 +++++++-------
 4 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 03504beb51e4..fa30100f46ad 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -306,12 +306,12 @@ struct page_frag_cache;
 extern void __page_frag_cache_drain(struct page *page, unsigned int count);
 extern void *page_frag_alloc_align(struct page_frag_cache *nc,
 				   unsigned int fragsz, gfp_t gfp_mask,
-				   unsigned int align_mask);
+				   unsigned int align);
 
 static inline void *page_frag_alloc(struct page_frag_cache *nc,
 			     unsigned int fragsz, gfp_t gfp_mask)
 {
-	return page_frag_alloc_align(nc, fragsz, gfp_mask, ~0u);
+	return page_frag_alloc_align(nc, fragsz, gfp_mask, 1);
 }
 
 void page_frag_cache_clear(struct page_frag_cache *nc);
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 1b2ebf6113e0..41b63e72c6c3 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3158,7 +3158,7 @@ void skb_queue_purge(struct sk_buff_head *list);
 
 unsigned int skb_rbtree_purge(struct rb_root *root);
 
-void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);
+void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align);
 
 /**
  * netdev_alloc_frag - allocate a page fragment
@@ -3169,14 +3169,7 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);
  */
 static inline void *netdev_alloc_frag(unsigned int fragsz)
 {
-	return __netdev_alloc_frag_align(fragsz, ~0u);
-}
-
-static inline void *netdev_alloc_frag_align(unsigned int fragsz,
-					    unsigned int align)
-{
-	WARN_ON_ONCE(!is_power_of_2(align));
-	return __netdev_alloc_frag_align(fragsz, -align);
+	return netdev_alloc_frag_align(fragsz, 1);
 }
 
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length,
@@ -3236,18 +3229,11 @@ static inline void skb_free_frag(void *addr)
 	page_frag_free(addr);
 }
 
-void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask);
+void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align);
 
 static inline void *napi_alloc_frag(unsigned int fragsz)
 {
-	return __napi_alloc_frag_align(fragsz, ~0u);
-}
-
-static inline void *napi_alloc_frag_align(unsigned int fragsz,
-					  unsigned int align)
-{
-	WARN_ON_ONCE(!is_power_of_2(align));
-	return __napi_alloc_frag_align(fragsz, -align);
+	return napi_alloc_frag_align(fragsz, 1);
 }
 
 struct sk_buff *__napi_alloc_skb(struct napi_struct *napi,
diff --git a/mm/page_frag_alloc.c b/mm/page_frag_alloc.c
index e02b81d68dc4..9d3f6fbd9a07 100644
--- a/mm/page_frag_alloc.c
+++ b/mm/page_frag_alloc.c
@@ -64,13 +64,15 @@ void page_frag_cache_clear(struct page_frag_cache *nc)
 EXPORT_SYMBOL(page_frag_cache_clear);
 
 void *page_frag_alloc_align(struct page_frag_cache *nc,
-		      unsigned int fragsz, gfp_t gfp_mask,
-		      unsigned int align_mask)
+			    unsigned int fragsz, gfp_t gfp_mask,
+			    unsigned int align)
 {
 	unsigned int size = PAGE_SIZE;
 	struct page *page;
 	int offset;
 
+	WARN_ON_ONCE(!is_power_of_2(align));
+
 	if (unlikely(!nc->va)) {
 refill:
 		page = __page_frag_cache_refill(nc, gfp_mask);
@@ -129,7 +131,7 @@ void *page_frag_alloc_align(struct page_frag_cache *nc,
 	}
 
 	nc->pagecnt_bias--;
-	offset &= align_mask;
+	offset &= ~(align - 1);
 	nc->offset = offset;
 
 	return nc->va + offset;
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f4a5b51aed22..cc507433b357 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -289,17 +289,17 @@ void napi_get_frags_check(struct napi_struct *napi)
 	local_bh_enable();
 }
 
-void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align)
 {
 	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
 
 	fragsz = SKB_DATA_ALIGN(fragsz);
 
-	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
+	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align);
 }
-EXPORT_SYMBOL(__napi_alloc_frag_align);
+EXPORT_SYMBOL(napi_alloc_frag_align);
 
-void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
+void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align)
 {
 	void *data;
 
@@ -307,18 +307,18 @@ void *__netdev_alloc_frag_align(unsigned int fragsz, unsigned int align_mask)
 	if (in_hardirq() || irqs_disabled()) {
 		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
 
-		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align_mask);
+		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align);
 	} else {
 		struct napi_alloc_cache *nc;
 
 		local_bh_disable();
 		nc = this_cpu_ptr(&napi_alloc_cache);
-		data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align_mask);
+		data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align);
 		local_bh_enable();
 	}
 	return data;
 }
-EXPORT_SYMBOL(__netdev_alloc_frag_align);
+EXPORT_SYMBOL(netdev_alloc_frag_align);
 
 static struct sk_buff *napi_skb_cache_get(void)
 {




* [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios
       [not found] <20230524153311.3625329-1-dhowells@redhat.com>
  2023-05-24 15:33 ` [PATCH net-next 02/12] mm: Provide a page_frag_cache allocator cleanup function David Howells
  2023-05-24 15:33 ` [PATCH net-next 03/12] mm: Make the page_frag_cache allocator alignment param a pow-of-2 David Howells
@ 2023-05-24 15:33 ` David Howells
  2023-05-26 11:56   ` Yunsheng Lin
                     ` (2 more replies)
  2023-05-24 15:33 ` [PATCH net-next 05/12] mm: Make the page_frag_cache allocator handle __GFP_ZERO itself David Howells
  2023-05-24 15:33 ` [PATCH net-next 06/12] mm: Make the page_frag_cache allocator use per-cpu David Howells
  4 siblings, 3 replies; 19+ messages in thread
From: David Howells @ 2023-05-24 15:33 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Willem de Bruijn, David Ahern, Matthew Wilcox,
	Jens Axboe, linux-mm, linux-kernel, Jeroen de Borst,
	Catherine Sullivan, Shailend Chand, Felix Fietkau, John Crispin,
	Sean Wang, Mark Lee, Lorenzo Bianconi, Matthias Brugger,
	AngeloGioacchino Del Regno, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Chaitanya Kulkarni,
	Andrew Morton, linux-arm-kernel, linux-mediatek, linux-nvme

Change the page_frag_cache allocator to use multipage folios rather than
groups of pages.  This reduces page_frag_free to just a folio_put() or
put_page().
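
For comparison, the free path before and after (taken, slightly abridged,
from the page_frag_free() hunk at the end of this patch):

	/* Before: find the head page and its order by hand. */
	void page_frag_free(void *addr)
	{
		struct page *page = virt_to_head_page(addr);

		__free_pages(page, compound_order(page));
	}

	/* After: the folio machinery deals with the compound details. */
	void page_frag_free(void *addr)
	{
		folio_put(virt_to_folio(addr));
	}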

Signed-off-by: David Howells <dhowells@redhat.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Jeroen de Borst <jeroendb@google.com>
cc: Catherine Sullivan <csully@google.com>
cc: Shailend Chand <shailend@google.com>
cc: Felix Fietkau <nbd@nbd.name>
cc: John Crispin <john@phrozen.org>
cc: Sean Wang <sean.wang@mediatek.com>
cc: Mark Lee <Mark-MC.Lee@mediatek.com>
cc: Lorenzo Bianconi <lorenzo@kernel.org>
cc: Matthias Brugger <matthias.bgg@gmail.com>
cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
cc: Keith Busch <kbusch@kernel.org>
cc: Jens Axboe <axboe@fb.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Sagi Grimberg <sagi@grimberg.me>
cc: Chaitanya Kulkarni <kch@nvidia.com>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
cc: linux-arm-kernel@lists.infradead.org
cc: linux-mediatek@lists.infradead.org
cc: linux-nvme@lists.infradead.org
cc: linux-mm@kvack.org
---
 include/linux/mm_types.h | 13 ++----
 mm/page_frag_alloc.c     | 99 +++++++++++++++++++---------------------
 2 files changed, 52 insertions(+), 60 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 306a3d1a0fa6..d7c52a5979cc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -420,18 +420,13 @@ static inline void *folio_get_private(struct folio *folio)
 }
 
 struct page_frag_cache {
-	void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
-	__u16 size;
-#else
-	__u32 offset;
-#endif
+	struct folio	*folio;
+	unsigned int	offset;
 	/* we maintain a pagecount bias, so that we dont dirty cache line
 	 * containing page->_refcount every time we allocate a fragment.
 	 */
-	unsigned int		pagecnt_bias;
-	bool pfmemalloc;
+	unsigned int	pagecnt_bias;
+	bool		pfmemalloc;
 };
 
 typedef unsigned long vm_flags_t;
diff --git a/mm/page_frag_alloc.c b/mm/page_frag_alloc.c
index 9d3f6fbd9a07..ffd68bfb677d 100644
--- a/mm/page_frag_alloc.c
+++ b/mm/page_frag_alloc.c
@@ -16,33 +16,34 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
-					     gfp_t gfp_mask)
+/*
+ * Allocate a new folio for the frag cache.
+ */
+static struct folio *page_frag_cache_refill(struct page_frag_cache *nc,
+					    gfp_t gfp_mask)
 {
-	struct page *page = NULL;
+	struct folio *folio = NULL;
 	gfp_t gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-		    __GFP_NOMEMALLOC;
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
+	gfp_mask |= __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
+	folio = folio_alloc(gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER);
 #endif
-	if (unlikely(!page))
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
+	if (unlikely(!folio))
+		folio = folio_alloc(gfp, 0);
 
-	nc->va = page ? page_address(page) : NULL;
-
-	return page;
+	if (folio)
+		nc->folio = folio;
+	return folio;
 }
 
 void __page_frag_cache_drain(struct page *page, unsigned int count)
 {
-	VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
+	struct folio *folio = page_folio(page);
+
+	VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio);
 
-	if (page_ref_sub_and_test(page, count - 1))
-		__free_pages(page, compound_order(page));
+	folio_put_refs(folio, count);
 }
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
@@ -54,11 +55,12 @@ EXPORT_SYMBOL(__page_frag_cache_drain);
  */
 void page_frag_cache_clear(struct page_frag_cache *nc)
 {
-	if (nc->va) {
-		struct page *page = virt_to_head_page(nc->va);
+	struct folio *folio = nc->folio;
 
-		__page_frag_cache_drain(page, nc->pagecnt_bias);
-		nc->va = NULL;
+	if (folio) {
+		VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio);
+		folio_put_refs(folio, nc->pagecnt_bias);
+		nc->folio = NULL;
 	}
 }
 EXPORT_SYMBOL(page_frag_cache_clear);
@@ -67,56 +69,51 @@ void *page_frag_alloc_align(struct page_frag_cache *nc,
 			    unsigned int fragsz, gfp_t gfp_mask,
 			    unsigned int align)
 {
-	unsigned int size = PAGE_SIZE;
-	struct page *page;
-	int offset;
+	struct folio *folio = nc->folio;
+	size_t offset;
 
 	WARN_ON_ONCE(!is_power_of_2(align));
 
-	if (unlikely(!nc->va)) {
+	if (unlikely(!folio)) {
 refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
-		if (!page)
+		folio = page_frag_cache_refill(nc, gfp_mask);
+		if (!folio)
 			return NULL;
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
-		page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
+		folio_ref_add(folio, PAGE_FRAG_CACHE_MAX_SIZE);
 
 		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
+		nc->pfmemalloc = folio_is_pfmemalloc(folio);
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
+		nc->offset = folio_size(folio);
 	}
 
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
-		page = virt_to_page(nc->va);
-
-		if (page_ref_count(page) != nc->pagecnt_bias)
+	offset = nc->offset;
+	if (unlikely(fragsz > offset)) {
+		/* Reuse the folio if everyone we gave it to has finished with
+		 * it.
+		 */
+		if (!folio_ref_sub_and_test(folio, nc->pagecnt_bias)) {
+			nc->folio = NULL;
 			goto refill;
+		}
+
 		if (unlikely(nc->pfmemalloc)) {
-			page_ref_sub(page, nc->pagecnt_bias - 1);
-			__free_pages(page, compound_order(page));
+			__folio_put(folio);
+			nc->folio = NULL;
 			goto refill;
 		}
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
 		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
+		folio_set_count(folio, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
+		offset = folio_size(folio);
+		if (unlikely(fragsz > offset)) {
 			/*
 			 * The caller is trying to allocate a fragment
 			 * with fragsz > PAGE_SIZE but the cache isn't big
@@ -126,15 +123,17 @@ void *page_frag_alloc_align(struct page_frag_cache *nc,
 			 * it could make memory pressure worse
 			 * so we simply return NULL here.
 			 */
+			nc->offset = offset;
 			return NULL;
 		}
 	}
 
 	nc->pagecnt_bias--;
+	offset -= fragsz;
 	offset &= ~(align - 1);
 	nc->offset = offset;
 
-	return nc->va + offset;
+	return folio_address(folio) + offset;
 }
 EXPORT_SYMBOL(page_frag_alloc_align);
 
@@ -143,8 +142,6 @@ EXPORT_SYMBOL(page_frag_alloc_align);
  */
 void page_frag_free(void *addr)
 {
-	struct page *page = virt_to_head_page(addr);
-
-	__free_pages(page, compound_order(page));
+	folio_put(virt_to_folio(addr));
 }
 EXPORT_SYMBOL(page_frag_free);




* [PATCH net-next 05/12] mm: Make the page_frag_cache allocator handle __GFP_ZERO itself
       [not found] <20230524153311.3625329-1-dhowells@redhat.com>
                   ` (2 preceding siblings ...)
  2023-05-24 15:33 ` [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios David Howells
@ 2023-05-24 15:33 ` David Howells
  2023-05-27  0:57   ` Jakub Kicinski
  2023-05-24 15:33 ` [PATCH net-next 06/12] mm: Make the page_frag_cache allocator use per-cpu David Howells
  4 siblings, 1 reply; 19+ messages in thread
From: David Howells @ 2023-05-24 15:33 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Willem de Bruijn, David Ahern, Matthew Wilcox,
	Jens Axboe, linux-mm, linux-kernel, Jeroen de Borst,
	Catherine Sullivan, Shailend Chand, Felix Fietkau, John Crispin,
	Sean Wang, Mark Lee, Lorenzo Bianconi, Matthias Brugger,
	AngeloGioacchino Del Regno, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Chaitanya Kulkarni,
	Andrew Morton, linux-arm-kernel, linux-mediatek, linux-nvme

Make the page_frag_cache allocator handle __GFP_ZERO itself rather than
passing it off to the page allocator.  There may be a mix of callers, some
specifying __GFP_ZERO and some not - and even if all specify __GFP_ZERO, we
might refurbish the page, in which case the returned memory doesn't get
cleared.

This is a potential bug in the nvme over TCP driver.
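
To make the intent concrete, here is a minimal sketch of the behaviour the
allocator takes on (simplified from the hunks below: refill, refcounting and
alignment are omitted, the cached folio is assumed to have room, and
frag_alloc_sketch() is not a real function):

	static void *frag_alloc_sketch(struct page_frag_cache *nc,
				       size_t fragsz, gfp_t gfp)
	{
		void *p;

		/* __GFP_ZERO is stripped before the backing folio is
		 * allocated (see page_frag_cache_refill() below) ...
		 */
		nc->offset -= fragsz;
		p = folio_address(nc->folio) + nc->offset;

		/* ... and each fragment is cleared here instead, so a
		 * refurbished folio can't hand back stale data.
		 */
		if (gfp & __GFP_ZERO)
			memset(p, 0, fragsz);
		return p;
	}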

Signed-off-by: David Howells <dhowells@redhat.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Jeroen de Borst <jeroendb@google.com>
cc: Catherine Sullivan <csully@google.com>
cc: Shailend Chand <shailend@google.com>
cc: Felix Fietkau <nbd@nbd.name>
cc: John Crispin <john@phrozen.org>
cc: Sean Wang <sean.wang@mediatek.com>
cc: Mark Lee <Mark-MC.Lee@mediatek.com>
cc: Lorenzo Bianconi <lorenzo@kernel.org>
cc: Matthias Brugger <matthias.bgg@gmail.com>
cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
cc: Keith Busch <kbusch@kernel.org>
cc: Jens Axboe <axboe@fb.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Sagi Grimberg <sagi@grimberg.me>
cc: Chaitanya Kulkarni <kch@nvidia.com>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
cc: linux-arm-kernel@lists.infradead.org
cc: linux-mediatek@lists.infradead.org
cc: linux-nvme@lists.infradead.org
cc: linux-mm@kvack.org
---
 mm/page_frag_alloc.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/page_frag_alloc.c b/mm/page_frag_alloc.c
index ffd68bfb677d..2b73c7f5d9a9 100644
--- a/mm/page_frag_alloc.c
+++ b/mm/page_frag_alloc.c
@@ -23,7 +23,10 @@ static struct folio *page_frag_cache_refill(struct page_frag_cache *nc,
 					    gfp_t gfp_mask)
 {
 	struct folio *folio = NULL;
-	gfp_t gfp = gfp_mask;
+	gfp_t gfp;
+
+	gfp_mask &= ~__GFP_ZERO;
+	gfp = gfp_mask;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
 	gfp_mask |= __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
@@ -71,6 +74,7 @@ void *page_frag_alloc_align(struct page_frag_cache *nc,
 {
 	struct folio *folio = nc->folio;
 	size_t offset;
+	void *p;
 
 	WARN_ON_ONCE(!is_power_of_2(align));
 
@@ -133,7 +137,10 @@ void *page_frag_alloc_align(struct page_frag_cache *nc,
 	offset &= ~(align - 1);
 	nc->offset = offset;
 
-	return folio_address(folio) + offset;
+	p = folio_address(folio) + offset;
+	if (gfp_mask & __GFP_ZERO)
+		return memset(p, 0, fragsz);
+	return p;
 }
 EXPORT_SYMBOL(page_frag_alloc_align);
 




* [PATCH net-next 06/12] mm: Make the page_frag_cache allocator use per-cpu
       [not found] <20230524153311.3625329-1-dhowells@redhat.com>
                   ` (3 preceding siblings ...)
  2023-05-24 15:33 ` [PATCH net-next 05/12] mm: Make the page_frag_cache allocator handle __GFP_ZERO itself David Howells
@ 2023-05-24 15:33 ` David Howells
  2023-05-27  1:02   ` Jakub Kicinski
  4 siblings, 1 reply; 19+ messages in thread
From: David Howells @ 2023-05-24 15:33 UTC (permalink / raw)
  To: netdev
  Cc: David Howells, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Willem de Bruijn, David Ahern, Matthew Wilcox,
	Jens Axboe, linux-mm, linux-kernel, Jeroen de Borst,
	Catherine Sullivan, Shailend Chand, Felix Fietkau, John Crispin,
	Sean Wang, Mark Lee, Lorenzo Bianconi, Matthias Brugger,
	AngeloGioacchino Del Regno, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Chaitanya Kulkarni,
	Andrew Morton, linux-arm-kernel, linux-mediatek, linux-nvme

Make the page_frag_cache allocator have a separate allocation bucket for
each cpu to avoid racing.  This means that no lock is required, other than
preempt disablement, to allocate from it, though if a softirq wants to
access it, then softirq disablement will need to be added.

Make the NVMe, mediatek and GVE drivers pass NULL to page_frag_alloc()
and use the default allocation buckets rather than defining their own.
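
As a rough sketch of the access pattern this introduces (simplified from the
hunks below: refill, refcounting and zeroing are omitted, and the example_*
names are hypothetical):

	static DEFINE_PER_CPU(struct page_frag_cache, example_frag_cache);

	static void *example_frag_alloc(struct page_frag_cache __percpu *cache,
					size_t fragsz, gfp_t gfp)
	{
		struct page_frag_cache *nc;
		void *p = NULL;

		/* NULL selects the shared default bucket, as the NVMe,
		 * mediatek and GVE callers below now do.
		 */
		if (!cache)
			cache = &example_frag_cache;

		/* get_cpu_ptr() disables preemption, so each CPU's bucket
		 * is only touched by one task at a time and no lock is
		 * needed; a softirq user would also need local_bh_disable().
		 */
		nc = get_cpu_ptr(cache);
		if (nc->folio && fragsz <= nc->offset) {
			nc->offset -= fragsz;
			p = folio_address(nc->folio) + nc->offset;
			nc->pagecnt_bias--;
		}
		put_cpu_ptr(cache);
		return p;
	}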

Signed-off-by: David Howells <dhowells@redhat.com>
cc: "David S. Miller" <davem@davemloft.net>
cc: Eric Dumazet <edumazet@google.com>
cc: Jakub Kicinski <kuba@kernel.org>
cc: Paolo Abeni <pabeni@redhat.com>
cc: Jens Axboe <axboe@kernel.dk>
cc: Jeroen de Borst <jeroendb@google.com>
cc: Catherine Sullivan <csully@google.com>
cc: Shailend Chand <shailend@google.com>
cc: Felix Fietkau <nbd@nbd.name>
cc: John Crispin <john@phrozen.org>
cc: Sean Wang <sean.wang@mediatek.com>
cc: Mark Lee <Mark-MC.Lee@mediatek.com>
cc: Lorenzo Bianconi <lorenzo@kernel.org>
cc: Matthias Brugger <matthias.bgg@gmail.com>
cc: AngeloGioacchino Del Regno <angelogioacchino.delregno@collabora.com>
cc: Keith Busch <kbusch@kernel.org>
cc: Jens Axboe <axboe@fb.com>
cc: Christoph Hellwig <hch@lst.de>
cc: Sagi Grimberg <sagi@grimberg.me>
cc: Chaitanya Kulkarni <kch@nvidia.com>
cc: Andrew Morton <akpm@linux-foundation.org>
cc: Matthew Wilcox <willy@infradead.org>
cc: netdev@vger.kernel.org
cc: linux-arm-kernel@lists.infradead.org
cc: linux-mediatek@lists.infradead.org
cc: linux-nvme@lists.infradead.org
cc: linux-mm@kvack.org
---
 drivers/net/ethernet/google/gve/gve.h      |   1 -
 drivers/net/ethernet/google/gve/gve_main.c |   9 -
 drivers/net/ethernet/google/gve/gve_rx.c   |   2 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.c |   6 +-
 drivers/net/ethernet/mediatek/mtk_wed_wo.h |   2 -
 drivers/nvme/host/tcp.c                    |  13 +-
 drivers/nvme/target/tcp.c                  |  19 +-
 include/linux/gfp.h                        |  19 +-
 mm/page_frag_alloc.c                       | 202 +++++++++++++--------
 net/core/skbuff.c                          |  32 ++--
 10 files changed, 163 insertions(+), 142 deletions(-)

diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h
index 98eb78d98e9f..87244ab911bd 100644
--- a/drivers/net/ethernet/google/gve/gve.h
+++ b/drivers/net/ethernet/google/gve/gve.h
@@ -250,7 +250,6 @@ struct gve_rx_ring {
 	struct xdp_rxq_info xdp_rxq;
 	struct xdp_rxq_info xsk_rxq;
 	struct xsk_buff_pool *xsk_pool;
-	struct page_frag_cache page_cache; /* Page cache to allocate XDP frames */
 };
 
 /* A TX desc ring entry */
diff --git a/drivers/net/ethernet/google/gve/gve_main.c b/drivers/net/ethernet/google/gve/gve_main.c
index 55feab29bed9..9f0fb986d61e 100644
--- a/drivers/net/ethernet/google/gve/gve_main.c
+++ b/drivers/net/ethernet/google/gve/gve_main.c
@@ -1249,14 +1249,6 @@ static void gve_unreg_xdp_info(struct gve_priv *priv)
 	}
 }
 
-static void gve_drain_page_cache(struct gve_priv *priv)
-{
-	int i;
-
-	for (i = 0; i < priv->rx_cfg.num_queues; i++)
-		page_frag_cache_clear(&priv->rx[i].page_cache);
-}
-
 static int gve_open(struct net_device *dev)
 {
 	struct gve_priv *priv = netdev_priv(dev);
@@ -1340,7 +1332,6 @@ static int gve_close(struct net_device *dev)
 	netif_carrier_off(dev);
 	if (gve_get_device_rings_ok(priv)) {
 		gve_turndown(priv);
-		gve_drain_page_cache(priv);
 		err = gve_destroy_rings(priv);
 		if (err)
 			goto err;
diff --git a/drivers/net/ethernet/google/gve/gve_rx.c b/drivers/net/ethernet/google/gve/gve_rx.c
index d1da7413dc4d..7ae8377c394f 100644
--- a/drivers/net/ethernet/google/gve/gve_rx.c
+++ b/drivers/net/ethernet/google/gve/gve_rx.c
@@ -634,7 +634,7 @@ static int gve_xdp_redirect(struct net_device *dev, struct gve_rx_ring *rx,
 
 	total_len = headroom + SKB_DATA_ALIGN(len) +
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	frame = page_frag_alloc(&rx->page_cache, total_len, GFP_ATOMIC);
+	frame = page_frag_alloc(NULL, total_len, GFP_ATOMIC);
 	if (!frame) {
 		u64_stats_update_begin(&rx->statss);
 		rx->xdp_alloc_fails++;
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.c b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
index d90fea2c7d04..859f34447f2f 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.c
@@ -143,7 +143,7 @@ mtk_wed_wo_queue_refill(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q,
 		dma_addr_t addr;
 		void *buf;
 
-		buf = page_frag_alloc(&q->cache, q->buf_size, GFP_ATOMIC);
+		buf = page_frag_alloc(NULL, q->buf_size, GFP_ATOMIC);
 		if (!buf)
 			break;
 
@@ -296,8 +296,6 @@ mtk_wed_wo_queue_tx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 		skb_free_frag(entry->buf);
 		entry->buf = NULL;
 	}
-
-	page_frag_cache_clear(&q->cache);
 }
 
 static void
@@ -311,8 +309,6 @@ mtk_wed_wo_queue_rx_clean(struct mtk_wed_wo *wo, struct mtk_wed_wo_queue *q)
 
 		skb_free_frag(buf);
 	}
-
-	page_frag_cache_clear(&q->cache);
 }
 
 static void
diff --git a/drivers/net/ethernet/mediatek/mtk_wed_wo.h b/drivers/net/ethernet/mediatek/mtk_wed_wo.h
index 7a1a2a28f1ac..f69bd83dc486 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed_wo.h
+++ b/drivers/net/ethernet/mediatek/mtk_wed_wo.h
@@ -211,8 +211,6 @@ struct mtk_wed_wo_queue_entry {
 struct mtk_wed_wo_queue {
 	struct mtk_wed_wo_queue_regs regs;
 
-	struct page_frag_cache cache;
-
 	struct mtk_wed_wo_queue_desc *desc;
 	dma_addr_t desc_dma;
 
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index dcc35f6bff8c..145cf6186509 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -147,8 +147,6 @@ struct nvme_tcp_queue {
 	__le32			exp_ddgst;
 	__le32			recv_ddgst;
 
-	struct page_frag_cache	pf_cache;
-
 	void (*state_change)(struct sock *);
 	void (*data_ready)(struct sock *);
 	void (*write_space)(struct sock *);
@@ -482,9 +480,8 @@ static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
 	struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	req->pdu = page_frag_alloc(&queue->pf_cache,
-		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
-		GFP_KERNEL | __GFP_ZERO);
+	req->pdu = page_frag_alloc(NULL, sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+				   GFP_KERNEL | __GFP_ZERO);
 	if (!req->pdu)
 		return -ENOMEM;
 
@@ -1303,9 +1300,8 @@ static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
 	struct nvme_tcp_request *async = &ctrl->async_req;
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 
-	async->pdu = page_frag_alloc(&queue->pf_cache,
-		sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
-		GFP_KERNEL | __GFP_ZERO);
+	async->pdu = page_frag_alloc(NULL, sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
+				     GFP_KERNEL | __GFP_ZERO);
 	if (!async->pdu)
 		return -ENOMEM;
 
@@ -1325,7 +1321,6 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	if (queue->hdr_digest || queue->data_digest)
 		nvme_tcp_free_crypto(queue);
 
-	page_frag_cache_clear(&queue->pf_cache);
 	noreclaim_flag = memalloc_noreclaim_save();
 	sock_release(queue->sock);
 	memalloc_noreclaim_restore(noreclaim_flag);
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 984e6ce85dcd..cb352f5d2bbf 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -169,8 +169,6 @@ struct nvmet_tcp_queue {
 
 	struct nvmet_tcp_cmd	connect;
 
-	struct page_frag_cache	pf_cache;
-
 	void (*data_ready)(struct sock *);
 	void (*state_change)(struct sock *);
 	void (*write_space)(struct sock *);
@@ -1338,25 +1336,25 @@ static int nvmet_tcp_alloc_cmd(struct nvmet_tcp_queue *queue,
 	c->queue = queue;
 	c->req.port = queue->port->nport;
 
-	c->cmd_pdu = page_frag_alloc(&queue->pf_cache,
-			sizeof(*c->cmd_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	c->cmd_pdu = page_frag_alloc(NULL, sizeof(*c->cmd_pdu) + hdgst,
+				     GFP_KERNEL | __GFP_ZERO);
 	if (!c->cmd_pdu)
 		return -ENOMEM;
 	c->req.cmd = &c->cmd_pdu->cmd;
 
-	c->rsp_pdu = page_frag_alloc(&queue->pf_cache,
-			sizeof(*c->rsp_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	c->rsp_pdu = page_frag_alloc(NULL, sizeof(*c->rsp_pdu) + hdgst,
+				     GFP_KERNEL | __GFP_ZERO);
 	if (!c->rsp_pdu)
 		goto out_free_cmd;
 	c->req.cqe = &c->rsp_pdu->cqe;
 
-	c->data_pdu = page_frag_alloc(&queue->pf_cache,
-			sizeof(*c->data_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	c->data_pdu = page_frag_alloc(NULL, sizeof(*c->data_pdu) + hdgst,
+				      GFP_KERNEL | __GFP_ZERO);
 	if (!c->data_pdu)
 		goto out_free_rsp;
 
-	c->r2t_pdu = page_frag_alloc(&queue->pf_cache,
-			sizeof(*c->r2t_pdu) + hdgst, GFP_KERNEL | __GFP_ZERO);
+	c->r2t_pdu = page_frag_alloc(NULL, sizeof(*c->r2t_pdu) + hdgst,
+				     GFP_KERNEL | __GFP_ZERO);
 	if (!c->r2t_pdu)
 		goto out_free_data;
 
@@ -1485,7 +1483,6 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
 	if (queue->hdr_digest || queue->data_digest)
 		nvmet_tcp_free_crypto(queue);
 	ida_free(&nvmet_tcp_queue_ida, queue->idx);
-	page_frag_cache_clear(&queue->pf_cache);
 	kfree(queue);
 }
 
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index fa30100f46ad..baa25a00d9e3 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -304,18 +304,19 @@ extern void free_pages(unsigned long addr, unsigned int order);
 
 struct page_frag_cache;
 extern void __page_frag_cache_drain(struct page *page, unsigned int count);
-extern void *page_frag_alloc_align(struct page_frag_cache *nc,
-				   unsigned int fragsz, gfp_t gfp_mask,
-				   unsigned int align);
-
-static inline void *page_frag_alloc(struct page_frag_cache *nc,
-			     unsigned int fragsz, gfp_t gfp_mask)
+extern void *page_frag_alloc_align(struct page_frag_cache __percpu *frag_cache,
+				   size_t fragsz, gfp_t gfp,
+				   unsigned long align);
+extern void *page_frag_memdup(struct page_frag_cache __percpu *frag_cache,
+			      const void *p, size_t fragsz, gfp_t gfp,
+			      unsigned long align);
+
+static inline void *page_frag_alloc(struct page_frag_cache __percpu *frag_cache,
+				    size_t fragsz, gfp_t gfp)
 {
-	return page_frag_alloc_align(nc, fragsz, gfp_mask, 1);
+	return page_frag_alloc_align(frag_cache, fragsz, gfp, 1);
 }
 
-void page_frag_cache_clear(struct page_frag_cache *nc);
-
 extern void page_frag_free(void *addr);
 
 #define __free_page(page) __free_pages((page), 0)
diff --git a/mm/page_frag_alloc.c b/mm/page_frag_alloc.c
index 2b73c7f5d9a9..b035bbb34fac 100644
--- a/mm/page_frag_alloc.c
+++ b/mm/page_frag_alloc.c
@@ -16,28 +16,25 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 
+static DEFINE_PER_CPU(struct page_frag_cache, page_frag_default_allocator);
+
 /*
  * Allocate a new folio for the frag cache.
  */
-static struct folio *page_frag_cache_refill(struct page_frag_cache *nc,
-					    gfp_t gfp_mask)
+static struct folio *page_frag_cache_refill(gfp_t gfp)
 {
-	struct folio *folio = NULL;
-	gfp_t gfp;
+	struct folio *folio;
 
-	gfp_mask &= ~__GFP_ZERO;
-	gfp = gfp_mask;
+	gfp &= ~__GFP_ZERO;
 
 #if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	gfp_mask |= __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC;
-	folio = folio_alloc(gfp_mask, PAGE_FRAG_CACHE_MAX_ORDER);
+	folio = folio_alloc(gfp | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC,
+			    PAGE_FRAG_CACHE_MAX_ORDER);
+	if (folio)
+		return folio;
 #endif
-	if (unlikely(!folio))
-		folio = folio_alloc(gfp, 0);
 
-	if (folio)
-		nc->folio = folio;
-	return folio;
+	return folio_alloc(gfp, 0);
 }
 
 void __page_frag_cache_drain(struct page *page, unsigned int count)
@@ -51,63 +48,70 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
 /**
- * page_frag_cache_clear - Clear out a page fragment cache
- * @nc: The cache to clear
+ * page_frag_alloc_align - Allocate some memory for use in zerocopy
+ * @frag_cache: The frag cache to use (or NULL for the default)
+ * @fragsz: The size of the fragment desired
+ * @gfp: Allocation flags under which to make an allocation
+ * @align: The required alignment
+ *
+ * Allocate some memory for use with zerocopy where protocol bits have to be
+ * mixed in with spliced/zerocopied data.  Unlike memory allocated from the
+ * slab, this memory's lifetime is purely dependent on the folio's refcount.
+ *
+ * The way it works is that a folio is allocated and fragments are broken off
+ * sequentially and returned to the caller with a ref until the folio no longer
+ * has enough spare space - at which point the allocator's ref is dropped and a
+ * new folio is allocated.  The folio remains in existence until the last ref
+ * held by, say, an sk_buff is discarded and then the page is returned to the
+ * page allocator.
  *
- * Discard any pages still cached in a page fragment cache.
+ * Returns a pointer to the memory on success and -ENOMEM on allocation
+ * failure.
+ *
+ * The allocated memory should be disposed of with folio_put().
  */
-void page_frag_cache_clear(struct page_frag_cache *nc)
-{
-	struct folio *folio = nc->folio;
-
-	if (folio) {
-		VM_BUG_ON_FOLIO(folio_ref_count(folio) == 0, folio);
-		folio_put_refs(folio, nc->pagecnt_bias);
-		nc->folio = NULL;
-	}
-}
-EXPORT_SYMBOL(page_frag_cache_clear);
-
-void *page_frag_alloc_align(struct page_frag_cache *nc,
-			    unsigned int fragsz, gfp_t gfp_mask,
-			    unsigned int align)
+void *page_frag_alloc_align(struct page_frag_cache __percpu *frag_cache,
+			    size_t fragsz, gfp_t gfp, unsigned long align)
 {
-	struct folio *folio = nc->folio;
+	struct page_frag_cache *nc;
+	struct folio *folio, *spare = NULL;
 	size_t offset;
 	void *p;
 
 	WARN_ON_ONCE(!is_power_of_2(align));
 
-	if (unlikely(!folio)) {
-refill:
-		folio = page_frag_cache_refill(nc, gfp_mask);
-		if (!folio)
-			return NULL;
-
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		folio_ref_add(folio, PAGE_FRAG_CACHE_MAX_SIZE);
+	if (!frag_cache)
+		frag_cache = &page_frag_default_allocator;
+	if (WARN_ON_ONCE(fragsz == 0))
+		fragsz = 1;
 
-		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = folio_is_pfmemalloc(folio);
-		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = folio_size(folio);
+	nc = get_cpu_ptr(frag_cache);
+reload:
+	folio = nc->folio;
+	offset = nc->offset;
+try_again:
+
+	/* Make the allocation if there's sufficient space. */
+	if (fragsz <= offset) {
+		nc->pagecnt_bias--;
+		offset = (offset - fragsz) & ~(align - 1);
+		nc->offset = offset;
+		p = folio_address(folio) + offset;
+		put_cpu_ptr(frag_cache);
+		if (spare)
+			folio_put(spare);
+		if (gfp & __GFP_ZERO)
+			return memset(p, 0, fragsz);
+		return p;
 	}
 
-	offset = nc->offset;
-	if (unlikely(fragsz > offset)) {
-		/* Reuse the folio if everyone we gave it to has finished with
-		 * it.
-		 */
-		if (!folio_ref_sub_and_test(folio, nc->pagecnt_bias)) {
-			nc->folio = NULL;
+	/* Insufficient space - see if we can refurbish the current folio. */
+	if (folio) {
+		if (!folio_ref_sub_and_test(folio, nc->pagecnt_bias))
 			goto refill;
-		}
 
 		if (unlikely(nc->pfmemalloc)) {
 			__folio_put(folio);
-			nc->folio = NULL;
 			goto refill;
 		}
 
@@ -117,30 +121,56 @@ void *page_frag_alloc_align(struct page_frag_cache *nc,
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
 		offset = folio_size(folio);
-		if (unlikely(fragsz > offset)) {
-			/*
-			 * The caller is trying to allocate a fragment
-			 * with fragsz > PAGE_SIZE but the cache isn't big
-			 * enough to satisfy the request, this may
-			 * happen in low memory conditions.
-			 * We don't release the cache page because
-			 * it could make memory pressure worse
-			 * so we simply return NULL here.
-			 */
-			nc->offset = offset;
+		if (unlikely(fragsz > offset))
+			goto frag_too_big;
+		goto try_again;
+	}
+
+refill:
+	if (!spare) {
+		nc->folio = NULL;
+		put_cpu_ptr(frag_cache);
+
+		spare = page_frag_cache_refill(gfp);
+		if (!spare)
 			return NULL;
-		}
+
+		nc = get_cpu_ptr(frag_cache);
+		/* We may now be on a different cpu and/or someone else may
+		 * have refilled it
+		 */
+		nc->pfmemalloc = folio_is_pfmemalloc(spare);
+		if (nc->folio)
+			goto reload;
 	}
 
-	nc->pagecnt_bias--;
-	offset -= fragsz;
-	offset &= ~(align - 1);
+	nc->folio = spare;
+	folio = spare;
+	spare = NULL;
+
+	/* Even if we own the page, we do not use atomic_set().  This would
+	 * break get_page_unless_zero() users.
+	 */
+	folio_ref_add(folio, PAGE_FRAG_CACHE_MAX_SIZE);
+
+	/* Reset page count bias and offset to start of new frag */
+	nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
+	offset = folio_size(folio);
+	goto try_again;
+
+frag_too_big:
+	/*
+	 * The caller is trying to allocate a fragment with fragsz > PAGE_SIZE
+	 * but the cache isn't big enough to satisfy the request, this may
+	 * happen in low memory conditions.  We don't release the cache page
+	 * because it could make memory pressure worse so we simply return NULL
+	 * here.
+	 */
 	nc->offset = offset;
-
-	p = folio_address(folio) + offset;
-	if (gfp_mask & __GFP_ZERO)
-		return memset(p, 0, fragsz);
-	return p;
+	put_cpu_ptr(frag_cache);
+	if (spare)
+		folio_put(spare);
+	return NULL;
 }
 EXPORT_SYMBOL(page_frag_alloc_align);
 
@@ -152,3 +182,25 @@ void page_frag_free(void *addr)
 	folio_put(virt_to_folio(addr));
 }
 EXPORT_SYMBOL(page_frag_free);
+
+/**
+ * page_frag_memdup - Allocate a page fragment and duplicate some data into it
+ * @frag_cache: The frag cache to use (or NULL for the default)
+ * @fragsz: The amount of memory to copy (maximum 1/2 page).
+ * @p: The source data to copy
+ * @gfp: Allocation flags under which to make an allocation
+ * @align_mask: The required alignment
+ */
+void *page_frag_memdup(struct page_frag_cache __percpu *frag_cache,
+		       const void *p, size_t fragsz, gfp_t gfp,
+		       unsigned long align_mask)
+{
+	void *q;
+
+	q = page_frag_alloc_align(frag_cache, fragsz, gfp, align_mask);
+	if (!q)
+		return q;
+
+	return memcpy(q, p, fragsz);
+}
+EXPORT_SYMBOL(page_frag_memdup);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index cc507433b357..225a16f3713f 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -263,13 +263,13 @@ static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask)
 #endif
 
 struct napi_alloc_cache {
-	struct page_frag_cache page;
 	struct page_frag_1k page_small;
 	unsigned int skb_count;
 	void *skb_cache[NAPI_SKB_CACHE_SIZE];
 };
 
 static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
+static DEFINE_PER_CPU(struct page_frag_cache, napi_frag_cache);
 static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
 
 /* Double check that napi_get_frags() allocates skbs with
@@ -291,11 +291,9 @@ void napi_get_frags_check(struct napi_struct *napi)
 
 void *napi_alloc_frag_align(unsigned int fragsz, unsigned int align)
 {
-	struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
-
 	fragsz = SKB_DATA_ALIGN(fragsz);
 
-	return page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align);
+	return page_frag_alloc_align(&napi_frag_cache, fragsz, GFP_ATOMIC, align);
 }
 EXPORT_SYMBOL(napi_alloc_frag_align);
 
@@ -305,15 +303,12 @@ void *netdev_alloc_frag_align(unsigned int fragsz, unsigned int align)
 
 	fragsz = SKB_DATA_ALIGN(fragsz);
 	if (in_hardirq() || irqs_disabled()) {
-		struct page_frag_cache *nc = this_cpu_ptr(&netdev_alloc_cache);
-
-		data = page_frag_alloc_align(nc, fragsz, GFP_ATOMIC, align);
+		data = page_frag_alloc_align(&netdev_alloc_cache,
+					     fragsz, GFP_ATOMIC, align);
 	} else {
-		struct napi_alloc_cache *nc;
-
 		local_bh_disable();
-		nc = this_cpu_ptr(&napi_alloc_cache);
-		data = page_frag_alloc_align(&nc->page, fragsz, GFP_ATOMIC, align);
+		data = page_frag_alloc_align(&napi_frag_cache,
+					     fragsz, GFP_ATOMIC, align);
 		local_bh_enable();
 	}
 	return data;
@@ -691,7 +686,6 @@ EXPORT_SYMBOL(__alloc_skb);
 struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 				   gfp_t gfp_mask)
 {
-	struct page_frag_cache *nc;
 	struct sk_buff *skb;
 	bool pfmemalloc;
 	void *data;
@@ -716,14 +710,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 		gfp_mask |= __GFP_MEMALLOC;
 
 	if (in_hardirq() || irqs_disabled()) {
-		nc = this_cpu_ptr(&netdev_alloc_cache);
-		data = page_frag_alloc(nc, len, gfp_mask);
-		pfmemalloc = nc->pfmemalloc;
+		data = page_frag_alloc(&netdev_alloc_cache, len, gfp_mask);
+		pfmemalloc = folio_is_pfmemalloc(virt_to_folio(data));
 	} else {
 		local_bh_disable();
-		nc = this_cpu_ptr(&napi_alloc_cache.page);
-		data = page_frag_alloc(nc, len, gfp_mask);
-		pfmemalloc = nc->pfmemalloc;
+		data = page_frag_alloc(&napi_frag_cache, len, gfp_mask);
+		pfmemalloc = folio_is_pfmemalloc(virt_to_folio(data));
 		local_bh_enable();
 	}
 
@@ -811,8 +803,8 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	} else {
 		len = SKB_HEAD_ALIGN(len);
 
-		data = page_frag_alloc(&nc->page, len, gfp_mask);
-		pfmemalloc = nc->page.pfmemalloc;
+		data = page_frag_alloc(&napi_frag_cache, len, gfp_mask);
+		pfmemalloc = folio_is_pfmemalloc(virt_to_folio(data));
 	}
 
 	if (unlikely(!data))




* Re: [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios
  2023-05-24 15:33 ` [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios David Howells
@ 2023-05-26 11:56   ` Yunsheng Lin
  2023-05-27 15:47     ` Alexander H Duyck
  2023-06-06  8:25     ` David Howells
  2023-05-26 12:47   ` David Howells
  2023-05-27  0:50   ` Jakub Kicinski
  2 siblings, 2 replies; 19+ messages in thread
From: Yunsheng Lin @ 2023-05-26 11:56 UTC (permalink / raw)
  To: David Howells, netdev
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Willem de Bruijn, David Ahern, Matthew Wilcox, Jens Axboe,
	linux-mm, linux-kernel, Jeroen de Borst, Catherine Sullivan,
	Shailend Chand, Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme

On 2023/5/24 23:33, David Howells wrote:
> Change the page_frag_cache allocator to use multipage folios rather than
> groups of pages.  This reduces page_frag_free to just a folio_put() or
> put_page().

Hi, David

put_page() is not used in this patch, perhaps remove it to avoid
the confusion?
Also, is there any significant difference between __free_pages()
and folio_put()? IOW, what does the 'reduces' part mean here?

I followed some discussion about folios before, but have not really
understood the real difference between 'multipage folios' and
'groups of pages' yet. Is a folio mostly used to avoid the confusion
about whether a page is the 'head page of a compound page', a 'base page'
or a 'tail page of a compound page'? Or is there any obvious benefit to
folios that I missed?

> 
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 306a3d1a0fa6..d7c52a5979cc 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -420,18 +420,13 @@ static inline void *folio_get_private(struct folio *folio)
>  }
>  
>  struct page_frag_cache {
> -	void * va;
> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> -	__u16 offset;
> -	__u16 size;
> -#else
> -	__u32 offset;
> -#endif
> +	struct folio	*folio;
> +	unsigned int	offset;
>  	/* we maintain a pagecount bias, so that we dont dirty cache line
>  	 * containing page->_refcount every time we allocate a fragment.
>  	 */
> -	unsigned int		pagecnt_bias;
> -	bool pfmemalloc;
> +	unsigned int	pagecnt_bias;
> +	bool		pfmemalloc;
>  };

It seems that, before this patch, the 'va' and 'size' fields were used to
avoid touching 'struct page', so as to avoid possible cache-line bouncing
when more fragments can be allocated from the page while other fragments
are freed at the same time? It might be worth calling that out in the
commit log or splitting this into another patch to make it clearer and
easier to review.



* Re: [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios
  2023-05-24 15:33 ` [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios David Howells
  2023-05-26 11:56   ` Yunsheng Lin
@ 2023-05-26 12:47   ` David Howells
  2023-05-26 14:06     ` Mika Penttilä
  2023-05-27  0:50   ` Jakub Kicinski
  2 siblings, 1 reply; 19+ messages in thread
From: David Howells @ 2023-05-26 12:47 UTC (permalink / raw)
  To: Yunsheng Lin, Matthew Wilcox
  Cc: David Howells, netdev, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Jens Axboe, linux-mm, linux-kernel, Jeroen de Borst,
	Catherine Sullivan, Shailend Chand, Felix Fietkau, John Crispin,
	Sean Wang, Mark Lee, Lorenzo Bianconi, Matthias Brugger,
	AngeloGioacchino Del Regno, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Chaitanya Kulkarni,
	Andrew Morton, linux-arm-kernel, linux-mediatek, linux-nvme

Yunsheng Lin <linyunsheng@huawei.com> wrote:

> > Change the page_frag_cache allocator to use multipage folios rather than
> > groups of pages.  This reduces page_frag_free to just a folio_put() or
> > put_page().
> 
> put_page() is not used in this patch, perhaps remove it to avoid
> the confusion?

Will do if I need to respin the patches.

> Also, is there any significant difference between __free_pages()
> and folio_put()? IOW, what does the 'reduces' part mean here?

I meant that the folio code handles page compounding for us and we don't need
to work out how big the page is for ourselves.

If you look at __free_pages(), you can see a PageHead() call.  folio_put()
doesn't need that.

> I followed some discussion about folios before, but have not really
> understood the real difference between 'multipage folios' and
> 'groups of pages' yet. Is a folio mostly used to avoid the confusion
> about whether a page is the 'head page of a compound page', a 'base page'
> or a 'tail page of a compound page'? Or is there any obvious benefit to
> folios that I missed?

There is a benefit: a folio pointer always points to the head page and so we
never need to do "is this compound? where's the head?" logic to find it.  When
going from a page pointer, we still have to find the head.

Ultimately, the aim is to reduce struct page to a typed pointer to massively
reduce the amount of space consumed by mem_map[].  A page struct will then
point at a folio or a slab struct or one of a number of different types.  But
to get to that point, we have to stop a whole lot of things from using page
structs, but rather use some other type, such as folio.

Eventually, there won't be a need for head pages and tail pages per se - just
memory objects of different sizes.

> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 306a3d1a0fa6..d7c52a5979cc 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -420,18 +420,13 @@ static inline void *folio_get_private(struct folio *folio)
> >  }
> >  
> >  struct page_frag_cache {
> > -	void * va;
> > -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > -	__u16 offset;
> > -	__u16 size;
> > -#else
> > -	__u32 offset;
> > -#endif
> > +	struct folio	*folio;
> > +	unsigned int	offset;
> >  	/* we maintain a pagecount bias, so that we dont dirty cache line
> >  	 * containing page->_refcount every time we allocate a fragment.
> >  	 */
> > -	unsigned int		pagecnt_bias;
> > -	bool pfmemalloc;
> > +	unsigned int	pagecnt_bias;
> > +	bool		pfmemalloc;
> >  };
> 
> It seems that, before this patch, the 'va' and 'size' fields were used to
> avoid touching 'struct page', so as to avoid possible cache-line bouncing
> when more fragments can be allocated from the page while other fragments
> are freed at the same time?

Hmmm... fair point, though va is calculated from the page pointer on most
arches without the need to dereference struct page (only arc, m68k and sparc
define WANT_PAGE_VIRTUAL).
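
In the common !WANT_PAGE_VIRTUAL case that boils down to roughly (paraphrased
sketch):

	#define page_to_virt(x)	__va(PFN_PHYS(page_to_pfn(x)))

	static __always_inline void *lowmem_page_address(const struct page *page)
	{
		return page_to_virt(page);	/* arithmetic only, no dereference */
	}

With WANT_PAGE_VIRTUAL, page_address() reads page->virtual instead, which does
touch struct page.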

David



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios
  2023-05-26 12:47   ` David Howells
@ 2023-05-26 14:06     ` Mika Penttilä
  0 siblings, 0 replies; 19+ messages in thread
From: Mika Penttilä @ 2023-05-26 14:06 UTC (permalink / raw)
  To: David Howells, Yunsheng Lin, Matthew Wilcox
  Cc: netdev, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Willem de Bruijn, David Ahern, Jens Axboe, linux-mm,
	linux-kernel, Jeroen de Borst, Catherine Sullivan,
	Shailend Chand, Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme

Hi,

On 26.5.2023 15.47, David Howells wrote:
> Yunsheng Lin <linyunsheng@huawei.com> wrote:
> 
>>> Change the page_frag_cache allocator to use multipage folios rather than
>>> groups of pages.  This reduces page_frag_free to just a folio_put() or
>>> put_page().
>>
>> put_page() is not used in this patch, perhaps remove it to avoid
>> the confusion?
> 
> Will do if I need to respin the patches.
> 
>> Also, is there any significant difference between __free_pages()
>> and folio_put()? IOW, what does the 'reduces' part mean here?
> 
> I meant that the folio code handles page compounding for us and we don't need
> to work out how big the page is for ourselves.
> 
> If you look at __free_pages(), you can see a PageHead() call.  folio_put()
> doesn't need that.
> 
>> I followed some discussion about folio before, but have not really
>> understood the real difference between 'multipage folios' and
>> 'groups of pages' yet. Is folio mostly used to avoid the confusion
>> about whether a page is 'headpage of compound page', 'base page' or
>> 'tailpage of compound page'? Or is there any obvious benefit of
>> folio that I missed?
> 
> There is a benefit: a folio pointer always points to the head page and so we
> never need to do "is this compound? where's the head?" logic to find it.  When
> going from a page pointer, we still have to find the head.
> 


But page_frag_free() uses folio_put(virt_to_folio(addr)) and 
virt_to_folio() depends on the compound infrastructure to get the head 
page and folio.
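
Roughly, paraphrasing the chain (sketch):

	static inline struct folio *virt_to_folio(const void *x)
	{
		struct page *page = virt_to_page(x);

		return page_folio(page);	/* -> _compound_head(page) */
	}

So the tail-to-head lookup still happens on each free; it is just hidden
inside page_folio().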


> Ultimately, the aim is to reduce struct page to a typed pointer to massively
> reduce the amount of space consumed by mem_map[].  A page struct will then
> point at a folio or a slab struct or one of a number of different types.  But
> to get to that point, we have to stop a whole lot of things from using page
> structs and have them use some other type, such as folio, instead.
> 
> Eventually, there won't be a need for head pages and tail pages per se - just
> memory objects of different sizes.
> 
>>> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
>>> index 306a3d1a0fa6..d7c52a5979cc 100644
>>> --- a/include/linux/mm_types.h
>>> +++ b/include/linux/mm_types.h
>>> @@ -420,18 +420,13 @@ static inline void *folio_get_private(struct folio *folio)
>>>   }
>>>   
>>>   struct page_frag_cache {
>>> -	void * va;
>>> -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
>>> -	__u16 offset;
>>> -	__u16 size;
>>> -#else
>>> -	__u32 offset;
>>> -#endif
>>> +	struct folio	*folio;
>>> +	unsigned int	offset;
>>>   	/* we maintain a pagecount bias, so that we dont dirty cache line
>>>   	 * containing page->_refcount every time we allocate a fragment.
>>>   	 */
>>> -	unsigned int		pagecnt_bias;
>>> -	bool pfmemalloc;
>>> +	unsigned int	pagecnt_bias;
>>> +	bool		pfmemalloc;
>>>   };
>>
>> It seems the 'va' and 'size' fields are used to avoid touching 'struct page',
>> to avoid possible cache bouncing when more frags can be allocated
>> from the page while other frags are freed at the same time, before this patch?
> 
> Hmmm... fair point, though va is calculated from the page pointer on most
> arches without the need to dereference struct page (only arc, m68k and sparc
> define WANT_PAGE_VIRTUAL).
> 
> David
> 

--Mika



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios
  2023-05-24 15:33 ` [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios David Howells
  2023-05-26 11:56   ` Yunsheng Lin
  2023-05-26 12:47   ` David Howells
@ 2023-05-27  0:50   ` Jakub Kicinski
  2 siblings, 0 replies; 19+ messages in thread
From: Jakub Kicinski @ 2023-05-27  0:50 UTC (permalink / raw)
  To: David Howells
  Cc: netdev, David S. Miller, Eric Dumazet, Paolo Abeni,
	Willem de Bruijn, David Ahern, Matthew Wilcox, Jens Axboe,
	linux-mm, linux-kernel, Jeroen de Borst, Catherine Sullivan,
	Shailend Chand, Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme

On Wed, 24 May 2023 16:33:03 +0100 David Howells wrote:
> -	offset = nc->offset - fragsz;
> -	if (unlikely(offset < 0)) {
> -		page = virt_to_page(nc->va);
> -
> -		if (page_ref_count(page) != nc->pagecnt_bias)
> +	offset = nc->offset;
> +	if (unlikely(fragsz > offset)) {
> +		/* Reuse the folio if everyone we gave it to has finished with
> +		 * it.
> +		 */
> +		if (!folio_ref_sub_and_test(folio, nc->pagecnt_bias)) {
> +			nc->folio = NULL;
>  			goto refill;
> +		}
> +
>  		if (unlikely(nc->pfmemalloc)) {
> -			page_ref_sub(page, nc->pagecnt_bias - 1);
> -			__free_pages(page, compound_order(page));
> +			__folio_put(folio);

This is not a pure 1:1 page -> folio conversion.
Why mix conversion with other code changes?


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 05/12] mm: Make the page_frag_cache allocator handle __GFP_ZERO itself
  2023-05-24 15:33 ` [PATCH net-next 05/12] mm: Make the page_frag_cache allocator handle __GFP_ZERO itself David Howells
@ 2023-05-27  0:57   ` Jakub Kicinski
  2023-05-27 15:54     ` Alexander Duyck
  0 siblings, 1 reply; 19+ messages in thread
From: Jakub Kicinski @ 2023-05-27  0:57 UTC (permalink / raw)
  To: David Howells
  Cc: netdev, David S. Miller, Eric Dumazet, Paolo Abeni,
	Willem de Bruijn, David Ahern, Matthew Wilcox, Jens Axboe,
	linux-mm, linux-kernel, Jeroen de Borst, Catherine Sullivan,
	Shailend Chand, Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme, Alexander Duyck

On Wed, 24 May 2023 16:33:04 +0100 David Howells wrote:
> Make the page_frag_cache allocator handle __GFP_ZERO itself rather than
> passing it off to the page allocator.  There may be a mix of callers, some
> specifying __GFP_ZERO and some not - and even if all specify __GFP_ZERO, we
> might refurbish the page, in which case the returned memory doesn't get
> cleared.

I think it's pretty clear that the page frag allocator was never supposed
to support GFP_ZERO, as we don't need it in networking.  So maybe
you're better off adding the memset() in nvme?
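
Something like this at the call site, roughly (just a sketch; the field name,
length and gfp flags are illustrative):

	buf = page_frag_alloc(&queue->pf_cache, len, GFP_KERNEL);
	if (buf)
		memset(buf, 0, len);	/* clear here instead of passing __GFP_ZERO */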

CCing Alex, who I think would say something along those lines :)
IDK how much we still care given that most networking drivers are
migrating to page_pool these days.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 06/12] mm: Make the page_frag_cache allocator use per-cpu
  2023-05-24 15:33 ` [PATCH net-next 06/12] mm: Make the page_frag_cache allocator use per-cpu David Howells
@ 2023-05-27  1:02   ` Jakub Kicinski
  0 siblings, 0 replies; 19+ messages in thread
From: Jakub Kicinski @ 2023-05-27  1:02 UTC (permalink / raw)
  To: David Howells
  Cc: netdev, David S. Miller, Eric Dumazet, Paolo Abeni,
	Willem de Bruijn, David Ahern, Matthew Wilcox, Jens Axboe,
	linux-mm, linux-kernel, Jeroen de Borst, Catherine Sullivan,
	Shailend Chand, Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme

On Wed, 24 May 2023 16:33:05 +0100 David Howells wrote:
> though if a softirq wants to access it, then softirq disablement will
> need to be added.

Pretty sure GVE uses their allocator from softirq.
So this doesn't work, right?
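
(If softirq users did have to be served, the per-cpu access would presumably
need to be wrapped in BH disablement, roughly like the sketch below; the
per-cpu cache name here is made up.)

	struct page_frag_cache *nc;
	void *p;

	local_bh_disable();
	nc = this_cpu_ptr(&pcpu_page_frag_cache);	/* hypothetical per-cpu cache */
	p = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
	local_bh_enable();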


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios
  2023-05-26 11:56   ` Yunsheng Lin
@ 2023-05-27 15:47     ` Alexander H Duyck
  2023-06-06  8:25     ` David Howells
  1 sibling, 0 replies; 19+ messages in thread
From: Alexander H Duyck @ 2023-05-27 15:47 UTC (permalink / raw)
  To: Yunsheng Lin, David Howells, netdev
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Willem de Bruijn, David Ahern, Matthew Wilcox, Jens Axboe,
	linux-mm, linux-kernel, Jeroen de Borst, Catherine Sullivan,
	Shailend Chand, Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme

On Fri, 2023-05-26 at 19:56 +0800, Yunsheng Lin wrote:
> On 2023/5/24 23:33, David Howells wrote:
> > Change the page_frag_cache allocator to use multipage folios rather than
> > groups of pages.  This reduces page_frag_free to just a folio_put() or
> > put_page().
> 
> Hi, David
> 
> put_page() is not used in this patch, perhaps remove it to avoid
> the confusion?
> Also, is there any significant difference between __free_pages()
> and folio_put()? IOW, what does the 'reduces' part mean here?
> 
> I followed some discussion about folio before, but have not really
> understood the real difference between 'multipage folios' and
> 'groups of pages' yet. Is folio mostly used to avoid the confusion
> about whether a page is 'headpage of compound page', 'base page' or
> 'tailpage of compound page'? Or is there any obvious benefit of
> folio that I missed?
> 
> > 
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 306a3d1a0fa6..d7c52a5979cc 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -420,18 +420,13 @@ static inline void *folio_get_private(struct folio *folio)
> >  }
> >  
> >  struct page_frag_cache {
> > -	void * va;
> > -#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
> > -	__u16 offset;
> > -	__u16 size;
> > -#else
> > -	__u32 offset;
> > -#endif
> > +	struct folio	*folio;
> > +	unsigned int	offset;
> >  	/* we maintain a pagecount bias, so that we dont dirty cache line
> >  	 * containing page->_refcount every time we allocate a fragment.
> >  	 */
> > -	unsigned int		pagecnt_bias;
> > -	bool pfmemalloc;
> > +	unsigned int	pagecnt_bias;
> > +	bool		pfmemalloc;
> >  };
> 
> It seems the 'va' and 'size' fields are used to avoid touching 'struct page',
> to avoid possible cache bouncing when more frags can be allocated
> from the page while other frags are freed at the same time, before this patch?
> It might be worth calling that out in the commit log or splitting it into
> another patch to make it clearer and easier to review?

Yes, there is a cost for going from page to virtual address. That is
why we only use the page when we finally get to freeing or resetting
the pagecnt_bias.

Also I have some concerns about going from page to folio as it seems
like folio_alloc sets up the transparent hugepage destructor instead
of using the compound page destructor. I would think that would slow
down most users as it looks like there is a spinlock that is taken in
the hugepage destructor that isn't there in the compound page
destructor.
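
(At the time of writing, folio_alloc() boils down to roughly the following -
paraphrased, the real code goes through __folio_alloc():)

	struct folio *folio_alloc(gfp_t gfp, unsigned int order)
	{
		struct page *page = alloc_pages(gfp | __GFP_COMP, order);

		if (page && order > 1)
			prep_transhuge_page(page);	/* installs the THP destructor */
		return (struct folio *)page;
	}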


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 03/12] mm: Make the page_frag_cache allocator alignment param a pow-of-2
  2023-05-24 15:33 ` [PATCH net-next 03/12] mm: Make the page_frag_cache allocator alignment param a pow-of-2 David Howells
@ 2023-05-27 15:54   ` Alexander H Duyck
  2023-11-30  9:00     ` Yunsheng Lin
  2023-06-16 15:28   ` David Howells
  1 sibling, 1 reply; 19+ messages in thread
From: Alexander H Duyck @ 2023-05-27 15:54 UTC (permalink / raw)
  To: David Howells, netdev
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Willem de Bruijn, David Ahern, Matthew Wilcox, Jens Axboe,
	linux-mm, linux-kernel, Jeroen de Borst, Catherine Sullivan,
	Shailend Chand, Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme

On Wed, 2023-05-24 at 16:33 +0100, David Howells wrote:
> Make the page_frag_cache allocator's alignment parameter a power of 2
> rather than a mask and give a warning if it isn't.
> 
> This means that it's consistent with {napi,netdev}_alloc_frag_align() and
> allows __{napi,netdev}_alloc_frag_align() to be removed.
> 

This goes against the original intention of these functions. One of the
reasons why this is being used is because when somebody enables
something like 2K jumbo frames they don't necessarily want to have to
allocate 4K SLABs. Instead they can just add a bit of overhead and get
almost twice the utilization out of an order 3 page.

The requirement should only be cache alignment, not power of 2
alignment. This isn't meant to be a slab allocator. We are just
sectioning up pages to handle mixed workloads. In the case of
networking we can end up getting everything from 60B packets, to 1514B
in the standard cases. That was why we started sectioning up pages in
the first place, so putting a power of 2 requirement on it doesn't fit
our use case at all and is what we were trying to get away from with
the SLAB allocators.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 05/12] mm: Make the page_frag_cache allocator handle __GFP_ZERO itself
  2023-05-27  0:57   ` Jakub Kicinski
@ 2023-05-27 15:54     ` Alexander Duyck
  0 siblings, 0 replies; 19+ messages in thread
From: Alexander Duyck @ 2023-05-27 15:54 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Howells, netdev, David S. Miller, Eric Dumazet,
	Paolo Abeni, Willem de Bruijn, David Ahern, Matthew Wilcox,
	Jens Axboe, linux-mm, linux-kernel, Jeroen de Borst,
	Catherine Sullivan, Shailend Chand, Felix Fietkau, John Crispin,
	Sean Wang, Mark Lee, Lorenzo Bianconi, Matthias Brugger,
	AngeloGioacchino Del Regno, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Chaitanya Kulkarni,
	Andrew Morton, linux-arm-kernel, linux-mediatek, linux-nvme

On Fri, May 26, 2023 at 5:57 PM Jakub Kicinski <kuba@kernel.org> wrote:
>
> On Wed, 24 May 2023 16:33:04 +0100 David Howells wrote:
> > Make the page_frag_cache allocator handle __GFP_ZERO itself rather than
> > passing it off to the page allocator.  There may be a mix of callers, some
> > specifying __GFP_ZERO and some not - and even if all specify __GFP_ZERO, we
> > might refurbish the page, in which case the returned memory doesn't get
> > cleared.
>
> I think it's pretty clear that the page frag allocator was never supposed
> to support GFP_ZERO, as we don't need it in networking.  So maybe
> you're better off adding the memset() in nvme?
>
> CCing Alex, who I think would say something along those lines :)
> IDK how much we still care given that most networking drivers are
> migrating to page_pool these days.

Yeah, the page frag allocator wasn't meant to handle things like this.
Generally the cache was meant to be used within one context so that
the GFP flags used were consistent between calls. Currently the only
thing passed appears to be GFP_ATOMIC.

Also, I am not a big fan of pulling this out of page_alloc.c.  The fact
is that is where the allocation functions live, so it makes sense to
just leave it there.  It isn't as if, in my view, there is enough code
added to justify creating yet another file and making it harder to
track git history as a result.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios
  2023-05-26 11:56   ` Yunsheng Lin
  2023-05-27 15:47     ` Alexander H Duyck
@ 2023-06-06  8:25     ` David Howells
  2023-06-06 14:59       ` Alexander Duyck
  1 sibling, 1 reply; 19+ messages in thread
From: David Howells @ 2023-06-06  8:25 UTC (permalink / raw)
  To: Alexander H Duyck
  Cc: Yunsheng Lin, David Howells, netdev, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Willem de Bruijn,
	David Ahern, Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel,
	Jeroen de Borst, Catherine Sullivan, Shailend Chand,
	Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme

Alexander H Duyck <alexander.duyck@gmail.com> wrote:

> Also I have some concerns about going from page to folio as it seems
> like folio_alloc sets up the transparent hugepage destructor instead
> of using the compound page destructor. I would think that would slow
> down most users as it looks like there is a spinlock that is taken in
> the hugepage destructor that isn't there in the compound page
> destructor.

Note that this code is going to have to move to folios[*] at some point.
"Old-style" compound pages are going to go away, I believe.  Matthew Wilcox
and the mm folks are on a drive towards simplifying memory management,
formalising chunks larger than a single page - with the ultimate aim of
reducing the page struct to a single, typed pointer.

So, take, for example, a folio: As I understand it, this will no longer
overlay struct page, but rather will become a single, dynamically-allocated
struct that covers a pow-of-2 number of pages.  A contiguous subset of page
structs will point at it.

However, rather than using a folio, we could define a "page fragment" memory
type.  Rather than having all the flags and fields to be found in struct
folio, it could have just the set to be found in page_frag_cache.

David

[*] It will be possible to have some other type than "folio".  See "struct
slab" in mm/slab.h for example.  struct slab corresponds to a set of pages
and, in the future, a number of struct pages will point at it.



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios
  2023-06-06  8:25     ` David Howells
@ 2023-06-06 14:59       ` Alexander Duyck
  0 siblings, 0 replies; 19+ messages in thread
From: Alexander Duyck @ 2023-06-06 14:59 UTC (permalink / raw)
  To: David Howells
  Cc: Yunsheng Lin, netdev, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Willem de Bruijn, David Ahern,
	Matthew Wilcox, Jens Axboe, linux-mm, linux-kernel,
	Jeroen de Borst, Catherine Sullivan, Shailend Chand,
	Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme

On Tue, Jun 6, 2023 at 1:25 AM David Howells <dhowells@redhat.com> wrote:
>
> Alexander H Duyck <alexander.duyck@gmail.com> wrote:
>
> > Also I have some concerns about going from page to folio as it seems
> > like folio_alloc sets up the transparent hugepage destructor instead
> > of using the compound page destructor. I would think that would slow
> > down most users as it looks like there is a spinlock that is taken in
> > the hugepage destructor that isn't there in the compound page
> > destructor.
>
> Note that this code is going to have to move to folios[*] at some point.
> "Old-style" compound pages are going to go away, I believe.  Matthew Wilcox
> and the mm folks are on a drive towards simplifying memory management,
> formalising chunks larger than a single page - with the ultimate aim of
> reducing the page struct to a single, typed pointer.

I'm not against making the move, but as others have pointed out this
is getting into unrelated things. One of those is the fact that to
transition to using folios we don't need to get rid of the use of the
virtual address. The idea behind using the virtual address here is
that we can avoid a bunch of address translation overhead since we
only need to use the folio if we are going to allocate, retire, or
recycle a page/folio. If we are using an order 3 page, that shouldn't
be very often.

> So, take, for example, a folio: As I understand it, this will no longer
> overlay struct page, but rather will become a single, dynamically-allocated
> struct that covers a pow-of-2 number of pages.  A contiguous subset of page
> structs will point at it.
>
> However, rather than using a folio, we could define a "page fragment" memory
> type.  Rather than having all the flags and fields to be found in struct
> folio, it could have just the set to be found in page_frag_cache.

I don't think we need a new memory type. For the most part the page
fragment code is really more a subset of something like
__get_free_pages, where the requester provides the size, is just given
a virtual address, and we shouldn't need to allocate a new page very
often since ideally the allocations are 2K or less in size.

Also one thing I would want to avoid is adding complexity to the
freeing path. The general idea with page frags is that they are meant
to be lightweight in terms of freeing as well. So just as they are
similar to __get_free_pages in terms of allocation, the freeing is
meant to be similar to free_pages.
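
i.e. the intended shape of a user is just something like this (sketch):

	struct page_frag_cache nc = {};
	void *buf;

	buf = page_frag_alloc(&nc, 256, GFP_ATOMIC);	/* carve 256 bytes off the cached page */
	if (buf) {
		/* ... use buf ... */
		page_frag_free(buf);	/* free by address, no size/order bookkeeping */
	}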

> David
>
> [*] It will be possible to have some other type than "folio".  See "struct
> slab" in mm/slab.h for example.  struct slab corresponds to a set of pages
> and, in the future, a number of struct pages will point at it.

I want to avoid getting anywhere near the complexity of a slab
allocator. The whole point of this was to keep it simple so that
drivers could use it and get decent performance. When I implemented it
in the Intel drivers back in the day, this approach was essentially
just a reference count/page offset hack that allowed us to split a
page in two and use the two halves as a sort of mobius strip within
the ring buffer.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 03/12] mm: Make the page_frag_cache allocator alignment param a pow-of-2
  2023-05-24 15:33 ` [PATCH net-next 03/12] mm: Make the page_frag_cache allocator alignment param a pow-of-2 David Howells
  2023-05-27 15:54   ` Alexander H Duyck
@ 2023-06-16 15:28   ` David Howells
  2023-06-16 16:06     ` Alexander Duyck
  1 sibling, 1 reply; 19+ messages in thread
From: David Howells @ 2023-06-16 15:28 UTC (permalink / raw)
  To: Alexander H Duyck
  Cc: dhowells, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Willem de Bruijn, David Ahern, Matthew Wilcox,
	Jens Axboe, linux-mm, linux-kernel, Jeroen de Borst,
	Catherine Sullivan, Shailend Chand, Felix Fietkau, John Crispin,
	Sean Wang, Mark Lee, Lorenzo Bianconi, Matthias Brugger,
	AngeloGioacchino Del Regno, Keith Busch, Jens Axboe,
	Christoph Hellwig, Sagi Grimberg, Chaitanya Kulkarni,
	Andrew Morton, linux-arm-kernel, linux-mediatek, linux-nvme

Alexander H Duyck <alexander.duyck@gmail.com> wrote:

> The requirement should only be cache alignment, not power of 2
> alignment.

Sure, but, upstream, page_frag_alloc_align() allows the specification of an
alignment and applies that alignment by:

	offset &= align_mask;

which doesn't really make sense unless the alignment boils down to being a
power of two.  Now, it might make sense to kill off the align_mask parameter
and just go with SMP_CACHE_BYTES (which will be a power of two).

Note, though, that most users seem to use an align_mask of ~0u which feels a
bit dodgy (it's not an unsigned long), but is probably fine.
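
For what it's worth, the reason the mask form implies a power of two:

	offset &= ~(align - 1);
	/* align = 64:  ~(64 - 1) = 0xffffffc0, offset is rounded down to a
	 *              multiple of 64;
	 * align = 48:  ~(48 - 1) = 0xffffffd0, and ANDing with that is not
	 *              an alignment operation at all.
	 */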

David



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 03/12] mm: Make the page_frag_cache allocator alignment param a pow-of-2
  2023-06-16 15:28   ` David Howells
@ 2023-06-16 16:06     ` Alexander Duyck
  0 siblings, 0 replies; 19+ messages in thread
From: Alexander Duyck @ 2023-06-16 16:06 UTC (permalink / raw)
  To: David Howells
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Willem de Bruijn, David Ahern, Matthew Wilcox, Jens Axboe,
	linux-mm, linux-kernel, Jeroen de Borst, Catherine Sullivan,
	Shailend Chand, Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme

On Fri, Jun 16, 2023 at 8:28 AM David Howells <dhowells@redhat.com> wrote:
>
> Alexander H Duyck <alexander.duyck@gmail.com> wrote:
>
> > The requirement should only be cache alignment, not power of 2
> > alignment.
>
> Sure, but, upstream, page_frag_alloc_align() allows the specification of an
> alignment and applies that alignment by:
>
>         offset &= align_mask;
>
> which doesn't really make sense unless the alignment boils down to being a
> power of two.  Now, it might make sense to kill off the align_mask parameter
> and just go with SMP_CACHE_BYTES (which will be a power of two).
>
> Note, though, that most users seem to use an align_mask of ~0u which feels a
> bit dodgy (it's not an unsigned long), but is probably fine.

Yeah, now that I think about it this should be fine. I must have been
responding to this without enough sleep. When I originally read it I
saw it as limiting it to a power of 2 allocation, not limiting the
alignment mask to a power of 2.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next 03/12] mm: Make the page_frag_cache allocator alignment param a pow-of-2
  2023-05-27 15:54   ` Alexander H Duyck
@ 2023-11-30  9:00     ` Yunsheng Lin
  0 siblings, 0 replies; 19+ messages in thread
From: Yunsheng Lin @ 2023-11-30  9:00 UTC (permalink / raw)
  To: Alexander H Duyck, David Howells, netdev
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Willem de Bruijn, David Ahern, Matthew Wilcox, Jens Axboe,
	linux-mm, linux-kernel, Jeroen de Borst, Catherine Sullivan,
	Shailend Chand, Felix Fietkau, John Crispin, Sean Wang, Mark Lee,
	Lorenzo Bianconi, Matthias Brugger, AngeloGioacchino Del Regno,
	Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Chaitanya Kulkarni, Andrew Morton, linux-arm-kernel,
	linux-mediatek, linux-nvme

On 2023/5/27 23:54, Alexander H Duyck wrote:
> On Wed, 2023-05-24 at 16:33 +0100, David Howells wrote:
>> Make the page_frag_cache allocator's alignment parameter a power of 2
>> rather than a mask and give a warning if it isn't.
>>
>> This means that it's consistent with {napi,netdev}_alloc_frag_align() and
>> allows __{napi,netdev}_alloc_frag_align() to be removed.

I am trying to remove the page frag implementation in
vhost_net_page_frag_refill() by using page_frag_alloc_align(), and
I ended up with a similar patch to this one.

>>
> 
> This goes against the original intention of these functions. One of the
> reasons why this is being used is because when somebody enables
> something like 2K jumbo frames they don't necessarily want to have to
> allocate 4K SLABs. Instead they can just add a bit of overhead and get
> almost twice the utilization out of an order 3 page.
> 
> The requirement should only be cache alignment, not power of 2
> alignment. This isn't meant to be a slab allocator. We are just
> sectioning up pages to handle mixed workloads. In the case of
> networking we can end up getting everything from 60B packets, to 1514B
> in the standard cases. That was why we started sectioning up pages in
> the first place, so putting a power of 2 requirement on it doesn't fit
> our use case at all and is what we were trying to get away from with
> the SLAB allocators.

It seems that the is_power_of_2() check in this patch does not exclude
the unaligned case if we pass 'align' as 1, which means we can still
support the 'everything from 60B packets, to 1514B' case.
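
A quick check of that degenerate case (sketch):

	is_power_of_2(1);	/* true: 1 != 0 && (1 & (1 - 1)) == 0 */
	offset &= ~(1UL - 1);	/* &= ~0UL, i.e. offset is left unchanged */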

> 
> .
> 


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2023-12-04  4:31 UTC | newest]

Thread overview: 19+ messages
     [not found] <20230524153311.3625329-1-dhowells@redhat.com>
2023-05-24 15:33 ` [PATCH net-next 02/12] mm: Provide a page_frag_cache allocator cleanup function David Howells
2023-05-24 15:33 ` [PATCH net-next 03/12] mm: Make the page_frag_cache allocator alignment param a pow-of-2 David Howells
2023-05-27 15:54   ` Alexander H Duyck
2023-11-30  9:00     ` Yunsheng Lin
2023-06-16 15:28   ` David Howells
2023-06-16 16:06     ` Alexander Duyck
2023-05-24 15:33 ` [PATCH net-next 04/12] mm: Make the page_frag_cache allocator use multipage folios David Howells
2023-05-26 11:56   ` Yunsheng Lin
2023-05-27 15:47     ` Alexander H Duyck
2023-06-06  8:25     ` David Howells
2023-06-06 14:59       ` Alexander Duyck
2023-05-26 12:47   ` David Howells
2023-05-26 14:06     ` Mika Penttilä
2023-05-27  0:50   ` Jakub Kicinski
2023-05-24 15:33 ` [PATCH net-next 05/12] mm: Make the page_frag_cache allocator handle __GFP_ZERO itself David Howells
2023-05-27  0:57   ` Jakub Kicinski
2023-05-27 15:54     ` Alexander Duyck
2023-05-24 15:33 ` [PATCH net-next 06/12] mm: Make the page_frag_cache allocator use per-cpu David Howells
2023-05-27  1:02   ` Jakub Kicinski
