From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: 
Received: from bombadil.infradead.org ([198.137.202.133]:37212 "EHLO bombadil.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751832AbeCOTxc (ORCPT ); Thu, 15 Mar 2018 15:53:32 -0400
From: Matthew Wilcox 
To: Alexander Duyck 
Cc: linux-mm@vger.kernel.org, netdev@vger.kernel.org, Matthew Wilcox 
Subject: [RFC 2/2] page_frag_cache: Store metadata in struct page
Date: Thu, 15 Mar 2018 12:53:29 -0700
Message-Id: <20180315195329.7787-3-willy@infradead.org>
In-Reply-To: <20180315195329.7787-1-willy@infradead.org>
References: <20180315195329.7787-1-willy@infradead.org>
Sender: netdev-owner@vger.kernel.org
List-ID: 

From: Matthew Wilcox 

Shrink page_frag_cache from 24 to 8 bytes (a single pointer to the
currently-in-use struct page) by using the page's refcount directly
(instead of maintaining a bias) and storing our current progress through
the page in the same bits currently used for page->index.  We no longer
need to reflect the page pfmemalloc state if we're storing the page
directly.

On the downside, we now call page_address() on every allocation, and we
do an atomic_inc() rather than a non-atomic decrement, but we should
touch the same number of cachelines and there is far less code (and the
code is less complex).

Signed-off-by: Matthew Wilcox 
---
 include/linux/mm_types.h |  17 +-----
 mm/page_alloc.c          | 135 ++++++++++++++++++++++++-----------------------
 net/core/skbuff.c        |   4 +-
 3 files changed, 74 insertions(+), 82 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1c5dea402501..f922cb62bd91 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -90,6 +90,7 @@ struct page {
 	union {
 		pgoff_t index;		/* Our offset within mapping. */
 		void *freelist;		/* sl[aou]b first free object */
+		unsigned int offset;	/* page_frag highwater mark */
 		/* page_deferred_list().prev	-- second tail page */
 	};
 
@@ -219,22 +220,8 @@ struct page {
 #endif
 } _struct_page_alignment;
 
-#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
-#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
-
 struct page_frag_cache {
-	void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
-	__u16 size;
-#else
-	__u32 offset;
-#endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int pagecnt_bias;
-	bool pfmemalloc;
+	struct page *page;
 };
 
 typedef unsigned long vm_flags_t;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7a9c14214ed2..f8a176aab287 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4319,34 +4319,72 @@ void free_pages(unsigned long addr, unsigned int order)
 EXPORT_SYMBOL(free_pages);
 
 /*
- * Page Fragment:
- *  An arbitrary-length arbitrary-offset area of memory which resides
- *  within a 0 or higher order page.  Multiple fragments within that page
- *  are individually refcounted, in the page's reference counter.
+ * The page fragment allocator is simple, yet effective.  It allocates
+ * pages from the page allocator, then hands out fragments of those
+ * pages to its callers.  It makes no effort to track which parts of
+ * the page remain in use, always allocating fresh memory.  The page
+ * reference count is used to keep track of whether any fragment is
+ * still in use; when all fragments in a page have been freed, the
+ * entire page is returned to the page allocator.
 *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments.  This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ * The page fragment allocator performs no locking.  The caller is
+ * expected to ensure that two callers cannot simultaneously allocate
+ * from the same page_frag_cache.  Freeing is atomic and is permitted
+ * to happen simultaneously with other frees or an allocation.
+ *
+ * The allocator uses the struct page to store its state.  The 'offset'
+ * field in struct page is used to track how far through the page the
+ * allocation has proceeded.  The 'refcount' field is used to track
+ * how many fragments have been allocated from this page.  All other
+ * fields in struct page may be used by the owner of the page_frag_cache.
+ * The refcount is incremented by one while the page is still actively being
+ * allocated from; this prevents it from being freed prematurely.
  */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
-					     gfp_t gfp_mask)
+
+#define PAGE_FRAG_ALLOC_SIZE	(64 * 1024)
+#define PAGE_FRAG_ORDER		get_order(PAGE_FRAG_ALLOC_SIZE)
+
+static noinline
+struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+				unsigned int size, gfp_t gfp_mask)
 {
+	struct page *old = nc->page;
 	struct page *page = NULL;
-	gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-		    __GFP_NOMEMALLOC;
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-#endif
-	if (unlikely(!page))
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
 
-	nc->va = page ? page_address(page) : NULL;
+	if (size > PAGE_FRAG_ALLOC_SIZE)
+		return NULL;
+
+	/*
+	 * If all the previous allocations from this page have already been
+	 * freed, reuse the page if it can satisfy this allocation.
+	 */
+	if (old && page_ref_count(old) == 1) {
+		unsigned int offset = PAGE_SIZE << compound_order(old);
+
+		if (offset > size) {
+			old->offset = offset;
+			return old;
+		}
+	}
+
+	if (PAGE_FRAG_ORDER > 0) {
+		gfp_t gfp = gfp_mask | __GFP_COMP | __GFP_NOWARN |
+				__GFP_NORETRY | __GFP_NOMEMALLOC;
+
+		page = alloc_pages_node(NUMA_NO_NODE, gfp, PAGE_FRAG_ORDER);
+		if (unlikely(!page) && size > PAGE_SIZE)
+			return NULL;
+	}
+	if (unlikely(!page))
+		page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0);
+	if (unlikely(!page))
+		return NULL;
+
+	if (old)
+		put_page(old);
+	nc->page = page;
+	page->offset = PAGE_SIZE << compound_order(page);
 
 	return page;
 }
 
@@ -4366,56 +4404,23 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
 void *page_frag_alloc(struct page_frag_cache *nc,
-		      unsigned int fragsz, gfp_t gfp_mask)
+		      unsigned int size, gfp_t gfp_mask)
 {
-	unsigned int size = PAGE_SIZE;
-	struct page *page;
-	int offset;
+	struct page *page = nc->page;
+	unsigned int offset = page->offset;
 
-	if (unlikely(!nc->va)) {
-refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
+	if (unlikely(!page || offset < size)) {
+		page = __page_frag_cache_refill(nc, size, gfp_mask);
 		if (!page)
 			return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, size - 1);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = size;
-		nc->offset = size;
-	}
-
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
-		page = virt_to_page(nc->va);
-
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, size);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = size;
-		offset = size - fragsz;
+		offset = page->offset;
 	}
 
-	nc->pagecnt_bias--;
-	nc->offset = offset;
+	page_ref_inc(page);
+	offset -= size;
+	page->offset = offset;
 
-	return nc->va + offset;
+	return page_address(page) + offset;
 }
 EXPORT_SYMBOL(page_frag_alloc);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 09bd89c90a71..59df4db31aed 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -412,7 +412,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 
 	nc = this_cpu_ptr(&netdev_alloc_cache);
 	data = page_frag_alloc(nc, len, gfp_mask);
-	pfmemalloc = nc->pfmemalloc;
+	pfmemalloc = page_is_pfmemalloc(nc->page);
 
 	local_irq_restore(flags);
 
@@ -486,7 +486,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	}
 
 	/* use OR instead of assignment to avoid clearing of bits in mask */
-	if (nc->page.pfmemalloc)
+	if (page_is_pfmemalloc(nc->page.page))
 		skb->pfmemalloc = 1;
 	skb->head_frag = 1;
 
-- 
2.16.2
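
As a quick orientation for readers who don't know this interface, the sketch
below shows how a caller might use the shrunken, single-pointer
page_frag_cache.  It is illustrative only and not part of the patch: the names
demo_cache and frag_demo() and the 256-byte fragment size are made up for the
example, while page_frag_alloc(), page_frag_free() and page_is_pfmemalloc()
are the existing kernel interfaces the patch builds on.  As the comment added
to mm/page_alloc.c says, callers must still serialise allocations from the
same cache themselves; only freeing is atomic.

/*
 * Illustrative sketch only (not part of the patch): a possible caller of
 * the single-pointer page_frag_cache.  demo_cache, frag_demo() and the
 * 256-byte fragment size are hypothetical.
 */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

static struct page_frag_cache demo_cache;	/* zeroed: .page starts out NULL */

static void *frag_demo(void)
{
	void *buf;

	/* Carve a 256-byte fragment out of the cache's current page. */
	buf = page_frag_alloc(&demo_cache, 256, GFP_ATOMIC);
	if (!buf)
		return NULL;

	/* pfmemalloc state is now read from the backing struct page. */
	if (page_is_pfmemalloc(demo_cache.page))
		pr_debug("fragment comes from a pfmemalloc page\n");

	return buf;	/* released later with page_frag_free(buf) */
}

Freeing stays per-fragment: each page_frag_free() drops one reference on the
backing page, and the page goes back to the page allocator once the extra
reference held while the page is still being allocated from is also gone.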