From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: 
Received: from bombadil.infradead.org ([198.137.202.133]:37212 "EHLO bombadil.infradead.org" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751832AbeCOTxc (ORCPT ); Thu, 15 Mar 2018 15:53:32 -0400
From: Matthew Wilcox 
To: Alexander Duyck 
Cc: linux-mm@vger.kernel.org, netdev@vger.kernel.org, Matthew Wilcox 
Subject: [RFC 2/2] page_frag_cache: Store metadata in struct page
Date: Thu, 15 Mar 2018 12:53:29 -0700
Message-Id: <20180315195329.7787-3-willy@infradead.org>
In-Reply-To: <20180315195329.7787-1-willy@infradead.org>
References: <20180315195329.7787-1-willy@infradead.org>
Sender: netdev-owner@vger.kernel.org
List-ID: 

From: Matthew Wilcox 

Shrink page_frag_cache from 24 to 8 bytes (a single pointer to the
currently-in-use struct page) by using the page's refcount directly
(instead of maintaining a bias) and storing our current progress through
the page in the same bits currently used for page->index.  We no longer
need to reflect the page pfmemalloc state if we're storing the page
directly.

On the downside, we now call page_address() on every allocation, and we
do an atomic_inc() rather than a non-atomic decrement, but we should
touch the same number of cachelines and there is far less code (and the
code is less complex).

Signed-off-by: Matthew Wilcox 
---
 include/linux/mm_types.h |  17 +-----
 mm/page_alloc.c          | 135 ++++++++++++++++++++++++-----------------------
 net/core/skbuff.c        |   4 +-
 3 files changed, 74 insertions(+), 82 deletions(-)

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 1c5dea402501..f922cb62bd91 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -90,6 +90,7 @@ struct page {
 	union {
 		pgoff_t index;		/* Our offset within mapping. */
 		void *freelist;		/* sl[aou]b first free object */
+		unsigned int offset;	/* page_frag highwater mark */
 		/* page_deferred_list().prev	-- second tail page */
 	};
 
@@ -219,22 +220,8 @@ struct page {
 #endif
 } _struct_page_alignment;
 
-#define PAGE_FRAG_CACHE_MAX_SIZE	__ALIGN_MASK(32768, ~PAGE_MASK)
-#define PAGE_FRAG_CACHE_MAX_ORDER	get_order(PAGE_FRAG_CACHE_MAX_SIZE)
-
 struct page_frag_cache {
-	void * va;
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	__u16 offset;
-	__u16 size;
-#else
-	__u32 offset;
-#endif
-	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_refcount every time we allocate a fragment.
-	 */
-	unsigned int pagecnt_bias;
-	bool pfmemalloc;
+	struct page *page;
 };
 
 typedef unsigned long vm_flags_t;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7a9c14214ed2..f8a176aab287 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4319,34 +4319,72 @@ void free_pages(unsigned long addr, unsigned int order)
 EXPORT_SYMBOL(free_pages);
 
 /*
- * Page Fragment:
- *  An arbitrary-length arbitrary-offset area of memory which resides
- *  within a 0 or higher order page.  Multiple fragments within that page
- *  are individually refcounted, in the page's reference counter.
+ * The page fragment allocator is simple, yet effective.  It allocates
+ * pages from the page allocator, then hands out fragments of those
+ * pages to its callers.  It makes no effort to track which parts of
+ * the page remain in use, always allocating fresh memory.  The page
+ * reference count is used to keep track of whether any fragment is
+ * still in use; when all fragments in a page have been freed, the
+ * entire page is returned to the page allocator.
 *
- * The page_frag functions below provide a simple allocation framework for
- * page fragments.  This is used by the network stack and network device
- * drivers to provide a backing region of memory for use as either an
- * sk_buff->head, or to be used in the "frags" portion of skb_shared_info.
+ * The page fragment allocator performs no locking.  The caller is
+ * expected to ensure that two callers cannot simultaneously allocate
+ * from the same page_frag_cache.  Freeing is atomic and is permitted
+ * to happen simultaneously with other frees or an allocation.
+ *
+ * The allocator uses the struct page to store its state.  The 'offset'
+ * field in struct page is used to track how far through the page the
+ * allocation has proceeded.  The 'refcount' field is used to track
+ * how many fragments have been allocated from this page.  All other
+ * fields in struct page may be used by the owner of the page_frag_cache.
+ * The refcount is incremented by one while the page is still actively being
+ * allocated from; this prevents it from being freed prematurely.
  */
-static struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
-					     gfp_t gfp_mask)
+
+#define PAGE_FRAG_ALLOC_SIZE	(64 * 1024)
+#define PAGE_FRAG_ORDER		get_order(PAGE_FRAG_ALLOC_SIZE)
+
+static noinline
+struct page *__page_frag_cache_refill(struct page_frag_cache *nc,
+				unsigned int size, gfp_t gfp_mask)
 {
+	struct page *old = nc->page;
 	struct page *page = NULL;
-	gfp_t gfp = gfp_mask;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-	gfp_mask |= __GFP_COMP | __GFP_NOWARN | __GFP_NORETRY |
-		    __GFP_NOMEMALLOC;
-	page = alloc_pages_node(NUMA_NO_NODE, gfp_mask,
-				PAGE_FRAG_CACHE_MAX_ORDER);
-	nc->size = page ? PAGE_FRAG_CACHE_MAX_SIZE : PAGE_SIZE;
-#endif
-	if (unlikely(!page))
-		page = alloc_pages_node(NUMA_NO_NODE, gfp, 0);
 
-	nc->va = page ? page_address(page) : NULL;
+	if (size > PAGE_FRAG_ALLOC_SIZE)
+		return NULL;
+
+	/*
+	 * If all the previous allocations from this page have already been
+	 * freed, reuse the page if it can satisfy this allocation.
+	 */
+	if (old && page_ref_count(old) == 1) {
+		unsigned int offset = PAGE_SIZE << compound_order(old);
+
+		if (offset > size) {
+			old->offset = offset;
+			return old;
+		}
+	}
+
+	if (PAGE_FRAG_ORDER > 0) {
+		gfp_t gfp = gfp_mask | __GFP_COMP | __GFP_NOWARN |
+				__GFP_NORETRY | __GFP_NOMEMALLOC;
+
+		page = alloc_pages_node(NUMA_NO_NODE, gfp, PAGE_FRAG_ORDER);
+		if (unlikely(!page) && size > PAGE_SIZE)
+			return NULL;
+	}
+	if (unlikely(!page))
+		page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, 0);
+	if (unlikely(!page))
+		return NULL;
+
+	if (old)
+		put_page(old);
+	nc->page = page;
+	page->offset = PAGE_SIZE << compound_order(page);
 
 	return page;
 }
 
@@ -4366,56 +4404,23 @@ void __page_frag_cache_drain(struct page *page, unsigned int count)
 EXPORT_SYMBOL(__page_frag_cache_drain);
 
 void *page_frag_alloc(struct page_frag_cache *nc,
-		      unsigned int fragsz, gfp_t gfp_mask)
+		      unsigned int size, gfp_t gfp_mask)
 {
-	unsigned int size = PAGE_SIZE;
-	struct page *page;
-	int offset;
+	struct page *page = nc->page;
+	unsigned int offset = page->offset;
 
-	if (unlikely(!nc->va)) {
-refill:
-		page = __page_frag_cache_refill(nc, gfp_mask);
+	if (unlikely(!page || offset < size)) {
+		page = __page_frag_cache_refill(nc, size, gfp_mask);
 		if (!page)
 			return NULL;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* Even if we own the page, we do not use atomic_set().
-		 * This would break get_page_unless_zero() users.
-		 */
-		page_ref_add(page, size - 1);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pfmemalloc = page_is_pfmemalloc(page);
-		nc->pagecnt_bias = size;
-		nc->offset = size;
-	}
-
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
-		page = virt_to_page(nc->va);
-
-		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
-			goto refill;
-
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
-		/* OK, page count is 0, we can safely set it */
-		set_page_count(page, size);
-
-		/* reset page count bias and offset to start of new frag */
-		nc->pagecnt_bias = size;
-		offset = size - fragsz;
+		offset = page->offset;
 	}
 
-	nc->pagecnt_bias--;
-	nc->offset = offset;
+	page_ref_inc(page);
+	offset -= size;
+	page->offset = offset;
 
-	return nc->va + offset;
+	return page_address(page) + offset;
 }
 EXPORT_SYMBOL(page_frag_alloc);
 
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 09bd89c90a71..59df4db31aed 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -412,7 +412,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 
 	nc = this_cpu_ptr(&netdev_alloc_cache);
 	data = page_frag_alloc(nc, len, gfp_mask);
-	pfmemalloc = nc->pfmemalloc;
+	pfmemalloc = page_is_pfmemalloc(nc->page);
 
 	local_irq_restore(flags);
 
@@ -486,7 +486,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 	}
 
 	/* use OR instead of assignment to avoid clearing of bits in mask */
-	if (nc->page.pfmemalloc)
+	if (page_is_pfmemalloc(nc->page.page))
 		skb->pfmemalloc = 1;
 	skb->head_frag = 1;
 
-- 
2.16.2
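
As a quick orientation for readers who don't know this interface, the sketch
below shows how a caller might use the shrunken, single-pointer
page_frag_cache.  It is illustrative only and not part of the patch: the names
demo_cache and frag_demo() and the 256-byte fragment size are made up for the
example, while page_frag_alloc(), page_frag_free() and page_is_pfmemalloc()
are the existing kernel interfaces the patch builds on.  As the comment added
to mm/page_alloc.c says, callers must still serialise allocations from the
same cache themselves; only freeing is atomic.

/*
 * Illustrative sketch only (not part of the patch): a possible caller of
 * the single-pointer page_frag_cache.  demo_cache, frag_demo() and the
 * 256-byte fragment size are hypothetical.
 */
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/printk.h>

static struct page_frag_cache demo_cache;	/* zeroed: .page starts out NULL */

static void *frag_demo(void)
{
	void *buf;

	/* Carve a 256-byte fragment out of the cache's current page. */
	buf = page_frag_alloc(&demo_cache, 256, GFP_ATOMIC);
	if (!buf)
		return NULL;

	/* pfmemalloc state is now read from the backing struct page. */
	if (page_is_pfmemalloc(demo_cache.page))
		pr_debug("fragment comes from a pfmemalloc page\n");

	return buf;	/* released later with page_frag_free(buf) */
}

Freeing stays per-fragment: each page_frag_free() drops one reference on the
backing page, and the page goes back to the page allocator once the extra
reference held while the page is still being allocated from is also gone.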