[PATCHv12,2/4] zbud: add to mm/
diff mbox series

Message ID 1369067168-12291-3-git-send-email-sjenning@linux.vnet.ibm.com
State New, archived
Series
  • zswap: compressed swap caching

Commit Message

Seth Jennings May 20, 2013, 4:26 p.m. UTC
zbud is a special purpose allocator for storing compressed pages. It is
designed to store up to two compressed pages per physical page.  While this
design limits storage density, it has simple and deterministic reclaim
properties that make it preferable to a higher density approach when reclaim
will be used.

zbud works by storing compressed pages, or "zpages", together in pairs in a
single memory page called a "zbud page".  The first buddy is "left
justifed" at the beginning of the zbud page, and the last buddy is "right
justified" at the end of the zbud page.  The benefit is that if either
buddy is freed, the freed buddy space, coalesced with whatever slack space
that existed between the buddies, results in the largest possible free region
within the zbud page.

zbud also provides an attractive lower bound on density. The ratio of zpages
to zbud pages cannot be less than 1.  This ensures that zbud can never "do
harm" by using more pages to store zpages than the uncompressed zpages would
have used on their own.

This implementation is a rewrite of the zbud allocator internally used
by zcache in the drivers/staging tree.  The rewrite was necessary to
remove some of the zcache specific elements that were ingrained throughout
and provide a generic allocation interface that can later be used by
zsmalloc and others.

This patch adds zbud to mm/ for later use by zswap.

Signed-off-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
Acked-by: Rik van Riel <riel@redhat.com>
---
 include/linux/zbud.h |  22 +++
 mm/Kconfig           |  10 +
 mm/Makefile          |   1 +
 mm/zbud.c            | 543 +++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 576 insertions(+)
 create mode 100644 include/linux/zbud.h
 create mode 100644 mm/zbud.c
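
For reference, a minimal sketch of how a caller such as zswap might drive this
interface.  The helper names, buffer and length below are hypothetical; only
the zbud_* calls come from the include/linux/zbud.h added by this patch:

static int my_evict(struct zbud_pool *pool, unsigned long handle)
{
	/* hypothetical callback: write the entry back to swap, then drop it */
	zbud_free(pool, handle);
	return 0;
}

static struct zbud_ops my_zbud_ops = {
	.evict = my_evict,
};

/* store 'clen' bytes of compressed data ('cbuf') into the pool */
static int store_compressed(struct zbud_pool *pool, void *cbuf,
			    unsigned int clen, unsigned long *handle)
{
	void *dst;
	int ret;

	ret = zbud_alloc(pool, clen, GFP_KERNEL, handle);
	if (ret)
		return ret;
	/* the handle is opaque; map it to get a usable pointer */
	dst = zbud_map(pool, *handle);
	memcpy(dst, cbuf, clen);
	zbud_unmap(pool, *handle);
	return 0;
}

/* pool creation, e.g. at init time:
 *	pool = zbud_create_pool(GFP_KERNEL, &my_zbud_ops);
 * reclaim of the coldest zbud page, e.g. under memory pressure:
 *	zbud_reclaim_page(pool, 8);
 */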

Comments

Bob Liu May 21, 2013, 3:37 a.m. UTC | #1
On 05/21/2013 12:26 AM, Seth Jennings wrote:
> zbud is an special purpose allocator for storing compressed pages. It is
> designed to store up to two compressed pages per physical page.  While this
> design limits storage density, it has simple and deterministic reclaim
> properties that make it preferable to a higher density approach when reclaim
> will be used.
> 
> zbud works by storing compressed pages, or "zpages", together in pairs in a
> single memory page called a "zbud page".  The first buddy is "left
> justifed" at the beginning of the zbud page, and the last buddy is "right
> justified" at the end of the zbud page.  The benefit is that if either
> buddy is freed, the freed buddy space, coalesced with whatever slack space
> that existed between the buddies, results in the largest possible free region
> within the zbud page.
> 
> zbud also provides an attractive lower bound on density. The ratio of zpages
> to zbud pages can not be less than 1.  This ensures that zbud can never "do
> harm" by using more pages to store zpages than the uncompressed zpages would
> have used on their own.
> 
> This implementation is a rewrite of the zbud allocator internally used
> by zcache in the driver/staging tree.  The rewrite was necessary to
> remove some of the zcache specific elements that were ingrained throughout
> and provide a generic allocation interface that can later be used by
> zsmalloc and others.
> 
> This patch adds zbud to mm/ for later use by zswap.
> 
> Signed-off-by: Seth Jennings <sjenning@linux.vnet.ibm.com>
> Acked-by: Rik van Riel <riel@redhat.com>
> ---
>  include/linux/zbud.h |  22 +++
>  mm/Kconfig           |  10 +
>  mm/Makefile          |   1 +
>  mm/zbud.c            | 543 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  4 files changed, 576 insertions(+)
>  create mode 100644 include/linux/zbud.h
>  create mode 100644 mm/zbud.c
> 
> diff --git a/include/linux/zbud.h b/include/linux/zbud.h
> new file mode 100644
> index 0000000..2571a5c
> --- /dev/null
> +++ b/include/linux/zbud.h
> @@ -0,0 +1,22 @@
> +#ifndef _ZBUD_H_
> +#define _ZBUD_H_
> +
> +#include <linux/types.h>
> +
> +struct zbud_pool;
> +
> +struct zbud_ops {
> +	int (*evict)(struct zbud_pool *pool, unsigned long handle);
> +};
> +
> +struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
> +void zbud_destroy_pool(struct zbud_pool *pool);
> +int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
> +	unsigned long *handle);
> +void zbud_free(struct zbud_pool *pool, unsigned long handle);
> +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
> +void *zbud_map(struct zbud_pool *pool, unsigned long handle);
> +void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
> +u64 zbud_get_pool_size(struct zbud_pool *pool);
> +
> +#endif /* _ZBUD_H_ */
> diff --git a/mm/Kconfig b/mm/Kconfig
> index e742d06..45ec90d 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -477,3 +477,13 @@ config FRONTSWAP
>  	  and swap data is stored as normal on the matching swap device.
>  
>  	  If unsure, say Y to enable frontswap.
> +
> +config ZBUD
> +	tristate
> +	default n
> +	help
> +	  A special purpose allocator for storing compressed pages.
> +	  It is designed to store up to two compressed pages per physical
> +	  page.  While this design limits storage density, it has simple and
> +	  deterministic reclaim properties that make it preferable to a higher
> +	  density approach when reclaim will be used.  
> diff --git a/mm/Makefile b/mm/Makefile
> index 72c5acb..95f0197 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -58,3 +58,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
>  obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
>  obj-$(CONFIG_CLEANCACHE) += cleancache.o
>  obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
> +obj-$(CONFIG_ZBUD)	+= zbud.o
> diff --git a/mm/zbud.c b/mm/zbud.c
> new file mode 100644
> index 0000000..b10a1f1
> --- /dev/null
> +++ b/mm/zbud.c
> @@ -0,0 +1,543 @@
> +/*
> + * zbud.c
> + *
> + * Copyright (C) 2013, Seth Jennings, IBM
> + *
> + * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
> + *
> + * zbud is an special purpose allocator for storing compressed pages.  Contrary
> + * to what its name may suggest, zbud is not a buddy allocator, but rather an
> + * allocator that "buddies" two compressed pages together in a single memory
> + * page.
> + *
> + * While this design limits storage density, it has simple and deterministic
> + * reclaim properties that make it preferable to a higher density approach when
> + * reclaim will be used.
> + *
> + * zbud works by storing compressed pages, or "zpages", together in pairs in a
> + * single memory page called a "zbud page".  The first buddy is "left
> + * justifed" at the beginning of the zbud page, and the last buddy is "right
> + * justified" at the end of the zbud page.  The benefit is that if either
> + * buddy is freed, the freed buddy space, coalesced with whatever slack space
> + * that existed between the buddies, results in the largest possible free region
> + * within the zbud page.
> + *
> + * zbud also provides an attractive lower bound on density. The ratio of zpages
> + * to zbud pages can not be less than 1.  This ensures that zbud can never "do
> + * harm" by using more pages to store zpages than the uncompressed zpages would
> + * have used on their own.
> + *
> + * zbud pages are divided into "chunks".  The size of the chunks is fixed at
> + * compile time and determined by NCHUNKS_ORDER below.  Dividing zbud pages
> + * into chunks allows organizing unbuddied zbud pages into a manageable number
> + * of unbuddied lists according to the number of free chunks available in the
> + * zbud page.
> + *
> + * The zbud API differs from that of conventional allocators in that the
> + * allocation function, zbud_alloc(), returns an opaque handle to the user,
> + * not a dereferenceable pointer.  The user must map the handle using
> + * zbud_map() in order to get a usable pointer by which to access the
> + * allocation data and unmap the handle with zbud_unmap() when operations
> + * on the allocation data are complete.
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/atomic.h>
> +#include <linux/list.h>
> +#include <linux/mm.h>
> +#include <linux/module.h>
> +#include <linux/preempt.h>
> +#include <linux/slab.h>
> +#include <linux/spinlock.h>
> +#include <linux/zbud.h>
> +
> +/*****************
> + * Structures
> +*****************/
> +/**
> + * struct zbud_page - zbud page metadata overlay
> + * @page:	typed reference to the underlying struct page
> + * @donotuse:	this overlays the page flags and should not be used
> + * @first_chunks:	the size of the first buddy in chunks, 0 if free
> + * @last_chunks:	the size of the last buddy in chunks, 0 if free
> + * @buddy:	links the zbud page into the unbuddied/buddied lists in the pool
> + * @lru:	links the zbud page into the lru list in the pool
> + *
> + * This structure overlays the struct page to store metadata needed for a
> + * single storage page in for zbud.  There is a BUILD_BUG_ON in zbud_init()
> + * that ensures this structure is not larger that struct page.
> + *
> + * The PG_reclaim flag of the underlying page is used for indicating
> + * that this zbud page is under reclaim (see zbud_reclaim_page())
> + */
> +struct zbud_page {
> +	union {
> +		struct page page;
> +		struct {
> +			unsigned long donotuse;
> +			u16 first_chunks;
> +			u16 last_chunks;
> +			struct list_head buddy;
> +			struct list_head lru;
> +		};
> +	};
> +};
> +
> +/*
> + * NCHUNKS_ORDER determines the internal allocation granularity, effectively
> + * adjusting internal fragmentation.  It also determines the number of
> + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
> + * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
> + * will be 64 freelists per pool.
> + */
> +#define NCHUNKS_ORDER	6
> +
> +#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
> +#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
> +#define NCHUNKS		(PAGE_SIZE >> CHUNK_SHIFT)
> +
> +/**
> + * struct zbud_pool - stores metadata for each zbud pool
> + * @lock:	protects all pool fields and first|last_chunk fields of any
> + *		zbud page in the pool
> + * @unbuddied:	array of lists tracking zbud pages that only contain one buddy;
> + *		the lists each zbud page is added to depends on the size of
> + *		its free region.
> + * @buddied:	list tracking the zbud pages that contain two buddies;
> + *		these zbud pages are full
> + * @lru:	list tracking the zbud pages in LRU order by most recently
> + *		added buddy.
> + * @pages_nr:	number of zbud pages in the pool.
> + * @ops:	pointer to a structure of user defined operations specified at
> + *		pool creation time.
> + *
> + * This structure is allocated at pool creation time and maintains metadata
> + * pertaining to a particular zbud pool.
> + */
> +struct zbud_pool {
> +	spinlock_t lock;
> +	struct list_head unbuddied[NCHUNKS];
> +	struct list_head buddied;
> +	struct list_head lru;
> +	u64 pages_nr;
> +	struct zbud_ops *ops;
> +};
> +
> +/*****************
> + * Helpers
> +*****************/
> +/* Just to make the code easier to read */
> +enum buddy {
> +	FIRST,
> +	LAST
> +};
> +
> +/* Converts an allocation size in bytes to size in zbud chunks */
> +static int size_to_chunks(int size)
> +{
> +	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
> +}
> +
> +#define for_each_unbuddied_list(_iter, _begin) \
> +	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
> +
> +/* Initializes a zbud page from a newly allocated page */
> +static struct zbud_page *init_zbud_page(struct page *page)
> +{
> +	struct zbud_page *zbpage = (struct zbud_page *)page;
> +	zbpage->first_chunks = 0;
> +	zbpage->last_chunks = 0;
> +	INIT_LIST_HEAD(&zbpage->buddy);
> +	INIT_LIST_HEAD(&zbpage->lru);
> +	return zbpage;
> +}
> +
> +/* Resets the struct page fields and frees the page */
> +static void free_zbud_page(struct zbud_page *zbpage)
> +{
> +	struct page *page = &zbpage->page;
> +	set_page_private(page, 0);
> +	page->mapping = NULL;
> +	page->index = 0;
> +	page_mapcount_reset(page);
> +	init_page_count(page);
> +	INIT_LIST_HEAD(&page->lru);
> +	__free_page(page);
> +}
> +
> +/*
> + * Encodes the handle of a particular buddy within a zbud page
> + * Pool lock should be held as this function accesses first|last_chunks
> + */
> +static unsigned long encode_handle(struct zbud_page *zbpage,
> +					enum buddy bud)
> +{
> +	unsigned long handle;
> +
> +	/*
> +	 * For now, the encoded handle is actually just the pointer to the data
> +	 * but this might not always be the case.  A little information hiding.
> +	 */
> +	handle = (unsigned long)page_address(&zbpage->page);
> +	if (bud == FIRST)
> +		return handle;
> +	handle += PAGE_SIZE - (zbpage->last_chunks  << CHUNK_SHIFT);
> +	return handle;
> +}
> +
> +/* Returns the zbud page where a given handle is stored */
> +static struct zbud_page *handle_to_zbud_page(unsigned long handle)
> +{
> +	return (struct zbud_page *)(virt_to_page(handle));
> +}
> +
> +/* Returns the number of free chunks in a zbud page */
> +static int num_free_chunks(struct zbud_page *zbpage)
> +{
> +	/*
> +	 * Rather than branch for different situations, just use the fact that
> +	 * free buddies have a length of zero to simplify everything.
> +	 */
> +	return NCHUNKS - zbpage->first_chunks - zbpage->last_chunks;
> +}
> +
> +/*****************
> + * API Functions
> +*****************/
> +/**
> + * zbud_create_pool() - create a new zbud pool
> + * @gfp:	gfp flags when allocating the zbud pool structure
> + * @ops:	user-defined operations for the zbud pool
> + *
> + * Return: pointer to the new zbud pool or NULL if the metadata allocation
> + * failed.
> + */
> +struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
> +{
> +	struct zbud_pool *pool;
> +	int i;
> +
> +	pool = kmalloc(sizeof(struct zbud_pool), gfp);
> +	if (!pool)
> +		return NULL;
> +	spin_lock_init(&pool->lock);
> +	for_each_unbuddied_list(i, 0)
> +		INIT_LIST_HEAD(&pool->unbuddied[i]);
> +	INIT_LIST_HEAD(&pool->buddied);
> +	INIT_LIST_HEAD(&pool->lru);
> +	pool->pages_nr = 0;
> +	pool->ops = ops;
> +	return pool;
> +}
> +
> +/**
> + * zbud_destroy_pool() - destroys an existing zbud pool
> + * @pool:	the zbud pool to be destroyed
> + *
> + * The pool should be emptied before this function is called.
> + */
> +void zbud_destroy_pool(struct zbud_pool *pool)
> +{
> +	kfree(pool);
> +}
> +
> +/**
> + * zbud_alloc() - allocates a region of a given size
> + * @pool:	zbud pool from which to allocate
> + * @size:	size in bytes of the desired allocation
> + * @gfp:	gfp flags used if the pool needs to grow
> + * @handle:	handle of the new allocation
> + *
> + * This function will attempt to find a free region in the pool large enough to
> + * satisfy the allocation request.  A search of the unbuddied lists is
> + * performed first. If no suitable free region is found, then a new page is
> + * allocated and added to the pool to satisfy the request.
> + *
> + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
> + * as zbud pool pages.
> + *
> + * Return: 0 if success and handle is set, otherwise -EINVAL is the size or
> + * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
> + * a new page.
> + */
> +int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
> +			unsigned long *handle)
> +{
> +	int chunks, i, freechunks;
> +	struct zbud_page *zbpage = NULL;
> +	enum buddy bud;
> +	struct page *page;
> +
> +	if (size <= 0 || gfp & __GFP_HIGHMEM)
> +		return -EINVAL;
> +	if (size > PAGE_SIZE)
> +		return -E2BIG;
> +	chunks = size_to_chunks(size);
> +	spin_lock(&pool->lock);
> +
> +	/* First, try to find an unbuddied zbpage. */
> +	zbpage = NULL;
> +	for_each_unbuddied_list(i, chunks) {
> +		if (!list_empty(&pool->unbuddied[i])) {
> +			zbpage = list_first_entry(&pool->unbuddied[i],
> +					struct zbud_page, buddy);
> +			list_del(&zbpage->buddy);
> +			if (zbpage->first_chunks == 0)
> +				bud = FIRST;
> +			else
> +				bud = LAST;
> +			goto found;
> +		}
> +	}
> +
> +	/* Couldn't find unbuddied zbpage, create new one */

How about trying a direct reclaim first?
Call zbud_reclaim_page() here instead of in frontswap_store(),
and reuse the reclaimed zbud page directly.

If that fails, then fall back to alloc_page()...
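
A rough sketch of that ordering inside zbud_alloc(); it assumes a hypothetical
variant of zbud_reclaim_page() that hands back the emptied zbud page instead of
freeing it, which this patch does not provide:

	/* Couldn't find an unbuddied zbpage; try reclaim before growing the pool */
	spin_unlock(&pool->lock);
	zbpage = zbud_reclaim_page_for_reuse(pool, 8);	/* hypothetical helper */
	if (zbpage) {
		spin_lock(&pool->lock);
		bud = FIRST;
		goto found;
	}
	page = alloc_page(gfp);
	if (!page)
		return -ENOMEM;
	spin_lock(&pool->lock);
	pool->pages_nr++;
	zbpage = init_zbud_page(page);
	bud = FIRST;
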

> +	spin_unlock(&pool->lock);
> +	page = alloc_page(gfp);
> +	if (!page)
> +		return -ENOMEM;
> +	spin_lock(&pool->lock);
> +	pool->pages_nr++;
> +	zbpage = init_zbud_page(page);
> +	bud = FIRST;
> +
> +found:
> +	if (bud == FIRST)
> +		zbpage->first_chunks = chunks;
> +	else
> +		zbpage->last_chunks = chunks;
> +
> +	if (zbpage->first_chunks == 0 || zbpage->last_chunks == 0) {
> +		/* Add to unbuddied list */
> +		freechunks = num_free_chunks(zbpage);
> +		list_add(&zbpage->buddy, &pool->unbuddied[freechunks]);
> +	} else {
> +		/* Add to buddied list */
> +		list_add(&zbpage->buddy, &pool->buddied);
> +	}
> +
> +	/* Add/move zbpage to beginning of LRU */
> +	if (!list_empty(&zbpage->lru))
> +		list_del(&zbpage->lru);
> +	list_add(&zbpage->lru, &pool->lru);
> +
> +	*handle = encode_handle(zbpage, bud);
> +	spin_unlock(&pool->lock);
> +
> +	return 0;
> +}
Andrew Morton May 28, 2013, 9:59 p.m. UTC | #2
On Mon, 20 May 2013 11:26:06 -0500 Seth Jennings <sjenning@linux.vnet.ibm.com> wrote:

> zbud is an special purpose allocator for storing compressed pages. It is
> designed to store up to two compressed pages per physical page.  While this
> design limits storage density, it has simple and deterministic reclaim
> properties that make it preferable to a higher density approach when reclaim
> will be used.
> 
> zbud works by storing compressed pages, or "zpages", together in pairs in a
> single memory page called a "zbud page".  The first buddy is "left
> justifed" at the beginning of the zbud page, and the last buddy is "right
> justified" at the end of the zbud page.  The benefit is that if either
> buddy is freed, the freed buddy space, coalesced with whatever slack space
> that existed between the buddies, results in the largest possible free region
> within the zbud page.
> 
> zbud also provides an attractive lower bound on density. The ratio of zpages
> to zbud pages can not be less than 1.  This ensures that zbud can never "do
> harm" by using more pages to store zpages than the uncompressed zpages would
> have used on their own.
> 
> This implementation is a rewrite of the zbud allocator internally used
> by zcache in the driver/staging tree.  The rewrite was necessary to
> remove some of the zcache specific elements that were ingrained throughout
> and provide a generic allocation interface that can later be used by
> zsmalloc and others.
> 
> This patch adds zbud to mm/ for later use by zswap.
> 
> ...
>
> +/**
> + * struct zbud_page - zbud page metadata overlay
> + * @page:	typed reference to the underlying struct page
> + * @donotuse:	this overlays the page flags and should not be used
> + * @first_chunks:	the size of the first buddy in chunks, 0 if free
> + * @last_chunks:	the size of the last buddy in chunks, 0 if free
> + * @buddy:	links the zbud page into the unbuddied/buddied lists in the pool
> + * @lru:	links the zbud page into the lru list in the pool
> + *
> + * This structure overlays the struct page to store metadata needed for a
> + * single storage page in for zbud.  There is a BUILD_BUG_ON in zbud_init()
> + * that ensures this structure is not larger that struct page.
> + *
> + * The PG_reclaim flag of the underlying page is used for indicating
> + * that this zbud page is under reclaim (see zbud_reclaim_page())
> + */
> +struct zbud_page {
> +	union {
> +		struct page page;
> +		struct {
> +			unsigned long donotuse;
> +			u16 first_chunks;
> +			u16 last_chunks;
> +			struct list_head buddy;
> +			struct list_head lru;
> +		};
> +	};
> +};

Whoa.  So zbud scribbles on existing pageframes?

Please tell us about this, in some detail.  How is it done and why is
this necessary?

Presumably the pageframe must be restored at some stage, so this code
has to be kept in sync with external unrelated changes to core MM?

Why was it implemented in this fashion rather than going into the main
`struct page' definition and adding the appropriate unionised fields?

I worry about any code which independently looks at the pageframe
tables and expects to find page structs there.  One example is probably
memory_failure() but there are probably others.

> 
> ...
>
> +int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
> +			unsigned long *handle)
> +{
> +	int chunks, i, freechunks;
> +	struct zbud_page *zbpage = NULL;
> +	enum buddy bud;
> +	struct page *page;
> +
> +	if (size <= 0 || gfp & __GFP_HIGHMEM)
> +		return -EINVAL;
> +	if (size > PAGE_SIZE)
> +		return -E2BIG;

Means "Argument list too long" and isn't appropriate here.

> +	chunks = size_to_chunks(size);
> +	spin_lock(&pool->lock);
> +
> +	/* First, try to find an unbuddied zbpage. */
> +	zbpage = NULL;
> +	for_each_unbuddied_list(i, chunks) {
> +		if (!list_empty(&pool->unbuddied[i])) {
> +			zbpage = list_first_entry(&pool->unbuddied[i],
> +					struct zbud_page, buddy);
> +			list_del(&zbpage->buddy);
> +			if (zbpage->first_chunks == 0)
> +				bud = FIRST;
> +			else
> +				bud = LAST;
> +			goto found;
> +		}
> +	}
> +
> +	/* Couldn't find unbuddied zbpage, create new one */
> +	spin_unlock(&pool->lock);
> +	page = alloc_page(gfp);
> +	if (!page)
> +		return -ENOMEM;
> +	spin_lock(&pool->lock);
> +	pool->pages_nr++;
> +	zbpage = init_zbud_page(page);
> +	bud = FIRST;
> +
> +found:
> +	if (bud == FIRST)
> +		zbpage->first_chunks = chunks;
> +	else
> +		zbpage->last_chunks = chunks;
> +
> +	if (zbpage->first_chunks == 0 || zbpage->last_chunks == 0) {
> +		/* Add to unbuddied list */
> +		freechunks = num_free_chunks(zbpage);
> +		list_add(&zbpage->buddy, &pool->unbuddied[freechunks]);
> +	} else {
> +		/* Add to buddied list */
> +		list_add(&zbpage->buddy, &pool->buddied);
> +	}
> +
> +	/* Add/move zbpage to beginning of LRU */
> +	if (!list_empty(&zbpage->lru))
> +		list_del(&zbpage->lru);
> +	list_add(&zbpage->lru, &pool->lru);
> +
> +	*handle = encode_handle(zbpage, bud);
> +	spin_unlock(&pool->lock);
> +
> +	return 0;
> +}
> 
> ...
>
> +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
> +{
> +	int i, ret, freechunks;
> +	struct zbud_page *zbpage;
> +	unsigned long first_handle = 0, last_handle = 0;
> +
> +	spin_lock(&pool->lock);
> +	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
> +			retries == 0) {
> +		spin_unlock(&pool->lock);
> +		return -EINVAL;
> +	}
> +	for (i = 0; i < retries; i++) {
> +		zbpage = list_tail_entry(&pool->lru, struct zbud_page, lru);
> +		list_del(&zbpage->lru);
> +		list_del(&zbpage->buddy);
> +		/* Protect zbpage against free */

Against free by who?  What other code paths can access this page at
this time?

> +		SetPageReclaim(&zbpage->page);
> +		/*
> +		 * We need encode the handles before unlocking, since we can
> +		 * race with free that will set (first|last)_chunks to 0
> +		 */
> +		first_handle = 0;
> +		last_handle = 0;
> +		if (zbpage->first_chunks)
> +			first_handle = encode_handle(zbpage, FIRST);
> +		if (zbpage->last_chunks)
> +			last_handle = encode_handle(zbpage, LAST);
> +		spin_unlock(&pool->lock);
> +
> +		/* Issue the eviction callback(s) */
> +		if (first_handle) {
> +			ret = pool->ops->evict(pool, first_handle);
> +			if (ret)
> +				goto next;
> +		}
> +		if (last_handle) {
> +			ret = pool->ops->evict(pool, last_handle);
> +			if (ret)
> +				goto next;
> +		}
> +next:
> +		spin_lock(&pool->lock);
> +		ClearPageReclaim(&zbpage->page);
> +		if (zbpage->first_chunks == 0 && zbpage->last_chunks == 0) {
> +			/*
> +			 * Both buddies are now free, free the zbpage and
> +			 * return success.
> +			 */
> +			free_zbud_page(zbpage);
> +			pool->pages_nr--;
> +			spin_unlock(&pool->lock);
> +			return 0;
> +		} else if (zbpage->first_chunks == 0 ||
> +				zbpage->last_chunks == 0) {
> +			/* add to unbuddied list */
> +			freechunks = num_free_chunks(zbpage);
> +			list_add(&zbpage->buddy, &pool->unbuddied[freechunks]);
> +		} else {
> +			/* add to buddied list */
> +			list_add(&zbpage->buddy, &pool->buddied);
> +		}
> +
> +		/* add to beginning of LRU */
> +		list_add(&zbpage->lru, &pool->lru);
> +	}
> +	spin_unlock(&pool->lock);
> +	return -EAGAIN;
> +}
> 
> ...
>

Seth Jennings May 29, 2013, 3:45 p.m. UTC | #3
On Tue, May 28, 2013 at 02:59:11PM -0700, Andrew Morton wrote:
> On Mon, 20 May 2013 11:26:06 -0500 Seth Jennings <sjenning@linux.vnet.ibm.com> wrote:
> 
> > zbud is an special purpose allocator for storing compressed pages. It is
> > designed to store up to two compressed pages per physical page.  While this
> > design limits storage density, it has simple and deterministic reclaim
> > properties that make it preferable to a higher density approach when reclaim
> > will be used.
> > 
> > zbud works by storing compressed pages, or "zpages", together in pairs in a
> > single memory page called a "zbud page".  The first buddy is "left
> > justifed" at the beginning of the zbud page, and the last buddy is "right
> > justified" at the end of the zbud page.  The benefit is that if either
> > buddy is freed, the freed buddy space, coalesced with whatever slack space
> > that existed between the buddies, results in the largest possible free region
> > within the zbud page.
> > 
> > zbud also provides an attractive lower bound on density. The ratio of zpages
> > to zbud pages can not be less than 1.  This ensures that zbud can never "do
> > harm" by using more pages to store zpages than the uncompressed zpages would
> > have used on their own.
> > 
> > This implementation is a rewrite of the zbud allocator internally used
> > by zcache in the driver/staging tree.  The rewrite was necessary to
> > remove some of the zcache specific elements that were ingrained throughout
> > and provide a generic allocation interface that can later be used by
> > zsmalloc and others.
> > 
> > This patch adds zbud to mm/ for later use by zswap.
> > 
> > ...
> >
> > +/**
> > + * struct zbud_page - zbud page metadata overlay
> > + * @page:	typed reference to the underlying struct page
> > + * @donotuse:	this overlays the page flags and should not be used
> > + * @first_chunks:	the size of the first buddy in chunks, 0 if free
> > + * @last_chunks:	the size of the last buddy in chunks, 0 if free
> > + * @buddy:	links the zbud page into the unbuddied/buddied lists in the pool
> > + * @lru:	links the zbud page into the lru list in the pool
> > + *
> > + * This structure overlays the struct page to store metadata needed for a
> > + * single storage page in for zbud.  There is a BUILD_BUG_ON in zbud_init()
> > + * that ensures this structure is not larger that struct page.
> > + *
> > + * The PG_reclaim flag of the underlying page is used for indicating
> > + * that this zbud page is under reclaim (see zbud_reclaim_page())
> > + */
> > +struct zbud_page {
> > +	union {
> > +		struct page page;
> > +		struct {
> > +			unsigned long donotuse;
> > +			u16 first_chunks;
> > +			u16 last_chunks;
> > +			struct list_head buddy;
> > +			struct list_head lru;
> > +		};
> > +	};
> > +};
> 
> Whoa.  So zbud scribbles on existing pageframes?

Yes.

> 
> Please tell us about this, in some detail.  How is it done and why is
> this necessary?
> 
> Presumably the pageframe must be restored at some stage, so this code
> has to be kept in sync with external unrelated changes to core MM?

Yes, this is done in free_zbud_page().

> 
> Why was it implemented in this fashion rather than going into the main
> `struct page' definition and adding the appropriate unionised fields?

Yes, modifying the struct page is the cleaner way.  I thought that adding more
convolution to struct page would create more friction on the path to getting
this merged.  Plus overlaying the struct page was the approach used by zsmalloc
and so I was thinking more along these lines.

If you'd rather add the zbud fields directly into unions in struct page,
I'm ok with that if you are.

Of course, this doesn't avoid having to reset the fields for the page allocator
before we free them.  Even slub/slob reset the mapcount before calling
__free_page(), for example.

> 
> I worry about any code which independently looks at the pageframe
> tables and expects to find page struts there.  One example is probably
> memory_failure() but there are probably others.
> 
> > 
> > ...
> >
> > +int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
> > +			unsigned long *handle)
> > +{
> > +	int chunks, i, freechunks;
> > +	struct zbud_page *zbpage = NULL;
> > +	enum buddy bud;
> > +	struct page *page;
> > +
> > +	if (size <= 0 || gfp & __GFP_HIGHMEM)
> > +		return -EINVAL;
> > +	if (size > PAGE_SIZE)
> > +		return -E2BIG;
> 
> Means "Argument list too long" and isn't appropriate here.

Ok, I need a return value other than -EINVAL to convey to the user that the
allocation is larger than what the allocator can hold. I don't see an existing
errno that would be more suited for that.  Do you have a suggestion?

> 
> > +	chunks = size_to_chunks(size);
> > +	spin_lock(&pool->lock);
> > +
> > +	/* First, try to find an unbuddied zbpage. */
> > +	zbpage = NULL;
> > +	for_each_unbuddied_list(i, chunks) {
> > +		if (!list_empty(&pool->unbuddied[i])) {
> > +			zbpage = list_first_entry(&pool->unbuddied[i],
> > +					struct zbud_page, buddy);
> > +			list_del(&zbpage->buddy);
> > +			if (zbpage->first_chunks == 0)
> > +				bud = FIRST;
> > +			else
> > +				bud = LAST;
> > +			goto found;
> > +		}
> > +	}
> > +
> > +	/* Couldn't find unbuddied zbpage, create new one */
> > +	spin_unlock(&pool->lock);
> > +	page = alloc_page(gfp);
> > +	if (!page)
> > +		return -ENOMEM;
> > +	spin_lock(&pool->lock);
> > +	pool->pages_nr++;
> > +	zbpage = init_zbud_page(page);
> > +	bud = FIRST;
> > +
> > +found:
> > +	if (bud == FIRST)
> > +		zbpage->first_chunks = chunks;
> > +	else
> > +		zbpage->last_chunks = chunks;
> > +
> > +	if (zbpage->first_chunks == 0 || zbpage->last_chunks == 0) {
> > +		/* Add to unbuddied list */
> > +		freechunks = num_free_chunks(zbpage);
> > +		list_add(&zbpage->buddy, &pool->unbuddied[freechunks]);
> > +	} else {
> > +		/* Add to buddied list */
> > +		list_add(&zbpage->buddy, &pool->buddied);
> > +	}
> > +
> > +	/* Add/move zbpage to beginning of LRU */
> > +	if (!list_empty(&zbpage->lru))
> > +		list_del(&zbpage->lru);
> > +	list_add(&zbpage->lru, &pool->lru);
> > +
> > +	*handle = encode_handle(zbpage, bud);
> > +	spin_unlock(&pool->lock);
> > +
> > +	return 0;
> > +}
> > 
> > ...
> >
> > +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
> > +{
> > +	int i, ret, freechunks;
> > +	struct zbud_page *zbpage;
> > +	unsigned long first_handle = 0, last_handle = 0;
> > +
> > +	spin_lock(&pool->lock);
> > +	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
> > +			retries == 0) {
> > +		spin_unlock(&pool->lock);
> > +		return -EINVAL;
> > +	}
> > +	for (i = 0; i < retries; i++) {
> > +		zbpage = list_tail_entry(&pool->lru, struct zbud_page, lru);
> > +		list_del(&zbpage->lru);
> > +		list_del(&zbpage->buddy);
> > +		/* Protect zbpage against free */
> 
> Against free by who?  What other code paths can access this page at
> this time?

zbud has no way of serializing with the user (zswap) to prevent it from calling
zbud_free() during zbud reclaim.  To prevent the zbud page from being freed
while reclaim is operating on it, we set the reclaim flag in the struct page.
zbud_free() checks this flag and, if set, only sets the chunk length of the
allocation to 0, but does not actually free the zbud page.  That is left to
this reclaim path.
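
Roughly, the zbud_free() side of that protocol looks like this (a condensed
sketch of the logic described above, not the verbatim function from the patch):

	spin_lock(&pool->lock);
	zbpage = handle_to_zbud_page(handle);

	/* mark the buddy free by zeroing its chunk count */
	if (handle & ~PAGE_MASK)
		zbpage->last_chunks = 0;
	else
		zbpage->first_chunks = 0;

	if (PageReclaim(&zbpage->page)) {
		/* under reclaim: leave the zbud page for the reclaim path to free */
		spin_unlock(&pool->lock);
		return;
	}

	list_del(&zbpage->buddy);
	if (zbpage->first_chunks == 0 && zbpage->last_chunks == 0) {
		/* both buddies free: give the page back */
		list_del(&zbpage->lru);
		free_zbud_page(zbpage);
		pool->pages_nr--;
	} else {
		/* re-add to the unbuddied list for its new free size */
		freechunks = num_free_chunks(zbpage);
		list_add(&zbpage->buddy, &pool->unbuddied[freechunks]);
	}
	spin_unlock(&pool->lock);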

Seth

Andrew Morton May 29, 2013, 6:34 p.m. UTC | #4
On Wed, 29 May 2013 10:45:00 -0500 Seth Jennings <sjenning@linux.vnet.ibm.com> wrote:

> > > +struct zbud_page {
> > > +	union {
> > > +		struct page page;
> > > +		struct {
> > > +			unsigned long donotuse;
> > > +			u16 first_chunks;
> > > +			u16 last_chunks;
> > > +			struct list_head buddy;
> > > +			struct list_head lru;
> > > +		};
> > > +	};
> > > +};
> > 
> > Whoa.  So zbud scribbles on existing pageframes?
> 
> Yes.
> 
> > 
> > Please tell us about this, in some detail.  How is it done and why is
> > this necessary?
> > 
> > Presumably the pageframe must be restored at some stage, so this code
> > has to be kept in sync with external unrelated changes to core MM?
> 
> Yes, this is done in free_zbud_page().
> 
> > 
> > Why was it implemented in this fashion rather than going into the main
> > `struct page' definition and adding the appropriate unionised fields?
> 
> Yes, modifying the struct page is the cleaner way.  I thought that adding more
> convolution to struct page would create more friction on the path to getting
> this merged.  Plus overlaying the struct page was the approach used by zsmalloc
> and so I was thinking more along these lines.

I'd be interested in seeing what the modifications to struct page look
like.  It really is the better way.

> If you'd rather add the zbud fields directly into unions in struct page,
> I'm ok with that if you are.
> 
> Of course, this doesn't avoid having to reset the fields for the page allocator
> before we free them.  Even slub/slob reset the mapcount before calling
> __free_page(), for example.
> 
> > 
> > I worry about any code which independently looks at the pageframe
> > tables and expects to find page struts there.  One example is probably
> > memory_failure() but there are probably others.

^^ this, please.  It could be kinda fatal.

> > > 
> > > ...
> > >
> > > +int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
> > > +			unsigned long *handle)
> > > +{
> > > +	int chunks, i, freechunks;
> > > +	struct zbud_page *zbpage = NULL;
> > > +	enum buddy bud;
> > > +	struct page *page;
> > > +
> > > +	if (size <= 0 || gfp & __GFP_HIGHMEM)
> > > +		return -EINVAL;
> > > +	if (size > PAGE_SIZE)
> > > +		return -E2BIG;
> > 
> > Means "Argument list too long" and isn't appropriate here.
> 
> Ok, I need a return value other than -EINVAL to convey to the user that the
> allocation is larger than what the allocator can hold. I don't see an existing
> errno that would be more suited for that.  Do you have a suggestion?

ENOMEM perhaps.  That's also somewhat misleading, but I guess there's
precedent for ENOMEM meaning "allocation too large" as well as "out
of memory".

> > > +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
> > > +{
> > > +	int i, ret, freechunks;
> > > +	struct zbud_page *zbpage;
> > > +	unsigned long first_handle = 0, last_handle = 0;
> > > +
> > > +	spin_lock(&pool->lock);
> > > +	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
> > > +			retries == 0) {
> > > +		spin_unlock(&pool->lock);
> > > +		return -EINVAL;
> > > +	}
> > > +	for (i = 0; i < retries; i++) {
> > > +		zbpage = list_tail_entry(&pool->lru, struct zbud_page, lru);
> > > +		list_del(&zbpage->lru);
> > > +		list_del(&zbpage->buddy);
> > > +		/* Protect zbpage against free */
> > 
> > Against free by who?  What other code paths can access this page at
> > this time?
> 
> zbud has no way of serializing with the user (zswap) to prevent it calling
> zbud_free() during zbud reclaim.  To prevent the zbud page from being freed
> while reclaim is operating on it, we set the reclaim flag in the struct page.
> zbud_free() checks this flag and, if set, only sets the chunk length of the
> allocation to 0, but does not actually free the zbud page.  That is left to
> this reclaim path.

Sounds strange.  Page refcounting is a well-established protocol and
works well in other places?

Seth Jennings May 29, 2013, 8:42 p.m. UTC | #5
On Wed, May 29, 2013 at 11:34:34AM -0700, Andrew Morton wrote:
> On Wed, 29 May 2013 10:45:00 -0500 Seth Jennings <sjenning@linux.vnet.ibm.com> wrote:
> 
> > > > +struct zbud_page {
> > > > +	union {
> > > > +		struct page page;
> > > > +		struct {
> > > > +			unsigned long donotuse;
> > > > +			u16 first_chunks;
> > > > +			u16 last_chunks;
> > > > +			struct list_head buddy;
> > > > +			struct list_head lru;
> > > > +		};
> > > > +	};
> > > > +};
> > > 
> > > Whoa.  So zbud scribbles on existing pageframes?
> > 
> > Yes.
> > 
> > > 
> > > Please tell us about this, in some detail.  How is it done and why is
> > > this necessary?
> > > 
> > > Presumably the pageframe must be restored at some stage, so this code
> > > has to be kept in sync with external unrelated changes to core MM?
> > 
> > Yes, this is done in free_zbud_page().
> > 
> > > 
> > > Why was it implemented in this fashion rather than going into the main
> > > `struct page' definition and adding the appropriate unionised fields?
> > 
> > Yes, modifying the struct page is the cleaner way.  I thought that adding more
> > convolution to struct page would create more friction on the path to getting
> > this merged.  Plus overlaying the struct page was the approach used by zsmalloc
> > and so I was thinking more along these lines.
> 
> I'd be interested in seeing what the modifications to struct page look
> like.  It really is the better way.

I'll do it then.

> 
> > If you'd rather add the zbud fields directly into unions in struct page,
> > I'm ok with that if you are.
> > 
> > Of course, this doesn't avoid having to reset the fields for the page allocator
> > before we free them.  Even slub/slob reset the mapcount before calling
> > __free_page(), for example.
> > 
> > > 
> > > I worry about any code which independently looks at the pageframe
> > > tables and expects to find page struts there.  One example is probably
> > > memory_failure() but there are probably others.
> 
> ^^ this, please.  It could be kinda fatal.

I'll look into this.

The expected behavior is that memory_failure() should handle zbud pages in the
same way that it handles in-use slub/slab/slob pages and return -EBUSY.

> 
> > > > 
> > > > ...
> > > >
> > > > +int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
> > > > +			unsigned long *handle)
> > > > +{
> > > > +	int chunks, i, freechunks;
> > > > +	struct zbud_page *zbpage = NULL;
> > > > +	enum buddy bud;
> > > > +	struct page *page;
> > > > +
> > > > +	if (size <= 0 || gfp & __GFP_HIGHMEM)
> > > > +		return -EINVAL;
> > > > +	if (size > PAGE_SIZE)
> > > > +		return -E2BIG;
> > > 
> > > Means "Argument list too long" and isn't appropriate here.
> > 
> > Ok, I need a return value other than -EINVAL to convey to the user that the
> > allocation is larger than what the allocator can hold. I don't see an existing
> > errno that would be more suited for that.  Do you have a suggestion?
> 
> ENOMEM perhaps.  That's also somewhat misleading, but I guess there's
> precedent for ENOMEM meaning "allocation too large" as well as "out
> of memory".

Works for me.

> 
> > > > +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
> > > > +{
> > > > +	int i, ret, freechunks;
> > > > +	struct zbud_page *zbpage;
> > > > +	unsigned long first_handle = 0, last_handle = 0;
> > > > +
> > > > +	spin_lock(&pool->lock);
> > > > +	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
> > > > +			retries == 0) {
> > > > +		spin_unlock(&pool->lock);
> > > > +		return -EINVAL;
> > > > +	}
> > > > +	for (i = 0; i < retries; i++) {
> > > > +		zbpage = list_tail_entry(&pool->lru, struct zbud_page, lru);
> > > > +		list_del(&zbpage->lru);
> > > > +		list_del(&zbpage->buddy);
> > > > +		/* Protect zbpage against free */
> > > 
> > > Against free by who?  What other code paths can access this page at
> > > this time?
> > 
> > zbud has no way of serializing with the user (zswap) to prevent it calling
> > zbud_free() during zbud reclaim.  To prevent the zbud page from being freed
> > while reclaim is operating on it, we set the reclaim flag in the struct page.
> > zbud_free() checks this flag and, if set, only sets the chunk length of the
> > allocation to 0, but does not actually free the zbud page.  That is left to
> > this reclaim path.
> 
> Sounds strange.  Page refcounting is a well-established protocol and
> works well in other places?

Yes, refcounting seemed like overkill for this situation since the refcount
will only ever be 1 or 2 (2 if under reclaim) which basically reduces it to a
boolean. I'm also not sure if there is room left in the struct page for a
refcount with all the existing zbud metadata.

However, if you really don't like this, I can look at doing it via refcounts.

Seth

Seth Jennings May 29, 2013, 8:45 p.m. UTC | #6
On Wed, May 29, 2013 at 11:34:34AM -0700, Andrew Morton wrote:
> > > > +	if (size <= 0 || gfp & __GFP_HIGHMEM)
> > > > +		return -EINVAL;
> > > > +	if (size > PAGE_SIZE)
> > > > +		return -E2BIG;
> > > 
> > > Means "Argument list too long" and isn't appropriate here.
> > 
> > Ok, I need a return value other than -EINVAL to convey to the user that the
> > allocation is larger than what the allocator can hold. I don't see an existing
> > errno that would be more suited for that.  Do you have a suggestion?
> 
> ENOMEM perhaps.  That's also somewhat misleading, but I guess there's
> precedent for ENOMEM meaning "allocation too large" as well as "out
> of memory".

Ah, spoke too soon. ENOMEM is already being used to indicate that an allocation
to grow the pool failed.

Seth

Andrew Morton May 29, 2013, 8:48 p.m. UTC | #7
On Wed, 29 May 2013 15:42:36 -0500 Seth Jennings <sjenning@linux.vnet.ibm.com> wrote:

> > > > I worry about any code which independently looks at the pageframe
> > > > tables and expects to find page struts there.  One example is probably
> > > > memory_failure() but there are probably others.
> > 
> > ^^ this, please.  It could be kinda fatal.
> 
> I'll look into this.
> 
> The expected behavior is that memory_failure() should handle zbud pages in the
> same way that it handles in-use slub/slab/slob pages and return -EBUSY.

memory_failure() is merely an example of a general problem: code which
reads from the memmap[] array and expects its elements to be of type
`struct page'.  Other examples might be memory hotplugging, memory leak
checkers etc.  I have vague memories of out-of-tree patches
(bigphysarea?) doing this as well.

It's a general problem to which we need a general solution.
Dan Magenheimer May 29, 2013, 9:09 p.m. UTC | #8
> From: Andrew Morton [mailto:akpm@linux-foundation.org]
> Subject: Re: [PATCHv12 2/4] zbud: add to mm/
> 
> On Wed, 29 May 2013 15:42:36 -0500 Seth Jennings <sjenning@linux.vnet.ibm.com> wrote:
> 
> > > > > I worry about any code which independently looks at the pageframe
> > > > > tables and expects to find page struts there.  One example is probably
> > > > > memory_failure() but there are probably others.
> > >
> > > ^^ this, please.  It could be kinda fatal.
> >
> > I'll look into this.
> >
> > The expected behavior is that memory_failure() should handle zbud pages in the
> > same way that it handles in-use slub/slab/slob pages and return -EBUSY.
> 
> memory_failure() is merely an example of a general problem: code which
> reads from the memmap[] array and expects its elements to be of type
> `struct page'.  Other examples might be memory hotplugging, memory leak
> checkers etc.  I have vague memories of out-of-tree patches
> (bigphysarea?) doing this as well.
> 
> It's a general problem to which we need a general solution.

<Obi-tmem Kenobe slowly materializes... "use the force, Luke!">

One could reasonably argue that any code that makes incorrect
assumptions about the contents of a struct page structure is buggy
and should be fixed.  Isn't the "general solution" already described
in the following comment, excerpted from include/linux/mm.h, which
implies that "scribbling on existing pageframes" [carefully], is fine?
(And, if not, shouldn't that comment be fixed, or am I misreading
it?)

<start excerpt>
 * For the non-reserved pages, page_count(page) denotes a reference count.
 *   page_count() == 0 means the page is free. page->lru is then used for
 *   freelist management in the buddy allocator.
 *   page_count() > 0  means the page has been allocated.
 *
 * Pages are allocated by the slab allocator in order to provide memory
 * to kmalloc and kmem_cache_alloc. In this case, the management of the
 * page, and the fields in 'struct page' are the responsibility of mm/slab.c
 * unless a particular usage is carefully commented. (the responsibility of
 * freeing the kmalloc memory is the caller's, of course).
 *
 * A page may be used by anyone else who does a __get_free_page().
 * In this case, page_count still tracks the references, and should only
 * be used through the normal accessor functions. The top bits of page->flags
 * and page->virtual store page management information, but all other fields
 * are unused and could be used privately, carefully. The management of this
 * page is the responsibility of the one who allocated it, and those who have
 * subsequently been given references to it.
 *
 * The other pages (we may call them "pagecache pages") are completely
 * managed by the Linux memory manager: I/O, buffers, swapping etc.
 * The following discussion applies only to them.
<end excerpt>

<Obi-tmem Kenobe slowly dematerializes>
Andrew Morton May 29, 2013, 9:29 p.m. UTC | #9
On Wed, 29 May 2013 14:09:02 -0700 (PDT) Dan Magenheimer <dan.magenheimer@oracle.com> wrote:

> > memory_failure() is merely an example of a general problem: code which
> > reads from the memmap[] array and expects its elements to be of type
> > `struct page'.  Other examples might be memory hotplugging, memory leak
> > checkers etc.  I have vague memories of out-of-tree patches
> > (bigphysarea?) doing this as well.
> > 
> > It's a general problem to which we need a general solution.
> 
> <Obi-tmem Kenobe slowly materializes... "use the force, Luke!">
> 
> One could reasonably argue that any code that makes incorrect
> assumptions about the contents of a struct page structure is buggy
> and should be fixed.

Well it has type "struct page" and all code has a right to expect the
contents to match that type.

>  Isn't the "general solution" already described
> in the following comment, excerpted from include/linux/mm.h, which
> implies that "scribbling on existing pageframes" [carefully], is fine?
> (And, if not, shouldn't that comment be fixed, or am I misreading
> it?)
> 
> <start excerpt>
>  * For the non-reserved pages, page_count(page) denotes a reference count.
>  *   page_count() == 0 means the page is free. page->lru is then used for
>  *   freelist management in the buddy allocator.
>  *   page_count() > 0  means the page has been allocated.

Well kinda maybe.  How all the random memmap-peekers handle this I do
not know.  Setting PageReserved is a big hammer which should keep other
little paws out of there, although I guess it's abusive of whatever
PageReserved is supposed to mean.

It's what we used to call a variant record.  The tag is page.flags and
the protocol is, umm,

PageReserved: doesn't refer to a page at all - don't touch
PageSlab: belongs to slab or slub
!PageSlab: regular kernel/user/pagecache page

Are there any more?

So what to do here?  How about

- Position the zbud fields within struct page via the preferred
  means: editing its definition.

- Decide upon and document the means by which the zbud variant is tagged

- Demonstrate how this is safe against existing memmap-peekers

- Do all this without consuming another page flag :)
Seth Jennings May 30, 2013, 5:43 p.m. UTC | #10
On Wed, May 29, 2013 at 02:29:04PM -0700, Andrew Morton wrote:
> On Wed, 29 May 2013 14:09:02 -0700 (PDT) Dan Magenheimer <dan.magenheimer@oracle.com> wrote:
> 
> > > memory_failure() is merely an example of a general problem: code which
> > > reads from the memmap[] array and expects its elements to be of type
> > > `struct page'.  Other examples might be memory hotplugging, memory leak
> > > checkers etc.  I have vague memories of out-of-tree patches
> > > (bigphysarea?) doing this as well.
> > > 
> > > It's a general problem to which we need a general solution.
> > 
> > <Obi-tmem Kenobe slowly materializes... "use the force, Luke!">
> > 
> > One could reasonably argue that any code that makes incorrect
> > assumptions about the contents of a struct page structure is buggy
> > and should be fixed.
> 
> Well it has type "struct page" and all code has a right to expect the
> contents to match that type.
> 
> >  Isn't the "general solution" already described
> > in the following comment, excerpted from include/linux/mm.h, which
> > implies that "scribbling on existing pageframes" [carefully], is fine?
> > (And, if not, shouldn't that comment be fixed, or am I misreading
> > it?)
> > 
> > <start excerpt>
> >  * For the non-reserved pages, page_count(page) denotes a reference count.
> >  *   page_count() == 0 means the page is free. page->lru is then used for
> >  *   freelist management in the buddy allocator.
> >  *   page_count() > 0  means the page has been allocated.
> 
> Well kinda maybe.  How all the random memmap-peekers handle this I do
> not know.  Setting PageReserved is a big hammer which should keep other
> little paws out of there, although I guess it's abusive of whatever
> PageReserved is supposed to mean.
> 
> It's what we used to call a variant record.  The tag is page.flags and
> the protocol is, umm,
> 
> PageReserved: doesn't refer to a page at all - don't touch
> PageSlab: belongs to slab or slub
> !PageSlab: regular kernel/user/pagecache page

In the !PageSlab case, the page _count has to be considered to determine if the
page is a free page or if it is an allocated non-slab page.

So looking at the fields that need to remain untouched in the struct page for
the memmap-peekers, they are
- page->flags
- page->_count

Is this correct?

> 
> Are there any more?
> 
> So what to do here?  How about
> 
> - Position the zbud fields within struct page via the preferred
>   means: editing its definition.
> 
> - Decide upon and document the means by which the zbud variant is tagged

I'm not sure if there is going to be a way to tag zbud pages in particular
without using a page flag.  However, if we can tag it as a non-slab allocated
kernel page with no userspace mappings, that could be sufficient.  I think this
can be done with:

!PageSlab(p) && page_count(p) > 0 && page_mapcount(p) <= 0

An alternative is to set PG_slab for zbud pages then we get all the same
treatment as slab pages, which is basically what we want. Setting PG_slab
also conveys that no assumption can be made about the contents of _mapcount.

However, a memmap-peeker could call slab functions on the page which obviously
won't be under the control of the slab allocator. Afaict though, it doesn't
seem that any of them do this since there aren't any functions in the slab
allocator API that take raw struct pages.  The worst I've seen is calling
shrink_slab in an effort to get the slab allocator to free up the page.

In summary, I think that maintaining a positive page->_count and setting
PG_slab on zbud pages should provide safety against existing memmap-peekers.
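
As a concrete sketch of that scheme (the helper names here are hypothetical,
not part of the patch):

/* when a freshly allocated page is placed under zbud control */
static void zbud_claim_page(struct page *page)
{
	/*
	 * page_count() is already 1 from alloc_page(), which marks the page
	 * as allocated to any memmap-peeker; setting PG_slab additionally
	 * makes peekers treat it like an in-use slab page and signals that
	 * _mapcount carries no meaning here.
	 */
	__SetPageSlab(page);
}

/* before handing the page back to the buddy allocator */
static void zbud_release_page(struct page *page)
{
	__ClearPageSlab(page);
	__free_page(page);
}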

Do you agree?

Seth

> 
> - Demonstrate how this is safe against existing memmap-peekers
> 
> - Do all this without consuming another page flag :)
> 

Seth Jennings May 30, 2013, 9:20 p.m. UTC | #11
Andrew, Mel,

This struct page stuffing is taking a lot of time to work out and _might_ be
fraught with peril when memmap peekers are considered.

What do you think about just storing the zbud page metadata inline, in the
first chunk of the zbud page itself (see the sketch below)?

Mel, this kinda hurts your plans for making NCHUNKS = 2, since there would
only be one chunk available for storage and would make zbud useless.

Just a way to sidestep this whole issue.  What do you think?
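
For illustration, an inline layout might reserve the first chunk of every zbud
page for a header along these lines (hypothetical, not part of the posted patch):

/* hypothetical in-page header, stored in the first chunk of a zbud page */
struct zbud_header {
	u16 first_chunks;	/* size of first buddy in chunks, 0 if free */
	u16 last_chunks;	/* size of last buddy in chunks, 0 if free */
	struct list_head buddy;	/* unbuddied/buddied list linkage */
	struct list_head lru;	/* pool LRU linkage */
};

static struct zbud_header *page_to_zbud_header(struct page *page)
{
	/* the header lives at the start of the page; buddies follow it */
	return (struct zbud_header *)page_address(page);
}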

Seth

Bob Liu May 31, 2013, 1:48 a.m. UTC | #12
Hi Seth,

On 05/31/2013 05:20 AM, Seth Jennings wrote:
> Andrew, Mel,
> 
> This struct page stuffing is taking a lot of time to work out and _might_ be
> fraught with peril when memmap peekers are considered.
> 
> What do you think about just storing the zbud page metadata inline in the
> memory page in the first zbud page chunk?

How about basing zswap on SLAB? Create a PAGE_SIZE slab cache and, when
zswap needs a page, use kmem_cache_alloc() instead of alloc_page().

That would leave the SLAB layer to handle the NUMA problem and manage a
dynamic pool size for us.

In this way, an extra struct would be needed to manage the zbud page
metadata instead of using struct page.
But I think that's easy and won't take much memory.
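
A rough sketch of that direction (all names here are invented for illustration):

static struct kmem_cache *zbud_page_cache;

/* hypothetical metadata kept outside the storage page itself */
struct zbud_meta {
	void *pagemem;			/* PAGE_SIZE object from the cache */
	u16 first_chunks;
	u16 last_chunks;
	struct list_head buddy;
	struct list_head lru;
};

static int zbud_cache_init(void)
{
	/* PAGE_SIZE objects, PAGE_SIZE aligned, so each object is a whole page */
	zbud_page_cache = kmem_cache_create("zbud_pages", PAGE_SIZE,
					    PAGE_SIZE, 0, NULL);
	return zbud_page_cache ? 0 : -ENOMEM;
}

static struct zbud_meta *zbud_meta_alloc(gfp_t gfp)
{
	struct zbud_meta *meta = kzalloc(sizeof(*meta), gfp);

	if (!meta)
		return NULL;
	meta->pagemem = kmem_cache_alloc(zbud_page_cache, gfp);
	if (!meta->pagemem) {
		kfree(meta);
		return NULL;
	}
	INIT_LIST_HEAD(&meta->buddy);
	INIT_LIST_HEAD(&meta->lru);
	return meta;
}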

> 
> Mel, this kinda hurts you plans for making NCHUNKS = 2, since there would
> only be one chunk available for storage and would make zbud useless.
> 
> Just a way to sidestep this whole issue.  What do you think?
> 
> Seth
>
Konrad Rzeszutek Wilk June 3, 2013, 1:48 p.m. UTC | #13
On Thu, May 30, 2013 at 12:43:44PM -0500, Seth Jennings wrote:
> On Wed, May 29, 2013 at 02:29:04PM -0700, Andrew Morton wrote:
> > On Wed, 29 May 2013 14:09:02 -0700 (PDT) Dan Magenheimer <dan.magenheimer@oracle.com> wrote:
> > 
> > > > memory_failure() is merely an example of a general problem: code which
> > > > reads from the memmap[] array and expects its elements to be of type
> > > > `struct page'.  Other examples might be memory hotplugging, memory leak
> > > > checkers etc.  I have vague memories of out-of-tree patches
> > > > (bigphysarea?) doing this as well.
> > > > 
> > > > It's a general problem to which we need a general solution.
> > > 
> > > <Obi-tmem Kenobe slowly materializes... "use the force, Luke!">
> > > 
> > > One could reasonably argue that any code that makes incorrect
> > > assumptions about the contents of a struct page structure is buggy
> > > and should be fixed.
> > 
> > Well it has type "struct page" and all code has a right to expect the
> > contents to match that type.
> > 
> > >  Isn't the "general solution" already described
> > > in the following comment, excerpted from include/linux/mm.h, which
> > > implies that "scribbling on existing pageframes" [carefully], is fine?
> > > (And, if not, shouldn't that comment be fixed, or am I misreading
> > > it?)
> > > 
> > > <start excerpt>
> > >  * For the non-reserved pages, page_count(page) denotes a reference count.
> > >  *   page_count() == 0 means the page is free. page->lru is then used for
> > >  *   freelist management in the buddy allocator.
> > >  *   page_count() > 0  means the page has been allocated.
> > 
> > Well kinda maybe.  How all the random memmap-peekers handle this I do
> > not know.  Setting PageReserved is a big hammer which should keep other
> > little paws out of there, although I guess it's abusive of whatever
> > PageReserved is supposed to mean.
> > 
> > It's what we used to call a variant record.  The tag is page.flags and
> > the protocol is, umm,
> > 
> > PageReserved: doesn't refer to a page at all - don't touch
> > PageSlab: belongs to slab or slub
> > !PageSlab: regular kernel/user/pagecache page
> 
> In the !PageSlab case, the page _count has to be considered to determine if the
> page is a free page or if it is an allocated non-slab page.
> 
> So looking at the fields that need to remain untouched in the struct page for
> the memmap-peekers, they are
> - page->flags
> - page->_count
> 
> Is this correct?
> 
> > 
> > Are there any more?
> > 
> > So what to do here?  How about
> > 
> > - Position the zbud fields within struct page via the preferred
> >   means: editing its definition.
> > 
> > - Decide upon and document the means by which the zbud variant is tagged
> 
> I'm not sure if there is going to be a way to tag zbud pages in particular
> without using a page flag.  However, if we can tag it as a non-slab allocated
> kernel page with no userspace mappings, that could be sufficient.  I think this
> can be done with:
> 
> !PageSlab(p) && page_count(p) > 0 && page_mapcount(p) <= 0
> 
> An alternative is to set PG_slab for zbud pages then we get all the same
> treatment as slab pages, which is basically what we want. Setting PG_slab
> also conveys that no assumption can be made about the contents of _mapcount.
> 
> However, a memmap-peeker could call slab functions on the page which obviously
> won't be under the control of the slab allocator. Afaict though, it doesn't
> seem that any of them do this since there aren't any functions in the slab
> allocator API that take raw struct pages.  The worst I've seen is calling
> shrink_slab in an effort to get the slab allocator to free up the page.

And does it error out properly on non-slab pages that have PG_slab set?
> 
> In summary, I think that maintaining a positive page->_count and setting
> PG_slab on zbud pages should provide safety against existing memmap-peekers.
> 
> Do you agree?

The page->_count will thwart memmap-peekers from fiddling around, right?

> 
> Seth
> 
> > 
> > - Demonstrate how this is safe against existing memmap-peekers
> > 
> > - Do all this without consuming another page flag :)
> > 
> 
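
The two tagging options discussed above could look roughly like this (a sketch
only; the helper names are hypothetical and nothing here is in the posted
patch):

#include <linux/mm.h>
#include <linux/page-flags.h>

/*
 * Option 1: leave the page untagged and rely on peekers recognizing an
 * allocated, non-slab kernel page with no userspace mappings, i.e. the
 * check Seth quotes above.
 */
static bool zbud_page_like(struct page *page)
{
	return !PageSlab(page) && page_count(page) > 0 &&
	       page_mapcount(page) <= 0;
}

/*
 * Option 2: tag the page with PG_slab so memmap-peekers give it the same
 * hands-off treatment as slab pages.  alloc_page() already leaves
 * page->_count at 1, so only the flag needs to be set and cleared.
 */
static void zbud_tag_page(struct page *page)
{
	SetPageSlab(page);
}

static void zbud_untag_page(struct page *page)
{
	ClearPageSlab(page);
}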

Patch
diff mbox series

diff --git a/include/linux/zbud.h b/include/linux/zbud.h
new file mode 100644
index 0000000..2571a5c
--- /dev/null
+++ b/include/linux/zbud.h
@@ -0,0 +1,22 @@ 
+#ifndef _ZBUD_H_
+#define _ZBUD_H_
+
+#include <linux/types.h>
+
+struct zbud_pool;
+
+struct zbud_ops {
+	int (*evict)(struct zbud_pool *pool, unsigned long handle);
+};
+
+struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops);
+void zbud_destroy_pool(struct zbud_pool *pool);
+int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
+	unsigned long *handle);
+void zbud_free(struct zbud_pool *pool, unsigned long handle);
+int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries);
+void *zbud_map(struct zbud_pool *pool, unsigned long handle);
+void zbud_unmap(struct zbud_pool *pool, unsigned long handle);
+u64 zbud_get_pool_size(struct zbud_pool *pool);
+
+#endif /* _ZBUD_H_ */
diff --git a/mm/Kconfig b/mm/Kconfig
index e742d06..45ec90d 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -477,3 +477,13 @@  config FRONTSWAP
 	  and swap data is stored as normal on the matching swap device.
 
 	  If unsure, say Y to enable frontswap.
+
+config ZBUD
+	tristate
+	default n
+	help
+	  A special purpose allocator for storing compressed pages.
+	  It is designed to store up to two compressed pages per physical
+	  page.  While this design limits storage density, it has simple and
+	  deterministic reclaim properties that make it preferable to a higher
+	  density approach when reclaim will be used.  
diff --git a/mm/Makefile b/mm/Makefile
index 72c5acb..95f0197 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -58,3 +58,4 @@  obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_ZBUD)	+= zbud.o
diff --git a/mm/zbud.c b/mm/zbud.c
new file mode 100644
index 0000000..b10a1f1
--- /dev/null
+++ b/mm/zbud.c
@@ -0,0 +1,543 @@ 
+/*
+ * zbud.c
+ *
+ * Copyright (C) 2013, Seth Jennings, IBM
+ *
+ * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
+ *
+ * zbud is a special purpose allocator for storing compressed pages.  Contrary
+ * to what its name may suggest, zbud is not a buddy allocator, but rather an
+ * allocator that "buddies" two compressed pages together in a single memory
+ * page.
+ *
+ * While this design limits storage density, it has simple and deterministic
+ * reclaim properties that make it preferable to a higher density approach when
+ * reclaim will be used.
+ *
+ * zbud works by storing compressed pages, or "zpages", together in pairs in a
+ * single memory page called a "zbud page".  The first buddy is "left
+ * justified" at the beginning of the zbud page, and the last buddy is "right
+ * justified" at the end of the zbud page.  The benefit is that if either
+ * buddy is freed, the freed buddy space, coalesced with whatever slack space
+ * that existed between the buddies, results in the largest possible free region
+ * within the zbud page.
+ *
+ * zbud also provides an attractive lower bound on density. The ratio of zpages
+ * to zbud pages can not be less than 1.  This ensures that zbud can never "do
+ * harm" by using more pages to store zpages than the uncompressed zpages would
+ * have used on their own.
+ *
+ * zbud pages are divided into "chunks".  The size of the chunks is fixed at
+ * compile time and determined by NCHUNKS_ORDER below.  Dividing zbud pages
+ * into chunks allows organizing unbuddied zbud pages into a manageable number
+ * of unbuddied lists according to the number of free chunks available in the
+ * zbud page.
+ *
+ * The zbud API differs from that of conventional allocators in that the
+ * allocation function, zbud_alloc(), returns an opaque handle to the user,
+ * not a dereferenceable pointer.  The user must map the handle using
+ * zbud_map() in order to get a usable pointer by which to access the
+ * allocation data and unmap the handle with zbud_unmap() when operations
+ * on the allocation data are complete.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/zbud.h>
+
+/*****************
+ * Structures
+*****************/
+/**
+ * struct zbud_page - zbud page metadata overlay
+ * @page:	typed reference to the underlying struct page
+ * @donotuse:	this overlays the page flags and should not be used
+ * @first_chunks:	the size of the first buddy in chunks, 0 if free
+ * @last_chunks:	the size of the last buddy in chunks, 0 if free
+ * @buddy:	links the zbud page into the unbuddied/buddied lists in the pool
+ * @lru:	links the zbud page into the lru list in the pool
+ *
+ * This structure overlays the struct page to store metadata needed for a
+ * single storage page used by zbud.  There is a BUILD_BUG_ON in init_zbud()
+ * that ensures this structure is not larger than struct page.
+ *
+ * The PG_reclaim flag of the underlying page is used for indicating
+ * that this zbud page is under reclaim (see zbud_reclaim_page())
+ */
+struct zbud_page {
+	union {
+		struct page page;
+		struct {
+			unsigned long donotuse;
+			u16 first_chunks;
+			u16 last_chunks;
+			struct list_head buddy;
+			struct list_head lru;
+		};
+	};
+};
+
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation.  It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
+ * will be 64 freelists per pool.
+ */
+#define NCHUNKS_ORDER	6
+
+#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
+#define NCHUNKS		(PAGE_SIZE >> CHUNK_SHIFT)
+
+/**
+ * struct zbud_pool - stores metadata for each zbud pool
+ * @lock:	protects all pool fields and first|last_chunk fields of any
+ *		zbud page in the pool
+ * @unbuddied:	array of lists tracking zbud pages that only contain one buddy;
+ *		the lists each zbud page is added to depends on the size of
+ *		its free region.
+ * @buddied:	list tracking the zbud pages that contain two buddies;
+ *		these zbud pages are full
+ * @lru:	list tracking the zbud pages in LRU order by most recently
+ *		added buddy.
+ * @pages_nr:	number of zbud pages in the pool.
+ * @ops:	pointer to a structure of user defined operations specified at
+ *		pool creation time.
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * pertaining to a particular zbud pool.
+ */
+struct zbud_pool {
+	spinlock_t lock;
+	struct list_head unbuddied[NCHUNKS];
+	struct list_head buddied;
+	struct list_head lru;
+	u64 pages_nr;
+	struct zbud_ops *ops;
+};
+
+/*****************
+ * Helpers
+*****************/
+/* Just to make the code easier to read */
+enum buddy {
+	FIRST,
+	LAST
+};
+
+/* Converts an allocation size in bytes to size in zbud chunks */
+static int size_to_chunks(int size)
+{
+	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
+}
+
+#define for_each_unbuddied_list(_iter, _begin) \
+	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
+
+/* Initializes a zbud page from a newly allocated page */
+static struct zbud_page *init_zbud_page(struct page *page)
+{
+	struct zbud_page *zbpage = (struct zbud_page *)page;
+	zbpage->first_chunks = 0;
+	zbpage->last_chunks = 0;
+	INIT_LIST_HEAD(&zbpage->buddy);
+	INIT_LIST_HEAD(&zbpage->lru);
+	return zbpage;
+}
+
+/* Resets the struct page fields and frees the page */
+static void free_zbud_page(struct zbud_page *zbpage)
+{
+	struct page *page = &zbpage->page;
+	set_page_private(page, 0);
+	page->mapping = NULL;
+	page->index = 0;
+	page_mapcount_reset(page);
+	init_page_count(page);
+	INIT_LIST_HEAD(&page->lru);
+	__free_page(page);
+}
+
+/*
+ * Encodes the handle of a particular buddy within a zbud page
+ * Pool lock should be held as this function accesses first|last_chunks
+ */
+static unsigned long encode_handle(struct zbud_page *zbpage,
+					enum buddy bud)
+{
+	unsigned long handle;
+
+	/*
+	 * For now, the encoded handle is actually just the pointer to the data
+	 * but this might not always be the case.  A little information hiding.
+	 */
+	handle = (unsigned long)page_address(&zbpage->page);
+	if (bud == FIRST)
+		return handle;
+	handle += PAGE_SIZE - (zbpage->last_chunks  << CHUNK_SHIFT);
+	return handle;
+}
+
+/* Returns the zbud page where a given handle is stored */
+static struct zbud_page *handle_to_zbud_page(unsigned long handle)
+{
+	return (struct zbud_page *)(virt_to_page(handle));
+}
+
+/* Returns the number of free chunks in a zbud page */
+static int num_free_chunks(struct zbud_page *zbpage)
+{
+	/*
+	 * Rather than branch for different situations, just use the fact that
+	 * free buddies have a length of zero to simplify everything.
+	 */
+	return NCHUNKS - zbpage->first_chunks - zbpage->last_chunks;
+}
+
+/*****************
+ * API Functions
+*****************/
+/**
+ * zbud_create_pool() - create a new zbud pool
+ * @gfp:	gfp flags when allocating the zbud pool structure
+ * @ops:	user-defined operations for the zbud pool
+ *
+ * Return: pointer to the new zbud pool or NULL if the metadata allocation
+ * failed.
+ */
+struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
+{
+	struct zbud_pool *pool;
+	int i;
+
+	pool = kmalloc(sizeof(struct zbud_pool), gfp);
+	if (!pool)
+		return NULL;
+	spin_lock_init(&pool->lock);
+	for_each_unbuddied_list(i, 0)
+		INIT_LIST_HEAD(&pool->unbuddied[i]);
+	INIT_LIST_HEAD(&pool->buddied);
+	INIT_LIST_HEAD(&pool->lru);
+	pool->pages_nr = 0;
+	pool->ops = ops;
+	return pool;
+}
+
+/**
+ * zbud_destroy_pool() - destroys an existing zbud pool
+ * @pool:	the zbud pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.
+ */
+void zbud_destroy_pool(struct zbud_pool *pool)
+{
+	kfree(pool);
+}
+
+/**
+ * zbud_alloc() - allocates a region of a given size
+ * @pool:	zbud pool from which to allocate
+ * @size:	size in bytes of the desired allocation
+ * @gfp:	gfp flags used if the pool needs to grow
+ * @handle:	handle of the new allocation
+ *
+ * This function will attempt to find a free region in the pool large enough to
+ * satisfy the allocation request.  A search of the unbuddied lists is
+ * performed first. If no suitable free region is found, then a new page is
+ * allocated and added to the pool to satisfy the request.
+ *
+ * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
+ * as zbud pool pages.
+ *
+ * Return: 0 if successful and handle is set, otherwise -EINVAL if the size or
+ * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
+ * a new page.
+ */
+int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
+			unsigned long *handle)
+{
+	int chunks, i, freechunks;
+	struct zbud_page *zbpage = NULL;
+	enum buddy bud;
+	struct page *page;
+
+	if (size <= 0 || gfp & __GFP_HIGHMEM)
+		return -EINVAL;
+	if (size > PAGE_SIZE)
+		return -E2BIG;
+	chunks = size_to_chunks(size);
+	spin_lock(&pool->lock);
+
+	/* First, try to find an unbuddied zbpage. */
+	zbpage = NULL;
+	for_each_unbuddied_list(i, chunks) {
+		if (!list_empty(&pool->unbuddied[i])) {
+			zbpage = list_first_entry(&pool->unbuddied[i],
+					struct zbud_page, buddy);
+			list_del(&zbpage->buddy);
+			if (zbpage->first_chunks == 0)
+				bud = FIRST;
+			else
+				bud = LAST;
+			goto found;
+		}
+	}
+
+	/* Couldn't find unbuddied zbpage, create new one */
+	spin_unlock(&pool->lock);
+	page = alloc_page(gfp);
+	if (!page)
+		return -ENOMEM;
+	spin_lock(&pool->lock);
+	pool->pages_nr++;
+	zbpage = init_zbud_page(page);
+	bud = FIRST;
+
+found:
+	if (bud == FIRST)
+		zbpage->first_chunks = chunks;
+	else
+		zbpage->last_chunks = chunks;
+
+	if (zbpage->first_chunks == 0 || zbpage->last_chunks == 0) {
+		/* Add to unbuddied list */
+		freechunks = num_free_chunks(zbpage);
+		list_add(&zbpage->buddy, &pool->unbuddied[freechunks]);
+	} else {
+		/* Add to buddied list */
+		list_add(&zbpage->buddy, &pool->buddied);
+	}
+
+	/* Add/move zbpage to beginning of LRU */
+	if (!list_empty(&zbpage->lru))
+		list_del(&zbpage->lru);
+	list_add(&zbpage->lru, &pool->lru);
+
+	*handle = encode_handle(zbpage, bud);
+	spin_unlock(&pool->lock);
+
+	return 0;
+}
+
+/**
+ * zbud_free() - frees the allocation associated with the given handle
+ * @pool:	pool in which the allocation resided
+ * @handle:	handle associated with the allocation returned by zbud_alloc()
+ *
+ * In the case that the zbud page in which the allocation resides is under
+ * reclaim, as indicated by the PG_reclaim flag being set, this function
+ * only sets the first|last_chunks to 0.  The page is actually freed
+ * once both buddies are evicted (see zbud_reclaim_page() below).
+ */
+void zbud_free(struct zbud_pool *pool, unsigned long handle)
+{
+	struct zbud_page *zbpage;
+	int freechunks;
+
+	spin_lock(&pool->lock);
+	zbpage = handle_to_zbud_page(handle);
+
+	/* If first buddy, handle will be page aligned */
+	if (handle & ~PAGE_MASK)
+		zbpage->last_chunks = 0;
+	else
+		zbpage->first_chunks = 0;
+
+	if (PageReclaim(&zbpage->page)) {
+		/* zbpage is under reclaim, reclaim will free */
+		spin_unlock(&pool->lock);
+		return;
+	}
+
+	/* Remove from existing buddy list */
+	list_del(&zbpage->buddy);
+
+	if (zbpage->first_chunks == 0 && zbpage->last_chunks == 0) {
+		/* zbpage is empty, free */
+		list_del(&zbpage->lru);
+		free_zbud_page(zbpage);
+		pool->pages_nr--;
+	} else {
+		/* Add to unbuddied list */
+		freechunks = num_free_chunks(zbpage);
+		list_add(&zbpage->buddy, &pool->unbuddied[freechunks]);
+	}
+
+	spin_unlock(&pool->lock);
+}
+
+#define list_tail_entry(ptr, type, member) \
+	list_entry((ptr)->prev, type, member)
+
+/**
+ * zbud_reclaim_page() - evicts allocations from a pool page and frees it
+ * @pool:	pool from which a page will attempt to be evicted
+ * @retries:	number of pages on the LRU list for which eviction will
+ *		be attempted before failing
+ *
+ * zbud reclaim is different from normal system reclaim in that the reclaim is
+ * done from the bottom, up.  This is because only the bottom layer, zbud, has
+ * information on how the allocations are organized within each zbud page. This
+ * has the potential to create interesting locking situations between zbud and
+ * the user, however.
+ *
+ * To avoid these, this is how zbud_reclaim_page() should be called:
+ *
+ * The user detects a page should be reclaimed and calls zbud_reclaim_page().
+ * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
+ * the user-defined eviction handler with the pool and handle as arguments.
+ *
+ * If the handle can not be evicted, the eviction handler should return
+ * non-zero. zbud_reclaim_page() will add the zbud page back to the
+ * appropriate list and try the next zbud page on the LRU up to
+ * a user defined number of retries.
+ *
+ * If the handle is successfully evicted, the eviction handler should
+ * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
+ * contains logic to delay freeing the page if the page is under reclaim,
+ * as indicated by the setting of the PG_reclaim flag on the underlying page.
+ *
+ * If all buddies in the zbud page are successfully evicted, then the
+ * zbud page can be freed.
+ *
+ * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
+ * no pages to evict or an eviction handler is not registered, -EAGAIN if
+ * the retry limit was hit.
+ */
+int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
+{
+	int i, ret, freechunks;
+	struct zbud_page *zbpage;
+	unsigned long first_handle = 0, last_handle = 0;
+
+	spin_lock(&pool->lock);
+	if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
+			retries == 0) {
+		spin_unlock(&pool->lock);
+		return -EINVAL;
+	}
+	for (i = 0; i < retries; i++) {
+		zbpage = list_tail_entry(&pool->lru, struct zbud_page, lru);
+		list_del(&zbpage->lru);
+		list_del(&zbpage->buddy);
+		/* Protect zbpage against free */
+		SetPageReclaim(&zbpage->page);
+		/*
+		 * We need to encode the handles before unlocking, since we can
+		 * race with free that will set (first|last)_chunks to 0
+		 */
+		first_handle = 0;
+		last_handle = 0;
+		if (zbpage->first_chunks)
+			first_handle = encode_handle(zbpage, FIRST);
+		if (zbpage->last_chunks)
+			last_handle = encode_handle(zbpage, LAST);
+		spin_unlock(&pool->lock);
+
+		/* Issue the eviction callback(s) */
+		if (first_handle) {
+			ret = pool->ops->evict(pool, first_handle);
+			if (ret)
+				goto next;
+		}
+		if (last_handle) {
+			ret = pool->ops->evict(pool, last_handle);
+			if (ret)
+				goto next;
+		}
+next:
+		spin_lock(&pool->lock);
+		ClearPageReclaim(&zbpage->page);
+		if (zbpage->first_chunks == 0 && zbpage->last_chunks == 0) {
+			/*
+			 * Both buddies are now free, free the zbpage and
+			 * return success.
+			 */
+			free_zbud_page(zbpage);
+			pool->pages_nr--;
+			spin_unlock(&pool->lock);
+			return 0;
+		} else if (zbpage->first_chunks == 0 ||
+				zbpage->last_chunks == 0) {
+			/* add to unbuddied list */
+			freechunks = num_free_chunks(zbpage);
+			list_add(&zbpage->buddy, &pool->unbuddied[freechunks]);
+		} else {
+			/* add to buddied list */
+			list_add(&zbpage->buddy, &pool->buddied);
+		}
+
+		/* add to beginning of LRU */
+		list_add(&zbpage->lru, &pool->lru);
+	}
+	spin_unlock(&pool->lock);
+	return -EAGAIN;
+}
+
+/**
+ * zbud_map() - maps the allocation associated with the given handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation to be mapped
+ *
+ * While trivial for zbud, the mapping functions for other allocators
+ * implementing this allocation API could have more complex information encoded
+ * in the handle and could create temporary mappings to make the data
+ * accessible to the user.
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+void *zbud_map(struct zbud_pool *pool, unsigned long handle)
+{
+	return (void *)(handle);
+}
+
+/**
+ * zbud_unmap() - unmaps the allocation associated with the given handle
+ * @pool:	pool in which the allocation resides
+ * @handle:	handle associated with the allocation to be unmapped
+ */
+void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
+{
+}
+
+/**
+ * zbud_get_pool_size() - gets the zbud pool size in pages
+ * @pool:	pool whose size is being queried
+ *
+ * Returns: size in pages of the given pool.  The pool lock need not be
+ * taken to access pages_nr.
+ */
+u64 zbud_get_pool_size(struct zbud_pool *pool)
+{
+	return pool->pages_nr;
+}
+
+static int __init init_zbud(void)
+{
+	/* Make sure we aren't overflowing the underlying struct page */
+	BUILD_BUG_ON(sizeof(struct zbud_page) != sizeof(struct page));
+	/* Make sure we can represent any chunk offset with a u16 */
+	BUILD_BUG_ON(sizeof(u16) * BITS_PER_BYTE < PAGE_SHIFT - CHUNK_SHIFT);
+	pr_info("loaded\n");
+	return 0;
+}
+
+static void __exit exit_zbud(void)
+{
+	pr_info("unloaded\n");
+}
+
+module_init(init_zbud);
+module_exit(exit_zbud);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");
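
For readers new to the interface, a minimal usage sketch of the API this patch
adds follows; the eviction handler and payload are illustrative only and not
part of the patch:

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/zbud.h>

/*
 * Per the zbud_reclaim_page() contract above: a handler that successfully
 * evicts its zpage must call zbud_free() on the handle and return 0;
 * returning non-zero makes zbud retry with the next page on the LRU.
 */
static int example_evict(struct zbud_pool *pool, unsigned long handle)
{
	/* ... write the compressed data back to its backing store ... */
	zbud_free(pool, handle);
	return 0;
}

static struct zbud_ops example_ops = {
	.evict = example_evict,
};

static int example_use_zbud(void)
{
	struct zbud_pool *pool;
	unsigned long handle;
	char payload[64] = "compressed page data";
	void *addr;
	int ret;

	pool = zbud_create_pool(GFP_KERNEL, &example_ops);
	if (!pool)
		return -ENOMEM;

	/* zbud_alloc() hands back an opaque handle, not a pointer */
	ret = zbud_alloc(pool, sizeof(payload), GFP_KERNEL, &handle);
	if (ret)
		goto out;

	/* Map the handle to get a usable pointer, copy in, then unmap */
	addr = zbud_map(pool, handle);
	memcpy(addr, payload, sizeof(payload));
	zbud_unmap(pool, handle);

	pr_info("pool holds %llu pages\n",
		(unsigned long long)zbud_get_pool_size(pool));

	/*
	 * Under memory pressure the user would call
	 * zbud_reclaim_page(pool, retries), which invokes example_evict()
	 * for each stored handle in the chosen zbud page.
	 */
	zbud_free(pool, handle);
out:
	zbud_destroy_pool(pool);
	return ret;
}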