* [PATCH 1/4] compcache: xvmalloc memory allocator
@ 2009-08-24  4:37 ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-08-24  4:37 UTC (permalink / raw)
  To: akpm; +Cc: linux-kernel, linux-mm, linux-mm-cc

xvmalloc is a memory allocator designed specifically for the ramzswap project.

* Features:
 - Low metadata overhead (just 4 bytes per object)
 - O(1) alloc/free, except when we have to call the system page allocator to
   get additional memory.
 - Very low fragmentation: In all tests, xvmalloc memory usage is within 12%
   of "Ideal".
 - Pool-based allocator: each pool can grow and shrink (see the usage sketch
   after this list).
 - It maps pages only when required, so it does not hog the vmalloc area,
   which is very small on 32-bit systems.
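
For illustration only (not part of this patch), here is a minimal sketch of how
a caller would drive the pool API declared in xvmalloc.h. The 100-byte size,
the GFP_KERNEL flags and the error handling are placeholders; the kmap_atomic()
step mirrors what a caller such as ramzswap has to do, since xvmalloc returns a
<pagenum, offset> pair rather than a mapped pointer:

	struct xv_pool *pool;
	u32 pagenum, offset;
	void *obj;

	pool = xv_create_pool();
	if (!pool)
		return -ENOMEM;

	/* Allocate a 100-byte object; <pagenum, offset> identify it on success */
	if (xv_malloc(pool, 100, &pagenum, &offset, GFP_KERNEL)) {
		xv_destroy_pool(pool);
		return -ENOMEM;
	}

	/* The object may live in a highmem page: map it before touching it */
	obj = kmap_atomic(pfn_to_page(pagenum), KM_USER0) + offset;
	memset(obj, 0, 100);
	kunmap_atomic(obj, KM_USER0);

	xv_free(pool, pagenum, offset);
	xv_destroy_pool(pool);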

The SLUB allocator could not be used due to fragmentation issues:
http://code.google.com/p/compcache/wiki/AllocatorsComparison
The data there shows kmalloc using ~43% more memory than TLSF, while xvmalloc
showed ~2% better space efficiency than TLSF (thanks to its smaller metadata).
Creating various kmem_caches can narrow the space-efficiency gap, but the
problem of being limited to low memory remains. SLUB also depends on
allocating higher-order pages to reduce fragmentation, which is not acceptable
for ramzswap since it is used under memory crunch (it's a swap device!).

The SLOB allocator could not be used due to the reasons mentioned here:
http://lkml.org/lkml/2009/3/18/210

* Implementation:
It uses a two-level bitmap search to find the free list containing a block of
the correct size. This idea is taken from the TLSF (Two-Level Segregated Fit)
allocator and is well explained in its paper (see [Links] below).
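
For illustration only (not part of the patch), the lookup can be condensed into
the sketch below. It reuses names from this patch (struct xv_pool, get_index(),
slbitmap, flbitmap, __ffs()); sketch_find_slot() itself is a hypothetical
helper and drops the freelist-head bookkeeping that the real find_block()
performs:

	/*
	 * Return the index of the first non-empty freelist holding blocks of
	 * at least 'size' bytes, or -1 if the pool has no suitable free block.
	 */
	static int sketch_find_slot(struct xv_pool *pool, u32 size)
	{
		u32 slindex = get_index(size);		/* second-level index */
		u32 flindex = slindex / BITS_PER_LONG;	/* first-level index */
		ulong slbits, flbits;

		/* 1. Try the second-level word that covers slindex. */
		slbits = pool->slbitmap[flindex] >> (slindex % BITS_PER_LONG);
		if (slbits)
			return slindex + __ffs(slbits);

		/* 2. Use the first-level bitmap to jump to the next non-empty
		 *    second-level word, then take its first set bit. */
		flbits = pool->flbitmap >> (flindex + 1);
		if (!flbits)
			return -1;

		flindex += __ffs(flbits) + 1;
		return flindex * BITS_PER_LONG + __ffs(pool->slbitmap[flindex]);
	}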

* Limitations:
 - Poor scalability: No per-cpu data structures (work in progress).

[Links]
1. Details and Performance data:
http://code.google.com/p/compcache/wiki/xvMalloc
http://code.google.com/p/compcache/wiki/xvMallocPerformance

2. TLSF memory allocator:
home: http://rtportal.upv.es/rtmalloc/
paper: http://rtportal.upv.es/rtmalloc/files/MRBC_2008.pdf

Signed-off-by: Nitin Gupta <ngupta@vflare.org>
---

 drivers/block/ramzswap/xvmalloc.c     |  556 +++++++++++++++++++++++++++++++++
 drivers/block/ramzswap/xvmalloc.h     |   30 ++
 drivers/block/ramzswap/xvmalloc_int.h |   86 +++++
 3 files changed, 672 insertions(+), 0 deletions(-)

diff --git a/drivers/block/ramzswap/xvmalloc.c b/drivers/block/ramzswap/xvmalloc.c
new file mode 100644
index 0000000..57a3639
--- /dev/null
+++ b/drivers/block/ramzswap/xvmalloc.c
@@ -0,0 +1,556 @@
+/*
+ * xvmalloc memory allocator
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include "xvmalloc.h"
+#include "xvmalloc_int.h"
+
+static void stat_inc(u64 *value)
+{
+	*value = *value + 1;
+}
+
+static void stat_dec(u64 *value)
+{
+	*value = *value - 1;
+}
+
+static int test_flag(struct block_header *block, enum blockflags flag)
+{
+	return block->prev & BIT(flag);
+}
+
+static void set_flag(struct block_header *block, enum blockflags flag)
+{
+	block->prev |= BIT(flag);
+}
+
+static void clear_flag(struct block_header *block, enum blockflags flag)
+{
+	block->prev &= ~BIT(flag);
+}
+
+/*
+ * Given <pagenum, offset> pair, provide a dereferenceable pointer.
+ * This is called from xv_malloc/xv_free path, so it needs to be fast.
+ */
+static void *get_ptr_atomic(u32 pagenum, u16 offset, enum km_type type)
+{
+	unsigned char *base;
+
+	base = kmap_atomic(pfn_to_page(pagenum), type);
+	return base + offset;
+}
+
+static void put_ptr_atomic(void *ptr, enum km_type type)
+{
+	kunmap_atomic(ptr, type);
+}
+
+static u32 get_blockprev(struct block_header *block)
+{
+	return block->prev & PREV_MASK;
+}
+
+static void set_blockprev(struct block_header *block, u16 new_offset)
+{
+	block->prev = new_offset | (block->prev & FLAGS_MASK);
+}
+
+static struct block_header *BLOCK_NEXT(struct block_header *block)
+{
+	return (struct block_header *)((char *)block + block->size + XV_ALIGN);
+}
+
+/*
+ * Get index of free list containing blocks of maximum size
+ * which is less than or equal to given size.
+ */
+static u32 get_index_for_insert(u32 size)
+{
+	if (unlikely(size > XV_MAX_ALLOC_SIZE))
+		size = XV_MAX_ALLOC_SIZE;
+	size &= ~FL_DELTA_MASK;
+	return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
+}
+
+/*
+ * Get index of free list having blocks of size greater than
+ * or equal to requested size.
+ */
+static u32 get_index(u32 size)
+{
+	if (unlikely(size < XV_MIN_ALLOC_SIZE))
+		size = XV_MIN_ALLOC_SIZE;
+	size = ALIGN(size, FL_DELTA);
+	return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
+}
+
+/*
+ * Allocate a memory page. Called when a pool needs to grow.
+ */
+static u32 xv_alloc_page(gfp_t flags)
+{
+	struct page *page;
+
+	page = alloc_page(flags);
+	if (unlikely(!page))
+		return 0;
+
+	return page_to_pfn(page);
+}
+
+/*
+ * Called when all objects in a page are freed.
+ */
+static void xv_free_page(u32 pagenum)
+{
+	__free_page(pfn_to_page(pagenum));
+}
+
+/**
+ * find_block - find block of at least given size
+ * @pool: memory pool to search from
+ * @size: size of block required
+ * @pagenum: page no. containing required block
+ * @offset: offset within the page where block is located.
+ *
+ * Searches the two-level bitmap to locate a block of at least
+ * the given size. If such a block is found, it provides
+ * <pagenum, offset> to identify this block and returns index
+ * in freelist where we found this block.
+ * Otherwise, returns 0 and <pagenum, offset> params are not touched.
+ */
+static u32 find_block(struct xv_pool *pool, u32 size,
+			u32 *pagenum, u32 *offset)
+{
+	ulong flbitmap, slbitmap;
+	u32 flindex, slindex, slbitstart;
+
+	/* There are no free blocks in this pool */
+	if (!pool->flbitmap)
+		return 0;
+
+	/* Get freelist index corresponding to this size */
+	slindex = get_index(size);
+	slbitmap = pool->slbitmap[slindex / BITS_PER_LONG];
+	slbitstart = slindex % BITS_PER_LONG;
+
+	/*
+	 * If freelist is not empty at this index, we found the
+	 * block - head of this list. This is approximate best-fit match.
+	 */
+	if (test_bit(slbitstart, &slbitmap)) {
+		*pagenum = pool->freelist[slindex].pagenum;
+		*offset = pool->freelist[slindex].offset;
+		return slindex;
+	}
+
+	/*
+	 * No best fit found. Search a bit further in the bitmap for a free
+	 * block. The second-level bitmap consists of a series of
+	 * BITS_PER_LONG-bit chunks. Search further in the chunk where we
+	 * expected a best fit, starting from the index location found above.
+	 */
+	slbitstart++;
+	slbitmap >>= slbitstart;
+
+	/* Skip this search if we were already at end of this bitmap chunk */
+	if ((slbitstart != BITS_PER_LONG) && slbitmap) {
+		slindex += __ffs(slbitmap) + 1;
+		*pagenum = pool->freelist[slindex].pagenum;
+		*offset = pool->freelist[slindex].offset;
+		return slindex;
+	}
+
+	/* Now do a full two-level bitmap search to find next nearest fit */
+	flindex = slindex / BITS_PER_LONG;
+
+	flbitmap = (pool->flbitmap) >> (flindex + 1);
+	if (!flbitmap)
+		return 0;
+
+	flindex += __ffs(flbitmap) + 1;
+	slbitmap = pool->slbitmap[flindex];
+	slindex = (flindex * BITS_PER_LONG) + __ffs(slbitmap);
+	*pagenum = pool->freelist[slindex].pagenum;
+	*offset = pool->freelist[slindex].offset;
+
+	return slindex;
+}
+
+/*
+ * Insert block at <pagenum, offset> in freelist of given pool.
+ * freelist used depends on block size.
+ */
+static void insert_block(struct xv_pool *pool, u32 pagenum, u32 offset,
+			struct block_header *block)
+{
+	u32 flindex, slindex;
+	struct block_header *nextblock;
+
+	slindex = get_index_for_insert(block->size);
+	flindex = slindex / BITS_PER_LONG;
+
+	block->link.prev_pagenum = 0;
+	block->link.prev_offset = 0;
+	block->link.next_pagenum = pool->freelist[slindex].pagenum;
+	block->link.next_offset = pool->freelist[slindex].offset;
+	pool->freelist[slindex].pagenum = pagenum;
+	pool->freelist[slindex].offset = offset;
+
+	if (block->link.next_pagenum) {
+		nextblock = get_ptr_atomic(block->link.next_pagenum,
+					block->link.next_offset, KM_USER1);
+		nextblock->link.prev_pagenum = pagenum;
+		nextblock->link.prev_offset = offset;
+		put_ptr_atomic(nextblock, KM_USER1);
+	}
+
+	__set_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
+	__set_bit(flindex, &pool->flbitmap);
+}
+
+/*
+ * Remove block from head of freelist. Index 'slindex' identifies the freelist.
+ */
+static void remove_block_head(struct xv_pool *pool,
+			struct block_header *block, u32 slindex)
+{
+	struct block_header *tmpblock;
+	u32 flindex = slindex / BITS_PER_LONG;
+
+	pool->freelist[slindex].pagenum = block->link.next_pagenum;
+	pool->freelist[slindex].offset = block->link.next_offset;
+	block->link.prev_pagenum = 0;
+	block->link.prev_offset = 0;
+
+	if (!pool->freelist[slindex].pagenum) {
+		__clear_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
+		if (!pool->slbitmap[flindex])
+			__clear_bit(flindex, &pool->flbitmap);
+	} else {
+		/*
+		 * DEBUG ONLY: We need not reinitialize freelist head previous
+		 * pointer to 0 - we never depend on its value. But just for
+		 * sanity, let's do it.
+		 */
+		tmpblock = get_ptr_atomic(pool->freelist[slindex].pagenum,
+				pool->freelist[slindex].offset, KM_USER1);
+		tmpblock->link.prev_pagenum = 0;
+		tmpblock->link.prev_offset = 0;
+		put_ptr_atomic(tmpblock, KM_USER1);
+	}
+}
+
+/*
+ * Remove block from freelist. Index 'slindex' identifies the freelist.
+ */
+static void remove_block(struct xv_pool *pool, u32 pagenum, u32 offset,
+			struct block_header *block, u32 slindex)
+{
+	u32 flindex;
+	struct block_header *tmpblock;
+
+	if (pool->freelist[slindex].pagenum == pagenum
+	   && pool->freelist[slindex].offset == offset) {
+		remove_block_head(pool, block, slindex);
+		return;
+	}
+
+	flindex = slindex / BITS_PER_LONG;
+
+	if (block->link.prev_pagenum) {
+		tmpblock = get_ptr_atomic(block->link.prev_pagenum,
+				block->link.prev_offset, KM_USER1);
+		tmpblock->link.next_pagenum = block->link.next_pagenum;
+		tmpblock->link.next_offset = block->link.next_offset;
+		put_ptr_atomic(tmpblock, KM_USER1);
+	}
+
+	if (block->link.next_pagenum) {
+		tmpblock = get_ptr_atomic(block->link.next_pagenum,
+				block->link.next_offset, KM_USER1);
+		tmpblock->link.prev_pagenum = block->link.prev_pagenum;
+		tmpblock->link.prev_offset = block->link.prev_offset;
+		put_ptr_atomic(tmpblock, KM_USER1);
+	}
+
+	return;
+}
+
+/*
+ * Allocate a page and add it to the freelist of the given pool.
+ */
+static int grow_pool(struct xv_pool *pool, gfp_t flags)
+{
+	u32 pagenum;
+	struct block_header *block;
+
+	pagenum = xv_alloc_page(flags);
+	if (unlikely(!pagenum))
+		return -ENOMEM;
+
+	stat_inc(&pool->total_pages);
+
+	spin_lock(&pool->lock);
+	block = get_ptr_atomic(pagenum, 0, KM_USER0);
+
+	block->size = PAGE_SIZE - XV_ALIGN;
+	set_flag(block, BLOCK_FREE);
+	clear_flag(block, PREV_FREE);
+	set_blockprev(block, 0);
+
+	insert_block(pool, pagenum, 0, block);
+
+	put_ptr_atomic(block, KM_USER0);
+	spin_unlock(&pool->lock);
+
+	return 0;
+}
+
+/*
+ * Create a memory pool. Allocates freelist, bitmaps and other
+ * per-pool metadata.
+ */
+struct xv_pool *xv_create_pool(void)
+{
+	u32 ovhd_size;
+	struct xv_pool *pool;
+
+	ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
+	pool = kzalloc(ovhd_size, GFP_KERNEL);
+	if (!pool)
+		return NULL;
+
+	spin_lock_init(&pool->lock);
+
+	return pool;
+}
+EXPORT_SYMBOL_GPL(xv_create_pool);
+
+void xv_destroy_pool(struct xv_pool *pool)
+{
+	kfree(pool);
+}
+EXPORT_SYMBOL_GPL(xv_destroy_pool);
+
+/**
+ * xv_malloc - Allocate block of given size from pool.
+ * @pool: pool to allocate from
+ * @size: size of block to allocate
+ * @pagenum: page no. that holds the object
+ * @offset: location of object within pagenum
+ *
+ * On success, <pagenum, offset> identifies block allocated
+ * and 0 is returned. On failure, <pagenum, offset> is set to
+ * 0 and -ENOMEM is returned.
+ *
+ * Allocation requests with size > XV_MAX_ALLOC_SIZE will fail.
+ */
+int xv_malloc(struct xv_pool *pool, u32 size, u32 *pagenum, u32 *offset,
+							gfp_t flags)
+{
+	int error;
+	u32 index, tmpsize, origsize, tmpoffset;
+	struct block_header *block, *tmpblock;
+
+	*pagenum = 0;
+	*offset = 0;
+	origsize = size;
+
+	if (unlikely(!size || size > XV_MAX_ALLOC_SIZE))
+		return -ENOMEM;
+
+	size = ALIGN(size, XV_ALIGN);
+
+	spin_lock(&pool->lock);
+
+	index = find_block(pool, size, pagenum, offset);
+
+	if (!*pagenum) {
+		spin_unlock(&pool->lock);
+		if (flags & GFP_NOWAIT)
+			return -ENOMEM;
+		error = grow_pool(pool, flags);
+		if (unlikely(error))
+			return -ENOMEM;
+
+		spin_lock(&pool->lock);
+		index = find_block(pool, size, pagenum, offset);
+	}
+
+	if (!*pagenum) {
+		spin_unlock(&pool->lock);
+		return -ENOMEM;
+	}
+
+	block = get_ptr_atomic(*pagenum, *offset, KM_USER0);
+
+	remove_block_head(pool, block, index);
+
+	/* Split the block if required */
+	tmpoffset = *offset + size + XV_ALIGN;
+	tmpsize = block->size - size;
+	tmpblock = (struct block_header *)((char *)block + size + XV_ALIGN);
+	if (tmpsize) {
+		tmpblock->size = tmpsize - XV_ALIGN;
+		set_flag(tmpblock, BLOCK_FREE);
+		clear_flag(tmpblock, PREV_FREE);
+
+		set_blockprev(tmpblock, *offset);
+		if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
+			insert_block(pool, *pagenum, tmpoffset, tmpblock);
+
+		if (tmpoffset + XV_ALIGN + tmpblock->size != PAGE_SIZE) {
+			tmpblock = BLOCK_NEXT(tmpblock);
+			set_blockprev(tmpblock, tmpoffset);
+		}
+	} else {
+		/* This block is exact fit */
+		if (tmpoffset != PAGE_SIZE)
+			clear_flag(tmpblock, PREV_FREE);
+	}
+
+	block->size = origsize;
+	clear_flag(block, BLOCK_FREE);
+
+	put_ptr_atomic(block, KM_USER0);
+	spin_unlock(&pool->lock);
+
+	*offset += XV_ALIGN;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xv_malloc);
+
+/*
+ * Free block identified with <pagenum, offset>
+ */
+void xv_free(struct xv_pool *pool, u32 pagenum, u32 offset)
+{
+	void *page;
+	struct block_header *block, *tmpblock;
+
+	offset -= XV_ALIGN;
+
+	spin_lock(&pool->lock);
+
+	page = get_ptr_atomic(pagenum, 0, KM_USER0);
+	block = (struct block_header *)((char *)page + offset);
+
+	/* Catch double free bugs */
+	BUG_ON(test_flag(block, BLOCK_FREE));
+
+	block->size = ALIGN(block->size, XV_ALIGN);
+
+	tmpblock = BLOCK_NEXT(block);
+	if (offset + block->size + XV_ALIGN == PAGE_SIZE)
+		tmpblock = NULL;
+
+	/* Merge next block if it is free */
+	if (tmpblock && test_flag(tmpblock, BLOCK_FREE)) {
+		/*
+		 * Blocks smaller than XV_MIN_ALLOC_SIZE
+		 * are not inserted in any free list.
+		 */
+		if (tmpblock->size >= XV_MIN_ALLOC_SIZE) {
+			remove_block(pool, pagenum,
+				    offset + block->size + XV_ALIGN, tmpblock,
+				    get_index_for_insert(tmpblock->size));
+		}
+		block->size += tmpblock->size + XV_ALIGN;
+	}
+
+	/* Merge previous block if it is free */
+	if (test_flag(block, PREV_FREE)) {
+		tmpblock = (struct block_header *)((char *)(page) +
+						get_blockprev(block));
+		offset = offset - tmpblock->size - XV_ALIGN;
+
+		if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
+			remove_block(pool, pagenum, offset, tmpblock,
+				    get_index_for_insert(tmpblock->size));
+
+		tmpblock->size += block->size + XV_ALIGN;
+		block = tmpblock;
+	}
+
+	/* No used objects in this page. Free it. */
+	if (block->size == PAGE_SIZE - XV_ALIGN) {
+		put_ptr_atomic(page, KM_USER0);
+		spin_unlock(&pool->lock);
+
+		xv_free_page(pagenum);
+		stat_dec(&pool->total_pages);
+		return;
+	}
+
+	set_flag(block, BLOCK_FREE);
+	if (block->size >= XV_MIN_ALLOC_SIZE)
+		insert_block(pool, pagenum, offset, block);
+
+	if (offset + block->size + XV_ALIGN != PAGE_SIZE) {
+		tmpblock = BLOCK_NEXT(block);
+		set_flag(tmpblock, PREV_FREE);
+		set_blockprev(tmpblock, offset);
+	}
+
+	put_ptr_atomic(page, KM_USER0);
+	spin_unlock(&pool->lock);
+
+	return;
+}
+EXPORT_SYMBOL_GPL(xv_free);
+
+u32 xv_get_object_size(void *obj)
+{
+	struct block_header *blk;
+
+	blk = (struct block_header *)((char *)(obj) - XV_ALIGN);
+	return blk->size;
+}
+EXPORT_SYMBOL_GPL(xv_get_object_size);
+
+/*
+ * Returns total memory used by allocator (userdata + metadata)
+ */
+u64 xv_get_total_size_bytes(struct xv_pool *pool)
+{
+	return pool->total_pages << PAGE_SHIFT;
+}
+EXPORT_SYMBOL_GPL(xv_get_total_size_bytes);
+
+static int __init xv_malloc_init(void)
+{
+	return 0;
+}
+
+static void __exit xv_malloc_exit(void)
+{
+	return;
+}
+
+module_init(xv_malloc_init);
+module_exit(xv_malloc_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
+MODULE_DESCRIPTION("xvmalloc memory allocator");
diff --git a/drivers/block/ramzswap/xvmalloc.h b/drivers/block/ramzswap/xvmalloc.h
new file mode 100644
index 0000000..699bb04
--- /dev/null
+++ b/drivers/block/ramzswap/xvmalloc.h
@@ -0,0 +1,30 @@
+/*
+ * xvmalloc memory allocator
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+#ifndef _XVMALLOC_H_
+#define _XVMALLOC_H_
+
+#include <linux/types.h>
+
+struct xv_pool;
+
+struct xv_pool *xv_create_pool(void);
+void xv_destroy_pool(struct xv_pool *pool);
+
+int xv_malloc(struct xv_pool *pool, u32 size, u32 *pagenum, u32 *offset,
+							gfp_t flags);
+void xv_free(struct xv_pool *pool, u32 pagenum, u32 offset);
+
+u32 xv_get_object_size(void *obj);
+u64 xv_get_total_size_bytes(struct xv_pool *pool);
+
+#endif
diff --git a/drivers/block/ramzswap/xvmalloc_int.h b/drivers/block/ramzswap/xvmalloc_int.h
new file mode 100644
index 0000000..4d96c48
--- /dev/null
+++ b/drivers/block/ramzswap/xvmalloc_int.h
@@ -0,0 +1,86 @@
+/*
+ * xvmalloc memory allocator
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+#ifndef _XVMALLOC_INT_H_
+#define _XVMALLOC_INT_H_
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+/* User configurable params */
+
+/* This must be greater than sizeof(struct link_free) */
+#define XV_MIN_ALLOC_SIZE       32
+#define XV_MAX_ALLOC_SIZE       (PAGE_SIZE - XV_ALIGN)
+
+/* Must be power of two */
+#define XV_ALIGN_SHIFT	2
+#define XV_ALIGN	(1 << XV_ALIGN_SHIFT)
+#define XV_ALIGN_MASK	(XV_ALIGN - 1)
+
+/* Free lists are separated by FL_DELTA bytes */
+#define FL_DELTA_SHIFT	3
+#define FL_DELTA	(1 << FL_DELTA_SHIFT)
+#define FL_DELTA_MASK	(FL_DELTA - 1)
+#define NUM_FREE_LISTS	((XV_MAX_ALLOC_SIZE - XV_MIN_ALLOC_SIZE) \
+				/ FL_DELTA + 1)
+
+#define MAX_FLI		DIV_ROUND_UP(NUM_FREE_LISTS, BITS_PER_LONG)
+
+/* End of user params */
+
+enum blockflags {
+	BLOCK_FREE,
+	PREV_FREE,
+	__NR_BLOCKFLAGS,
+};
+
+#define FLAGS_MASK	XV_ALIGN_MASK
+#define PREV_MASK	(~FLAGS_MASK)
+
+struct freelist_entry {
+	u32 pagenum;
+	u16 offset;
+	u16 pad;
+};
+
+struct link_free {
+	u32 prev_pagenum;
+	u32 next_pagenum;
+	u16 prev_offset;
+	u16 next_offset;
+};
+
+struct block_header {
+	union {
+		/* This common header must be ALIGN bytes */
+		u8 common[XV_ALIGN];
+		struct {
+			u16 size;
+			u16 prev;
+		};
+	};
+	struct link_free link;
+};
+
+struct xv_pool {
+	ulong flbitmap;
+	ulong slbitmap[MAX_FLI];
+	spinlock_t lock;
+
+	struct freelist_entry freelist[NUM_FREE_LISTS];
+
+	/* stats */
+	u64 total_pages;
+};
+
+#endif

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24  4:37 ` Nitin Gupta
@ 2009-08-24 17:33   ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-08-24 17:33 UTC (permalink / raw)
  To: ngupta; +Cc: akpm, linux-kernel, linux-mm, linux-mm-cc

Hi Nitin,

[ Nit: the name xmalloc() is usually reserved for non-failing allocators in
  user-space which is why xvmalloc() looks so confusing to me. Can we
  get a better name for the thing? Also, I'm not sure why xvmalloc is a
  separate module. Can't you just make it in-kernel or compile it in to the
  ramzswap module? ]

On Mon, Aug 24, 2009 at 7:37 AM, Nitin Gupta<ngupta@vflare.org> wrote:
> +/**
> + * xv_malloc - Allocate block of given size from pool.
> + * @pool: pool to allocate from
> + * @size: size of block to allocate
> + * @pagenum: page no. that holds the object
> + * @offset: location of object within pagenum
> + *
> + * On success, <pagenum, offset> identifies block allocated
> + * and 0 is returned. On failure, <pagenum, offset> is set to
> + * 0 and -ENOMEM is returned.
> + *
> + * Allocation requests with size > XV_MAX_ALLOC_SIZE will fail.
> + */
> +int xv_malloc(struct xv_pool *pool, u32 size, u32 *pagenum, u32 *offset,
> +                                                       gfp_t flags)
> +{
> +       int error;
> +       u32 index, tmpsize, origsize, tmpoffset;
> +       struct block_header *block, *tmpblock;
> +
> +       *pagenum = 0;
> +       *offset = 0;
> +       origsize = size;
> +
> +       if (unlikely(!size || size > XV_MAX_ALLOC_SIZE))
> +               return -ENOMEM;
> +
> +       size = ALIGN(size, XV_ALIGN);
> +
> +       spin_lock(&pool->lock);
> +
> +       index = find_block(pool, size, pagenum, offset);
> +
> +       if (!*pagenum) {
> +               spin_unlock(&pool->lock);
> +               if (flags & GFP_NOWAIT)
> +                       return -ENOMEM;
> +               error = grow_pool(pool, flags);
> +               if (unlikely(error))
> +                       return -ENOMEM;
> +
> +               spin_lock(&pool->lock);
> +               index = find_block(pool, size, pagenum, offset);
> +       }
> +
> +       if (!*pagenum) {
> +               spin_unlock(&pool->lock);
> +               return -ENOMEM;
> +       }
> +
> +       block = get_ptr_atomic(*pagenum, *offset, KM_USER0);
> +
> +       remove_block_head(pool, block, index);
> +
> +       /* Split the block if required */
> +       tmpoffset = *offset + size + XV_ALIGN;
> +       tmpsize = block->size - size;
> +       tmpblock = (struct block_header *)((char *)block + size + XV_ALIGN);
> +       if (tmpsize) {
> +               tmpblock->size = tmpsize - XV_ALIGN;
> +               set_flag(tmpblock, BLOCK_FREE);
> +               clear_flag(tmpblock, PREV_FREE);
> +
> +               set_blockprev(tmpblock, *offset);
> +               if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
> +                       insert_block(pool, *pagenum, tmpoffset, tmpblock);
> +
> +               if (tmpoffset + XV_ALIGN + tmpblock->size != PAGE_SIZE) {
> +                       tmpblock = BLOCK_NEXT(tmpblock);
> +                       set_blockprev(tmpblock, tmpoffset);
> +               }
> +       } else {
> +               /* This block is exact fit */
> +               if (tmpoffset != PAGE_SIZE)
> +                       clear_flag(tmpblock, PREV_FREE);
> +       }
> +
> +       block->size = origsize;
> +       clear_flag(block, BLOCK_FREE);
> +
> +       put_ptr_atomic(block, KM_USER0);
> +       spin_unlock(&pool->lock);
> +
> +       *offset += XV_ALIGN;
> +
> +       return 0;
> +}
> +EXPORT_SYMBOL_GPL(xv_malloc);

What's the purpose of passing PFNs around? There's quite a lot of PFN
to struct page conversion going on because of it. Wouldn't it make
more sense to return (and pass) a pointer to struct page instead?

                        Pekka

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 17:33   ` Pekka Enberg
@ 2009-08-24 17:52     ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-08-24 17:52 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: akpm, linux-kernel, linux-mm, linux-mm-cc

Hi Pekka,

On 08/24/2009 11:03 PM, Pekka Enberg wrote:
>
> [ Nit: the name xmalloc() is usually reserved for non-failing allocators in
>    user-space which is why xvmalloc() looks so confusing to me. Can we
>    get a better name for the thing? Also, I'm not sure why xvmalloc is a
>    separate module. Can't you just make it in-kernel or compile it in to the
>    ramzswap module? ]
>

xvmalloc is still a separate module to make sure I do not make it ramzswap
specific.

I am okay with renaming it to rzmalloc and compiling it with ramzswap instead
of as a separate module. I will make these changes in the next revision of these
patches.

Thanks,
Nitin



* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 17:52     ` Nitin Gupta
@ 2009-08-24 18:08       ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-08-24 18:08 UTC (permalink / raw)
  To: ngupta; +Cc: akpm, linux-kernel, linux-mm, linux-mm-cc

Hi Nitin,

On Mon, Aug 24, 2009 at 8:52 PM, Nitin Gupta<ngupta@vflare.org> wrote:
> I am okay with renaming it to rzmalloc and compiling it with ramzswap
> instead of as separate module.

Is the name rzmalloc() too similar to kzalloc() which stands for
zeroing allocator, though? I think I suggested
ramzswap_alloc()/ramzswap_free() in the past to avoid confusion. I'd
rather go with that if we can't come up with a nice generic name that
stands for alloc_part_of_page_including_highmem().

                       Pekka

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 18:08       ` Pekka Enberg
@ 2009-08-24 18:11         ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-08-24 18:11 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: akpm, linux-kernel, linux-mm, linux-mm-cc

On 08/24/2009 11:38 PM, Pekka Enberg wrote:
> Hi Nitin,
>
> On Mon, Aug 24, 2009 at 8:52 PM, Nitin Gupta<ngupta@vflare.org>  wrote:
>> I am okay with renaming it to rzmalloc and compiling it with ramzswap
>> instead of as separate module.
>
> Is the name rzmalloc() too similar to kzalloc() which stands for
> zeroing allocator, though? I think I suggested
> ramzswap_alloc()/ramzswap_free() in the past to avoid confusion. I'd
> rather go with that if we can't come up with a nice generic name that
> stands for alloc_part_of_page_including_highmem().
>

rzs_malloc()/rzs_free() ?

Nitin

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 18:11         ` Nitin Gupta
@ 2009-08-24 18:27           ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-08-24 18:27 UTC (permalink / raw)
  To: ngupta; +Cc: akpm, linux-kernel, linux-mm, linux-mm-cc

On Mon, Aug 24, 2009 at 9:11 PM, Nitin Gupta<ngupta@vflare.org> wrote:
>> Is the name rzmalloc() too similar to kzalloc() which stands for
>> zeroing allocator, though? I think I suggested
>> ramzswap_alloc()/ramzswap_free() in the past to avoid confusion. I'd
>> rather go with that if we can't come up with a nice generic name that
>> stands for alloc_part_of_page_including_highmem().
>
> rzs_malloc()/rzs_free() ?

I am not sure what we gain from the shorter and more cryptic "rzs"
prefix compared to "ramzswap" but yeah, it's less likely to be
confused with kzalloc() so I'm okay with that.

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 18:27           ` Pekka Enberg
@ 2009-08-24 18:40             ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-08-24 18:40 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: akpm, linux-kernel, linux-mm, linux-mm-cc

On 08/24/2009 11:57 PM, Pekka Enberg wrote:
> On Mon, Aug 24, 2009 at 9:11 PM, Nitin Gupta<ngupta@vflare.org>  wrote:
>>> Is the name rzmalloc() too similar to kzalloc() which stands for
>>> zeroing allocator, though? I think I suggested
>>> ramzswap_alloc()/ramzswap_free() in the past to avoid confusion. I'd
>>> rather go with that if we can't come up with a nice generic name that
>>> stands for alloc_part_of_page_including_highmem().
>>
>> rzs_malloc()/rzs_free() ?
>
> I am not sure what we gain from the shorter and more cryptic "rzs"
> prefix compared to "ramzswap" but yeah, it's less likely to be
> confused with kzalloc() so I'm okay with that.
>

Perhaps, I'm just too bad with naming :)

xvmalloc -> ramzswap_alloc() (compiled with ramzswap instead of as a separate 
module).

BTW, [rzs]control is the name of the userspace utility used to send ioctl()s to ramzswap.
Somehow, I am happy with the rzscontrol name, at least.

Thanks,
Nitin

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 17:33   ` Pekka Enberg
@ 2009-08-24 19:36     ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-08-24 19:36 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: akpm, linux-kernel, linux-mm, linux-mm-cc

Hi Pekka,

On 08/24/2009 11:03 PM, Pekka Enberg wrote:

<snip>

> On Mon, Aug 24, 2009 at 7:37 AM, Nitin Gupta<ngupta@vflare.org>  wrote:
>> +/**
>> + * xv_malloc - Allocate block of given size from pool.
>> + * @pool: pool to allocate from
>> + * @size: size of block to allocate
>> + * @pagenum: page no. that holds the object
>> + * @offset: location of object within pagenum
>> + *
>> + * On success,<pagenum, offset>  identifies block allocated
>> + * and 0 is returned. On failure,<pagenum, offset>  is set to
>> + * 0 and -ENOMEM is returned.
>> + *
>> + * Allocation requests with size>  XV_MAX_ALLOC_SIZE will fail.
>> + */
>> +int xv_malloc(struct xv_pool *pool, u32 size, u32 *pagenum, u32 *offset,
>> +                                                       gfp_t flags)

<snip>

>
> What's the purpose of passing PFNs around? There's quite a lot of PFN
> to struct page conversion going on because of it. Wouldn't it make
> more sense to return (and pass) a pointer to struct page instead?


PFNs are 32-bit on all archs, while a 'struct page *' requires 32 or 64 bits
depending on the arch. ramzswap allocates a table entry <pagenum, offset>
corresponding to every swap slot, so the size of the table would unnecessarily
increase on 64-bit archs. The same argument applies to the xvmalloc free list sizes.

Also, xvmalloc and ramzswap themselves do the PFN -> 'struct page *' conversion
only when freeing the page or when a dereferenceable pointer is needed.
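
A rough sketch of that size argument, using hypothetical structs (for
illustration only -- not the actual ramzswap/xvmalloc layouts):

#include <linux/types.h>
#include <linux/mm_types.h>	/* struct page */

/* per-slot entry holding a 32-bit PFN: 8 bytes on both 32- and 64-bit */
struct slot_entry_pfn {
	u32 pagenum;	/* PFN of the page holding the object */
	u16 offset;	/* location of the object within that page */
	u16 flags;
};

/* the same entry holding a pointer: padded to 16 bytes on 64-bit */
struct slot_entry_page {
	struct page *page;
	u16 offset;
	u16 flags;
};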

Thanks,
Nitin


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 19:36     ` Nitin Gupta
@ 2009-08-24 19:43       ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-08-24 19:43 UTC (permalink / raw)
  To: ngupta; +Cc: akpm, linux-kernel, linux-mm, linux-mm-cc

Hi Nitin,

On Mon, Aug 24, 2009 at 10:36 PM, Nitin Gupta<ngupta@vflare.org> wrote:
> On 08/24/2009 11:03 PM, Pekka Enberg wrote:
>
> <snip>
>
>> On Mon, Aug 24, 2009 at 7:37 AM, Nitin Gupta<ngupta@vflare.org>  wrote:
>>>
>>> +/**
>>> + * xv_malloc - Allocate block of given size from pool.
>>> + * @pool: pool to allocate from
>>> + * @size: size of block to allocate
>>> + * @pagenum: page no. that holds the object
>>> + * @offset: location of object within pagenum
>>> + *
>>> + * On success,<pagenum, offset>  identifies block allocated
>>> + * and 0 is returned. On failure,<pagenum, offset>  is set to
>>> + * 0 and -ENOMEM is returned.
>>> + *
>>> + * Allocation requests with size>  XV_MAX_ALLOC_SIZE will fail.
>>> + */
>>> +int xv_malloc(struct xv_pool *pool, u32 size, u32 *pagenum, u32 *offset,
>>> +                                                       gfp_t flags)
>
> <snip>
>
>>
>> What's the purpose of passing PFNs around? There's quite a lot of PFN
>> to struct page conversion going on because of it. Wouldn't it make
>> more sense to return (and pass) a pointer to struct page instead?
>
> PFNs are 32-bit on all archs while for 'struct page *', we require 32-bit or
> 64-bit depending on arch. ramzswap allocates a table entry <pagenum, offset>
> corresponding to every swap slot. So, the size of table will unnecessarily
> increase on 64-bit archs. Same is the argument for xvmalloc free list sizes.
>
> Also, xvmalloc and ramzswap itself does PFN -> 'struct page *' conversion
> only when freeing the page or to get a deferencable pointer.

I still don't see why the APIs have to work on PFNs. You can obviously do
the conversion once for store and load. Look at what the code does:
it's converting struct page to PFN just to do the reverse for kmap().
I think that could be cleaned up by passing struct page around.

                        Pekka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 19:36     ` Nitin Gupta
@ 2009-08-24 20:39       ` Hugh Dickins
  -1 siblings, 0 replies; 40+ messages in thread
From: Hugh Dickins @ 2009-08-24 20:39 UTC (permalink / raw)
  To: Nitin Gupta; +Cc: Pekka Enberg, akpm, linux-kernel, linux-mm, linux-mm-cc

On Tue, 25 Aug 2009, Nitin Gupta wrote:
> On 08/24/2009 11:03 PM, Pekka Enberg wrote:
> >
> > What's the purpose of passing PFNs around? There's quite a lot of PFN
> > to struct page conversion going on because of it. Wouldn't it make
> > more sense to return (and pass) a pointer to struct page instead?
> 
> PFNs are 32-bit on all archs

Are you sure?  If it happens to be so for all machines built today,
I think it can easily change tomorrow.  We consistently use unsigned long
for pfn (there, now I've said that, I bet you'll find somewhere we don't!)

x86_64 says MAX_PHYSMEM_BITS 46 and ia64 says MAX_PHYSMEM_BITS 50 and
mm/sparse.c says
unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);

Hugh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 20:39       ` Hugh Dickins
@ 2009-08-24 21:16         ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-08-24 21:16 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Pekka Enberg, akpm, linux-kernel, linux-mm, linux-mm-cc

On 08/25/2009 02:09 AM, Hugh Dickins wrote:
> On Tue, 25 Aug 2009, Nitin Gupta wrote:
>> On 08/24/2009 11:03 PM, Pekka Enberg wrote:
>>>
>>> What's the purpose of passing PFNs around? There's quite a lot of PFN
>>> to struct page conversion going on because of it. Wouldn't it make
>>> more sense to return (and pass) a pointer to struct page instead?
>>
>> PFNs are 32-bit on all archs
>
> Are you sure?  If it happens to be so for all machines built today,
> I think it can easily change tomorrow.  We consistently use unsigned long
> for pfn (there, now I've said that, I bet you'll find somewhere we don't!)
>
> x86_64 says MAX_PHYSMEM_BITS 46 and ia64 says MAX_PHYSMEM_BITS 50 and
> mm/sparse.c says
> unsigned long max_sparsemem_pfn = 1UL<<  (MAX_PHYSMEM_BITS-PAGE_SHIFT);
>

For a PFN to exceed 32 bits, we need physical memory > 16TB (2^32 * 4KB).
So, maybe I can simply add a check at ramzswap module load to make sure that
RAM is indeed < 16TB and then safely use 32 bits for the PFN?
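
A minimal sketch of the kind of load-time check proposed here (hypothetical
helper, not part of the posted patch), assuming max_pfn gives the highest
page frame number in the system:

#include <linux/bootmem.h>	/* max_pfn */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>	/* pr_err */

/* refuse to initialize if any PFN in the system would not fit in a u32 */
static int __init ramzswap_check_pfn_width(void)
{
	if (max_pfn > 0xffffffffUL) {
		pr_err("ramzswap: PFNs exceed 32 bits, cannot use u32 pagenum\n");
		return -EINVAL;
	}
	return 0;
}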

Thanks,
Nitin

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 19:43       ` Pekka Enberg
@ 2009-08-24 21:16         ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-08-24 21:16 UTC (permalink / raw)
  To: Pekka Enberg; +Cc: akpm, linux-kernel, linux-mm, linux-mm-cc

On 08/25/2009 01:13 AM, Pekka Enberg wrote:
> On Mon, Aug 24, 2009 at 10:36 PM, Nitin Gupta<ngupta@vflare.org>  wrote:
>> On 08/24/2009 11:03 PM, Pekka Enberg wrote:
>>
>> <snip>
>>
>>> On Mon, Aug 24, 2009 at 7:37 AM, Nitin Gupta<ngupta@vflare.org>    wrote:
>>>>
>>>> +/**
>>>> + * xv_malloc - Allocate block of given size from pool.
>>>> + * @pool: pool to allocate from
>>>> + * @size: size of block to allocate
>>>> + * @pagenum: page no. that holds the object
>>>> + * @offset: location of object within pagenum
>>>> + *
>>>> + * On success,<pagenum, offset>    identifies block allocated
>>>> + * and 0 is returned. On failure,<pagenum, offset>    is set to
>>>> + * 0 and -ENOMEM is returned.
>>>> + *
>>>> + * Allocation requests with size>    XV_MAX_ALLOC_SIZE will fail.
>>>> + */
>>>> +int xv_malloc(struct xv_pool *pool, u32 size, u32 *pagenum, u32 *offset,
>>>> +                                                       gfp_t flags)
>>
>> <snip>
>>
>>>
>>> What's the purpose of passing PFNs around? There's quite a lot of PFN
>>> to struct page conversion going on because of it. Wouldn't it make
>>> more sense to return (and pass) a pointer to struct page instead?
>>
>> PFNs are 32-bit on all archs while for 'struct page *', we require 32-bit or
>> 64-bit depending on arch. ramzswap allocates a table entry<pagenum, offset>
>> corresponding to every swap slot. So, the size of table will unnecessarily
>> increase on 64-bit archs. Same is the argument for xvmalloc free list sizes.
>>
>> Also, xvmalloc and ramzswap itself does PFN ->  'struct page *' conversion
>> only when freeing the page or to get a deferencable pointer.
>
> I still don't see why the APIs have work on PFNs. You can obviously do
> the conversion once for store and load. Look at what the code does,
> it's converting struct page to PFN just to do the reverse for kmap().
> I think that could be cleaned by passing struct page around.
>


* Allocator side:
Since the allocator stores PFNs in its internal freelists, all internal routines
naturally use PFNs instead of struct page (try changing them all to use struct
page instead to see the mess it will create). So, kmap will still end up doing
the PFN -> struct page conversion since we just pass PFNs around.

What if we convert only the interfaces, xv_malloc() and xv_free(),
to use struct page:
  - xv_malloc(): we do not save any PFN -> struct page conversion; we simply
move it from the kmap wrapper further up into the alloc routine.
  - xv_free(): same as above; we now move it down the function to pass the PFN
to the internal routines.


* ramzswap block driver side:
ramzswap also stores PFNs in its swap slot table. Thus, for the same reasons as
above, the number of conversions will not go down.


Now, if code cleanup is the aim rather than reducing the number of conversions,
then I think the use of PFNs is still preferable, due to the minor implementation
details mentioned above.

So, I think the interface should be left in its current state.

Thanks,
Nitin

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 21:16         ` Nitin Gupta
@ 2009-08-24 21:46           ` Hugh Dickins
  -1 siblings, 0 replies; 40+ messages in thread
From: Hugh Dickins @ 2009-08-24 21:46 UTC (permalink / raw)
  To: Nitin Gupta; +Cc: Pekka Enberg, akpm, linux-kernel, linux-mm, linux-mm-cc

On Tue, 25 Aug 2009, Nitin Gupta wrote:
> On 08/25/2009 02:09 AM, Hugh Dickins wrote:
> > On Tue, 25 Aug 2009, Nitin Gupta wrote:
> > > On 08/24/2009 11:03 PM, Pekka Enberg wrote:
> > > >
> > > > What's the purpose of passing PFNs around? There's quite a lot of PFN
> > > > to struct page conversion going on because of it. Wouldn't it make
> > > > more sense to return (and pass) a pointer to struct page instead?
> > >
> > > PFNs are 32-bit on all archs
> >
> > Are you sure?  If it happens to be so for all machines built today,
> > I think it can easily change tomorrow.  We consistently use unsigned long
> > for pfn (there, now I've said that, I bet you'll find somewhere we don't!)
> >
> > x86_64 says MAX_PHYSMEM_BITS 46 and ia64 says MAX_PHYSMEM_BITS 50 and
> > mm/sparse.c says
> > unsigned long max_sparsemem_pfn = 1UL<<  (MAX_PHYSMEM_BITS-PAGE_SHIFT);
> >
> 
> For PFN to exceed 32-bit we need to have physical memory > 16TB (2^32 * 4KB).
> So, maybe I can simply add a check in ramzswap module load to make sure that
> RAM is indeed < 16TB and then safely use 32-bit for PFN?

Others know much more about it, but I believe that with sparsemem you
may be handling vast holes in physical memory: so a relatively small
amount of physical memory might in part be mapped with gigantic pfns.

So if you go that route, I think you'd rather have to refuse pages
with oversized pfns (or refuse configurations with any oversized pfns),
than base it upon the quantity of physical memory in the machine.

Seems ugly to me, as it did to Pekka; but I can understand that you're
very much in the business of saving memory, so doubling the size of some
of your tables (I may be oversimplifying) would be repugnant to you.

You could add a CONFIG option, rather like CONFIG_LBDAF, to switch on
u64-sized pfns; but you'd still have to handle what happens when the
pfn is too big to fit in u32 without that option; and if distros always
switch the option on, to accommodate the larger machines, then there may
have been no point in adding it.

I'm undecided.

Hugh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 21:16         ` Nitin Gupta
@ 2009-08-25  4:26           ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-08-25  4:26 UTC (permalink / raw)
  To: ngupta; +Cc: akpm, linux-kernel, linux-mm, linux-mm-cc, hugh.dickins

Hi Nitin,

On Tue, Aug 25, 2009 at 12:16 AM, Nitin Gupta<ngupta@vflare.org> wrote:
> Now, if code cleanup is the aim rather that reducing the no. of conversions,
> then I think use of PFNs is still preferred due to minor implementation
> details mentioned above.
>
> So, I think the interface should be left in its current state.

I don't agree. For example, grow_pool() does xv_alloc_page() and
immediately passes the PFN to get_ptr_atomic() which does conversion
back to struct page. Passing PFNs around is not a good idea because
it's very non-obvious, potentially broken (the 64-bit issue Hugh
mentioned), and you lose type checking. The whole wrapper thing around
kmap() (which is also duplicated in the actual driver) is a pretty
clear indication that you're doing it the wrong way.

So again, _storing_ PFNs in internal data structures is probably a
reasonable optimization (given the 64-bit issues are sorted out), but
making the APIs work on them is not. It's much cleaner to have a few
places that do page_to_pfn() on stores and pass struct pages around.
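
A hypothetical sketch of that calling convention (names with an example_
prefix are placeholders, not the real xvmalloc/ramzswap API): the allocator
hands back a struct page, and page_to_pfn() happens only at the one place
that records the location, so the map/unmap path never needs pfn_to_page().

#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/string.h>

static int example_store(struct example_pool *pool, void *src, u32 len,
			 struct example_slot *slot)
{
	struct page *page;
	u32 offset;
	void *dst;

	/* example_alloc() is assumed to return a page + offset pair */
	if (example_alloc(pool, len, &page, &offset, GFP_NOIO))
		return -ENOMEM;

	dst = kmap_atomic(page, KM_USER0);	/* struct page used directly */
	memcpy(dst + offset, src, len);
	kunmap_atomic(dst, KM_USER0);

	slot->pagenum = page_to_pfn(page);	/* compact storage only here */
	slot->offset = offset;
	return 0;
}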

                        Pekka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 21:46           ` Hugh Dickins
@ 2009-08-25 14:52             ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-08-25 14:52 UTC (permalink / raw)
  To: Hugh Dickins; +Cc: Pekka Enberg, akpm, linux-kernel, linux-mm, linux-mm-cc

On 08/25/2009 03:16 AM, Hugh Dickins wrote:
> On Tue, 25 Aug 2009, Nitin Gupta wrote:
>> On 08/25/2009 02:09 AM, Hugh Dickins wrote:
>>> On Tue, 25 Aug 2009, Nitin Gupta wrote:
>>>> On 08/24/2009 11:03 PM, Pekka Enberg wrote:
>>>>>
>>>>> What's the purpose of passing PFNs around? There's quite a lot of PFN
>>>>> to struct page conversion going on because of it. Wouldn't it make
>>>>> more sense to return (and pass) a pointer to struct page instead?
>>>>
>>>> PFNs are 32-bit on all archs
>>>
>>> Are you sure?  If it happens to be so for all machines built today,
>>> I think it can easily change tomorrow.  We consistently use unsigned long
>>> for pfn (there, now I've said that, I bet you'll find somewhere we don't!)
>>>
>>> x86_64 says MAX_PHYSMEM_BITS 46 and ia64 says MAX_PHYSMEM_BITS 50 and
>>> mm/sparse.c says
>>> unsigned long max_sparsemem_pfn = 1UL<<   (MAX_PHYSMEM_BITS-PAGE_SHIFT);
>>>
>>
>> For PFN to exceed 32-bit we need to have physical memory>  16TB (2^32 * 4KB).
>> So, maybe I can simply add a check in ramzswap module load to make sure that
>> RAM is indeed<  16TB and then safely use 32-bit for PFN?
>
> Others know much more about it, but I believe that with sparsemem you
> may be handling vast holes in physical memory: so a relatively small
> amount of physical memory might in part be mapped with gigantic pfns.
>
> So if you go that route, I think you'd rather have to refuse pages
> with oversized pfns (or refuse configurations with any oversized pfns),
> than base it upon the quantity of physical memory in the machine.
>
> Seems ugly to me, as it did to Pekka; but I can understand that you're
> very much in the business of saving memory, so doubling the size of some
> of your tables (I may be oversimplifying) would be repugnant to you.
>
> You could add a CONFIG option, rather like CONFIG_LBDAF, to switch on
> u64-sized pfns; but you'd still have to handle what happens when the
> pfn is too big to fit in u32 without that option; and if distros always
> switch the option on, to accomodate the larger machines, then there may
> have been no point to adding it.
>

Thanks for these details.

Now I understand that the use of a 32-bit PFN on 64-bit archs is unsafe. So,
there is no option but to include extra bits for the PFN or to use struct page.

* Solution for the ramzswap block device:

Use 48-bit PFNs (32 + 8) and have a compile-time error to make sure that
MAX_PHYSMEM_BITS is < 48 + PAGE_SHIFT. The ramzswap table can accommodate
48 bits without any increase in table size.

--- ramzswap_new.h	2009-08-25 20:10:38.054033804 +0530
+++ ramzswap.h	2009-08-25 20:09:28.386069100 +0530
@@ -110,9 +110,9 @@

  /* Indexed by page no. */
  struct table {
-	u32 pagenum_1;
+	u32 pagenum;
  	u16 offset;
-	u8 pagenum_2;
+	u8 count;	/* object ref count (not yet used) */
  	u8 flags;
  };


(removal of the 'count' field will hurt later when we implement
memory defragmentation support).


* Solution for the allocator:

Use struct page instead of PFN. This is better than always using 64-bit PFNs
since we get rid of all the casts. Use of 48-bit PFNs as above would create too
much mess. However, the use of struct page increases per-pool overhead by 4K on
64-bit systems. This should be okay.


Please let me know if you have any comments. I will make these changes in the
next revision.

There is still some problem with the memory allocator naming. It's no longer a
separate module, the symbols are not exported, and it's now compiled with the
ramzswap block driver itself. So, I am hoping xv_malloc() does not cause any
confusion with any existing name now. It really should not cause any confusion.
I would love to retain this name for the allocator.

Thanks,
Nitin





^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-25 14:52             ` Nitin Gupta
@ 2009-08-25 19:03               ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-08-25 19:03 UTC (permalink / raw)
  To: ngupta
  Cc: Hugh Dickins, Pekka Enberg, akpm, linux-kernel, linux-mm, linux-mm-cc

On 08/25/2009 08:22 PM, Nitin Gupta wrote:
> On 08/25/2009 03:16 AM, Hugh Dickins wrote:
>> On Tue, 25 Aug 2009, Nitin Gupta wrote:
>>> On 08/25/2009 02:09 AM, Hugh Dickins wrote:
>>>> On Tue, 25 Aug 2009, Nitin Gupta wrote:
>>>>> On 08/24/2009 11:03 PM, Pekka Enberg wrote:
>>>>>>
>>>>>> What's the purpose of passing PFNs around? There's quite a lot of PFN
>>>>>> to struct page conversion going on because of it. Wouldn't it make
>>>>>> more sense to return (and pass) a pointer to struct page instead?
>>>>>
>>>>> PFNs are 32-bit on all archs
>>>>
>>>> Are you sure? If it happens to be so for all machines built today,
>>>> I think it can easily change tomorrow. We consistently use unsigned
>>>> long
>>>> for pfn (there, now I've said that, I bet you'll find somewhere we
>>>> don't!)
>>>>
>>>> x86_64 says MAX_PHYSMEM_BITS 46 and ia64 says MAX_PHYSMEM_BITS 50 and
>>>> mm/sparse.c says
>>>> unsigned long max_sparsemem_pfn = 1UL<< (MAX_PHYSMEM_BITS-PAGE_SHIFT);
>>>>
>>>
>>> For PFN to exceed 32-bit we need to have physical memory> 16TB (2^32
>>> * 4KB).
>>> So, maybe I can simply add a check in ramzswap module load to make
>>> sure that
>>> RAM is indeed< 16TB and then safely use 32-bit for PFN?
>>
>> Others know much more about it, but I believe that with sparsemem you
>> may be handling vast holes in physical memory: so a relatively small
>> amount of physical memory might in part be mapped with gigantic pfns.
>>
>> So if you go that route, I think you'd rather have to refuse pages
>> with oversized pfns (or refuse configurations with any oversized pfns),
>> than base it upon the quantity of physical memory in the machine.
>>
>> Seems ugly to me, as it did to Pekka; but I can understand that you're
>> very much in the business of saving memory, so doubling the size of some
>> of your tables (I may be oversimplifying) would be repugnant to you.
>>
>> You could add a CONFIG option, rather like CONFIG_LBDAF, to switch on
>> u64-sized pfns; but you'd still have to handle what happens when the
>> pfn is too big to fit in u32 without that option; and if distros always
>> switch the option on, to accomodate the larger machines, then there may
>> have been no point to adding it.
>>
>
> Thanks for these details.
>
> Now I understand that use of 32-bit PFN on 64-bit archs is unsafe. So,
> there is no option but to include extra bits for PFNs or use struct page.
>
> * Solution of ramzswap block device:
>
> Use 48 bit PFNs (32 + 8) and have a compile time error to make sure that
> that MAX_PHYSMEM_BITS is < 48 + PAGE_SHIFT. The ramzswap table can
> accommodate
> 48-bits without any increase in table size.
>


I went crazy. I meant 40 bits for the PFN -- not 48. A 40-bit PFN should be
sufficient for all archs. For archs where 40 + PAGE_SHIFT < MAX_PHYSMEM_BITS,
ramzswap will just issue a compiler error.
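
A minimal sketch of that compile-time check, assuming the table reserves
40 bits for the PFN (illustration only, not part of the posted patch):

#if MAX_PHYSMEM_BITS > (40 + PAGE_SHIFT)
#error "ramzswap: PFN may not fit in the 40-bit pagenum field"
#endif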

Thanks,
Nitin


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-24 19:36     ` Nitin Gupta
@ 2009-08-26 16:07       ` Christoph Lameter
  -1 siblings, 0 replies; 40+ messages in thread
From: Christoph Lameter @ 2009-08-26 16:07 UTC (permalink / raw)
  To: Nitin Gupta; +Cc: Pekka Enberg, akpm, linux-kernel, linux-mm, linux-mm-cc

On Tue, 25 Aug 2009, Nitin Gupta wrote:

> PFNs are 32-bit on all archs while for 'struct page *', we require 32-bit or
> 64-bit depending on arch. ramzswap allocates a table entry <pagenum, offset>
> corresponding to every swap slot. So, the size of table will unnecessarily
> increase on 64-bit archs. Same is the argument for xvmalloc free list sizes.

Wrong. PFNs must be longer than 32 bits, otherwise a system cannot
address more than 2^12 * 2^32 = 2^44 bytes => 16TB.

The type used for PFNs is unsigned long, which is 64 bits on 64-bit platforms.


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-25 19:03               ` Nitin Gupta
@ 2009-08-26 16:10                 ` Christoph Lameter
  -1 siblings, 0 replies; 40+ messages in thread
From: Christoph Lameter @ 2009-08-26 16:10 UTC (permalink / raw)
  To: Nitin Gupta
  Cc: Hugh Dickins, Pekka Enberg, akpm, linux-kernel, linux-mm, linux-mm-cc

On Wed, 26 Aug 2009, Nitin Gupta wrote:

> I went crazy. I meant 40 bits for PFN -- not 48. This 40-bit PFN should be
> sufficient for all archs. For archs where 40 + PAGE_SHIFT < MAX_PHYSMEM_BITS
> ramzswap will just issue a compiler error.

How about restricting the xvmalloc memory allocator to 32 bit? If I
understand correctly, xvmalloc's main use is on 32 bit, in order to be
able to use HIGHMEM?


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-26 16:10                 ` Christoph Lameter
@ 2009-08-26 16:17                   ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-08-26 16:17 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Hugh Dickins, Pekka Enberg, akpm, linux-kernel, linux-mm, linux-mm-cc

On 08/26/2009 09:40 PM, Christoph Lameter wrote:
> On Wed, 26 Aug 2009, Nitin Gupta wrote:
>
>> I went crazy. I meant 40 bits for PFN -- not 48. This 40-bit PFN should be
>> sufficient for all archs. For archs where 40 + PAGE_SHIFT<  MAX_PHYSMEM_BITS
>> ramzswap will just issue a compiler error.
>
> How about restricting the xvmalloc memory allocator to 32 bit? If I
> understand correctly xvmalloc main use in on 32 bit in order to be
> able to use HIGHMEM?
>
>

I have just replaced all PFN usage with struct page in xvmalloc.

The main use of xvmalloc is not just the use of HIGHMEM -- it's just one
of the things. Other reasons are:
  - O(1) alloc/free
  - Low fragmentation
  - Allocates 0-order pages to expand pools

Following gives more information:
http://code.google.com/p/compcache/wiki/xvMalloc
http://code.google.com/p/compcache/wiki/xvMallocPerformance

Thanks,
Nitin

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 1/4] compcache: xvmalloc memory allocator
  2009-08-26 16:10                 ` Christoph Lameter
@ 2009-08-26 16:19                   ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-08-26 16:19 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Nitin Gupta, Hugh Dickins, akpm, linux-kernel, linux-mm, linux-mm-cc

On Wed, 26 Aug 2009, Nitin Gupta wrote:
>> I went crazy. I meant 40 bits for PFN -- not 48. This 40-bit PFN should be
>> sufficient for all archs. For archs where 40 + PAGE_SHIFT < MAX_PHYSMEM_BITS
>> ramzswap will just issue a compiler error.

On Wed, Aug 26, 2009 at 7:10 PM, Christoph
Lameter<cl@linux-foundation.org> wrote:
> How about restricting the xvmalloc memory allocator to 32 bit? If I
> understand correctly xvmalloc main use in on 32 bit in order to be
> able to use HIGHMEM?

That was the main reason for a specialized allocator rather than
trying to use SLOB. However, if "xvmalloc" is merged with ramzswap, it
makes sense to use it on desktop class 64-bit machines as well.

                                Pekka

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2009-08-26 16:26 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-08-24  4:37 [PATCH 1/4] compcache: xvmalloc memory allocator Nitin Gupta
2009-08-24 17:33 ` Pekka Enberg
2009-08-24 17:52   ` Nitin Gupta
2009-08-24 18:08     ` Pekka Enberg
2009-08-24 18:11       ` Nitin Gupta
2009-08-24 18:27         ` Pekka Enberg
2009-08-24 18:40           ` Nitin Gupta
2009-08-24 19:36   ` Nitin Gupta
2009-08-24 19:43     ` Pekka Enberg
2009-08-24 21:16       ` Nitin Gupta
2009-08-25  4:26         ` Pekka Enberg
2009-08-24 20:39     ` Hugh Dickins
2009-08-24 21:16       ` Nitin Gupta
2009-08-24 21:46         ` Hugh Dickins
2009-08-25 14:52           ` Nitin Gupta
2009-08-25 19:03             ` Nitin Gupta
2009-08-26 16:10               ` Christoph Lameter
2009-08-26 16:17                 ` Nitin Gupta
2009-08-26 16:19                 ` Pekka Enberg
2009-08-26 16:07     ` Christoph Lameter
