All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 2/4] virtual block device driver (ramzswap)
       [not found] <200909100215.36350.ngupta@vflare.org>
@ 2009-09-09 21:19   ` Nitin Gupta
  2009-09-09 21:21   ` Nitin Gupta
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-09 21:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Pekka Enberg, Ed Tomlinson, linux-kernel, linux-mm,
	linux-mm-cc

Creates RAM based block devices (/dev/ramzswapX) which can be
used (only) as swap disks. Pages swapped to these are compressed
and stored in memory itself.

The module is called ramzswap.ko. It depends on:
 - xvmalloc.ko: memory allocator
 - lzo_compress.ko
 - lzo_decompress.ko

See Documentation/blockdev/ramzswap.txt for usage details.

Signed-off-by: Nitin Gupta <ngupta@vflare.org>
---

 drivers/block/Kconfig                 |   22 +
 drivers/block/Makefile                |    1 +
 drivers/block/ramzswap/Makefile       |    3 +
 drivers/block/ramzswap/ramzswap_drv.c | 1529 +++++++++++++++++++++++++++++++++
 drivers/block/ramzswap/ramzswap_drv.h |  183 ++++
 include/linux/ramzswap_ioctl.h        |   51 ++
 6 files changed, 1789 insertions(+), 0 deletions(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e0..4c2cac6 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -381,6 +381,28 @@ config BLK_DEV_XIP
 	  will prevent RAM block device backing store memory from being
 	  allocated from highmem (only a problem for highmem systems).
 
+config BLK_DEV_RAMZSWAP
+	tristate "Compressed in-memory swap device"
+	depends on SWAP
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
+	default m
+	help
+	  Creates virtual block devices which can be used (only) as swap
+	  disks. Pages swapped to these disks are compressed and stored in
+	  memory itself.
+
+	  See Documentation/blockdev/ramzswap.txt for more information.
+	  Project home: http://compcache.googlecode.com/
+
+config BLK_DEV_RAMZSWAP_STATS
+	bool "Enable ramzswap stats"
+	depends on BLK_DEV_RAMZSWAP
+	default y
+	help
+	  Enable statistics collection for ramzswap. This adds only a minimal
+	  overhead. In unsure, say Y.
+
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media"
 	depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8..174efcc 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM)		+= ps3vram.o
 obj-$(CONFIG_ATARI_FLOPPY)	+= ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
+obj-$(CONFIG_BLK_DEV_RAMZSWAP)	+= ramzswap/
 obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
 obj-$(CONFIG_BLK_DEV_XD)	+= xd.o
 obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
diff --git a/drivers/block/ramzswap/Makefile b/drivers/block/ramzswap/Makefile
new file mode 100644
index 0000000..94e99b5
--- /dev/null
+++ b/drivers/block/ramzswap/Makefile
@@ -0,0 +1,3 @@
+ramzswap-objs	:=	ramzswap_drv.o xvmalloc.o
+
+obj-$(CONFIG_BLK_DEV_RAMZSWAP)	+=	ramzswap.o
diff --git a/drivers/block/ramzswap/ramzswap_drv.c b/drivers/block/ramzswap/ramzswap_drv.c
new file mode 100644
index 0000000..c475c45
--- /dev/null
+++ b/drivers/block/ramzswap/ramzswap_drv.c
@@ -0,0 +1,1529 @@
+/*
+ * Compressed RAM based swap device
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ * Project home: http://compcache.googlecode.com
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/lzo.h>
+#include <linux/marker.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+
+#include "ramzswap_drv.h"
+
+/* Globals */
+static int RAMZSWAP_MAJOR;
+static struct ramzswap *DEVICES;
+
+/*
+ * Pages that compress to larger than this size are
+ * forwarded to backing swap, if present or stored
+ * uncompressed in memory otherwise.
+ */
+static unsigned int MAX_CPAGE_SIZE;
+
+/* Module params (documentation at end) */
+static unsigned long NUM_DEVICES;
+
+/* Function declarations */
+static int __init ramzswap_init(void);
+static int ramzswap_ioctl(struct block_device *, fmode_t,
+			unsigned, unsigned long);
+static int setup_swap_header(struct ramzswap *, union swap_header *);
+static void ramzswap_set_memlimit(struct ramzswap *, size_t);
+static void ramzswap_set_disksize(struct ramzswap *, size_t);
+static void reset_device(struct ramzswap *rzs);
+
+static struct block_device_operations ramzswap_devops = {
+	.ioctl = ramzswap_ioctl,
+	.owner = THIS_MODULE,
+};
+
+static int test_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
+{
+	return rzs->table[index].flags & BIT(flag);
+}
+
+static void set_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
+{
+	rzs->table[index].flags |= BIT(flag);
+}
+
+static void clear_flag(struct ramzswap *rzs, u32 index,
+					enum rzs_pageflags flag)
+{
+	rzs->table[index].flags &= ~BIT(flag);
+}
+
+static int page_zero_filled(void *ptr)
+{
+	u32 pos;
+	u64 *page;
+
+	page = (u64 *)ptr;
+
+	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
+		if (page[pos])
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Given <pagenum, offset> pair, provide a dereferencable pointer.
+ */
+static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
+{
+	unsigned char *base;
+
+	base = kmap_atomic(page, type);
+	return base + offset;
+}
+
+static void put_ptr_atomic(void *ptr, enum km_type type)
+{
+	kunmap_atomic(ptr, type);
+}
+
+static void ramzswap_flush_dcache_page(struct page *page)
+{
+#ifdef CONFIG_ARM
+	int flag = 0;
+	/*
+	 * Ugly hack to get flush_dcache_page() work on ARM.
+	 * page_mapping(page) == NULL after clearing this swap cache flag.
+	 * Without clearing this flag, flush_dcache_page() will simply set
+	 * "PG_dcache_dirty" bit and return.
+	 */
+	if (PageSwapCache(page)) {
+		flag = 1;
+		ClearPageSwapCache(page);
+	}
+#endif
+	flush_dcache_page(page);
+#ifdef CONFIG_ARM
+	if (flag)
+		SetPageSwapCache(page);
+#endif
+}
+
+void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
+			struct ramzswap_ioctl_stats *s)
+{
+	strncpy(s->backing_swap_name, rzs->backing_swap_name,
+		MAX_SWAP_NAME_LEN - 1);
+	s->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
+
+	s->disksize = rzs->disksize;
+	s->memlimit = rzs->memlimit;
+
+#if defined(STATS)
+	{
+	struct ramzswap_stats *rs = &rzs->stats;
+	size_t succ_writes, mem_used;
+	unsigned int good_compress_perc = 0, no_compress_perc = 0;
+
+	mem_used = xv_get_total_size_bytes(rzs->mem_pool)
+			+ (rs->pages_expand << PAGE_SHIFT);
+	succ_writes = rs->num_writes - rs->failed_writes;
+
+	if (succ_writes && rs->pages_stored) {
+		good_compress_perc = rs->good_compress * 100
+					/ rs->pages_stored;
+		no_compress_perc = rs->pages_expand * 100
+					/ rs->pages_stored;
+	}
+
+	s->num_reads = rs->num_reads;
+	s->num_writes = rs->num_writes;
+	s->failed_reads = rs->failed_reads;
+	s->failed_writes = rs->failed_writes;
+	s->invalid_io = rs->invalid_io;
+	s->notify_free = rs->notify_free;
+	s->pages_discard = rs->pages_discard;
+	s->pages_zero = rs->pages_zero;
+
+	s->good_compress_pct = good_compress_perc;
+	s->pages_expand_pct = no_compress_perc;
+
+	s->pages_stored = rs->pages_stored;
+	s->pages_used = mem_used >> PAGE_SHIFT;
+	s->orig_data_size = rs->pages_stored << PAGE_SHIFT;
+	s->compr_data_size = rs->compr_size;
+	s->mem_used_total = mem_used;
+
+	s->bdev_num_reads = rs->bdev_num_reads;
+	s->bdev_num_writes = rs->bdev_num_writes;
+	}
+#endif /* STATS */
+}
+
+static int add_backing_swap_extent(struct ramzswap *rzs,
+				pgoff_t phy_pagenum,
+				pgoff_t num_pages)
+{
+	unsigned int idx;
+	struct list_head *head;
+	struct page *curr_page, *new_page;
+	unsigned int extents_per_page = PAGE_SIZE /
+				sizeof(struct ramzswap_backing_extent);
+
+	idx = rzs->num_extents % extents_per_page;
+	if (!idx) {
+		new_page = alloc_page(__GFP_ZERO);
+		if (!new_page)
+			return -ENOMEM;
+
+		if (rzs->num_extents) {
+			curr_page = virt_to_page(rzs->curr_extent);
+			head = &curr_page->lru;
+		} else {
+			head = &rzs->backing_swap_extent_list;
+		}
+
+		list_add(&new_page->lru, head);
+		rzs->curr_extent = page_address(new_page);
+	}
+
+	rzs->curr_extent->phy_pagenum = phy_pagenum;
+	rzs->curr_extent->num_pages = num_pages;
+
+	pr_debug(C "extent: [%lu, %lu] %lu\n", phy_pagenum,
+		phy_pagenum + num_pages - 1, num_pages);
+
+	if (idx != extents_per_page - 1)
+		rzs->curr_extent++;
+
+	return 0;
+}
+
+static int setup_backing_swap_extents(struct ramzswap *rzs,
+				struct inode *inode, unsigned long *num_pages)
+{
+	int ret = 0;
+	unsigned blkbits;
+	unsigned blocks_per_page;
+	pgoff_t contig_pages = 0, total_pages = 0;
+	pgoff_t pagenum = 0, prev_pagenum = 0;
+	sector_t probe_block = 0;
+	sector_t last_block;
+
+	blkbits = inode->i_blkbits;
+	blocks_per_page = PAGE_SIZE >> blkbits;
+
+	last_block = i_size_read(inode) >> blkbits;
+	while (probe_block + blocks_per_page <= last_block) {
+		unsigned block_in_page;
+		sector_t first_block;
+
+		first_block = bmap(inode, probe_block);
+		if (first_block == 0)
+			goto bad_bmap;
+
+		/* It must be PAGE_SIZE aligned on-disk */
+		if (first_block & (blocks_per_page - 1)) {
+			probe_block++;
+			goto probe_next;
+		}
+
+		/* All blocks within this page must be contiguous on disk */
+		for (block_in_page = 1; block_in_page < blocks_per_page;
+					block_in_page++) {
+			sector_t block;
+
+			block = bmap(inode, probe_block + block_in_page);
+			if (block == 0)
+				goto bad_bmap;
+			if (block != first_block + block_in_page) {
+				/* Discontiguity */
+				probe_block++;
+				goto probe_next;
+			}
+		}
+
+		/*
+		 * We found a PAGE_SIZE length, PAGE_SIZE aligned
+		 * run of blocks.
+		 */
+		pagenum = first_block >> (PAGE_SHIFT - blkbits);
+
+		if (total_pages && (pagenum != prev_pagenum + 1)) {
+			ret = add_backing_swap_extent(rzs, prev_pagenum -
+					(contig_pages - 1), contig_pages);
+			if (ret < 0)
+				goto out;
+			rzs->num_extents++;
+			contig_pages = 0;
+		}
+		total_pages++;
+		contig_pages++;
+		prev_pagenum = pagenum;
+		probe_block += blocks_per_page;
+
+probe_next:
+		continue;
+	}
+
+	if (contig_pages) {
+		pr_debug(C "adding last extent: pagenum=%lu, "
+			"contig_pages=%lu\n", pagenum, contig_pages);
+		ret = add_backing_swap_extent(rzs,
+			prev_pagenum - (contig_pages - 1), contig_pages);
+		if (ret < 0)
+			goto out;
+		rzs->num_extents++;
+	}
+	if (!rzs->num_extents) {
+		pr_err(C "No swap extents found!\n");
+		ret = -EINVAL;
+	}
+
+	if (!ret) {
+		*num_pages = total_pages;
+		pr_info(C "Found %lu extents containing %luk\n",
+			rzs->num_extents, *num_pages << (PAGE_SHIFT - 10));
+	}
+	goto out;
+
+bad_bmap:
+	pr_err(C "Backing swapfile has holes\n");
+	ret = -EINVAL;
+out:
+	while (ret && !list_empty(&rzs->backing_swap_extent_list)) {
+		struct page *page;
+		struct list_head *entry = rzs->backing_swap_extent_list.next;
+		page = list_entry(entry, struct page, lru);
+		list_del(entry);
+		__free_page(page);
+	}
+	return ret;
+}
+
+static void map_backing_swap_extents(struct ramzswap *rzs)
+{
+	struct ramzswap_backing_extent *se;
+	struct page *table_page, *se_page;
+	unsigned long num_pages, num_table_pages, entry;
+	unsigned long se_idx, span;
+	unsigned entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
+	unsigned extents_per_page = PAGE_SIZE / sizeof(*se);
+
+	/* True for block device */
+	if (!rzs->num_extents)
+		return;
+
+	se_page = list_entry(rzs->backing_swap_extent_list.next,
+					struct page, lru);
+	se = page_address(se_page);
+	span = se->num_pages;
+	num_pages = rzs->disksize >> PAGE_SHIFT;
+	num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
+							PAGE_SIZE);
+
+	entry = 0;
+	se_idx = 0;
+	while (num_table_pages--) {
+		table_page = vmalloc_to_page(&rzs->table[entry]);
+		while (span <= entry) {
+			se_idx++;
+			if (se_idx == rzs->num_extents)
+				BUG();
+
+			if (!(se_idx % extents_per_page)) {
+				se_page = list_entry(se_page->lru.next,
+						struct page, lru);
+				se = page_address(se_page);
+			} else
+				se++;
+
+			span += se->num_pages;
+		}
+		table_page->mapping = (struct address_space *)se;
+		table_page->private = se->num_pages - (span - entry);
+		pr_debug(C "map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
+			entry, span, table_page->mapping, table_page->private);
+		entry += entries_per_page;
+	}
+}
+
+/*
+ * Check if value of backing_swap module param is sane.
+ * Claim this device and set ramzswap size equal to
+ * size of this block device.
+ */
+static int setup_backing_swap(struct ramzswap *rzs)
+{
+	int ret = 0;
+	size_t disksize;
+	unsigned long num_pages = 0;
+	struct inode *inode;
+	struct file *swap_file;
+	struct address_space *mapping;
+	struct block_device *bdev = NULL;
+
+	if (!rzs->backing_swap_name[0]) {
+		pr_debug(C "backing_swap param not given\n");
+		goto out;
+	}
+
+	pr_info(C "Using backing swap device: %s\n", rzs->backing_swap_name);
+
+	swap_file = filp_open(rzs->backing_swap_name,
+					O_RDWR | O_LARGEFILE, 0);
+	if (IS_ERR(swap_file)) {
+		pr_err(C "Error opening backing device: %s\n",
+						rzs->backing_swap_name);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mapping = swap_file->f_mapping;
+	inode = mapping->host;
+
+	if (S_ISBLK(inode->i_mode)) {
+		bdev = I_BDEV(inode);
+		ret = bd_claim(bdev, setup_backing_swap);
+		if (ret < 0) {
+			bdev = NULL;
+			goto bad_param;
+		}
+		disksize = i_size_read(inode);
+	} else if (S_ISREG(inode->i_mode)) {
+		bdev = inode->i_sb->s_bdev;
+		if (IS_SWAPFILE(inode)) {
+			ret = -EBUSY;
+			goto bad_param;
+		}
+		ret = setup_backing_swap_extents(rzs, inode, &num_pages);
+		if (ret < 0)
+			goto bad_param;
+		disksize = num_pages << PAGE_SHIFT;
+	} else {
+		goto bad_param;
+	}
+
+	rzs->swap_file = swap_file;
+	rzs->backing_swap = bdev;
+	rzs->disksize = disksize;
+	BUG_ON(!rzs->disksize);
+
+	return 0;
+
+bad_param:
+	if (bdev)
+		bd_release(bdev);
+	filp_close(swap_file, NULL);
+
+out:
+	rzs->backing_swap = NULL;
+	return ret;
+}
+
+/*
+ * Map logical page number 'pagenum' to physical page number
+ * on backing swap device. For block device, this is a nop.
+ */
+pgoff_t map_backing_swap_page(struct ramzswap *rzs, pgoff_t pagenum)
+{
+	u32 skip_pages, entries_per_page;
+	size_t delta, se_offset, skipped;
+	struct page *table_page, *se_page;
+	struct ramzswap_backing_extent *se;
+
+	if (!rzs->num_extents)
+		return pagenum;
+
+	entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
+
+	table_page = vmalloc_to_page(&rzs->table[pagenum]);
+	se = (struct ramzswap_backing_extent *)table_page->mapping;
+	se_page = virt_to_page(se);
+
+	skip_pages = pagenum - (pagenum / entries_per_page * entries_per_page);
+	se_offset = table_page->private + skip_pages;
+
+	if (se_offset < se->num_pages) {
+		pr_debug(C "%lu -> %lu\n",
+			pagenum, se->phy_pagenum + se_offset);
+		return se->phy_pagenum + se_offset;
+	}
+
+	skipped = se->num_pages - table_page->private;
+	do {
+		struct ramzswap_backing_extent *se_base;
+		u32 se_entries_per_page = PAGE_SIZE / sizeof(*se);
+
+		/* Get next swap extent */
+		se_base = (struct ramzswap_backing_extent *)
+						page_address(se_page);
+		if (se - se_base == se_entries_per_page - 1) {
+			se_page = list_entry(se_page->lru.next,
+						struct page, lru);
+			se = page_address(se_page);
+		} else {
+			se++;
+		}
+
+		skipped += se->num_pages;
+	} while (skipped < skip_pages);
+
+	delta = skipped - skip_pages;
+	se_offset = se->num_pages - delta;
+
+	pr_debug(C "%lu -> %lu\n",
+		pagenum, se->phy_pagenum + se_offset);
+	return se->phy_pagenum + se_offset;
+}
+
+/*
+ * Check if request is within bounds and page aligned.
+ */
+static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio)
+{
+	if (unlikely(
+		(bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) ||
+		(bio->bi_sector & (SECTORS_PER_PAGE - 1)) ||
+		(bio->bi_vcnt != 1) ||
+		(bio->bi_size != PAGE_SIZE) ||
+		(bio->bi_io_vec[0].bv_offset != 0))) {
+
+		return 0;
+	}
+
+	/* swap request is valid */
+	return 1;
+}
+
+static void ramzswap_free_page(struct ramzswap *rzs, size_t index)
+{
+	u32 clen;
+	void *obj;
+
+	struct page *page = rzs->table[index].page;
+	u32 offset = rzs->table[index].offset;
+
+	if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED))) {
+		clen = PAGE_SIZE;
+		__free_page(page);
+		clear_flag(rzs, index, RZS_UNCOMPRESSED);
+		stat_dec(rzs->stats.pages_expand);
+		goto out;
+	}
+
+	obj = get_ptr_atomic(page, offset, KM_USER0);
+	clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
+	put_ptr_atomic(obj, KM_USER0);
+
+	xv_free(rzs->mem_pool, page, offset);
+	stat_dec_if_less(rzs->stats.good_compress, clen, PAGE_SIZE / 2 + 1);
+
+out:
+	rzs->stats.compr_size -= clen;
+	stat_dec(rzs->stats.pages_stored);
+
+	rzs->table[index].page = NULL;
+	rzs->table[index].offset = 0;
+}
+
+/*
+ * callback function called when swap_map[offset] == 0
+ * i.e page at this swap offset is no longer used
+ */
+static void ramzswap_free_notify(struct block_device *bdev,
+				unsigned long index)
+{
+	struct ramzswap *rzs = bdev->bd_disk->private_data;
+
+	if (rzs->table[index].page) {
+		ramzswap_free_page(rzs, index);
+		stat_inc(rzs->stats.notify_free);
+	}
+}
+
+static int ramzswap_prepare_discard(struct request_queue *q,
+				struct request *req)
+{
+	return 0;
+}
+
+/*
+ * Called by main I/O handler function. This helper
+ * function handles 'discard' I/O requests which means
+ * that  some swap pages are no longer required, so
+ * swap device can take needed action -- we free memory
+ * allocated for these pages.
+ */
+static int ramzswap_discard(struct ramzswap *rzs, struct bio *bio)
+{
+	size_t index, start_page, num_pages;
+
+	start_page = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+	num_pages = bio->bi_size >> (SECTOR_SHIFT + SECTORS_PER_PAGE_SHIFT);
+
+	for (index = start_page; index < start_page + num_pages; index++) {
+		if (rzs->table[index].page) {
+			ramzswap_free_page(rzs, index);
+			stat_inc(rzs->stats.pages_discard);
+		}
+	}
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+static int handle_zero_page(struct bio *bio)
+{
+	void *user_mem;
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	user_mem = get_ptr_atomic(page, 0, KM_USER0);
+	memset(user_mem, 0, PAGE_SIZE);
+	put_ptr_atomic(user_mem, KM_USER0);
+
+	ramzswap_flush_dcache_page(page);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
+{
+	u32 index;
+	struct page *page;
+	unsigned char *user_mem, *cmem;
+
+	page = bio->bi_io_vec[0].bv_page;
+	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+
+	user_mem = get_ptr_atomic(page, 0, KM_USER0);
+	cmem = get_ptr_atomic(rzs->table[index].page,
+			rzs->table[index].offset, KM_USER1);
+
+	memcpy(user_mem, cmem, PAGE_SIZE);
+	put_ptr_atomic(user_mem, KM_USER0);
+	put_ptr_atomic(cmem, KM_USER1);
+
+	ramzswap_flush_dcache_page(page);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+
+/*
+ * Called when request page is not present in ramzswap.
+ * Its either in backing swap device (if present) or
+ * this is an attempt to read before any previous write
+ * to this location - this happens due to readahead when
+ * swap device is read from user-space (e.g. during swapon)
+ */
+static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
+{
+	/*
+	 * Always forward such requests to backing swap
+	 * device (if present)
+	 */
+	if (rzs->backing_swap) {
+		pgoff_t pagenum;
+		stat_dec(rzs->stats.num_reads);
+		stat_inc(rzs->stats.bdev_num_reads);
+		bio->bi_bdev = rzs->backing_swap;
+
+		/*
+		 * In case backing swap is a file, find the right offset within
+		 * the file corresponding to logical position 'index'. For block
+		 * device, this is a nop.
+		 */
+		pagenum = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+		bio->bi_sector = map_backing_swap_page(rzs, pagenum)
+					<< SECTORS_PER_PAGE_SHIFT;
+		return 1;
+	}
+
+	/*
+	 * Its unlikely event in case backing dev is
+	 * not present
+	 */
+	pr_debug(C "Read before write on swap device: "
+		"sector=%lu, size=%u, offset=%u\n",
+		(ulong)(bio->bi_sector), bio->bi_size,
+		bio->bi_io_vec[0].bv_offset);
+
+	/* Do nothing. Just return success */
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
+{
+	int ret;
+	u32 index;
+	size_t clen;
+	struct page *page;
+	struct zobj_header *zheader;
+	unsigned char *user_mem, *cmem;
+
+	stat_inc(rzs->stats.num_reads);
+
+	page = bio->bi_io_vec[0].bv_page;
+	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+
+	if (unlikely(!rzs->init_notify_callback) && PageSwapCache(page)) {
+		set_swap_free_notify(bio->bi_bdev, ramzswap_free_notify);
+		rzs->init_notify_callback = 1;
+	}
+
+	if (test_flag(rzs, index, RZS_ZERO))
+		return handle_zero_page(bio);
+
+	/* Requested page is not present in compressed area */
+	if (!rzs->table[index].page)
+		return handle_ramzswap_fault(rzs, bio);
+
+	/* Page is stored uncompressed since its incompressible */
+	if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
+		return handle_uncompressed_page(rzs, bio);
+
+	user_mem = get_ptr_atomic(page, 0, KM_USER0);
+	clen = PAGE_SIZE;
+
+	cmem = get_ptr_atomic(rzs->table[index].page,
+			rzs->table[index].offset, KM_USER1);
+
+	ret = lzo1x_decompress_safe(
+		cmem + sizeof(*zheader),
+		xv_get_object_size(cmem) - sizeof(*zheader),
+		user_mem, &clen);
+
+	put_ptr_atomic(user_mem, KM_USER0);
+	put_ptr_atomic(cmem, KM_USER1);
+
+	/* should NEVER happen */
+	if (unlikely(ret != LZO_E_OK)) {
+		pr_err(C "Decompression failed! err=%d, page=%u\n",
+			ret, index);
+		stat_inc(rzs->stats.failed_reads);
+		goto out;
+	}
+
+	ramzswap_flush_dcache_page(page);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+
+out:
+	bio_io_error(bio);
+	return 0;
+}
+
+static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
+{
+	int ret, fwd_write_request = 0;
+	u32 offset;
+	size_t clen;
+	pgoff_t index;
+	struct zobj_header *zheader;
+	struct page *page, *page_store;
+	unsigned char *user_mem, *cmem, *src;
+
+	stat_inc(rzs->stats.num_writes);
+
+	page = bio->bi_io_vec[0].bv_page;
+	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+
+	src = rzs->compress_buffer;
+
+	/*
+	 * System swaps to same sector again when the stored page
+	 * is no longer referenced by any process. So, its now safe
+	 * to free the memory that was allocated for this page.
+	 */
+	if (rzs->table[index].page)
+		ramzswap_free_page(rzs, index);
+
+	/*
+	 * No memory ia allocated for zero filled pages.
+	 * Simply clear zero page flag.
+	 */
+	if (test_flag(rzs, index, RZS_ZERO)) {
+		stat_dec(rzs->stats.pages_zero);
+		clear_flag(rzs, index, RZS_ZERO);
+	}
+
+	trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
+	mutex_lock(&rzs->lock);
+	trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
+
+	user_mem = get_ptr_atomic(page, 0, KM_USER0);
+	if (page_zero_filled(user_mem)) {
+		put_ptr_atomic(user_mem, KM_USER0);
+		mutex_unlock(&rzs->lock);
+		stat_inc(rzs->stats.pages_zero);
+		set_flag(rzs, index, RZS_ZERO);
+
+		set_bit(BIO_UPTODATE, &bio->bi_flags);
+		bio_endio(bio, 0);
+		return 0;
+	}
+
+	if (rzs->backing_swap &&
+		(rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
+		put_ptr_atomic(user_mem, KM_USER0);
+		mutex_unlock(&rzs->lock);
+		fwd_write_request = 1;
+		goto out;
+	}
+
+	ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
+				rzs->compress_workmem);
+
+	put_ptr_atomic(user_mem, KM_USER0);
+
+	if (unlikely(ret != LZO_E_OK)) {
+		mutex_unlock(&rzs->lock);
+		pr_err(C "Compression failed! err=%d\n", ret);
+		stat_inc(rzs->stats.failed_writes);
+		goto out;
+	}
+
+	/*
+	 * Page is incompressible. Forward it to backing swap
+	 * if present. Otherwise, store it as-is (uncompressed)
+	 * since we do not want to return too many swap write
+	 * errors which has side effect of hanging the system.
+	 */
+	if (unlikely(clen > MAX_CPAGE_SIZE)) {
+		if (rzs->backing_swap) {
+			mutex_unlock(&rzs->lock);
+			fwd_write_request = 1;
+			goto out;
+		}
+
+		clen = PAGE_SIZE;
+		page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+		if (unlikely(!page_store)) {
+			mutex_unlock(&rzs->lock);
+			pr_info(C "Error allocating memory for incompressible "
+				"page: %lu\n", index);
+			stat_inc(rzs->stats.failed_writes);
+			goto out;
+		}
+
+		offset = 0;
+		set_flag(rzs, index, RZS_UNCOMPRESSED);
+		stat_inc(rzs->stats.pages_expand);
+		rzs->table[index].page = page_store;
+		src = get_ptr_atomic(page, 0, KM_USER0);
+		goto memstore;
+	}
+
+	if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
+			&rzs->table[index].page, &offset,
+			GFP_NOIO | __GFP_HIGHMEM)) {
+		mutex_unlock(&rzs->lock);
+		pr_info(C "Error allocating memory for compressed "
+			"page: %lu, size=%zu\n", index, clen);
+		stat_inc(rzs->stats.failed_writes);
+		if (rzs->backing_swap)
+			fwd_write_request = 1;
+		goto out;
+	}
+
+memstore:
+	rzs->table[index].offset = offset;
+
+	cmem = get_ptr_atomic(rzs->table[index].page,
+			rzs->table[index].offset, KM_USER1);
+
+#if 0
+	/* Back-reference needed for memory defragmentation */
+	if (!test_flag(rzs, index, RZS_UNCOMPRESSED)) {
+		zheader = (struct zobj_header *)cmem;
+		zheader->table_idx = index;
+		cmem += sizeof(*zheader);
+	}
+#endif
+
+	memcpy(cmem, src, clen);
+
+	put_ptr_atomic(cmem, KM_USER1);
+	if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
+		put_ptr_atomic(src, KM_USER0);
+
+	/* Update stats */
+	rzs->stats.compr_size += clen;
+	stat_inc(rzs->stats.pages_stored);
+	stat_inc_if_less(rzs->stats.good_compress, clen, PAGE_SIZE / 2 + 1);
+
+	mutex_unlock(&rzs->lock);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+
+out:
+	if (fwd_write_request) {
+		stat_inc(rzs->stats.bdev_num_writes);
+		bio->bi_bdev = rzs->backing_swap;
+#if 0
+		/*
+		 * TODO: We currently have linear mapping of ramzswap and
+		 * backing swap sectors. This is not desired since we want
+		 * to optimize writes to backing swap to minimize disk seeks
+		 * or have effective wear leveling (for SSDs). Also, a
+		 * non-linear mapping is required to implement compressed
+		 * on-disk swapping.
+		 */
+		 bio->bi_sector = get_backing_swap_page()
+					<< SECTORS_PER_PAGE_SHIFT;
+#endif
+		/*
+		 * In case backing swap is a file, find the right offset within
+		 * the file corresponding to logical position 'index'. For block
+		 * device, this is a nop.
+		 */
+		bio->bi_sector = map_backing_swap_page(rzs, index)
+					<< SECTORS_PER_PAGE_SHIFT;
+		return 1;
+	}
+
+	bio_io_error(bio);
+	return 0;
+}
+
+/*
+ * Handler function for all ramzswap I/O requests.
+ */
+static int ramzswap_make_request(struct request_queue *queue, struct bio *bio)
+{
+	int ret = 0;
+	struct ramzswap *rzs = queue->queuedata;
+
+	if (unlikely(!rzs->init_done)) {
+		bio_io_error(bio);
+		return 0;
+	}
+
+	if (bio_discard(bio))
+		return ramzswap_discard(rzs, bio);
+
+	if (!valid_swap_request(rzs, bio)) {
+		stat_inc(rzs->stats.invalid_io);
+		bio_io_error(bio);
+		return 0;
+	}
+
+	switch (bio_data_dir(bio)) {
+	case READ:
+		ret = ramzswap_read(rzs, bio);
+		break;
+
+	case WRITE:
+		ret = ramzswap_write(rzs, bio);
+		break;
+	}
+
+	return ret;
+}
+
+static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
+{
+	int ret;
+	size_t num_pages, totalram_bytes;
+	struct sysinfo i;
+	struct page *page;
+	union swap_header *swap_header;
+
+	if (rzs->init_done) {
+		pr_info(C "Device already initialized!\n");
+		return -EBUSY;
+	}
+
+	ret = setup_backing_swap(rzs);
+	if (ret)
+		goto fail;
+
+	si_meminfo(&i);
+	/* Here is a trivia: guess unit used for i.totalram !! */
+	totalram_bytes = i.totalram << PAGE_SHIFT;
+
+	if (rzs->backing_swap)
+		ramzswap_set_memlimit(rzs, totalram_bytes);
+	else
+		ramzswap_set_disksize(rzs, totalram_bytes);
+
+	rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+	if (rzs->compress_workmem == NULL) {
+		pr_err(C "Error allocating compressor working memory!\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);
+	if (rzs->compress_buffer == NULL) {
+		pr_err(C "Error allocating compressor buffer space\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	num_pages = rzs->disksize >> PAGE_SHIFT;
+	rzs->table = vmalloc(num_pages * sizeof(*rzs->table));
+	if (rzs->table == NULL) {
+		pr_err(C "Error allocating ramzswap address table\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	memset(rzs->table, 0, num_pages * sizeof(*rzs->table));
+
+	map_backing_swap_extents(rzs);
+
+	page = alloc_page(__GFP_ZERO);
+	if (page == NULL) {
+		pr_err(C "Error allocating swap header page\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	rzs->table[0].page = page;
+	set_flag(rzs, 0, RZS_UNCOMPRESSED);
+
+	swap_header = kmap(page);
+	ret = setup_swap_header(rzs, swap_header);
+	kunmap(page);
+	if (ret) {
+		pr_err(C "Error setting swap header\n");
+		goto fail;
+	}
+
+	set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);
+
+	/*
+	 * We have ident mapping of sectors for ramzswap and
+	 * and the backing swap device. So, this queue flag
+	 * should be according to backing dev.
+	 */
+	if (!rzs->backing_swap ||
+			blk_queue_nonrot(rzs->backing_swap->bd_disk->queue))
+		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rzs->disk->queue);
+
+	blk_queue_set_discard(rzs->disk->queue, ramzswap_prepare_discard);
+	blk_queue_logical_block_size(rzs->disk->queue, PAGE_SIZE);
+
+	rzs->mem_pool = xv_create_pool();
+	if (!rzs->mem_pool) {
+		pr_err(C "Error creating memory pool\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	/*
+	 * Pages that compress to size greater than this are forwarded
+	 * to physical swap disk (if backing dev is provided)
+	 * TODO: make this configurable
+	 */
+	if (rzs->backing_swap)
+		MAX_CPAGE_SIZE = MAX_CPAGE_SIZE_BDEV;
+	else
+		MAX_CPAGE_SIZE = MAX_CPAGE_SIZE_NOBDEV;
+
+	pr_debug(C "Max compressed page size: %u bytes\n", MAX_CPAGE_SIZE);
+
+	rzs->init_done = 1;
+
+	pr_debug(C "Initialization done!\n");
+	return 0;
+
+fail:
+	if (rzs->table && rzs->table[0].page)
+		__free_page(rzs->table[0].page);
+	kfree(rzs->compress_workmem);
+	kfree(rzs->compress_buffer);
+	vfree(rzs->table);
+	xv_destroy_pool(rzs->mem_pool);
+
+	pr_err(C "Initialization failed: err=%d\n", ret);
+	return ret;
+}
+
+static int ramzswap_ioctl_reset_device(struct ramzswap *rzs)
+{
+	reset_device(rzs);
+	return 0;
+}
+
+static int ramzswap_ioctl(struct block_device *bdev, fmode_t mode,
+			unsigned int cmd, unsigned long arg)
+{
+	int ret = 0;
+	size_t disksize_kb, memlimit_kb;
+
+	struct ramzswap *rzs = bdev->bd_disk->private_data;
+
+	switch (cmd) {
+	case RZSIO_SET_DISKSIZE_KB:
+		if (rzs->init_done) {
+			ret = -EBUSY;
+			goto out;
+		}
+		if (copy_from_user(&disksize_kb, (void *)arg,
+						_IOC_SIZE(cmd))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		rzs->disksize = disksize_kb << 10;
+		pr_info(C "Disk size set to %zu KB\n", disksize_kb);
+		break;
+
+	case RZSIO_SET_MEMLIMIT_KB:
+		if (rzs->init_done) {
+			/* TODO: allow changing memlimit */
+			ret = -EBUSY;
+			goto out;
+		}
+		if (copy_from_user(&memlimit_kb, (void *)arg,
+						_IOC_SIZE(cmd))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		rzs->memlimit = memlimit_kb << 10;
+		pr_info(C "Memory limit set to %zu KB\n", memlimit_kb);
+		break;
+
+	case RZSIO_SET_BACKING_SWAP:
+		if (rzs->init_done) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		if (copy_from_user(&rzs->backing_swap_name, (void *)arg,
+						_IOC_SIZE(cmd))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
+		pr_info(C "Backing swap set to %s\n", rzs->backing_swap_name);
+		break;
+
+	case RZSIO_GET_STATS:
+	{
+		struct ramzswap_ioctl_stats *stats;
+		if (!rzs->init_done) {
+			ret = -ENOTTY;
+			goto out;
+		}
+		stats = kzalloc(sizeof(*stats), GFP_KERNEL);
+		if (!stats) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ramzswap_ioctl_get_stats(rzs, stats);
+		if (copy_to_user((void *)arg, stats, sizeof(*stats))) {
+			kfree(stats);
+			ret = -EFAULT;
+			goto out;
+		}
+		kfree(stats);
+		break;
+	}
+	case RZSIO_INIT:
+		ret = ramzswap_ioctl_init_device(rzs);
+		break;
+
+	case RZSIO_RESET:
+		/* Do not reset an active device! */
+		if (bdev->bd_holders) {
+			ret = -EBUSY;
+			goto out;
+		}
+		ret = ramzswap_ioctl_reset_device(rzs);
+		/*
+		 * Racy! Device has already been swapoff'ed.  Bad things
+		 * can happen if another swapon is done before this reset.
+		 * TODO: A callback from swapoff() will solve this problem.
+		 */
+		set_swap_free_notify(bdev, NULL);
+		rzs->init_notify_callback = 0;
+		break;
+
+	default:
+		pr_info(C "Invalid ioctl %u\n", cmd);
+		ret = -ENOTTY;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Swap header (1st page of swap device) contains information
+ * to indentify it as a swap partition. Prepare such a header
+ * for ramzswap device (ramzswap0) so that swapon can identify
+ * it as swap partition. In case backing swap device is provided,
+ * copy its swap header.
+ */
+static int setup_swap_header(struct ramzswap *rzs, union swap_header *s)
+{
+	int ret = 0;
+	struct page *page;
+	struct address_space *mapping;
+	union swap_header *backing_swap_header;
+
+	/*
+	 * There is no backing swap device. Create a swap header
+	 * that is acceptable by swapon.
+	 */
+	if (rzs->backing_swap == NULL) {
+		s->info.version = 1;
+		s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
+		s->info.nr_badpages = 0;
+		memcpy(s->magic.magic, "SWAPSPACE2", 10);
+		return 0;
+	}
+
+	/*
+	 * We have a backing swap device. Copy its swap header
+	 * to ramzswap device header. If this header contains
+	 * invalid information (backing device not a swap
+	 * partition, etc.), swapon will fail for ramzswap
+	 * which is correct behavior - we don't want to swap
+	 * over filesystem partition!
+	 */
+
+	/* Read the backing swap header (code from sys_swapon) */
+	mapping = rzs->swap_file->f_mapping;
+	if (!mapping->a_ops->readpage) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	page = read_mapping_page(mapping, 0, rzs->swap_file);
+	if (IS_ERR(page)) {
+		ret = PTR_ERR(page);
+		goto out;
+	}
+
+	backing_swap_header = kmap(page);
+	memcpy(s, backing_swap_header, sizeof(*s));
+	if (rzs->backing_swap) {
+		/*
+		 * ramzswap disksize equals number of usable pages in backing
+		 * swap. Set last_page in swap header to match this disksize
+		 * ('last_page' means 0-based index of last usable swap page).
+		 */
+		s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
+	}
+	kunmap(page);
+
+out:
+	return ret;
+}
+
+static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes)
+{
+	if (!rzs->disksize) {
+		pr_info(C
+		"disk size not provided. You can use disksize_kb module "
+		"param to specify size.\nUsing default: (%u%% of RAM).\n",
+		DEFAULT_DISKSIZE_PERC_RAM
+		);
+		rzs->disksize = DEFAULT_DISKSIZE_PERC_RAM *
+					(totalram_bytes / 100);
+	}
+
+	if (rzs->disksize > 2 * (totalram_bytes)) {
+		pr_info(C
+		"There is little point creating a ramzswap of greater than "
+		"twice the size of memory since we expect a 2:1 compression "
+		"ratio. Note that ramzswap uses about 0.1%% of the size of "
+		"the swap device when not in use so a huge ramzswap is "
+		"wasteful.\n"
+		"\tMemory Size: %zu kB\n"
+		"\tSize you selected: %zu kB\n"
+		"Continuing anyway ...\n",
+		totalram_bytes >> 10, rzs->disksize
+		);
+	}
+
+	rzs->disksize &= PAGE_MASK;
+	pr_info(C "disk size set to %zu kB\n", rzs->disksize >> 10);
+}
+
+/*
+ * memlimit cannot be greater than backing disk size.
+ */
+static void ramzswap_set_memlimit(struct ramzswap *rzs, size_t totalram_bytes)
+{
+	int memlimit_valid = 1;
+
+	if (!rzs->memlimit) {
+		pr_info(C "Memory limit not set.\n");
+		memlimit_valid = 0;
+	}
+
+	if (rzs->memlimit > rzs->disksize) {
+		pr_info(C "Memory limit cannot be greater than "
+			"disksize: limit=%zu, disksize=%zu\n",
+			rzs->memlimit, rzs->disksize);
+		memlimit_valid = 0;
+	}
+
+	if (!memlimit_valid) {
+		size_t mempart, disksize;
+		pr_info(C "Using default: smaller of (%u%% of RAM) and "
+			"(backing disk size).\n",
+			DEFAULT_MEMLIMIT_PERC_RAM);
+		mempart = DEFAULT_MEMLIMIT_PERC_RAM * (totalram_bytes / 100);
+		disksize = rzs->disksize;
+		rzs->memlimit = mempart > disksize ? disksize : mempart;
+	}
+
+	if (rzs->memlimit > totalram_bytes / 2) {
+		pr_info(C
+		"Its not advisable setting limit more than half of "
+		"size of memory since we expect a 2:1 compression ratio. "
+		"Limit represents amount of *compressed* data we can keep "
+		"in memory!\n"
+		"\tMemory Size: %zu kB\n"
+		"\tLimit you selected: %zu kB\n"
+		"Continuing anyway ...\n",
+		totalram_bytes >> 10, rzs->memlimit >> 10
+		);
+	}
+
+	rzs->memlimit &= PAGE_MASK;
+	BUG_ON(!rzs->memlimit);
+
+	pr_info(C "Memory limit set to %zu kB\n", rzs->memlimit >> 10);
+}
+
+static void create_device(struct ramzswap *rzs, int device_id)
+{
+	mutex_init(&rzs->lock);
+	INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
+
+	rzs->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!rzs->queue) {
+		pr_err(C "Error allocating disk queue for device %d\n",
+			device_id);
+		return;
+	}
+
+	blk_queue_make_request(rzs->queue, ramzswap_make_request);
+	blk_queue_logical_block_size(rzs->queue, PAGE_SIZE);
+
+	rzs->queue->queuedata = rzs;
+
+	 /* gendisk structure */
+	rzs->disk = alloc_disk(1);
+	if (!rzs->disk) {
+		blk_cleanup_queue(rzs->queue);
+		pr_warning(C "Error allocating disk structure for device %d\n",
+			device_id);
+		return;
+	}
+
+	rzs->disk->major = RAMZSWAP_MAJOR;
+	rzs->disk->first_minor = device_id;
+	rzs->disk->fops = &ramzswap_devops;
+	rzs->disk->queue = rzs->queue;
+	rzs->disk->private_data = rzs;
+	snprintf(rzs->disk->disk_name, 16, "ramzswap%d", device_id);
+
+	/*
+	 * Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl
+	 * or set equal to backing swap device (if provided)
+	 */
+	set_capacity(rzs->disk, 0);
+	add_disk(rzs->disk);
+
+	rzs->init_done = 0;
+
+	return;
+}
+
+static void destroy_device(struct ramzswap *rzs)
+{
+	if (rzs->disk) {
+		del_gendisk(rzs->disk);
+		put_disk(rzs->disk);
+	}
+
+	if (rzs->queue)
+		blk_cleanup_queue(rzs->queue);
+}
+
+static void reset_device(struct ramzswap *rzs)
+{
+	int is_backing_blkdev = 0;
+	size_t index, num_pages;
+	unsigned entries_per_page;
+	unsigned long entry, num_table_pages;
+
+	if (!rzs->init_done)
+		return;
+
+	if (rzs->backing_swap && !rzs->num_extents)
+		is_backing_blkdev = 1;
+
+	num_pages = rzs->disksize >> PAGE_SHIFT;
+
+	/* Free various per-device buffers */
+	kfree(rzs->compress_workmem);
+	kfree(rzs->compress_buffer);
+
+	rzs->compress_workmem = NULL;
+	rzs->compress_buffer = NULL;
+
+	/* Free all pages that are still in this ramzswap device */
+	for (index = 0; index < num_pages; index++) {
+		struct page *page;
+		u16 offset;
+
+		page = rzs->table[index].page;
+		offset = rzs->table[index].offset;
+
+		if (!page)
+			continue;
+
+		if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
+			__free_page(page);
+		else
+			xv_free(rzs->mem_pool, page, offset);
+	}
+
+	entry = 0;
+	entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
+	num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
+					PAGE_SIZE);
+	/*
+	 * Set page->mapping to NULL for every table page.
+	 * Otherwise, we will hit bad_page() during free.
+	 */
+	while (rzs->num_extents && num_table_pages--) {
+		struct page *page;
+		page = vmalloc_to_page(&rzs->table[entry]);
+		page->mapping = NULL;
+		entry += entries_per_page;
+	}
+	vfree(rzs->table);
+	rzs->table = NULL;
+
+	xv_destroy_pool(rzs->mem_pool);
+	rzs->mem_pool = NULL;
+
+	/* Free all swap extent pages */
+	while (!list_empty(&rzs->backing_swap_extent_list)) {
+		struct page *page;
+		struct list_head *entry;
+		entry = rzs->backing_swap_extent_list.next;
+		page = list_entry(entry, struct page, lru);
+		list_del(entry);
+		__free_page(page);
+	}
+	INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
+	rzs->num_extents = 0;
+
+	/* Close backing swap device, if present */
+	if (rzs->backing_swap) {
+		if (is_backing_blkdev)
+			bd_release(rzs->backing_swap);
+		filp_close(rzs->swap_file, NULL);
+		rzs->backing_swap = NULL;
+	}
+
+	/* Reset stats */
+	memset(&rzs->stats, 0, sizeof(rzs->stats));
+
+	rzs->disksize = 0;
+	rzs->memlimit = 0;
+
+	/* Back to uninitialized state */
+	rzs->init_done = 0;
+}
+
+static int __init ramzswap_init(void)
+{
+	int i;
+
+	if (NUM_DEVICES > MAX_NUM_DEVICES) {
+		pr_warning(C "Invalid value for NUM_DEVICES: %lu\n",
+						NUM_DEVICES);
+		return -EINVAL;
+	}
+
+	RAMZSWAP_MAJOR = register_blkdev(0, "ramzswap");
+	if (RAMZSWAP_MAJOR <= 0) {
+		pr_warning(C "Unable to get major number\n");
+		return -EBUSY;
+	}
+
+	if (!NUM_DEVICES) {
+		pr_info(C "NUM_DEVICES not specified. Using default: 1\n");
+		NUM_DEVICES = 1;
+	}
+
+	/* Allocate the device array and initialize each one */
+	pr_info(C "Creating %lu devices ...\n", NUM_DEVICES);
+	DEVICES = kzalloc(NUM_DEVICES * sizeof(struct ramzswap), GFP_KERNEL);
+	if (!DEVICES)
+		goto out;
+
+	for (i = 0; i < NUM_DEVICES; i++)
+		create_device(&DEVICES[i], i);
+
+	return 0;
+
+out:
+	unregister_blkdev(RAMZSWAP_MAJOR, "ramzswap");
+	return -ENOMEM;
+}
+
+static void __exit ramzswap_exit(void)
+{
+	int i;
+
+	for (i = 0; i < NUM_DEVICES; i++) {
+		destroy_device(&DEVICES[i]);
+		reset_device(&DEVICES[i]);
+	}
+
+	unregister_blkdev(RAMZSWAP_MAJOR, "ramzswap");
+
+	kfree(DEVICES);
+	pr_debug(C "Cleanup done!\n");
+}
+
+module_param(NUM_DEVICES, ulong, 0);
+MODULE_PARM_DESC(NUM_DEVICES, "Number of ramzswap devices");
+
+module_init(ramzswap_init);
+module_exit(ramzswap_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
+MODULE_DESCRIPTION("Compressed RAM Based Swap Device");
diff --git a/drivers/block/ramzswap/ramzswap_drv.h b/drivers/block/ramzswap/ramzswap_drv.h
new file mode 100644
index 0000000..7f77edc
--- /dev/null
+++ b/drivers/block/ramzswap/ramzswap_drv.h
@@ -0,0 +1,183 @@
+/*
+ * Compressed RAM based swap device
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ * Project home: http://compcache.googlecode.com
+ */
+
+#ifndef _RAMZSWAP_DRV_H_
+#define _RAMZSWAP_DRV_H_
+
+#include <linux/ramzswap_ioctl.h>
+#include "xvmalloc.h"
+
+/*
+ * Some arbitrary value. This is just to catch
+ * invalid value for NUM_DEVICES module parameter.
+ */
+#define MAX_NUM_DEVICES 32
+
+/*
+ * Stored at beginning of each compressed object.
+ *
+ * It stores back-reference to table entry which points to this
+ * object. This is required to support memory defragmentation or
+ * migrating compressed pages to backing swap disk.
+ */
+struct zobj_header {
+#if 0
+	u32 table_idx;
+#endif
+};
+
+/*-- Configurable parameters */
+
+/* Default ramzswap disk size: 25% of total RAM */
+#define DEFAULT_DISKSIZE_PERC_RAM	25
+#define DEFAULT_MEMLIMIT_PERC_RAM	15
+
+/*
+ * Max compressed page size when backing device is provided.
+ * Pages that compress to size greater than this are sent to
+ * physical swap disk.
+ */
+#define MAX_CPAGE_SIZE_BDEV	(PAGE_SIZE / 2)
+
+/*
+ * Max compressed page size when there is no backing dev.
+ * Pages that compress to size greater than this are stored
+ * uncompressed in memory.
+ */
+#define MAX_CPAGE_SIZE_NOBDEV	(PAGE_SIZE / 4 * 3)
+
+/*
+ * NOTE: MAX_CPAGE_SIZE_{BDEV,NOBDEV} sizes must be
+ * less than or equal to:
+ *   XV_MAX_ALLOC_SIZE - sizeof(struct zobj_header)
+ * since otherwise xvMalloc would always return failure.
+ */
+
+/*-- End of configurable params */
+
+#define SECTOR_SHIFT		9
+#define SECTOR_SIZE		(1 << SECTOR_SHIFT)
+#define SECTORS_PER_PAGE_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
+#define SECTORS_PER_PAGE	(1 << SECTORS_PER_PAGE_SHIFT)
+
+/* Message prefix */
+#define C "ramzswap: "
+
+/* Debugging and Stats */
+#define NOP	do { } while (0)
+
+#if defined(CONFIG_BLK_DEV_RAMZSWAP_STATS)
+#define STATS
+#endif
+
+#if defined(STATS)
+#define stat_inc(stat)			((stat)++)
+#define stat_dec(stat)			((stat)--)
+#define stat_inc_if_less(stat, val1, val2) \
+				((stat) += ((val1) < (val2) ? 1 : 0))
+#define stat_dec_if_less(stat, val1, val2) \
+				((stat) -= ((val1) < (val2) ? 1 : 0))
+#else	/* STATS */
+#define stat_inc(x)			NOP
+#define stat_dec(x)			NOP
+#define stat_inc_if_less(x, v1, v2)	NOP
+#define stat_dec_if_less(x, v1, v2)	NOP
+#endif	/* STATS */
+
+/* Flags for ramzswap pages (table[page_no].flags) */
+enum rzs_pageflags {
+	/* Page is stored uncompressed */
+	RZS_UNCOMPRESSED,
+
+	/* Page consists entirely of zeros */
+	RZS_ZERO,
+
+	__NR_RZS_PAGEFLAGS,
+};
+
+/*-- Data structures */
+
+/* Indexed by page no. These have to fit exactly in PAGE_SIZE */
+struct table {
+	struct page *page;
+	u16 offset;
+	u8 count;	/* object ref count (not yet used) */
+	u8 flags;
+} __attribute__ ((aligned(4)));
+
+/* These have to fit exactly in PAGE_SIZE */
+struct ramzswap_backing_extent {
+	pgoff_t phy_pagenum;
+	pgoff_t num_pages;
+} __attribute__ ((aligned(4)));
+
+struct ramzswap_stats {
+	/* basic stats */
+	size_t compr_size;	/* compressed size of pages stored -
+				 * needed to enforce memlimit */
+	/* more stats */
+#if defined(STATS)
+	u64 num_reads;		/* failed + successful */
+	u64 num_writes;		/* --do-- */
+	u64 failed_reads;	/* can happen when memory is too low */
+	u64 failed_writes;	/* should NEVER! happen */
+	u64 invalid_io;		/* non-swap I/O requests */
+	u64 notify_free;	/* no. of pages freed by swap free callback */
+	u64 pages_discard;	/* no. of pages freed by discard callback */
+	u32 pages_zero;		/* no. of zero filled pages */
+	u32 pages_stored;	/* no. of pages currently stored */
+	u32 good_compress;	/* % of pages with compression ratio<=50% */
+	u32 pages_expand;	/* % of incompressible pages */
+	u64 bdev_num_reads;	/* no. of reads on backing dev */
+	u64 bdev_num_writes;	/* no. of writes on backing dev */
+#endif
+};
+
+struct ramzswap {
+	struct xv_pool *mem_pool;
+	void *compress_workmem;
+	void *compress_buffer;
+	struct table *table;
+	struct mutex lock;
+	struct request_queue *queue;
+	struct gendisk *disk;
+	int init_done;
+	int init_notify_callback;
+	/*
+	 * This is limit on compressed data size (stats.compr_size)
+	 * Its applicable only when backing swap device is present.
+	 */
+	size_t memlimit;	/* bytes */
+	/*
+	 * This is limit on amount of *uncompressed* worth of data
+	 * we can hold. When backing swap device is provided, it is
+	 * set equal to device size.
+	 */
+	size_t disksize;	/* bytes */
+
+	struct ramzswap_stats stats;
+
+	/* backing swap device info */
+	struct ramzswap_backing_extent *curr_extent;
+	struct list_head backing_swap_extent_list;
+	unsigned long num_extents;
+	char backing_swap_name[MAX_SWAP_NAME_LEN];
+	struct block_device *backing_swap;
+	struct file *swap_file;
+};
+
+/*-- */
+
+#endif
+
diff --git a/include/linux/ramzswap_ioctl.h b/include/linux/ramzswap_ioctl.h
new file mode 100644
index 0000000..c9a10a5
--- /dev/null
+++ b/include/linux/ramzswap_ioctl.h
@@ -0,0 +1,51 @@
+/*
+ * Compressed RAM based swap device
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ * Project home: http://compcache.googlecode.com
+ */
+
+#ifndef _RAMZSWAP_IOCTL_H_
+#define _RAMZSWAP_IOCTL_H_
+
+#define MAX_SWAP_NAME_LEN 128
+
+struct ramzswap_ioctl_stats {
+	char backing_swap_name[MAX_SWAP_NAME_LEN];
+	u64 memlimit;		/* only applicable if backing swap present */
+	u64 disksize;		/* user specified or equal to backing swap
+				 * size (if present) */
+	u64 num_reads;		/* failed + successful */
+	u64 num_writes;		/* --do-- */
+	u64 failed_reads;	/* can happen when memory is too low */
+	u64 failed_writes;	/* should NEVER! happen */
+	u64 invalid_io;		/* non-swap I/O requests */
+	u64 notify_free;	/* no. of pages freed by swap free callback */
+	u64 pages_discard;	/* no. of pages freed by discard callback */
+	u32 pages_zero;		/* no. of zero filled pages */
+	u32 good_compress_pct;	/* no. of pages with compression ratio<=50% */
+	u32 pages_expand_pct;	/* no. of incompressible pages */
+	u32 pages_stored;
+	u32 pages_used;
+	u64 orig_data_size;
+	u64 compr_data_size;
+	u64 mem_used_total;
+	u64 bdev_num_reads;	/* no. of reads on backing dev */
+	u64 bdev_num_writes;	/* no. of writes on backing dev */
+} __attribute__ ((packed, aligned(4)));
+
+#define RZSIO_SET_DISKSIZE_KB	_IOW('z', 0, size_t)
+#define RZSIO_SET_MEMLIMIT_KB	_IOW('z', 1, size_t)
+#define RZSIO_SET_BACKING_SWAP	_IOW('z', 2, unsigned char[MAX_SWAP_NAME_LEN])
+#define RZSIO_GET_STATS		_IOR('z', 3, struct ramzswap_ioctl_stats)
+#define RZSIO_INIT		_IO('z', 4)
+#define RZSIO_RESET		_IO('z', 5)
+
+#endif

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-09 21:19   ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-09 21:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Pekka Enberg, Ed Tomlinson, linux-kernel, linux-mm,
	linux-mm-cc

Creates RAM based block devices (/dev/ramzswapX) which can be
used (only) as swap disks. Pages swapped to these are compressed
and stored in memory itself.

The module is called ramzswap.ko. It depends on:
 - xvmalloc.ko: memory allocator
 - lzo_compress.ko
 - lzo_decompress.ko

See Documentation/blockdev/ramzswap.txt for usage details.

Signed-off-by: Nitin Gupta <ngupta@vflare.org>
---

 drivers/block/Kconfig                 |   22 +
 drivers/block/Makefile                |    1 +
 drivers/block/ramzswap/Makefile       |    3 +
 drivers/block/ramzswap/ramzswap_drv.c | 1529 +++++++++++++++++++++++++++++++++
 drivers/block/ramzswap/ramzswap_drv.h |  183 ++++
 include/linux/ramzswap_ioctl.h        |   51 ++
 6 files changed, 1789 insertions(+), 0 deletions(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 1d886e0..4c2cac6 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -381,6 +381,28 @@ config BLK_DEV_XIP
 	  will prevent RAM block device backing store memory from being
 	  allocated from highmem (only a problem for highmem systems).
 
+config BLK_DEV_RAMZSWAP
+	tristate "Compressed in-memory swap device"
+	depends on SWAP
+	select LZO_COMPRESS
+	select LZO_DECOMPRESS
+	default m
+	help
+	  Creates virtual block devices which can be used (only) as swap
+	  disks. Pages swapped to these disks are compressed and stored in
+	  memory itself.
+
+	  See Documentation/blockdev/ramzswap.txt for more information.
+	  Project home: http://compcache.googlecode.com/
+
+config BLK_DEV_RAMZSWAP_STATS
+	bool "Enable ramzswap stats"
+	depends on BLK_DEV_RAMZSWAP
+	default y
+	help
+	  Enable statistics collection for ramzswap. This adds only a minimal
+	  overhead. In unsure, say Y.
+
 config CDROM_PKTCDVD
 	tristate "Packet writing on CD/DVD media"
 	depends on !UML
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index cdaa3f8..174efcc 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -14,6 +14,7 @@ obj-$(CONFIG_PS3_VRAM)		+= ps3vram.o
 obj-$(CONFIG_ATARI_FLOPPY)	+= ataflop.o
 obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
 obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
+obj-$(CONFIG_BLK_DEV_RAMZSWAP)	+= ramzswap/
 obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
 obj-$(CONFIG_BLK_DEV_XD)	+= xd.o
 obj-$(CONFIG_BLK_CPQ_DA)	+= cpqarray.o
diff --git a/drivers/block/ramzswap/Makefile b/drivers/block/ramzswap/Makefile
new file mode 100644
index 0000000..94e99b5
--- /dev/null
+++ b/drivers/block/ramzswap/Makefile
@@ -0,0 +1,3 @@
+ramzswap-objs	:=	ramzswap_drv.o xvmalloc.o
+
+obj-$(CONFIG_BLK_DEV_RAMZSWAP)	+=	ramzswap.o
diff --git a/drivers/block/ramzswap/ramzswap_drv.c b/drivers/block/ramzswap/ramzswap_drv.c
new file mode 100644
index 0000000..c475c45
--- /dev/null
+++ b/drivers/block/ramzswap/ramzswap_drv.c
@@ -0,0 +1,1529 @@
+/*
+ * Compressed RAM based swap device
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ * Project home: http://compcache.googlecode.com
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/device.h>
+#include <linux/genhd.h>
+#include <linux/highmem.h>
+#include <linux/lzo.h>
+#include <linux/marker.h>
+#include <linux/mutex.h>
+#include <linux/string.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/vmalloc.h>
+#include <linux/version.h>
+
+#include "ramzswap_drv.h"
+
+/* Globals */
+static int RAMZSWAP_MAJOR;
+static struct ramzswap *DEVICES;
+
+/*
+ * Pages that compress to larger than this size are
+ * forwarded to backing swap, if present or stored
+ * uncompressed in memory otherwise.
+ */
+static unsigned int MAX_CPAGE_SIZE;
+
+/* Module params (documentation at end) */
+static unsigned long NUM_DEVICES;
+
+/* Function declarations */
+static int __init ramzswap_init(void);
+static int ramzswap_ioctl(struct block_device *, fmode_t,
+			unsigned, unsigned long);
+static int setup_swap_header(struct ramzswap *, union swap_header *);
+static void ramzswap_set_memlimit(struct ramzswap *, size_t);
+static void ramzswap_set_disksize(struct ramzswap *, size_t);
+static void reset_device(struct ramzswap *rzs);
+
+static struct block_device_operations ramzswap_devops = {
+	.ioctl = ramzswap_ioctl,
+	.owner = THIS_MODULE,
+};
+
+static int test_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
+{
+	return rzs->table[index].flags & BIT(flag);
+}
+
+static void set_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
+{
+	rzs->table[index].flags |= BIT(flag);
+}
+
+static void clear_flag(struct ramzswap *rzs, u32 index,
+					enum rzs_pageflags flag)
+{
+	rzs->table[index].flags &= ~BIT(flag);
+}
+
+static int page_zero_filled(void *ptr)
+{
+	u32 pos;
+	u64 *page;
+
+	page = (u64 *)ptr;
+
+	for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
+		if (page[pos])
+			return 0;
+	}
+
+	return 1;
+}
+
+/*
+ * Given <pagenum, offset> pair, provide a dereferencable pointer.
+ */
+static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
+{
+	unsigned char *base;
+
+	base = kmap_atomic(page, type);
+	return base + offset;
+}
+
+static void put_ptr_atomic(void *ptr, enum km_type type)
+{
+	kunmap_atomic(ptr, type);
+}
+
+static void ramzswap_flush_dcache_page(struct page *page)
+{
+#ifdef CONFIG_ARM
+	int flag = 0;
+	/*
+	 * Ugly hack to get flush_dcache_page() work on ARM.
+	 * page_mapping(page) == NULL after clearing this swap cache flag.
+	 * Without clearing this flag, flush_dcache_page() will simply set
+	 * "PG_dcache_dirty" bit and return.
+	 */
+	if (PageSwapCache(page)) {
+		flag = 1;
+		ClearPageSwapCache(page);
+	}
+#endif
+	flush_dcache_page(page);
+#ifdef CONFIG_ARM
+	if (flag)
+		SetPageSwapCache(page);
+#endif
+}
+
+void ramzswap_ioctl_get_stats(struct ramzswap *rzs,
+			struct ramzswap_ioctl_stats *s)
+{
+	strncpy(s->backing_swap_name, rzs->backing_swap_name,
+		MAX_SWAP_NAME_LEN - 1);
+	s->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
+
+	s->disksize = rzs->disksize;
+	s->memlimit = rzs->memlimit;
+
+#if defined(STATS)
+	{
+	struct ramzswap_stats *rs = &rzs->stats;
+	size_t succ_writes, mem_used;
+	unsigned int good_compress_perc = 0, no_compress_perc = 0;
+
+	mem_used = xv_get_total_size_bytes(rzs->mem_pool)
+			+ (rs->pages_expand << PAGE_SHIFT);
+	succ_writes = rs->num_writes - rs->failed_writes;
+
+	if (succ_writes && rs->pages_stored) {
+		good_compress_perc = rs->good_compress * 100
+					/ rs->pages_stored;
+		no_compress_perc = rs->pages_expand * 100
+					/ rs->pages_stored;
+	}
+
+	s->num_reads = rs->num_reads;
+	s->num_writes = rs->num_writes;
+	s->failed_reads = rs->failed_reads;
+	s->failed_writes = rs->failed_writes;
+	s->invalid_io = rs->invalid_io;
+	s->notify_free = rs->notify_free;
+	s->pages_discard = rs->pages_discard;
+	s->pages_zero = rs->pages_zero;
+
+	s->good_compress_pct = good_compress_perc;
+	s->pages_expand_pct = no_compress_perc;
+
+	s->pages_stored = rs->pages_stored;
+	s->pages_used = mem_used >> PAGE_SHIFT;
+	s->orig_data_size = rs->pages_stored << PAGE_SHIFT;
+	s->compr_data_size = rs->compr_size;
+	s->mem_used_total = mem_used;
+
+	s->bdev_num_reads = rs->bdev_num_reads;
+	s->bdev_num_writes = rs->bdev_num_writes;
+	}
+#endif /* STATS */
+}
+
+static int add_backing_swap_extent(struct ramzswap *rzs,
+				pgoff_t phy_pagenum,
+				pgoff_t num_pages)
+{
+	unsigned int idx;
+	struct list_head *head;
+	struct page *curr_page, *new_page;
+	unsigned int extents_per_page = PAGE_SIZE /
+				sizeof(struct ramzswap_backing_extent);
+
+	idx = rzs->num_extents % extents_per_page;
+	if (!idx) {
+		new_page = alloc_page(__GFP_ZERO);
+		if (!new_page)
+			return -ENOMEM;
+
+		if (rzs->num_extents) {
+			curr_page = virt_to_page(rzs->curr_extent);
+			head = &curr_page->lru;
+		} else {
+			head = &rzs->backing_swap_extent_list;
+		}
+
+		list_add(&new_page->lru, head);
+		rzs->curr_extent = page_address(new_page);
+	}
+
+	rzs->curr_extent->phy_pagenum = phy_pagenum;
+	rzs->curr_extent->num_pages = num_pages;
+
+	pr_debug(C "extent: [%lu, %lu] %lu\n", phy_pagenum,
+		phy_pagenum + num_pages - 1, num_pages);
+
+	if (idx != extents_per_page - 1)
+		rzs->curr_extent++;
+
+	return 0;
+}
+
+static int setup_backing_swap_extents(struct ramzswap *rzs,
+				struct inode *inode, unsigned long *num_pages)
+{
+	int ret = 0;
+	unsigned blkbits;
+	unsigned blocks_per_page;
+	pgoff_t contig_pages = 0, total_pages = 0;
+	pgoff_t pagenum = 0, prev_pagenum = 0;
+	sector_t probe_block = 0;
+	sector_t last_block;
+
+	blkbits = inode->i_blkbits;
+	blocks_per_page = PAGE_SIZE >> blkbits;
+
+	last_block = i_size_read(inode) >> blkbits;
+	while (probe_block + blocks_per_page <= last_block) {
+		unsigned block_in_page;
+		sector_t first_block;
+
+		first_block = bmap(inode, probe_block);
+		if (first_block == 0)
+			goto bad_bmap;
+
+		/* It must be PAGE_SIZE aligned on-disk */
+		if (first_block & (blocks_per_page - 1)) {
+			probe_block++;
+			goto probe_next;
+		}
+
+		/* All blocks within this page must be contiguous on disk */
+		for (block_in_page = 1; block_in_page < blocks_per_page;
+					block_in_page++) {
+			sector_t block;
+
+			block = bmap(inode, probe_block + block_in_page);
+			if (block == 0)
+				goto bad_bmap;
+			if (block != first_block + block_in_page) {
+				/* Discontiguity */
+				probe_block++;
+				goto probe_next;
+			}
+		}
+
+		/*
+		 * We found a PAGE_SIZE length, PAGE_SIZE aligned
+		 * run of blocks.
+		 */
+		pagenum = first_block >> (PAGE_SHIFT - blkbits);
+
+		if (total_pages && (pagenum != prev_pagenum + 1)) {
+			ret = add_backing_swap_extent(rzs, prev_pagenum -
+					(contig_pages - 1), contig_pages);
+			if (ret < 0)
+				goto out;
+			rzs->num_extents++;
+			contig_pages = 0;
+		}
+		total_pages++;
+		contig_pages++;
+		prev_pagenum = pagenum;
+		probe_block += blocks_per_page;
+
+probe_next:
+		continue;
+	}
+
+	if (contig_pages) {
+		pr_debug(C "adding last extent: pagenum=%lu, "
+			"contig_pages=%lu\n", pagenum, contig_pages);
+		ret = add_backing_swap_extent(rzs,
+			prev_pagenum - (contig_pages - 1), contig_pages);
+		if (ret < 0)
+			goto out;
+		rzs->num_extents++;
+	}
+	if (!rzs->num_extents) {
+		pr_err(C "No swap extents found!\n");
+		ret = -EINVAL;
+	}
+
+	if (!ret) {
+		*num_pages = total_pages;
+		pr_info(C "Found %lu extents containing %luk\n",
+			rzs->num_extents, *num_pages << (PAGE_SHIFT - 10));
+	}
+	goto out;
+
+bad_bmap:
+	pr_err(C "Backing swapfile has holes\n");
+	ret = -EINVAL;
+out:
+	while (ret && !list_empty(&rzs->backing_swap_extent_list)) {
+		struct page *page;
+		struct list_head *entry = rzs->backing_swap_extent_list.next;
+		page = list_entry(entry, struct page, lru);
+		list_del(entry);
+		__free_page(page);
+	}
+	return ret;
+}
+
+static void map_backing_swap_extents(struct ramzswap *rzs)
+{
+	struct ramzswap_backing_extent *se;
+	struct page *table_page, *se_page;
+	unsigned long num_pages, num_table_pages, entry;
+	unsigned long se_idx, span;
+	unsigned entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
+	unsigned extents_per_page = PAGE_SIZE / sizeof(*se);
+
+	/* True for block device */
+	if (!rzs->num_extents)
+		return;
+
+	se_page = list_entry(rzs->backing_swap_extent_list.next,
+					struct page, lru);
+	se = page_address(se_page);
+	span = se->num_pages;
+	num_pages = rzs->disksize >> PAGE_SHIFT;
+	num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
+							PAGE_SIZE);
+
+	entry = 0;
+	se_idx = 0;
+	while (num_table_pages--) {
+		table_page = vmalloc_to_page(&rzs->table[entry]);
+		while (span <= entry) {
+			se_idx++;
+			if (se_idx == rzs->num_extents)
+				BUG();
+
+			if (!(se_idx % extents_per_page)) {
+				se_page = list_entry(se_page->lru.next,
+						struct page, lru);
+				se = page_address(se_page);
+			} else
+				se++;
+
+			span += se->num_pages;
+		}
+		table_page->mapping = (struct address_space *)se;
+		table_page->private = se->num_pages - (span - entry);
+		pr_debug(C "map_table: entry=%lu, span=%lu, map=%p, priv=%lu\n",
+			entry, span, table_page->mapping, table_page->private);
+		entry += entries_per_page;
+	}
+}
+
+/*
+ * Check if value of backing_swap module param is sane.
+ * Claim this device and set ramzswap size equal to
+ * size of this block device.
+ */
+static int setup_backing_swap(struct ramzswap *rzs)
+{
+	int ret = 0;
+	size_t disksize;
+	unsigned long num_pages = 0;
+	struct inode *inode;
+	struct file *swap_file;
+	struct address_space *mapping;
+	struct block_device *bdev = NULL;
+
+	if (!rzs->backing_swap_name[0]) {
+		pr_debug(C "backing_swap param not given\n");
+		goto out;
+	}
+
+	pr_info(C "Using backing swap device: %s\n", rzs->backing_swap_name);
+
+	swap_file = filp_open(rzs->backing_swap_name,
+					O_RDWR | O_LARGEFILE, 0);
+	if (IS_ERR(swap_file)) {
+		pr_err(C "Error opening backing device: %s\n",
+						rzs->backing_swap_name);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	mapping = swap_file->f_mapping;
+	inode = mapping->host;
+
+	if (S_ISBLK(inode->i_mode)) {
+		bdev = I_BDEV(inode);
+		ret = bd_claim(bdev, setup_backing_swap);
+		if (ret < 0) {
+			bdev = NULL;
+			goto bad_param;
+		}
+		disksize = i_size_read(inode);
+	} else if (S_ISREG(inode->i_mode)) {
+		bdev = inode->i_sb->s_bdev;
+		if (IS_SWAPFILE(inode)) {
+			ret = -EBUSY;
+			goto bad_param;
+		}
+		ret = setup_backing_swap_extents(rzs, inode, &num_pages);
+		if (ret < 0)
+			goto bad_param;
+		disksize = num_pages << PAGE_SHIFT;
+	} else {
+		goto bad_param;
+	}
+
+	rzs->swap_file = swap_file;
+	rzs->backing_swap = bdev;
+	rzs->disksize = disksize;
+	BUG_ON(!rzs->disksize);
+
+	return 0;
+
+bad_param:
+	if (bdev)
+		bd_release(bdev);
+	filp_close(swap_file, NULL);
+
+out:
+	rzs->backing_swap = NULL;
+	return ret;
+}
+
+/*
+ * Map logical page number 'pagenum' to physical page number
+ * on backing swap device. For block device, this is a nop.
+ */
+pgoff_t map_backing_swap_page(struct ramzswap *rzs, pgoff_t pagenum)
+{
+	u32 skip_pages, entries_per_page;
+	size_t delta, se_offset, skipped;
+	struct page *table_page, *se_page;
+	struct ramzswap_backing_extent *se;
+
+	if (!rzs->num_extents)
+		return pagenum;
+
+	entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
+
+	table_page = vmalloc_to_page(&rzs->table[pagenum]);
+	se = (struct ramzswap_backing_extent *)table_page->mapping;
+	se_page = virt_to_page(se);
+
+	skip_pages = pagenum - (pagenum / entries_per_page * entries_per_page);
+	se_offset = table_page->private + skip_pages;
+
+	if (se_offset < se->num_pages) {
+		pr_debug(C "%lu -> %lu\n",
+			pagenum, se->phy_pagenum + se_offset);
+		return se->phy_pagenum + se_offset;
+	}
+
+	skipped = se->num_pages - table_page->private;
+	do {
+		struct ramzswap_backing_extent *se_base;
+		u32 se_entries_per_page = PAGE_SIZE / sizeof(*se);
+
+		/* Get next swap extent */
+		se_base = (struct ramzswap_backing_extent *)
+						page_address(se_page);
+		if (se - se_base == se_entries_per_page - 1) {
+			se_page = list_entry(se_page->lru.next,
+						struct page, lru);
+			se = page_address(se_page);
+		} else {
+			se++;
+		}
+
+		skipped += se->num_pages;
+	} while (skipped < skip_pages);
+
+	delta = skipped - skip_pages;
+	se_offset = se->num_pages - delta;
+
+	pr_debug(C "%lu -> %lu\n",
+		pagenum, se->phy_pagenum + se_offset);
+	return se->phy_pagenum + se_offset;
+}
+
+/*
+ * Check if request is within bounds and page aligned.
+ */
+static inline int valid_swap_request(struct ramzswap *rzs, struct bio *bio)
+{
+	if (unlikely(
+		(bio->bi_sector >= (rzs->disksize >> SECTOR_SHIFT)) ||
+		(bio->bi_sector & (SECTORS_PER_PAGE - 1)) ||
+		(bio->bi_vcnt != 1) ||
+		(bio->bi_size != PAGE_SIZE) ||
+		(bio->bi_io_vec[0].bv_offset != 0))) {
+
+		return 0;
+	}
+
+	/* swap request is valid */
+	return 1;
+}
+
+static void ramzswap_free_page(struct ramzswap *rzs, size_t index)
+{
+	u32 clen;
+	void *obj;
+
+	struct page *page = rzs->table[index].page;
+	u32 offset = rzs->table[index].offset;
+
+	if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED))) {
+		clen = PAGE_SIZE;
+		__free_page(page);
+		clear_flag(rzs, index, RZS_UNCOMPRESSED);
+		stat_dec(rzs->stats.pages_expand);
+		goto out;
+	}
+
+	obj = get_ptr_atomic(page, offset, KM_USER0);
+	clen = xv_get_object_size(obj) - sizeof(struct zobj_header);
+	put_ptr_atomic(obj, KM_USER0);
+
+	xv_free(rzs->mem_pool, page, offset);
+	stat_dec_if_less(rzs->stats.good_compress, clen, PAGE_SIZE / 2 + 1);
+
+out:
+	rzs->stats.compr_size -= clen;
+	stat_dec(rzs->stats.pages_stored);
+
+	rzs->table[index].page = NULL;
+	rzs->table[index].offset = 0;
+}
+
+/*
+ * callback function called when swap_map[offset] == 0
+ * i.e page at this swap offset is no longer used
+ */
+static void ramzswap_free_notify(struct block_device *bdev,
+				unsigned long index)
+{
+	struct ramzswap *rzs = bdev->bd_disk->private_data;
+
+	if (rzs->table[index].page) {
+		ramzswap_free_page(rzs, index);
+		stat_inc(rzs->stats.notify_free);
+	}
+}
+
+static int ramzswap_prepare_discard(struct request_queue *q,
+				struct request *req)
+{
+	return 0;
+}
+
+/*
+ * Called by main I/O handler function. This helper
+ * function handles 'discard' I/O requests which means
+ * that  some swap pages are no longer required, so
+ * swap device can take needed action -- we free memory
+ * allocated for these pages.
+ */
+static int ramzswap_discard(struct ramzswap *rzs, struct bio *bio)
+{
+	size_t index, start_page, num_pages;
+
+	start_page = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+	num_pages = bio->bi_size >> (SECTOR_SHIFT + SECTORS_PER_PAGE_SHIFT);
+
+	for (index = start_page; index < start_page + num_pages; index++) {
+		if (rzs->table[index].page) {
+			ramzswap_free_page(rzs, index);
+			stat_inc(rzs->stats.pages_discard);
+		}
+	}
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+static int handle_zero_page(struct bio *bio)
+{
+	void *user_mem;
+	struct page *page = bio->bi_io_vec[0].bv_page;
+
+	user_mem = get_ptr_atomic(page, 0, KM_USER0);
+	memset(user_mem, 0, PAGE_SIZE);
+	put_ptr_atomic(user_mem, KM_USER0);
+
+	ramzswap_flush_dcache_page(page);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+static int handle_uncompressed_page(struct ramzswap *rzs, struct bio *bio)
+{
+	u32 index;
+	struct page *page;
+	unsigned char *user_mem, *cmem;
+
+	page = bio->bi_io_vec[0].bv_page;
+	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+
+	user_mem = get_ptr_atomic(page, 0, KM_USER0);
+	cmem = get_ptr_atomic(rzs->table[index].page,
+			rzs->table[index].offset, KM_USER1);
+
+	memcpy(user_mem, cmem, PAGE_SIZE);
+	put_ptr_atomic(user_mem, KM_USER0);
+	put_ptr_atomic(cmem, KM_USER1);
+
+	ramzswap_flush_dcache_page(page);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+
+/*
+ * Called when request page is not present in ramzswap.
+ * Its either in backing swap device (if present) or
+ * this is an attempt to read before any previous write
+ * to this location - this happens due to readahead when
+ * swap device is read from user-space (e.g. during swapon)
+ */
+static int handle_ramzswap_fault(struct ramzswap *rzs, struct bio *bio)
+{
+	/*
+	 * Always forward such requests to backing swap
+	 * device (if present)
+	 */
+	if (rzs->backing_swap) {
+		pgoff_t pagenum;
+		stat_dec(rzs->stats.num_reads);
+		stat_inc(rzs->stats.bdev_num_reads);
+		bio->bi_bdev = rzs->backing_swap;
+
+		/*
+		 * In case backing swap is a file, find the right offset within
+		 * the file corresponding to logical position 'index'. For block
+		 * device, this is a nop.
+		 */
+		pagenum = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+		bio->bi_sector = map_backing_swap_page(rzs, pagenum)
+					<< SECTORS_PER_PAGE_SHIFT;
+		return 1;
+	}
+
+	/*
+	 * Its unlikely event in case backing dev is
+	 * not present
+	 */
+	pr_debug(C "Read before write on swap device: "
+		"sector=%lu, size=%u, offset=%u\n",
+		(ulong)(bio->bi_sector), bio->bi_size,
+		bio->bi_io_vec[0].bv_offset);
+
+	/* Do nothing. Just return success */
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+}
+
+static int ramzswap_read(struct ramzswap *rzs, struct bio *bio)
+{
+	int ret;
+	u32 index;
+	size_t clen;
+	struct page *page;
+	struct zobj_header *zheader;
+	unsigned char *user_mem, *cmem;
+
+	stat_inc(rzs->stats.num_reads);
+
+	page = bio->bi_io_vec[0].bv_page;
+	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+
+	if (unlikely(!rzs->init_notify_callback) && PageSwapCache(page)) {
+		set_swap_free_notify(bio->bi_bdev, ramzswap_free_notify);
+		rzs->init_notify_callback = 1;
+	}
+
+	if (test_flag(rzs, index, RZS_ZERO))
+		return handle_zero_page(bio);
+
+	/* Requested page is not present in compressed area */
+	if (!rzs->table[index].page)
+		return handle_ramzswap_fault(rzs, bio);
+
+	/* Page is stored uncompressed since its incompressible */
+	if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
+		return handle_uncompressed_page(rzs, bio);
+
+	user_mem = get_ptr_atomic(page, 0, KM_USER0);
+	clen = PAGE_SIZE;
+
+	cmem = get_ptr_atomic(rzs->table[index].page,
+			rzs->table[index].offset, KM_USER1);
+
+	ret = lzo1x_decompress_safe(
+		cmem + sizeof(*zheader),
+		xv_get_object_size(cmem) - sizeof(*zheader),
+		user_mem, &clen);
+
+	put_ptr_atomic(user_mem, KM_USER0);
+	put_ptr_atomic(cmem, KM_USER1);
+
+	/* should NEVER happen */
+	if (unlikely(ret != LZO_E_OK)) {
+		pr_err(C "Decompression failed! err=%d, page=%u\n",
+			ret, index);
+		stat_inc(rzs->stats.failed_reads);
+		goto out;
+	}
+
+	ramzswap_flush_dcache_page(page);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+
+out:
+	bio_io_error(bio);
+	return 0;
+}
+
+static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
+{
+	int ret, fwd_write_request = 0;
+	u32 offset;
+	size_t clen;
+	pgoff_t index;
+	struct zobj_header *zheader;
+	struct page *page, *page_store;
+	unsigned char *user_mem, *cmem, *src;
+
+	stat_inc(rzs->stats.num_writes);
+
+	page = bio->bi_io_vec[0].bv_page;
+	index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
+
+	src = rzs->compress_buffer;
+
+	/*
+	 * System swaps to same sector again when the stored page
+	 * is no longer referenced by any process. So, its now safe
+	 * to free the memory that was allocated for this page.
+	 */
+	if (rzs->table[index].page)
+		ramzswap_free_page(rzs, index);
+
+	/*
+	 * No memory ia allocated for zero filled pages.
+	 * Simply clear zero page flag.
+	 */
+	if (test_flag(rzs, index, RZS_ZERO)) {
+		stat_dec(rzs->stats.pages_zero);
+		clear_flag(rzs, index, RZS_ZERO);
+	}
+
+	trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
+	mutex_lock(&rzs->lock);
+	trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
+
+	user_mem = get_ptr_atomic(page, 0, KM_USER0);
+	if (page_zero_filled(user_mem)) {
+		put_ptr_atomic(user_mem, KM_USER0);
+		mutex_unlock(&rzs->lock);
+		stat_inc(rzs->stats.pages_zero);
+		set_flag(rzs, index, RZS_ZERO);
+
+		set_bit(BIO_UPTODATE, &bio->bi_flags);
+		bio_endio(bio, 0);
+		return 0;
+	}
+
+	if (rzs->backing_swap &&
+		(rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
+		put_ptr_atomic(user_mem, KM_USER0);
+		mutex_unlock(&rzs->lock);
+		fwd_write_request = 1;
+		goto out;
+	}
+
+	ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
+				rzs->compress_workmem);
+
+	put_ptr_atomic(user_mem, KM_USER0);
+
+	if (unlikely(ret != LZO_E_OK)) {
+		mutex_unlock(&rzs->lock);
+		pr_err(C "Compression failed! err=%d\n", ret);
+		stat_inc(rzs->stats.failed_writes);
+		goto out;
+	}
+
+	/*
+	 * Page is incompressible. Forward it to backing swap
+	 * if present. Otherwise, store it as-is (uncompressed)
+	 * since we do not want to return too many swap write
+	 * errors which has side effect of hanging the system.
+	 */
+	if (unlikely(clen > MAX_CPAGE_SIZE)) {
+		if (rzs->backing_swap) {
+			mutex_unlock(&rzs->lock);
+			fwd_write_request = 1;
+			goto out;
+		}
+
+		clen = PAGE_SIZE;
+		page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+		if (unlikely(!page_store)) {
+			mutex_unlock(&rzs->lock);
+			pr_info(C "Error allocating memory for incompressible "
+				"page: %lu\n", index);
+			stat_inc(rzs->stats.failed_writes);
+			goto out;
+		}
+
+		offset = 0;
+		set_flag(rzs, index, RZS_UNCOMPRESSED);
+		stat_inc(rzs->stats.pages_expand);
+		rzs->table[index].page = page_store;
+		src = get_ptr_atomic(page, 0, KM_USER0);
+		goto memstore;
+	}
+
+	if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
+			&rzs->table[index].page, &offset,
+			GFP_NOIO | __GFP_HIGHMEM)) {
+		mutex_unlock(&rzs->lock);
+		pr_info(C "Error allocating memory for compressed "
+			"page: %lu, size=%zu\n", index, clen);
+		stat_inc(rzs->stats.failed_writes);
+		if (rzs->backing_swap)
+			fwd_write_request = 1;
+		goto out;
+	}
+
+memstore:
+	rzs->table[index].offset = offset;
+
+	cmem = get_ptr_atomic(rzs->table[index].page,
+			rzs->table[index].offset, KM_USER1);
+
+#if 0
+	/* Back-reference needed for memory defragmentation */
+	if (!test_flag(rzs, index, RZS_UNCOMPRESSED)) {
+		zheader = (struct zobj_header *)cmem;
+		zheader->table_idx = index;
+		cmem += sizeof(*zheader);
+	}
+#endif
+
+	memcpy(cmem, src, clen);
+
+	put_ptr_atomic(cmem, KM_USER1);
+	if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
+		put_ptr_atomic(src, KM_USER0);
+
+	/* Update stats */
+	rzs->stats.compr_size += clen;
+	stat_inc(rzs->stats.pages_stored);
+	stat_inc_if_less(rzs->stats.good_compress, clen, PAGE_SIZE / 2 + 1);
+
+	mutex_unlock(&rzs->lock);
+
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	bio_endio(bio, 0);
+	return 0;
+
+out:
+	if (fwd_write_request) {
+		stat_inc(rzs->stats.bdev_num_writes);
+		bio->bi_bdev = rzs->backing_swap;
+#if 0
+		/*
+		 * TODO: We currently have linear mapping of ramzswap and
+		 * backing swap sectors. This is not desired since we want
+		 * to optimize writes to backing swap to minimize disk seeks
+		 * or have effective wear leveling (for SSDs). Also, a
+		 * non-linear mapping is required to implement compressed
+		 * on-disk swapping.
+		 */
+		 bio->bi_sector = get_backing_swap_page()
+					<< SECTORS_PER_PAGE_SHIFT;
+#endif
+		/*
+		 * In case backing swap is a file, find the right offset within
+		 * the file corresponding to logical position 'index'. For block
+		 * device, this is a nop.
+		 */
+		bio->bi_sector = map_backing_swap_page(rzs, index)
+					<< SECTORS_PER_PAGE_SHIFT;
+		return 1;
+	}
+
+	bio_io_error(bio);
+	return 0;
+}
+
+/*
+ * Handler function for all ramzswap I/O requests.
+ */
+static int ramzswap_make_request(struct request_queue *queue, struct bio *bio)
+{
+	int ret = 0;
+	struct ramzswap *rzs = queue->queuedata;
+
+	if (unlikely(!rzs->init_done)) {
+		bio_io_error(bio);
+		return 0;
+	}
+
+	if (bio_discard(bio))
+		return ramzswap_discard(rzs, bio);
+
+	if (!valid_swap_request(rzs, bio)) {
+		stat_inc(rzs->stats.invalid_io);
+		bio_io_error(bio);
+		return 0;
+	}
+
+	switch (bio_data_dir(bio)) {
+	case READ:
+		ret = ramzswap_read(rzs, bio);
+		break;
+
+	case WRITE:
+		ret = ramzswap_write(rzs, bio);
+		break;
+	}
+
+	return ret;
+}
+
+static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
+{
+	int ret;
+	size_t num_pages, totalram_bytes;
+	struct sysinfo i;
+	struct page *page;
+	union swap_header *swap_header;
+
+	if (rzs->init_done) {
+		pr_info(C "Device already initialized!\n");
+		return -EBUSY;
+	}
+
+	ret = setup_backing_swap(rzs);
+	if (ret)
+		goto fail;
+
+	si_meminfo(&i);
+	/* Here is a trivia: guess unit used for i.totalram !! */
+	totalram_bytes = i.totalram << PAGE_SHIFT;
+
+	if (rzs->backing_swap)
+		ramzswap_set_memlimit(rzs, totalram_bytes);
+	else
+		ramzswap_set_disksize(rzs, totalram_bytes);
+
+	rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
+	if (rzs->compress_workmem == NULL) {
+		pr_err(C "Error allocating compressor working memory!\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);
+	if (rzs->compress_buffer == NULL) {
+		pr_err(C "Error allocating compressor buffer space\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	num_pages = rzs->disksize >> PAGE_SHIFT;
+	rzs->table = vmalloc(num_pages * sizeof(*rzs->table));
+	if (rzs->table == NULL) {
+		pr_err(C "Error allocating ramzswap address table\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	memset(rzs->table, 0, num_pages * sizeof(*rzs->table));
+
+	map_backing_swap_extents(rzs);
+
+	page = alloc_page(__GFP_ZERO);
+	if (page == NULL) {
+		pr_err(C "Error allocating swap header page\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+	rzs->table[0].page = page;
+	set_flag(rzs, 0, RZS_UNCOMPRESSED);
+
+	swap_header = kmap(page);
+	ret = setup_swap_header(rzs, swap_header);
+	kunmap(page);
+	if (ret) {
+		pr_err(C "Error setting swap header\n");
+		goto fail;
+	}
+
+	set_capacity(rzs->disk, rzs->disksize >> SECTOR_SHIFT);
+
+	/*
+	 * We have ident mapping of sectors for ramzswap and
+	 * and the backing swap device. So, this queue flag
+	 * should be according to backing dev.
+	 */
+	if (!rzs->backing_swap ||
+			blk_queue_nonrot(rzs->backing_swap->bd_disk->queue))
+		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, rzs->disk->queue);
+
+	blk_queue_set_discard(rzs->disk->queue, ramzswap_prepare_discard);
+	blk_queue_logical_block_size(rzs->disk->queue, PAGE_SIZE);
+
+	rzs->mem_pool = xv_create_pool();
+	if (!rzs->mem_pool) {
+		pr_err(C "Error creating memory pool\n");
+		ret = -ENOMEM;
+		goto fail;
+	}
+
+	/*
+	 * Pages that compress to size greater than this are forwarded
+	 * to physical swap disk (if backing dev is provided)
+	 * TODO: make this configurable
+	 */
+	if (rzs->backing_swap)
+		MAX_CPAGE_SIZE = MAX_CPAGE_SIZE_BDEV;
+	else
+		MAX_CPAGE_SIZE = MAX_CPAGE_SIZE_NOBDEV;
+
+	pr_debug(C "Max compressed page size: %u bytes\n", MAX_CPAGE_SIZE);
+
+	rzs->init_done = 1;
+
+	pr_debug(C "Initialization done!\n");
+	return 0;
+
+fail:
+	if (rzs->table && rzs->table[0].page)
+		__free_page(rzs->table[0].page);
+	kfree(rzs->compress_workmem);
+	kfree(rzs->compress_buffer);
+	vfree(rzs->table);
+	xv_destroy_pool(rzs->mem_pool);
+
+	pr_err(C "Initialization failed: err=%d\n", ret);
+	return ret;
+}
+
+static int ramzswap_ioctl_reset_device(struct ramzswap *rzs)
+{
+	reset_device(rzs);
+	return 0;
+}
+
+static int ramzswap_ioctl(struct block_device *bdev, fmode_t mode,
+			unsigned int cmd, unsigned long arg)
+{
+	int ret = 0;
+	size_t disksize_kb, memlimit_kb;
+
+	struct ramzswap *rzs = bdev->bd_disk->private_data;
+
+	switch (cmd) {
+	case RZSIO_SET_DISKSIZE_KB:
+		if (rzs->init_done) {
+			ret = -EBUSY;
+			goto out;
+		}
+		if (copy_from_user(&disksize_kb, (void *)arg,
+						_IOC_SIZE(cmd))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		rzs->disksize = disksize_kb << 10;
+		pr_info(C "Disk size set to %zu KB\n", disksize_kb);
+		break;
+
+	case RZSIO_SET_MEMLIMIT_KB:
+		if (rzs->init_done) {
+			/* TODO: allow changing memlimit */
+			ret = -EBUSY;
+			goto out;
+		}
+		if (copy_from_user(&memlimit_kb, (void *)arg,
+						_IOC_SIZE(cmd))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		rzs->memlimit = memlimit_kb << 10;
+		pr_info(C "Memory limit set to %zu KB\n", memlimit_kb);
+		break;
+
+	case RZSIO_SET_BACKING_SWAP:
+		if (rzs->init_done) {
+			ret = -EBUSY;
+			goto out;
+		}
+
+		if (copy_from_user(&rzs->backing_swap_name, (void *)arg,
+						_IOC_SIZE(cmd))) {
+			ret = -EFAULT;
+			goto out;
+		}
+		rzs->backing_swap_name[MAX_SWAP_NAME_LEN - 1] = '\0';
+		pr_info(C "Backing swap set to %s\n", rzs->backing_swap_name);
+		break;
+
+	case RZSIO_GET_STATS:
+	{
+		struct ramzswap_ioctl_stats *stats;
+		if (!rzs->init_done) {
+			ret = -ENOTTY;
+			goto out;
+		}
+		stats = kzalloc(sizeof(*stats), GFP_KERNEL);
+		if (!stats) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		ramzswap_ioctl_get_stats(rzs, stats);
+		if (copy_to_user((void *)arg, stats, sizeof(*stats))) {
+			kfree(stats);
+			ret = -EFAULT;
+			goto out;
+		}
+		kfree(stats);
+		break;
+	}
+	case RZSIO_INIT:
+		ret = ramzswap_ioctl_init_device(rzs);
+		break;
+
+	case RZSIO_RESET:
+		/* Do not reset an active device! */
+		if (bdev->bd_holders) {
+			ret = -EBUSY;
+			goto out;
+		}
+		ret = ramzswap_ioctl_reset_device(rzs);
+		/*
+		 * Racy! Device has already been swapoff'ed.  Bad things
+		 * can happen if another swapon is done before this reset.
+		 * TODO: A callback from swapoff() will solve this problem.
+		 */
+		set_swap_free_notify(bdev, NULL);
+		rzs->init_notify_callback = 0;
+		break;
+
+	default:
+		pr_info(C "Invalid ioctl %u\n", cmd);
+		ret = -ENOTTY;
+	}
+
+out:
+	return ret;
+}
+
+/*
+ * Swap header (1st page of swap device) contains information
+ * to indentify it as a swap partition. Prepare such a header
+ * for ramzswap device (ramzswap0) so that swapon can identify
+ * it as swap partition. In case backing swap device is provided,
+ * copy its swap header.
+ */
+static int setup_swap_header(struct ramzswap *rzs, union swap_header *s)
+{
+	int ret = 0;
+	struct page *page;
+	struct address_space *mapping;
+	union swap_header *backing_swap_header;
+
+	/*
+	 * There is no backing swap device. Create a swap header
+	 * that is acceptable by swapon.
+	 */
+	if (rzs->backing_swap == NULL) {
+		s->info.version = 1;
+		s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
+		s->info.nr_badpages = 0;
+		memcpy(s->magic.magic, "SWAPSPACE2", 10);
+		return 0;
+	}
+
+	/*
+	 * We have a backing swap device. Copy its swap header
+	 * to ramzswap device header. If this header contains
+	 * invalid information (backing device not a swap
+	 * partition, etc.), swapon will fail for ramzswap
+	 * which is correct behavior - we don't want to swap
+	 * over filesystem partition!
+	 */
+
+	/* Read the backing swap header (code from sys_swapon) */
+	mapping = rzs->swap_file->f_mapping;
+	if (!mapping->a_ops->readpage) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	page = read_mapping_page(mapping, 0, rzs->swap_file);
+	if (IS_ERR(page)) {
+		ret = PTR_ERR(page);
+		goto out;
+	}
+
+	backing_swap_header = kmap(page);
+	memcpy(s, backing_swap_header, sizeof(*s));
+	if (rzs->backing_swap) {
+		/*
+		 * ramzswap disksize equals number of usable pages in backing
+		 * swap. Set last_page in swap header to match this disksize
+		 * ('last_page' means 0-based index of last usable swap page).
+		 */
+		s->info.last_page = (rzs->disksize >> PAGE_SHIFT) - 1;
+	}
+	kunmap(page);
+
+out:
+	return ret;
+}
+
+static void ramzswap_set_disksize(struct ramzswap *rzs, size_t totalram_bytes)
+{
+	if (!rzs->disksize) {
+		pr_info(C
+		"disk size not provided. You can use disksize_kb module "
+		"param to specify size.\nUsing default: (%u%% of RAM).\n",
+		DEFAULT_DISKSIZE_PERC_RAM
+		);
+		rzs->disksize = DEFAULT_DISKSIZE_PERC_RAM *
+					(totalram_bytes / 100);
+	}
+
+	if (rzs->disksize > 2 * (totalram_bytes)) {
+		pr_info(C
+		"There is little point creating a ramzswap of greater than "
+		"twice the size of memory since we expect a 2:1 compression "
+		"ratio. Note that ramzswap uses about 0.1%% of the size of "
+		"the swap device when not in use so a huge ramzswap is "
+		"wasteful.\n"
+		"\tMemory Size: %zu kB\n"
+		"\tSize you selected: %zu kB\n"
+		"Continuing anyway ...\n",
+		totalram_bytes >> 10, rzs->disksize
+		);
+	}
+
+	rzs->disksize &= PAGE_MASK;
+	pr_info(C "disk size set to %zu kB\n", rzs->disksize >> 10);
+}
+
+/*
+ * memlimit cannot be greater than backing disk size.
+ */
+static void ramzswap_set_memlimit(struct ramzswap *rzs, size_t totalram_bytes)
+{
+	int memlimit_valid = 1;
+
+	if (!rzs->memlimit) {
+		pr_info(C "Memory limit not set.\n");
+		memlimit_valid = 0;
+	}
+
+	if (rzs->memlimit > rzs->disksize) {
+		pr_info(C "Memory limit cannot be greater than "
+			"disksize: limit=%zu, disksize=%zu\n",
+			rzs->memlimit, rzs->disksize);
+		memlimit_valid = 0;
+	}
+
+	if (!memlimit_valid) {
+		size_t mempart, disksize;
+		pr_info(C "Using default: smaller of (%u%% of RAM) and "
+			"(backing disk size).\n",
+			DEFAULT_MEMLIMIT_PERC_RAM);
+		mempart = DEFAULT_MEMLIMIT_PERC_RAM * (totalram_bytes / 100);
+		disksize = rzs->disksize;
+		rzs->memlimit = mempart > disksize ? disksize : mempart;
+	}
+
+	if (rzs->memlimit > totalram_bytes / 2) {
+		pr_info(C
+		"Its not advisable setting limit more than half of "
+		"size of memory since we expect a 2:1 compression ratio. "
+		"Limit represents amount of *compressed* data we can keep "
+		"in memory!\n"
+		"\tMemory Size: %zu kB\n"
+		"\tLimit you selected: %zu kB\n"
+		"Continuing anyway ...\n",
+		totalram_bytes >> 10, rzs->memlimit >> 10
+		);
+	}
+
+	rzs->memlimit &= PAGE_MASK;
+	BUG_ON(!rzs->memlimit);
+
+	pr_info(C "Memory limit set to %zu kB\n", rzs->memlimit >> 10);
+}
+
+static void create_device(struct ramzswap *rzs, int device_id)
+{
+	mutex_init(&rzs->lock);
+	INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
+
+	rzs->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!rzs->queue) {
+		pr_err(C "Error allocating disk queue for device %d\n",
+			device_id);
+		return;
+	}
+
+	blk_queue_make_request(rzs->queue, ramzswap_make_request);
+	blk_queue_logical_block_size(rzs->queue, PAGE_SIZE);
+
+	rzs->queue->queuedata = rzs;
+
+	 /* gendisk structure */
+	rzs->disk = alloc_disk(1);
+	if (!rzs->disk) {
+		blk_cleanup_queue(rzs->queue);
+		pr_warning(C "Error allocating disk structure for device %d\n",
+			device_id);
+		return;
+	}
+
+	rzs->disk->major = RAMZSWAP_MAJOR;
+	rzs->disk->first_minor = device_id;
+	rzs->disk->fops = &ramzswap_devops;
+	rzs->disk->queue = rzs->queue;
+	rzs->disk->private_data = rzs;
+	snprintf(rzs->disk->disk_name, 16, "ramzswap%d", device_id);
+
+	/*
+	 * Actual capacity set using RZSIO_SET_DISKSIZE_KB ioctl
+	 * or set equal to backing swap device (if provided)
+	 */
+	set_capacity(rzs->disk, 0);
+	add_disk(rzs->disk);
+
+	rzs->init_done = 0;
+
+	return;
+}
+
+static void destroy_device(struct ramzswap *rzs)
+{
+	if (rzs->disk) {
+		del_gendisk(rzs->disk);
+		put_disk(rzs->disk);
+	}
+
+	if (rzs->queue)
+		blk_cleanup_queue(rzs->queue);
+}
+
+static void reset_device(struct ramzswap *rzs)
+{
+	int is_backing_blkdev = 0;
+	size_t index, num_pages;
+	unsigned entries_per_page;
+	unsigned long entry, num_table_pages;
+
+	if (!rzs->init_done)
+		return;
+
+	if (rzs->backing_swap && !rzs->num_extents)
+		is_backing_blkdev = 1;
+
+	num_pages = rzs->disksize >> PAGE_SHIFT;
+
+	/* Free various per-device buffers */
+	kfree(rzs->compress_workmem);
+	kfree(rzs->compress_buffer);
+
+	rzs->compress_workmem = NULL;
+	rzs->compress_buffer = NULL;
+
+	/* Free all pages that are still in this ramzswap device */
+	for (index = 0; index < num_pages; index++) {
+		struct page *page;
+		u16 offset;
+
+		page = rzs->table[index].page;
+		offset = rzs->table[index].offset;
+
+		if (!page)
+			continue;
+
+		if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
+			__free_page(page);
+		else
+			xv_free(rzs->mem_pool, page, offset);
+	}
+
+	entry = 0;
+	entries_per_page = PAGE_SIZE / sizeof(*rzs->table);
+	num_table_pages = DIV_ROUND_UP(num_pages * sizeof(*rzs->table),
+					PAGE_SIZE);
+	/*
+	 * Set page->mapping to NULL for every table page.
+	 * Otherwise, we will hit bad_page() during free.
+	 */
+	while (rzs->num_extents && num_table_pages--) {
+		struct page *page;
+		page = vmalloc_to_page(&rzs->table[entry]);
+		page->mapping = NULL;
+		entry += entries_per_page;
+	}
+	vfree(rzs->table);
+	rzs->table = NULL;
+
+	xv_destroy_pool(rzs->mem_pool);
+	rzs->mem_pool = NULL;
+
+	/* Free all swap extent pages */
+	while (!list_empty(&rzs->backing_swap_extent_list)) {
+		struct page *page;
+		struct list_head *entry;
+		entry = rzs->backing_swap_extent_list.next;
+		page = list_entry(entry, struct page, lru);
+		list_del(entry);
+		__free_page(page);
+	}
+	INIT_LIST_HEAD(&rzs->backing_swap_extent_list);
+	rzs->num_extents = 0;
+
+	/* Close backing swap device, if present */
+	if (rzs->backing_swap) {
+		if (is_backing_blkdev)
+			bd_release(rzs->backing_swap);
+		filp_close(rzs->swap_file, NULL);
+		rzs->backing_swap = NULL;
+	}
+
+	/* Reset stats */
+	memset(&rzs->stats, 0, sizeof(rzs->stats));
+
+	rzs->disksize = 0;
+	rzs->memlimit = 0;
+
+	/* Back to uninitialized state */
+	rzs->init_done = 0;
+}
+
+static int __init ramzswap_init(void)
+{
+	int i;
+
+	if (NUM_DEVICES > MAX_NUM_DEVICES) {
+		pr_warning(C "Invalid value for NUM_DEVICES: %lu\n",
+						NUM_DEVICES);
+		return -EINVAL;
+	}
+
+	RAMZSWAP_MAJOR = register_blkdev(0, "ramzswap");
+	if (RAMZSWAP_MAJOR <= 0) {
+		pr_warning(C "Unable to get major number\n");
+		return -EBUSY;
+	}
+
+	if (!NUM_DEVICES) {
+		pr_info(C "NUM_DEVICES not specified. Using default: 1\n");
+		NUM_DEVICES = 1;
+	}
+
+	/* Allocate the device array and initialize each one */
+	pr_info(C "Creating %lu devices ...\n", NUM_DEVICES);
+	DEVICES = kzalloc(NUM_DEVICES * sizeof(struct ramzswap), GFP_KERNEL);
+	if (!DEVICES)
+		goto out;
+
+	for (i = 0; i < NUM_DEVICES; i++)
+		create_device(&DEVICES[i], i);
+
+	return 0;
+
+out:
+	unregister_blkdev(RAMZSWAP_MAJOR, "ramzswap");
+	return -ENOMEM;
+}
+
+static void __exit ramzswap_exit(void)
+{
+	int i;
+
+	for (i = 0; i < NUM_DEVICES; i++) {
+		destroy_device(&DEVICES[i]);
+		reset_device(&DEVICES[i]);
+	}
+
+	unregister_blkdev(RAMZSWAP_MAJOR, "ramzswap");
+
+	kfree(DEVICES);
+	pr_debug(C "Cleanup done!\n");
+}
+
+module_param(NUM_DEVICES, ulong, 0);
+MODULE_PARM_DESC(NUM_DEVICES, "Number of ramzswap devices");
+
+module_init(ramzswap_init);
+module_exit(ramzswap_exit);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
+MODULE_DESCRIPTION("Compressed RAM Based Swap Device");
diff --git a/drivers/block/ramzswap/ramzswap_drv.h b/drivers/block/ramzswap/ramzswap_drv.h
new file mode 100644
index 0000000..7f77edc
--- /dev/null
+++ b/drivers/block/ramzswap/ramzswap_drv.h
@@ -0,0 +1,183 @@
+/*
+ * Compressed RAM based swap device
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ * Project home: http://compcache.googlecode.com
+ */
+
+#ifndef _RAMZSWAP_DRV_H_
+#define _RAMZSWAP_DRV_H_
+
+#include <linux/ramzswap_ioctl.h>
+#include "xvmalloc.h"
+
+/*
+ * Some arbitrary value. This is just to catch
+ * invalid value for NUM_DEVICES module parameter.
+ */
+#define MAX_NUM_DEVICES 32
+
+/*
+ * Stored at beginning of each compressed object.
+ *
+ * It stores back-reference to table entry which points to this
+ * object. This is required to support memory defragmentation or
+ * migrating compressed pages to backing swap disk.
+ */
+struct zobj_header {
+#if 0
+	u32 table_idx;
+#endif
+};
+
+/*-- Configurable parameters */
+
+/* Default ramzswap disk size: 25% of total RAM */
+#define DEFAULT_DISKSIZE_PERC_RAM	25
+#define DEFAULT_MEMLIMIT_PERC_RAM	15
+
+/*
+ * Max compressed page size when backing device is provided.
+ * Pages that compress to size greater than this are sent to
+ * physical swap disk.
+ */
+#define MAX_CPAGE_SIZE_BDEV	(PAGE_SIZE / 2)
+
+/*
+ * Max compressed page size when there is no backing dev.
+ * Pages that compress to size greater than this are stored
+ * uncompressed in memory.
+ */
+#define MAX_CPAGE_SIZE_NOBDEV	(PAGE_SIZE / 4 * 3)
+
+/*
+ * NOTE: MAX_CPAGE_SIZE_{BDEV,NOBDEV} sizes must be
+ * less than or equal to:
+ *   XV_MAX_ALLOC_SIZE - sizeof(struct zobj_header)
+ * since otherwise xvMalloc would always return failure.
+ */
+
+/*-- End of configurable params */
+
+#define SECTOR_SHIFT		9
+#define SECTOR_SIZE		(1 << SECTOR_SHIFT)
+#define SECTORS_PER_PAGE_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
+#define SECTORS_PER_PAGE	(1 << SECTORS_PER_PAGE_SHIFT)
+
+/* Message prefix */
+#define C "ramzswap: "
+
+/* Debugging and Stats */
+#define NOP	do { } while (0)
+
+#if defined(CONFIG_BLK_DEV_RAMZSWAP_STATS)
+#define STATS
+#endif
+
+#if defined(STATS)
+#define stat_inc(stat)			((stat)++)
+#define stat_dec(stat)			((stat)--)
+#define stat_inc_if_less(stat, val1, val2) \
+				((stat) += ((val1) < (val2) ? 1 : 0))
+#define stat_dec_if_less(stat, val1, val2) \
+				((stat) -= ((val1) < (val2) ? 1 : 0))
+#else	/* STATS */
+#define stat_inc(x)			NOP
+#define stat_dec(x)			NOP
+#define stat_inc_if_less(x, v1, v2)	NOP
+#define stat_dec_if_less(x, v1, v2)	NOP
+#endif	/* STATS */
+
+/* Flags for ramzswap pages (table[page_no].flags) */
+enum rzs_pageflags {
+	/* Page is stored uncompressed */
+	RZS_UNCOMPRESSED,
+
+	/* Page consists entirely of zeros */
+	RZS_ZERO,
+
+	__NR_RZS_PAGEFLAGS,
+};
+
+/*-- Data structures */
+
+/* Indexed by page no. These have to fit exactly in PAGE_SIZE */
+struct table {
+	struct page *page;
+	u16 offset;
+	u8 count;	/* object ref count (not yet used) */
+	u8 flags;
+} __attribute__ ((aligned(4)));
+
+/* These have to fit exactly in PAGE_SIZE */
+struct ramzswap_backing_extent {
+	pgoff_t phy_pagenum;
+	pgoff_t num_pages;
+} __attribute__ ((aligned(4)));
+
+struct ramzswap_stats {
+	/* basic stats */
+	size_t compr_size;	/* compressed size of pages stored -
+				 * needed to enforce memlimit */
+	/* more stats */
+#if defined(STATS)
+	u64 num_reads;		/* failed + successful */
+	u64 num_writes;		/* --do-- */
+	u64 failed_reads;	/* can happen when memory is too low */
+	u64 failed_writes;	/* should NEVER! happen */
+	u64 invalid_io;		/* non-swap I/O requests */
+	u64 notify_free;	/* no. of pages freed by swap free callback */
+	u64 pages_discard;	/* no. of pages freed by discard callback */
+	u32 pages_zero;		/* no. of zero filled pages */
+	u32 pages_stored;	/* no. of pages currently stored */
+	u32 good_compress;	/* % of pages with compression ratio<=50% */
+	u32 pages_expand;	/* % of incompressible pages */
+	u64 bdev_num_reads;	/* no. of reads on backing dev */
+	u64 bdev_num_writes;	/* no. of writes on backing dev */
+#endif
+};
+
+struct ramzswap {
+	struct xv_pool *mem_pool;
+	void *compress_workmem;
+	void *compress_buffer;
+	struct table *table;
+	struct mutex lock;
+	struct request_queue *queue;
+	struct gendisk *disk;
+	int init_done;
+	int init_notify_callback;
+	/*
+	 * This is limit on compressed data size (stats.compr_size)
+	 * Its applicable only when backing swap device is present.
+	 */
+	size_t memlimit;	/* bytes */
+	/*
+	 * This is limit on amount of *uncompressed* worth of data
+	 * we can hold. When backing swap device is provided, it is
+	 * set equal to device size.
+	 */
+	size_t disksize;	/* bytes */
+
+	struct ramzswap_stats stats;
+
+	/* backing swap device info */
+	struct ramzswap_backing_extent *curr_extent;
+	struct list_head backing_swap_extent_list;
+	unsigned long num_extents;
+	char backing_swap_name[MAX_SWAP_NAME_LEN];
+	struct block_device *backing_swap;
+	struct file *swap_file;
+};
+
+/*-- */
+
+#endif
+
diff --git a/include/linux/ramzswap_ioctl.h b/include/linux/ramzswap_ioctl.h
new file mode 100644
index 0000000..c9a10a5
--- /dev/null
+++ b/include/linux/ramzswap_ioctl.h
@@ -0,0 +1,51 @@
+/*
+ * Compressed RAM based swap device
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ * Project home: http://compcache.googlecode.com
+ */
+
+#ifndef _RAMZSWAP_IOCTL_H_
+#define _RAMZSWAP_IOCTL_H_
+
+#define MAX_SWAP_NAME_LEN 128
+
+struct ramzswap_ioctl_stats {
+	char backing_swap_name[MAX_SWAP_NAME_LEN];
+	u64 memlimit;		/* only applicable if backing swap present */
+	u64 disksize;		/* user specified or equal to backing swap
+				 * size (if present) */
+	u64 num_reads;		/* failed + successful */
+	u64 num_writes;		/* --do-- */
+	u64 failed_reads;	/* can happen when memory is too low */
+	u64 failed_writes;	/* should NEVER! happen */
+	u64 invalid_io;		/* non-swap I/O requests */
+	u64 notify_free;	/* no. of pages freed by swap free callback */
+	u64 pages_discard;	/* no. of pages freed by discard callback */
+	u32 pages_zero;		/* no. of zero filled pages */
+	u32 good_compress_pct;	/* no. of pages with compression ratio<=50% */
+	u32 pages_expand_pct;	/* no. of incompressible pages */
+	u32 pages_stored;
+	u32 pages_used;
+	u64 orig_data_size;
+	u64 compr_data_size;
+	u64 mem_used_total;
+	u64 bdev_num_reads;	/* no. of reads on backing dev */
+	u64 bdev_num_writes;	/* no. of writes on backing dev */
+} __attribute__ ((packed, aligned(4)));
+
+#define RZSIO_SET_DISKSIZE_KB	_IOW('z', 0, size_t)
+#define RZSIO_SET_MEMLIMIT_KB	_IOW('z', 1, size_t)
+#define RZSIO_SET_BACKING_SWAP	_IOW('z', 2, unsigned char[MAX_SWAP_NAME_LEN])
+#define RZSIO_GET_STATS		_IOR('z', 3, struct ramzswap_ioctl_stats)
+#define RZSIO_INIT		_IO('z', 4)
+#define RZSIO_RESET		_IO('z', 5)
+
+#endif

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH 4/4] documentation
       [not found] <200909100215.36350.ngupta@vflare.org>
@ 2009-09-09 21:21   ` Nitin Gupta
  2009-09-09 21:21   ` Nitin Gupta
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-09 21:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Pekka Enberg, Ed Tomlinson, linux-kernel, linux-mm,
	linux-mm-cc

Short guide on how to setup and use ramzswap.

Signed-off-by: Nitin Gupta <ngupta@vflare.org>
---

 Documentation/blockdev/00-INDEX     |    2 +
 Documentation/blockdev/ramzswap.txt |   50 +++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/Documentation/blockdev/00-INDEX b/Documentation/blockdev/00-INDEX
index c08df56..c1cb074 100644
--- a/Documentation/blockdev/00-INDEX
+++ b/Documentation/blockdev/00-INDEX
@@ -16,3 +16,5 @@ paride.txt
 	- information about the parallel port IDE subsystem.
 ramdisk.txt
 	- short guide on how to set up and use the RAM disk.
+ramzswap.txt
+	- short guide on how to setup compressed in-memory swap device.
diff --git a/Documentation/blockdev/ramzswap.txt b/Documentation/blockdev/ramzswap.txt
new file mode 100644
index 0000000..b026715
--- /dev/null
+++ b/Documentation/blockdev/ramzswap.txt
@@ -0,0 +1,50 @@
+ramzswap: Compressed RAM based swap device
+-------------------------------------------
+
+Project home: http://compcache.googlecode.com/
+
+* Introduction
+
+It creates RAM based block devices which can be used (only) as swap disks.
+Pages swapped to these devices are compressed and stored in memory itself.
+See project home for use cases, performance numbers and a lot more.
+
+The configuration parameters for individual ramzswap devices are set using
+rzscontrol userspace utility as shown below.
+
+* Usage
+
+Following shows a typical sequence of steps for using ramzswap.
+
+1) Load Modules:
+	modprobe ramzswap NUM_DEVICES=4
+	This creates 4 (uninitialized) devices: /dev/ramzswap{0,1,2,3}
+	(NUM_DEVICES parameter is optional. Default: 1)
+
+2) Initialize:
+	Use rzscontrol utility to configure and initialize individual
+	ramzswap devices. Example:
+	rzscontrol /dev/ramzswap2 --init # uses default value of disksize_kb
+
+	*See rzscontrol manpage for more details and examples*
+
+3) Activate:
+	swapon /dev/ramzswap2 # or any other initialized ramzswap device
+
+4) Stats:
+	rzscontrol /dev/ramzswap2 --stats
+
+5) Deactivate:
+	swapoff /dev/ramzswap2
+
+6) Reset:
+	rzscontrol /dev/ramzswap2 --reset
+	(This frees all the memory allocated for this device).
+
+
+Please report any problems at:
+ - Mailing list: linux-mm-cc at laptop dot org
+ - Issue tracker: http://code.google.com/p/compcache/issues/list
+
+Nitin Gupta
+ngupta@vflare.org

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH 4/4] documentation
@ 2009-09-09 21:21   ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-09 21:21 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Pekka Enberg, Ed Tomlinson, linux-kernel, linux-mm,
	linux-mm-cc

Short guide on how to setup and use ramzswap.

Signed-off-by: Nitin Gupta <ngupta@vflare.org>
---

 Documentation/blockdev/00-INDEX     |    2 +
 Documentation/blockdev/ramzswap.txt |   50 +++++++++++++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 0 deletions(-)

diff --git a/Documentation/blockdev/00-INDEX b/Documentation/blockdev/00-INDEX
index c08df56..c1cb074 100644
--- a/Documentation/blockdev/00-INDEX
+++ b/Documentation/blockdev/00-INDEX
@@ -16,3 +16,5 @@ paride.txt
 	- information about the parallel port IDE subsystem.
 ramdisk.txt
 	- short guide on how to set up and use the RAM disk.
+ramzswap.txt
+	- short guide on how to setup compressed in-memory swap device.
diff --git a/Documentation/blockdev/ramzswap.txt b/Documentation/blockdev/ramzswap.txt
new file mode 100644
index 0000000..b026715
--- /dev/null
+++ b/Documentation/blockdev/ramzswap.txt
@@ -0,0 +1,50 @@
+ramzswap: Compressed RAM based swap device
+-------------------------------------------
+
+Project home: http://compcache.googlecode.com/
+
+* Introduction
+
+It creates RAM based block devices which can be used (only) as swap disks.
+Pages swapped to these devices are compressed and stored in memory itself.
+See project home for use cases, performance numbers and a lot more.
+
+The configuration parameters for individual ramzswap devices are set using
+rzscontrol userspace utility as shown below.
+
+* Usage
+
+Following shows a typical sequence of steps for using ramzswap.
+
+1) Load Modules:
+	modprobe ramzswap NUM_DEVICES=4
+	This creates 4 (uninitialized) devices: /dev/ramzswap{0,1,2,3}
+	(NUM_DEVICES parameter is optional. Default: 1)
+
+2) Initialize:
+	Use rzscontrol utility to configure and initialize individual
+	ramzswap devices. Example:
+	rzscontrol /dev/ramzswap2 --init # uses default value of disksize_kb
+
+	*See rzscontrol manpage for more details and examples*
+
+3) Activate:
+	swapon /dev/ramzswap2 # or any other initialized ramzswap device
+
+4) Stats:
+	rzscontrol /dev/ramzswap2 --stats
+
+5) Deactivate:
+	swapoff /dev/ramzswap2
+
+6) Reset:
+	rzscontrol /dev/ramzswap2 --reset
+	(This frees all the memory allocated for this device).
+
+
+Please report any problems at:
+ - Mailing list: linux-mm-cc at laptop dot org
+ - Issue tracker: http://code.google.com/p/compcache/issues/list
+
+Nitin Gupta
+ngupta@vflare.org

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH 3/4] send callback when swap slot is freed
       [not found] <200909100215.36350.ngupta@vflare.org>
@ 2009-09-09 21:50   ` Nitin Gupta
  2009-09-09 21:21   ` Nitin Gupta
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-09 21:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Pekka Enberg, Ed Tomlinson, linux-kernel, linux-mm,
	linux-mm-cc

Currently, we have "swap discard" mechanism which sends a discard bio request
when we find a free cluster during scan_swap_map(). This callback can come a
long time after swap slots are actually freed.

This delay in callback is a great problem when (compressed) RAM [1] is used
as a swap device. So, this change adds a callback which is called as
soon as a swap slot becomes free. For above mentioned case of swapping
over compressed RAM device, this is very useful since we can immediately
free memory allocated for this swap page.

This callback does not replace swap discard support. It is called with
swap_lock held, so it is meant to trigger action that finishes quickly.
However, swap discard is an I/O request and can be used for taking longer
actions.

It is preferred to use this callback for ramzswap case even if discard
mechanism could be improved such that it can be called as often as required.
This is because, allocation of 'bio'(s) is undesirable since ramzswap always
operates under low memory conditions (its a swap device). Also, batching of
discard bio requests is not optimal since stale data can accumulate very
quickly in ramzswap devices, pushing system further into low memory state.

Links:
[1] http://compcache.googlecode.com/

Signed-off-by: Nitin Gupta <ngupta@vflare.org>
---

 include/linux/swap.h |    5 +++++
 mm/swapfile.c        |   34 ++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 0 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7c15334..64796fc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -8,6 +8,7 @@
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
 #include <linux/node.h>
+#include <linux/blkdev.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -20,6 +21,8 @@ struct bio;
 #define SWAP_FLAG_PRIO_MASK	0x7fff
 #define SWAP_FLAG_PRIO_SHIFT	0
 
+typedef void (swap_free_notify_fn) (struct block_device *, unsigned long);
+
 static inline int current_is_kswapd(void)
 {
 	return current->flags & PF_KSWAPD;
@@ -155,6 +158,7 @@ struct swap_info_struct {
 	unsigned int max;
 	unsigned int inuse_pages;
 	unsigned int old_block_size;
+	swap_free_notify_fn *swap_free_notify_fn;
 };
 
 struct swap_list_t {
@@ -295,6 +299,7 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
+extern void set_swap_free_notify(struct block_device *, swap_free_notify_fn *);
 struct backing_dev_info;
 
 /* linux/mm/thrash.c */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8ffdc0d..fb6b8b2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -552,6 +552,38 @@ out:
 	return NULL;
 }
 
+/*
+ * Sets callback for event when swap_map[offset] == 0
+ * i.e. page at this swap offset is no longer used.
+ */
+void set_swap_free_notify(struct block_device *bdev,
+			swap_free_notify_fn *notify_fn)
+{
+	unsigned int i;
+	struct swap_info_struct *sis;
+
+	spin_lock(&swap_lock);
+	for (i = 0; i <= nr_swapfiles; i++) {
+		sis = &swap_info[i];
+		if (!(sis->flags & SWP_USED))
+			continue;
+		if (sis->bdev == bdev)
+			break;
+	}
+
+	/* swap device not found */
+	if (i > nr_swapfiles) {
+		spin_unlock(&swap_lock);
+		return;
+	}
+
+	BUG_ON(!sis || sis->swap_free_notify_fn);
+	sis->swap_free_notify_fn = notify_fn;
+	spin_unlock(&swap_lock);
+	return;
+}
+EXPORT_SYMBOL_GPL(set_swap_free_notify);
+
 static int swap_entry_free(struct swap_info_struct *p,
 			   swp_entry_t ent, int cache)
 {
@@ -583,6 +615,8 @@ static int swap_entry_free(struct swap_info_struct *p,
 			swap_list.next = p - swap_info;
 		nr_swap_pages++;
 		p->inuse_pages--;
+		if (p->swap_free_notify_fn)
+			p->swap_free_notify_fn(p->bdev, offset);
 	}
 	if (!swap_count(count))
 		mem_cgroup_uncharge_swap(ent);

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH 3/4] send callback when swap slot is freed
@ 2009-09-09 21:50   ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-09 21:50 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Pekka Enberg, Ed Tomlinson, linux-kernel, linux-mm,
	linux-mm-cc

Currently, we have "swap discard" mechanism which sends a discard bio request
when we find a free cluster during scan_swap_map(). This callback can come a
long time after swap slots are actually freed.

This delay in callback is a great problem when (compressed) RAM [1] is used
as a swap device. So, this change adds a callback which is called as
soon as a swap slot becomes free. For above mentioned case of swapping
over compressed RAM device, this is very useful since we can immediately
free memory allocated for this swap page.

This callback does not replace swap discard support. It is called with
swap_lock held, so it is meant to trigger action that finishes quickly.
However, swap discard is an I/O request and can be used for taking longer
actions.

It is preferred to use this callback for ramzswap case even if discard
mechanism could be improved such that it can be called as often as required.
This is because, allocation of 'bio'(s) is undesirable since ramzswap always
operates under low memory conditions (its a swap device). Also, batching of
discard bio requests is not optimal since stale data can accumulate very
quickly in ramzswap devices, pushing system further into low memory state.

Links:
[1] http://compcache.googlecode.com/

Signed-off-by: Nitin Gupta <ngupta@vflare.org>
---

 include/linux/swap.h |    5 +++++
 mm/swapfile.c        |   34 ++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 0 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7c15334..64796fc 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -8,6 +8,7 @@
 #include <linux/memcontrol.h>
 #include <linux/sched.h>
 #include <linux/node.h>
+#include <linux/blkdev.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -20,6 +21,8 @@ struct bio;
 #define SWAP_FLAG_PRIO_MASK	0x7fff
 #define SWAP_FLAG_PRIO_SHIFT	0
 
+typedef void (swap_free_notify_fn) (struct block_device *, unsigned long);
+
 static inline int current_is_kswapd(void)
 {
 	return current->flags & PF_KSWAPD;
@@ -155,6 +158,7 @@ struct swap_info_struct {
 	unsigned int max;
 	unsigned int inuse_pages;
 	unsigned int old_block_size;
+	swap_free_notify_fn *swap_free_notify_fn;
 };
 
 struct swap_list_t {
@@ -295,6 +299,7 @@ extern sector_t swapdev_block(int, pgoff_t);
 extern struct swap_info_struct *get_swap_info_struct(unsigned);
 extern int reuse_swap_page(struct page *);
 extern int try_to_free_swap(struct page *);
+extern void set_swap_free_notify(struct block_device *, swap_free_notify_fn *);
 struct backing_dev_info;
 
 /* linux/mm/thrash.c */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8ffdc0d..fb6b8b2 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -552,6 +552,38 @@ out:
 	return NULL;
 }
 
+/*
+ * Sets callback for event when swap_map[offset] == 0
+ * i.e. page at this swap offset is no longer used.
+ */
+void set_swap_free_notify(struct block_device *bdev,
+			swap_free_notify_fn *notify_fn)
+{
+	unsigned int i;
+	struct swap_info_struct *sis;
+
+	spin_lock(&swap_lock);
+	for (i = 0; i <= nr_swapfiles; i++) {
+		sis = &swap_info[i];
+		if (!(sis->flags & SWP_USED))
+			continue;
+		if (sis->bdev == bdev)
+			break;
+	}
+
+	/* swap device not found */
+	if (i > nr_swapfiles) {
+		spin_unlock(&swap_lock);
+		return;
+	}
+
+	BUG_ON(!sis || sis->swap_free_notify_fn);
+	sis->swap_free_notify_fn = notify_fn;
+	spin_unlock(&swap_lock);
+	return;
+}
+EXPORT_SYMBOL_GPL(set_swap_free_notify);
+
 static int swap_entry_free(struct swap_info_struct *p,
 			   swp_entry_t ent, int cache)
 {
@@ -583,6 +615,8 @@ static int swap_entry_free(struct swap_info_struct *p,
 			swap_list.next = p - swap_info;
 		nr_swap_pages++;
 		p->inuse_pages--;
+		if (p->swap_free_notify_fn)
+			p->swap_free_notify_fn(p->bdev, offset);
 	}
 	if (!swap_count(count))
 		mem_cgroup_uncharge_swap(ent);

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH 1/4] xvmalloc memory allocator
       [not found] <200909100215.36350.ngupta@vflare.org>
@ 2009-09-09 21:53   ` Nitin Gupta
  2009-09-09 21:21   ` Nitin Gupta
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-09 21:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Pekka Enberg, Christoph Lameter, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc

xvmalloc is a memory allocator designed specifically for ramzswap project.

* Features:
 - Low metadata overhead (just 4 bytes per object)
 - O(1) Alloc/Free - except when we have to call system page allocator to
   get additional memory.
 - Very low fragmentation: In all tests, xvmalloc memory usage is within 12%
   of "Ideal".
 - Pool based allocator: Each pool can grow and shrink.
 - It maps pages only when required. So, it does not hog vmalloc area which
   is very small on 32-bit systems.

SLUB allocator could not be used due to fragmentation issues:
http://code.google.com/p/compcache/wiki/AllocatorsComparison
Data here shows kmalloc using ~43% more memory than TLSF and xvMalloc
is showed ~2% more space efficiency than TLSF (due to smaller metadata).
Creating various kmem_caches can reduce space efficiency gap but still
problem of being limited to low memory exists. Also, it depends on
allocating higher order pages to reduce fragmentation - this is not
acceptable for ramzswap as it is used under memory crunch (its a swap
device!).

SLOB allocator could not be used do to reasons mentioned here:
http://lkml.org/lkml/2009/3/18/210

* Implementation:
It uses two-level bitmap search to find free list containing block of
correct size. This idea is taken from TLSF (Two-Level Segregate Fit)
allocator and is well explained in its paper (see [Links] below).

* Limitations:
 - Poor scalability: No per-cpu data structures (work in progress).

[Links]
1. Details and Performance data:
http://code.google.com/p/compcache/wiki/xvMalloc
http://code.google.com/p/compcache/wiki/xvMallocPerformance

2. TLSF memory allocator:
home: http://rtportal.upv.es/rtmalloc/
paper: http://rtportal.upv.es/rtmalloc/files/MRBC_2008.pdf

Signed-off-by: Nitin Gupta <ngupta@vflare.org>
---

 drivers/block/ramzswap/xvmalloc.c     |  533 +++++++++++++++++++++++++++++++++
 drivers/block/ramzswap/xvmalloc.h     |   30 ++
 drivers/block/ramzswap/xvmalloc_int.h |   86 ++++++
 3 files changed, 649 insertions(+), 0 deletions(-)

diff --git a/drivers/block/ramzswap/xvmalloc.c b/drivers/block/ramzswap/xvmalloc.c
new file mode 100644
index 0000000..12ad1a3
--- /dev/null
+++ b/drivers/block/ramzswap/xvmalloc.c
@@ -0,0 +1,533 @@
+/*
+ * xvmalloc memory allocator
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include "xvmalloc.h"
+#include "xvmalloc_int.h"
+
+static void stat_inc(u64 *value)
+{
+	*value = *value + 1;
+}
+
+static void stat_dec(u64 *value)
+{
+	*value = *value - 1;
+}
+
+static int test_flag(struct block_header *block, enum blockflags flag)
+{
+	return block->prev & BIT(flag);
+}
+
+static void set_flag(struct block_header *block, enum blockflags flag)
+{
+	block->prev |= BIT(flag);
+}
+
+static void clear_flag(struct block_header *block, enum blockflags flag)
+{
+	block->prev &= ~BIT(flag);
+}
+
+/*
+ * Given <page, offset> pair, provide a derefrencable pointer.
+ * This is called from xv_malloc/xv_free path, so it
+ * needs to be fast.
+ */
+static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
+{
+	unsigned char *base;
+
+	base = kmap_atomic(page, type);
+	return base + offset;
+}
+
+static void put_ptr_atomic(void *ptr, enum km_type type)
+{
+	kunmap_atomic(ptr, type);
+}
+
+static u32 get_blockprev(struct block_header *block)
+{
+	return block->prev & PREV_MASK;
+}
+
+static void set_blockprev(struct block_header *block, u16 new_offset)
+{
+	block->prev = new_offset | (block->prev & FLAGS_MASK);
+}
+
+static struct block_header *BLOCK_NEXT(struct block_header *block)
+{
+	return (struct block_header *)
+		((char *)block + block->size + XV_ALIGN);
+}
+
+/*
+ * Get index of free list containing blocks of maximum size
+ * which is less than or equal to given size.
+ */
+static u32 get_index_for_insert(u32 size)
+{
+	if (unlikely(size > XV_MAX_ALLOC_SIZE))
+		size = XV_MAX_ALLOC_SIZE;
+	size &= ~FL_DELTA_MASK;
+	return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
+}
+
+/*
+ * Get index of free list having blocks of size greater than
+ * or equal to requested size.
+ */
+static u32 get_index(u32 size)
+{
+	if (unlikely(size < XV_MIN_ALLOC_SIZE))
+		size = XV_MIN_ALLOC_SIZE;
+	size = ALIGN(size, FL_DELTA);
+	return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
+}
+
+/*
+ * Allocate a memory page. Called when a pool needs to grow.
+ */
+static struct page *xv_alloc_page(gfp_t flags)
+{
+	struct page *page;
+
+	page = alloc_page(flags);
+	if (unlikely(!page))
+		return 0;
+
+	return page;
+}
+
+/*
+ * Called when all objects in a page are freed.
+ */
+static void xv_free_page(struct page *page)
+{
+	__free_page(page);
+}
+
+/**
+ * find_block - find block of at least given size
+ * @pool: memory pool to search from
+ * @size: size of block required
+ * @page: page containing required block
+ * @offset: offset within the page where block is located.
+ *
+ * Searches two level bitmap to locate block of at least
+ * the given size. If such a block is found, it provides
+ * <page, offset> to identify this block and returns index
+ * in freelist where we found this block.
+ * Otherwise, returns 0 and <page, offset> params are not touched.
+ */
+static u32 find_block(struct xv_pool *pool, u32 size,
+			struct page **page, u32 *offset)
+{
+	ulong flbitmap, slbitmap;
+	u32 flindex, slindex, slbitstart;
+
+	/* There are no free blocks in this pool */
+	if (!pool->flbitmap)
+		return 0;
+
+	/* Get freelist index correspoding to this size */
+	slindex = get_index(size);
+	slbitmap = pool->slbitmap[slindex / BITS_PER_LONG];
+	slbitstart = slindex % BITS_PER_LONG;
+
+	/*
+	 * If freelist is not empty at this index, we found the
+	 * block - head of this list. This is approximate best-fit match.
+	 */
+	if (test_bit(slbitstart, &slbitmap)) {
+		*page = pool->freelist[slindex].page;
+		*offset = pool->freelist[slindex].offset;
+		return slindex;
+	}
+
+	/*
+	 * No best-fit found. Search a bit further in bitmap for a free block.
+	 * Second level bitmap consists of series of 32-bit chunks. Search
+	 * further in the chunk where we expected a best-fit, starting from
+	 * index location found above.
+	 */
+	slbitstart++;
+	slbitmap >>= slbitstart;
+
+	/* Skip this search if we were already at end of this bitmap chunk */
+	if ((slbitstart != BITS_PER_LONG) && slbitmap) {
+		slindex += __ffs(slbitmap) + 1;
+		*page = pool->freelist[slindex].page;
+		*offset = pool->freelist[slindex].offset;
+		return slindex;
+	}
+
+	/* Now do a full two-level bitmap search to find next nearest fit */
+	flindex = slindex / BITS_PER_LONG;
+
+	flbitmap = (pool->flbitmap) >> (flindex + 1);
+	if (!flbitmap)
+		return 0;
+
+	flindex += __ffs(flbitmap) + 1;
+	slbitmap = pool->slbitmap[flindex];
+	slindex = (flindex * BITS_PER_LONG) + __ffs(slbitmap);
+	*page = pool->freelist[slindex].page;
+	*offset = pool->freelist[slindex].offset;
+
+	return slindex;
+}
+
+/*
+ * Insert block at <page, offset> in freelist of given pool.
+ * freelist used depends on block size.
+ */
+static void insert_block(struct xv_pool *pool, struct page *page, u32 offset,
+			struct block_header *block)
+{
+	u32 flindex, slindex;
+	struct block_header *nextblock;
+
+	slindex = get_index_for_insert(block->size);
+	flindex = slindex / BITS_PER_LONG;
+
+	block->link.prev_page = 0;
+	block->link.prev_offset = 0;
+	block->link.next_page = pool->freelist[slindex].page;
+	block->link.next_offset = pool->freelist[slindex].offset;
+	pool->freelist[slindex].page = page;
+	pool->freelist[slindex].offset = offset;
+
+	if (block->link.next_page) {
+		nextblock = get_ptr_atomic(block->link.next_page,
+					block->link.next_offset, KM_USER1);
+		nextblock->link.prev_page = page;
+		nextblock->link.prev_offset = offset;
+		put_ptr_atomic(nextblock, KM_USER1);
+	}
+
+	__set_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
+	__set_bit(flindex, &pool->flbitmap);
+}
+
+/*
+ * Remove block from head of freelist. Index 'slindex' identifies the freelist.
+ */
+static void remove_block_head(struct xv_pool *pool,
+			struct block_header *block, u32 slindex)
+{
+	struct block_header *tmpblock;
+	u32 flindex = slindex / BITS_PER_LONG;
+
+	pool->freelist[slindex].page = block->link.next_page;
+	pool->freelist[slindex].offset = block->link.next_offset;
+	block->link.prev_page = 0;
+	block->link.prev_offset = 0;
+
+	if (!pool->freelist[slindex].page) {
+		__clear_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
+		if (!pool->slbitmap[flindex])
+			__clear_bit(flindex, &pool->flbitmap);
+	} else {
+		/*
+		 * DEBUG ONLY: We need not reinitialize freelist head previous
+		 * pointer to 0 - we never depend on its value. But just for
+		 * sanity, lets do it.
+		 */
+		tmpblock = get_ptr_atomic(pool->freelist[slindex].page,
+				pool->freelist[slindex].offset, KM_USER1);
+		tmpblock->link.prev_page = 0;
+		tmpblock->link.prev_offset = 0;
+		put_ptr_atomic(tmpblock, KM_USER1);
+	}
+}
+
+/*
+ * Remove block from freelist. Index 'slindex' identifies the freelist.
+ */
+static void remove_block(struct xv_pool *pool, struct page *page, u32 offset,
+			struct block_header *block, u32 slindex)
+{
+	u32 flindex;
+	struct block_header *tmpblock;
+
+	if (pool->freelist[slindex].page == page
+	   && pool->freelist[slindex].offset == offset) {
+		remove_block_head(pool, block, slindex);
+		return;
+	}
+
+	flindex = slindex / BITS_PER_LONG;
+
+	if (block->link.prev_page) {
+		tmpblock = get_ptr_atomic(block->link.prev_page,
+				block->link.prev_offset, KM_USER1);
+		tmpblock->link.next_page = block->link.next_page;
+		tmpblock->link.next_offset = block->link.next_offset;
+		put_ptr_atomic(tmpblock, KM_USER1);
+	}
+
+	if (block->link.next_page) {
+		tmpblock = get_ptr_atomic(block->link.next_page,
+				block->link.next_offset, KM_USER1);
+		tmpblock->link.prev_page = block->link.prev_page;
+		tmpblock->link.prev_offset = block->link.prev_offset;
+		put_ptr_atomic(tmpblock, KM_USER1);
+	}
+
+	return;
+}
+
+/*
+ * Allocate a page and add it freelist of given pool.
+ */
+static int grow_pool(struct xv_pool *pool, gfp_t flags)
+{
+	struct page *page;
+	struct block_header *block;
+
+	page = xv_alloc_page(flags);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	stat_inc(&pool->total_pages);
+
+	spin_lock(&pool->lock);
+	block = get_ptr_atomic(page, 0, KM_USER0);
+
+	block->size = PAGE_SIZE - XV_ALIGN;
+	set_flag(block, BLOCK_FREE);
+	clear_flag(block, PREV_FREE);
+	set_blockprev(block, 0);
+
+	insert_block(pool, page, 0, block);
+
+	put_ptr_atomic(block, KM_USER0);
+	spin_unlock(&pool->lock);
+
+	return 0;
+}
+
+/*
+ * Create a memory pool. Allocates freelist, bitmaps and other
+ * per-pool metadata.
+ */
+struct xv_pool *xv_create_pool(void)
+{
+	u32 ovhd_size;
+	struct xv_pool *pool;
+
+	ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
+	pool = kzalloc(ovhd_size, GFP_KERNEL);
+	if (!pool)
+		return NULL;
+
+	spin_lock_init(&pool->lock);
+
+	return pool;
+}
+
+void xv_destroy_pool(struct xv_pool *pool)
+{
+	kfree(pool);
+}
+
+/**
+ * xv_malloc - Allocate block of given size from pool.
+ * @pool: pool to allocate from
+ * @size: size of block to allocate
+ * @page: page no. that holds the object
+ * @offset: location of object within page
+ *
+ * On success, <page, offset> identifies block allocated
+ * and 0 is returned. On failure, <page, offset> is set to
+ * 0 and -ENOMEM is returned.
+ *
+ * Allocation requests with size > XV_MAX_ALLOC_SIZE will fail.
+ */
+int xv_malloc(struct xv_pool *pool, u32 size, struct page **page,
+		u32 *offset, gfp_t flags)
+{
+	int error;
+	u32 index, tmpsize, origsize, tmpoffset;
+	struct block_header *block, *tmpblock;
+
+	*page = NULL;
+	*offset = 0;
+	origsize = size;
+
+	if (unlikely(!size || size > XV_MAX_ALLOC_SIZE))
+		return -ENOMEM;
+
+	size = ALIGN(size, XV_ALIGN);
+
+	spin_lock(&pool->lock);
+
+	index = find_block(pool, size, page, offset);
+
+	if (!*page) {
+		spin_unlock(&pool->lock);
+		if (flags & GFP_NOWAIT)
+			return -ENOMEM;
+		error = grow_pool(pool, flags);
+		if (unlikely(error))
+			return -ENOMEM;
+
+		spin_lock(&pool->lock);
+		index = find_block(pool, size, page, offset);
+	}
+
+	if (!*page) {
+		spin_unlock(&pool->lock);
+		return -ENOMEM;
+	}
+
+	block = get_ptr_atomic(*page, *offset, KM_USER0);
+
+	remove_block_head(pool, block, index);
+
+	/* Split the block if required */
+	tmpoffset = *offset + size + XV_ALIGN;
+	tmpsize = block->size - size;
+	tmpblock = (struct block_header *)((char *)block + size + XV_ALIGN);
+	if (tmpsize) {
+		tmpblock->size = tmpsize - XV_ALIGN;
+		set_flag(tmpblock, BLOCK_FREE);
+		clear_flag(tmpblock, PREV_FREE);
+
+		set_blockprev(tmpblock, *offset);
+		if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
+			insert_block(pool, *page, tmpoffset, tmpblock);
+
+		if (tmpoffset + XV_ALIGN + tmpblock->size != PAGE_SIZE) {
+			tmpblock = BLOCK_NEXT(tmpblock);
+			set_blockprev(tmpblock, tmpoffset);
+		}
+	} else {
+		/* This block is exact fit */
+		if (tmpoffset != PAGE_SIZE)
+			clear_flag(tmpblock, PREV_FREE);
+	}
+
+	block->size = origsize;
+	clear_flag(block, BLOCK_FREE);
+
+	put_ptr_atomic(block, KM_USER0);
+	spin_unlock(&pool->lock);
+
+	*offset += XV_ALIGN;
+
+	return 0;
+}
+
+/*
+ * Free block identified with <page, offset>
+ */
+void xv_free(struct xv_pool *pool, struct page *page, u32 offset)
+{
+	void *page_start;
+	struct block_header *block, *tmpblock;
+
+	offset -= XV_ALIGN;
+
+	spin_lock(&pool->lock);
+
+	page_start = get_ptr_atomic(page, 0, KM_USER0);
+	block = (struct block_header *)((char *)page_start + offset);
+
+	/* Catch double free bugs */
+	BUG_ON(test_flag(block, BLOCK_FREE));
+
+	block->size = ALIGN(block->size, XV_ALIGN);
+
+	tmpblock = BLOCK_NEXT(block);
+	if (offset + block->size + XV_ALIGN == PAGE_SIZE)
+		tmpblock = NULL;
+
+	/* Merge next block if its free */
+	if (tmpblock && test_flag(tmpblock, BLOCK_FREE)) {
+		/*
+		 * Blocks smaller than XV_MIN_ALLOC_SIZE
+		 * are not inserted in any free list.
+		 */
+		if (tmpblock->size >= XV_MIN_ALLOC_SIZE) {
+			remove_block(pool, page,
+				    offset + block->size + XV_ALIGN, tmpblock,
+				    get_index_for_insert(tmpblock->size));
+		}
+		block->size += tmpblock->size + XV_ALIGN;
+	}
+
+	/* Merge previous block if its free */
+	if (test_flag(block, PREV_FREE)) {
+		tmpblock = (struct block_header *)((char *)(page_start) +
+						get_blockprev(block));
+		offset = offset - tmpblock->size - XV_ALIGN;
+
+		if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
+			remove_block(pool, page, offset, tmpblock,
+				    get_index_for_insert(tmpblock->size));
+
+		tmpblock->size += block->size + XV_ALIGN;
+		block = tmpblock;
+	}
+
+	/* No used objects in this page. Free it. */
+	if (block->size == PAGE_SIZE - XV_ALIGN) {
+		put_ptr_atomic(page_start, KM_USER0);
+		spin_unlock(&pool->lock);
+
+		xv_free_page(page);
+		stat_dec(&pool->total_pages);
+		return;
+	}
+
+	set_flag(block, BLOCK_FREE);
+	if (block->size >= XV_MIN_ALLOC_SIZE)
+		insert_block(pool, page, offset, block);
+
+	if (offset + block->size + XV_ALIGN != PAGE_SIZE) {
+		tmpblock = BLOCK_NEXT(block);
+		set_flag(tmpblock, PREV_FREE);
+		set_blockprev(tmpblock, offset);
+	}
+
+	put_ptr_atomic(page_start, KM_USER0);
+	spin_unlock(&pool->lock);
+
+	return;
+}
+
+u32 xv_get_object_size(void *obj)
+{
+	struct block_header *blk;
+
+	blk = (struct block_header *)((char *)(obj) - XV_ALIGN);
+	return blk->size;
+}
+
+/*
+ * Returns total memory used by allocator (userdata + metadata)
+ */
+u64 xv_get_total_size_bytes(struct xv_pool *pool)
+{
+	return pool->total_pages << PAGE_SHIFT;
+}
diff --git a/drivers/block/ramzswap/xvmalloc.h b/drivers/block/ramzswap/xvmalloc.h
new file mode 100644
index 0000000..010c6fe
--- /dev/null
+++ b/drivers/block/ramzswap/xvmalloc.h
@@ -0,0 +1,30 @@
+/*
+ * xvmalloc memory allocator
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+#ifndef _XV_MALLOC_H_
+#define _XV_MALLOC_H_
+
+#include <linux/types.h>
+
+struct xv_pool;
+
+struct xv_pool *xv_create_pool(void);
+void xv_destroy_pool(struct xv_pool *pool);
+
+int xv_malloc(struct xv_pool *pool, u32 size, struct page **page,
+			u32 *offset, gfp_t flags);
+void xv_free(struct xv_pool *pool, struct page *page, u32 offset);
+
+u32 xv_get_object_size(void *obj);
+u64 xv_get_total_size_bytes(struct xv_pool *pool);
+
+#endif
diff --git a/drivers/block/ramzswap/xvmalloc_int.h b/drivers/block/ramzswap/xvmalloc_int.h
new file mode 100644
index 0000000..03c1a65
--- /dev/null
+++ b/drivers/block/ramzswap/xvmalloc_int.h
@@ -0,0 +1,86 @@
+/*
+ * xvmalloc memory allocator
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+#ifndef _XV_MALLOC_INT_H_
+#define _XV_MALLOC_INT_H_
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+/* User configurable params */
+
+/* Must be power of two */
+#define XV_ALIGN_SHIFT	2
+#define XV_ALIGN	(1 << XV_ALIGN_SHIFT)
+#define XV_ALIGN_MASK	(XV_ALIGN - 1)
+
+/* This must be greater than sizeof(link_free) */
+#define XV_MIN_ALLOC_SIZE	32
+#define XV_MAX_ALLOC_SIZE	(PAGE_SIZE - XV_ALIGN)
+
+/* Free lists are separated by FL_DELTA bytes */
+#define FL_DELTA_SHIFT	3
+#define FL_DELTA	(1 << FL_DELTA_SHIFT)
+#define FL_DELTA_MASK	(FL_DELTA - 1)
+#define NUM_FREE_LISTS	((XV_MAX_ALLOC_SIZE - XV_MIN_ALLOC_SIZE) \
+				/ FL_DELTA + 1)
+
+#define MAX_FLI		DIV_ROUND_UP(NUM_FREE_LISTS, BITS_PER_LONG)
+
+/* End of user params */
+
+enum blockflags {
+	BLOCK_FREE,
+	PREV_FREE,
+	__NR_BLOCKFLAGS,
+};
+
+#define FLAGS_MASK	XV_ALIGN_MASK
+#define PREV_MASK	(~FLAGS_MASK)
+
+struct freelist_entry {
+	struct page *page;
+	u16 offset;
+	u16 pad;
+};
+
+struct link_free {
+	struct page *prev_page;
+	struct page *next_page;
+	u16 prev_offset;
+	u16 next_offset;
+};
+
+struct block_header {
+	union {
+		/* This common header must be ALIGN bytes */
+		u8 common[XV_ALIGN];
+		struct {
+			u16 size;
+			u16 prev;
+		};
+	};
+	struct link_free link;
+};
+
+struct xv_pool {
+	ulong flbitmap;
+	ulong slbitmap[MAX_FLI];
+	spinlock_t lock;
+
+	struct freelist_entry freelist[NUM_FREE_LISTS];
+
+	/* stats */
+	u64 total_pages;
+};
+
+#endif

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH 1/4] xvmalloc memory allocator
@ 2009-09-09 21:53   ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-09 21:53 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Pekka Enberg, Christoph Lameter, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc

xvmalloc is a memory allocator designed specifically for ramzswap project.

* Features:
 - Low metadata overhead (just 4 bytes per object)
 - O(1) Alloc/Free - except when we have to call system page allocator to
   get additional memory.
 - Very low fragmentation: In all tests, xvmalloc memory usage is within 12%
   of "Ideal".
 - Pool based allocator: Each pool can grow and shrink.
 - It maps pages only when required. So, it does not hog vmalloc area which
   is very small on 32-bit systems.

SLUB allocator could not be used due to fragmentation issues:
http://code.google.com/p/compcache/wiki/AllocatorsComparison
Data here shows kmalloc using ~43% more memory than TLSF and xvMalloc
is showed ~2% more space efficiency than TLSF (due to smaller metadata).
Creating various kmem_caches can reduce space efficiency gap but still
problem of being limited to low memory exists. Also, it depends on
allocating higher order pages to reduce fragmentation - this is not
acceptable for ramzswap as it is used under memory crunch (its a swap
device!).

SLOB allocator could not be used do to reasons mentioned here:
http://lkml.org/lkml/2009/3/18/210

* Implementation:
It uses two-level bitmap search to find free list containing block of
correct size. This idea is taken from TLSF (Two-Level Segregate Fit)
allocator and is well explained in its paper (see [Links] below).

* Limitations:
 - Poor scalability: No per-cpu data structures (work in progress).

[Links]
1. Details and Performance data:
http://code.google.com/p/compcache/wiki/xvMalloc
http://code.google.com/p/compcache/wiki/xvMallocPerformance

2. TLSF memory allocator:
home: http://rtportal.upv.es/rtmalloc/
paper: http://rtportal.upv.es/rtmalloc/files/MRBC_2008.pdf

Signed-off-by: Nitin Gupta <ngupta@vflare.org>
---

 drivers/block/ramzswap/xvmalloc.c     |  533 +++++++++++++++++++++++++++++++++
 drivers/block/ramzswap/xvmalloc.h     |   30 ++
 drivers/block/ramzswap/xvmalloc_int.h |   86 ++++++
 3 files changed, 649 insertions(+), 0 deletions(-)

diff --git a/drivers/block/ramzswap/xvmalloc.c b/drivers/block/ramzswap/xvmalloc.c
new file mode 100644
index 0000000..12ad1a3
--- /dev/null
+++ b/drivers/block/ramzswap/xvmalloc.c
@@ -0,0 +1,533 @@
+/*
+ * xvmalloc memory allocator
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+#include <linux/bitops.h>
+#include <linux/errno.h>
+#include <linux/highmem.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include "xvmalloc.h"
+#include "xvmalloc_int.h"
+
+static void stat_inc(u64 *value)
+{
+	*value = *value + 1;
+}
+
+static void stat_dec(u64 *value)
+{
+	*value = *value - 1;
+}
+
+static int test_flag(struct block_header *block, enum blockflags flag)
+{
+	return block->prev & BIT(flag);
+}
+
+static void set_flag(struct block_header *block, enum blockflags flag)
+{
+	block->prev |= BIT(flag);
+}
+
+static void clear_flag(struct block_header *block, enum blockflags flag)
+{
+	block->prev &= ~BIT(flag);
+}
+
+/*
+ * Given <page, offset> pair, provide a derefrencable pointer.
+ * This is called from xv_malloc/xv_free path, so it
+ * needs to be fast.
+ */
+static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
+{
+	unsigned char *base;
+
+	base = kmap_atomic(page, type);
+	return base + offset;
+}
+
+static void put_ptr_atomic(void *ptr, enum km_type type)
+{
+	kunmap_atomic(ptr, type);
+}
+
+static u32 get_blockprev(struct block_header *block)
+{
+	return block->prev & PREV_MASK;
+}
+
+static void set_blockprev(struct block_header *block, u16 new_offset)
+{
+	block->prev = new_offset | (block->prev & FLAGS_MASK);
+}
+
+static struct block_header *BLOCK_NEXT(struct block_header *block)
+{
+	return (struct block_header *)
+		((char *)block + block->size + XV_ALIGN);
+}
+
+/*
+ * Get index of free list containing blocks of maximum size
+ * which is less than or equal to given size.
+ */
+static u32 get_index_for_insert(u32 size)
+{
+	if (unlikely(size > XV_MAX_ALLOC_SIZE))
+		size = XV_MAX_ALLOC_SIZE;
+	size &= ~FL_DELTA_MASK;
+	return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
+}
+
+/*
+ * Get index of free list having blocks of size greater than
+ * or equal to requested size.
+ */
+static u32 get_index(u32 size)
+{
+	if (unlikely(size < XV_MIN_ALLOC_SIZE))
+		size = XV_MIN_ALLOC_SIZE;
+	size = ALIGN(size, FL_DELTA);
+	return (size - XV_MIN_ALLOC_SIZE) >> FL_DELTA_SHIFT;
+}
+
+/*
+ * Allocate a memory page. Called when a pool needs to grow.
+ */
+static struct page *xv_alloc_page(gfp_t flags)
+{
+	struct page *page;
+
+	page = alloc_page(flags);
+	if (unlikely(!page))
+		return 0;
+
+	return page;
+}
+
+/*
+ * Called when all objects in a page are freed.
+ */
+static void xv_free_page(struct page *page)
+{
+	__free_page(page);
+}
+
+/**
+ * find_block - find block of at least given size
+ * @pool: memory pool to search from
+ * @size: size of block required
+ * @page: page containing required block
+ * @offset: offset within the page where block is located.
+ *
+ * Searches two level bitmap to locate block of at least
+ * the given size. If such a block is found, it provides
+ * <page, offset> to identify this block and returns index
+ * in freelist where we found this block.
+ * Otherwise, returns 0 and <page, offset> params are not touched.
+ */
+static u32 find_block(struct xv_pool *pool, u32 size,
+			struct page **page, u32 *offset)
+{
+	ulong flbitmap, slbitmap;
+	u32 flindex, slindex, slbitstart;
+
+	/* There are no free blocks in this pool */
+	if (!pool->flbitmap)
+		return 0;
+
+	/* Get freelist index correspoding to this size */
+	slindex = get_index(size);
+	slbitmap = pool->slbitmap[slindex / BITS_PER_LONG];
+	slbitstart = slindex % BITS_PER_LONG;
+
+	/*
+	 * If freelist is not empty at this index, we found the
+	 * block - head of this list. This is approximate best-fit match.
+	 */
+	if (test_bit(slbitstart, &slbitmap)) {
+		*page = pool->freelist[slindex].page;
+		*offset = pool->freelist[slindex].offset;
+		return slindex;
+	}
+
+	/*
+	 * No best-fit found. Search a bit further in bitmap for a free block.
+	 * Second level bitmap consists of series of 32-bit chunks. Search
+	 * further in the chunk where we expected a best-fit, starting from
+	 * index location found above.
+	 */
+	slbitstart++;
+	slbitmap >>= slbitstart;
+
+	/* Skip this search if we were already at end of this bitmap chunk */
+	if ((slbitstart != BITS_PER_LONG) && slbitmap) {
+		slindex += __ffs(slbitmap) + 1;
+		*page = pool->freelist[slindex].page;
+		*offset = pool->freelist[slindex].offset;
+		return slindex;
+	}
+
+	/* Now do a full two-level bitmap search to find next nearest fit */
+	flindex = slindex / BITS_PER_LONG;
+
+	flbitmap = (pool->flbitmap) >> (flindex + 1);
+	if (!flbitmap)
+		return 0;
+
+	flindex += __ffs(flbitmap) + 1;
+	slbitmap = pool->slbitmap[flindex];
+	slindex = (flindex * BITS_PER_LONG) + __ffs(slbitmap);
+	*page = pool->freelist[slindex].page;
+	*offset = pool->freelist[slindex].offset;
+
+	return slindex;
+}
+
+/*
+ * Insert block at <page, offset> in freelist of given pool.
+ * freelist used depends on block size.
+ */
+static void insert_block(struct xv_pool *pool, struct page *page, u32 offset,
+			struct block_header *block)
+{
+	u32 flindex, slindex;
+	struct block_header *nextblock;
+
+	slindex = get_index_for_insert(block->size);
+	flindex = slindex / BITS_PER_LONG;
+
+	block->link.prev_page = 0;
+	block->link.prev_offset = 0;
+	block->link.next_page = pool->freelist[slindex].page;
+	block->link.next_offset = pool->freelist[slindex].offset;
+	pool->freelist[slindex].page = page;
+	pool->freelist[slindex].offset = offset;
+
+	if (block->link.next_page) {
+		nextblock = get_ptr_atomic(block->link.next_page,
+					block->link.next_offset, KM_USER1);
+		nextblock->link.prev_page = page;
+		nextblock->link.prev_offset = offset;
+		put_ptr_atomic(nextblock, KM_USER1);
+	}
+
+	__set_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
+	__set_bit(flindex, &pool->flbitmap);
+}
+
+/*
+ * Remove block from head of freelist. Index 'slindex' identifies the freelist.
+ */
+static void remove_block_head(struct xv_pool *pool,
+			struct block_header *block, u32 slindex)
+{
+	struct block_header *tmpblock;
+	u32 flindex = slindex / BITS_PER_LONG;
+
+	pool->freelist[slindex].page = block->link.next_page;
+	pool->freelist[slindex].offset = block->link.next_offset;
+	block->link.prev_page = 0;
+	block->link.prev_offset = 0;
+
+	if (!pool->freelist[slindex].page) {
+		__clear_bit(slindex % BITS_PER_LONG, &pool->slbitmap[flindex]);
+		if (!pool->slbitmap[flindex])
+			__clear_bit(flindex, &pool->flbitmap);
+	} else {
+		/*
+		 * DEBUG ONLY: We need not reinitialize freelist head previous
+		 * pointer to 0 - we never depend on its value. But just for
+		 * sanity, lets do it.
+		 */
+		tmpblock = get_ptr_atomic(pool->freelist[slindex].page,
+				pool->freelist[slindex].offset, KM_USER1);
+		tmpblock->link.prev_page = 0;
+		tmpblock->link.prev_offset = 0;
+		put_ptr_atomic(tmpblock, KM_USER1);
+	}
+}
+
+/*
+ * Remove block from freelist. Index 'slindex' identifies the freelist.
+ */
+static void remove_block(struct xv_pool *pool, struct page *page, u32 offset,
+			struct block_header *block, u32 slindex)
+{
+	u32 flindex;
+	struct block_header *tmpblock;
+
+	if (pool->freelist[slindex].page == page
+	   && pool->freelist[slindex].offset == offset) {
+		remove_block_head(pool, block, slindex);
+		return;
+	}
+
+	flindex = slindex / BITS_PER_LONG;
+
+	if (block->link.prev_page) {
+		tmpblock = get_ptr_atomic(block->link.prev_page,
+				block->link.prev_offset, KM_USER1);
+		tmpblock->link.next_page = block->link.next_page;
+		tmpblock->link.next_offset = block->link.next_offset;
+		put_ptr_atomic(tmpblock, KM_USER1);
+	}
+
+	if (block->link.next_page) {
+		tmpblock = get_ptr_atomic(block->link.next_page,
+				block->link.next_offset, KM_USER1);
+		tmpblock->link.prev_page = block->link.prev_page;
+		tmpblock->link.prev_offset = block->link.prev_offset;
+		put_ptr_atomic(tmpblock, KM_USER1);
+	}
+
+	return;
+}
+
+/*
+ * Allocate a page and add it freelist of given pool.
+ */
+static int grow_pool(struct xv_pool *pool, gfp_t flags)
+{
+	struct page *page;
+	struct block_header *block;
+
+	page = xv_alloc_page(flags);
+	if (unlikely(!page))
+		return -ENOMEM;
+
+	stat_inc(&pool->total_pages);
+
+	spin_lock(&pool->lock);
+	block = get_ptr_atomic(page, 0, KM_USER0);
+
+	block->size = PAGE_SIZE - XV_ALIGN;
+	set_flag(block, BLOCK_FREE);
+	clear_flag(block, PREV_FREE);
+	set_blockprev(block, 0);
+
+	insert_block(pool, page, 0, block);
+
+	put_ptr_atomic(block, KM_USER0);
+	spin_unlock(&pool->lock);
+
+	return 0;
+}
+
+/*
+ * Create a memory pool. Allocates freelist, bitmaps and other
+ * per-pool metadata.
+ */
+struct xv_pool *xv_create_pool(void)
+{
+	u32 ovhd_size;
+	struct xv_pool *pool;
+
+	ovhd_size = roundup(sizeof(*pool), PAGE_SIZE);
+	pool = kzalloc(ovhd_size, GFP_KERNEL);
+	if (!pool)
+		return NULL;
+
+	spin_lock_init(&pool->lock);
+
+	return pool;
+}
+
+void xv_destroy_pool(struct xv_pool *pool)
+{
+	kfree(pool);
+}
+
+/**
+ * xv_malloc - Allocate block of given size from pool.
+ * @pool: pool to allocate from
+ * @size: size of block to allocate
+ * @page: page no. that holds the object
+ * @offset: location of object within page
+ *
+ * On success, <page, offset> identifies block allocated
+ * and 0 is returned. On failure, <page, offset> is set to
+ * 0 and -ENOMEM is returned.
+ *
+ * Allocation requests with size > XV_MAX_ALLOC_SIZE will fail.
+ */
+int xv_malloc(struct xv_pool *pool, u32 size, struct page **page,
+		u32 *offset, gfp_t flags)
+{
+	int error;
+	u32 index, tmpsize, origsize, tmpoffset;
+	struct block_header *block, *tmpblock;
+
+	*page = NULL;
+	*offset = 0;
+	origsize = size;
+
+	if (unlikely(!size || size > XV_MAX_ALLOC_SIZE))
+		return -ENOMEM;
+
+	size = ALIGN(size, XV_ALIGN);
+
+	spin_lock(&pool->lock);
+
+	index = find_block(pool, size, page, offset);
+
+	if (!*page) {
+		spin_unlock(&pool->lock);
+		if (flags & GFP_NOWAIT)
+			return -ENOMEM;
+		error = grow_pool(pool, flags);
+		if (unlikely(error))
+			return -ENOMEM;
+
+		spin_lock(&pool->lock);
+		index = find_block(pool, size, page, offset);
+	}
+
+	if (!*page) {
+		spin_unlock(&pool->lock);
+		return -ENOMEM;
+	}
+
+	block = get_ptr_atomic(*page, *offset, KM_USER0);
+
+	remove_block_head(pool, block, index);
+
+	/* Split the block if required */
+	tmpoffset = *offset + size + XV_ALIGN;
+	tmpsize = block->size - size;
+	tmpblock = (struct block_header *)((char *)block + size + XV_ALIGN);
+	if (tmpsize) {
+		tmpblock->size = tmpsize - XV_ALIGN;
+		set_flag(tmpblock, BLOCK_FREE);
+		clear_flag(tmpblock, PREV_FREE);
+
+		set_blockprev(tmpblock, *offset);
+		if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
+			insert_block(pool, *page, tmpoffset, tmpblock);
+
+		if (tmpoffset + XV_ALIGN + tmpblock->size != PAGE_SIZE) {
+			tmpblock = BLOCK_NEXT(tmpblock);
+			set_blockprev(tmpblock, tmpoffset);
+		}
+	} else {
+		/* This block is exact fit */
+		if (tmpoffset != PAGE_SIZE)
+			clear_flag(tmpblock, PREV_FREE);
+	}
+
+	block->size = origsize;
+	clear_flag(block, BLOCK_FREE);
+
+	put_ptr_atomic(block, KM_USER0);
+	spin_unlock(&pool->lock);
+
+	*offset += XV_ALIGN;
+
+	return 0;
+}
+
+/*
+ * Free block identified with <page, offset>
+ */
+void xv_free(struct xv_pool *pool, struct page *page, u32 offset)
+{
+	void *page_start;
+	struct block_header *block, *tmpblock;
+
+	offset -= XV_ALIGN;
+
+	spin_lock(&pool->lock);
+
+	page_start = get_ptr_atomic(page, 0, KM_USER0);
+	block = (struct block_header *)((char *)page_start + offset);
+
+	/* Catch double free bugs */
+	BUG_ON(test_flag(block, BLOCK_FREE));
+
+	block->size = ALIGN(block->size, XV_ALIGN);
+
+	tmpblock = BLOCK_NEXT(block);
+	if (offset + block->size + XV_ALIGN == PAGE_SIZE)
+		tmpblock = NULL;
+
+	/* Merge next block if its free */
+	if (tmpblock && test_flag(tmpblock, BLOCK_FREE)) {
+		/*
+		 * Blocks smaller than XV_MIN_ALLOC_SIZE
+		 * are not inserted in any free list.
+		 */
+		if (tmpblock->size >= XV_MIN_ALLOC_SIZE) {
+			remove_block(pool, page,
+				    offset + block->size + XV_ALIGN, tmpblock,
+				    get_index_for_insert(tmpblock->size));
+		}
+		block->size += tmpblock->size + XV_ALIGN;
+	}
+
+	/* Merge previous block if its free */
+	if (test_flag(block, PREV_FREE)) {
+		tmpblock = (struct block_header *)((char *)(page_start) +
+						get_blockprev(block));
+		offset = offset - tmpblock->size - XV_ALIGN;
+
+		if (tmpblock->size >= XV_MIN_ALLOC_SIZE)
+			remove_block(pool, page, offset, tmpblock,
+				    get_index_for_insert(tmpblock->size));
+
+		tmpblock->size += block->size + XV_ALIGN;
+		block = tmpblock;
+	}
+
+	/* No used objects in this page. Free it. */
+	if (block->size == PAGE_SIZE - XV_ALIGN) {
+		put_ptr_atomic(page_start, KM_USER0);
+		spin_unlock(&pool->lock);
+
+		xv_free_page(page);
+		stat_dec(&pool->total_pages);
+		return;
+	}
+
+	set_flag(block, BLOCK_FREE);
+	if (block->size >= XV_MIN_ALLOC_SIZE)
+		insert_block(pool, page, offset, block);
+
+	if (offset + block->size + XV_ALIGN != PAGE_SIZE) {
+		tmpblock = BLOCK_NEXT(block);
+		set_flag(tmpblock, PREV_FREE);
+		set_blockprev(tmpblock, offset);
+	}
+
+	put_ptr_atomic(page_start, KM_USER0);
+	spin_unlock(&pool->lock);
+
+	return;
+}
+
+u32 xv_get_object_size(void *obj)
+{
+	struct block_header *blk;
+
+	blk = (struct block_header *)((char *)(obj) - XV_ALIGN);
+	return blk->size;
+}
+
+/*
+ * Returns total memory used by allocator (userdata + metadata)
+ */
+u64 xv_get_total_size_bytes(struct xv_pool *pool)
+{
+	return pool->total_pages << PAGE_SHIFT;
+}
diff --git a/drivers/block/ramzswap/xvmalloc.h b/drivers/block/ramzswap/xvmalloc.h
new file mode 100644
index 0000000..010c6fe
--- /dev/null
+++ b/drivers/block/ramzswap/xvmalloc.h
@@ -0,0 +1,30 @@
+/*
+ * xvmalloc memory allocator
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+#ifndef _XV_MALLOC_H_
+#define _XV_MALLOC_H_
+
+#include <linux/types.h>
+
+struct xv_pool;
+
+struct xv_pool *xv_create_pool(void);
+void xv_destroy_pool(struct xv_pool *pool);
+
+int xv_malloc(struct xv_pool *pool, u32 size, struct page **page,
+			u32 *offset, gfp_t flags);
+void xv_free(struct xv_pool *pool, struct page *page, u32 offset);
+
+u32 xv_get_object_size(void *obj);
+u64 xv_get_total_size_bytes(struct xv_pool *pool);
+
+#endif
diff --git a/drivers/block/ramzswap/xvmalloc_int.h b/drivers/block/ramzswap/xvmalloc_int.h
new file mode 100644
index 0000000..03c1a65
--- /dev/null
+++ b/drivers/block/ramzswap/xvmalloc_int.h
@@ -0,0 +1,86 @@
+/*
+ * xvmalloc memory allocator
+ *
+ * Copyright (C) 2008, 2009  Nitin Gupta
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ */
+
+#ifndef _XV_MALLOC_INT_H_
+#define _XV_MALLOC_INT_H_
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+/* User configurable params */
+
+/* Must be power of two */
+#define XV_ALIGN_SHIFT	2
+#define XV_ALIGN	(1 << XV_ALIGN_SHIFT)
+#define XV_ALIGN_MASK	(XV_ALIGN - 1)
+
+/* This must be greater than sizeof(link_free) */
+#define XV_MIN_ALLOC_SIZE	32
+#define XV_MAX_ALLOC_SIZE	(PAGE_SIZE - XV_ALIGN)
+
+/* Free lists are separated by FL_DELTA bytes */
+#define FL_DELTA_SHIFT	3
+#define FL_DELTA	(1 << FL_DELTA_SHIFT)
+#define FL_DELTA_MASK	(FL_DELTA - 1)
+#define NUM_FREE_LISTS	((XV_MAX_ALLOC_SIZE - XV_MIN_ALLOC_SIZE) \
+				/ FL_DELTA + 1)
+
+#define MAX_FLI		DIV_ROUND_UP(NUM_FREE_LISTS, BITS_PER_LONG)
+
+/* End of user params */
+
+enum blockflags {
+	BLOCK_FREE,
+	PREV_FREE,
+	__NR_BLOCKFLAGS,
+};
+
+#define FLAGS_MASK	XV_ALIGN_MASK
+#define PREV_MASK	(~FLAGS_MASK)
+
+struct freelist_entry {
+	struct page *page;
+	u16 offset;
+	u16 pad;
+};
+
+struct link_free {
+	struct page *prev_page;
+	struct page *next_page;
+	u16 prev_offset;
+	u16 next_offset;
+};
+
+struct block_header {
+	union {
+		/* This common header must be ALIGN bytes */
+		u8 common[XV_ALIGN];
+		struct {
+			u16 size;
+			u16 prev;
+		};
+	};
+	struct link_free link;
+};
+
+struct xv_pool {
+	ulong flbitmap;
+	ulong slbitmap[MAX_FLI];
+	spinlock_t lock;
+
+	struct freelist_entry freelist[NUM_FREE_LISTS];
+
+	/* stats */
+	u64 total_pages;
+};
+
+#endif

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [PATCH 0/4] compcache: in-memory compressed swapping v2
       [not found] <200909100215.36350.ngupta@vflare.org>
@ 2009-09-09 22:02   ` Nitin Gupta
  2009-09-09 21:21   ` Nitin Gupta
                     ` (3 subsequent siblings)
  4 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-09 22:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Pekka Enberg, Ed Tomlinson, linux-kernel, linux-mm,
	linux-mm-cc

Hi,

Project home: http://compcache.googlecode.com/

* Changelog: v2 vs initial revision
 - Use 'struct page' instead of 32-bit PFNs in ramzswap driver and xvmalloc.
   This is to make these 64-bit safe.
 - xvmalloc is no longer a separate module and does not export any symbols.
   Its compiled directly with ramzswap block driver. This is to avoid any
   last bit of confusion with any other allocator.
 - set_swap_free_notify() now accepts block_device as parameter instead of
   swp_entry_t (interface cleanup).
 - Fix: Make sure ramzswap disksize matches usable pages in backing swap file.
   This caused initialization error in case backing swap file had intra-page
   fragmentation.

It creates RAM based block devices which can be used (only) as swap disks.
Pages swapped to these disks are compressed and stored in memory itself. This
is a big win over swapping to slow hard-disk which are typically used as swap
disk. For flash, these suffer from wear-leveling issues when used as swap disk
- so again its helpful. For swapless systems, it allows more apps to run for a
given amount of memory.

It can create multiple ramzswap devices (/dev/ramzswapX, X = 0, 1, 2, ...).
Each of these devices can have separate backing swap (file or disk partition)
which is used when incompressible page is found or memory limit for device is
reached.

A separate userspace utility called rzscontrol is used to manage individual
ramzswap devices.

* Testing notes

Tested on x86, x64, ARM
ARM:
 - Cortex-A8 (Beagleboard)
 - ARM11 (Android G1)
 - OMAP2420 (Nokia N810)

* Performance

All performance numbers/plots can be found at:
http://code.google.com/p/compcache/wiki/Performance

Below is a summary of this data:

General:
 - Swap R/W times are reduced from milliseconds (in case of hard disks)
down to microseconds.

Positive cases:
 - Shows 33% improvement in 'scan' benchmark which allocates given amount
of memory and linearly reads/writes to this region. This benchmark also
exposes bottlenecks in ramzswap code (global mutex) due to which this gain
is so small.
 - On Linux thin clients, it gives the effect of nearly doubling the amount of
memory.

Negative cases:
Any workload that has active working set w.r.t. filesystem cache that is
nearly equal to amount of RAM while has minimal anonymous memory requirement,
is expected to suffer maximum loss in performance with ramzswap enabled.

Iozone filesystem benchmark can simulate exactly this kind of workload.
As expected, this test shows performance loss of ~25% with ramzswap.

(Sorry for long patch[2/4] but its now very hard to split it up).

 Documentation/blockdev/00-INDEX       |    2 +
 Documentation/blockdev/ramzswap.txt   |   50 ++
 drivers/block/Kconfig                 |   22 +
 drivers/block/Makefile                |    1 +
 drivers/block/ramzswap/Makefile       |    3 +
 drivers/block/ramzswap/ramzswap_drv.c | 1529 +++++++++++++++++++++++++++++++++
 drivers/block/ramzswap/ramzswap_drv.h |  183 ++++
 drivers/block/ramzswap/xvmalloc.c     |  533 ++++++++++++
 drivers/block/ramzswap/xvmalloc.h     |   30 +
 drivers/block/ramzswap/xvmalloc_int.h |   86 ++
 include/linux/ramzswap_ioctl.h        |   51 ++
 include/linux/swap.h                  |    5 +
 mm/swapfile.c                         |   34 +
 13 files changed, 2529 insertions(+), 0 deletions(-)

^ permalink raw reply	[flat|nested] 40+ messages in thread

* [PATCH 0/4] compcache: in-memory compressed swapping v2
@ 2009-09-09 22:02   ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-09 22:02 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Hugh Dickins, Pekka Enberg, Ed Tomlinson, linux-kernel, linux-mm,
	linux-mm-cc

Hi,

Project home: http://compcache.googlecode.com/

* Changelog: v2 vs initial revision
 - Use 'struct page' instead of 32-bit PFNs in ramzswap driver and xvmalloc.
   This is to make these 64-bit safe.
 - xvmalloc is no longer a separate module and does not export any symbols.
   Its compiled directly with ramzswap block driver. This is to avoid any
   last bit of confusion with any other allocator.
 - set_swap_free_notify() now accepts block_device as parameter instead of
   swp_entry_t (interface cleanup).
 - Fix: Make sure ramzswap disksize matches usable pages in backing swap file.
   This caused initialization error in case backing swap file had intra-page
   fragmentation.

It creates RAM based block devices which can be used (only) as swap disks.
Pages swapped to these disks are compressed and stored in memory itself. This
is a big win over swapping to slow hard-disk which are typically used as swap
disk. For flash, these suffer from wear-leveling issues when used as swap disk
- so again its helpful. For swapless systems, it allows more apps to run for a
given amount of memory.

It can create multiple ramzswap devices (/dev/ramzswapX, X = 0, 1, 2, ...).
Each of these devices can have separate backing swap (file or disk partition)
which is used when incompressible page is found or memory limit for device is
reached.

A separate userspace utility called rzscontrol is used to manage individual
ramzswap devices.

* Testing notes

Tested on x86, x64, ARM
ARM:
 - Cortex-A8 (Beagleboard)
 - ARM11 (Android G1)
 - OMAP2420 (Nokia N810)

* Performance

All performance numbers/plots can be found at:
http://code.google.com/p/compcache/wiki/Performance

Below is a summary of this data:

General:
 - Swap R/W times are reduced from milliseconds (in case of hard disks)
down to microseconds.

Positive cases:
 - Shows 33% improvement in 'scan' benchmark which allocates given amount
of memory and linearly reads/writes to this region. This benchmark also
exposes bottlenecks in ramzswap code (global mutex) due to which this gain
is so small.
 - On Linux thin clients, it gives the effect of nearly doubling the amount of
memory.

Negative cases:
Any workload that has active working set w.r.t. filesystem cache that is
nearly equal to amount of RAM while has minimal anonymous memory requirement,
is expected to suffer maximum loss in performance with ramzswap enabled.

Iozone filesystem benchmark can simulate exactly this kind of workload.
As expected, this test shows performance loss of ~25% with ramzswap.

(Sorry for long patch[2/4] but its now very hard to split it up).

 Documentation/blockdev/00-INDEX       |    2 +
 Documentation/blockdev/ramzswap.txt   |   50 ++
 drivers/block/Kconfig                 |   22 +
 drivers/block/Makefile                |    1 +
 drivers/block/ramzswap/Makefile       |    3 +
 drivers/block/ramzswap/ramzswap_drv.c | 1529 +++++++++++++++++++++++++++++++++
 drivers/block/ramzswap/ramzswap_drv.h |  183 ++++
 drivers/block/ramzswap/xvmalloc.c     |  533 ++++++++++++
 drivers/block/ramzswap/xvmalloc.h     |   30 +
 drivers/block/ramzswap/xvmalloc_int.h |   86 ++
 include/linux/ramzswap_ioctl.h        |   51 ++
 include/linux/swap.h                  |    5 +
 mm/swapfile.c                         |   34 +
 13 files changed, 2529 insertions(+), 0 deletions(-)

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 0/4] compcache: in-memory compressed swapping v2
  2009-09-09 22:02   ` Nitin Gupta
@ 2009-09-12  2:24     ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-12  2:24 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ed Tomlinson, Hugh Dickins, Pekka Enberg, linux-kernel, linux-mm,
	linux-mm-cc

Hi,


On 09/10/2009 03:32 AM, Nitin Gupta wrote:
> Project home: http://compcache.googlecode.com/
>
> * Changelog: v2 vs initial revision
>   - Use 'struct page' instead of 32-bit PFNs in ramzswap driver and xvmalloc.
>     This is to make these 64-bit safe.
>   - xvmalloc is no longer a separate module and does not export any symbols.
>     Its compiled directly with ramzswap block driver. This is to avoid any
>     last bit of confusion with any other allocator.
>   - set_swap_free_notify() now accepts block_device as parameter instead of
>     swp_entry_t (interface cleanup).
>   - Fix: Make sure ramzswap disksize matches usable pages in backing swap file.
>     This caused initialization error in case backing swap file had intra-page
>     fragmentation.
>
>    


Can anyone please review these patches for possible inclusion in 2.6.32?
Sorry for the weird email threading.

Thanks,
Nitin


> It creates RAM based block devices which can be used (only) as swap disks.
> Pages swapped to these disks are compressed and stored in memory itself. This
> is a big win over swapping to slow hard-disk which are typically used as swap
> disk. For flash, these suffer from wear-leveling issues when used as swap disk
> - so again its helpful. For swapless systems, it allows more apps to run for a
> given amount of memory.
>
> It can create multiple ramzswap devices (/dev/ramzswapX, X = 0, 1, 2, ...).
> Each of these devices can have separate backing swap (file or disk partition)
> which is used when incompressible page is found or memory limit for device is
> reached.
>
> A separate userspace utility called rzscontrol is used to manage individual
> ramzswap devices.
>
> * Testing notes
>
> Tested on x86, x64, ARM
> ARM:
>   - Cortex-A8 (Beagleboard)
>   - ARM11 (Android G1)
>   - OMAP2420 (Nokia N810)
>
> * Performance
>
> All performance numbers/plots can be found at:
> http://code.google.com/p/compcache/wiki/Performance
>
> Below is a summary of this data:
>
> General:
>   - Swap R/W times are reduced from milliseconds (in case of hard disks)
> down to microseconds.
>
> Positive cases:
>   - Shows 33% improvement in 'scan' benchmark which allocates given amount
> of memory and linearly reads/writes to this region. This benchmark also
> exposes bottlenecks in ramzswap code (global mutex) due to which this gain
> is so small.
>   - On Linux thin clients, it gives the effect of nearly doubling the amount of
> memory.
>
> Negative cases:
> Any workload that has active working set w.r.t. filesystem cache that is
> nearly equal to amount of RAM while has minimal anonymous memory requirement,
> is expected to suffer maximum loss in performance with ramzswap enabled.
>
> Iozone filesystem benchmark can simulate exactly this kind of workload.
> As expected, this test shows performance loss of ~25% with ramzswap.
>
> (Sorry for long patch[2/4] but its now very hard to split it up).
>
>   Documentation/blockdev/00-INDEX       |    2 +
>   Documentation/blockdev/ramzswap.txt   |   50 ++
>   drivers/block/Kconfig                 |   22 +
>   drivers/block/Makefile                |    1 +
>   drivers/block/ramzswap/Makefile       |    3 +
>   drivers/block/ramzswap/ramzswap_drv.c | 1529 +++++++++++++++++++++++++++++++++
>   drivers/block/ramzswap/ramzswap_drv.h |  183 ++++
>   drivers/block/ramzswap/xvmalloc.c     |  533 ++++++++++++
>   drivers/block/ramzswap/xvmalloc.h     |   30 +
>   drivers/block/ramzswap/xvmalloc_int.h |   86 ++
>   include/linux/ramzswap_ioctl.h        |   51 ++
>   include/linux/swap.h                  |    5 +
>   mm/swapfile.c                         |   34 +
>   13 files changed, 2529 insertions(+), 0 deletions(-)
> _______________________________________________
>    


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 0/4] compcache: in-memory compressed swapping v2
@ 2009-09-12  2:24     ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-12  2:24 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Ed Tomlinson, Hugh Dickins, Pekka Enberg, linux-kernel, linux-mm,
	linux-mm-cc

Hi,


On 09/10/2009 03:32 AM, Nitin Gupta wrote:
> Project home: http://compcache.googlecode.com/
>
> * Changelog: v2 vs initial revision
>   - Use 'struct page' instead of 32-bit PFNs in ramzswap driver and xvmalloc.
>     This is to make these 64-bit safe.
>   - xvmalloc is no longer a separate module and does not export any symbols.
>     Its compiled directly with ramzswap block driver. This is to avoid any
>     last bit of confusion with any other allocator.
>   - set_swap_free_notify() now accepts block_device as parameter instead of
>     swp_entry_t (interface cleanup).
>   - Fix: Make sure ramzswap disksize matches usable pages in backing swap file.
>     This caused initialization error in case backing swap file had intra-page
>     fragmentation.
>
>    


Can anyone please review these patches for possible inclusion in 2.6.32?
Sorry for the weird email threading.

Thanks,
Nitin


> It creates RAM based block devices which can be used (only) as swap disks.
> Pages swapped to these disks are compressed and stored in memory itself. This
> is a big win over swapping to slow hard-disk which are typically used as swap
> disk. For flash, these suffer from wear-leveling issues when used as swap disk
> - so again its helpful. For swapless systems, it allows more apps to run for a
> given amount of memory.
>
> It can create multiple ramzswap devices (/dev/ramzswapX, X = 0, 1, 2, ...).
> Each of these devices can have separate backing swap (file or disk partition)
> which is used when incompressible page is found or memory limit for device is
> reached.
>
> A separate userspace utility called rzscontrol is used to manage individual
> ramzswap devices.
>
> * Testing notes
>
> Tested on x86, x64, ARM
> ARM:
>   - Cortex-A8 (Beagleboard)
>   - ARM11 (Android G1)
>   - OMAP2420 (Nokia N810)
>
> * Performance
>
> All performance numbers/plots can be found at:
> http://code.google.com/p/compcache/wiki/Performance
>
> Below is a summary of this data:
>
> General:
>   - Swap R/W times are reduced from milliseconds (in case of hard disks)
> down to microseconds.
>
> Positive cases:
>   - Shows 33% improvement in 'scan' benchmark which allocates given amount
> of memory and linearly reads/writes to this region. This benchmark also
> exposes bottlenecks in ramzswap code (global mutex) due to which this gain
> is so small.
>   - On Linux thin clients, it gives the effect of nearly doubling the amount of
> memory.
>
> Negative cases:
> Any workload that has active working set w.r.t. filesystem cache that is
> nearly equal to amount of RAM while has minimal anonymous memory requirement,
> is expected to suffer maximum loss in performance with ramzswap enabled.
>
> Iozone filesystem benchmark can simulate exactly this kind of workload.
> As expected, this test shows performance loss of ~25% with ramzswap.
>
> (Sorry for long patch[2/4] but its now very hard to split it up).
>
>   Documentation/blockdev/00-INDEX       |    2 +
>   Documentation/blockdev/ramzswap.txt   |   50 ++
>   drivers/block/Kconfig                 |   22 +
>   drivers/block/Makefile                |    1 +
>   drivers/block/ramzswap/Makefile       |    3 +
>   drivers/block/ramzswap/ramzswap_drv.c | 1529 +++++++++++++++++++++++++++++++++
>   drivers/block/ramzswap/ramzswap_drv.h |  183 ++++
>   drivers/block/ramzswap/xvmalloc.c     |  533 ++++++++++++
>   drivers/block/ramzswap/xvmalloc.h     |   30 +
>   drivers/block/ramzswap/xvmalloc_int.h |   86 ++
>   include/linux/ramzswap_ioctl.h        |   51 ++
>   include/linux/swap.h                  |    5 +
>   mm/swapfile.c                         |   34 +
>   13 files changed, 2529 insertions(+), 0 deletions(-)
> _______________________________________________
>    

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 0/4] compcache: in-memory compressed swapping v2
  2009-09-12  2:24     ` Nitin Gupta
@ 2009-09-13 19:07       ` Hugh Dickins
  -1 siblings, 0 replies; 40+ messages in thread
From: Hugh Dickins @ 2009-09-13 19:07 UTC (permalink / raw)
  To: Nitin Gupta
  Cc: Andrew Morton, Ed Tomlinson, Pekka Enberg, linux-kernel,
	linux-mm, linux-mm-cc

On Sat, 12 Sep 2009, Nitin Gupta wrote:
> On 09/10/2009 03:32 AM, Nitin Gupta wrote:
> > Project home: http://compcache.googlecode.com/
> >
> > * Changelog: v2 vs initial revision
> >   - Use 'struct page' instead of 32-bit PFNs in ramzswap driver and
> >   xvmalloc.
> >     This is to make these 64-bit safe.
> >   - xvmalloc is no longer a separate module and does not export any symbols.
> >     Its compiled directly with ramzswap block driver. This is to avoid any
> >     last bit of confusion with any other allocator.
> >   - set_swap_free_notify() now accepts block_device as parameter instead of
> >     swp_entry_t (interface cleanup).
> >   - Fix: Make sure ramzswap disksize matches usable pages in backing swap
> >   file.
> >     This caused initialization error in case backing swap file had
> >     intra-page
> >     fragmentation.
> 
> Can anyone please review these patches for possible inclusion in 2.6.32?

Sorry, I certainly wouldn't be able to review them for 2.6.32 myself.

Since we're already in the merge window, and this work has not yet
had exposure in mmotm (preferably) or linux-next, I really doubt
anyone should be pushing it for 2.6.32.

I'd be quite glad to see it and experiment with it in mmotm,
so it could go into 2.6.33 if all okay.  And I now fully accept
that the discard/trim situation is so hazy that you are quite
right to be asking for your own well-defined notifier instead.

But I'm not going to pretend to have reviewed it.

Hugh

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 0/4] compcache: in-memory compressed swapping v2
@ 2009-09-13 19:07       ` Hugh Dickins
  0 siblings, 0 replies; 40+ messages in thread
From: Hugh Dickins @ 2009-09-13 19:07 UTC (permalink / raw)
  To: Nitin Gupta
  Cc: Andrew Morton, Ed Tomlinson, Pekka Enberg, linux-kernel,
	linux-mm, linux-mm-cc

On Sat, 12 Sep 2009, Nitin Gupta wrote:
> On 09/10/2009 03:32 AM, Nitin Gupta wrote:
> > Project home: http://compcache.googlecode.com/
> >
> > * Changelog: v2 vs initial revision
> >   - Use 'struct page' instead of 32-bit PFNs in ramzswap driver and
> >   xvmalloc.
> >     This is to make these 64-bit safe.
> >   - xvmalloc is no longer a separate module and does not export any symbols.
> >     Its compiled directly with ramzswap block driver. This is to avoid any
> >     last bit of confusion with any other allocator.
> >   - set_swap_free_notify() now accepts block_device as parameter instead of
> >     swp_entry_t (interface cleanup).
> >   - Fix: Make sure ramzswap disksize matches usable pages in backing swap
> >   file.
> >     This caused initialization error in case backing swap file had
> >     intra-page
> >     fragmentation.
> 
> Can anyone please review these patches for possible inclusion in 2.6.32?

Sorry, I certainly wouldn't be able to review them for 2.6.32 myself.

Since we're already in the merge window, and this work has not yet
had exposure in mmotm (preferably) or linux-next, I really doubt
anyone should be pushing it for 2.6.32.

I'd be quite glad to see it and experiment with it in mmotm,
so it could go into 2.6.33 if all okay.  And I now fully accept
that the discard/trim situation is so hazy that you are quite
right to be asking for your own well-defined notifier instead.

But I'm not going to pretend to have reviewed it.

Hugh

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 0/4] compcache: in-memory compressed swapping v2
  2009-09-13 19:07       ` Hugh Dickins
@ 2009-09-14  4:04         ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-14  4:04 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Andrew Morton, Ed Tomlinson, Pekka Enberg, linux-kernel,
	linux-mm, linux-mm-cc

On Mon, Sep 14, 2009 at 12:37 AM, Hugh Dickins
<hugh.dickins@tiscali.co.uk> wrote:
> On Sat, 12 Sep 2009, Nitin Gupta wrote:
>> On 09/10/2009 03:32 AM, Nitin Gupta wrote:
>> > Project home: http://compcache.googlecode.com/
>> >
>> > * Changelog: v2 vs initial revision
>> >   - Use 'struct page' instead of 32-bit PFNs in ramzswap driver and
>> >   xvmalloc.
>> >     This is to make these 64-bit safe.
>> >   - xvmalloc is no longer a separate module and does not export any symbols.
>> >     Its compiled directly with ramzswap block driver. This is to avoid any
>> >     last bit of confusion with any other allocator.
>> >   - set_swap_free_notify() now accepts block_device as parameter instead of
>> >     swp_entry_t (interface cleanup).
>> >   - Fix: Make sure ramzswap disksize matches usable pages in backing swap
>> >   file.
>> >     This caused initialization error in case backing swap file had
>> >     intra-page
>> >     fragmentation.
>>
>> Can anyone please review these patches for possible inclusion in 2.6.32?
>
> Sorry, I certainly wouldn't be able to review them for 2.6.32 myself.
>
> Since we're already in the merge window, and this work has not yet
> had exposure in mmotm (preferably) or linux-next, I really doubt
> anyone should be pushing it for 2.6.32.
>
> I'd be quite glad to see it and experiment with it in mmotm,
> so it could go into 2.6.33 if all okay.  And I now fully accept
> that the discard/trim situation is so hazy that you are quite
> right to be asking for your own well-defined notifier instead.
>
> But I'm not going to pretend to have reviewed it.
>

Thanks for the pointer Hugh -- I will try to post patches against mmotm later.

Nitin

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 0/4] compcache: in-memory compressed swapping v2
@ 2009-09-14  4:04         ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-14  4:04 UTC (permalink / raw)
  To: Hugh Dickins
  Cc: Andrew Morton, Ed Tomlinson, Pekka Enberg, linux-kernel,
	linux-mm, linux-mm-cc

On Mon, Sep 14, 2009 at 12:37 AM, Hugh Dickins
<hugh.dickins@tiscali.co.uk> wrote:
> On Sat, 12 Sep 2009, Nitin Gupta wrote:
>> On 09/10/2009 03:32 AM, Nitin Gupta wrote:
>> > Project home: http://compcache.googlecode.com/
>> >
>> > * Changelog: v2 vs initial revision
>> >   - Use 'struct page' instead of 32-bit PFNs in ramzswap driver and
>> >   xvmalloc.
>> >     This is to make these 64-bit safe.
>> >   - xvmalloc is no longer a separate module and does not export any symbols.
>> >     Its compiled directly with ramzswap block driver. This is to avoid any
>> >     last bit of confusion with any other allocator.
>> >   - set_swap_free_notify() now accepts block_device as parameter instead of
>> >     swp_entry_t (interface cleanup).
>> >   - Fix: Make sure ramzswap disksize matches usable pages in backing swap
>> >   file.
>> >     This caused initialization error in case backing swap file had
>> >     intra-page
>> >     fragmentation.
>>
>> Can anyone please review these patches for possible inclusion in 2.6.32?
>
> Sorry, I certainly wouldn't be able to review them for 2.6.32 myself.
>
> Since we're already in the merge window, and this work has not yet
> had exposure in mmotm (preferably) or linux-next, I really doubt
> anyone should be pushing it for 2.6.32.
>
> I'd be quite glad to see it and experiment with it in mmotm,
> so it could go into 2.6.33 if all okay.  And I now fully accept
> that the discard/trim situation is so hazy that you are quite
> right to be asking for your own well-defined notifier instead.
>
> But I'm not going to pretend to have reviewed it.
>

Thanks for the pointer Hugh -- I will try to post patches against mmotm later.

Nitin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-09 21:19   ` Nitin Gupta
@ 2009-09-14 20:10     ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-09-14 20:10 UTC (permalink / raw)
  To: ngupta
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc

Hi Nitin,

I am not a block driver expert but here are some comments on the code
that probably need to be addressed before merging.

                        Pekka

On Thu, Sep 10, 2009 at 12:19 AM, Nitin Gupta <ngupta@vflare.org> wrote:
> @@ -0,0 +1,1529 @@
> +/*
> + * Compressed RAM based swap device
> + *
> + * Copyright (C) 2008, 2009  Nitin Gupta
> + *
> + * This code is released using a dual license strategy: BSD/GPL
> + * You can choose the licence that better fits your requirements.
> + *
> + * Released under the terms of 3-clause BSD License
> + * Released under the terms of GNU General Public License Version 2.0
> + *
> + * Project home: http://compcache.googlecode.com
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/bitops.h>
> +#include <linux/blkdev.h>
> +#include <linux/buffer_head.h>
> +#include <linux/device.h>
> +#include <linux/genhd.h>
> +#include <linux/highmem.h>
> +#include <linux/lzo.h>
> +#include <linux/marker.h>
> +#include <linux/mutex.h>
> +#include <linux/string.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
> +#include <linux/vmalloc.h>
> +#include <linux/version.h>
> +
> +#include "ramzswap_drv.h"
> +
> +/* Globals */
> +static int RAMZSWAP_MAJOR;
> +static struct ramzswap *DEVICES;
> +
> +/*
> + * Pages that compress to larger than this size are
> + * forwarded to backing swap, if present or stored
> + * uncompressed in memory otherwise.
> + */
> +static unsigned int MAX_CPAGE_SIZE;
> +
> +/* Module params (documentation at end) */
> +static unsigned long NUM_DEVICES;

These variable names should be in lower case.

> +
> +/* Function declarations */
> +static int __init ramzswap_init(void);
> +static int ramzswap_ioctl(struct block_device *, fmode_t,
> +                       unsigned, unsigned long);
> +static int setup_swap_header(struct ramzswap *, union swap_header *);
> +static void ramzswap_set_memlimit(struct ramzswap *, size_t);
> +static void ramzswap_set_disksize(struct ramzswap *, size_t);
> +static void reset_device(struct ramzswap *rzs);

It's preferable not to use forward declarations in new kernel code.

> +static int test_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
> +{
> +       return rzs->table[index].flags & BIT(flag);
> +}
> +
> +static void set_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
> +{
> +       rzs->table[index].flags |= BIT(flag);
> +}
> +
> +static void clear_flag(struct ramzswap *rzs, u32 index,
> +                                       enum rzs_pageflags flag)
> +{
> +       rzs->table[index].flags &= ~BIT(flag);
> +}

These function names could use a ramzswap specific prefix.

> +
> +static int page_zero_filled(void *ptr)
> +{
> +       u32 pos;
> +       u64 *page;
> +
> +       page = (u64 *)ptr;
> +
> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
> +               if (page[pos])
> +                       return 0;
> +       }
> +
> +       return 1;
> +}

This looks like something that could be in lib/string.c.

/me looks

There's strspn so maybe you could introduce a memspn equivalent.

> +
> +/*
> + * Given <pagenum, offset> pair, provide a dereferencable pointer.
> + */
> +static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
> +{
> +       unsigned char *base;
> +
> +       base = kmap_atomic(page, type);
> +       return base + offset;
> +}
> +
> +static void put_ptr_atomic(void *ptr, enum km_type type)
> +{
> +       kunmap_atomic(ptr, type);
> +}

These two functions also appear in xmalloc. It's probably best to just
kill the wrappers and use kmap/kunmap directly.

> +
> +static void ramzswap_flush_dcache_page(struct page *page)
> +{
> +#ifdef CONFIG_ARM
> +       int flag = 0;
> +       /*
> +        * Ugly hack to get flush_dcache_page() work on ARM.
> +        * page_mapping(page) == NULL after clearing this swap cache flag.
> +        * Without clearing this flag, flush_dcache_page() will simply set
> +        * "PG_dcache_dirty" bit and return.
> +        */
> +       if (PageSwapCache(page)) {
> +               flag = 1;
> +               ClearPageSwapCache(page);
> +       }
> +#endif
> +       flush_dcache_page(page);
> +#ifdef CONFIG_ARM
> +       if (flag)
> +               SetPageSwapCache(page);
> +#endif
> +}

The above CONFIG_ARM magic really has no place in drivers/block.

> +static int add_backing_swap_extent(struct ramzswap *rzs,
> +                               pgoff_t phy_pagenum,
> +                               pgoff_t num_pages)
> +{
> +       unsigned int idx;
> +       struct list_head *head;
> +       struct page *curr_page, *new_page;
> +       unsigned int extents_per_page = PAGE_SIZE /
> +                               sizeof(struct ramzswap_backing_extent);
> +
> +       idx = rzs->num_extents % extents_per_page;
> +       if (!idx) {
> +               new_page = alloc_page(__GFP_ZERO);
> +               if (!new_page)
> +                       return -ENOMEM;
> +
> +               if (rzs->num_extents) {
> +                       curr_page = virt_to_page(rzs->curr_extent);
> +                       head = &curr_page->lru;
> +               } else {
> +                       head = &rzs->backing_swap_extent_list;
> +               }
> +
> +               list_add(&new_page->lru, head);
> +               rzs->curr_extent = page_address(new_page);
> +       }
> +
> +       rzs->curr_extent->phy_pagenum = phy_pagenum;
> +       rzs->curr_extent->num_pages = num_pages;
> +
> +       pr_debug(C "extent: [%lu, %lu] %lu\n", phy_pagenum,
> +               phy_pagenum + num_pages - 1, num_pages);

What's this "C" thing everywhere? A subsystem prefix? Shouldn't you
override pr_fmt() instead?

> +static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
> +{
> +       int ret, fwd_write_request = 0;
> +       u32 offset;
> +       size_t clen;
> +       pgoff_t index;
> +       struct zobj_header *zheader;
> +       struct page *page, *page_store;
> +       unsigned char *user_mem, *cmem, *src;
> +
> +       stat_inc(rzs->stats.num_writes);
> +
> +       page = bio->bi_io_vec[0].bv_page;
> +       index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
> +
> +       src = rzs->compress_buffer;
> +
> +       /*
> +        * System swaps to same sector again when the stored page
> +        * is no longer referenced by any process. So, its now safe
> +        * to free the memory that was allocated for this page.
> +        */
> +       if (rzs->table[index].page)
> +               ramzswap_free_page(rzs, index);
> +
> +       /*
> +        * No memory ia allocated for zero filled pages.
> +        * Simply clear zero page flag.
> +        */
> +       if (test_flag(rzs, index, RZS_ZERO)) {
> +               stat_dec(rzs->stats.pages_zero);
> +               clear_flag(rzs, index, RZS_ZERO);
> +       }
> +
> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
> +       mutex_lock(&rzs->lock);
> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");

Hmm? What's this? I don't think you should be doing ad hoc
trace_mark() in driver code.

> +
> +       user_mem = get_ptr_atomic(page, 0, KM_USER0);
> +       if (page_zero_filled(user_mem)) {
> +               put_ptr_atomic(user_mem, KM_USER0);
> +               mutex_unlock(&rzs->lock);
> +               stat_inc(rzs->stats.pages_zero);
> +               set_flag(rzs, index, RZS_ZERO);
> +
> +               set_bit(BIO_UPTODATE, &bio->bi_flags);
> +               bio_endio(bio, 0);
> +               return 0;
> +       }
> +
> +       if (rzs->backing_swap &&
> +               (rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
> +               put_ptr_atomic(user_mem, KM_USER0);
> +               mutex_unlock(&rzs->lock);
> +               fwd_write_request = 1;
> +               goto out;
> +       }
> +
> +       ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
> +                               rzs->compress_workmem);
> +
> +       put_ptr_atomic(user_mem, KM_USER0);
> +
> +       if (unlikely(ret != LZO_E_OK)) {
> +               mutex_unlock(&rzs->lock);
> +               pr_err(C "Compression failed! err=%d\n", ret);
> +               stat_inc(rzs->stats.failed_writes);
> +               goto out;
> +       }
> +
> +       /*
> +        * Page is incompressible. Forward it to backing swap
> +        * if present. Otherwise, store it as-is (uncompressed)
> +        * since we do not want to return too many swap write
> +        * errors which has side effect of hanging the system.
> +        */
> +       if (unlikely(clen > MAX_CPAGE_SIZE)) {
> +               if (rzs->backing_swap) {
> +                       mutex_unlock(&rzs->lock);
> +                       fwd_write_request = 1;
> +                       goto out;
> +               }
> +
> +               clen = PAGE_SIZE;
> +               page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
> +               if (unlikely(!page_store)) {
> +                       mutex_unlock(&rzs->lock);
> +                       pr_info(C "Error allocating memory for incompressible "
> +                               "page: %lu\n", index);
> +                       stat_inc(rzs->stats.failed_writes);
> +                       goto out;
> +               }
> +
> +               offset = 0;
> +               set_flag(rzs, index, RZS_UNCOMPRESSED);
> +               stat_inc(rzs->stats.pages_expand);
> +               rzs->table[index].page = page_store;
> +               src = get_ptr_atomic(page, 0, KM_USER0);
> +               goto memstore;
> +       }
> +
> +       if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
> +                       &rzs->table[index].page, &offset,
> +                       GFP_NOIO | __GFP_HIGHMEM)) {
> +               mutex_unlock(&rzs->lock);
> +               pr_info(C "Error allocating memory for compressed "
> +                       "page: %lu, size=%zu\n", index, clen);
> +               stat_inc(rzs->stats.failed_writes);
> +               if (rzs->backing_swap)
> +                       fwd_write_request = 1;
> +               goto out;
> +       }
> +
> +memstore:
> +       rzs->table[index].offset = offset;
> +
> +       cmem = get_ptr_atomic(rzs->table[index].page,
> +                       rzs->table[index].offset, KM_USER1);
> +
> +#if 0
> +       /* Back-reference needed for memory defragmentation */
> +       if (!test_flag(rzs, index, RZS_UNCOMPRESSED)) {
> +               zheader = (struct zobj_header *)cmem;
> +               zheader->table_idx = index;
> +               cmem += sizeof(*zheader);
> +       }
> +#endif

Drop the above dead code?

> +
> +       memcpy(cmem, src, clen);
> +
> +       put_ptr_atomic(cmem, KM_USER1);
> +       if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
> +               put_ptr_atomic(src, KM_USER0);
> +
> +       /* Update stats */
> +       rzs->stats.compr_size += clen;
> +       stat_inc(rzs->stats.pages_stored);
> +       stat_inc_if_less(rzs->stats.good_compress, clen, PAGE_SIZE / 2 + 1);
> +
> +       mutex_unlock(&rzs->lock);
> +
> +       set_bit(BIO_UPTODATE, &bio->bi_flags);
> +       bio_endio(bio, 0);
> +       return 0;
> +
> +out:
> +       if (fwd_write_request) {
> +               stat_inc(rzs->stats.bdev_num_writes);
> +               bio->bi_bdev = rzs->backing_swap;
> +#if 0
> +               /*
> +                * TODO: We currently have linear mapping of ramzswap and
> +                * backing swap sectors. This is not desired since we want
> +                * to optimize writes to backing swap to minimize disk seeks
> +                * or have effective wear leveling (for SSDs). Also, a
> +                * non-linear mapping is required to implement compressed
> +                * on-disk swapping.
> +                */
> +                bio->bi_sector = get_backing_swap_page()
> +                                       << SECTORS_PER_PAGE_SHIFT;
> +#endif

This too?

> +static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
> +{
> +       int ret;
> +       size_t num_pages, totalram_bytes;
> +       struct sysinfo i;
> +       struct page *page;
> +       union swap_header *swap_header;
> +
> +       if (rzs->init_done) {
> +               pr_info(C "Device already initialized!\n");
> +               return -EBUSY;
> +       }
> +
> +       ret = setup_backing_swap(rzs);
> +       if (ret)
> +               goto fail;
> +
> +       si_meminfo(&i);
> +       /* Here is a trivia: guess unit used for i.totalram !! */
> +       totalram_bytes = i.totalram << PAGE_SHIFT;

You can use totalram_pages here. OTOH, I'm not sure why the driver
needs this information. Hmm?

> +
> +       if (rzs->backing_swap)
> +               ramzswap_set_memlimit(rzs, totalram_bytes);
> +       else
> +               ramzswap_set_disksize(rzs, totalram_bytes);
> +
> +       rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
> +       if (rzs->compress_workmem == NULL) {
> +               pr_err(C "Error allocating compressor working memory!\n");
> +               ret = -ENOMEM;
> +               goto fail;
> +       }
> +
> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);

Use alloc_pages(__GFP_ZERO) here?

> diff --git a/drivers/block/ramzswap/ramzswap_drv.h b/drivers/block/ramzswap/ramzswap_drv.h
> new file mode 100644
> index 0000000..7f77edc
> --- /dev/null
> +++ b/drivers/block/ramzswap/ramzswap_drv.h

> +
> +#define SECTOR_SHIFT           9
> +#define SECTOR_SIZE            (1 << SECTOR_SHIFT)
> +#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
> +#define SECTORS_PER_PAGE       (1 << SECTORS_PER_PAGE_SHIFT)

Don't we have these defines somewhere in include/linux?

> +
> +/* Message prefix */
> +#define C "ramzswap: "

Use pr_fmt() instead.

> +
> +/* Debugging and Stats */
> +#define NOP    do { } while (0)

Huh? Drop this.

> +
> +#if defined(CONFIG_BLK_DEV_RAMZSWAP_STATS)
> +#define STATS
> +#endif

Why can't you rename that to CONFIG_RAMZSWAP_STATS and use that
instead of the very generic STATS?

> +
> +#if defined(STATS)
> +#define stat_inc(stat)                 ((stat)++)
> +#define stat_dec(stat)                 ((stat)--)
> +#define stat_inc_if_less(stat, val1, val2) \
> +                               ((stat) += ((val1) < (val2) ? 1 : 0))
> +#define stat_dec_if_less(stat, val1, val2) \
> +                               ((stat) -= ((val1) < (val2) ? 1 : 0))
> +#else  /* STATS */
> +#define stat_inc(x)                    NOP
> +#define stat_dec(x)                    NOP
> +#define stat_inc_if_less(x, v1, v2)    NOP
> +#define stat_dec_if_less(x, v1, v2)    NOP
> +#endif /* STATS */

Why do you need inc_if_less() and dec_if_less()? And why are these not
static inlines?

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-14 20:10     ` Pekka Enberg
  0 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-09-14 20:10 UTC (permalink / raw)
  To: ngupta
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc

Hi Nitin,

I am not a block driver expert but here are some comments on the code
that probably need to be addressed before merging.

                        Pekka

On Thu, Sep 10, 2009 at 12:19 AM, Nitin Gupta <ngupta@vflare.org> wrote:
> @@ -0,0 +1,1529 @@
> +/*
> + * Compressed RAM based swap device
> + *
> + * Copyright (C) 2008, 2009  Nitin Gupta
> + *
> + * This code is released using a dual license strategy: BSD/GPL
> + * You can choose the licence that better fits your requirements.
> + *
> + * Released under the terms of 3-clause BSD License
> + * Released under the terms of GNU General Public License Version 2.0
> + *
> + * Project home: http://compcache.googlecode.com
> + */
> +
> +#include <linux/module.h>
> +#include <linux/kernel.h>
> +#include <linux/bitops.h>
> +#include <linux/blkdev.h>
> +#include <linux/buffer_head.h>
> +#include <linux/device.h>
> +#include <linux/genhd.h>
> +#include <linux/highmem.h>
> +#include <linux/lzo.h>
> +#include <linux/marker.h>
> +#include <linux/mutex.h>
> +#include <linux/string.h>
> +#include <linux/swap.h>
> +#include <linux/swapops.h>
> +#include <linux/vmalloc.h>
> +#include <linux/version.h>
> +
> +#include "ramzswap_drv.h"
> +
> +/* Globals */
> +static int RAMZSWAP_MAJOR;
> +static struct ramzswap *DEVICES;
> +
> +/*
> + * Pages that compress to larger than this size are
> + * forwarded to backing swap, if present or stored
> + * uncompressed in memory otherwise.
> + */
> +static unsigned int MAX_CPAGE_SIZE;
> +
> +/* Module params (documentation at end) */
> +static unsigned long NUM_DEVICES;

These variable names should be in lower case.

> +
> +/* Function declarations */
> +static int __init ramzswap_init(void);
> +static int ramzswap_ioctl(struct block_device *, fmode_t,
> +                       unsigned, unsigned long);
> +static int setup_swap_header(struct ramzswap *, union swap_header *);
> +static void ramzswap_set_memlimit(struct ramzswap *, size_t);
> +static void ramzswap_set_disksize(struct ramzswap *, size_t);
> +static void reset_device(struct ramzswap *rzs);

It's preferable not to use forward declarations in new kernel code.

> +static int test_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
> +{
> +       return rzs->table[index].flags & BIT(flag);
> +}
> +
> +static void set_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
> +{
> +       rzs->table[index].flags |= BIT(flag);
> +}
> +
> +static void clear_flag(struct ramzswap *rzs, u32 index,
> +                                       enum rzs_pageflags flag)
> +{
> +       rzs->table[index].flags &= ~BIT(flag);
> +}

These function names could use a ramzswap specific prefix.

> +
> +static int page_zero_filled(void *ptr)
> +{
> +       u32 pos;
> +       u64 *page;
> +
> +       page = (u64 *)ptr;
> +
> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
> +               if (page[pos])
> +                       return 0;
> +       }
> +
> +       return 1;
> +}

This looks like something that could be in lib/string.c.

/me looks

There's strspn so maybe you could introduce a memspn equivalent.

> +
> +/*
> + * Given <pagenum, offset> pair, provide a dereferencable pointer.
> + */
> +static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
> +{
> +       unsigned char *base;
> +
> +       base = kmap_atomic(page, type);
> +       return base + offset;
> +}
> +
> +static void put_ptr_atomic(void *ptr, enum km_type type)
> +{
> +       kunmap_atomic(ptr, type);
> +}

These two functions also appear in xmalloc. It's probably best to just
kill the wrappers and use kmap/kunmap directly.

> +
> +static void ramzswap_flush_dcache_page(struct page *page)
> +{
> +#ifdef CONFIG_ARM
> +       int flag = 0;
> +       /*
> +        * Ugly hack to get flush_dcache_page() work on ARM.
> +        * page_mapping(page) == NULL after clearing this swap cache flag.
> +        * Without clearing this flag, flush_dcache_page() will simply set
> +        * "PG_dcache_dirty" bit and return.
> +        */
> +       if (PageSwapCache(page)) {
> +               flag = 1;
> +               ClearPageSwapCache(page);
> +       }
> +#endif
> +       flush_dcache_page(page);
> +#ifdef CONFIG_ARM
> +       if (flag)
> +               SetPageSwapCache(page);
> +#endif
> +}

The above CONFIG_ARM magic really has no place in drivers/block.

> +static int add_backing_swap_extent(struct ramzswap *rzs,
> +                               pgoff_t phy_pagenum,
> +                               pgoff_t num_pages)
> +{
> +       unsigned int idx;
> +       struct list_head *head;
> +       struct page *curr_page, *new_page;
> +       unsigned int extents_per_page = PAGE_SIZE /
> +                               sizeof(struct ramzswap_backing_extent);
> +
> +       idx = rzs->num_extents % extents_per_page;
> +       if (!idx) {
> +               new_page = alloc_page(__GFP_ZERO);
> +               if (!new_page)
> +                       return -ENOMEM;
> +
> +               if (rzs->num_extents) {
> +                       curr_page = virt_to_page(rzs->curr_extent);
> +                       head = &curr_page->lru;
> +               } else {
> +                       head = &rzs->backing_swap_extent_list;
> +               }
> +
> +               list_add(&new_page->lru, head);
> +               rzs->curr_extent = page_address(new_page);
> +       }
> +
> +       rzs->curr_extent->phy_pagenum = phy_pagenum;
> +       rzs->curr_extent->num_pages = num_pages;
> +
> +       pr_debug(C "extent: [%lu, %lu] %lu\n", phy_pagenum,
> +               phy_pagenum + num_pages - 1, num_pages);

What's this "C" thing everywhere? A subsystem prefix? Shouldn't you
override pr_fmt() instead?

> +static int ramzswap_write(struct ramzswap *rzs, struct bio *bio)
> +{
> +       int ret, fwd_write_request = 0;
> +       u32 offset;
> +       size_t clen;
> +       pgoff_t index;
> +       struct zobj_header *zheader;
> +       struct page *page, *page_store;
> +       unsigned char *user_mem, *cmem, *src;
> +
> +       stat_inc(rzs->stats.num_writes);
> +
> +       page = bio->bi_io_vec[0].bv_page;
> +       index = bio->bi_sector >> SECTORS_PER_PAGE_SHIFT;
> +
> +       src = rzs->compress_buffer;
> +
> +       /*
> +        * System swaps to same sector again when the stored page
> +        * is no longer referenced by any process. So, its now safe
> +        * to free the memory that was allocated for this page.
> +        */
> +       if (rzs->table[index].page)
> +               ramzswap_free_page(rzs, index);
> +
> +       /*
> +        * No memory ia allocated for zero filled pages.
> +        * Simply clear zero page flag.
> +        */
> +       if (test_flag(rzs, index, RZS_ZERO)) {
> +               stat_dec(rzs->stats.pages_zero);
> +               clear_flag(rzs, index, RZS_ZERO);
> +       }
> +
> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
> +       mutex_lock(&rzs->lock);
> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");

Hmm? What's this? I don't think you should be doing ad hoc
trace_mark() in driver code.

> +
> +       user_mem = get_ptr_atomic(page, 0, KM_USER0);
> +       if (page_zero_filled(user_mem)) {
> +               put_ptr_atomic(user_mem, KM_USER0);
> +               mutex_unlock(&rzs->lock);
> +               stat_inc(rzs->stats.pages_zero);
> +               set_flag(rzs, index, RZS_ZERO);
> +
> +               set_bit(BIO_UPTODATE, &bio->bi_flags);
> +               bio_endio(bio, 0);
> +               return 0;
> +       }
> +
> +       if (rzs->backing_swap &&
> +               (rzs->stats.compr_size > rzs->memlimit - PAGE_SIZE)) {
> +               put_ptr_atomic(user_mem, KM_USER0);
> +               mutex_unlock(&rzs->lock);
> +               fwd_write_request = 1;
> +               goto out;
> +       }
> +
> +       ret = lzo1x_1_compress(user_mem, PAGE_SIZE, src, &clen,
> +                               rzs->compress_workmem);
> +
> +       put_ptr_atomic(user_mem, KM_USER0);
> +
> +       if (unlikely(ret != LZO_E_OK)) {
> +               mutex_unlock(&rzs->lock);
> +               pr_err(C "Compression failed! err=%d\n", ret);
> +               stat_inc(rzs->stats.failed_writes);
> +               goto out;
> +       }
> +
> +       /*
> +        * Page is incompressible. Forward it to backing swap
> +        * if present. Otherwise, store it as-is (uncompressed)
> +        * since we do not want to return too many swap write
> +        * errors which has side effect of hanging the system.
> +        */
> +       if (unlikely(clen > MAX_CPAGE_SIZE)) {
> +               if (rzs->backing_swap) {
> +                       mutex_unlock(&rzs->lock);
> +                       fwd_write_request = 1;
> +                       goto out;
> +               }
> +
> +               clen = PAGE_SIZE;
> +               page_store = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
> +               if (unlikely(!page_store)) {
> +                       mutex_unlock(&rzs->lock);
> +                       pr_info(C "Error allocating memory for incompressible "
> +                               "page: %lu\n", index);
> +                       stat_inc(rzs->stats.failed_writes);
> +                       goto out;
> +               }
> +
> +               offset = 0;
> +               set_flag(rzs, index, RZS_UNCOMPRESSED);
> +               stat_inc(rzs->stats.pages_expand);
> +               rzs->table[index].page = page_store;
> +               src = get_ptr_atomic(page, 0, KM_USER0);
> +               goto memstore;
> +       }
> +
> +       if (xv_malloc(rzs->mem_pool, clen + sizeof(*zheader),
> +                       &rzs->table[index].page, &offset,
> +                       GFP_NOIO | __GFP_HIGHMEM)) {
> +               mutex_unlock(&rzs->lock);
> +               pr_info(C "Error allocating memory for compressed "
> +                       "page: %lu, size=%zu\n", index, clen);
> +               stat_inc(rzs->stats.failed_writes);
> +               if (rzs->backing_swap)
> +                       fwd_write_request = 1;
> +               goto out;
> +       }
> +
> +memstore:
> +       rzs->table[index].offset = offset;
> +
> +       cmem = get_ptr_atomic(rzs->table[index].page,
> +                       rzs->table[index].offset, KM_USER1);
> +
> +#if 0
> +       /* Back-reference needed for memory defragmentation */
> +       if (!test_flag(rzs, index, RZS_UNCOMPRESSED)) {
> +               zheader = (struct zobj_header *)cmem;
> +               zheader->table_idx = index;
> +               cmem += sizeof(*zheader);
> +       }
> +#endif

Drop the above dead code?

> +
> +       memcpy(cmem, src, clen);
> +
> +       put_ptr_atomic(cmem, KM_USER1);
> +       if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
> +               put_ptr_atomic(src, KM_USER0);
> +
> +       /* Update stats */
> +       rzs->stats.compr_size += clen;
> +       stat_inc(rzs->stats.pages_stored);
> +       stat_inc_if_less(rzs->stats.good_compress, clen, PAGE_SIZE / 2 + 1);
> +
> +       mutex_unlock(&rzs->lock);
> +
> +       set_bit(BIO_UPTODATE, &bio->bi_flags);
> +       bio_endio(bio, 0);
> +       return 0;
> +
> +out:
> +       if (fwd_write_request) {
> +               stat_inc(rzs->stats.bdev_num_writes);
> +               bio->bi_bdev = rzs->backing_swap;
> +#if 0
> +               /*
> +                * TODO: We currently have linear mapping of ramzswap and
> +                * backing swap sectors. This is not desired since we want
> +                * to optimize writes to backing swap to minimize disk seeks
> +                * or have effective wear leveling (for SSDs). Also, a
> +                * non-linear mapping is required to implement compressed
> +                * on-disk swapping.
> +                */
> +                bio->bi_sector = get_backing_swap_page()
> +                                       << SECTORS_PER_PAGE_SHIFT;
> +#endif

This too?

> +static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
> +{
> +       int ret;
> +       size_t num_pages, totalram_bytes;
> +       struct sysinfo i;
> +       struct page *page;
> +       union swap_header *swap_header;
> +
> +       if (rzs->init_done) {
> +               pr_info(C "Device already initialized!\n");
> +               return -EBUSY;
> +       }
> +
> +       ret = setup_backing_swap(rzs);
> +       if (ret)
> +               goto fail;
> +
> +       si_meminfo(&i);
> +       /* Here is a trivia: guess unit used for i.totalram !! */
> +       totalram_bytes = i.totalram << PAGE_SHIFT;

You can use totalram_pages here. OTOH, I'm not sure why the driver
needs this information. Hmm?

> +
> +       if (rzs->backing_swap)
> +               ramzswap_set_memlimit(rzs, totalram_bytes);
> +       else
> +               ramzswap_set_disksize(rzs, totalram_bytes);
> +
> +       rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
> +       if (rzs->compress_workmem == NULL) {
> +               pr_err(C "Error allocating compressor working memory!\n");
> +               ret = -ENOMEM;
> +               goto fail;
> +       }
> +
> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);

Use alloc_pages(__GFP_ZERO) here?

> diff --git a/drivers/block/ramzswap/ramzswap_drv.h b/drivers/block/ramzswap/ramzswap_drv.h
> new file mode 100644
> index 0000000..7f77edc
> --- /dev/null
> +++ b/drivers/block/ramzswap/ramzswap_drv.h

> +
> +#define SECTOR_SHIFT           9
> +#define SECTOR_SIZE            (1 << SECTOR_SHIFT)
> +#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
> +#define SECTORS_PER_PAGE       (1 << SECTORS_PER_PAGE_SHIFT)

Don't we have these defines somewhere in include/linux?

> +
> +/* Message prefix */
> +#define C "ramzswap: "

Use pr_fmt() instead.

> +
> +/* Debugging and Stats */
> +#define NOP    do { } while (0)

Huh? Drop this.

> +
> +#if defined(CONFIG_BLK_DEV_RAMZSWAP_STATS)
> +#define STATS
> +#endif

Why can't you rename that to CONFIG_RAMZSWAP_STATS and use that
instead of the very generic STATS?

> +
> +#if defined(STATS)
> +#define stat_inc(stat)                 ((stat)++)
> +#define stat_dec(stat)                 ((stat)--)
> +#define stat_inc_if_less(stat, val1, val2) \
> +                               ((stat) += ((val1) < (val2) ? 1 : 0))
> +#define stat_dec_if_less(stat, val1, val2) \
> +                               ((stat) -= ((val1) < (val2) ? 1 : 0))
> +#else  /* STATS */
> +#define stat_inc(x)                    NOP
> +#define stat_dec(x)                    NOP
> +#define stat_inc_if_less(x, v1, v2)    NOP
> +#define stat_dec_if_less(x, v1, v2)    NOP
> +#endif /* STATS */

Why do you need inc_if_less() and dec_if_less()? And why are these not
static inlines?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-14 20:10     ` Pekka Enberg
@ 2009-09-15  6:39       ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-15  6:39 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc

Hi Pekka,

Thanks for review. My comments inline.

On Tue, Sep 15, 2009 at 1:40 AM, Pekka Enberg <penberg@cs.helsinki.fi> wrote:
>
> I am not a block driver expert but here are some comments on the code
> that probably need to be addressed before merging.
>
>                        Pekka

>
> On Thu, Sep 10, 2009 at 12:19 AM, Nitin Gupta <ngupta@vflare.org> wrote:
>> +
>> +/* Globals */
>> +static int RAMZSWAP_MAJOR;
>> +static struct ramzswap *DEVICES;
>> +
>> +/*
>> + * Pages that compress to larger than this size are
>> + * forwarded to backing swap, if present or stored
>> + * uncompressed in memory otherwise.
>> + */
>> +static unsigned int MAX_CPAGE_SIZE;
>> +
>> +/* Module params (documentation at end) */
>> +static unsigned long NUM_DEVICES;
>
> These variable names should be in lower case.
>

Global variables with lower case causes confusion.


>> +
>> +/* Function declarations */
>> +static int __init ramzswap_init(void);
>> +static int ramzswap_ioctl(struct block_device *, fmode_t,
>> +                       unsigned, unsigned long);
>> +static int setup_swap_header(struct ramzswap *, union swap_header *);
>> +static void ramzswap_set_memlimit(struct ramzswap *, size_t);
>> +static void ramzswap_set_disksize(struct ramzswap *, size_t);
>> +static void reset_device(struct ramzswap *rzs);
>
> It's preferable not to use forward declarations in new kernel code.
>

okay, I will rearrange functions to avoid this.


>> +static int test_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
>> +{
>> +       return rzs->table[index].flags & BIT(flag);
>> +}
>> +
>> +static void set_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
>> +{
>> +       rzs->table[index].flags |= BIT(flag);
>> +}
>> +
>> +static void clear_flag(struct ramzswap *rzs, u32 index,
>> +                                       enum rzs_pageflags flag)
>> +{
>> +       rzs->table[index].flags &= ~BIT(flag);
>> +}
>
> These function names could use a ramzswap specific prefix.

okay.

>
>> +
>> +static int page_zero_filled(void *ptr)
>> +{
>> +       u32 pos;
>> +       u64 *page;
>> +
>> +       page = (u64 *)ptr;
>> +
>> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
>> +               if (page[pos])
>> +                       return 0;
>> +       }
>> +
>> +       return 1;
>> +}
>
> This looks like something that could be in lib/string.c.
>
> /me looks
>
> There's strspn so maybe you could introduce a memspn equivalent.
>


Maybe this is just too specific to this driver. Who else will use it?
So, this simple function should stay within this driver only. If it
finds more user,
we can them move it to lib/string.c.

If I now move it to string.c I am sure I will get reverse argument
from someone else:
"currently, it has no other users so bury it with this driver only".


>> +
>> +/*
>> + * Given <pagenum, offset> pair, provide a dereferencable pointer.
>> + */
>> +static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
>> +{
>> +       unsigned char *base;
>> +
>> +       base = kmap_atomic(page, type);
>> +       return base + offset;
>> +}
>> +
>> +static void put_ptr_atomic(void *ptr, enum km_type type)
>> +{
>> +       kunmap_atomic(ptr, type);
>> +}
>
> These two functions also appear in xmalloc. It's probably best to just
> kill the wrappers and use kmap/kunmap directly.
>

Wrapper for kmap_atomic is nice as spreading:
kmap_atomic(page, KM_USER0,1) + offset everywhere looks worse.
What is the problem if these little 1-liner wrappers are repeated in
xvmalloc too?
To me, they just add some clarity.


>> +
>> +static void ramzswap_flush_dcache_page(struct page *page)
>> +{
>> +#ifdef CONFIG_ARM
>> +       int flag = 0;
>> +       /*
>> +        * Ugly hack to get flush_dcache_page() work on ARM.
>> +        * page_mapping(page) == NULL after clearing this swap cache flag.
>> +        * Without clearing this flag, flush_dcache_page() will simply set
>> +        * "PG_dcache_dirty" bit and return.
>> +        */
>> +       if (PageSwapCache(page)) {
>> +               flag = 1;
>> +               ClearPageSwapCache(page);
>> +       }
>> +#endif
>> +       flush_dcache_page(page);
>> +#ifdef CONFIG_ARM
>> +       if (flag)
>> +               SetPageSwapCache(page);
>> +#endif
>> +}
>
> The above CONFIG_ARM magic really has no place in drivers/block.
>

Please read the comment above this hack to see why its needed. Also,
for details see this mail:
http://www.linux-mips.org/archives/linux-mips/2008-11/msg00038.html

No one replied to above mail. So, I though just to temporarily introduce this
hack while someone makes a proper fix for ARM (I will probably ping ARM/MIPS
folks again for this).

Without this hack, ramzswap simply won't work on ARM. See:
http://code.google.com/p/compcache/issues/detail?id=33

So, its extremely difficult to wait for the _proper_ fix.


>> +
>> +       pr_debug(C "extent: [%lu, %lu] %lu\n", phy_pagenum,
>> +               phy_pagenum + num_pages - 1, num_pages);
>
> What's this "C" thing everywhere? A subsystem prefix? Shouldn't you
> override pr_fmt() instead?
>

Yes, "C" is subsystem prefix. Will now override pr_fmt() instead.


>> +
>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
>> +       mutex_lock(&rzs->lock);
>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
>
> Hmm? What's this? I don't think you should be doing ad hoc
> trace_mark() in driver code.
>

This is not ad hoc. It is to see contention over this lock which I believe is a
major bottleneck even on dual-cores. I need to keep this to measure improvements
as I gradually make this locking more fine grained (using per-cpu buffer etc).



>> +#if 0
>> +       /* Back-reference needed for memory defragmentation */
>> +       if (!test_flag(rzs, index, RZS_UNCOMPRESSED)) {
>> +               zheader = (struct zobj_header *)cmem;
>> +               zheader->table_idx = index;
>> +               cmem += sizeof(*zheader);
>> +       }
>> +#endif
>
> Drop the above dead code?


This is a reminder for me that every object has to contain this header
ultimately
and hence object data does not start immediately from start of chunk.

More than the dead code, the comment adds more value to it. So lets keep it.


>
>> +
>> +       memcpy(cmem, src, clen);
>> +
>> +       put_ptr_atomic(cmem, KM_USER1);
>> +       if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
>> +               put_ptr_atomic(src, KM_USER0);
>> +
>> +       /* Update stats */
>> +       rzs->stats.compr_size += clen;
>> +       stat_inc(rzs->stats.pages_stored);
>> +       stat_inc_if_less(rzs->stats.good_compress, clen, PAGE_SIZE / 2 + 1);
>> +
>> +       mutex_unlock(&rzs->lock);
>> +
>> +       set_bit(BIO_UPTODATE, &bio->bi_flags);
>> +       bio_endio(bio, 0);
>> +       return 0;
>> +
>> +out:
>> +       if (fwd_write_request) {
>> +               stat_inc(rzs->stats.bdev_num_writes);
>> +               bio->bi_bdev = rzs->backing_swap;
>> +#if 0
>> +               /*
>> +                * TODO: We currently have linear mapping of ramzswap and
>> +                * backing swap sectors. This is not desired since we want
>> +                * to optimize writes to backing swap to minimize disk seeks
>> +                * or have effective wear leveling (for SSDs). Also, a
>> +                * non-linear mapping is required to implement compressed
>> +                * on-disk swapping.
>> +                */
>> +                bio->bi_sector = get_backing_swap_page()
>> +                                       << SECTORS_PER_PAGE_SHIFT;
>> +#endif
>
> This too?
>

Again, I want to retain this comment. Its very important for me. So, I prefer to
keep this small bit of dead code.


>> +static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
>> +{
>> +       int ret;
>> +       size_t num_pages, totalram_bytes;
>> +       struct sysinfo i;
>> +       struct page *page;
>> +       union swap_header *swap_header;
>> +
>> +       if (rzs->init_done) {
>> +               pr_info(C "Device already initialized!\n");
>> +               return -EBUSY;
>> +       }
>> +
>> +       ret = setup_backing_swap(rzs);
>> +       if (ret)
>> +               goto fail;
>> +
>> +       si_meminfo(&i);
>> +       /* Here is a trivia: guess unit used for i.totalram !! */
>> +       totalram_bytes = i.totalram << PAGE_SHIFT;
>
> You can use totalram_pages here. OTOH, I'm not sure why the driver
> needs this information. Hmm?
>

The driver sets 'disksize' as 25% of RAM by default. So, it needs to know
how much RAM the system has.

>> +
>> +       if (rzs->backing_swap)
>> +               ramzswap_set_memlimit(rzs, totalram_bytes);
>> +       else
>> +               ramzswap_set_disksize(rzs, totalram_bytes);
>> +
>> +       rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
>> +       if (rzs->compress_workmem == NULL) {
>> +               pr_err(C "Error allocating compressor working memory!\n");
>> +               ret = -ENOMEM;
>> +               goto fail;
>> +       }
>> +
>> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);
>
> Use alloc_pages(__GFP_ZERO) here?
>

alloc pages then map them (i.e. vmalloc). What did we gain? With
vmalloc, pages might
not be physically contiguous which might hurt performance as
compressor runs over this buffer.

So, use kzalloc().


>> diff --git a/drivers/block/ramzswap/ramzswap_drv.h b/drivers/block/ramzswap/ramzswap_drv.h
>> new file mode 100644
>> index 0000000..7f77edc
>> --- /dev/null
>> +++ b/drivers/block/ramzswap/ramzswap_drv.h
>
>> +
>> +#define SECTOR_SHIFT           9
>> +#define SECTOR_SIZE            (1 << SECTOR_SHIFT)
>> +#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
>> +#define SECTORS_PER_PAGE       (1 << SECTORS_PER_PAGE_SHIFT)
>
> Don't we have these defines somewhere in include/linux?
>

I couldn't find something equivalent. At least swap code hard codes a
value of 9.
So, a #define looks somewhat better.


>> +
>> +/* Message prefix */
>> +#define C "ramzswap: "
>
> Use pr_fmt() instead.

okay.


>
>> +
>> +/* Debugging and Stats */
>> +#define NOP    do { } while (0)
>
> Huh? Drop this.

This is more of individual taste. This makes the code look cleaner to me.
I hope its not considered 'over decoration'.


>
>> +
>> +#if defined(CONFIG_BLK_DEV_RAMZSWAP_STATS)
>> +#define STATS
>> +#endif
>
> Why can't you rename that to CONFIG_RAMZSWAP_STATS and use that
> instead of the very generic STATS?
>

Everything in drivers/block/Kconfig has BLK_DEV_ as prefix
BLK_DEV_RAM
BLK_DEV_RAM_COUNT
BLK_DEV_RAM_SIZE

and following the pattern:

BLK_DEV_RAMZSWAP
BLK_DEV_RAMZSWAP_STATS


STATS is just a convenient shortcut for very longish BLK_DEV_RAMZSWAP_STATS
but I see this not used very often, so this shortcut is really not
needed. I will now directly
use the long version.


>> +
>> +#if defined(STATS)
>> +#define stat_inc(stat)                 ((stat)++)
>> +#define stat_dec(stat)                 ((stat)--)
>> +#define stat_inc_if_less(stat, val1, val2) \
>> +                               ((stat) += ((val1) < (val2) ? 1 : 0))
>> +#define stat_dec_if_less(stat, val1, val2) \
>> +                               ((stat) -= ((val1) < (val2) ? 1 : 0))
>> +#else  /* STATS */
>> +#define stat_inc(x)                    NOP
>> +#define stat_dec(x)                    NOP
>> +#define stat_inc_if_less(x, v1, v2)    NOP
>> +#define stat_dec_if_less(x, v1, v2)    NOP
>> +#endif /* STATS */
>
> Why do you need inc_if_less() and dec_if_less()?

No good reason. I will get rid of inc/dec_if_less().


> And why are these not static inlines?
>

stats variables exist only when 'STATS' is defined. So, every call to static
inline will have to be enclosed within '#if defined (STATS)'. Thus we use
macros instead.

Thanks,
Nitin

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15  6:39       ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-15  6:39 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc

Hi Pekka,

Thanks for review. My comments inline.

On Tue, Sep 15, 2009 at 1:40 AM, Pekka Enberg <penberg@cs.helsinki.fi> wrote:
>
> I am not a block driver expert but here are some comments on the code
> that probably need to be addressed before merging.
>
>                        Pekka

>
> On Thu, Sep 10, 2009 at 12:19 AM, Nitin Gupta <ngupta@vflare.org> wrote:
>> +
>> +/* Globals */
>> +static int RAMZSWAP_MAJOR;
>> +static struct ramzswap *DEVICES;
>> +
>> +/*
>> + * Pages that compress to larger than this size are
>> + * forwarded to backing swap, if present or stored
>> + * uncompressed in memory otherwise.
>> + */
>> +static unsigned int MAX_CPAGE_SIZE;
>> +
>> +/* Module params (documentation at end) */
>> +static unsigned long NUM_DEVICES;
>
> These variable names should be in lower case.
>

Global variables with lower case causes confusion.


>> +
>> +/* Function declarations */
>> +static int __init ramzswap_init(void);
>> +static int ramzswap_ioctl(struct block_device *, fmode_t,
>> +                       unsigned, unsigned long);
>> +static int setup_swap_header(struct ramzswap *, union swap_header *);
>> +static void ramzswap_set_memlimit(struct ramzswap *, size_t);
>> +static void ramzswap_set_disksize(struct ramzswap *, size_t);
>> +static void reset_device(struct ramzswap *rzs);
>
> It's preferable not to use forward declarations in new kernel code.
>

okay, I will rearrange functions to avoid this.


>> +static int test_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
>> +{
>> +       return rzs->table[index].flags & BIT(flag);
>> +}
>> +
>> +static void set_flag(struct ramzswap *rzs, u32 index, enum rzs_pageflags flag)
>> +{
>> +       rzs->table[index].flags |= BIT(flag);
>> +}
>> +
>> +static void clear_flag(struct ramzswap *rzs, u32 index,
>> +                                       enum rzs_pageflags flag)
>> +{
>> +       rzs->table[index].flags &= ~BIT(flag);
>> +}
>
> These function names could use a ramzswap specific prefix.

okay.

>
>> +
>> +static int page_zero_filled(void *ptr)
>> +{
>> +       u32 pos;
>> +       u64 *page;
>> +
>> +       page = (u64 *)ptr;
>> +
>> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
>> +               if (page[pos])
>> +                       return 0;
>> +       }
>> +
>> +       return 1;
>> +}
>
> This looks like something that could be in lib/string.c.
>
> /me looks
>
> There's strspn so maybe you could introduce a memspn equivalent.
>


Maybe this is just too specific to this driver. Who else will use it?
So, this simple function should stay within this driver only. If it
finds more user,
we can them move it to lib/string.c.

If I now move it to string.c I am sure I will get reverse argument
from someone else:
"currently, it has no other users so bury it with this driver only".


>> +
>> +/*
>> + * Given <pagenum, offset> pair, provide a dereferencable pointer.
>> + */
>> +static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
>> +{
>> +       unsigned char *base;
>> +
>> +       base = kmap_atomic(page, type);
>> +       return base + offset;
>> +}
>> +
>> +static void put_ptr_atomic(void *ptr, enum km_type type)
>> +{
>> +       kunmap_atomic(ptr, type);
>> +}
>
> These two functions also appear in xmalloc. It's probably best to just
> kill the wrappers and use kmap/kunmap directly.
>

Wrapper for kmap_atomic is nice as spreading:
kmap_atomic(page, KM_USER0,1) + offset everywhere looks worse.
What is the problem if these little 1-liner wrappers are repeated in
xvmalloc too?
To me, they just add some clarity.


>> +
>> +static void ramzswap_flush_dcache_page(struct page *page)
>> +{
>> +#ifdef CONFIG_ARM
>> +       int flag = 0;
>> +       /*
>> +        * Ugly hack to get flush_dcache_page() work on ARM.
>> +        * page_mapping(page) == NULL after clearing this swap cache flag.
>> +        * Without clearing this flag, flush_dcache_page() will simply set
>> +        * "PG_dcache_dirty" bit and return.
>> +        */
>> +       if (PageSwapCache(page)) {
>> +               flag = 1;
>> +               ClearPageSwapCache(page);
>> +       }
>> +#endif
>> +       flush_dcache_page(page);
>> +#ifdef CONFIG_ARM
>> +       if (flag)
>> +               SetPageSwapCache(page);
>> +#endif
>> +}
>
> The above CONFIG_ARM magic really has no place in drivers/block.
>

Please read the comment above this hack to see why its needed. Also,
for details see this mail:
http://www.linux-mips.org/archives/linux-mips/2008-11/msg00038.html

No one replied to above mail. So, I though just to temporarily introduce this
hack while someone makes a proper fix for ARM (I will probably ping ARM/MIPS
folks again for this).

Without this hack, ramzswap simply won't work on ARM. See:
http://code.google.com/p/compcache/issues/detail?id=33

So, its extremely difficult to wait for the _proper_ fix.


>> +
>> +       pr_debug(C "extent: [%lu, %lu] %lu\n", phy_pagenum,
>> +               phy_pagenum + num_pages - 1, num_pages);
>
> What's this "C" thing everywhere? A subsystem prefix? Shouldn't you
> override pr_fmt() instead?
>

Yes, "C" is subsystem prefix. Will now override pr_fmt() instead.


>> +
>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
>> +       mutex_lock(&rzs->lock);
>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
>
> Hmm? What's this? I don't think you should be doing ad hoc
> trace_mark() in driver code.
>

This is not ad hoc. It is to see contention over this lock which I believe is a
major bottleneck even on dual-cores. I need to keep this to measure improvements
as I gradually make this locking more fine grained (using per-cpu buffer etc).



>> +#if 0
>> +       /* Back-reference needed for memory defragmentation */
>> +       if (!test_flag(rzs, index, RZS_UNCOMPRESSED)) {
>> +               zheader = (struct zobj_header *)cmem;
>> +               zheader->table_idx = index;
>> +               cmem += sizeof(*zheader);
>> +       }
>> +#endif
>
> Drop the above dead code?


This is a reminder for me that every object has to contain this header
ultimately
and hence object data does not start immediately from start of chunk.

More than the dead code, the comment adds more value to it. So lets keep it.


>
>> +
>> +       memcpy(cmem, src, clen);
>> +
>> +       put_ptr_atomic(cmem, KM_USER1);
>> +       if (unlikely(test_flag(rzs, index, RZS_UNCOMPRESSED)))
>> +               put_ptr_atomic(src, KM_USER0);
>> +
>> +       /* Update stats */
>> +       rzs->stats.compr_size += clen;
>> +       stat_inc(rzs->stats.pages_stored);
>> +       stat_inc_if_less(rzs->stats.good_compress, clen, PAGE_SIZE / 2 + 1);
>> +
>> +       mutex_unlock(&rzs->lock);
>> +
>> +       set_bit(BIO_UPTODATE, &bio->bi_flags);
>> +       bio_endio(bio, 0);
>> +       return 0;
>> +
>> +out:
>> +       if (fwd_write_request) {
>> +               stat_inc(rzs->stats.bdev_num_writes);
>> +               bio->bi_bdev = rzs->backing_swap;
>> +#if 0
>> +               /*
>> +                * TODO: We currently have linear mapping of ramzswap and
>> +                * backing swap sectors. This is not desired since we want
>> +                * to optimize writes to backing swap to minimize disk seeks
>> +                * or have effective wear leveling (for SSDs). Also, a
>> +                * non-linear mapping is required to implement compressed
>> +                * on-disk swapping.
>> +                */
>> +                bio->bi_sector = get_backing_swap_page()
>> +                                       << SECTORS_PER_PAGE_SHIFT;
>> +#endif
>
> This too?
>

Again, I want to retain this comment. Its very important for me. So, I prefer to
keep this small bit of dead code.


>> +static int ramzswap_ioctl_init_device(struct ramzswap *rzs)
>> +{
>> +       int ret;
>> +       size_t num_pages, totalram_bytes;
>> +       struct sysinfo i;
>> +       struct page *page;
>> +       union swap_header *swap_header;
>> +
>> +       if (rzs->init_done) {
>> +               pr_info(C "Device already initialized!\n");
>> +               return -EBUSY;
>> +       }
>> +
>> +       ret = setup_backing_swap(rzs);
>> +       if (ret)
>> +               goto fail;
>> +
>> +       si_meminfo(&i);
>> +       /* Here is a trivia: guess unit used for i.totalram !! */
>> +       totalram_bytes = i.totalram << PAGE_SHIFT;
>
> You can use totalram_pages here. OTOH, I'm not sure why the driver
> needs this information. Hmm?
>

The driver sets 'disksize' as 25% of RAM by default. So, it needs to know
how much RAM the system has.

>> +
>> +       if (rzs->backing_swap)
>> +               ramzswap_set_memlimit(rzs, totalram_bytes);
>> +       else
>> +               ramzswap_set_disksize(rzs, totalram_bytes);
>> +
>> +       rzs->compress_workmem = kzalloc(LZO1X_MEM_COMPRESS, GFP_KERNEL);
>> +       if (rzs->compress_workmem == NULL) {
>> +               pr_err(C "Error allocating compressor working memory!\n");
>> +               ret = -ENOMEM;
>> +               goto fail;
>> +       }
>> +
>> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);
>
> Use alloc_pages(__GFP_ZERO) here?
>

alloc pages then map them (i.e. vmalloc). What did we gain? With
vmalloc, pages might
not be physically contiguous which might hurt performance as
compressor runs over this buffer.

So, use kzalloc().


>> diff --git a/drivers/block/ramzswap/ramzswap_drv.h b/drivers/block/ramzswap/ramzswap_drv.h
>> new file mode 100644
>> index 0000000..7f77edc
>> --- /dev/null
>> +++ b/drivers/block/ramzswap/ramzswap_drv.h
>
>> +
>> +#define SECTOR_SHIFT           9
>> +#define SECTOR_SIZE            (1 << SECTOR_SHIFT)
>> +#define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
>> +#define SECTORS_PER_PAGE       (1 << SECTORS_PER_PAGE_SHIFT)
>
> Don't we have these defines somewhere in include/linux?
>

I couldn't find something equivalent. At least swap code hard codes a
value of 9.
So, a #define looks somewhat better.


>> +
>> +/* Message prefix */
>> +#define C "ramzswap: "
>
> Use pr_fmt() instead.

okay.


>
>> +
>> +/* Debugging and Stats */
>> +#define NOP    do { } while (0)
>
> Huh? Drop this.

This is more of individual taste. This makes the code look cleaner to me.
I hope its not considered 'over decoration'.


>
>> +
>> +#if defined(CONFIG_BLK_DEV_RAMZSWAP_STATS)
>> +#define STATS
>> +#endif
>
> Why can't you rename that to CONFIG_RAMZSWAP_STATS and use that
> instead of the very generic STATS?
>

Everything in drivers/block/Kconfig has BLK_DEV_ as prefix
BLK_DEV_RAM
BLK_DEV_RAM_COUNT
BLK_DEV_RAM_SIZE

and following the pattern:

BLK_DEV_RAMZSWAP
BLK_DEV_RAMZSWAP_STATS


STATS is just a convenient shortcut for very longish BLK_DEV_RAMZSWAP_STATS
but I see this not used very often, so this shortcut is really not
needed. I will now directly
use the long version.


>> +
>> +#if defined(STATS)
>> +#define stat_inc(stat)                 ((stat)++)
>> +#define stat_dec(stat)                 ((stat)--)
>> +#define stat_inc_if_less(stat, val1, val2) \
>> +                               ((stat) += ((val1) < (val2) ? 1 : 0))
>> +#define stat_dec_if_less(stat, val1, val2) \
>> +                               ((stat) -= ((val1) < (val2) ? 1 : 0))
>> +#else  /* STATS */
>> +#define stat_inc(x)                    NOP
>> +#define stat_dec(x)                    NOP
>> +#define stat_inc_if_less(x, v1, v2)    NOP
>> +#define stat_dec_if_less(x, v1, v2)    NOP
>> +#endif /* STATS */
>
> Why do you need inc_if_less() and dec_if_less()?

No good reason. I will get rid of inc/dec_if_less().


> And why are these not static inlines?
>

stats variables exist only when 'STATS' is defined. So, every call to static
inline will have to be enclosed within '#if defined (STATS)'. Thus we use
macros instead.

Thanks,
Nitin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-15  6:39       ` Nitin Gupta
@ 2009-09-15  7:30         ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-09-15  7:30 UTC (permalink / raw)
  To: Nitin Gupta
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt

Hi Nitin,

On Tue, Sep 15, 2009 at 9:39 AM, Nitin Gupta <ngupta@vflare.org> wrote:
>> On Thu, Sep 10, 2009 at 12:19 AM, Nitin Gupta <ngupta@vflare.org> wrote:
>>> +
>>> +/* Globals */
>>> +static int RAMZSWAP_MAJOR;
>>> +static struct ramzswap *DEVICES;
>>> +
>>> +/*
>>> + * Pages that compress to larger than this size are
>>> + * forwarded to backing swap, if present or stored
>>> + * uncompressed in memory otherwise.
>>> + */
>>> +static unsigned int MAX_CPAGE_SIZE;
>>> +
>>> +/* Module params (documentation at end) */
>>> +static unsigned long NUM_DEVICES;
>>
>> These variable names should be in lower case.
>
> Global variables with lower case causes confusion.

Hmm? You are not following the kernel coding style here. It's as simple as that.

>>> +static int page_zero_filled(void *ptr)
>>> +{
>>> +       u32 pos;
>>> +       u64 *page;
>>> +
>>> +       page = (u64 *)ptr;
>>> +
>>> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
>>> +               if (page[pos])
>>> +                       return 0;
>>> +       }
>>> +
>>> +       return 1;
>>> +}
>>
>> This looks like something that could be in lib/string.c.
>>
>> /me looks
>>
>> There's strspn so maybe you could introduce a memspn equivalent.
>
> Maybe this is just too specific to this driver. Who else will use it?
> So, this simple function should stay within this driver only. If it
> finds more user, we can them move it to lib/string.c.
>
> If I now move it to string.c I am sure I will get reverse argument
> from someone else:
> "currently, it has no other users so bury it with this driver only".

How can you be sure about that? If you don't want to move it to
generic code, fine, but the above argumentation doesn't really
convince me. Check the git logs to see that this is *exactly* how new
functions get added to lib/string.c. It's not always a question of two
or more users, it's also an API issue. It doesn't make sense to put
helpers in driver code where they don't belong (and won't be
discovered if they're needed somewhere else).

>>> +/*
>>> + * Given <pagenum, offset> pair, provide a dereferencable pointer.
>>> + */
>>> +static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
>>> +{
>>> +       unsigned char *base;
>>> +
>>> +       base = kmap_atomic(page, type);
>>> +       return base + offset;
>>> +}
>>> +
>>> +static void put_ptr_atomic(void *ptr, enum km_type type)
>>> +{
>>> +       kunmap_atomic(ptr, type);
>>> +}
>>
>> These two functions also appear in xmalloc. It's probably best to just
>> kill the wrappers and use kmap/kunmap directly.
>
> Wrapper for kmap_atomic is nice as spreading:
> kmap_atomic(page, KM_USER0,1) + offset everywhere looks worse.
> What is the problem if these little 1-liner wrappers are repeated in
> xvmalloc too?
> To me, they just add some clarity.

To me, they look like useless wrappers which we don't do in the kernel.

>>> +static void ramzswap_flush_dcache_page(struct page *page)
>>> +{
>>> +#ifdef CONFIG_ARM
>>> +       int flag = 0;
>>> +       /*
>>> +        * Ugly hack to get flush_dcache_page() work on ARM.
>>> +        * page_mapping(page) == NULL after clearing this swap cache flag.
>>> +        * Without clearing this flag, flush_dcache_page() will simply set
>>> +        * "PG_dcache_dirty" bit and return.
>>> +        */
>>> +       if (PageSwapCache(page)) {
>>> +               flag = 1;
>>> +               ClearPageSwapCache(page);
>>> +       }
>>> +#endif
>>> +       flush_dcache_page(page);
>>> +#ifdef CONFIG_ARM
>>> +       if (flag)
>>> +               SetPageSwapCache(page);
>>> +#endif
>>> +}
>>
>> The above CONFIG_ARM magic really has no place in drivers/block.
>>
>
> Please read the comment above this hack to see why its needed. Also,
> for details see this mail:
> http://www.linux-mips.org/archives/linux-mips/2008-11/msg00038.html
>
> No one replied to above mail. So, I though just to temporarily introduce this
> hack while someone makes a proper fix for ARM (I will probably ping ARM/MIPS
> folks again for this).
>
> Without this hack, ramzswap simply won't work on ARM. See:
> http://code.google.com/p/compcache/issues/detail?id=33
>
> So, its extremely difficult to wait for the _proper_ fix.

Then make ramzswap depend on !CONFIG_ARM. In any case, CONFIG_ARM bits
really don't belong into drivers/block.

>>> +
>>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
>>> +       mutex_lock(&rzs->lock);
>>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
>>
>> Hmm? What's this? I don't think you should be doing ad hoc
>> trace_mark() in driver code.
>
> This is not ad hoc. It is to see contention over this lock which I believe is a
> major bottleneck even on dual-cores. I need to keep this to measure improvements
> as I gradually make this locking more fine grained (using per-cpu buffer etc).

It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
those bits out-of-tree until the issue is resolved, really.

>>> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);
>>
>> Use alloc_pages(__GFP_ZERO) here?
>
> alloc pages then map them (i.e. vmalloc). What did we gain? With
> vmalloc, pages might
> not be physically contiguous which might hurt performance as
> compressor runs over this buffer.
>
> So, use kzalloc().

I don't know what you're talking about. kzalloc() calls
__get_free_pages() directly for your allocation. You probably should
use that directly.

>>> +/* Debugging and Stats */
>>> +#define NOP    do { } while (0)
>>
>> Huh? Drop this.
>
> This is more of individual taste. This makes the code look cleaner to me.
> I hope its not considered 'over decoration'.

Hey, the kernel doesn't care about your or my individual taste. I'm
pointing out things that I think fall in the category of "we don't do
shit like this in the kernel", not things _I_ personally find
annoying. If you want to ignore those comments, fine, that's your
prerogative. However, people usually have better results in getting
their code merged when they listen to kernel developers who take the
time to review their code.

                        Pekka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15  7:30         ` Pekka Enberg
  0 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-09-15  7:30 UTC (permalink / raw)
  To: Nitin Gupta
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt

Hi Nitin,

On Tue, Sep 15, 2009 at 9:39 AM, Nitin Gupta <ngupta@vflare.org> wrote:
>> On Thu, Sep 10, 2009 at 12:19 AM, Nitin Gupta <ngupta@vflare.org> wrote:
>>> +
>>> +/* Globals */
>>> +static int RAMZSWAP_MAJOR;
>>> +static struct ramzswap *DEVICES;
>>> +
>>> +/*
>>> + * Pages that compress to larger than this size are
>>> + * forwarded to backing swap, if present or stored
>>> + * uncompressed in memory otherwise.
>>> + */
>>> +static unsigned int MAX_CPAGE_SIZE;
>>> +
>>> +/* Module params (documentation at end) */
>>> +static unsigned long NUM_DEVICES;
>>
>> These variable names should be in lower case.
>
> Global variables with lower case causes confusion.

Hmm? You are not following the kernel coding style here. It's as simple as that.

>>> +static int page_zero_filled(void *ptr)
>>> +{
>>> +       u32 pos;
>>> +       u64 *page;
>>> +
>>> +       page = (u64 *)ptr;
>>> +
>>> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
>>> +               if (page[pos])
>>> +                       return 0;
>>> +       }
>>> +
>>> +       return 1;
>>> +}
>>
>> This looks like something that could be in lib/string.c.
>>
>> /me looks
>>
>> There's strspn so maybe you could introduce a memspn equivalent.
>
> Maybe this is just too specific to this driver. Who else will use it?
> So, this simple function should stay within this driver only. If it
> finds more user, we can them move it to lib/string.c.
>
> If I now move it to string.c I am sure I will get reverse argument
> from someone else:
> "currently, it has no other users so bury it with this driver only".

How can you be sure about that? If you don't want to move it to
generic code, fine, but the above argumentation doesn't really
convince me. Check the git logs to see that this is *exactly* how new
functions get added to lib/string.c. It's not always a question of two
or more users, it's also an API issue. It doesn't make sense to put
helpers in driver code where they don't belong (and won't be
discovered if they're needed somewhere else).

>>> +/*
>>> + * Given <pagenum, offset> pair, provide a dereferencable pointer.
>>> + */
>>> +static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
>>> +{
>>> +       unsigned char *base;
>>> +
>>> +       base = kmap_atomic(page, type);
>>> +       return base + offset;
>>> +}
>>> +
>>> +static void put_ptr_atomic(void *ptr, enum km_type type)
>>> +{
>>> +       kunmap_atomic(ptr, type);
>>> +}
>>
>> These two functions also appear in xmalloc. It's probably best to just
>> kill the wrappers and use kmap/kunmap directly.
>
> Wrapper for kmap_atomic is nice as spreading:
> kmap_atomic(page, KM_USER0,1) + offset everywhere looks worse.
> What is the problem if these little 1-liner wrappers are repeated in
> xvmalloc too?
> To me, they just add some clarity.

To me, they look like useless wrappers which we don't do in the kernel.

>>> +static void ramzswap_flush_dcache_page(struct page *page)
>>> +{
>>> +#ifdef CONFIG_ARM
>>> +       int flag = 0;
>>> +       /*
>>> +        * Ugly hack to get flush_dcache_page() work on ARM.
>>> +        * page_mapping(page) == NULL after clearing this swap cache flag.
>>> +        * Without clearing this flag, flush_dcache_page() will simply set
>>> +        * "PG_dcache_dirty" bit and return.
>>> +        */
>>> +       if (PageSwapCache(page)) {
>>> +               flag = 1;
>>> +               ClearPageSwapCache(page);
>>> +       }
>>> +#endif
>>> +       flush_dcache_page(page);
>>> +#ifdef CONFIG_ARM
>>> +       if (flag)
>>> +               SetPageSwapCache(page);
>>> +#endif
>>> +}
>>
>> The above CONFIG_ARM magic really has no place in drivers/block.
>>
>
> Please read the comment above this hack to see why its needed. Also,
> for details see this mail:
> http://www.linux-mips.org/archives/linux-mips/2008-11/msg00038.html
>
> No one replied to above mail. So, I though just to temporarily introduce this
> hack while someone makes a proper fix for ARM (I will probably ping ARM/MIPS
> folks again for this).
>
> Without this hack, ramzswap simply won't work on ARM. See:
> http://code.google.com/p/compcache/issues/detail?id=33
>
> So, its extremely difficult to wait for the _proper_ fix.

Then make ramzswap depend on !CONFIG_ARM. In any case, CONFIG_ARM bits
really don't belong into drivers/block.

>>> +
>>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
>>> +       mutex_lock(&rzs->lock);
>>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
>>
>> Hmm? What's this? I don't think you should be doing ad hoc
>> trace_mark() in driver code.
>
> This is not ad hoc. It is to see contention over this lock which I believe is a
> major bottleneck even on dual-cores. I need to keep this to measure improvements
> as I gradually make this locking more fine grained (using per-cpu buffer etc).

It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
those bits out-of-tree until the issue is resolved, really.

>>> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);
>>
>> Use alloc_pages(__GFP_ZERO) here?
>
> alloc pages then map them (i.e. vmalloc). What did we gain? With
> vmalloc, pages might
> not be physically contiguous which might hurt performance as
> compressor runs over this buffer.
>
> So, use kzalloc().

I don't know what you're talking about. kzalloc() calls
__get_free_pages() directly for your allocation. You probably should
use that directly.

>>> +/* Debugging and Stats */
>>> +#define NOP    do { } while (0)
>>
>> Huh? Drop this.
>
> This is more of individual taste. This makes the code look cleaner to me.
> I hope its not considered 'over decoration'.

Hey, the kernel doesn't care about your or my individual taste. I'm
pointing out things that I think fall in the category of "we don't do
shit like this in the kernel", not things _I_ personally find
annoying. If you want to ignore those comments, fine, that's your
prerogative. However, people usually have better results in getting
their code merged when they listen to kernel developers who take the
time to review their code.

                        Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-15  7:30         ` Pekka Enberg
@ 2009-09-15  8:21           ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-15  8:21 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt

Hi Pekka,

On Tue, Sep 15, 2009 at 1:00 PM, Pekka Enberg <penberg@cs.helsinki.fi> wrote:

>
> On Tue, Sep 15, 2009 at 9:39 AM, Nitin Gupta <ngupta@vflare.org> wrote:
>>> On Thu, Sep 10, 2009 at 12:19 AM, Nitin Gupta <ngupta@vflare.org> wrote:
>>>> +
>>>> +/* Globals */
>>>> +static int RAMZSWAP_MAJOR;
>>>> +static struct ramzswap *DEVICES;
>>>> +
>>>> +/*
>>>> + * Pages that compress to larger than this size are
>>>> + * forwarded to backing swap, if present or stored
>>>> + * uncompressed in memory otherwise.
>>>> + */
>>>> +static unsigned int MAX_CPAGE_SIZE;
>>>> +
>>>> +/* Module params (documentation at end) */
>>>> +static unsigned long NUM_DEVICES;
>>>
>>> These variable names should be in lower case.
>>
>> Global variables with lower case causes confusion.
>
> Hmm? You are not following the kernel coding style here. It's as simple as that.
>

ok....all lower case/.



>>>> +static int page_zero_filled(void *ptr)
>>>> +{
>>>> +       u32 pos;
>>>> +       u64 *page;
>>>> +
>>>> +       page = (u64 *)ptr;
>>>> +
>>>> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
>>>> +               if (page[pos])
>>>> +                       return 0;
>>>> +       }
>>>> +
>>>> +       return 1;
>>>> +}
>>>
>>> This looks like something that could be in lib/string.c.
>>>
>>> /me looks
>>>
>>> There's strspn so maybe you could introduce a memspn equivalent.
>>
>> Maybe this is just too specific to this driver. Who else will use it?
>> So, this simple function should stay within this driver only. If it
>> finds more user, we can them move it to lib/string.c.
>>
>> If I now move it to string.c I am sure I will get reverse argument
>> from someone else:
>> "currently, it has no other users so bury it with this driver only".
>
> How can you be sure about that? If you don't want to move it to
> generic code, fine, but the above argumentation doesn't really
> convince me. Check the git logs to see that this is *exactly* how new
> functions get added to lib/string.c. It's not always a question of two
> or more users, it's also an API issue. It doesn't make sense to put
> helpers in driver code where they don't belong (and won't be
> discovered if they're needed somewhere else).
>

I don't want to ponder too much about this point now. If you all are okay
with keeping this function buried in driver, I will do so. I'm almost tired
maintaining this compcache thing outside of mainline.


>>>> +/*
>>>> + * Given <pagenum, offset> pair, provide a dereferencable pointer.
>>>> + */
>>>> +static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
>>>> +{
>>>> +       unsigned char *base;
>>>> +
>>>> +       base = kmap_atomic(page, type);
>>>> +       return base + offset;
>>>> +}
>>>> +
>>>> +static void put_ptr_atomic(void *ptr, enum km_type type)
>>>> +{
>>>> +       kunmap_atomic(ptr, type);
>>>> +}
>>>
>>> These two functions also appear in xmalloc. It's probably best to just
>>> kill the wrappers and use kmap/kunmap directly.
>>
>> Wrapper for kmap_atomic is nice as spreading:
>> kmap_atomic(page, KM_USER0,1) + offset everywhere looks worse.
>> What is the problem if these little 1-liner wrappers are repeated in
>> xvmalloc too?
>> To me, they just add some clarity.
>
> To me, they look like useless wrappers which we don't do in the kernel.
>

I will see how it looks without wrappers and depending on that I will decide
if keeping this wrapper is better. Again, I don't want to ponder too much about
this and will open code this if so required.


>>>> +static void ramzswap_flush_dcache_page(struct page *page)
>>>> +{
>>>> +#ifdef CONFIG_ARM
>>>> +       int flag = 0;
>>>> +       /*
>>>> +        * Ugly hack to get flush_dcache_page() work on ARM.
>>>> +        * page_mapping(page) == NULL after clearing this swap cache flag.
>>>> +        * Without clearing this flag, flush_dcache_page() will simply set
>>>> +        * "PG_dcache_dirty" bit and return.
>>>> +        */
>>>> +       if (PageSwapCache(page)) {
>>>> +               flag = 1;
>>>> +               ClearPageSwapCache(page);
>>>> +       }
>>>> +#endif
>>>> +       flush_dcache_page(page);
>>>> +#ifdef CONFIG_ARM
>>>> +       if (flag)
>>>> +               SetPageSwapCache(page);
>>>> +#endif
>>>> +}
>>>
>>> The above CONFIG_ARM magic really has no place in drivers/block.
>>>
>>
>> Please read the comment above this hack to see why its needed. Also,
>> for details see this mail:
>> http://www.linux-mips.org/archives/linux-mips/2008-11/msg00038.html
>>
>> No one replied to above mail. So, I though just to temporarily introduce this
>> hack while someone makes a proper fix for ARM (I will probably ping ARM/MIPS
>> folks again for this).
>>
>> Without this hack, ramzswap simply won't work on ARM. See:
>> http://code.google.com/p/compcache/issues/detail?id=33
>>
>> So, its extremely difficult to wait for the _proper_ fix.
>
> Then make ramzswap depend on !CONFIG_ARM. In any case, CONFIG_ARM bits
> really don't belong into drivers/block.
>

ARM is an extremely important user of compcache -- Its currently being
tested (unofficially)
on Android, Nokia etc.


>>>> +
>>>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
>>>> +       mutex_lock(&rzs->lock);
>>>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
>>>
>>> Hmm? What's this? I don't think you should be doing ad hoc
>>> trace_mark() in driver code.
>>
>> This is not ad hoc. It is to see contention over this lock which I believe is a
>> major bottleneck even on dual-cores. I need to keep this to measure improvements
>> as I gradually make this locking more fine grained (using per-cpu buffer etc).
>
> It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
> those bits out-of-tree until the issue is resolved, really.
>

/me is speechless.


>>>> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);
>>>
>>> Use alloc_pages(__GFP_ZERO) here?
>>
>> alloc pages then map them (i.e. vmalloc). What did we gain? With
>> vmalloc, pages might
>> not be physically contiguous which might hurt performance as
>> compressor runs over this buffer.
>>
>> So, use kzalloc().
>
> I don't know what you're talking about. kzalloc() calls
> __get_free_pages() directly for your allocation. You probably should
> use that directly.
>

What is wrong with kzalloc? I'm wholly totally stumped.
I respect your time reviewing the code but this really goes over my head.
We can continue arguing about get_pages vs kzalloc but I doubt if we will
gain anything out of it.


>>>> +/* Debugging and Stats */
>>>> +#define NOP    do { } while (0)
>>>
>>> Huh? Drop this.
>>
>> This is more of individual taste. This makes the code look cleaner to me.
>> I hope its not considered 'over decoration'.
>
> Hey, the kernel doesn't care about your or my individual taste. I'm
> pointing out things that I think fall in the category of "we don't do
> shit like this in the kernel", not things _I_ personally find
> annoying. If you want to ignore those comments, fine, that's your
> prerogative. However, people usually have better results in getting
> their code merged when they listen to kernel developers who take the
> time to review their code.
>

Yikes! no NOP. okay.


Thanks,
Nitin

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15  8:21           ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-15  8:21 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt

Hi Pekka,

On Tue, Sep 15, 2009 at 1:00 PM, Pekka Enberg <penberg@cs.helsinki.fi> wrote:

>
> On Tue, Sep 15, 2009 at 9:39 AM, Nitin Gupta <ngupta@vflare.org> wrote:
>>> On Thu, Sep 10, 2009 at 12:19 AM, Nitin Gupta <ngupta@vflare.org> wrote:
>>>> +
>>>> +/* Globals */
>>>> +static int RAMZSWAP_MAJOR;
>>>> +static struct ramzswap *DEVICES;
>>>> +
>>>> +/*
>>>> + * Pages that compress to larger than this size are
>>>> + * forwarded to backing swap, if present or stored
>>>> + * uncompressed in memory otherwise.
>>>> + */
>>>> +static unsigned int MAX_CPAGE_SIZE;
>>>> +
>>>> +/* Module params (documentation at end) */
>>>> +static unsigned long NUM_DEVICES;
>>>
>>> These variable names should be in lower case.
>>
>> Global variables with lower case causes confusion.
>
> Hmm? You are not following the kernel coding style here. It's as simple as that.
>

ok....all lower case/.



>>>> +static int page_zero_filled(void *ptr)
>>>> +{
>>>> +       u32 pos;
>>>> +       u64 *page;
>>>> +
>>>> +       page = (u64 *)ptr;
>>>> +
>>>> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
>>>> +               if (page[pos])
>>>> +                       return 0;
>>>> +       }
>>>> +
>>>> +       return 1;
>>>> +}
>>>
>>> This looks like something that could be in lib/string.c.
>>>
>>> /me looks
>>>
>>> There's strspn so maybe you could introduce a memspn equivalent.
>>
>> Maybe this is just too specific to this driver. Who else will use it?
>> So, this simple function should stay within this driver only. If it
>> finds more user, we can them move it to lib/string.c.
>>
>> If I now move it to string.c I am sure I will get reverse argument
>> from someone else:
>> "currently, it has no other users so bury it with this driver only".
>
> How can you be sure about that? If you don't want to move it to
> generic code, fine, but the above argumentation doesn't really
> convince me. Check the git logs to see that this is *exactly* how new
> functions get added to lib/string.c. It's not always a question of two
> or more users, it's also an API issue. It doesn't make sense to put
> helpers in driver code where they don't belong (and won't be
> discovered if they're needed somewhere else).
>

I don't want to ponder too much about this point now. If you all are okay
with keeping this function buried in driver, I will do so. I'm almost tired
maintaining this compcache thing outside of mainline.


>>>> +/*
>>>> + * Given <pagenum, offset> pair, provide a dereferencable pointer.
>>>> + */
>>>> +static void *get_ptr_atomic(struct page *page, u16 offset, enum km_type type)
>>>> +{
>>>> +       unsigned char *base;
>>>> +
>>>> +       base = kmap_atomic(page, type);
>>>> +       return base + offset;
>>>> +}
>>>> +
>>>> +static void put_ptr_atomic(void *ptr, enum km_type type)
>>>> +{
>>>> +       kunmap_atomic(ptr, type);
>>>> +}
>>>
>>> These two functions also appear in xmalloc. It's probably best to just
>>> kill the wrappers and use kmap/kunmap directly.
>>
>> Wrapper for kmap_atomic is nice as spreading:
>> kmap_atomic(page, KM_USER0,1) + offset everywhere looks worse.
>> What is the problem if these little 1-liner wrappers are repeated in
>> xvmalloc too?
>> To me, they just add some clarity.
>
> To me, they look like useless wrappers which we don't do in the kernel.
>

I will see how it looks without wrappers and depending on that I will decide
if keeping this wrapper is better. Again, I don't want to ponder too much about
this and will open code this if so required.


>>>> +static void ramzswap_flush_dcache_page(struct page *page)
>>>> +{
>>>> +#ifdef CONFIG_ARM
>>>> +       int flag = 0;
>>>> +       /*
>>>> +        * Ugly hack to get flush_dcache_page() work on ARM.
>>>> +        * page_mapping(page) == NULL after clearing this swap cache flag.
>>>> +        * Without clearing this flag, flush_dcache_page() will simply set
>>>> +        * "PG_dcache_dirty" bit and return.
>>>> +        */
>>>> +       if (PageSwapCache(page)) {
>>>> +               flag = 1;
>>>> +               ClearPageSwapCache(page);
>>>> +       }
>>>> +#endif
>>>> +       flush_dcache_page(page);
>>>> +#ifdef CONFIG_ARM
>>>> +       if (flag)
>>>> +               SetPageSwapCache(page);
>>>> +#endif
>>>> +}
>>>
>>> The above CONFIG_ARM magic really has no place in drivers/block.
>>>
>>
>> Please read the comment above this hack to see why its needed. Also,
>> for details see this mail:
>> http://www.linux-mips.org/archives/linux-mips/2008-11/msg00038.html
>>
>> No one replied to above mail. So, I though just to temporarily introduce this
>> hack while someone makes a proper fix for ARM (I will probably ping ARM/MIPS
>> folks again for this).
>>
>> Without this hack, ramzswap simply won't work on ARM. See:
>> http://code.google.com/p/compcache/issues/detail?id=33
>>
>> So, its extremely difficult to wait for the _proper_ fix.
>
> Then make ramzswap depend on !CONFIG_ARM. In any case, CONFIG_ARM bits
> really don't belong into drivers/block.
>

ARM is an extremely important user of compcache -- Its currently being
tested (unofficially)
on Android, Nokia etc.


>>>> +
>>>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
>>>> +       mutex_lock(&rzs->lock);
>>>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
>>>
>>> Hmm? What's this? I don't think you should be doing ad hoc
>>> trace_mark() in driver code.
>>
>> This is not ad hoc. It is to see contention over this lock which I believe is a
>> major bottleneck even on dual-cores. I need to keep this to measure improvements
>> as I gradually make this locking more fine grained (using per-cpu buffer etc).
>
> It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
> those bits out-of-tree until the issue is resolved, really.
>

/me is speechless.


>>>> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);
>>>
>>> Use alloc_pages(__GFP_ZERO) here?
>>
>> alloc pages then map them (i.e. vmalloc). What did we gain? With
>> vmalloc, pages might
>> not be physically contiguous which might hurt performance as
>> compressor runs over this buffer.
>>
>> So, use kzalloc().
>
> I don't know what you're talking about. kzalloc() calls
> __get_free_pages() directly for your allocation. You probably should
> use that directly.
>

What is wrong with kzalloc? I'm wholly totally stumped.
I respect your time reviewing the code but this really goes over my head.
We can continue arguing about get_pages vs kzalloc but I doubt if we will
gain anything out of it.


>>>> +/* Debugging and Stats */
>>>> +#define NOP    do { } while (0)
>>>
>>> Huh? Drop this.
>>
>> This is more of individual taste. This makes the code look cleaner to me.
>> I hope its not considered 'over decoration'.
>
> Hey, the kernel doesn't care about your or my individual taste. I'm
> pointing out things that I think fall in the category of "we don't do
> shit like this in the kernel", not things _I_ personally find
> annoying. If you want to ignore those comments, fine, that's your
> prerogative. However, people usually have better results in getting
> their code merged when they listen to kernel developers who take the
> time to review their code.
>

Yikes! no NOP. okay.


Thanks,
Nitin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-15  8:21           ` Nitin Gupta
@ 2009-09-15  8:30             ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-09-15  8:30 UTC (permalink / raw)
  To: Nitin Gupta
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt, Greg KH

On Tue, Sep 15, 2009 at 11:21 AM, Nitin Gupta <ngupta@vflare.org> wrote:
> I don't want to ponder too much about this point now. If you all are okay
> with keeping this function buried in driver, I will do so. I'm almost tired
> maintaining this compcache thing outside of mainline.

Yup, whatever makes most sense to you.

>> Then make ramzswap depend on !CONFIG_ARM. In any case, CONFIG_ARM bits
>> really don't belong into drivers/block.
>
> ARM is an extremely important user of compcache -- Its currently being
> tested (unofficially) on Android, Nokia etc.

That's not a technical argument for keeping CONFIG_ARM in the driver.

>>>>> +
>>>>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
>>>>> +       mutex_lock(&rzs->lock);
>>>>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
>>>>
>>>> Hmm? What's this? I don't think you should be doing ad hoc
>>>> trace_mark() in driver code.
>>>
>>> This is not ad hoc. It is to see contention over this lock which I believe is a
>>> major bottleneck even on dual-cores. I need to keep this to measure improvements
>>> as I gradually make this locking more fine grained (using per-cpu buffer etc).
>>
>> It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
>> those bits out-of-tree until the issue is resolved, really.
>
> /me is speechless.

That's fine, I CC'd the ftrace folks. Hopefully they'll be able to help you.

>
>>>>> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);
>>>>
>>>> Use alloc_pages(__GFP_ZERO) here?
>>>
>>> alloc pages then map them (i.e. vmalloc). What did we gain? With
>>> vmalloc, pages might
>>> not be physically contiguous which might hurt performance as
>>> compressor runs over this buffer.
>>>
>>> So, use kzalloc().
>>
>> I don't know what you're talking about. kzalloc() calls
>> __get_free_pages() directly for your allocation. You probably should
>> use that directly.
>
> What is wrong with kzalloc? I'm wholly totally stumped.
> I respect your time reviewing the code but this really goes over my head.
> We can continue arguing about get_pages vs kzalloc but I doubt if we will
> gain anything out of it.

The slab allocator needs metadata for the allocation so you're wasting
memory. If you really want *two pages*, why don't you simply use the
page allocator for that?

Btw, Nitin, why are you targeting drivers/block and not
drivers/staging at this point? It seems obvious enough that there are
still some issues that need to be ironed out (like the CONFIG_ARM
thing) so submitting the driver for inclusion in drivers/staging and
fixing it up there incrementally would likely save you from a lot of
trouble. Greg, does ramzswap sound like something that you'd be
willing to take?

                        Pekka

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15  8:30             ` Pekka Enberg
  0 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-09-15  8:30 UTC (permalink / raw)
  To: Nitin Gupta
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt, Greg KH

On Tue, Sep 15, 2009 at 11:21 AM, Nitin Gupta <ngupta@vflare.org> wrote:
> I don't want to ponder too much about this point now. If you all are okay
> with keeping this function buried in driver, I will do so. I'm almost tired
> maintaining this compcache thing outside of mainline.

Yup, whatever makes most sense to you.

>> Then make ramzswap depend on !CONFIG_ARM. In any case, CONFIG_ARM bits
>> really don't belong into drivers/block.
>
> ARM is an extremely important user of compcache -- Its currently being
> tested (unofficially) on Android, Nokia etc.

That's not a technical argument for keeping CONFIG_ARM in the driver.

>>>>> +
>>>>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
>>>>> +       mutex_lock(&rzs->lock);
>>>>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
>>>>
>>>> Hmm? What's this? I don't think you should be doing ad hoc
>>>> trace_mark() in driver code.
>>>
>>> This is not ad hoc. It is to see contention over this lock which I believe is a
>>> major bottleneck even on dual-cores. I need to keep this to measure improvements
>>> as I gradually make this locking more fine grained (using per-cpu buffer etc).
>>
>> It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
>> those bits out-of-tree until the issue is resolved, really.
>
> /me is speechless.

That's fine, I CC'd the ftrace folks. Hopefully they'll be able to help you.

>
>>>>> +       rzs->compress_buffer = kzalloc(2 * PAGE_SIZE, GFP_KERNEL);
>>>>
>>>> Use alloc_pages(__GFP_ZERO) here?
>>>
>>> alloc pages then map them (i.e. vmalloc). What did we gain? With
>>> vmalloc, pages might
>>> not be physically contiguous which might hurt performance as
>>> compressor runs over this buffer.
>>>
>>> So, use kzalloc().
>>
>> I don't know what you're talking about. kzalloc() calls
>> __get_free_pages() directly for your allocation. You probably should
>> use that directly.
>
> What is wrong with kzalloc? I'm wholly totally stumped.
> I respect your time reviewing the code but this really goes over my head.
> We can continue arguing about get_pages vs kzalloc but I doubt if we will
> gain anything out of it.

The slab allocator needs metadata for the allocation so you're wasting
memory. If you really want *two pages*, why don't you simply use the
page allocator for that?

Btw, Nitin, why are you targeting drivers/block and not
drivers/staging at this point? It seems obvious enough that there are
still some issues that need to be ironed out (like the CONFIG_ARM
thing) so submitting the driver for inclusion in drivers/staging and
fixing it up there incrementally would likely save you from a lot of
trouble. Greg, does ramzswap sound like something that you'd be
willing to take?

                        Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-15  7:30         ` Pekka Enberg
@ 2009-09-15 11:45           ` Ed Tomlinson
  -1 siblings, 0 replies; 40+ messages in thread
From: Ed Tomlinson @ 2009-09-15 11:45 UTC (permalink / raw)
  To: Pekka Enberg, Nitin Gupta
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt, Greg KH

On Tuesday 15 September 2009 03:30:23 you wrote:
> > So, its extremely difficult to wait for the proper fix.
> 
> Then make ramzswap depend on !CONFIG_ARM. In any case, CONFIG_ARM bits
> really don't belong into drivers/block.

Problem is that ramzswap is usefull on boxes like the n800/n810...  So this is a bad
suggestion from my POV.    How about a comment saying this code goes when the 
fix arrives???

Ed Tomlinson

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15 11:45           ` Ed Tomlinson
  0 siblings, 0 replies; 40+ messages in thread
From: Ed Tomlinson @ 2009-09-15 11:45 UTC (permalink / raw)
  To: Pekka Enberg, Nitin Gupta
  Cc: Andrew Morton, Hugh Dickins, Ed Tomlinson, linux-kernel,
	linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt, Greg KH

On Tuesday 15 September 2009 03:30:23 you wrote:
> > So, its extremely difficult to wait for the proper fix.
> 
> Then make ramzswap depend on !CONFIG_ARM. In any case, CONFIG_ARM bits
> really don't belong into drivers/block.

Problem is that ramzswap is usefull on boxes like the n800/n810...  So this is a bad
suggestion from my POV.    How about a comment saying this code goes when the 
fix arrives???

Ed Tomlinson

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-15 11:45           ` Ed Tomlinson
@ 2009-09-15 12:42             ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-09-15 12:42 UTC (permalink / raw)
  To: Ed Tomlinson
  Cc: Nitin Gupta, Andrew Morton, Hugh Dickins, linux-kernel, linux-mm,
	linux-mm-cc, Ingo Molnar, Frédéric Weisbecker,
	Steven Rostedt, Greg KH

On Tuesday 15 September 2009 03:30:23 you wrote:
>> > So, its extremely difficult to wait for the proper fix.
>>
>> Then make ramzswap depend on !CONFIG_ARM. In any case, CONFIG_ARM bits
>> really don't belong into drivers/block.

On Tue, Sep 15, 2009 at 2:45 PM, Ed Tomlinson <edt@aei.ca> wrote:
> Problem is that ramzswap is usefull on boxes like the n800/n810...  So this is a bad
> suggestion from my POV.    How about a comment saying this code goes when the
> fix arrives???

Just put the driver in driver/staging until the issue is resolved.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15 12:42             ` Pekka Enberg
  0 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-09-15 12:42 UTC (permalink / raw)
  To: Ed Tomlinson
  Cc: Nitin Gupta, Andrew Morton, Hugh Dickins, linux-kernel, linux-mm,
	linux-mm-cc, Ingo Molnar, Frédéric Weisbecker,
	Steven Rostedt, Greg KH

On Tuesday 15 September 2009 03:30:23 you wrote:
>> > So, its extremely difficult to wait for the proper fix.
>>
>> Then make ramzswap depend on !CONFIG_ARM. In any case, CONFIG_ARM bits
>> really don't belong into drivers/block.

On Tue, Sep 15, 2009 at 2:45 PM, Ed Tomlinson <edt@aei.ca> wrote:
> Problem is that ramzswap is usefull on boxes like the n800/n810...  So this is a bad
> suggestion from my POV.    How about a comment saying this code goes when the
> fix arrives???

Just put the driver in driver/staging until the issue is resolved.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-15  7:30         ` Pekka Enberg
@ 2009-09-15 13:14           ` Steven Rostedt
  -1 siblings, 0 replies; 40+ messages in thread
From: Steven Rostedt @ 2009-09-15 13:14 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Nitin Gupta, Andrew Morton, Hugh Dickins, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker

On Tue, 2009-09-15 at 10:30 +0300, Pekka Enberg wrote:
> Hi Nitin,

> 
> >>> +static int page_zero_filled(void *ptr)
> >>> +{
> >>> +       u32 pos;
> >>> +       u64 *page;
> >>> +
> >>> +       page = (u64 *)ptr;
> >>> +
> >>> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
> >>> +               if (page[pos])
> >>> +                       return 0;
> >>> +       }
> >>> +
> >>> +       return 1;
> >>> +}
> >>
> >> This looks like something that could be in lib/string.c.
> >>
> >> /me looks
> >>
> >> There's strspn so maybe you could introduce a memspn equivalent.
> >
> > Maybe this is just too specific to this driver. Who else will use it?
> > So, this simple function should stay within this driver only. If it
> > finds more user, we can them move it to lib/string.c.
> >
> > If I now move it to string.c I am sure I will get reverse argument
> > from someone else:
> > "currently, it has no other users so bury it with this driver only".
> 
> How can you be sure about that? If you don't want to move it to
> generic code, fine, but the above argumentation doesn't really
> convince me. Check the git logs to see that this is *exactly* how new
> functions get added to lib/string.c. It's not always a question of two
> or more users, it's also an API issue. It doesn't make sense to put
> helpers in driver code where they don't belong (and won't be
> discovered if they're needed somewhere else).

I agree, a generic function like this should be put into string.c (or
some library). That's the first place I look when I want to do some kind
of generic string or memory manipulation.

If you don't put it there, and another driver writer needs the same
thing, they will write their own. That's how we get 10 different
implementations of the same code in the kernel. Because everyone thinks
"this will only be used by me".


> >>> +
> >>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
> >>> +       mutex_lock(&rzs->lock);
> >>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
> >>
> >> Hmm? What's this? I don't think you should be doing ad hoc
> >> trace_mark() in driver code.
> >
> > This is not ad hoc. It is to see contention over this lock which I believe is a
> > major bottleneck even on dual-cores. I need to keep this to measure improvements
> > as I gradually make this locking more fine grained (using per-cpu buffer etc).
> 
> It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
> those bits out-of-tree until the issue is resolved, really.

Yes, trace_mark is deprecated. You want to use TRACE_EVENT. See how gfs2
does it in:

  fs/gfs2/gfs2_trace.h

and it is well documented in
samples/trace_events/trace-events-samples.[ch]

-- Steve




^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15 13:14           ` Steven Rostedt
  0 siblings, 0 replies; 40+ messages in thread
From: Steven Rostedt @ 2009-09-15 13:14 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Nitin Gupta, Andrew Morton, Hugh Dickins, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker

On Tue, 2009-09-15 at 10:30 +0300, Pekka Enberg wrote:
> Hi Nitin,

> 
> >>> +static int page_zero_filled(void *ptr)
> >>> +{
> >>> +       u32 pos;
> >>> +       u64 *page;
> >>> +
> >>> +       page = (u64 *)ptr;
> >>> +
> >>> +       for (pos = 0; pos != PAGE_SIZE / sizeof(*page); pos++) {
> >>> +               if (page[pos])
> >>> +                       return 0;
> >>> +       }
> >>> +
> >>> +       return 1;
> >>> +}
> >>
> >> This looks like something that could be in lib/string.c.
> >>
> >> /me looks
> >>
> >> There's strspn so maybe you could introduce a memspn equivalent.
> >
> > Maybe this is just too specific to this driver. Who else will use it?
> > So, this simple function should stay within this driver only. If it
> > finds more user, we can them move it to lib/string.c.
> >
> > If I now move it to string.c I am sure I will get reverse argument
> > from someone else:
> > "currently, it has no other users so bury it with this driver only".
> 
> How can you be sure about that? If you don't want to move it to
> generic code, fine, but the above argumentation doesn't really
> convince me. Check the git logs to see that this is *exactly* how new
> functions get added to lib/string.c. It's not always a question of two
> or more users, it's also an API issue. It doesn't make sense to put
> helpers in driver code where they don't belong (and won't be
> discovered if they're needed somewhere else).

I agree, a generic function like this should be put into string.c (or
some library). That's the first place I look when I want to do some kind
of generic string or memory manipulation.

If you don't put it there, and another driver writer needs the same
thing, they will write their own. That's how we get 10 different
implementations of the same code in the kernel. Because everyone thinks
"this will only be used by me".


> >>> +
> >>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
> >>> +       mutex_lock(&rzs->lock);
> >>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
> >>
> >> Hmm? What's this? I don't think you should be doing ad hoc
> >> trace_mark() in driver code.
> >
> > This is not ad hoc. It is to see contention over this lock which I believe is a
> > major bottleneck even on dual-cores. I need to keep this to measure improvements
> > as I gradually make this locking more fine grained (using per-cpu buffer etc).
> 
> It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
> those bits out-of-tree until the issue is resolved, really.

Yes, trace_mark is deprecated. You want to use TRACE_EVENT. See how gfs2
does it in:

  fs/gfs2/gfs2_trace.h

and it is well documented in
samples/trace_events/trace-events-samples.[ch]

-- Steve



--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-15 13:14           ` Steven Rostedt
@ 2009-09-15 13:43             ` Pekka Enberg
  -1 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-09-15 13:43 UTC (permalink / raw)
  To: rostedt
  Cc: Nitin Gupta, Andrew Morton, Hugh Dickins, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker

Hi Steve,

On Tue, 2009-09-15 at 09:14 -0400, Steven Rostedt wrote:
> > >>> +
> > >>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
> > >>> +       mutex_lock(&rzs->lock);
> > >>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
> > >>
> > >> Hmm? What's this? I don't think you should be doing ad hoc
> > >> trace_mark() in driver code.
> > >
> > > This is not ad hoc. It is to see contention over this lock which I believe is a
> > > major bottleneck even on dual-cores. I need to keep this to measure improvements
> > > as I gradually make this locking more fine grained (using per-cpu buffer etc).
> > 
> > It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
> > those bits out-of-tree until the issue is resolved, really.
> 
> Yes, trace_mark is deprecated. You want to use TRACE_EVENT. See how gfs2
> does it in:
> 
>   fs/gfs2/gfs2_trace.h
> 
> and it is well documented in
> samples/trace_events/trace-events-samples.[ch]

Does it really make sense to add special-case tracing in driver code to
profile lock contention for a _single mutex_?

			Pekka


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15 13:43             ` Pekka Enberg
  0 siblings, 0 replies; 40+ messages in thread
From: Pekka Enberg @ 2009-09-15 13:43 UTC (permalink / raw)
  To: rostedt
  Cc: Nitin Gupta, Andrew Morton, Hugh Dickins, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker

Hi Steve,

On Tue, 2009-09-15 at 09:14 -0400, Steven Rostedt wrote:
> > >>> +
> > >>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
> > >>> +       mutex_lock(&rzs->lock);
> > >>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
> > >>
> > >> Hmm? What's this? I don't think you should be doing ad hoc
> > >> trace_mark() in driver code.
> > >
> > > This is not ad hoc. It is to see contention over this lock which I believe is a
> > > major bottleneck even on dual-cores. I need to keep this to measure improvements
> > > as I gradually make this locking more fine grained (using per-cpu buffer etc).
> > 
> > It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
> > those bits out-of-tree until the issue is resolved, really.
> 
> Yes, trace_mark is deprecated. You want to use TRACE_EVENT. See how gfs2
> does it in:
> 
>   fs/gfs2/gfs2_trace.h
> 
> and it is well documented in
> samples/trace_events/trace-events-samples.[ch]

Does it really make sense to add special-case tracing in driver code to
profile lock contention for a _single mutex_?

			Pekka

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-15 13:43             ` Pekka Enberg
@ 2009-09-15 14:08               ` Steven Rostedt
  -1 siblings, 0 replies; 40+ messages in thread
From: Steven Rostedt @ 2009-09-15 14:08 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Nitin Gupta, Andrew Morton, Hugh Dickins, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker

On Tue, 2009-09-15 at 16:43 +0300, Pekka Enberg wrote:
> Hi Steve,
> 
> On Tue, 2009-09-15 at 09:14 -0400, Steven Rostedt wrote:
> > > >>> +
> > > >>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
> > > >>> +       mutex_lock(&rzs->lock);
> > > >>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
> > > >>
> > > >> Hmm? What's this? I don't think you should be doing ad hoc
> > > >> trace_mark() in driver code.
> > > >
> > > > This is not ad hoc. It is to see contention over this lock which I believe is a
> > > > major bottleneck even on dual-cores. I need to keep this to measure improvements
> > > > as I gradually make this locking more fine grained (using per-cpu buffer etc).
> > > 
> > > It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
> > > those bits out-of-tree until the issue is resolved, really.
> > 
> > Yes, trace_mark is deprecated. You want to use TRACE_EVENT. See how gfs2
> > does it in:
> > 
> >   fs/gfs2/gfs2_trace.h
> > 
> > and it is well documented in
> > samples/trace_events/trace-events-samples.[ch]
> 
> Does it really make sense to add special-case tracing in driver code to
> profile lock contention for a _single mutex_?

No that's what LOCKSTAT is for ;-)

-- Steve



^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15 14:08               ` Steven Rostedt
  0 siblings, 0 replies; 40+ messages in thread
From: Steven Rostedt @ 2009-09-15 14:08 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Nitin Gupta, Andrew Morton, Hugh Dickins, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker

On Tue, 2009-09-15 at 16:43 +0300, Pekka Enberg wrote:
> Hi Steve,
> 
> On Tue, 2009-09-15 at 09:14 -0400, Steven Rostedt wrote:
> > > >>> +
> > > >>> +       trace_mark(ramzswap_lock_wait, "ramzswap_lock_wait");
> > > >>> +       mutex_lock(&rzs->lock);
> > > >>> +       trace_mark(ramzswap_lock_acquired, "ramzswap_lock_acquired");
> > > >>
> > > >> Hmm? What's this? I don't think you should be doing ad hoc
> > > >> trace_mark() in driver code.
> > > >
> > > > This is not ad hoc. It is to see contention over this lock which I believe is a
> > > > major bottleneck even on dual-cores. I need to keep this to measure improvements
> > > > as I gradually make this locking more fine grained (using per-cpu buffer etc).
> > > 
> > > It is ad hoc. Talk to the ftrace folks how to do it properly. I'd keep
> > > those bits out-of-tree until the issue is resolved, really.
> > 
> > Yes, trace_mark is deprecated. You want to use TRACE_EVENT. See how gfs2
> > does it in:
> > 
> >   fs/gfs2/gfs2_trace.h
> > 
> > and it is well documented in
> > samples/trace_events/trace-events-samples.[ch]
> 
> Does it really make sense to add special-case tracing in driver code to
> profile lock contention for a _single mutex_?

No that's what LOCKSTAT is for ;-)

-- Steve


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-15  8:30             ` Pekka Enberg
@ 2009-09-15 15:26               ` Greg KH
  -1 siblings, 0 replies; 40+ messages in thread
From: Greg KH @ 2009-09-15 15:26 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Nitin Gupta, Andrew Morton, Hugh Dickins, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt

On Tue, Sep 15, 2009 at 11:30:16AM +0300, Pekka Enberg wrote:
> Btw, Nitin, why are you targeting drivers/block and not
> drivers/staging at this point? It seems obvious enough that there are
> still some issues that need to be ironed out (like the CONFIG_ARM
> thing) so submitting the driver for inclusion in drivers/staging and
> fixing it up there incrementally would likely save you from a lot of
> trouble. Greg, does ramzswap sound like something that you'd be
> willing to take?

If it is self-contained, actually builds, and there is an active
developer who is working getting it merged into the main kernel tree,
then yes, I will be willing to take it into the drivers/staging/ tree.
Just send me the patches.

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15 15:26               ` Greg KH
  0 siblings, 0 replies; 40+ messages in thread
From: Greg KH @ 2009-09-15 15:26 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Nitin Gupta, Andrew Morton, Hugh Dickins, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt

On Tue, Sep 15, 2009 at 11:30:16AM +0300, Pekka Enberg wrote:
> Btw, Nitin, why are you targeting drivers/block and not
> drivers/staging at this point? It seems obvious enough that there are
> still some issues that need to be ironed out (like the CONFIG_ARM
> thing) so submitting the driver for inclusion in drivers/staging and
> fixing it up there incrementally would likely save you from a lot of
> trouble. Greg, does ramzswap sound like something that you'd be
> willing to take?

If it is self-contained, actually builds, and there is an active
developer who is working getting it merged into the main kernel tree,
then yes, I will be willing to take it into the drivers/staging/ tree.
Just send me the patches.

thanks,

greg k-h

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
  2009-09-15 15:26               ` Greg KH
@ 2009-09-15 15:55                 ` Nitin Gupta
  -1 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-15 15:55 UTC (permalink / raw)
  To: Greg KH
  Cc: Pekka Enberg, Andrew Morton, Hugh Dickins, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt

On 09/15/2009 08:56 PM, Greg KH wrote:
> On Tue, Sep 15, 2009 at 11:30:16AM +0300, Pekka Enberg wrote:
>> Btw, Nitin, why are you targeting drivers/block and not
>> drivers/staging at this point? It seems obvious enough that there are
>> still some issues that need to be ironed out (like the CONFIG_ARM
>> thing) so submitting the driver for inclusion in drivers/staging and
>> fixing it up there incrementally would likely save you from a lot of
>> trouble. Greg, does ramzswap sound like something that you'd be
>> willing to take?
>
> If it is self-contained, actually builds, and there is an active
> developer who is working getting it merged into the main kernel tree,
> then yes, I will be willing to take it into the drivers/staging/ tree.
> Just send me the patches.
>

It is self contained (except for swap notify callback part). I will send
revised patches addressing some of coding issues raised in reviews and moving
everything over to drivers/staging.

Thanks,
Nitin


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [PATCH 2/4] virtual block device driver (ramzswap)
@ 2009-09-15 15:55                 ` Nitin Gupta
  0 siblings, 0 replies; 40+ messages in thread
From: Nitin Gupta @ 2009-09-15 15:55 UTC (permalink / raw)
  To: Greg KH
  Cc: Pekka Enberg, Andrew Morton, Hugh Dickins, Ed Tomlinson,
	linux-kernel, linux-mm, linux-mm-cc, Ingo Molnar,
	Frédéric Weisbecker, Steven Rostedt

On 09/15/2009 08:56 PM, Greg KH wrote:
> On Tue, Sep 15, 2009 at 11:30:16AM +0300, Pekka Enberg wrote:
>> Btw, Nitin, why are you targeting drivers/block and not
>> drivers/staging at this point? It seems obvious enough that there are
>> still some issues that need to be ironed out (like the CONFIG_ARM
>> thing) so submitting the driver for inclusion in drivers/staging and
>> fixing it up there incrementally would likely save you from a lot of
>> trouble. Greg, does ramzswap sound like something that you'd be
>> willing to take?
>
> If it is self-contained, actually builds, and there is an active
> developer who is working getting it merged into the main kernel tree,
> then yes, I will be willing to take it into the drivers/staging/ tree.
> Just send me the patches.
>

It is self contained (except for swap notify callback part). I will send
revised patches addressing some of coding issues raised in reviews and moving
everything over to drivers/staging.

Thanks,
Nitin

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2009-09-15 15:56 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <200909100215.36350.ngupta@vflare.org>
2009-09-09 21:19 ` [PATCH 2/4] virtual block device driver (ramzswap) Nitin Gupta
2009-09-09 21:19   ` Nitin Gupta
2009-09-14 20:10   ` Pekka Enberg
2009-09-14 20:10     ` Pekka Enberg
2009-09-15  6:39     ` Nitin Gupta
2009-09-15  6:39       ` Nitin Gupta
2009-09-15  7:30       ` Pekka Enberg
2009-09-15  7:30         ` Pekka Enberg
2009-09-15  8:21         ` Nitin Gupta
2009-09-15  8:21           ` Nitin Gupta
2009-09-15  8:30           ` Pekka Enberg
2009-09-15  8:30             ` Pekka Enberg
2009-09-15 15:26             ` Greg KH
2009-09-15 15:26               ` Greg KH
2009-09-15 15:55               ` Nitin Gupta
2009-09-15 15:55                 ` Nitin Gupta
2009-09-15 11:45         ` Ed Tomlinson
2009-09-15 11:45           ` Ed Tomlinson
2009-09-15 12:42           ` Pekka Enberg
2009-09-15 12:42             ` Pekka Enberg
2009-09-15 13:14         ` Steven Rostedt
2009-09-15 13:14           ` Steven Rostedt
2009-09-15 13:43           ` Pekka Enberg
2009-09-15 13:43             ` Pekka Enberg
2009-09-15 14:08             ` Steven Rostedt
2009-09-15 14:08               ` Steven Rostedt
2009-09-09 21:21 ` [PATCH 4/4] documentation Nitin Gupta
2009-09-09 21:21   ` Nitin Gupta
2009-09-09 21:50 ` [PATCH 3/4] send callback when swap slot is freed Nitin Gupta
2009-09-09 21:50   ` Nitin Gupta
2009-09-09 21:53 ` [PATCH 1/4] xvmalloc memory allocator Nitin Gupta
2009-09-09 21:53   ` Nitin Gupta
2009-09-09 22:02 ` [PATCH 0/4] compcache: in-memory compressed swapping v2 Nitin Gupta
2009-09-09 22:02   ` Nitin Gupta
2009-09-12  2:24   ` Nitin Gupta
2009-09-12  2:24     ` Nitin Gupta
2009-09-13 19:07     ` Hugh Dickins
2009-09-13 19:07       ` Hugh Dickins
2009-09-14  4:04       ` Nitin Gupta
2009-09-14  4:04         ` Nitin Gupta

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.