* [PATCH] [linux-2.6.18-xen] (take 3) transcendent memory ("tmem") linux-side changes
@ 2009-06-16 18:02 Dan Magenheimer
  2009-06-17  6:31 ` Keir Fraser
  0 siblings, 1 reply; 3+ messages in thread
From: Dan Magenheimer @ 2009-06-16 18:02 UTC
  To: Xen-Devel (E-mail); +Cc: Keir Fraser, Jan Beulich

[-- Attachment #1: Type: text/plain, Size: 1471 bytes --]

(take 3: no anonymous unions; match Jan's xen-side patch)

Transcendent memory ("tmem") for Linux

Tmem, when called from a tmem-capable (paravirtualized) guest, makes
use of otherwise unutilized ("fallow") memory to create and manage
pools of pages that can be accessed from the guest either as
"ephemeral" pages or as "persistent" pages. In either case, the pages
are not directly addressable by the guest, only copied to and fro via
the tmem interface. Ephemeral pages are a nice place for a guest to
put recently evicted clean pages that it might need again; these pages
can be reclaimed synchronously by Xen for other guests or other uses.
Persistent pages are a nice place for a guest to put "swap" pages to
avoid sending them to disk. These pages retain data as long as the
guest lives, but count against the guest memory allocation.
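
At the interface level, every tmem operation is a single hypercall that
takes a struct tmem_op (see mm/tmem.c below).  As a rough sketch of what
an ephemeral put and the matching get boil down to (tmem_page_op() and
the names "pool", "ino", "idx" are illustrative here, not part of the
patch):

	/* sketch only; mirrors xen_tmem_op() in mm/tmem.c below */
	static int tmem_page_op(u32 cmd, u32 pool, u64 object, u32 index,
				struct page *page)
	{
		struct tmem_op op;

		op.cmd = cmd;
		op.pool_id = pool;
		op.u.gen.object = object;
		op.u.gen.index = index;
		op.u.gen.tmem_offset = 0;	/* whole-page ops pass zeroes */
		op.u.gen.pfn_offset = 0;
		op.u.gen.len = 0;
		op.u.gen.gmfn = pfn_to_mfn(page_to_pfn(page));
		return HYPERVISOR_tmem_op(&op);	/* 1 == page was copied */
	}

	/* put: Xen may copy the page into the pool or silently decline */
	tmem_page_op(TMEM_PUT_PAGE, pool, ino, idx, page);
	/* get: 1 == hit, page filled in; 0 == miss, caller reads disk */
	tmem_page_op(TMEM_GET_PAGE, pool, ino, idx, page);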

This patch contains the Linux paravirtualization changes to
complement the tmem Xen patch (xen-unstable c/s 19646). It
implements "precache" (ext3 only as of now), "preswap",
and limited "shared precache" (ocfs2 only as of now) support.
CONFIG options are required to turn on
the support (but in this patch they default to "y").  If
the underlying Xen does not have tmem support or has it
turned off, this is sensed early to avoid nearly all
hypercalls.
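
The hooks themselves are thin.  The read-side hook added to fs/mpage.c
below consults the precache before building a bio, and the eviction hook
in mm/filemap.c chooses between a put and a flush; the swap path
(preswap_put()/preswap_get() in mm/page_io.c) is symmetric.  In sketch
form, the precache pattern is:

	/* read path: try precache before issuing block I/O */
	if (precache_get(mapping, index, page) == 1)
		SetPageUptodate(page);	/* hit, disk read avoided */
	/* else fall through to the normal bio-based read */

	/* eviction path: save clean data, never leave stale data behind */
	if (PageUptodate(page))
		precache_put(page->mapping, page->index, page);
	else
		precache_flush(page->mapping, page->index);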

Lots of useful prose about tmem can be found at
http://oss.oracle.com/projects/tmem 

Signed-off-by: Dan Magenheimer <dan.magenheimer@oracle.com>

[-- Attachment #2: tmem-linux-2.6.18-xen-903-090616.patch --]
[-- Type: application/octet-stream, Size: 37904 bytes --]

diff -r ca12928cdafe fs/buffer.c
--- a/fs/buffer.c	Mon Jun 08 12:23:24 2009 +0100
+++ b/fs/buffer.c	Tue Jun 16 11:27:44 2009 -0600
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/precache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 static void invalidate_bh_lrus(void);
@@ -482,6 +483,11 @@ void invalidate_bdev(struct block_device
 	 * that, but not until that's cleaned up.
 	 */
 	invalidate_inode_pages(mapping);
+
+	/* 99% of the time, we don't need to flush the precache on the bdev.
+	 * But, for the strange corners, let's be cautious
+	 */
+	precache_flush_inode(mapping);
 }
 
 /*
diff -r ca12928cdafe fs/ext3/super.c
--- a/fs/ext3/super.c	Mon Jun 08 12:23:24 2009 +0100
+++ b/fs/ext3/super.c	Tue Jun 16 11:27:44 2009 -0600
@@ -35,6 +35,7 @@
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
+#include <linux/precache.h>
 
 #include <asm/uaccess.h>
 
@@ -1167,6 +1168,7 @@ static int ext3_setup_super(struct super
 	} else {
 		printk("internal journal\n");
 	}
+	precache_init(sb);
 	return res;
 }
 
diff -r ca12928cdafe fs/mpage.c
--- a/fs/mpage.c	Mon Jun 08 12:23:24 2009 +0100
+++ b/fs/mpage.c	Tue Jun 16 11:27:44 2009 -0600
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/precache.h>
 
 /*
  * I/O completion handler for multipage BIOs.
@@ -296,6 +297,13 @@ do_mpage_readpage(struct bio *bio, struc
 		}
 	} else if (fully_mapped) {
 		SetPageMappedToDisk(page);
+	}
+
+	if (fully_mapped &&
+	    blocks_per_page == 1 && !PageUptodate(page) &&
+	    precache_get(page->mapping, page->index, page) == 1) {
+		SetPageUptodate(page);
+		goto confused;
 	}
 
 	/*
diff -r ca12928cdafe fs/ocfs2/super.c
--- a/fs/ocfs2/super.c	Mon Jun 08 12:23:24 2009 +0100
+++ b/fs/ocfs2/super.c	Tue Jun 16 11:27:44 2009 -0600
@@ -39,6 +39,7 @@
 #include <linux/parser.h>
 #include <linux/crc32.h>
 #include <linux/debugfs.h>
+#include <linux/precache.h>
 
 #include <cluster/nodemanager.h>
 
@@ -1457,16 +1458,18 @@ static int ocfs2_initialize_super(struct
 
 	di = (struct ocfs2_dinode *) bitmap_bh->b_data;
 	osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
-	brelse(bitmap_bh);
 	mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
 	     (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
 
 	status = ocfs2_init_slot_info(osb);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_rel;
 	}
+	shared_precache_init(sb, &di->id2.i_super.s_uuid[0]);
 
+bail_rel:
+	brelse(bitmap_bh);
 bail:
 	mlog_exit(status);
 	return status;
diff -r ca12928cdafe fs/super.c
--- a/fs/super.c	Mon Jun 08 12:23:24 2009 +0100
+++ b/fs/super.c	Tue Jun 16 11:27:44 2009 -0600
@@ -37,6 +37,7 @@
 #include <linux/idr.h>
 #include <linux/kobject.h>
 #include <linux/mutex.h>
+#include <linux/precache.h>
 #include <asm/uaccess.h>
 
 
@@ -93,6 +94,9 @@ static struct super_block *alloc_super(s
 		s->s_qcop = sb_quotactl_ops;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
+#ifdef CONFIG_PRECACHE
+		s->precache_poolid = -1;
+#endif
 	}
 out:
 	return s;
@@ -181,6 +185,7 @@ void deactivate_super(struct super_block
 		DQUOT_OFF(s);
 		down_write(&s->s_umount);
 		fs->kill_sb(s);
+		precache_flush_filesystem(s);
 		put_filesystem(fs);
 		put_super(s);
 	}
@@ -777,6 +782,9 @@ int get_sb_nodev(struct file_system_type
 		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
+#ifdef CONFIG_PRECACHE
+	s->precache_poolid = -2;
+#endif
 	return simple_set_mnt(mnt, s);
 }
 
diff -r ca12928cdafe include/asm-i386/mach-xen/asm/hypercall.h
--- a/include/asm-i386/mach-xen/asm/hypercall.h	Mon Jun 08 12:23:24 2009 +0100
+++ b/include/asm-i386/mach-xen/asm/hypercall.h	Tue Jun 16 11:27:44 2009 -0600
@@ -404,6 +404,12 @@ HYPERVISOR_kexec_op(
 	return _hypercall2(int, kexec_op, op, args);
 }
 
+static inline int __must_check
+HYPERVISOR_tmem_op(
+	struct tmem_op *op)
+{
+	return _hypercall1(int, tmem_op, op);
+}
 
 
 #endif /* __HYPERCALL_H__ */
diff -r ca12928cdafe include/asm-x86_64/mach-xen/asm/hypercall.h
--- a/include/asm-x86_64/mach-xen/asm/hypercall.h	Mon Jun 08 12:23:24 2009 +0100
+++ b/include/asm-x86_64/mach-xen/asm/hypercall.h	Tue Jun 16 11:27:44 2009 -0600
@@ -405,4 +405,11 @@ HYPERVISOR_kexec_op(
 	return _hypercall2(int, kexec_op, op, args);
 }
 
+static inline int __must_check
+HYPERVISOR_tmem_op(
+	struct tmem_op *op)
+{
+	return _hypercall1(int, tmem_op, op);
+}
+
 #endif /* __HYPERCALL_H__ */
diff -r ca12928cdafe include/linux/fs.h
--- a/include/linux/fs.h	Mon Jun 08 12:23:24 2009 +0100
+++ b/include/linux/fs.h	Tue Jun 16 11:27:44 2009 -0600
@@ -907,6 +907,9 @@ struct super_block {
 	/* Granularity of c/m/atime in ns.
 	   Cannot be worse than a second */
 	u32		   s_time_gran;
+#ifdef CONFIG_PRECACHE
+	u32                precache_poolid;
+#endif
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
diff -r ca12928cdafe include/linux/precache.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/linux/precache.h	Tue Jun 16 11:27:44 2009 -0600
@@ -0,0 +1,55 @@
+#ifndef _LINUX_PRECACHE_H
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#ifdef CONFIG_PRECACHE
+extern void precache_init(struct super_block *sb);
+extern void shared_precache_init(struct super_block *sb, char *uuid);
+extern int precache_get(struct address_space *mapping, unsigned long index,
+	       struct page *empty_page);
+extern int precache_put(struct address_space *mapping, unsigned long index,
+		struct page *page);
+extern int precache_flush(struct address_space *mapping, unsigned long index);
+extern int precache_flush_inode(struct address_space *mapping);
+extern int precache_flush_filesystem(struct super_block *s);
+#else
+static inline void precache_init(struct super_block *sb)
+{
+}
+
+static inline void shared_precache_init(struct super_block *sb, char *uuid)
+{
+}
+
+static inline int precache_get(struct address_space *mapping,
+		unsigned long index, struct page *empty_page)
+{
+	return 0;
+}
+
+static inline int precache_put(struct address_space *mapping,
+		unsigned long index, struct page *page)
+{
+	return 0;
+}
+
+static inline int precache_flush(struct address_space *mapping,
+		unsigned long index)
+{
+	return 0;
+}
+
+static inline int precache_flush_inode(struct address_space *mapping)
+{
+	return 0;
+}
+
+static inline int precache_flush_filesystem(struct super_block *s)
+{
+	return 0;
+}
+#endif
+
+#define _LINUX_PRECACHE_H
+#endif /* _LINUX_PRECACHE_H */
diff -r ca12928cdafe include/linux/swap.h
--- a/include/linux/swap.h	Mon Jun 08 12:23:24 2009 +0100
+++ b/include/linux/swap.h	Tue Jun 16 11:27:44 2009 -0600
@@ -6,6 +6,7 @@
 #include <linux/mmzone.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -143,8 +144,59 @@ struct swap_info_struct {
 	unsigned int pages;
 	unsigned int max;
 	unsigned int inuse_pages;
+#ifdef CONFIG_PRESWAP
+	unsigned long *preswap_map;
+	unsigned int preswap_pages;
+#endif
 	int next;			/* next entry on swap list */
 };
+
+#ifdef CONFIG_PRESWAP
+
+#include <linux/sysctl.h>
+extern int preswap_sysctl_handler(struct ctl_table *, int, struct file *,
+	void __user *, size_t *, loff_t *);
+extern const unsigned long preswap_zero, preswap_infinity;
+
+extern void preswap_shrink(unsigned long);
+extern int preswap_test(struct swap_info_struct *, unsigned long);
+extern void preswap_init(unsigned);
+extern int preswap_put(struct page *);
+extern int preswap_get(struct page *);
+extern void preswap_flush(unsigned, unsigned long);
+extern void preswap_flush_area(unsigned);
+#else
+static inline void preswap_shrink(unsigned long target_pages)
+{
+}
+
+static inline int preswap_test(struct swap_info_struct *sis, unsigned long offset)
+{
+	return 0;
+}
+
+static inline void preswap_init(unsigned type)
+{
+}
+
+static inline int preswap_put(struct page *page)
+{
+	return 0;
+}
+
+static inline int preswap_get(struct page *page)
+{
+	return 0;
+}
+
+static inline void preswap_flush(unsigned type, unsigned long offset)
+{
+}
+
+static inline void preswap_flush_area(unsigned type)
+{
+}
+#endif /* CONFIG_PRESWAP */
 
 struct swap_list_t {
 	int head;	/* head of priority-ordered swapfile list */
@@ -247,7 +299,6 @@ extern int can_share_swap_page(struct pa
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
 struct backing_dev_info;
-
 extern spinlock_t swap_lock;
 
 /* linux/mm/thrash.c */
diff -r ca12928cdafe include/linux/sysctl.h
--- a/include/linux/sysctl.h	Mon Jun 08 12:23:24 2009 +0100
+++ b/include/linux/sysctl.h	Tue Jun 16 11:27:44 2009 -0600
@@ -200,6 +200,7 @@ enum
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
+	VM_PRESWAP_PAGES=36,	/* pages/target_pages in preswap */
 };
 
 
diff -r ca12928cdafe include/xen/interface/xen.h
--- a/include/xen/interface/xen.h	Mon Jun 08 12:23:24 2009 +0100
+++ b/include/xen/interface/xen.h	Tue Jun 16 11:27:44 2009 -0600
@@ -91,6 +91,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_pfn_t);
 #define __HYPERVISOR_sysctl               35
 #define __HYPERVISOR_domctl               36
 #define __HYPERVISOR_kexec_op             37
+#define __HYPERVISOR_tmem_op              38
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
@@ -324,6 +325,30 @@ DEFINE_XEN_GUEST_HANDLE(mmuext_op_t);
 #define VMASST_TYPE_pae_extended_cr3     3
 
 #define MAX_VMASST_TYPE                  3
+
+#ifndef __ASSEMBLY__
+struct tmem_op {
+    uint32_t cmd;
+    int32_t pool_id; /* private > 0; shared < 0; 0 is invalid */
+    union {
+        struct {  /* for cmd == TMEM_NEW_POOL */
+            uint64_t uuid[2];
+            uint32_t flags;
+        } new;
+        struct {
+            uint64_t object;
+            uint32_t index;
+            uint32_t tmem_offset;
+            uint32_t pfn_offset;
+            uint32_t len;
+            xen_pfn_t gmfn; /* guest machine page frame */
+        } gen;
+    } u;
+};
+typedef struct tmem_op tmem_op_t;
+DEFINE_XEN_GUEST_HANDLE(tmem_op_t);
+
+#endif
 
 #ifndef __ASSEMBLY__
 
diff -r ca12928cdafe kernel/sysctl.c
--- a/kernel/sysctl.c	Mon Jun 08 12:23:24 2009 +0100
+++ b/kernel/sysctl.c	Tue Jun 16 11:27:44 2009 -0600
@@ -965,6 +965,18 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
+	},
+#endif
+#ifdef CONFIG_PRESWAP
+	{
+		.ctl_name	= VM_PRESWAP_PAGES,
+		.procname	= "preswap",
+		.data		= NULL,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &preswap_sysctl_handler,
+		.extra1		= (void *)&preswap_zero,
+		.extra2		= (void *)&preswap_infinity,
 	},
 #endif
 	{ .ctl_name = 0 }
diff -r ca12928cdafe mm/Kconfig
--- a/mm/Kconfig	Mon Jun 08 12:23:24 2009 +0100
+++ b/mm/Kconfig	Tue Jun 16 11:27:44 2009 -0600
@@ -152,3 +152,33 @@ config RESOURCES_64BIT
 	default 64BIT
 	help
 	  This option allows memory and IO resources to be 64 bit.
+
+#
+# support for transcendent memory
+#
+config TMEM
+	bool
+	depends on XEN
+	help
+	  In a virtualized environment, allows unused and underutilized
+	  system physical memory to be made accessible through a narrow
+	  well-defined page-copy-based API.  If unsure, say Y.
+
+config PRECACHE
+	bool "Cache clean pages in transcendent memory"
+	depends on XEN
+	select TMEM
+	help
+	  Allows the transcendent memory pool to be used to store clean
+	  page-cache pages which, under some circumstances, will greatly
+	  reduce paging and thus improve performance.  If unsure, say Y.
+
+config PRESWAP
+	bool "Swap pages to transcendent memory"
+	depends on XEN
+	select TMEM
+	help
+	  Allows the transcendent memory pool to be used as a pseudo-swap
+	  device which, under some circumstances, will greatly reduce
+	  swapping and thus improve performance.  If unsure, say Y.
+
diff -r ca12928cdafe mm/Makefile
--- a/mm/Makefile	Mon Jun 08 12:23:24 2009 +0100
+++ b/mm/Makefile	Tue Jun 16 11:27:44 2009 -0600
@@ -13,6 +13,9 @@ obj-y			:= bootmem.o filemap.o mempool.o
 			   prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_TMEM)	+= tmem.o
+obj-$(CONFIG_PRESWAP)	+= preswap.o
+obj-$(CONFIG_PRECACHE)	+= precache.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
diff -r ca12928cdafe mm/filemap.c
--- a/mm/filemap.c	Mon Jun 08 12:23:24 2009 +0100
+++ b/mm/filemap.c	Tue Jun 16 11:27:44 2009 -0600
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
+#include <linux/precache.h>
 #include "filemap.h"
 #include "internal.h"
 
@@ -115,6 +116,16 @@ void __remove_from_page_cache(struct pag
 void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
+
+	/*
+	 * if we're uptodate, flush out into the precache, otherwise
+	 * invalidate any existing precache entries.  We can't leave
+	 * stale data around in the precache once our page is gone
+	 */
+	if (PageUptodate(page))
+		precache_put(page->mapping, page->index, page);
+	else
+		precache_flush(page->mapping, page->index);
 
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
diff -r ca12928cdafe mm/page_io.c
--- a/mm/page_io.c	Mon Jun 08 12:23:24 2009 +0100
+++ b/mm/page_io.c	Tue Jun 16 11:27:44 2009 -0600
@@ -99,10 +99,18 @@ int swap_writepage(struct page *page, st
 		ret = -ENOMEM;
 		goto out;
 	}
+
+	set_page_writeback(page);
+	if (preswap_put(page) == 1) {
+		unlock_page(page);
+		end_page_writeback(page);
+		bio_put(bio);
+		goto out;
+	}
+
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		rw |= (1 << BIO_RW_SYNC);
 	count_vm_event(PSWPOUT);
-	set_page_writeback(page);
 	unlock_page(page);
 	submit_bio(rw, bio);
 out:
@@ -116,6 +124,13 @@ int swap_readpage(struct file *file, str
 
 	BUG_ON(!PageLocked(page));
 	ClearPageUptodate(page);
+
+	if (preswap_get(page) == 1) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		goto out;
+	}
+
 	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
 				end_swap_bio_read);
 	if (bio == NULL) {
diff -r ca12928cdafe mm/precache.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mm/precache.c	Tue Jun 16 11:27:44 2009 -0600
@@ -0,0 +1,140 @@
+/*
+ * linux/mm/precache.c
+ *
+ * Implements "precache" for filesystems/pagecache on top of transcendent
+ * memory ("tmem") API.  A filesystem creates an "ephemeral tmem pool"
+ * and retains the returned pool_id in its superblock.  Clean pages evicted
+ * from pagecache may be "put" into the pool and associated with a "handle"
+ * consisting of the pool_id, an object (inode) id, and an index (page offset).
+ * Note that the page is copied to tmem; no kernel mappings are changed.
+ * If the page is later needed, the filesystem (or VFS) issues a "get", passing
+ * the same handle and an empty pageframe.  If successful, the page is copied
+ * into the pageframe and a disk read is avoided.  But since the tmem pool
+ * is of indeterminate size, a "put" page has indeterminate longevity
+ * ("ephemeral"), and the "get" may fail, in which case the filesystem must
+ * read the page from disk as before.  Note that the filesystem/pagecache are
+ * responsible for maintaining coherency between the pagecache, precache,
+ * and the disk, for which "flush page" and "flush object" actions are
+ * provided.  And when a filesystem is unmounted, it must "destroy" the pool.
+ *
+ * Two types of pools may be created for a precache: "private" or "shared".
+ * For a private pool, a successful "get" always flushes, implementing
+ * exclusive semantics; for a "shared" pool (which is intended for use by
+ * co-resident nodes of a cluster filesystem), the "flush" is not guaranteed.
+ * In either case, a failed "duplicate" put (overwrite) always guarantee
+ * the old data is flushed.
+ *
+ * Note also that multiple accesses to a tmem pool may be concurrent and any
+ * ordering must be guaranteed by the caller.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
+
+#include <linux/precache.h>
+#include <linux/module.h>
+#include "tmem.h"
+
+static int precache_auto_allocate; /* set to 1 to auto_allocate */
+
+int precache_put(struct address_space *mapping, unsigned long index,
+ struct page *page)
+{
+	u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+	u64 obj = (unsigned long) mapping->host->i_ino;
+	u32 ind = (u32) index;
+	unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
+	int ret;
+
+	if ((s32)tmem_pool < 0) {
+		if (!precache_auto_allocate)
+			return 0;
+		/* a put on a non-existent precache may auto-allocate one */
+		tmem_pool = ret = tmem_new_pool(0, 0, 0);
+		if (ret < 0)
+			return 0;
+		printk(KERN_INFO
+			"Mapping superblock for s_id=%s to precache_id=%d\n",
+			mapping->host->i_sb->s_id, tmem_pool);
+		mapping->host->i_sb->precache_poolid = tmem_pool;
+	}
+	if (ind != index)
+		return 0;
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+	return tmem_put_page(tmem_pool, obj, ind, mfn);
+}
+
+int precache_get(struct address_space *mapping, unsigned long index,
+ struct page *empty_page)
+{
+	u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+	u64 obj = (unsigned long) mapping->host->i_ino;
+	u32 ind = (u32) index;
+	unsigned long mfn = pfn_to_mfn(page_to_pfn(empty_page));
+
+	if ((s32)tmem_pool < 0)
+		return 0;
+	if (ind != index)
+		return 0;
+
+	return tmem_get_page(tmem_pool, obj, ind, mfn);
+}
+EXPORT_SYMBOL(precache_get);
+
+int precache_flush(struct address_space *mapping, unsigned long index)
+{
+	u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+	u64 obj = (unsigned long) mapping->host->i_ino;
+	u32 ind = (u32) index;
+
+	if ((s32)tmem_pool < 0)
+		return 0;
+	if (ind != index)
+		return 0;
+
+	return tmem_flush_page(tmem_pool, obj, ind);
+}
+EXPORT_SYMBOL(precache_flush);
+
+int precache_flush_inode(struct address_space *mapping)
+{
+	u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+	u64 obj = (unsigned long) mapping->host->i_ino;
+
+	if ((s32)tmem_pool < 0)
+		return 0;
+
+	return tmem_flush_object(tmem_pool, obj);
+}
+EXPORT_SYMBOL(precache_flush_inode);
+
+int precache_flush_filesystem(struct super_block *sb)
+{
+	u32 tmem_pool = sb->precache_poolid;
+	int ret;
+
+	if ((s32)tmem_pool < 0)
+		return 0;
+	ret = tmem_destroy_pool(tmem_pool);
+	if (!ret)
+		return 0;
+	printk(KERN_INFO
+		"Unmapping superblock for s_id=%s from precache_id=%d\n",
+		sb->s_id, tmem_pool);
+	sb->precache_poolid = 0;
+	return 1;
+}
+EXPORT_SYMBOL(precache_flush_filesystem);
+
+void precache_init(struct super_block *sb)
+{
+	sb->precache_poolid = tmem_new_pool(0, 0, 0);
+}
+EXPORT_SYMBOL(precache_init);
+
+void shared_precache_init(struct super_block *sb, char *uuid)
+{
+	u64 uuid_lo = *(u64 *)uuid;
+	u64 uuid_hi = *(u64 *)(&uuid[8]);
+	sb->precache_poolid = tmem_new_pool(uuid_lo, uuid_hi, TMEM_POOL_SHARED);
+}
+EXPORT_SYMBOL(shared_precache_init);
diff -r ca12928cdafe mm/preswap.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mm/preswap.c	Tue Jun 16 11:27:44 2009 -0600
@@ -0,0 +1,184 @@
+/*
+ * linux/mm/preswap.c
+ *
+ * Implements a fast "preswap" on top of the transcendent memory ("tmem") API.
+ * When a swapdisk is enabled (with swapon), a "private persistent tmem pool"
+ * is created along with a bit-per-page preswap_map.  When swapping occurs
+ * and a page is about to be written to disk, a "put" into the pool may first
+ * be attempted by passing the pageframe to be swapped, along with a "handle"
+ * consisting of a pool_id, an object id, and an index.  Since the pool is of
+ * indeterminate size, the "put" may be rejected, in which case the page
+ * is swapped to disk as normal.  If the "put" is successful, the page is
+ * copied to tmem and the preswap_map records the success.  Later, when
+ * the page needs to be swapped in, the preswap_map is checked and, if set,
+ * the page may be obtained with a "get" operation.  Note that the swap
+ * subsystem is responsible for: maintaining coherency between the swapcache,
+ * preswap, and the swapdisk; for evicting stale pages from preswap; and for
+ * emptying preswap when swapoff is performed. The "flush page" and "flush
+ * object" actions are provided for this.
+ *
+ * Note that if a "duplicate put" is performed to overwrite a page and
+ * the "put" operation fails, the page (and old data) is flushed and lost.
+ * Also note that multiple accesses to a tmem pool may be concurrent and
+ * any ordering must be guaranteed by the caller.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/uaccess.h>
+#include "tmem.h"
+
+static u32 preswap_poolid = -1; /* if negative, preswap will never call tmem */
+
+const unsigned long preswap_zero = 0, preswap_infinity = ~0UL; /* for sysctl */
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads.  Later, larger nr_cpus -> larger SWIZ_BITS
+ */
+#define SWIZ_BITS		4
+#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
+#define oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
+#define iswiz(_ind)		(_ind >> SWIZ_BITS)
+
+/*
+ * preswap_map test/set/clear operations (must be atomic)
+ */
+
+int preswap_test(struct swap_info_struct *sis, unsigned long offset)
+{
+	if (!sis->preswap_map)
+		return 0;
+	return test_bit(offset % BITS_PER_LONG,
+		&sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void preswap_set(struct swap_info_struct *sis,
+				unsigned long offset)
+{
+	if (!sis->preswap_map)
+		return;
+	set_bit(offset % BITS_PER_LONG,
+		&sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void preswap_clear(struct swap_info_struct *sis,
+				unsigned long offset)
+{
+	if (!sis->preswap_map)
+		return;
+	clear_bit(offset % BITS_PER_LONG,
+		&sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+/*
+ * preswap tmem operations
+ */
+
+/* returns 1 if the page was successfully put into preswap, 0 if the page
+ * was declined, and -ERRNO for a specific error */
+int preswap_put(struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page), };
+	unsigned type = swp_type(entry);
+	pgoff_t offset = swp_offset(entry);
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int dup = 0, ret;
+
+	if ((s32)preswap_poolid < 0)
+		return 0;
+	if (ind64 != ind)
+		return 0;
+	if (preswap_test(sis, offset))
+		dup = 1;
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+	ret = tmem_put_page(preswap_poolid, oswiz(type, ind), iswiz(ind), mfn);
+	if (ret == 1) {
+		preswap_set(sis, offset);
+		if (!dup)
+			sis->preswap_pages++;
+	} else if (dup) {
+		/* failed dup put always results in an automatic flush of
+		 * the (older) page from preswap */
+		preswap_clear(sis, offset);
+		sis->preswap_pages--;
+	}
+	return ret;
+}
+
+/* returns 1 if the page was successfully gotten from preswap, 0 if the page
+ * was not present (should never happen!), and -ERRNO for a specific error */
+int preswap_get(struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page), };
+	unsigned type = swp_type(entry);
+	pgoff_t offset = swp_offset(entry);
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int ret;
+
+	if ((s32)preswap_poolid < 0)
+		return 0;
+	if (ind64 != ind)
+		return 0;
+	if (!preswap_test(sis, offset))
+		return 0;
+	ret = tmem_get_page(preswap_poolid, oswiz(type, ind), iswiz(ind), mfn);
+	return ret;
+}
+
+/* flush a single page from preswap */
+void preswap_flush(unsigned type, unsigned long offset)
+{
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int ret = 1;
+
+	if ((s32)preswap_poolid < 0)
+		return;
+	if (ind64 != ind)
+		return;
+	if (preswap_test(sis, offset)) {
+		ret = tmem_flush_page(preswap_poolid,
+					oswiz(type, ind), iswiz(ind));
+		sis->preswap_pages--;
+		preswap_clear(sis, offset);
+	}
+}
+
+/* flush all pages from the passed swaptype */
+void preswap_flush_area(unsigned type)
+{
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int ind;
+
+	if ((s32)preswap_poolid < 0)
+		return;
+	for (ind = SWIZ_MASK; ind >= 0; ind--)
+		(void)tmem_flush_object(preswap_poolid, oswiz(type, ind));
+	sis->preswap_pages = 0;
+}
+
+void preswap_init(unsigned type)
+{
+	/* only need one tmem pool for all swap types */
+	if ((s32)preswap_poolid >= 0)
+		return;
+	preswap_poolid = tmem_new_pool(0, 0, TMEM_POOL_PERSIST);
+	if ((s32)preswap_poolid < 0)
+		return;
+}
diff -r ca12928cdafe mm/swapfile.c
--- a/mm/swapfile.c	Mon Jun 08 12:23:24 2009 +0100
+++ b/mm/swapfile.c	Tue Jun 16 11:27:44 2009 -0600
@@ -284,6 +284,7 @@ static int swap_entry_free(struct swap_i
 				swap_list.next = p - swap_info;
 			nr_swap_pages++;
 			p->inuse_pages--;
+			preswap_flush(p - swap_info, offset);
 		}
 	}
 	return count;
@@ -623,7 +624,7 @@ static int unuse_mm(struct mm_struct *mm
  * Recycle to start on reaching the end, returning 0 when empty.
  */
 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
-					unsigned int prev)
+				unsigned int prev, unsigned int preswap)
 {
 	unsigned int max = si->max;
 	unsigned int i = prev;
@@ -649,6 +650,12 @@ static unsigned int find_next_to_unuse(s
 			prev = 0;
 			i = 1;
 		}
+		if (preswap) {
+			if (preswap_test(si, i))
+				break;
+			else
+				continue;
+		}
 		count = si->swap_map[i];
 		if (count && count != SWAP_MAP_BAD)
 			break;
@@ -660,8 +667,12 @@ static unsigned int find_next_to_unuse(s
  * We completely avoid races by reading each swap page in advance,
  * and then search for the process using it.  All the necessary
  * page table adjustments can then be made atomically.
+ *
+ * if the boolean preswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages
  */
-static int try_to_unuse(unsigned int type)
+static int try_to_unuse(unsigned int type, unsigned int preswap,
+		unsigned long pages_to_unuse)
 {
 	struct swap_info_struct * si = &swap_info[type];
 	struct mm_struct *start_mm;
@@ -697,7 +708,7 @@ static int try_to_unuse(unsigned int typ
 	 * one pass through swap_map is enough, but not necessarily:
 	 * there are races when an instance of an entry might be missed.
 	 */
-	while ((i = find_next_to_unuse(si, i)) != 0) {
+	while ((i = find_next_to_unuse(si, i, preswap)) != 0) {
 		if (signal_pending(current)) {
 			retval = -EINTR;
 			break;
@@ -876,6 +887,8 @@ static int try_to_unuse(unsigned int typ
 		 * interactive performance.
 		 */
 		cond_resched();
+		if (preswap && pages_to_unuse && !--pages_to_unuse)
+			break;
 	}
 
 	mmput(start_mm);
@@ -1198,7 +1211,7 @@ asmlinkage long sys_swapoff(const char _
 	spin_unlock(&swap_lock);
 
 	current->flags |= PF_SWAPOFF;
-	err = try_to_unuse(type);
+	err = try_to_unuse(type, 0, 0);
 	current->flags &= ~PF_SWAPOFF;
 
 	if (err) {
@@ -1242,9 +1255,14 @@ asmlinkage long sys_swapoff(const char _
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
 	p->flags = 0;
+	preswap_flush_area(p - swap_info);
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+#ifdef CONFIG_PRESWAP
+	if (p->preswap_map)
+		vfree(p->preswap_map);
+#endif
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
@@ -1553,6 +1571,11 @@ asmlinkage long sys_swapon(const char __
 
 		error = 0;
 		memset(p->swap_map, 0, maxpages * sizeof(short));
+#ifdef CONFIG_PRESWAP
+		p->preswap_map = vmalloc(maxpages / sizeof(long));
+		if (p->preswap_map)
+			memset(p->preswap_map, 0, maxpages / sizeof(long));
+#endif
 		for (i = 0; i < swap_header->info.nr_badpages; i++) {
 			int page_nr = swap_header->info.badpages[i];
 			if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
@@ -1615,6 +1638,7 @@ asmlinkage long sys_swapon(const char __
 	} else {
 		swap_info[prev].next = p - swap_info;
 	}
+	preswap_init(p - swap_info);
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	error = 0;
@@ -1729,6 +1753,8 @@ int valid_swaphandles(swp_entry_t entry,
 
 	if (!page_cluster)	/* no readahead */
 		return 0;
+	if (preswap_test(swapdev, swp_offset(entry)))
+		return 0;
 	toff = (swp_offset(entry) >> page_cluster) << page_cluster;
 	if (!toff)		/* first page is swap header */
 		toff++, i--;
@@ -1744,9 +1770,101 @@ int valid_swaphandles(swp_entry_t entry,
 			break;
 		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
 			break;
+		/* Don't read in preswap pages */
+		if (preswap_test(swapdev, toff))
+			break;
 		toff++;
 		ret++;
 	} while (--i);
 	spin_unlock(&swap_lock);
 	return ret;
 }
+
+#ifdef CONFIG_PRESWAP
+/*
+ * preswap infrastructure functions
+ */
+
+/* code structure leveraged from sys_swapoff */
+void preswap_shrink(unsigned long target_pages)
+{
+	struct swap_info_struct *si = NULL;
+	unsigned long total_pages = 0, total_pages_to_unuse;
+	unsigned long pages = 0, unuse_pages = 0;
+	int type;
+	int wrapped = 0;
+
+	do {
+		/*
+		 * we don't want to hold swap_lock while doing a very
+		 * lengthy try_to_unuse, but swap_list may change
+		 * so restart scan from swap_list.head each time
+		 */
+		spin_lock(&swap_lock);
+		total_pages = 0;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			total_pages += si->preswap_pages;
+		}
+		if (total_pages <= target_pages) {
+			spin_unlock(&swap_lock);
+			return;
+		}
+		total_pages_to_unuse = total_pages - target_pages;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			if (total_pages_to_unuse < si->preswap_pages)
+				pages = unuse_pages = total_pages_to_unuse;
+			else {
+				pages = si->preswap_pages;
+				unuse_pages = 0; /* unuse all */
+			}
+			if (security_vm_enough_memory(pages))
+				continue;
+			vm_unacct_memory(pages);
+			break;
+		}
+		spin_unlock(&swap_lock);
+		if (type < 0)
+			return;
+		current->flags |= PF_SWAPOFF;
+		(void)try_to_unuse(type, 1, unuse_pages);
+		current->flags &= ~PF_SWAPOFF;
+		wrapped++;
+	} while (wrapped <= 3);
+}
+
+
+#ifdef CONFIG_SYSCTL
+/* cat /proc/sys/vm/preswap provides the total number of pages in preswap
+ * across all swaptypes.  echo N > /proc/sys/vm/preswap attempts to shrink
+ * preswap page usage to N (usually 0) */
+int preswap_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	unsigned long npages;
+	int type;
+	unsigned long totalpages = 0;
+	struct swap_info_struct *si = NULL;
+
+	/* modeled after hugetlb_sysctl_handler in mm/hugetlb.c */
+	if (!write) {
+		spin_lock(&swap_lock);
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			totalpages += si->preswap_pages;
+		}
+		spin_unlock(&swap_lock);
+		npages = totalpages;
+	}
+	table->data = &npages;
+	table->maxlen = sizeof(unsigned long);
+	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+
+	if (write)
+		preswap_shrink(npages);
+
+	return 0;
+}
+#endif
+#endif /* CONFIG_PRESWAP */
diff -r ca12928cdafe mm/tmem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mm/tmem.c	Tue Jun 16 11:27:44 2009 -0600
@@ -0,0 +1,41 @@
+/*
+ * Xen implementation for transcendent memory (tmem)
+ *
+ * Dan Magenheimer <dan.magenheimer@oracle.com> 2009
+ */
+
+#include <linux/types.h>
+#include <xen/interface/xen.h>
+#include <asm/hypervisor.h>
+
+int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, u64 object, u32 index,
+	unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
+{
+	struct tmem_op op;
+	int rc = 0;
+
+	op.cmd = tmem_cmd;
+	op.pool_id = tmem_pool;
+	op.u.gen.object = object;
+	op.u.gen.index = index;
+	op.u.gen.tmem_offset = tmem_offset;
+	op.u.gen.pfn_offset = pfn_offset;
+	op.u.gen.len = len;
+	op.u.gen.gmfn = gmfn;
+	rc = HYPERVISOR_tmem_op(&op);
+	return rc;
+}
+
+int xen_tmem_new_pool(uint32_t tmem_cmd, uint64_t uuid_lo,
+	uint64_t uuid_hi, uint32_t flags)
+{
+	struct tmem_op op;
+	int rc = 0;
+
+	op.cmd = tmem_cmd;
+	op.u.new.uuid[0] = uuid_lo;
+	op.u.new.uuid[1] = uuid_hi;
+	op.u.new.flags = flags;
+	rc = HYPERVISOR_tmem_op(&op);
+	return rc;
+}
diff -r ca12928cdafe mm/tmem.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mm/tmem.h	Tue Jun 16 11:27:44 2009 -0600
@@ -0,0 +1,115 @@
+/*
+ * linux/mm/tmem.h
+ *
+ * Interface to transcendent memory, used by mm/precache.c and mm/preswap.c
+ * Currently implemented on XEN, but may be implemented elsewhere in future.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
+
+
+#define TMEM_CONTROL               0
+#define TMEM_NEW_POOL              1
+#define TMEM_DESTROY_POOL          2
+#define TMEM_NEW_PAGE              3
+#define TMEM_PUT_PAGE              4
+#define TMEM_GET_PAGE              5
+#define TMEM_FLUSH_PAGE            6
+#define TMEM_FLUSH_OBJECT          7
+#define TMEM_READ                  8
+#define TMEM_WRITE                 9
+#define TMEM_XCHG                 10
+
+/* Subops for HYPERVISOR_tmem_op(TMEM_CONTROL) */
+#define TMEMC_THAW                 0
+#define TMEMC_FREEZE               1
+#define TMEMC_FLUSH                2
+#define TMEMC_DESTROY              3
+#define TMEMC_LIST                 4
+#define TMEMC_SET_WEIGHT           5
+#define TMEMC_SET_CAP              6
+#define TMEMC_SET_COMPRESS         7
+
+/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
+#define TMEM_POOL_PERSIST          1
+#define TMEM_POOL_SHARED           2
+#define TMEM_POOL_PAGESIZE_SHIFT   4
+#define TMEM_POOL_MIN_PAGESHIFT   12
+#define TMEM_POOL_PAGEORDER       (PAGE_SHIFT - TMEM_POOL_MIN_PAGESHIFT)
+#define TMEM_POOL_PAGESIZE_MASK  0xf
+#define TMEM_POOL_VERSION_SHIFT   24
+#define TMEM_POOL_VERSION_MASK  0xff
+
+/* Special errno values */
+#define EFROZEN                 1000
+#define EEMPTY                  1001
+
+#ifdef CONFIG_XEN
+extern int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, u64 object, u32 index,
+	unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len);
+extern int xen_tmem_new_pool(u32 tmem_cmd, u64 uuid_lo, u64 uuid_hi, u32 flags);
+
+static inline int tmem_put_page(u32 pool_id, u64 object, u32 index,
+	unsigned long gmfn)
+{
+	return xen_tmem_op(TMEM_PUT_PAGE, pool_id, object, index,
+		gmfn, 0, 0, 0);
+}
+
+static inline int tmem_get_page(u32 pool_id, u64 object, u32 index,
+	unsigned long gmfn)
+{
+	return xen_tmem_op(TMEM_GET_PAGE, pool_id, object, index,
+		gmfn, 0, 0, 0);
+}
+
+static inline int tmem_flush_page(u32 pool_id, u64 object, u32 index)
+{
+	return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, object, index,
+		0, 0, 0, 0);
+}
+
+static inline int tmem_flush_object(u32 pool_id, u64 object)
+{
+	return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, object, 0, 0, 0, 0, 0);
+}
+
+static inline int tmem_new_pool(u64 uuid_lo, u64 uuid_hi, u32 flags)
+{
+	BUILD_BUG_ON((TMEM_POOL_PAGEORDER < 0) ||
+		(TMEM_POOL_PAGEORDER >= TMEM_POOL_PAGESIZE_MASK));
+	flags |= TMEM_POOL_PAGEORDER << TMEM_POOL_PAGESIZE_SHIFT;
+	return xen_tmem_new_pool(TMEM_NEW_POOL, uuid_lo, uuid_hi, flags);
+}
+
+static inline int tmem_destroy_pool(u32 pool_id)
+{
+	return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, 0, 0, 0, 0, 0, 0);
+}
+#else
+struct tmem_op {
+	u32 cmd;
+	s32 pool_id; /* private > 0; shared < 0; 0 is invalid */
+	union {
+		struct {  /* for cmd == TMEM_NEW_POOL */
+			u64 uuid[2];
+			u32 flags;
+		} new;
+		struct {  /* for cmd == TMEM_CONTROL */
+			u32 subop;
+			u32 cli_id;
+			u32 arg1;
+			u32 arg2;
+			void *buf;
+		} ctrl;
+		struct {
+			u64 object;
+			u32 index;
+			u32 tmem_offset;
+			u32 pfn_offset;
+			u32 len;
+			unsigned long pfn;  /* page frame */
+		} gen;
+	} u;
+};
+#endif
diff -r ca12928cdafe mm/truncate.c
--- a/mm/truncate.c	Mon Jun 08 12:23:24 2009 +0100
+++ b/mm/truncate.c	Tue Jun 16 11:27:44 2009 -0600
@@ -12,13 +12,14 @@
 #include <linux/module.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
+#include <linux/precache.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
-
 
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
 	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+	precache_flush(page->mapping, page->index);
 	if (PagePrivate(page))
 		do_invalidatepage(page, partial);
 }
@@ -46,6 +47,10 @@ truncate_complete_page(struct address_sp
 	ClearPageUptodate(page);
 	ClearPageMappedToDisk(page);
 	remove_from_page_cache(page);
+	/* this must be after the remove_from_page_cache which
+	 * calls precache_put
+	 */
+	precache_flush(mapping, page->index);
 	page_cache_release(page);	/* pagecache ref */
 }
 
@@ -118,6 +123,7 @@ void truncate_inode_pages_range(struct a
 	pgoff_t next;
 	int i;
 
+	precache_flush_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
@@ -191,6 +197,7 @@ void truncate_inode_pages_range(struct a
 		}
 		pagevec_release(&pvec);
 	}
+	precache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
 
@@ -324,6 +331,7 @@ int invalidate_inode_pages2_range(struct
 	int did_range_unmap = 0;
 	int wrapped = 0;
 
+	precache_flush_inode(mapping);
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (next <= end && !ret && !wrapped &&
@@ -379,6 +387,7 @@ int invalidate_inode_pages2_range(struct
 		pagevec_release(&pvec);
 		cond_resched();
 	}
+	precache_flush_inode(mapping);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);



* Re: [PATCH] [linux-2.6.18-xen] (take 3) transcendent memory ("tmem") linux-side changes
  2009-06-16 18:02 [PATCH] [linux-2.6.18-xen] (take 3) transcendent memory ("tmem") linux-side changes Dan Magenheimer
@ 2009-06-17  6:31 ` Keir Fraser
  2009-06-17 16:11   ` Dan Magenheimer
  0 siblings, 1 reply; 3+ messages in thread
From: Keir Fraser @ 2009-06-17  6:31 UTC
  To: Dan Magenheimer, Xen-Devel (E-mail); +Cc: Jan Beulich

On 16/06/2009 19:02, "Dan Magenheimer" <dan.magenheimer@oracle.com> wrote:

> (take 3: no anonymous unions; match Jan's xen-side patch)
> 
> Transcendent memory ("tmem") for Linux

I've now synced the Xen headers over to Linux 2.6.18, so please rebase the
patch on that. This means you will not need to modify anything under
include/xen/interface, and probably most of mm/tmem.h goes as well.

 -- Keir


* RE: [PATCH] [linux-2.6.18-xen] (take 3) transcendent memory ("tmem") linux-side changes
  2009-06-17  6:31 ` Keir Fraser
@ 2009-06-17 16:11   ` Dan Magenheimer
  0 siblings, 0 replies; 3+ messages in thread
From: Dan Magenheimer @ 2009-06-17 16:11 UTC
  To: Keir Fraser, Xen-Devel (E-mail); +Cc: Jan Beulich

[-- Attachment #1: Type: text/plain, Size: 820 bytes --]

Attached rebased against 906 in the staging tree.

> -----Original Message-----
> From: Keir Fraser [mailto:keir.fraser@eu.citrix.com]
> Sent: Wednesday, June 17, 2009 12:32 AM
> To: Dan Magenheimer; Xen-Devel (E-mail)
> Cc: Jan Beulich
> Subject: Re: [Xen-devel] [PATCH] [linux-2.6.18-xen] (take 3)
> transcendent memory ("tmem") linux-side changes
> 
> 
> On 16/06/2009 19:02, "Dan Magenheimer" 
> <dan.magenheimer@oracle.com> wrote:
> 
> > (take 3: no anonymous unions; match Jan's xen-side patch)
> > 
> > Transcendent memory ("tmem") for Linux
> 
> I've now synced the Xen headers over to Linux 2.6.18, so 
> please rebase the
> patch on that. This means you will not need to modify anything under
> include/xen/interface, and probably most of mm/tmem.h goes as well.
> 
>  -- Keir

[-- Attachment #2: tmem-linux-2.6.18-xen-906-090617.patch --]
[-- Type: application/octet-stream, Size: 35969 bytes --]

diff -r 3776d277956f fs/buffer.c
--- a/fs/buffer.c	Wed Jun 17 09:07:23 2009 +0100
+++ b/fs/buffer.c	Wed Jun 17 09:45:39 2009 -0600
@@ -41,6 +41,7 @@
 #include <linux/bitops.h>
 #include <linux/mpage.h>
 #include <linux/bit_spinlock.h>
+#include <linux/precache.h>
 
 static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 static void invalidate_bh_lrus(void);
@@ -482,6 +483,11 @@ void invalidate_bdev(struct block_device
 	 * that, but not until that's cleaned up.
 	 */
 	invalidate_inode_pages(mapping);
+
+	/* 99% of the time, we don't need to flush the precache on the bdev.
+	 * But, for the strange corners, let's be cautious
+	 */
+	precache_flush_inode(mapping);
 }
 
 /*
diff -r 3776d277956f fs/ext3/super.c
--- a/fs/ext3/super.c	Wed Jun 17 09:07:23 2009 +0100
+++ b/fs/ext3/super.c	Wed Jun 17 09:45:39 2009 -0600
@@ -35,6 +35,7 @@
 #include <linux/namei.h>
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
+#include <linux/precache.h>
 
 #include <asm/uaccess.h>
 
@@ -1167,6 +1168,7 @@ static int ext3_setup_super(struct super
 	} else {
 		printk("internal journal\n");
 	}
+	precache_init(sb);
 	return res;
 }
 
diff -r 3776d277956f fs/mpage.c
--- a/fs/mpage.c	Wed Jun 17 09:07:23 2009 +0100
+++ b/fs/mpage.c	Wed Jun 17 09:45:39 2009 -0600
@@ -26,6 +26,7 @@
 #include <linux/writeback.h>
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
+#include <linux/precache.h>
 
 /*
  * I/O completion handler for multipage BIOs.
@@ -296,6 +297,13 @@ do_mpage_readpage(struct bio *bio, struc
 		}
 	} else if (fully_mapped) {
 		SetPageMappedToDisk(page);
+	}
+
+	if (fully_mapped &&
+	    blocks_per_page == 1 && !PageUptodate(page) &&
+	    precache_get(page->mapping, page->index, page) == 1) {
+		SetPageUptodate(page);
+		goto confused;
 	}
 
 	/*
diff -r 3776d277956f fs/ocfs2/super.c
--- a/fs/ocfs2/super.c	Wed Jun 17 09:07:23 2009 +0100
+++ b/fs/ocfs2/super.c	Wed Jun 17 09:45:39 2009 -0600
@@ -39,6 +39,7 @@
 #include <linux/parser.h>
 #include <linux/crc32.h>
 #include <linux/debugfs.h>
+#include <linux/precache.h>
 
 #include <cluster/nodemanager.h>
 
@@ -1457,16 +1458,18 @@ static int ocfs2_initialize_super(struct
 
 	di = (struct ocfs2_dinode *) bitmap_bh->b_data;
 	osb->bitmap_cpg = le16_to_cpu(di->id2.i_chain.cl_cpg);
-	brelse(bitmap_bh);
 	mlog(0, "cluster bitmap inode: %llu, clusters per group: %u\n",
 	     (unsigned long long)osb->bitmap_blkno, osb->bitmap_cpg);
 
 	status = ocfs2_init_slot_info(osb);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_rel;
 	}
+	shared_precache_init(sb, &di->id2.i_super.s_uuid[0]);
 
+bail_rel:
+	brelse(bitmap_bh);
 bail:
 	mlog_exit(status);
 	return status;
diff -r 3776d277956f fs/super.c
--- a/fs/super.c	Wed Jun 17 09:07:23 2009 +0100
+++ b/fs/super.c	Wed Jun 17 09:45:39 2009 -0600
@@ -37,6 +37,7 @@
 #include <linux/idr.h>
 #include <linux/kobject.h>
 #include <linux/mutex.h>
+#include <linux/precache.h>
 #include <asm/uaccess.h>
 
 
@@ -93,6 +94,9 @@ static struct super_block *alloc_super(s
 		s->s_qcop = sb_quotactl_ops;
 		s->s_op = &default_op;
 		s->s_time_gran = 1000000000;
+#ifdef CONFIG_PRECACHE
+		s->precache_poolid = -1;
+#endif
 	}
 out:
 	return s;
@@ -181,6 +185,7 @@ void deactivate_super(struct super_block
 		DQUOT_OFF(s);
 		down_write(&s->s_umount);
 		fs->kill_sb(s);
+		precache_flush_filesystem(s);
 		put_filesystem(fs);
 		put_super(s);
 	}
@@ -777,6 +782,9 @@ int get_sb_nodev(struct file_system_type
 		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
+#ifdef CONFIG_PRECACHE
+	s->precache_poolid = -2;
+#endif
 	return simple_set_mnt(mnt, s);
 }
 
diff -r 3776d277956f include/asm-i386/mach-xen/asm/hypercall.h
--- a/include/asm-i386/mach-xen/asm/hypercall.h	Wed Jun 17 09:07:23 2009 +0100
+++ b/include/asm-i386/mach-xen/asm/hypercall.h	Wed Jun 17 09:45:39 2009 -0600
@@ -404,6 +404,12 @@ HYPERVISOR_kexec_op(
 	return _hypercall2(int, kexec_op, op, args);
 }
 
+static inline int __must_check
+HYPERVISOR_tmem_op(
+	struct tmem_op *op)
+{
+	return _hypercall1(int, tmem_op, op);
+}
 
 
 #endif /* __HYPERCALL_H__ */
diff -r 3776d277956f include/asm-i386/mach-xen/asm/hypervisor.h
--- a/include/asm-i386/mach-xen/asm/hypervisor.h	Wed Jun 17 09:07:23 2009 +0100
+++ b/include/asm-i386/mach-xen/asm/hypervisor.h	Wed Jun 17 09:45:39 2009 -0600
@@ -43,6 +43,7 @@
 #include <xen/interface/physdev.h>
 #include <xen/interface/sched.h>
 #include <xen/interface/nmi.h>
+#include <xen/interface/tmem.h>
 #include <asm/ptrace.h>
 #include <asm/page.h>
 #if defined(__i386__)
diff -r 3776d277956f include/asm-x86_64/mach-xen/asm/hypercall.h
--- a/include/asm-x86_64/mach-xen/asm/hypercall.h	Wed Jun 17 09:07:23 2009 +0100
+++ b/include/asm-x86_64/mach-xen/asm/hypercall.h	Wed Jun 17 09:45:39 2009 -0600
@@ -412,4 +412,11 @@ HYPERVISOR_kexec_op(
 	return _hypercall2(int, kexec_op, op, args);
 }
 
+static inline int __must_check
+HYPERVISOR_tmem_op(
+	struct tmem_op *op)
+{
+	return _hypercall1(int, tmem_op, op);
+}
+
 #endif /* __HYPERCALL_H__ */
diff -r 3776d277956f include/linux/fs.h
--- a/include/linux/fs.h	Wed Jun 17 09:07:23 2009 +0100
+++ b/include/linux/fs.h	Wed Jun 17 09:45:39 2009 -0600
@@ -907,6 +907,9 @@ struct super_block {
 	/* Granularity of c/m/atime in ns.
 	   Cannot be worse than a second */
 	u32		   s_time_gran;
+#ifdef CONFIG_PRECACHE
+	u32                precache_poolid;
+#endif
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
diff -r 3776d277956f include/linux/precache.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/include/linux/precache.h	Wed Jun 17 09:45:39 2009 -0600
@@ -0,0 +1,55 @@
+#ifndef _LINUX_PRECACHE_H
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+
+#ifdef CONFIG_PRECACHE
+extern void precache_init(struct super_block *sb);
+extern void shared_precache_init(struct super_block *sb, char *uuid);
+extern int precache_get(struct address_space *mapping, unsigned long index,
+	       struct page *empty_page);
+extern int precache_put(struct address_space *mapping, unsigned long index,
+		struct page *page);
+extern int precache_flush(struct address_space *mapping, unsigned long index);
+extern int precache_flush_inode(struct address_space *mapping);
+extern int precache_flush_filesystem(struct super_block *s);
+#else
+static inline void precache_init(struct super_block *sb)
+{
+}
+
+static inline void shared_precache_init(struct super_block *sb, char *uuid)
+{
+}
+
+static inline int precache_get(struct address_space *mapping,
+		unsigned long index, struct page *empty_page)
+{
+	return 0;
+}
+
+static inline int precache_put(struct address_space *mapping,
+		unsigned long index, struct page *page)
+{
+	return 0;
+}
+
+static inline int precache_flush(struct address_space *mapping,
+		unsigned long index)
+{
+	return 0;
+}
+
+static inline int precache_flush_inode(struct address_space *mapping)
+{
+	return 0;
+}
+
+static inline int precache_flush_filesystem(struct super_block *s)
+{
+	return 0;
+}
+#endif
+
+#define _LINUX_PRECACHE_H
+#endif /* _LINUX_PRECACHE_H */
diff -r 3776d277956f include/linux/swap.h
--- a/include/linux/swap.h	Wed Jun 17 09:07:23 2009 +0100
+++ b/include/linux/swap.h	Wed Jun 17 09:45:39 2009 -0600
@@ -6,6 +6,7 @@
 #include <linux/mmzone.h>
 #include <linux/list.h>
 #include <linux/sched.h>
+#include <linux/vmalloc.h>
 
 #include <asm/atomic.h>
 #include <asm/page.h>
@@ -143,8 +144,59 @@ struct swap_info_struct {
 	unsigned int pages;
 	unsigned int max;
 	unsigned int inuse_pages;
+#ifdef CONFIG_PRESWAP
+	unsigned long *preswap_map;
+	unsigned int preswap_pages;
+#endif
 	int next;			/* next entry on swap list */
 };
+
+#ifdef CONFIG_PRESWAP
+
+#include <linux/sysctl.h>
+extern int preswap_sysctl_handler(struct ctl_table *, int, struct file *,
+	void __user *, size_t *, loff_t *);
+extern const unsigned long preswap_zero, preswap_infinity;
+
+extern void preswap_shrink(unsigned long);
+extern int preswap_test(struct swap_info_struct *, unsigned long);
+extern void preswap_init(unsigned);
+extern int preswap_put(struct page *);
+extern int preswap_get(struct page *);
+extern void preswap_flush(unsigned, unsigned long);
+extern void preswap_flush_area(unsigned);
+#else
+static inline void preswap_shrink(unsigned long target_pages)
+{
+}
+
+static inline int preswap_test(struct swap_info_struct *sis, unsigned long offset)
+{
+	return 0;
+}
+
+static inline void preswap_init(unsigned type)
+{
+}
+
+static inline int preswap_put(struct page *page)
+{
+	return 0;
+}
+
+static inline int preswap_get(struct page *page)
+{
+	return 0;
+}
+
+static inline void preswap_flush(unsigned type, unsigned long offset)
+{
+}
+
+static inline void preswap_flush_area(unsigned type)
+{
+}
+#endif /* CONFIG_PRESWAP */
 
 struct swap_list_t {
 	int head;	/* head of priority-ordered swapfile list */
@@ -247,7 +299,6 @@ extern int can_share_swap_page(struct pa
 extern int can_share_swap_page(struct page *);
 extern int remove_exclusive_swap_page(struct page *);
 struct backing_dev_info;
-
 extern spinlock_t swap_lock;
 
 /* linux/mm/thrash.c */
diff -r 3776d277956f include/linux/sysctl.h
--- a/include/linux/sysctl.h	Wed Jun 17 09:07:23 2009 +0100
+++ b/include/linux/sysctl.h	Wed Jun 17 09:45:39 2009 -0600
@@ -200,6 +200,7 @@ enum
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
+	VM_PRESWAP_PAGES=36,	/* pages/target_pages in preswap */
 };
 
 
diff -r 3776d277956f kernel/sysctl.c
--- a/kernel/sysctl.c	Wed Jun 17 09:07:23 2009 +0100
+++ b/kernel/sysctl.c	Wed Jun 17 09:45:39 2009 -0600
@@ -965,6 +965,18 @@ static ctl_table vm_table[] = {
 		.proc_handler	= &proc_dointvec,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &zero,
+	},
+#endif
+#ifdef CONFIG_PRESWAP
+	{
+		.ctl_name	= VM_PRESWAP_PAGES,
+		.procname	= "preswap",
+		.data		= NULL,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &preswap_sysctl_handler,
+		.extra1		= (void *)&preswap_zero,
+		.extra2		= (void *)&preswap_infinity,
 	},
 #endif
 	{ .ctl_name = 0 }
diff -r 3776d277956f mm/Kconfig
--- a/mm/Kconfig	Wed Jun 17 09:07:23 2009 +0100
+++ b/mm/Kconfig	Wed Jun 17 09:45:39 2009 -0600
@@ -152,3 +152,33 @@ config RESOURCES_64BIT
 	default 64BIT
 	help
 	  This option allows memory and IO resources to be 64 bit.
+
+#
+# support for transcendent memory
+#
+config TMEM
+	bool
+	depends on XEN
+	help
+	  In a virtualized environment, allows unused and underutilized
+	  system physical memory to be made accessible through a narrow
+	  well-defined page-copy-based API.  If unsure, say Y.
+
+config PRECACHE
+	bool "Cache clean pages in transcendent memory"
+	depends on XEN
+	select TMEM
+	help
+	  Allows the transcendent memory pool to be used to store clean
+	  page-cache pages which, under some circumstances, will greatly
+	  reduce paging and thus improve performance.  If unsure, say Y.
+
+config PRESWAP
+	bool "Swap pages to transcendent memory"
+	depends on XEN
+	select TMEM
+	help
+	  Allows the transcendent memory pool to be used as a pseudo-swap
+	  device which, under some circumstances, will greatly reduce
+	  swapping and thus improve performance.  If unsure, say Y.
+
diff -r 3776d277956f mm/Makefile
--- a/mm/Makefile	Wed Jun 17 09:07:23 2009 +0100
+++ b/mm/Makefile	Wed Jun 17 09:45:39 2009 -0600
@@ -13,6 +13,9 @@ obj-y			:= bootmem.o filemap.o mempool.o
 			   prio_tree.o util.o mmzone.o vmstat.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o thrash.o
+obj-$(CONFIG_TMEM)	+= tmem.o
+obj-$(CONFIG_PRESWAP)	+= preswap.o
+obj-$(CONFIG_PRECACHE)	+= precache.o
 obj-$(CONFIG_HUGETLBFS)	+= hugetlb.o
 obj-$(CONFIG_NUMA) 	+= mempolicy.o
 obj-$(CONFIG_SPARSEMEM)	+= sparse.o
diff -r 3776d277956f mm/filemap.c
--- a/mm/filemap.c	Wed Jun 17 09:07:23 2009 +0100
+++ b/mm/filemap.c	Wed Jun 17 09:45:39 2009 -0600
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
+#include <linux/precache.h>
 #include "filemap.h"
 #include "internal.h"
 
@@ -115,6 +116,16 @@ void __remove_from_page_cache(struct pag
 void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
+
+	/*
+	 * if we're uptodate, flush out into the precache, otherwise
+	 * invalidate any existing precache entries.  We can't leave
+	 * stale data around in the precache once our page is gone
+	 */
+	if (PageUptodate(page))
+		precache_put(page->mapping, page->index, page);
+	else
+		precache_flush(page->mapping, page->index);
 
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
diff -r 3776d277956f mm/page_io.c
--- a/mm/page_io.c	Wed Jun 17 09:07:23 2009 +0100
+++ b/mm/page_io.c	Wed Jun 17 09:45:40 2009 -0600
@@ -99,10 +99,18 @@ int swap_writepage(struct page *page, st
 		ret = -ENOMEM;
 		goto out;
 	}
+
+	set_page_writeback(page);
+	if (preswap_put(page) == 1) {
+		unlock_page(page);
+		end_page_writeback(page);
+		bio_put(bio);
+		goto out;
+	}
+
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		rw |= (1 << BIO_RW_SYNC);
 	count_vm_event(PSWPOUT);
-	set_page_writeback(page);
 	unlock_page(page);
 	submit_bio(rw, bio);
 out:
@@ -116,6 +124,13 @@ int swap_readpage(struct file *file, str
 
 	BUG_ON(!PageLocked(page));
 	ClearPageUptodate(page);
+
+	if (preswap_get(page) == 1) {
+		SetPageUptodate(page);
+		unlock_page(page);
+		goto out;
+	}
+
 	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
 				end_swap_bio_read);
 	if (bio == NULL) {
diff -r 3776d277956f mm/precache.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mm/precache.c	Wed Jun 17 09:45:40 2009 -0600
@@ -0,0 +1,140 @@
+/*
+ * linux/mm/precache.c
+ *
+ * Implements "precache" for filesystems/pagecache on top of transcendent
+ * memory ("tmem") API.  A filesystem creates an "ephemeral tmem pool"
+ * and retains the returned pool_id in its superblock.  Clean pages evicted
+ * from pagecache may be "put" into the pool and associated with a "handle"
+ * consisting of the pool_id, an object (inode) id, and an index (page offset).
+ * Note that the page is copied to tmem; no kernel mappings are changed.
+ * If the page is later needed, the filesystem (or VFS) issues a "get", passing
+ * the same handle and an empty pageframe.  If successful, the page is copied
+ * into the pageframe and a disk read is avoided.  But since the tmem pool
+ * is of indeterminate size, a "put" page has indeterminate longevity
+ * ("ephemeral"), and the "get" may fail, in which case the filesystem must
+ * read the page from disk as before.  Note that the filesystem/pagecache are
+ * responsible for maintaining coherency between the pagecache, precache,
+ * and the disk, for which "flush page" and "flush object" actions are
+ * provided.  And when a filesystem is unmounted, it must "destroy" the pool.
+ *
+ * Two types of pools may be created for a precache: "private" or "shared".
+ * For a private pool, a successful "get" always flushes, implementing
+ * exclusive semantics; for a "shared" pool (which is intended for use by
+ * co-resident nodes of a cluster filesystem), the "flush" is not guaranteed.
+ * In either case, a failed "duplicate" put (overwrite) always guarantees
+ * that the old data is flushed.
+ *
+ * Note also that multiple accesses to a tmem pool may be concurrent and any
+ * ordering must be guaranteed by the caller.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
+
+#include <linux/precache.h>
+#include <linux/module.h>
+#include "tmem.h"
+
+static int precache_auto_allocate; /* set to 1 to auto_allocate */
+
+int precache_put(struct address_space *mapping, unsigned long index,
+ struct page *page)
+{
+	u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+	u64 obj = (unsigned long) mapping->host->i_ino;
+	u32 ind = (u32) index;
+	unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
+	int ret;
+
+	if ((s32)tmem_pool < 0) {
+		if (!precache_auto_allocate)
+			return 0;
+		/* a put on a non-existent precache may auto-allocate one */
+		ret = tmem_new_pool(0, 0, 0);
+		if (ret < 0)
+			return 0;
+		tmem_pool = ret;	/* record the newly allocated pool id */
+		printk(KERN_INFO
+			"Mapping superblock for s_id=%s to precache_id=%d\n",
+			mapping->host->i_sb->s_id, tmem_pool);
+		mapping->host->i_sb->precache_poolid = tmem_pool;
+	}
+	if (ind != index)
+		return 0;
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+	return tmem_put_page(tmem_pool, obj, ind, mfn);
+}
+
+int precache_get(struct address_space *mapping, unsigned long index,
+ struct page *empty_page)
+{
+	u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+	u64 obj = (unsigned long) mapping->host->i_ino;
+	u32 ind = (u32) index;
+	unsigned long mfn = pfn_to_mfn(page_to_pfn(empty_page));
+
+	if ((s32)tmem_pool < 0)
+		return 0;
+	if (ind != index)
+		return 0;
+
+	return tmem_get_page(tmem_pool, obj, ind, mfn);
+}
+EXPORT_SYMBOL(precache_get);
+
+int precache_flush(struct address_space *mapping, unsigned long index)
+{
+	u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+	u64 obj = (unsigned long) mapping->host->i_ino;
+	u32 ind = (u32) index;
+
+	if ((s32)tmem_pool < 0)
+		return 0;
+	if (ind != index)
+		return 0;
+
+	return tmem_flush_page(tmem_pool, obj, ind);
+}
+EXPORT_SYMBOL(precache_flush);
+
+int precache_flush_inode(struct address_space *mapping)
+{
+	u32 tmem_pool = mapping->host->i_sb->precache_poolid;
+	u64 obj = (unsigned long) mapping->host->i_ino;
+
+	if ((s32)tmem_pool < 0)
+		return 0;
+
+	return tmem_flush_object(tmem_pool, obj);
+}
+EXPORT_SYMBOL(precache_flush_inode);
+
+int precache_flush_filesystem(struct super_block *sb)
+{
+	u32 tmem_pool = sb->precache_poolid;
+	int ret;
+
+	if ((s32)tmem_pool < 0)
+		return 0;
+	ret = tmem_destroy_pool(tmem_pool);
+	if (!ret)
+		return 0;
+	printk(KERN_INFO
+		"Unmapping superblock for s_id=%s from precache_id=%d\n",
+		sb->s_id, tmem_pool);
+	sb->precache_poolid = -1; /* negative poolid means "no precache" */
+	return 1;
+}
+EXPORT_SYMBOL(precache_flush_filesystem);
+
+void precache_init(struct super_block *sb)
+{
+	sb->precache_poolid = tmem_new_pool(0, 0, 0);
+}
+EXPORT_SYMBOL(precache_init);
+
+void shared_precache_init(struct super_block *sb, char *uuid)
+{
+	u64 uuid_lo = *(u64 *)uuid;
+	u64 uuid_hi = *(u64 *)(&uuid[8]);
+
+	sb->precache_poolid = tmem_new_pool(uuid_lo, uuid_hi, TMEM_POOL_SHARED);
+}
+EXPORT_SYMBOL(shared_precache_init);
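
[Editor's note, illustrative only -- not part of the patch: the precache
entry points above slot into a filesystem read path roughly as in the
sketch below. It assumes a locked, not-yet-uptodate page as in ->readpage,
that a return of 1 from precache_get means the data was copied in, and a
hypothetical block mapper my_get_block.]

/* Sketch: consult the precache before issuing disk I/O. */
static int readpage_via_precache(struct file *file, struct page *page)
{
	if (precache_get(page->mapping, page->index, page) == 1) {
		/* hit: tmem copied the data into the pageframe */
		SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}
	/* miss: fall back to a normal block read */
	return mpage_readpage(page, my_get_block);
}
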
diff -r 3776d277956f mm/preswap.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mm/preswap.c	Wed Jun 17 09:45:40 2009 -0600
@@ -0,0 +1,184 @@
+/*
+ * linux/mm/preswap.c
+ *
+ * Implements a fast "preswap" on top of the transcendent memory ("tmem") API.
+ * When a swapdisk is enabled (with swapon), a "private persistent tmem pool"
+ * is created along with a bit-per-page preswap_map.  When swapping occurs
+ * and a page is about to be written to disk, a "put" into the pool may first
+ * be attempted by passing the pageframe to be swapped, along with a "handle"
+ * consisting of a pool_id, an object id, and an index.  Since the pool is of
+ * indeterminate size, the "put" may be rejected, in which case the page
+ * is swapped to disk as normal.  If the "put" is successful, the page is
+ * copied to tmem and the preswap_map records the success.  Later, when
+ * the page needs to be swapped in, the preswap_map is checked and, if set,
+ * the page may be obtained with a "get" operation.  Note that the swap
+ * subsystem is responsible for: maintaining coherency between the swapcache,
+ * preswap, and the swapdisk; for evicting stale pages from preswap; and for
+ * emptying preswap when swapoff is performed. The "flush page" and "flush
+ * object" actions are provided for this.
+ *
+ * Note that if a "duplicate put" is performed to overwrite a page and
+ * the "put" operation fails, the page (and old data) is flushed and lost.
+ * Also note that multiple accesses to a tmem pool may be concurrent and
+ * any ordering must be guaranteed by the caller.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
+
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/sysctl.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/proc_fs.h>
+#include <linux/security.h>
+#include <linux/capability.h>
+#include <linux/uaccess.h>
+#include "tmem.h"
+
+static u32 preswap_poolid = -1; /* if negative, preswap will never call tmem */
+
+const unsigned long preswap_zero = 0, preswap_infinity = ~0UL; /* for sysctl */
+
+/*
+ * Swizzling increases objects per swaptype, increasing tmem concurrency
+ * for heavy swaploads.  Later, a larger nr_cpus may warrant a larger
+ * SWIZ_BITS.
+ */
+#define SWIZ_BITS		4
+#define SWIZ_MASK		((1 << SWIZ_BITS) - 1)
+#define oswiz(_type, _ind)	((_type << SWIZ_BITS) | (_ind & SWIZ_MASK))
+#define iswiz(_ind)		(_ind >> SWIZ_BITS)
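+
+/*
+ * Worked example (SWIZ_BITS == 4): for type == 1 and ind == 0x1234,
+ * oswiz(1, 0x1234) == (1 << 4) | (0x1234 & 0xf) == 0x14 and
+ * iswiz(0x1234) == 0x123, so each swap type's pages are spread
+ * across 16 tmem objects.
+ */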
+
+/*
+ * preswap_map test/set/clear operations (must be atomic)
+ */
+
+int preswap_test(struct swap_info_struct *sis, unsigned long offset)
+{
+	if (!sis->preswap_map)
+		return 0;
+	return test_bit(offset % BITS_PER_LONG,
+		&sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void preswap_set(struct swap_info_struct *sis,
+				unsigned long offset)
+{
+	if (!sis->preswap_map)
+		return;
+	set_bit(offset % BITS_PER_LONG,
+		&sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+static inline void preswap_clear(struct swap_info_struct *sis,
+				unsigned long offset)
+{
+	if (!sis->preswap_map)
+		return;
+	clear_bit(offset % BITS_PER_LONG,
+		&sis->preswap_map[offset/BITS_PER_LONG]);
+}
+
+/*
+ * preswap tmem operations
+ */
+
+/* returns 1 if the page was successfully put into preswap, 0 if the page
+ * was declined, and -ERRNO for a specific error */
+int preswap_put(struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page), };
+	unsigned type = swp_type(entry);
+	pgoff_t offset = swp_offset(entry);
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int dup = 0, ret;
+
+	if ((s32)preswap_poolid < 0)
+		return 0;
+	if (ind64 != ind)
+		return 0;
+	if (preswap_test(sis, offset))
+		dup = 1;
+	mb(); /* ensure page is quiescent; tmem may address it with an alias */
+	ret = tmem_put_page(preswap_poolid, oswiz(type, ind), iswiz(ind), mfn);
+	if (ret == 1) {
+		preswap_set(sis, offset);
+		if (!dup)
+			sis->preswap_pages++;
+	} else if (dup) {
+		/* failed dup put always results in an automatic flush of
+		 * the (older) page from preswap */
+		preswap_clear(sis, offset);
+		sis->preswap_pages--;
+	}
+	return ret;
+}
+
+/* returns 1 if the page was successfully gotten from preswap, 0 if the page
+ * was not present (should never happen!), and -ERRNO for a specific error */
+int preswap_get(struct page *page)
+{
+	swp_entry_t entry = { .val = page_private(page), };
+	unsigned type = swp_type(entry);
+	pgoff_t offset = swp_offset(entry);
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int ret;
+
+	if ((s32)preswap_poolid < 0)
+		return 0;
+	if (ind64 != ind)
+		return 0;
+	if (!preswap_test(sis, offset))
+		return 0;
+	ret = tmem_get_page(preswap_poolid, oswiz(type, ind), iswiz(ind), mfn);
+	return ret;
+}
+
+/* flush a single page from preswap */
+void preswap_flush(unsigned type, unsigned long offset)
+{
+	u64 ind64 = (u64)offset;
+	u32 ind = (u32)offset;
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+
+	if ((s32)preswap_poolid < 0)
+		return;
+	if (ind64 != ind)
+		return;
+	if (preswap_test(sis, offset)) {
+		(void)tmem_flush_page(preswap_poolid,
+					oswiz(type, ind), iswiz(ind));
+		sis->preswap_pages--;
+		preswap_clear(sis, offset);
+	}
+}
+
+/* flush all pages from the passed swaptype */
+void preswap_flush_area(unsigned type)
+{
+	struct swap_info_struct *sis = get_swap_info_struct(type);
+	int ind;
+
+	if ((s32)preswap_poolid < 0)
+		return;
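+	/* one tmem_flush_object per swizzled object id covers every page */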
+	for (ind = SWIZ_MASK; ind >= 0; ind--)
+		(void)tmem_flush_object(preswap_poolid, oswiz(type, ind));
+	sis->preswap_pages = 0;
+}
+
+void preswap_init(unsigned type)
+{
+	/* only need one tmem pool for all swap types */
+	if ((s32)preswap_poolid >= 0)
+		return;
+	preswap_poolid = tmem_new_pool(0, 0, TMEM_POOL_PERSIST);
+	if ((s32)preswap_poolid < 0)
+		return;
+}
diff -r 3776d277956f mm/swapfile.c
--- a/mm/swapfile.c	Wed Jun 17 09:07:23 2009 +0100
+++ b/mm/swapfile.c	Wed Jun 17 09:45:40 2009 -0600
@@ -284,6 +284,7 @@ static int swap_entry_free(struct swap_i
 				swap_list.next = p - swap_info;
 			nr_swap_pages++;
 			p->inuse_pages--;
+			preswap_flush(p - swap_info, offset);
 		}
 	}
 	return count;
@@ -623,7 +624,7 @@ static int unuse_mm(struct mm_struct *mm
  * Recycle to start on reaching the end, returning 0 when empty.
  */
 static unsigned int find_next_to_unuse(struct swap_info_struct *si,
-					unsigned int prev)
+				unsigned int prev, unsigned int preswap)
 {
 	unsigned int max = si->max;
 	unsigned int i = prev;
@@ -649,6 +650,12 @@ static unsigned int find_next_to_unuse(s
 			prev = 0;
 			i = 1;
 		}
+		if (preswap) {
+			if (preswap_test(si, i))
+				break;
+			else
+				continue;
+		}
 		count = si->swap_map[i];
 		if (count && count != SWAP_MAP_BAD)
 			break;
@@ -660,8 +667,12 @@ static unsigned int find_next_to_unuse(s
  * We completely avoid races by reading each swap page in advance,
  * and then search for the process using it.  All the necessary
  * page table adjustments can then be made atomically.
+ *
+ * if the boolean preswap is true, only unuse pages_to_unuse pages;
+ * pages_to_unuse==0 means all pages
  */
-static int try_to_unuse(unsigned int type)
+static int try_to_unuse(unsigned int type, unsigned int preswap,
+		unsigned long pages_to_unuse)
 {
 	struct swap_info_struct * si = &swap_info[type];
 	struct mm_struct *start_mm;
@@ -697,7 +708,7 @@ static int try_to_unuse(unsigned int typ
 	 * one pass through swap_map is enough, but not necessarily:
 	 * there are races when an instance of an entry might be missed.
 	 */
-	while ((i = find_next_to_unuse(si, i)) != 0) {
+	while ((i = find_next_to_unuse(si, i, preswap)) != 0) {
 		if (signal_pending(current)) {
 			retval = -EINTR;
 			break;
@@ -876,6 +887,8 @@ static int try_to_unuse(unsigned int typ
 		 * interactive performance.
 		 */
 		cond_resched();
+		if (preswap && pages_to_unuse && !--pages_to_unuse)
+			break;
 	}
 
 	mmput(start_mm);
@@ -1198,7 +1211,7 @@ asmlinkage long sys_swapoff(const char _
 	spin_unlock(&swap_lock);
 
 	current->flags |= PF_SWAPOFF;
-	err = try_to_unuse(type);
+	err = try_to_unuse(type, 0, 0);
 	current->flags &= ~PF_SWAPOFF;
 
 	if (err) {
@@ -1242,9 +1255,14 @@ asmlinkage long sys_swapoff(const char _
 	swap_map = p->swap_map;
 	p->swap_map = NULL;
 	p->flags = 0;
+	preswap_flush_area(p - swap_info);
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	vfree(swap_map);
+#ifdef CONFIG_PRESWAP
+	if (p->preswap_map)
+		vfree(p->preswap_map);
+#endif
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
 		struct block_device *bdev = I_BDEV(inode);
@@ -1553,6 +1571,11 @@ asmlinkage long sys_swapon(const char __
 
 		error = 0;
 		memset(p->swap_map, 0, maxpages * sizeof(short));
+#ifdef CONFIG_PRESWAP
+		/* one bit per page: round the bitmap size up to whole longs */
+		p->preswap_map = vmalloc(BITS_TO_LONGS(maxpages) *
+					sizeof(long));
+		if (p->preswap_map)
+			memset(p->preswap_map, 0,
+				BITS_TO_LONGS(maxpages) * sizeof(long));
+#endif
 		for (i = 0; i < swap_header->info.nr_badpages; i++) {
 			int page_nr = swap_header->info.badpages[i];
 			if (page_nr <= 0 || page_nr >= swap_header->info.last_page)
@@ -1615,6 +1638,7 @@ asmlinkage long sys_swapon(const char __
 	} else {
 		swap_info[prev].next = p - swap_info;
 	}
+	preswap_init(p - swap_info);
 	spin_unlock(&swap_lock);
 	mutex_unlock(&swapon_mutex);
 	error = 0;
@@ -1729,6 +1753,8 @@ int valid_swaphandles(swp_entry_t entry,
 
 	if (!page_cluster)	/* no readahead */
 		return 0;
+	if (preswap_test(swapdev, swp_offset(entry)))
+		return 0;
 	toff = (swp_offset(entry) >> page_cluster) << page_cluster;
 	if (!toff)		/* first page is swap header */
 		toff++, i--;
@@ -1744,9 +1770,101 @@ int valid_swaphandles(swp_entry_t entry,
 			break;
 		if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
 			break;
+		/* Don't read in preswap pages */
+		if (preswap_test(swapdev, toff))
+			break;
 		toff++;
 		ret++;
 	} while (--i);
 	spin_unlock(&swap_lock);
 	return ret;
 }
+
+#ifdef CONFIG_PRESWAP
+/*
+ * preswap infrastructure functions
+ */
+
+/* code structure leveraged from sys_swapoff */
+void preswap_shrink(unsigned long target_pages)
+{
+	struct swap_info_struct *si = NULL;
+	unsigned long total_pages = 0, total_pages_to_unuse;
+	unsigned long pages = 0, unuse_pages = 0;
+	int type;
+	int wrapped = 0;
+
+	do {
+		/*
+		 * we don't want to hold swap_lock while doing a very
+		 * lengthy try_to_unuse, but swap_list may change
+		 * so restart scan from swap_list.head each time
+		 */
+		spin_lock(&swap_lock);
+		total_pages = 0;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			total_pages += si->preswap_pages;
+		}
+		if (total_pages <= target_pages) {
+			spin_unlock(&swap_lock);
+			return;
+		}
+		total_pages_to_unuse = total_pages - target_pages;
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			if (total_pages_to_unuse < si->preswap_pages)
+				pages = unuse_pages = total_pages_to_unuse;
+			else {
+				pages = si->preswap_pages;
+				unuse_pages = 0; /* unuse all */
+			}
+			if (security_vm_enough_memory(pages))
+				continue;
+			vm_unacct_memory(pages);
+			break;
+		}
+		spin_unlock(&swap_lock);
+		if (type < 0)
+			return;
+		current->flags |= PF_SWAPOFF;
+		(void)try_to_unuse(type, 1, unuse_pages);
+		current->flags &= ~PF_SWAPOFF;
+		wrapped++;
+	} while (wrapped <= 3);
+}
+
+#ifdef CONFIG_SYSCTL
+/* cat /proc/sys/vm/preswap provides the total number of pages in preswap
+ * across all swaptypes.  echo N > /proc/sys/vm/preswap attempts to shrink
+ * preswap page usage to N (usually 0) */
+int preswap_sysctl_handler(ctl_table *table, int write,
+	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+{
+	unsigned long npages = 0;
+	int type;
+	unsigned long totalpages = 0;
+	struct swap_info_struct *si = NULL;
+
+	/* modeled after hugetlb_sysctl_handler in mm/hugetlb.c */
+	if (!write) {
+		spin_lock(&swap_lock);
+		for (type = swap_list.head; type >= 0; type = si->next) {
+			si = get_swap_info_struct(type);
+			totalpages += si->preswap_pages;
+		}
+		spin_unlock(&swap_lock);
+		npages = totalpages;
+	}
+	table->data = &npages;
+	table->maxlen = sizeof(unsigned long);
+	proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+
+	if (write)
+		preswap_shrink(npages);
+
+	return 0;
+}
+#endif
+#endif /* CONFIG_PRESWAP */
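
[Editor's note, illustrative only -- not part of this file: the handler
above is presumably wired to a ctl_table entry elsewhere (e.g. in
kernel/sysctl.c, not shown in this excerpt). A sketch of such an entry,
assuming a hypothetical VM_PRESWAP_PAGES ctl_name and reusing the
preswap_zero/preswap_infinity bounds defined in mm/preswap.c:]

	{
		.ctl_name	= VM_PRESWAP_PAGES,	/* hypothetical */
		.procname	= "preswap",
		.data		= NULL,			/* set by the handler */
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= &preswap_sysctl_handler,
		.extra1		= (void *)&preswap_zero,
		.extra2		= (void *)&preswap_infinity,
	},
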
diff -r 3776d277956f mm/tmem.c
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mm/tmem.c	Wed Jun 17 09:45:40 2009 -0600
@@ -0,0 +1,41 @@
+/*
+ * Xen implementation for transcendent memory (tmem)
+ *
+ * Dan Magenheimer <dan.magenheimer@oracle.com> 2009
+ */
+
+#include <linux/types.h>
+#include <xen/interface/xen.h>
+#include <asm/hypervisor.h>
+
+int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, u64 object, u32 index,
+	unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len)
+{
+	struct tmem_op op;
+	int rc = 0;
+
+	op.cmd = tmem_cmd;
+	op.pool_id = tmem_pool;
+	op.u.gen.object = object;
+	op.u.gen.index = index;
+	op.u.gen.tmem_offset = tmem_offset;
+	op.u.gen.pfn_offset = pfn_offset;
+	op.u.gen.len = len;
+	op.u.gen.cmfn = gmfn;
+	rc = HYPERVISOR_tmem_op(&op);
+	return rc;
+}
+
+int xen_tmem_new_pool(uint32_t tmem_cmd, uint64_t uuid_lo,
+	uint64_t uuid_hi, uint32_t flags)
+{
+	struct tmem_op op;
+	int rc = 0;
+
+	op.cmd = tmem_cmd;
+	op.u.new.uuid[0] = uuid_lo;
+	op.u.new.uuid[1] = uuid_hi;
+	op.u.new.flags = flags;
+	rc = HYPERVISOR_tmem_op(&op);
+	return rc;
+}
diff -r 3776d277956f mm/tmem.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mm/tmem.h	Wed Jun 17 09:45:40 2009 -0600
@@ -0,0 +1,84 @@
+/*
+ * linux/mm/tmem.h
+ *
+ * Interface to transcendent memory, used by mm/precache.c and mm/preswap.c
+ * Currently implemented on XEN, but may be implemented elsewhere in future.
+ *
+ * Copyright (C) 2008,2009 Dan Magenheimer, Oracle Corp.
+ */
+
+#ifdef CONFIG_XEN
+#include <xen/interface/xen.h>
+
+/* Bits for HYPERVISOR_tmem_op(TMEM_NEW_POOL) */
+#define TMEM_POOL_MIN_PAGESHIFT   12
+#define TMEM_POOL_PAGEORDER       (PAGE_SHIFT - TMEM_POOL_MIN_PAGESHIFT)
+
+extern int xen_tmem_op(u32 tmem_cmd, u32 tmem_pool, u64 object, u32 index,
+	unsigned long gmfn, u32 tmem_offset, u32 pfn_offset, u32 len);
+extern int xen_tmem_new_pool(u32 tmem_cmd, u64 uuid_lo, u64 uuid_hi, u32 flags);
+
+static inline int tmem_put_page(u32 pool_id, u64 object, u32 index,
+	unsigned long gmfn)
+{
+	return xen_tmem_op(TMEM_PUT_PAGE, pool_id, object, index,
+		gmfn, 0, 0, 0);
+}
+
+static inline int tmem_get_page(u32 pool_id, u64 object, u32 index,
+	unsigned long gmfn)
+{
+	return xen_tmem_op(TMEM_GET_PAGE, pool_id, object, index,
+		gmfn, 0, 0, 0);
+}
+
+static inline int tmem_flush_page(u32 pool_id, u64 object, u32 index)
+{
+	return xen_tmem_op(TMEM_FLUSH_PAGE, pool_id, object, index,
+		0, 0, 0, 0);
+}
+
+static inline int tmem_flush_object(u32 pool_id, u64 object)
+{
+	return xen_tmem_op(TMEM_FLUSH_OBJECT, pool_id, object, 0, 0, 0, 0, 0);
+}
+
+static inline int tmem_new_pool(u64 uuid_lo, u64 uuid_hi, u32 flags)
+{
+	BUILD_BUG_ON((TMEM_POOL_PAGEORDER < 0) ||
+		(TMEM_POOL_PAGEORDER >= TMEM_POOL_PAGESIZE_MASK));
+	flags |= TMEM_POOL_PAGEORDER << TMEM_POOL_PAGESIZE_SHIFT;
+	return xen_tmem_new_pool(TMEM_NEW_POOL, uuid_lo, uuid_hi, flags);
+}
+
+static inline int tmem_destroy_pool(u32 pool_id)
+{
+	return xen_tmem_op(TMEM_DESTROY_POOL, pool_id, 0, 0, 0, 0, 0, 0);
+}
+#else
+struct tmem_op {
+	u32 cmd;
+	s32 pool_id; /* private > 0; shared < 0; 0 is invalid */
+	union {
+		struct {  /* for cmd == TMEM_NEW_POOL */
+			u64 uuid[2];
+			u32 flags;
+		} new;
+		struct {  /* for cmd == TMEM_CONTROL */
+			u32 subop;
+			u32 cli_id;
+			u32 arg1;
+			u32 arg2;
+			void *buf;
+		} ctrl;
+		struct {
+			u64 object;
+			u32 index;
+			u32 tmem_offset;
+			u32 pfn_offset;
+			u32 len;
+			unsigned long pfn;  /* page frame */
+		} gen;
+	} u;
+};
+#endif
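
[Editor's note, illustrative only -- not part of the patch: the wrappers
above compose into the pool lifecycle used by precache and preswap. A
minimal sketch, assuming the return convention used by the callers above
(1 == success, negative pool id == failure):]

/* Sketch: round-trip one page through a private ephemeral pool. */
static int tmem_roundtrip(struct page *page, u64 obj, u32 index)
{
	unsigned long mfn = pfn_to_mfn(page_to_pfn(page));
	int pool = tmem_new_pool(0, 0, 0);	/* private, ephemeral */
	int hit = 0;

	if (pool < 0)
		return pool;
	if (tmem_put_page(pool, obj, index, mfn) == 1)
		/* on a private pool, a successful get also flushes */
		hit = (tmem_get_page(pool, obj, index, mfn) == 1);
	tmem_destroy_pool(pool);
	return hit;
}
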
diff -r 3776d277956f mm/truncate.c
--- a/mm/truncate.c	Wed Jun 17 09:07:23 2009 +0100
+++ b/mm/truncate.c	Wed Jun 17 09:45:40 2009 -0600
@@ -12,13 +12,14 @@
 #include <linux/module.h>
 #include <linux/pagemap.h>
 #include <linux/pagevec.h>
+#include <linux/precache.h>
 #include <linux/buffer_head.h>	/* grr. try_to_release_page,
 				   do_invalidatepage */
-
 
 static inline void truncate_partial_page(struct page *page, unsigned partial)
 {
 	memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
+	precache_flush(page->mapping, page->index);
 	if (PagePrivate(page))
 		do_invalidatepage(page, partial);
 }
@@ -46,6 +47,10 @@ truncate_complete_page(struct address_sp
 	ClearPageUptodate(page);
 	ClearPageMappedToDisk(page);
 	remove_from_page_cache(page);
+	/* this must be after the remove_from_page_cache which
+	 * calls precache_put
+	 */
+	precache_flush(mapping, page->index);
 	page_cache_release(page);	/* pagecache ref */
 }
 
@@ -118,6 +123,7 @@ void truncate_inode_pages_range(struct a
 	pgoff_t next;
 	int i;
 
+	precache_flush_inode(mapping);
 	if (mapping->nrpages == 0)
 		return;
 
@@ -191,6 +197,7 @@ void truncate_inode_pages_range(struct a
 		}
 		pagevec_release(&pvec);
 	}
+	precache_flush_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
 
@@ -324,6 +331,7 @@ int invalidate_inode_pages2_range(struct
 	int did_range_unmap = 0;
 	int wrapped = 0;
 
+	precache_flush_inode(mapping);
 	pagevec_init(&pvec, 0);
 	next = start;
 	while (next <= end && !ret && !wrapped &&
@@ -379,6 +387,7 @@ int invalidate_inode_pages2_range(struct
 		pagevec_release(&pvec);
 		cond_resched();
 	}
+	precache_flush_inode(mapping);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);

[-- Attachment #3: Type: text/plain, Size: 138 bytes --]

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xensource.com
http://lists.xensource.com/xen-devel
