Subject: [RFC] Page prefetch ver. 0.1 for 2.5.68-mm2
From: Thomas Schlichter
Date: 2003-04-28 20:12 UTC
To: Andrew Morton
Cc: linux-kernel


[-- Attachment #1.1: body text --]
[-- Type: text/plain, Size: 1156 bytes --]

This is a working version of the idea posted on the LKML a few weeks ago...

Currently it works only when loaded as a module. On load, a kernel thread 
'kprefetchd' is started that prefetches swapped-out and buffer pages whenever 
there is free buffer memory. When the module is unloaded, the kernel thread 
is stopped again.
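
Loading and unloading then looks like this (a minimal example; it assumes the 
module gets built as page_prefetch.ko, and both parameters are optional with 
the defaults shown):
   modprobe page_prefetch reserved_pages=256 interval=60
   rmmod page_prefetch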

For me it works as expected and has no noticeable negative impact, but some 
benchmarks should be run to confirm this...

Apart from the fact that this only works as a module so far, there are a few 
points that could be improved when I've got some spare time...
For example, I should take the current disk I/O load into account before 
prefetching. The list of swapped pages also grows too large; this can be seen 
by doing a
   grep swapped_entry /proc/slabinfo
Another point is that it doesn't work on NFS file systems, as I pass a NULL 
file pointer to do_page_cache_readahead().
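
Until that is solved properly, a conservative workaround (only a sketch, 
reusing the names from the patch below) would be to prefetch real swap pages 
only, i.e. the readahead branch in kprefetchd() shrinks to:

	/* Sketch: prefetch only entries that live in the swap cache and
	 * silently drop page-cache entries. This avoids the NFS problem
	 * caused by calling do_page_cache_readahead() with a NULL file
	 * pointer, at the cost of not prefetching mapped file pages. */
	if (entry->mapping == &swapper_space)
		swapin_readahead(entry->swp_entry);
	kmem_cache_free(swapped_root.cache, entry);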

And I'm open to any further improvements.
I hope someone else likes this patch as much as I do... ;-)

Best regards
   Thomas Schlichter

P.S.: For those who want it, I've also got a patch that applies to vanilla 
2.5.68...

[-- Attachment #1.2: page_prefetch-2.5.68-mm2.diff --]
[-- Type: text/x-diff, Size: 15593 bytes --]

diff -urP linux-2.5.68-mm2/arch/i386/Kconfig linux-2.5.68_patched/arch/i386/Kconfig
--- linux-2.5.68-mm2/arch/i386/Kconfig	Mon Apr 28 20:41:41 2003
+++ linux-2.5.68_patched/arch/i386/Kconfig	Mon Apr 28 12:05:07 2003
@@ -373,6 +373,14 @@
 	depends on MK8 || MPENTIUM4
 	default y
 
+config PAGE_PREFETCH
+	tristate "Prefetch swapped memory pages (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	help
+	  This option enables the kernel to prefetch swapped memory pages
+	  when idle. This feature is experimental and known not to work
+	  with the NFS filesystem!
+
 config HUGETLB_PAGE
 	bool "Huge TLB Page Support"
 	help
diff -urP linux-2.5.68-mm2/fs/inode.c linux-2.5.68_patched/fs/inode.c
--- linux-2.5.68-mm2/fs/inode.c	Sun Apr 20 04:51:13 2003
+++ linux-2.5.68_patched/fs/inode.c	Mon Apr 28 12:05:07 2003
@@ -18,6 +18,7 @@
 #include <linux/hash.h>
 #include <linux/swap.h>
 #include <linux/security.h>
+#include <linux/page_prefetch.h>
 
 /*
  * This is needed for the following functions:
@@ -176,10 +177,12 @@
 	INIT_LIST_HEAD(&inode->i_data.clean_pages);
 	INIT_LIST_HEAD(&inode->i_data.dirty_pages);
 	INIT_LIST_HEAD(&inode->i_data.locked_pages);
+	INIT_LIST_HEAD(&inode->i_data.swapped_pages);
 	INIT_LIST_HEAD(&inode->i_data.io_pages);
 	INIT_LIST_HEAD(&inode->i_dentry);
 	INIT_LIST_HEAD(&inode->i_devices);
 	sema_init(&inode->i_sem, 1);
+	INIT_RADIX_TREE(&inode->i_data.swap_tree, GFP_ATOMIC);
 	INIT_RADIX_TREE(&inode->i_data.page_tree, GFP_ATOMIC);
 	spin_lock_init(&inode->i_data.page_lock);
 	init_MUTEX(&inode->i_data.i_shared_sem);
@@ -227,6 +230,7 @@
  
 void clear_inode(struct inode *inode)
 {
+	invalidate_swapped_list(&inode->i_data);
 	invalidate_inode_buffers(inode);
        
 	if (inode->i_data.nrpages)
diff -urP linux-2.5.68-mm2/include/linux/fs.h linux-2.5.68_patched/include/linux/fs.h
--- linux-2.5.68-mm2/include/linux/fs.h	Sun Apr 20 04:48:56 2003
+++ linux-2.5.68_patched/include/linux/fs.h	Mon Apr 28 12:05:07 2003
@@ -312,11 +312,13 @@
 struct backing_dev_info;
 struct address_space {
 	struct inode		*host;		/* owner: inode, block_device */
+	struct radix_tree_root	swap_tree;	/* radix tree of swapped pages */
 	struct radix_tree_root	page_tree;	/* radix tree of all pages */
 	spinlock_t		page_lock;	/* and rwlock protecting it */
 	struct list_head	clean_pages;	/* list of clean pages */
 	struct list_head	dirty_pages;	/* list of dirty pages */
 	struct list_head	locked_pages;	/* list of locked pages */
+	struct list_head	swapped_pages;	/* list of swapped pages */
 	struct list_head	io_pages;	/* being prepared for I/O */
 	unsigned long		nrpages;	/* number of total pages */
 	struct address_space_operations *a_ops;	/* methods */
diff -urP linux-2.5.68-mm2/include/linux/page_prefetch.h linux-2.5.68_patched/include/linux/page_prefetch.h
--- linux-2.5.68-mm2/include/linux/page_prefetch.h	Thu Jan  1 01:00:00 1970
+++ linux-2.5.68_patched/include/linux/page_prefetch.h	Mon Apr 28 12:05:07 2003
@@ -0,0 +1,96 @@
+#ifndef _LINUX_PAGE_PREFETCH_H
+#define _LINUX_PAGE_PREFETCH_H
+
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/radix-tree.h>
+
+struct swapped_entry_t {
+	swp_entry_t		swp_entry;
+	struct address_space	*mapping;
+	struct list_head	mapping_list;
+	struct list_head	swapped_list;
+};
+
+struct swapped_root_t {
+	spinlock_t		lock;
+	unsigned int		count;
+	unsigned int		maxcount;
+	kmem_cache_t		*cache;
+	struct list_head	list;
+};
+
+extern struct swapped_root_t	swapped_root;
+
+static inline void add_to_swapped_list(struct address_space *mapping,
+							unsigned long index)
+{
+	struct swapped_entry_t *entry;
+	int error;
+
+	spin_lock(&swapped_root.lock);
+
+	if(swapped_root.count >= swapped_root.maxcount)
+	{
+		entry = list_entry(swapped_root.list.prev,
+				struct swapped_entry_t, swapped_list);
+		radix_tree_delete(&entry->mapping->swap_tree,
+				entry->swp_entry.val);
+		list_del(&entry->mapping_list);
+		list_del(&entry->swapped_list);
+		swapped_root.count--;
+	} else {
+		entry = kmem_cache_alloc(swapped_root.cache, GFP_ATOMIC);
+		if(!entry)
+			goto out_locked;
+	}
+
+	entry->swp_entry.val = index;
+	entry->mapping       = mapping;
+
+	error = radix_tree_insert(&mapping->swap_tree, index, entry);
+	if(!error) {
+		list_add(&entry->mapping_list, &mapping->swapped_pages);
+		list_add(&entry->swapped_list, &swapped_root.list);
+		swapped_root.count++;
+	} else {
+		kmem_cache_free(swapped_root.cache, entry);
+	}
+
+out_locked:
+	spin_unlock(&swapped_root.lock);
+}
+
+static inline void remove_from_swapped_list(struct address_space *mapping,
+							unsigned long index)
+{
+	struct swapped_entry_t *entry;
+
+	spin_lock(&swapped_root.lock);
+	entry = radix_tree_delete(&mapping->swap_tree, index);
+	if(entry) {
+		list_del(&entry->mapping_list);
+		list_del(&entry->swapped_list);
+		swapped_root.count--;
+		kmem_cache_free(swapped_root.cache, entry);
+	}
+	spin_unlock(&swapped_root.lock);
+}
+
+static inline void invalidate_swapped_list(struct address_space *mapping)
+{
+	spin_lock(&swapped_root.lock);
+	while(!list_empty(&mapping->swapped_pages)) {
+		struct swapped_entry_t *entry = list_entry(
+					mapping->swapped_pages.next,
+					struct swapped_entry_t, mapping_list);
+		radix_tree_delete(&mapping->swap_tree, entry->swp_entry.val);
+		list_del(&entry->mapping_list);
+		list_del(&entry->swapped_list);
+		swapped_root.count--;
+		kmem_cache_free(swapped_root.cache, entry);
+	}
+	spin_unlock(&swapped_root.lock);
+}
+
+#endif /* _LINUX_PAGE_PREFETCH_H */
diff -urP linux-2.5.68-mm2/include/linux/swap.h linux-2.5.68_patched/include/linux/swap.h
--- linux-2.5.68-mm2/include/linux/swap.h	Sun Apr 20 04:48:49 2003
+++ linux-2.5.68_patched/include/linux/swap.h	Mon Apr 28 12:05:07 2003
@@ -155,6 +155,8 @@
 extern unsigned int nr_free_pages_pgdat(pg_data_t *pgdat);
 extern unsigned int nr_free_buffer_pages(void);
 extern unsigned int nr_free_pagecache_pages(void);
+extern unsigned int nr_avail_buffer_pages(void);
+extern unsigned int nr_avail_pagecache_pages(void);
 
 /* linux/mm/swap.c */
 extern void FASTCALL(lru_cache_add(struct page *));
diff -urP linux-2.5.68-mm2/kernel/ksyms.c linux-2.5.68_patched/kernel/ksyms.c
--- linux-2.5.68-mm2/kernel/ksyms.c	Mon Apr 28 20:41:52 2003
+++ linux-2.5.68_patched/kernel/ksyms.c	Mon Apr 28 12:05:07 2003
@@ -59,6 +59,7 @@
 #include <linux/time.h>
 #include <linux/backing-dev.h>
 #include <linux/percpu_counter.h>
+#include <linux/page_prefetch.h>
 #include <asm/checksum.h>
 
 #if defined(CONFIG_PROC_FS)
@@ -71,6 +72,13 @@
 extern struct timezone sys_tz;
 
 extern int panic_timeout;
+
+/* needed for page prefetch support */
+EXPORT_SYMBOL(swapped_root);
+EXPORT_SYMBOL(swapper_space);
+EXPORT_SYMBOL(swapin_readahead);
+EXPORT_SYMBOL(do_page_cache_readahead);
+EXPORT_SYMBOL(nr_avail_pagecache_pages);
 
 /* process memory management */
 EXPORT_SYMBOL(do_mmap_pgoff);
diff -urP linux-2.5.68-mm2/mm/Makefile linux-2.5.68_patched/mm/Makefile
--- linux-2.5.68-mm2/mm/Makefile	Sun Apr 20 04:50:05 2003
+++ linux-2.5.68_patched/mm/Makefile	Mon Apr 28 12:05:08 2003
@@ -12,3 +12,5 @@
 			   slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
 
 obj-$(CONFIG_SWAP)	+= page_io.o swap_state.o swapfile.o
+
+obj-$(CONFIG_PAGE_PREFETCH)	+= page_prefetch.o
diff -urP linux-2.5.68-mm2/mm/filemap.c linux-2.5.68_patched/mm/filemap.c
--- linux-2.5.68-mm2/mm/filemap.c	Mon Apr 28 20:41:52 2003
+++ linux-2.5.68_patched/mm/filemap.c	Mon Apr 28 12:06:39 2003
@@ -36,6 +36,7 @@
  * FIXME: remove all knowledge of the buffer layer from the core VM
  */
 #include <linux/buffer_head.h> /* for generic_osync_inode */
+#include <linux/page_prefetch.h>
 
 #include <asm/uaccess.h>
 #include <asm/mman.h>
@@ -83,6 +84,7 @@
 
 	BUG_ON(PageDirty(page) && !PageSwapCache(page));
 
+	remove_from_swapped_list(mapping, page->index);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	list_del(&page->list);
 	page->mapping = NULL;
@@ -222,8 +224,11 @@
 int add_to_page_cache(struct page *page, struct address_space *mapping,
 		pgoff_t offset, int gfp_mask)
 {
-	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
+	int error;
 
+	remove_from_swapped_list(mapping, offset);
+
+	error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
 		page_cache_get(page);
 		spin_lock(&mapping->page_lock);
diff -urP linux-2.5.68-mm2/mm/page_alloc.c linux-2.5.68_patched/mm/page_alloc.c
--- linux-2.5.68-mm2/mm/page_alloc.c	Mon Apr 28 20:41:52 2003
+++ linux-2.5.68_patched/mm/page_alloc.c	Mon Apr 28 12:05:08 2003
@@ -852,6 +852,48 @@
 }
 #endif
 
+static unsigned int nr_avail_zone_pages(int offset)
+{
+	pg_data_t *pgdat;
+	unsigned long avail = 0;
+
+	for_each_pgdat(pgdat) {
+		struct zonelist *zonelist = pgdat->node_zonelists + offset;
+		struct zone **zonep = zonelist->zones;
+		struct zone *zone;
+		unsigned long low = 0;
+
+		for (zone = *zonep++; zone; zone = *zonep++) {
+			unsigned long local_free = zone->free_pages;
+			unsigned long local_low  = zone->pages_low;
+
+			low += local_low;
+			if (local_free > low) {
+				avail = max(avail, local_free - low);
+			}
+			low += local_low * sysctl_lower_zone_protection;
+		}
+	}
+
+	return avail;
+}
+
+/*
+ * Amount of available RAM allocatable within ZONE_DMA and ZONE_NORMAL
+ */
+unsigned int nr_avail_buffer_pages(void)
+{
+	return nr_avail_zone_pages(GFP_USER & GFP_ZONEMASK);
+}
+
+/*
+ * Amount of available RAM allocatable within all zones
+ */
+unsigned int nr_avail_pagecache_pages(void)
+{
+	return nr_avail_zone_pages(GFP_HIGHUSER & GFP_ZONEMASK);
+}
+
 #ifdef CONFIG_NUMA
 static void show_node(struct zone *zone)
 {
diff -urP linux-2.5.68-mm2/mm/page_prefetch.c linux-2.5.68_patched/mm/page_prefetch.c
--- linux-2.5.68-mm2/mm/page_prefetch.c	Thu Jan  1 01:00:00 1970
+++ linux-2.5.68_patched/mm/page_prefetch.c	Mon Apr 28 12:05:08 2003
@@ -0,0 +1,114 @@
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/page_prefetch.h>
+#include <linux/backing-dev.h>
+
+#define RESERVED_PAGES	256		/* keep 1 MByte of pagecache free */
+#define INTERVAL	60		/* (secs) Default is 1 minute */
+
+static int reserved_pages = RESERVED_PAGES;
+static int interval       = INTERVAL;
+
+MODULE_PARM(reserved_pages,"i");
+MODULE_PARM_DESC(reserved_pages,
+	"number of pagecache pages to keep free (default 256)");
+
+MODULE_PARM(interval,"i");
+MODULE_PARM_DESC(interval,
+	"delay in seconds to wait between memory checks (default 60)");
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Thomas Schlichter <thomas.schlichter@web.de>");
+MODULE_DESCRIPTION("prefetches swap pages when there is free memory");
+
+/*
+ *	Our timer
+ */
+static void prefetch_timer_handler(unsigned long data);
+static struct timer_list prefetch_timer =
+		TIMER_INITIALIZER(prefetch_timer_handler, 0, 0);
+
+/*
+ *	Our waitqueue
+ */
+static DECLARE_WAIT_QUEUE_HEAD(kprefetchd_wait);
+
+/*
+ *	If the timer expires...
+ */
+static void prefetch_timer_handler(unsigned long data)
+{
+	wake_up_interruptible(&kprefetchd_wait);
+	prefetch_timer.expires = jiffies + interval * HZ;
+	add_timer(&prefetch_timer);
+}
+
+static int running;
+
+/*
+ *	...wake up the kernel thread
+ */
+static int kprefetchd(void *data)
+{
+	DEFINE_WAIT(wait);
+
+	daemonize("kprefetchd");
+
+	while(running) {
+		prepare_to_wait(&kprefetchd_wait, &wait, TASK_INTERRUPTIBLE);
+		schedule();
+		finish_wait(&kprefetchd_wait, &wait);
+
+		while(nr_avail_pagecache_pages() > reserved_pages) {
+			struct swapped_entry_t *entry;
+
+			spin_lock(&swapped_root.lock);
+
+			if(list_empty(&swapped_root.list)) {
+				spin_unlock(&swapped_root.lock);
+				break;
+			}
+
+			entry = list_entry(swapped_root.list.next,
+					struct swapped_entry_t, swapped_list);
+			radix_tree_delete(&entry->mapping->swap_tree,
+					entry->swp_entry.val);
+			list_del(&entry->mapping_list);
+			list_del(&entry->swapped_list);
+			swapped_root.count--;
+
+			spin_unlock(&swapped_root.lock);
+
+			if(entry->mapping == &swapper_space) {
+				swapin_readahead(entry->swp_entry);
+			} else {
+				do_page_cache_readahead(entry->mapping,
+					NULL, entry->swp_entry.val,
+					entry->mapping->backing_dev_info->ra_pages);
+			}
+
+			kmem_cache_free(swapped_root.cache, entry);
+		}
+	}
+
+	return 0;
+}
+
+static int __init prefetch_init(void)
+{
+	running = 1;
+	kernel_thread(kprefetchd, NULL, CLONE_KERNEL);
+	prefetch_timer.expires = jiffies + interval * HZ;
+	add_timer(&prefetch_timer);
+	return 0;
+}
+
+static void __exit prefetch_exit(void)
+{
+	del_timer(&prefetch_timer);
+	running = 0;
+	wake_up_interruptible(&kprefetchd_wait);
+}
+
+module_init(prefetch_init);
+module_exit(prefetch_exit);
diff -urP linux-2.5.68-mm2/mm/swap.c linux-2.5.68_patched/mm/swap.c
--- linux-2.5.68-mm2/mm/swap.c	Mon Apr 28 20:41:52 2003
+++ linux-2.5.68_patched/mm/swap.c	Mon Apr 28 12:05:08 2003
@@ -23,6 +23,13 @@
 #include <linux/mm_inline.h>
 #include <linux/buffer_head.h>	/* for try_to_release_page() */
 #include <linux/percpu.h>
+#include <linux/page_prefetch.h>
+
+struct swapped_root_t swapped_root = {
+	.lock  = SPIN_LOCK_UNLOCKED,
+	.count = 0,
+	.list  = LIST_HEAD_INIT(swapped_root.list),
+};
 
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
@@ -390,4 +397,17 @@
 	 * Right now other parts of the system means that we
 	 * _really_ don't want to cluster much more
 	 */
+
+	/*
+	 * Create kmem cache for swapped entries
+	 */
+	swapped_root.cache = kmem_cache_create("swapped_entry",
+		sizeof(struct swapped_entry_t), 0, 0, NULL, NULL);
+	if(!swapped_root.cache)
+		panic("swap_setup(): cannot create swapped_entry SLAB cache");
+
+	/*
+	 * Set max count of swapped entries
+	 */
+	swapped_root.maxcount = nr_free_pagecache_pages();
 }
diff -urP linux-2.5.68-mm2/mm/swap_state.c linux-2.5.68_patched/mm/swap_state.c
--- linux-2.5.68-mm2/mm/swap_state.c	Mon Apr 28 20:41:52 2003
+++ linux-2.5.68_patched/mm/swap_state.c	Mon Apr 28 12:05:08 2003
@@ -32,12 +32,14 @@
 extern struct address_space_operations swap_aops;
 
 struct address_space swapper_space = {
+	.swap_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC),
 	.page_lock	= SPIN_LOCK_UNLOCKED,
 	.clean_pages	= LIST_HEAD_INIT(swapper_space.clean_pages),
 	.dirty_pages	= LIST_HEAD_INIT(swapper_space.dirty_pages),
 	.io_pages	= LIST_HEAD_INIT(swapper_space.io_pages),
 	.locked_pages	= LIST_HEAD_INIT(swapper_space.locked_pages),
+	.swapped_pages	= LIST_HEAD_INIT(swapper_space.swapped_pages),
 	.host		= &swapper_inode,
 	.a_ops		= &swap_aops,
 	.backing_dev_info = &swap_backing_dev_info,
diff -urP linux-2.5.68-mm2/mm/vmscan.c linux-2.5.68_patched/mm/vmscan.c
--- linux-2.5.68-mm2/mm/vmscan.c	Mon Apr 28 20:41:52 2003
+++ linux-2.5.68_patched/mm/vmscan.c	Mon Apr 28 12:05:08 2003
@@ -28,6 +28,7 @@
 #include <linux/pagevec.h>
 #include <linux/backing-dev.h>
 #include <linux/rmap-locking.h>
+#include <linux/page_prefetch.h>
 
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
@@ -414,6 +415,8 @@
 free_it:
 		unlock_page(page);
 		ret++;
+		if(mapping)
+			add_to_swapped_list(mapping, page->index);
 		if (!pagevec_add(&freed_pvec, page))
 			__pagevec_release_nonlru(&freed_pvec);
 		continue;
