All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] Tracking mlocked pages and moving them off the LRU
@ 2007-02-03  6:20 Christoph Lameter
  2007-02-03  8:53 ` Andrew Morton
                   ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-02-03  6:20 UTC (permalink / raw)
  To: linux-kernel; +Cc: akpm, Nick Piggin, KAMEZAWA Hiroyuki, Rik van Riel

This is a new variation on the earlier RFC for tracking mlocked pages.
We now mark a mlocked page with a bit in the page flags and remove
them from the LRU. Pages get moved back when no vma that references
the page has VM_LOCKED set anymore.

This means that vmscan no longer uselessly cycles over large amounts
of mlocked memory should someone attempt to mlock large amounts of
memory (may even result in a livelock on large systems).

Synchronization is build around state changes of the PageMlocked bit.
The NR_MLOCK counter is incremented and decremented based on
state transitions of PageMlocked. So the count is accurate.

There is still some unfinished business:

1. We use the 21st page flag and we only have 20 on 32 bit NUMA platforms.

2. Since mlocked pages are now off the LRU page migration will no longer
   move them.

3. Use NR_MLOCK to tune various VM behaviors so that the VM does not 
   longer fall due to too many mlocked pages in certain areas.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: current/include/linux/mmzone.h
===================================================================
--- current.orig/include/linux/mmzone.h	2007-02-02 16:42:51.000000000 -0800
+++ current/include/linux/mmzone.h	2007-02-02 16:43:28.000000000 -0800
@@ -58,6 +58,7 @@ enum zone_stat_item {
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
 	/* Second 128 byte cacheline */
+	NR_MLOCK,		/* Mlocked pages */
 	NR_SLAB_RECLAIMABLE,
 	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,		/* used for pagetables */
Index: current/mm/memory.c
===================================================================
--- current.orig/mm/memory.c	2007-02-02 16:42:51.000000000 -0800
+++ current/mm/memory.c	2007-02-02 21:24:20.000000000 -0800
@@ -682,6 +682,8 @@ static unsigned long zap_pte_range(struc
 				file_rss--;
 			}
 			page_remove_rmap(page, vma);
+			if (PageMlocked(page) && (vma->vm_flags & VM_LOCKED))
+				mlock_remove(page, vma);
 			tlb_remove_page(tlb, page);
 			continue;
 		}
@@ -898,6 +900,21 @@ unsigned long zap_page_range(struct vm_a
 }
 
 /*
+ * Add a new anonymous page
+ */
+void anon_add(struct vm_area_struct *vma, struct page *page,
+				unsigned long address)
+{
+	inc_mm_counter(vma->vm_mm, anon_rss);
+	if (vma->vm_flags & VM_LOCKED) {
+		SetPageMlocked(page);
+		inc_zone_page_state(page, NR_MLOCK);
+	} else
+		lru_cache_add_active(page);
+	page_add_new_anon_rmap(page, vma, address);
+}
+
+/*
  * Do a quick page-table lookup for a single page.
  */
 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
@@ -949,6 +966,10 @@ struct page *follow_page(struct vm_area_
 	if (unlikely(!page))
 		goto unlock;
 
+	if ((flags & FOLL_MLOCK) &&
+			!PageMlocked(page) &&
+			(vma->vm_flags & VM_LOCKED))
+		mlock_add(page, vma);
 	if (flags & FOLL_GET)
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
@@ -1045,7 +1066,7 @@ int get_user_pages(struct task_struct *t
 			continue;
 		}
 
-		foll_flags = FOLL_TOUCH;
+		foll_flags = FOLL_TOUCH | FOLL_MLOCK;
 		if (pages)
 			foll_flags |= FOLL_GET;
 		if (!write && !(vma->vm_flags & VM_LOCKED) &&
@@ -2101,9 +2122,7 @@ static int do_anonymous_page(struct mm_s
 		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 		if (!pte_none(*page_table))
 			goto release;
-		inc_mm_counter(mm, anon_rss);
-		lru_cache_add_active(page);
-		page_add_new_anon_rmap(page, vma, address);
+		anon_add(vma, page, address);
 	} else {
 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
 		page = ZERO_PAGE(address);
@@ -2247,12 +2266,13 @@ retry:
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
-		if (anon) {
-			inc_mm_counter(mm, anon_rss);
-			lru_cache_add_active(new_page);
-			page_add_new_anon_rmap(new_page, vma, address);
-		} else {
+		if (anon)
+			anon_add(vma, new_page, address);
+		else {
 			inc_mm_counter(mm, file_rss);
+			if (!PageMlocked(new_page) &&
+					(vma->vm_flags & VM_LOCKED))
+				mlock_add(new_page, vma);
 			page_add_file_rmap(new_page);
 			if (write_access) {
 				dirty_page = new_page;
Index: current/drivers/base/node.c
===================================================================
--- current.orig/drivers/base/node.c	2007-02-02 16:42:51.000000000 -0800
+++ current/drivers/base/node.c	2007-02-02 16:43:28.000000000 -0800
@@ -60,6 +60,7 @@ static ssize_t node_read_meminfo(struct 
 		       "Node %d FilePages:    %8lu kB\n"
 		       "Node %d Mapped:       %8lu kB\n"
 		       "Node %d AnonPages:    %8lu kB\n"
+		       "Node %d Mlock:        %8lu KB\n"
 		       "Node %d PageTables:   %8lu kB\n"
 		       "Node %d NFS_Unstable: %8lu kB\n"
 		       "Node %d Bounce:       %8lu kB\n"
@@ -82,6 +83,7 @@ static ssize_t node_read_meminfo(struct 
 		       nid, K(node_page_state(nid, NR_FILE_PAGES)),
 		       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
 		       nid, K(node_page_state(nid, NR_ANON_PAGES)),
+		       nid, K(node_page_state(nid, NR_MLOCK)),
 		       nid, K(node_page_state(nid, NR_PAGETABLE)),
 		       nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
 		       nid, K(node_page_state(nid, NR_BOUNCE)),
Index: current/fs/proc/proc_misc.c
===================================================================
--- current.orig/fs/proc/proc_misc.c	2007-02-02 16:42:51.000000000 -0800
+++ current/fs/proc/proc_misc.c	2007-02-02 16:43:28.000000000 -0800
@@ -166,6 +166,7 @@ static int meminfo_read_proc(char *page,
 		"Writeback:    %8lu kB\n"
 		"AnonPages:    %8lu kB\n"
 		"Mapped:       %8lu kB\n"
+		"Mlock:        %8lu KB\n"
 		"Slab:         %8lu kB\n"
 		"SReclaimable: %8lu kB\n"
 		"SUnreclaim:   %8lu kB\n"
@@ -196,6 +197,7 @@ static int meminfo_read_proc(char *page,
 		K(global_page_state(NR_WRITEBACK)),
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
+		K(global_page_state(NR_MLOCK)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +
 				global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE)),
Index: current/mm/vmstat.c
===================================================================
--- current.orig/mm/vmstat.c	2007-02-02 16:42:51.000000000 -0800
+++ current/mm/vmstat.c	2007-02-02 16:43:28.000000000 -0800
@@ -439,6 +439,7 @@ static const char * const vmstat_text[] 
 	"nr_file_pages",
 	"nr_dirty",
 	"nr_writeback",
+	"nr_mlock",
 	"nr_slab_reclaimable",
 	"nr_slab_unreclaimable",
 	"nr_page_table_pages",
Index: current/include/linux/page-flags.h
===================================================================
--- current.orig/include/linux/page-flags.h	2007-02-02 16:42:51.000000000 -0800
+++ current/include/linux/page-flags.h	2007-02-02 16:43:28.000000000 -0800
@@ -93,6 +93,7 @@
 
 #define PG_readahead		20	/* Reminder to do read-ahead */
 
+#define PG_mlocked		21	/* Page is mlocked */
 
 #if (BITS_PER_LONG > 32)
 /*
@@ -235,6 +236,20 @@ static inline void SetPageUptodate(struc
 #define SetPageReadahead(page)	set_bit(PG_readahead, &(page)->flags)
 #define ClearPageReadahead(page) clear_bit(PG_readahead, &(page)->flags)
 
+#define PageMlocked(page)		\
+		test_bit(PG_mlocked, &(page)->flags)
+#define SetPageMlocked(page)		\
+		set_bit(PG_mlocked, &(page)->flags)
+#define TestSetPageMlocked(page)		\
+		test_and_set_bit(PG_mlocked, &(page)->flags)
+#define TestClearPageMlocked(page)		\
+		test_and_clear_bit(PG_mlocked, &(page)->flags)
+#define ClearPageMlocked(page)		\
+		clear_bit(PG_mlocked, &(page)->flags)
+#define TestClearPageMlocked(page)	\
+		test_and_clear_bit(PG_mlocked, &(page)->flags)
+
+
 struct page;	/* forward declaration */
 
 extern void cancel_dirty_page(struct page *page, unsigned int account_size);
Index: current/include/linux/swap.h
===================================================================
--- current.orig/include/linux/swap.h	2007-02-02 16:42:51.000000000 -0800
+++ current/include/linux/swap.h	2007-02-02 21:11:27.000000000 -0800
@@ -187,6 +187,9 @@ extern void lru_add_drain(void);
 extern int lru_add_drain_all(void);
 extern int rotate_reclaimable_page(struct page *page);
 extern void swap_setup(void);
+extern void lru_release(struct page *page);
+extern void mlock_remove(struct page *page, struct vm_area_struct *vma);
+extern void mlock_add(struct page *page, struct vm_area_struct *vma);
 
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zone **zones, int order,
Index: current/mm/swap.c
===================================================================
--- current.orig/mm/swap.c	2007-02-02 16:42:51.000000000 -0800
+++ current/mm/swap.c	2007-02-02 16:43:28.000000000 -0800
@@ -36,10 +36,9 @@
 int page_cluster;
 
 /*
- * This path almost never happens for VM activity - pages are normally
- * freed via pagevecs.  But it gets used by networking.
+ * Release a page from the LRU. Needed by mlock.
  */
-static void fastcall __page_cache_release(struct page *page)
+void lru_release(struct page *page)
 {
 	if (PageLRU(page)) {
 		unsigned long flags;
@@ -51,6 +50,15 @@ static void fastcall __page_cache_releas
 		del_page_from_lru(zone, page);
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
+}
+
+/*
+ * This path almost never happens for VM activity - pages are normally
+ * freed via pagevecs.  But it gets used by networking.
+ */
+static void fastcall __page_cache_release(struct page *page)
+{
+	lru_release(page);
 	free_hot_page(page);
 }
 
Index: current/mm/mlock.c
===================================================================
--- current.orig/mm/mlock.c	2007-02-02 16:42:51.000000000 -0800
+++ current/mm/mlock.c	2007-02-02 21:22:15.000000000 -0800
@@ -10,7 +10,185 @@
 #include <linux/mm.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
+#include <linux/rmap.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
 
+static int mlock_check_vmas(struct page *page, struct vm_area_struct *vma);
+
+/*
+ * Add a new mlocked reference to a page. If we are the first needing
+ * the page mlocked then update counters.
+ */
+static int __mlock_add(struct page *page)
+{
+	int rc = !TestSetPageMlocked(page);
+
+	if (rc)
+		inc_zone_page_state(page, NR_MLOCK);
+	return rc;
+}
+
+/*
+ * Remove a mlocked reference to a page.
+ */
+static int __mlock_remove(struct page *page, struct vm_area_struct *vma)
+{
+	int rc = TestClearPageMlocked(page);
+
+	if (rc) {
+		dec_zone_page_state(page, NR_MLOCK);
+
+		/*
+		 * Set the mlocked bit again if any vma still
+		 * has VM_LOCKED set
+		 */
+		mlock_check_vmas(page, vma);
+	}
+	return PageMlocked(page);
+}
+
+void mlock_add(struct page *page, struct vm_area_struct *vma)
+{
+	/*
+	 * Unconditionally move the page off the LRU.
+	 * Note that we may fail to remove the page from
+	 * the LRU if some other function already took off the page.
+	 * That function may return the page to the LRU again. At some
+	 * point isolate_lru_pages() will encounter the page
+	 * and take it off the LRU for good.
+	 */
+	lru_release(page);
+	__mlock_add(page);
+}
+
+/*
+ * Remove a mlocked reference to a page.
+ */
+void mlock_remove(struct page *page, struct vm_area_struct *vma)
+{
+	/*
+	 * Pin page so that the page cannot vanish from under us
+	 * via reclaim, page migration etc.
+	 */
+	get_page(page);
+
+	/*
+	 * Safe to drop PageMlocked since the page is pinned in a
+	 * different way.
+	 */
+	if (!__mlock_remove(page, vma))
+		lru_cache_add_active(page);
+	put_page(page);
+}
+
+/*
+ * Check if the page is mapped by a mlocked vma. If so set PageMlocked.
+ */
+static int page_in_mlocked_vma(struct vm_area_struct *vma, struct page *page)
+{
+	struct mm_struct *mm = vma->vm_mm;
+ 	pgd_t *pgd;
+ 	pud_t *pud;
+ 	pmd_t *pmd;
+	pte_t *ptep, pte;
+ 	spinlock_t *ptl;
+	unsigned long addr = page_address_in_vma(page, vma);
+	int rc = 0;
+
+	if (addr == -EFAULT ||
+			PageReserved(page) ||
+			!PageMlocked(page))
+		return 0;
+
+ 	pgd = pgd_offset(mm, addr);
+	if (!pgd_present(*pgd))
+                return 0;
+
+	pud = pud_offset(pgd, addr);
+	if (!pud_present(*pud))
+                return 0;
+
+	pmd = pmd_offset(pud, addr);
+	if (!pmd_present(*pmd))
+		return 0;
+
+	ptep = pte_offset_map(pmd, addr);
+ 	ptl = pte_lockptr(mm, pmd);
+ 	spin_lock(ptl);
+	pte = *ptep;
+	if (!is_swap_pte(pte) &&
+		page == vm_normal_page(vma, addr, pte) &&
+		(vma->vm_flags & VM_LOCKED)) {
+			if (__mlock_add(page))
+				rc = 1;
+		}
+	pte_unmap_unlock(ptep, ptl);
+	return rc;
+}
+
+static int mlock_check_file(struct page *page, struct vm_area_struct *v)
+{
+	struct vm_area_struct *vma;
+	struct address_space *mapping = page_mapping(page);
+	struct prio_tree_iter iter;
+	pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	int rc = 0;
+
+	if (!mapping)
+		return 0;
+
+	spin_lock(&mapping->i_mmap_lock);
+
+	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
+		if (v != vma && (vma->vm_flags & VM_LOCKED) &&
+				page_in_mlocked_vma(vma, page)) {
+					rc = 1;
+					break;
+				}
+
+	spin_unlock(&mapping->i_mmap_lock);
+	return rc;
+}
+
+static int mlock_check_anon(struct page *page, struct vm_area_struct *v)
+{
+	struct anon_vma *anon_vma;
+	struct vm_area_struct *vma;
+	unsigned long mapping;
+	int rc = 0;
+
+	mapping = (unsigned long)page->mapping;
+
+	if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
+		return 0;
+
+	/*
+	 * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
+	 */
+	anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
+	spin_lock(&anon_vma->lock);
+
+	list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
+		if (v != vma && (vma->vm_flags & VM_LOCKED) &&
+				page_in_mlocked_vma(vma, page)) {
+					rc = 1;
+					break;
+				}
+	spin_unlock(&anon_vma->lock);
+	return rc;
+}
+
+/*
+ * Check for remaining vmas with VM_LOCKED set
+ */
+int mlock_check_vmas(struct page *page, struct vm_area_struct *vma)
+{
+	if (PageAnon(page))
+		return mlock_check_anon(page, vma);
+	else
+		return mlock_check_file(page, vma);
+}
 
 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	unsigned long start, unsigned long end, unsigned int newflags)
@@ -19,6 +197,7 @@ static int mlock_fixup(struct vm_area_st
 	pgoff_t pgoff;
 	int pages;
 	int ret = 0;
+	unsigned long addr;
 
 	if (newflags == vma->vm_flags) {
 		*prev = vma;
@@ -63,8 +242,39 @@ success:
 		pages = -pages;
 		if (!(newflags & VM_IO))
 			ret = make_pages_present(start, end);
+	} else {
+		/*
+		 * We are clearing VM_LOCKED. Insure that all PageMlocked
+		 * is reset on all pages where we need it.
+		 */
+		for (addr = start; addr < end; addr += PAGE_SIZE) {
+			/*
+			 * No need to get a page reference. mmap_sem writelock
+			 * is held and the pages of interest are not on
+			 * the LRU
+			 */
+			struct page *page = follow_page(vma, start, 0);
+
+			if (page && PageMlocked(page))
+				mlock_remove(page, vma);
+			cond_resched();
+		}
 	}
 
+#if 0	/* We may need this to only mlock current pages */
+		/*
+		 * We are setting VM_LOCKED for current pages.
+		 * are changed to !PageMlocked if necessary.
+		 */
+		for (addr = start; addr < end; addr += PAGE_SIZE) {
+			struct page *page = follow_page(vma, start, 0);
+
+			if (page && !PageMlocked(page))
+				mlock_add(page, vma);
+			cond_resched();
+		}
+#endif
+
 	mm->locked_vm -= pages;
 out:
 	if (ret == -ENOMEM)
Index: current/include/linux/rmap.h
===================================================================
--- current.orig/include/linux/rmap.h	2007-02-02 16:42:51.000000000 -0800
+++ current/include/linux/rmap.h	2007-02-02 16:43:28.000000000 -0800
@@ -128,6 +128,11 @@ static inline int page_mkclean(struct pa
 
 #endif	/* CONFIG_MMU */
 
+static inline int is_swap_pte(pte_t pte)
+{
+	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+}
+
 /*
  * Return values of try_to_unmap
  */
Index: current/mm/migrate.c
===================================================================
--- current.orig/mm/migrate.c	2007-02-02 16:42:51.000000000 -0800
+++ current/mm/migrate.c	2007-02-02 16:43:28.000000000 -0800
@@ -115,11 +115,6 @@ int putback_lru_pages(struct list_head *
 	return count;
 }
 
-static inline int is_swap_pte(pte_t pte)
-{
-	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
-}
-
 /*
  * Restore a potential migration pte to a working pte entry
  */
Index: current/include/linux/mm.h
===================================================================
--- current.orig/include/linux/mm.h	2007-02-02 16:42:51.000000000 -0800
+++ current/include/linux/mm.h	2007-02-02 16:43:28.000000000 -0800
@@ -1178,6 +1178,7 @@ struct page *follow_page(struct vm_area_
 #define FOLL_TOUCH	0x02	/* mark page accessed */
 #define FOLL_GET	0x04	/* do get_page on page */
 #define FOLL_ANON	0x08	/* give ZERO_PAGE if no pgtable */
+#define FOLL_MLOCK	0x10	/* If vma is VM_LOCK make page PageMlocked */
 
 #ifdef CONFIG_PROC_FS
 void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long);
Index: current/mm/vmscan.c
===================================================================
--- current.orig/mm/vmscan.c	2007-02-02 16:42:51.000000000 -0800
+++ current/mm/vmscan.c	2007-02-02 17:02:48.000000000 -0800
@@ -693,6 +693,16 @@ static unsigned long isolate_lru_pages(u
 			BUG();
 		}
 
+		if (PageMlocked(page)) {
+			/*
+			 * May happen due to pages being off the LRU while they
+			 * were mlocked. Take them off now.
+			 */
+			printk(KERN_INFO "Found mlocked page on LRU\n");
+			list_del(&page->lru);
+			put_page(page);
+			continue;
+		}
 		if (!order)
 			continue;
 

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03  6:20 [RFC] Tracking mlocked pages and moving them off the LRU Christoph Lameter
@ 2007-02-03  8:53 ` Andrew Morton
  2007-02-03 17:56   ` Christoph Lameter
  2007-02-03 22:58   ` Nigel Cunningham
  2007-02-03 10:16 ` Christoph Hellwig
  2007-02-03 22:56 ` Nigel Cunningham
  2 siblings, 2 replies; 26+ messages in thread
From: Andrew Morton @ 2007-02-03  8:53 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki, Rik van Riel

On Fri, 2 Feb 2007 22:20:12 -0800 (PST) Christoph Lameter <clameter@sgi.com> wrote:

> This is a new variation on the earlier RFC for tracking mlocked pages.
> We now mark a mlocked page with a bit in the page flags and remove
> them from the LRU. Pages get moved back when no vma that references
> the page has VM_LOCKED set anymore.
> 
> This means that vmscan no longer uselessly cycles over large amounts
> of mlocked memory should someone attempt to mlock large amounts of
> memory (may even result in a livelock on large systems).
> 
> Synchronization is build around state changes of the PageMlocked bit.
> The NR_MLOCK counter is incremented and decremented based on
> state transitions of PageMlocked. So the count is accurate.

I wonder if it can be simpler.  Make two changes:

a) If the scanner encounters an mlocked page on the LRU, take it off.

b) munlock() adds all affected pages to the LRU.

And that's it.  Simpler, solves the uselessly-scan-lots-of-mlocked-pages
problem (which is the sole requirement according to your description) and
doesn't consume a page flag.  Optional (and arguable) extension: scan the
vmas during munmap, don't add page to LRU if it's still mlocked.

Why _does_ your patch add a new page flag?  That info is available via a
vma scan.

> There is still some unfinished business:
> 
> 1. We use the 21st page flag and we only have 20 on 32 bit NUMA platforms.

Ow.  How were you thinking of fixing that?

> 2. Since mlocked pages are now off the LRU page migration will no longer
>    move them.

Ow.  That could be a right pain when we get around to using migration for
memory-unplug?


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03  6:20 [RFC] Tracking mlocked pages and moving them off the LRU Christoph Lameter
  2007-02-03  8:53 ` Andrew Morton
@ 2007-02-03 10:16 ` Christoph Hellwig
  2007-02-03 15:35   ` Martin J. Bligh
  2007-02-03 17:33   ` Christoph Lameter
  2007-02-03 22:56 ` Nigel Cunningham
  2 siblings, 2 replies; 26+ messages in thread
From: Christoph Hellwig @ 2007-02-03 10:16 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, akpm, Nick Piggin, KAMEZAWA Hiroyuki, Rik van Riel

On Fri, Feb 02, 2007 at 10:20:12PM -0800, Christoph Lameter wrote:
> This is a new variation on the earlier RFC for tracking mlocked pages.
> We now mark a mlocked page with a bit in the page flags and remove
> them from the LRU. Pages get moved back when no vma that references
> the page has VM_LOCKED set anymore.
> 
> This means that vmscan no longer uselessly cycles over large amounts
> of mlocked memory should someone attempt to mlock large amounts of
> memory (may even result in a livelock on large systems).
> 
> Synchronization is build around state changes of the PageMlocked bit.
> The NR_MLOCK counter is incremented and decremented based on
> state transitions of PageMlocked. So the count is accurate.
> 
> There is still some unfinished business:
> 
> 1. We use the 21st page flag and we only have 20 on 32 bit NUMA platforms.
> 
> 2. Since mlocked pages are now off the LRU page migration will no longer
>    move them.
> 
> 3. Use NR_MLOCK to tune various VM behaviors so that the VM does not 
>    longer fall due to too many mlocked pages in certain areas.

This patch seems to not handle the cases where more than one process mlocks
a page and you really need a pincount in the page to not release it before
all processes have munlock it or died.  I did a similar patch a while
ago and tried to handle it by overloading the lru lists pointers with
a pincount, but at some point I gave up because I couldn't get that part
right.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03 10:16 ` Christoph Hellwig
@ 2007-02-03 15:35   ` Martin J. Bligh
  2007-02-03 17:59     ` Christoph Lameter
  2007-02-03 17:33   ` Christoph Lameter
  1 sibling, 1 reply; 26+ messages in thread
From: Martin J. Bligh @ 2007-02-03 15:35 UTC (permalink / raw)
  To: Christoph Hellwig, Christoph Lameter, linux-kernel, akpm,
	Nick Piggin, KAMEZAWA Hiroyuki, Rik van Riel

Christoph Hellwig wrote:
> On Fri, Feb 02, 2007 at 10:20:12PM -0800, Christoph Lameter wrote:
>> This is a new variation on the earlier RFC for tracking mlocked pages.
>> We now mark a mlocked page with a bit in the page flags and remove
>> them from the LRU. Pages get moved back when no vma that references
>> the page has VM_LOCKED set anymore.
>>
>> This means that vmscan no longer uselessly cycles over large amounts
>> of mlocked memory should someone attempt to mlock large amounts of
>> memory (may even result in a livelock on large systems).
>>
>> Synchronization is build around state changes of the PageMlocked bit.
>> The NR_MLOCK counter is incremented and decremented based on
>> state transitions of PageMlocked. So the count is accurate.
>>
>> There is still some unfinished business:
>>
>> 1. We use the 21st page flag and we only have 20 on 32 bit NUMA platforms.
>>
>> 2. Since mlocked pages are now off the LRU page migration will no longer
>>    move them.
>>
>> 3. Use NR_MLOCK to tune various VM behaviors so that the VM does not 
>>    longer fall due to too many mlocked pages in certain areas.
> 
> This patch seems to not handle the cases where more than one process mlocks
> a page and you really need a pincount in the page to not release it before
> all processes have munlock it or died.  I did a similar patch a while
> ago and tried to handle it by overloading the lru lists pointers with
> a pincount, but at some point I gave up because I couldn't get that part
> right.

Doesn't matter - you can just do it lazily. If you find a page that is
locked, move it to the locked list. when unlocking a page you *always*
move it back to the normal list. If someone else is still locking it,
we'll move it back to the lock list on next reclaim pass.

I have a half-finished patch from 6 months ago that does this, but never
found time to complete it ;-(

M.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03 10:16 ` Christoph Hellwig
  2007-02-03 15:35   ` Martin J. Bligh
@ 2007-02-03 17:33   ` Christoph Lameter
  1 sibling, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-02-03 17:33 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: linux-kernel, akpm, Nick Piggin, KAMEZAWA Hiroyuki, Rik van Riel

On Sat, 3 Feb 2007, Christoph Hellwig wrote:

> This patch seems to not handle the cases where more than one process mlocks
> a page and you really need a pincount in the page to not release it before
> all processes have munlock it or died.  I did a similar patch a while
> ago and tried to handle it by overloading the lru lists pointers with
> a pincount, but at some point I gave up because I couldn't get that part
> right.

It doees handle that case. Before PageMlocked is cleared for a page we 
check for vmas referencing the page that have VM_LOCKED set. That logic 
makes the patch so big.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03  8:53 ` Andrew Morton
@ 2007-02-03 17:56   ` Christoph Lameter
  2007-02-03 18:04     ` Arjan van de Ven
  2007-02-03 22:58   ` Nigel Cunningham
  1 sibling, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2007-02-03 17:56 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki, Rik van Riel

On Sat, 3 Feb 2007, Andrew Morton wrote:

> I wonder if it can be simpler.  Make two changes:

Would be great if this could get simpler.

> a) If the scanner encounters an mlocked page on the LRU, take it off.

The current patch takes them off when mlock is set (which may not work 
since the page may be off the LRU) and then has the scanner taking them 
off. We could just remove the early one but what would this bring us?

> b) munlock() adds all affected pages to the LRU.

Hmmm... You mean without checking all the vmas of a page for VM_LOCKED? So they 
are going to be removed again on the next pass? Ok. I see that makes it 
simpler but it requires another reclaim scan.

> doesn't consume a page flag.  Optional (and arguable) extension: scan the
> vmas during munmap, don't add page to LRU if it's still mlocked.
> 
> Why _does_ your patch add a new page flag?  That info is available via a
> vma scan.

The page flag allows a clean state transition of a page and accurate 
keeping of statistics for MLOCKed pages. There were objections against the 
fuzzy counting in the earlier incarnation and it was proposed that a page 
flag be introduced. Without the flag we cannot know that the page is 
already mapped by a VM_LOCKED vma without scanning over all vmas 
referencing the page.

> > 1. We use the 21st page flag and we only have 20 on 32 bit NUMA platforms.
> 
> Ow.  How were you thinking of fixing that?

I thought someone else could come up with something. Maybe the one that 
told me to use another page flag?

> > 2. Since mlocked pages are now off the LRU page migration will no longer
> >    move them.
> 
> Ow.  That could be a right pain when we get around to using migration for
> memory-unplug?

We need to expand page migration anyways to allow the gerneral migration 
of non-LRU pages.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03 15:35   ` Martin J. Bligh
@ 2007-02-03 17:59     ` Christoph Lameter
  0 siblings, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-02-03 17:59 UTC (permalink / raw)
  To: Martin J. Bligh
  Cc: Christoph Hellwig, linux-kernel, akpm, Nick Piggin,
	KAMEZAWA Hiroyuki, Rik van Riel

On Sat, 3 Feb 2007, Martin J. Bligh wrote:

> Doesn't matter - you can just do it lazily. If you find a page that is
> locked, move it to the locked list. when unlocking a page you *always*
> move it back to the normal list. If someone else is still locking it,
> we'll move it back to the lock list on next reclaim pass.

Sounds similar to what Andrew is proposing.
 
> I have a half-finished patch from 6 months ago that does this, but never
> found time to complete it ;-(

Could I see that patch? Could have some good approaches in there that 
would be useful.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03 17:56   ` Christoph Lameter
@ 2007-02-03 18:04     ` Arjan van de Ven
  2007-02-03 18:09       ` Christoph Lameter
                         ` (2 more replies)
  0 siblings, 3 replies; 26+ messages in thread
From: Arjan van de Ven @ 2007-02-03 18:04 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

On Sat, 2007-02-03 at 09:56 -0800, Christoph Lameter wrote:
> On Sat, 3 Feb 2007, Andrew Morton wrote:
> 
> > I wonder if it can be simpler.  Make two changes:
> 
> Would be great if this could get simpler.
> 
> > a) If the scanner encounters an mlocked page on the LRU, take it off.
> 
> The current patch takes them off when mlock is set (which may not work 
> since the page may be off the LRU) and then has the scanner taking them 
> off. We could just remove the early one but what would this bring us?

it's simpler. You only move them off when you encounter them during a
scan. No walking early etc etc. Only do work when there is an actual
situation where you do scan.

> 
> > b) munlock() adds all affected pages to the LRU.
> 
> Hmmm... You mean without checking all the vmas of a page for VM_LOCKED? So they 
> are going to be removed again on the next pass? Ok. I see that makes it 
> simpler but it requires another reclaim scan.

Well.. That's the point! Only IF there is a reclaim scan do you move
them out again. The fact that these pages are on the list isn't a
problem. The fact that you keep encountering them over and over again
during *scanning* is. So Andrews suggestion makes them go away in the
situations that actually matter

> The page flag allows a clean state transition of a page and accurate 
> keeping of statistics for MLOCKed pages. There were objections against the 
> fuzzy counting in the earlier incarnation and it was proposed that a page 
> flag be introduced. Without the flag we cannot know that the page is 
> already mapped by a VM_LOCKED vma without scanning over all vmas 
> referencing the page.

who cares though.. just do it lazy.



^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03 18:04     ` Arjan van de Ven
@ 2007-02-03 18:09       ` Christoph Lameter
  2007-02-03 18:55       ` Christoph Lameter
  2007-02-03 19:03       ` Christoph Lameter
  2 siblings, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-02-03 18:09 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andrew Morton, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

On Sat, 3 Feb 2007, Arjan van de Ven wrote:

> it's simpler. You only move them off when you encounter them during a
> scan. No walking early etc etc. Only do work when there is an actual
> situation where you do scan.

Yes but then you do not have an accurate count of MLOCKed pages. We will 
only have that count after reclaim has run and removed the pages from the 
LRU. That could take some time (or may never happen the way some people 
handle memory around here).

> Well.. That's the point! Only IF there is a reclaim scan do you move
> them out again. The fact that these pages are on the list isn't a
> problem. The fact that you keep encountering them over and over again
> during *scanning* is. So Andrews suggestion makes them go away in the
> situations that actually matter

I see... Okay that could be simple to address.

> who cares though.. just do it lazy.

The other issue that the patch should address is to allow the VM to have 
statistics that show the actual amount of MLOCKed pages. With the pure 
scanner based approach this would no longer work.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03 18:04     ` Arjan van de Ven
  2007-02-03 18:09       ` Christoph Lameter
@ 2007-02-03 18:55       ` Christoph Lameter
  2007-02-03 19:03       ` Christoph Lameter
  2 siblings, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-02-03 18:55 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andrew Morton, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

On Sat, 3 Feb 2007, Arjan van de Ven wrote:

> Well.. That's the point! Only IF there is a reclaim scan do you move
> them out again. The fact that these pages are on the list isn't a
> problem. The fact that you keep encountering them over and over again
> during *scanning* is. So Andrews suggestion makes them go away in the
> situations that actually matter

In order to get this to work try_to_unmap() must be able to distinguish 
betwen failures due to MLOCK and otherwise. So I guess we need this patch:


[PATCH] Make try_to_unmap() return SWAP_MLOCK for mlocked pages

Modify try_to_unmap() so that we can distinguish between failing to
unmap because a page is mlocked from other causes.

Signed-off-by: Christoph Lameter <clameter@sgi.com>

Index: current/include/linux/rmap.h
===================================================================
--- current.orig/include/linux/rmap.h	2007-02-03 10:24:47.000000000 -0800
+++ current/include/linux/rmap.h	2007-02-03 10:25:08.000000000 -0800
@@ -134,5 +134,6 @@ static inline int page_mkclean(struct pa
 #define SWAP_SUCCESS	0
 #define SWAP_AGAIN	1
 #define SWAP_FAIL	2
+#define SWAP_MLOCK	3
 
 #endif	/* _LINUX_RMAP_H */
Index: current/mm/rmap.c
===================================================================
--- current.orig/mm/rmap.c	2007-02-03 10:24:47.000000000 -0800
+++ current/mm/rmap.c	2007-02-03 10:25:08.000000000 -0800
@@ -631,10 +631,16 @@ static int try_to_unmap_one(struct page 
 	 * If it's recently referenced (perhaps page_referenced
 	 * skipped over this mm) then we should reactivate it.
 	 */
-	if (!migration && ((vma->vm_flags & VM_LOCKED) ||
-			(ptep_clear_flush_young(vma, address, pte)))) {
-		ret = SWAP_FAIL;
-		goto out_unmap;
+	if (!migration) {
+		if (vma->vm_flags & VM_LOCKED) {
+			ret = SWAP_MLOCK;
+			goto out_unmap;
+		}
+
+		if (ptep_clear_flush_young(vma, address, pte)) {
+			ret = SWAP_FAIL;
+			goto out_unmap;
+		}
 	}
 
 	/* Nuke the page table entry. */
@@ -799,7 +805,8 @@ static int try_to_unmap_anon(struct page
 
 	list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
 		ret = try_to_unmap_one(page, vma, migration);
-		if (ret == SWAP_FAIL || !page_mapped(page))
+		if (ret == SWAP_FAIL || ret == SWAP_MLOCK ||
+				!page_mapped(page))
 			break;
 	}
 	spin_unlock(&anon_vma->lock);
@@ -830,7 +837,8 @@ static int try_to_unmap_file(struct page
 	spin_lock(&mapping->i_mmap_lock);
 	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
 		ret = try_to_unmap_one(page, vma, migration);
-		if (ret == SWAP_FAIL || !page_mapped(page))
+		if (ret == SWAP_FAIL || ret == SWAP_MLOCK ||
+				!page_mapped(page))
 			goto out;
 	}
 
@@ -913,6 +921,7 @@ out:
  * SWAP_SUCCESS	- we succeeded in removing all mappings
  * SWAP_AGAIN	- we missed a mapping, try again later
  * SWAP_FAIL	- the page is unswappable
+ * SWAP_MLOCK	- the page is under mlock()
  */
 int try_to_unmap(struct page *page, int migration)
 {
Index: current/mm/vmscan.c
===================================================================
--- current.orig/mm/vmscan.c	2007-02-03 10:25:00.000000000 -0800
+++ current/mm/vmscan.c	2007-02-03 10:25:12.000000000 -0800
@@ -516,6 +516,7 @@ static unsigned long shrink_page_list(st
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page, 0)) {
 			case SWAP_FAIL:
+			case SWAP_MLOCK:
 				goto activate_locked;
 			case SWAP_AGAIN:
 				goto keep_locked;

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03 18:04     ` Arjan van de Ven
  2007-02-03 18:09       ` Christoph Lameter
  2007-02-03 18:55       ` Christoph Lameter
@ 2007-02-03 19:03       ` Christoph Lameter
  2007-02-04  1:22         ` Andrew Morton
  2 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2007-02-03 19:03 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andrew Morton, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

Here is the second piece removing mlock pages off the LRU during scanning. 
I tried moving them to a separate list but then we run into issues with
locking. We do not need ithe list though since we will encounter the
page again anyways during zap_pte_range.

However, in zap_pte_range we then run into another problem. Multiple 
zap_pte_ranges may handle the same page and without a page flag and 
scanning all the vmas we cannot determine if the page should or should not 
be moved back to the LRU. As a result this patch may decrement NR_MLOCK 
too much so that is goes below zero. Any ideas on how to fix this without 
a page flag and a scan over vmas?

Plus there is the issue of NR_MLOCK only being updated when we are 
reclaiming and when we may already be in trouble. An app may mlock huge 
amounts of memory and NR_MLOCK will stay low. If memory gets too low then
NR_MLOCKED is suddenly become accurate and the VM is likely undergoing a 
shock from that discovery (should we actually use NR_MLOCK elsewhere to 
determine memory management behavior). Hopefully we will not fall over 
then.

Maybe the best would be to handle the counter separately via a page flag? 
But then we go back to ugly vma scans. Yuck.

Index: current/mm/vmscan.c
===================================================================
--- current.orig/mm/vmscan.c	2007-02-03 10:53:15.000000000 -0800
+++ current/mm/vmscan.c	2007-02-03 10:53:25.000000000 -0800
@@ -516,10 +516,11 @@ static unsigned long shrink_page_list(st
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page, 0)) {
 			case SWAP_FAIL:
-			case SWAP_MLOCK:
 				goto activate_locked;
 			case SWAP_AGAIN:
 				goto keep_locked;
+			case SWAP_MLOCK:
+				goto mlocked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
@@ -594,6 +595,11 @@ free_it:
 			__pagevec_release_nonlru(&freed_pvec);
 		continue;
 
+mlocked:
+		unlock_page(page);
+		__inc_zone_page_state(page, NR_MLOCK);
+		continue;
+
 activate_locked:
 		SetPageActive(page);
 		pgactivate++;
Index: current/mm/memory.c
===================================================================
--- current.orig/mm/memory.c	2007-02-03 10:52:37.000000000 -0800
+++ current/mm/memory.c	2007-02-03 10:53:25.000000000 -0800
@@ -682,6 +682,10 @@ static unsigned long zap_pte_range(struc
 				file_rss--;
 			}
 			page_remove_rmap(page, vma);
+			if (vma->vm_flags & VM_LOCKED) {
+				__dec_zone_page_state(page, NR_MLOCK);
+				lru_cache_add_active(page);
+			}
 			tlb_remove_page(tlb, page);
 			continue;
 		}
Index: current/drivers/base/node.c
===================================================================
--- current.orig/drivers/base/node.c	2007-02-03 10:52:35.000000000 -0800
+++ current/drivers/base/node.c	2007-02-03 10:53:25.000000000 -0800
@@ -60,6 +60,7 @@ static ssize_t node_read_meminfo(struct 
 		       "Node %d FilePages:    %8lu kB\n"
 		       "Node %d Mapped:       %8lu kB\n"
 		       "Node %d AnonPages:    %8lu kB\n"
+		       "Node %d Mlock:        %8lu KB\n"
 		       "Node %d PageTables:   %8lu kB\n"
 		       "Node %d NFS_Unstable: %8lu kB\n"
 		       "Node %d Bounce:       %8lu kB\n"
@@ -82,6 +83,7 @@ static ssize_t node_read_meminfo(struct 
 		       nid, K(node_page_state(nid, NR_FILE_PAGES)),
 		       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
 		       nid, K(node_page_state(nid, NR_ANON_PAGES)),
+		       nid, K(node_page_state(nid, NR_MLOCK)),
 		       nid, K(node_page_state(nid, NR_PAGETABLE)),
 		       nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
 		       nid, K(node_page_state(nid, NR_BOUNCE)),
Index: current/fs/proc/proc_misc.c
===================================================================
--- current.orig/fs/proc/proc_misc.c	2007-02-03 10:52:36.000000000 -0800
+++ current/fs/proc/proc_misc.c	2007-02-03 10:53:25.000000000 -0800
@@ -166,6 +166,7 @@ static int meminfo_read_proc(char *page,
 		"Writeback:    %8lu kB\n"
 		"AnonPages:    %8lu kB\n"
 		"Mapped:       %8lu kB\n"
+		"Mlock:        %8lu KB\n"
 		"Slab:         %8lu kB\n"
 		"SReclaimable: %8lu kB\n"
 		"SUnreclaim:   %8lu kB\n"
@@ -196,6 +197,7 @@ static int meminfo_read_proc(char *page,
 		K(global_page_state(NR_WRITEBACK)),
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
+		K(global_page_state(NR_MLOCK)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +
 				global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE)),
Index: current/include/linux/mmzone.h
===================================================================
--- current.orig/include/linux/mmzone.h	2007-02-03 10:52:35.000000000 -0800
+++ current/include/linux/mmzone.h	2007-02-03 10:53:25.000000000 -0800
@@ -58,6 +58,7 @@ enum zone_stat_item {
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
 	/* Second 128 byte cacheline */
+	NR_MLOCK,		/* Mlocked pages */
 	NR_SLAB_RECLAIMABLE,
 	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,		/* used for pagetables */
Index: current/mm/vmstat.c
===================================================================
--- current.orig/mm/vmstat.c	2007-02-03 10:52:36.000000000 -0800
+++ current/mm/vmstat.c	2007-02-03 10:53:25.000000000 -0800
@@ -439,6 +439,7 @@ static const char * const vmstat_text[] 
 	"nr_file_pages",
 	"nr_dirty",
 	"nr_writeback",
+	"nr_mlock",
 	"nr_slab_reclaimable",
 	"nr_slab_unreclaimable",
 	"nr_page_table_pages",

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03  6:20 [RFC] Tracking mlocked pages and moving them off the LRU Christoph Lameter
  2007-02-03  8:53 ` Andrew Morton
  2007-02-03 10:16 ` Christoph Hellwig
@ 2007-02-03 22:56 ` Nigel Cunningham
  2 siblings, 0 replies; 26+ messages in thread
From: Nigel Cunningham @ 2007-02-03 22:56 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: linux-kernel, akpm, Nick Piggin, KAMEZAWA Hiroyuki, Rik van Riel

Hi.

On Fri, 2007-02-02 at 22:20 -0800, Christoph Lameter wrote:
> 1. We use the 21st page flag and we only have 20 on 32 bit NUMA platforms.

If it will help, I now have an implementation of the dynamically
allocated pageflags code I've posted in the past that is NUMA aware.
It's not memory hotplug aware yet, but that can be fixed. I can see if I
can find time this week to address that and send it to you if it will
help.

Regards,

Nigel


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03  8:53 ` Andrew Morton
  2007-02-03 17:56   ` Christoph Lameter
@ 2007-02-03 22:58   ` Nigel Cunningham
  1 sibling, 0 replies; 26+ messages in thread
From: Nigel Cunningham @ 2007-02-03 22:58 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Christoph Lameter, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

Hi again.

On Sat, 2007-02-03 at 00:53 -0800, Andrew Morton wrote:
> > 1. We use the 21st page flag and we only have 20 on 32 bit NUMA platforms.
> 
> Ow.  How were you thinking of fixing that?

Oh, guess the dyn_pageflags patch is not needed then - the dangers of
replying before reading a whole thread :)

Nigel


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-03 19:03       ` Christoph Lameter
@ 2007-02-04  1:22         ` Andrew Morton
  2007-02-04  1:49           ` Christoph Lameter
  0 siblings, 1 reply; 26+ messages in thread
From: Andrew Morton @ 2007-02-04  1:22 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Arjan van de Ven, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

On Sat, 3 Feb 2007 11:03:59 -0800 (PST) Christoph Lameter <clameter@sgi.com> wrote:

> Here is the second piece removing mlock pages off the LRU during scanning. 
> I tried moving them to a separate list but then we run into issues with
> locking. We do not need ithe list though since we will encounter the
> page again anyways during zap_pte_range.
> 
> However, in zap_pte_range we then run into another problem. Multiple 
> zap_pte_ranges may handle the same page and without a page flag and 
> scanning all the vmas we cannot determine if the page should or should not 
> be moved back to the LRU. As a result this patch may decrement NR_MLOCK 
> too much so that is goes below zero. Any ideas on how to fix this without 
> a page flag and a scan over vmas?
> 
> Plus there is the issue of NR_MLOCK only being updated when we are 
> reclaiming and when we may already be in trouble. An app may mlock huge 
> amounts of memory and NR_MLOCK will stay low. If memory gets too low then
> NR_MLOCKED is suddenly become accurate and the VM is likely undergoing a 
> shock from that discovery (should we actually use NR_MLOCK elsewhere to 
> determine memory management behavior). Hopefully we will not fall over 
> then.

Do we actually need NR_MLOCK?  Page reclaim tends to care more about the
size of the LRUs and doesn't have much dependency on ->present_pages,
iirc.

I guess we could use NR_MLOCK for writeback threshold calculations, to
force writeback earlier if there's a lot of mlocked memory in the affected
zones.  But that code isn't zone-aware anyway, and we don't know how to make
it zone aware in any sane fashion and making it cpuset-aware isn't very
interesting or useful..

So..  Why do we want NR_MLOCK?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-04  1:22         ` Andrew Morton
@ 2007-02-04  1:49           ` Christoph Lameter
  2007-02-04  8:16             ` Arjan van de Ven
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2007-02-04  1:49 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Arjan van de Ven, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

On Sat, 3 Feb 2007, Andrew Morton wrote:

> Do we actually need NR_MLOCK?  Page reclaim tends to care more about the
> size of the LRUs and doesn't have much dependency on ->present_pages,

Yes, we'd be fine with general reclaim I think. But the calculation of the 
dirty ratio based on ZVCs would need it if we take the mlocked pages off. 
Otherwise we may have dirty ratios > 100%.

> I guess we could use NR_MLOCK for writeback threshold calculations, to
> force writeback earlier if there's a lot of mlocked memory in the affected
> zones.  But that code isn't zone-aware anyway, and we don't know how to make
> it zone aware in any sane fashion and making it cpuset-aware isn't very
> interesting or useful..

Exclusion or inclusion of NR_MLOCK number is straightforward for the dirty 
ratio calcuations. global_page_state(NR_MLOCK) f.e. would get us totals on 
mlocked pages per zone. node_page_state(NR_MLOCK) gives a node specific 
number of mlocked pages. The nice thing about ZVCs is that it allows
easy access to counts on different levels.

> So..  Why do we want NR_MLOCK?

Rik also had some uses in mind for allocation?

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-04  1:49           ` Christoph Lameter
@ 2007-02-04  8:16             ` Arjan van de Ven
  2007-02-05  6:45               ` Christoph Lameter
  2007-02-05  7:57               ` Christoph Lameter
  0 siblings, 2 replies; 26+ messages in thread
From: Arjan van de Ven @ 2007-02-04  8:16 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel


> Exclusion or inclusion of NR_MLOCK number is straightforward for the dirty 
> ratio calcuations. global_page_state(NR_MLOCK) f.e. would get us totals on 
> mlocked pages per zone. node_page_state(NR_MLOCK) gives a node specific 
> number of mlocked pages. The nice thing about ZVCs is that it allows
> easy access to counts on different levels.

however... mlocked pages still can be dirty, and want to be written back
at some point ;)

I can see the point of doing dirty ratio as percentage of the LRU size,
but in that case you don't need to track NR_MLOCK, only the total LRU
size. (And yes it'll be sometimes optimistic because not all mlock'd
pages are moved off the lru yet, but I doubt you'll have that as a
problem in practice)
-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-04  8:16             ` Arjan van de Ven
@ 2007-02-05  6:45               ` Christoph Lameter
  2007-02-06 22:05                 ` Nate Diller
  2007-02-05  7:57               ` Christoph Lameter
  1 sibling, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2007-02-05  6:45 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andrew Morton, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

On Sun, 4 Feb 2007, Arjan van de Ven wrote:

> 
> > Exclusion or inclusion of NR_MLOCK number is straightforward for the dirty 
> > ratio calcuations. global_page_state(NR_MLOCK) f.e. would get us totals on 
> > mlocked pages per zone. node_page_state(NR_MLOCK) gives a node specific 
> > number of mlocked pages. The nice thing about ZVCs is that it allows
> > easy access to counts on different levels.
> 
> however... mlocked pages still can be dirty, and want to be written back
> at some point ;)

Yes that is why we need to add them to the count of total pages.
 
> I can see the point of doing dirty ratio as percentage of the LRU size,
> but in that case you don't need to track NR_MLOCK, only the total LRU
> size. (And yes it'll be sometimes optimistic because not all mlock'd
> pages are moved off the lru yet, but I doubt you'll have that as a
> problem in practice)

The dirty ratio with the ZVCS would be

NR_DIRTY + NR_UNSTABLE_NFS
	/ 
NR_FREE_PAGES + NR_INACTIVE + NR_ACTIVE + NR_MLOCK


I think we need a PageMlocked after all for the delayed NR_MLOCK 
approach but it needs to have different semantics in order to make
it work. With the patch that I posted earlier we could actually return
a page to the LRU in zap_pte_range while something else is keeping
a page off the LRU (i.e. Page migration is occurring and suddenly the 
page is reclaimed since zap_pte_range put it back). So PageMlocked needs 
to be set in shrink_list() in order to clarify that the page was taken 
off the LRU lists due to it being mlocked and not for other reasons. 
zap_pte_range then needs to check for PageMlocked before putting the 
page onto the LRU.

If we do that then we can observer the state transitions and have an 
accurate count. The delayed accounting problem can probably be 
somewhat remedied by putting new mlocked pages not on the LRU and 
counting them directly. Page migration can simply clear the PageMlocked 
bit and then treat the page as if it was taken off the LRU.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-04  8:16             ` Arjan van de Ven
  2007-02-05  6:45               ` Christoph Lameter
@ 2007-02-05  7:57               ` Christoph Lameter
  2007-02-05  8:39                 ` Arjan van de Ven
  1 sibling, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2007-02-05  7:57 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andrew Morton, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

Hmmm.. I have had no time to test this one yet but I think this should 
work. It uses the delayed method and a new page flag PageMlocked() with 
different semantics. Fix for page migration is also included.

Patch avoids to put new anonymous mlocked pages on the LRU. Maybe the same 
could be done for new pagecache pages?

I still need a solution for the problem of not having enough page flag 
bits on i386 NUMA.


Index: current/mm/vmscan.c
===================================================================
--- current.orig/mm/vmscan.c	2007-02-03 10:53:15.000000000 -0800
+++ current/mm/vmscan.c	2007-02-04 22:59:01.000000000 -0800
@@ -516,10 +516,11 @@ static unsigned long shrink_page_list(st
 		if (page_mapped(page) && mapping) {
 			switch (try_to_unmap(page, 0)) {
 			case SWAP_FAIL:
-			case SWAP_MLOCK:
 				goto activate_locked;
 			case SWAP_AGAIN:
 				goto keep_locked;
+			case SWAP_MLOCK:
+				goto mlocked;
 			case SWAP_SUCCESS:
 				; /* try to free the page below */
 			}
@@ -594,6 +595,14 @@ free_it:
 			__pagevec_release_nonlru(&freed_pvec);
 		continue;
 
+mlocked:
+		ClearPageActive(page);
+		unlock_page(page);
+		__inc_zone_page_state(page, NR_MLOCK);
+		smp_wmb();
+		SetPageMlocked(page);
+		continue;
+
 activate_locked:
 		SetPageActive(page);
 		pgactivate++;
Index: current/mm/memory.c
===================================================================
--- current.orig/mm/memory.c	2007-02-03 10:52:37.000000000 -0800
+++ current/mm/memory.c	2007-02-04 23:48:36.000000000 -0800
@@ -682,6 +682,8 @@ static unsigned long zap_pte_range(struc
 				file_rss--;
 			}
 			page_remove_rmap(page, vma);
+			if (PageMlocked(page) && vma->vm_flags & VM_LOCKED)
+				lru_cache_add_mlock(page);
 			tlb_remove_page(tlb, page);
 			continue;
 		}
@@ -898,6 +900,25 @@ unsigned long zap_page_range(struct vm_a
 }
 
 /*
+ * Add a new anonymous page
+ */
+void anon_add(struct vm_area_struct *vma, struct page *page,
+				unsigned long address)
+{
+	inc_mm_counter(vma->vm_mm, anon_rss);
+	if (vma->vm_flags & VM_LOCKED) {
+		/*
+		 * Page is new and therefore not on the LRU
+		 * so we can directly mark it as mlocked
+		 */
+		SetPageMlocked(page);
+		inc_zone_page_state(page, NR_MLOCK);
+	} else
+		lru_cache_add_active(page);
+	page_add_new_anon_rmap(page, vma, address);
+}
+
+/*
  * Do a quick page-table lookup for a single page.
  */
 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
@@ -2101,9 +2122,7 @@ static int do_anonymous_page(struct mm_s
 		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 		if (!pte_none(*page_table))
 			goto release;
-		inc_mm_counter(mm, anon_rss);
-		lru_cache_add_active(page);
-		page_add_new_anon_rmap(page, vma, address);
+		anon_add(vma, page, address);
 	} else {
 		/* Map the ZERO_PAGE - vm_page_prot is readonly */
 		page = ZERO_PAGE(address);
@@ -2247,11 +2266,9 @@ retry:
 		if (write_access)
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 		set_pte_at(mm, address, page_table, entry);
-		if (anon) {
-			inc_mm_counter(mm, anon_rss);
-			lru_cache_add_active(new_page);
-			page_add_new_anon_rmap(new_page, vma, address);
-		} else {
+		if (anon)
+			anon_add(vma, new_page, address);
+		else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
 			if (write_access) {
Index: current/drivers/base/node.c
===================================================================
--- current.orig/drivers/base/node.c	2007-02-03 10:52:35.000000000 -0800
+++ current/drivers/base/node.c	2007-02-03 10:53:25.000000000 -0800
@@ -60,6 +60,7 @@ static ssize_t node_read_meminfo(struct 
 		       "Node %d FilePages:    %8lu kB\n"
 		       "Node %d Mapped:       %8lu kB\n"
 		       "Node %d AnonPages:    %8lu kB\n"
+		       "Node %d Mlock:        %8lu KB\n"
 		       "Node %d PageTables:   %8lu kB\n"
 		       "Node %d NFS_Unstable: %8lu kB\n"
 		       "Node %d Bounce:       %8lu kB\n"
@@ -82,6 +83,7 @@ static ssize_t node_read_meminfo(struct 
 		       nid, K(node_page_state(nid, NR_FILE_PAGES)),
 		       nid, K(node_page_state(nid, NR_FILE_MAPPED)),
 		       nid, K(node_page_state(nid, NR_ANON_PAGES)),
+		       nid, K(node_page_state(nid, NR_MLOCK)),
 		       nid, K(node_page_state(nid, NR_PAGETABLE)),
 		       nid, K(node_page_state(nid, NR_UNSTABLE_NFS)),
 		       nid, K(node_page_state(nid, NR_BOUNCE)),
Index: current/fs/proc/proc_misc.c
===================================================================
--- current.orig/fs/proc/proc_misc.c	2007-02-03 10:52:36.000000000 -0800
+++ current/fs/proc/proc_misc.c	2007-02-03 10:53:25.000000000 -0800
@@ -166,6 +166,7 @@ static int meminfo_read_proc(char *page,
 		"Writeback:    %8lu kB\n"
 		"AnonPages:    %8lu kB\n"
 		"Mapped:       %8lu kB\n"
+		"Mlock:        %8lu KB\n"
 		"Slab:         %8lu kB\n"
 		"SReclaimable: %8lu kB\n"
 		"SUnreclaim:   %8lu kB\n"
@@ -196,6 +197,7 @@ static int meminfo_read_proc(char *page,
 		K(global_page_state(NR_WRITEBACK)),
 		K(global_page_state(NR_ANON_PAGES)),
 		K(global_page_state(NR_FILE_MAPPED)),
+		K(global_page_state(NR_MLOCK)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +
 				global_page_state(NR_SLAB_UNRECLAIMABLE)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE)),
Index: current/include/linux/mmzone.h
===================================================================
--- current.orig/include/linux/mmzone.h	2007-02-03 10:52:35.000000000 -0800
+++ current/include/linux/mmzone.h	2007-02-03 10:53:25.000000000 -0800
@@ -58,6 +58,7 @@ enum zone_stat_item {
 	NR_FILE_DIRTY,
 	NR_WRITEBACK,
 	/* Second 128 byte cacheline */
+	NR_MLOCK,		/* Mlocked pages */
 	NR_SLAB_RECLAIMABLE,
 	NR_SLAB_UNRECLAIMABLE,
 	NR_PAGETABLE,		/* used for pagetables */
Index: current/mm/vmstat.c
===================================================================
--- current.orig/mm/vmstat.c	2007-02-03 10:52:36.000000000 -0800
+++ current/mm/vmstat.c	2007-02-03 10:53:25.000000000 -0800
@@ -439,6 +439,7 @@ static const char * const vmstat_text[] 
 	"nr_file_pages",
 	"nr_dirty",
 	"nr_writeback",
+	"nr_mlock",
 	"nr_slab_reclaimable",
 	"nr_slab_unreclaimable",
 	"nr_page_table_pages",
Index: current/include/linux/page-flags.h
===================================================================
--- current.orig/include/linux/page-flags.h	2007-02-03 17:56:36.000000000 -0800
+++ current/include/linux/page-flags.h	2007-02-04 23:14:47.000000000 -0800
@@ -93,6 +93,7 @@
 
 #define PG_readahead		20	/* Reminder to do read-ahead */
 
+#define PG_mlocked		21	/* Page is mlocked */
 
 #if (BITS_PER_LONG > 32)
 /*
@@ -235,6 +236,16 @@ static inline void SetPageUptodate(struc
 #define SetPageReadahead(page)	set_bit(PG_readahead, &(page)->flags)
 #define ClearPageReadahead(page) clear_bit(PG_readahead, &(page)->flags)
 
+/*
+ * PageMlocked set means that the page was taken off the LRU because
+ * a VM_LOCKED vma does exist. PageMlocked must be cleared before a
+ * page is put back onto the LRU. PageMlocked is only modified
+ * under the zone->lru_lock like PageLRU.
+ */
+#define PageMlocked(page)	test_bit(PG_mlocked, &(page)->flags)
+#define SetPageMlocked(page)	set_bit(PG_mlocked, &(page)->flags)
+#define ClearPageMlocked(page)	clear_bit(PG_mlocked, &(page)->flags)
+
 struct page;	/* forward declaration */
 
 extern void cancel_dirty_page(struct page *page, unsigned int account_size);
Index: current/include/linux/pagevec.h
===================================================================
--- current.orig/include/linux/pagevec.h	2007-02-04 22:55:38.000000000 -0800
+++ current/include/linux/pagevec.h	2007-02-04 23:17:34.000000000 -0800
@@ -25,6 +25,7 @@ void __pagevec_release_nonlru(struct pag
 void __pagevec_free(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
 void __pagevec_lru_add_active(struct pagevec *pvec);
+void __pagevec_lru_add_mlock(struct pagevec *pvec);
 void pagevec_strip(struct pagevec *pvec);
 unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
 		pgoff_t start, unsigned nr_pages);
Index: current/include/linux/swap.h
===================================================================
--- current.orig/include/linux/swap.h	2007-02-04 22:55:38.000000000 -0800
+++ current/include/linux/swap.h	2007-02-04 23:17:34.000000000 -0800
@@ -181,6 +181,7 @@ extern unsigned int nr_free_pagecache_pa
 extern void FASTCALL(lru_cache_add(struct page *));
 extern void FASTCALL(lru_cache_add_active(struct page *));
 extern void FASTCALL(lru_cache_add_tail(struct page *));
+extern void FASTCALL(lru_cache_add_mlock(struct page *));
 extern void FASTCALL(activate_page(struct page *));
 extern void FASTCALL(mark_page_accessed(struct page *));
 extern void lru_add_drain(void);
Index: current/mm/mlock.c
===================================================================
--- current.orig/mm/mlock.c	2007-02-04 22:55:38.000000000 -0800
+++ current/mm/mlock.c	2007-02-04 23:28:51.000000000 -0800
@@ -10,7 +10,7 @@
 #include <linux/mm.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
-
+#include <linux/swap.h>
 
 static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	unsigned long start, unsigned long end, unsigned int newflags)
@@ -63,6 +63,24 @@ success:
 		pages = -pages;
 		if (!(newflags & VM_IO))
 			ret = make_pages_present(start, end);
+	} else {
+		unsigned long addr;
+
+		/*
+		 * We are clearing VM_LOCKED. Feed all pages back via
+		 * to the LRU via lru_cache_add_mlock()
+		 */
+		for (addr = start; addr < end; addr += PAGE_SIZE) {
+			/*
+			 * No need to get a page reference. mmap_sem
+			 * writelock is held.
+			 */
+			struct page *page = follow_page(vma, start, 0);
+
+			if (PageMlocked(page))
+				lru_cache_add_mlock(page);
+			cond_resched();
+		}
 	}
 
 	mm->locked_vm -= pages;
Index: current/mm/swap.c
===================================================================
--- current.orig/mm/swap.c	2007-02-03 17:57:20.000000000 -0800
+++ current/mm/swap.c	2007-02-04 23:25:50.000000000 -0800
@@ -178,6 +178,7 @@ EXPORT_SYMBOL(mark_page_accessed);
 static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
 static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
 static DEFINE_PER_CPU(struct pagevec, lru_add_tail_pvecs) = { 0, };
+static DEFINE_PER_CPU(struct pagevec, lru_add_mlock_pvecs) = { 0, };
 
 void fastcall lru_cache_add(struct page *page)
 {
@@ -199,6 +200,16 @@ void fastcall lru_cache_add_active(struc
 	put_cpu_var(lru_add_active_pvecs);
 }
 
+void fastcall lru_cache_add_mlock(struct page *page)
+{
+	struct pagevec *pvec = &get_cpu_var(lru_add_mlock_pvecs);
+
+	page_cache_get(page);
+	if (!pagevec_add(pvec, page))
+		__pagevec_lru_add_mlock(pvec);
+	put_cpu_var(lru_add_mlock_pvecs);
+}
+
 static void __pagevec_lru_add_tail(struct pagevec *pvec)
 {
 	int i;
@@ -237,6 +248,9 @@ static void __lru_add_drain(int cpu)
 	pvec = &per_cpu(lru_add_tail_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add_tail(pvec);
+	pvec = &per_cpu(lru_add_mlock_pvecs, cpu);
+	if (pagevec_count(pvec))
+		__pagevec_lru_add_mlock(pvec);
 }
 
 void lru_add_drain(void)
@@ -394,6 +408,7 @@ void __pagevec_lru_add(struct pagevec *p
 			spin_lock_irq(&zone->lru_lock);
 		}
 		VM_BUG_ON(PageLRU(page));
+		VM_BUG_ON(PageMlocked(page));
 		SetPageLRU(page);
 		add_page_to_inactive_list(zone, page);
 	}
@@ -423,6 +438,7 @@ void __pagevec_lru_add_active(struct pag
 		VM_BUG_ON(PageLRU(page));
 		SetPageLRU(page);
 		VM_BUG_ON(PageActive(page));
+		VM_BUG_ON(PageMlocked(page));
 		SetPageActive(page);
 		add_page_to_active_list(zone, page);
 	}
@@ -432,6 +448,36 @@ void __pagevec_lru_add_active(struct pag
 	pagevec_reinit(pvec);
 }
 
+void __pagevec_lru_add_mlock(struct pagevec *pvec)
+{
+	int i;
+	struct zone *zone = NULL;
+
+	for (i = 0; i < pagevec_count(pvec); i++) {
+		struct page *page = pvec->pages[i];
+		struct zone *pagezone = page_zone(page);
+
+		if (pagezone != zone) {
+			if (zone)
+				spin_unlock_irq(&zone->lru_lock);
+			zone = pagezone;
+			spin_lock_irq(&zone->lru_lock);
+		}
+		BUG_ON(PageLRU(page));
+		if (!PageMlocked(page))
+			continue;
+		ClearPageMlocked(page);
+		smp_wmb();
+		__dec_zone_state(zone, NR_MLOCK);
+		SetPageLRU(page);
+		add_page_to_active_list(zone, page);
+	}
+	if (zone)
+		spin_unlock_irq(&zone->lru_lock);
+	release_pages(pvec->pages, pvec->nr, pvec->cold);
+	pagevec_reinit(pvec);
+}
+
 /*
  * Function used uniquely to put pages back to the lru at the end of the
  * inactive list to preserve the lru order. Currently only used by swap
Index: current/mm/migrate.c
===================================================================
--- current.orig/mm/migrate.c	2007-02-04 23:37:27.000000000 -0800
+++ current/mm/migrate.c	2007-02-04 23:39:55.000000000 -0800
@@ -58,6 +58,11 @@ int isolate_lru_page(struct page *page, 
 			else
 				del_page_from_inactive_list(zone, page);
 			list_add_tail(&page->lru, pagelist);
+		} else
+		if (PageMlocked(page)) {
+			get_page(page);
+			ClearPageMlocked(page);
+			list_add_tail(&page->lru, pagelist);
 		}
 		spin_unlock_irq(&zone->lru_lock);
 	}


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-05  7:57               ` Christoph Lameter
@ 2007-02-05  8:39                 ` Arjan van de Ven
  2007-02-05 16:38                   ` Matt Mackall
  2007-02-05 17:33                   ` Christoph Lameter
  0 siblings, 2 replies; 26+ messages in thread
From: Arjan van de Ven @ 2007-02-05  8:39 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Andrew Morton, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

On Sun, 2007-02-04 at 23:57 -0800, Christoph Lameter wrote:
> Hmmm.. I have had no time to test this one yet but I think this should 
> work. It uses the delayed method and a new page flag PageMlocked() with 
> different semantics. Fix for page migration is also included.
> 
> Patch avoids to put new anonymous mlocked pages on the LRU. Maybe the same 
> could be done for new pagecache pages?
> 
> I still need a solution for the problem of not having enough page flag 
> bits on i386 NUMA.

I still don't get why you *really* need such a bit. 
-- 
if you want to mail me at work (you don't), use arjan (at) linux.intel.com
Test the interaction between Linux and your BIOS via http://www.linuxfirmwarekit.org


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-05  8:39                 ` Arjan van de Ven
@ 2007-02-05 16:38                   ` Matt Mackall
  2007-02-05 17:34                     ` Christoph Lameter
  2007-02-05 19:04                     ` Christoph Lameter
  2007-02-05 17:33                   ` Christoph Lameter
  1 sibling, 2 replies; 26+ messages in thread
From: Matt Mackall @ 2007-02-05 16:38 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Christoph Lameter, Andrew Morton, linux-kernel, Nick Piggin,
	KAMEZAWA Hiroyuki, Rik van Riel

On Mon, Feb 05, 2007 at 09:39:34AM +0100, Arjan van de Ven wrote:
> On Sun, 2007-02-04 at 23:57 -0800, Christoph Lameter wrote:
> > Hmmm.. I have had no time to test this one yet but I think this should 
> > work. It uses the delayed method and a new page flag PageMlocked() with 
> > different semantics. Fix for page migration is also included.
> > 
> > Patch avoids to put new anonymous mlocked pages on the LRU. Maybe the same 
> > could be done for new pagecache pages?
> > 
> > I still need a solution for the problem of not having enough page flag 
> > bits on i386 NUMA.
> 
> I still don't get why you *really* need such a bit. 

There are three possibilities mentioned so far:

1) slow accounting - scan each attached VMA on each mmap/munmap
2) lazy accounting - the same as above, with the work all moved to the
LRU sweep
3) accounting with an extra page flag - still needs to scan VMAs on munmap

Christoph seems to prefer the third.

I wonder if we couldn't stick a rough counter in address_space to
fast-path the slow accounting - we'll typically only have 0 or 1 locks
active.

-- 
Mathematics is the supreme nostalgia of our time.

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-05  8:39                 ` Arjan van de Ven
  2007-02-05 16:38                   ` Matt Mackall
@ 2007-02-05 17:33                   ` Christoph Lameter
  1 sibling, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-02-05 17:33 UTC (permalink / raw)
  To: Arjan van de Ven
  Cc: Andrew Morton, linux-kernel, Nick Piggin, KAMEZAWA Hiroyuki,
	Rik van Riel

On Mon, 5 Feb 2007, Arjan van de Ven wrote:

> > I still need a solution for the problem of not having enough page flag 
> > bits on i386 NUMA.
> 
> I still don't get why you *really* need such a bit. 

Because otherwise you cannot establish why a page was removed from the 
LRU. If a page is off the LRU for other reasons then one should not 
return the page to the LRU in zap_pte_range. How can this determination 
be made without a page flag?


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-05 16:38                   ` Matt Mackall
@ 2007-02-05 17:34                     ` Christoph Lameter
  2007-02-05 19:04                     ` Christoph Lameter
  1 sibling, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-02-05 17:34 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Arjan van de Ven, Andrew Morton, linux-kernel, Nick Piggin,
	KAMEZAWA Hiroyuki, Rik van Riel

On Mon, 5 Feb 2007, Matt Mackall wrote:

> 2) lazy accounting - the same as above, with the work all moved to the
> LRU sweep
> 3) accounting with an extra page flag - still needs to scan VMAs on munmap
> 
> Christoph seems to prefer the third.

No I am saying that 2 requires 3 to work reliably. The patch I posted last 
night does 2 but the approach needs page flag in order to work 
correctly.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-05 16:38                   ` Matt Mackall
  2007-02-05 17:34                     ` Christoph Lameter
@ 2007-02-05 19:04                     ` Christoph Lameter
  1 sibling, 0 replies; 26+ messages in thread
From: Christoph Lameter @ 2007-02-05 19:04 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Arjan van de Ven, Andrew Morton, linux-kernel, Nick Piggin,
	KAMEZAWA Hiroyuki, Rik van Riel

Patch seems to work and survives AIM7. However, we only know about 30% of 
the Mlocked pages after boot. With this additional patch to 
opportunistically move pages off the LRU immediately I can get the counter 
be accurate (for all practical purposes) like the non lazy version:


Index: current/mm/memory.c
===================================================================
--- current.orig/mm/memory.c	2007-02-05 10:44:10.000000000 -0800
+++ current/mm/memory.c	2007-02-05 11:01:46.000000000 -0800
@@ -919,6 +919,30 @@ void anon_add(struct vm_area_struct *vma
 }
 
 /*
+ * Opportunistically move the page off the LRU
+ * if possible. If we do not succeed then the LRU
+ * scans will take the page off.
+ */
+void try_to_set_mlocked(struct page *page)
+{
+	struct zone *zone;
+	unsigned long flags;
+
+	if (!PageLRU(page) || PageMlocked(page))
+		return;
+
+	zone = page_zone(page);
+	if (spin_trylock_irqsave(&zone->lru_lock, flags)) {
+		if (PageLRU(page) && !PageMlocked(page)) {
+			ClearPageLRU(page);
+			list_del(&page->lru);
+			SetPageMlocked(page);
+			__inc_zone_page_state(page, NR_MLOCK);
+		}
+		spin_unlock_irqrestore(&zone->lru_lock, flags);
+	}
+}
+/*
  * Do a quick page-table lookup for a single page.
  */
 struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
@@ -978,6 +1002,8 @@ struct page *follow_page(struct vm_area_
 			set_page_dirty(page);
 		mark_page_accessed(page);
 	}
+	if (vma->vm_flags & VM_LOCKED)
+		try_to_set_mlocked(page);
 unlock:
 	pte_unmap_unlock(ptep, ptl);
 out:
@@ -2271,6 +2297,8 @@ retry:
 		else {
 			inc_mm_counter(mm, file_rss);
 			page_add_file_rmap(new_page);
+			if (vma->vm_flags & VM_LOCKED)
+				try_to_set_mlocked(new_page);
 			if (write_access) {
 				dirty_page = new_page;
 				get_page(dirty_page);

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-05  6:45               ` Christoph Lameter
@ 2007-02-06 22:05                 ` Nate Diller
  2007-02-07  8:02                   ` Christoph Lameter
  0 siblings, 1 reply; 26+ messages in thread
From: Nate Diller @ 2007-02-06 22:05 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Arjan van de Ven, Andrew Morton, linux-kernel, Nick Piggin,
	KAMEZAWA Hiroyuki, Rik van Riel

On 2/4/07, Christoph Lameter <clameter@sgi.com> wrote:
> On Sun, 4 Feb 2007, Arjan van de Ven wrote:
>
> >
> > > Exclusion or inclusion of NR_MLOCK number is straightforward for the dirty
> > > ratio calcuations. global_page_state(NR_MLOCK) f.e. would get us totals on
> > > mlocked pages per zone. node_page_state(NR_MLOCK) gives a node specific
> > > number of mlocked pages. The nice thing about ZVCs is that it allows
> > > easy access to counts on different levels.
> >
> > however... mlocked pages still can be dirty, and want to be written back
> > at some point ;)
>
> Yes that is why we need to add them to the count of total pages.
>
> > I can see the point of doing dirty ratio as percentage of the LRU size,
> > but in that case you don't need to track NR_MLOCK, only the total LRU
> > size. (And yes it'll be sometimes optimistic because not all mlock'd
> > pages are moved off the lru yet, but I doubt you'll have that as a
> > problem in practice)
>
> The dirty ratio with the ZVCS would be
>
> NR_DIRTY + NR_UNSTABLE_NFS
>         /
> NR_FREE_PAGES + NR_INACTIVE + NR_ACTIVE + NR_MLOCK

I don't understand why you want to account mlocked pages in
dirty_ratio.  of course mlocked pages *can* be dirty, but they have no
relevance in the write throttling code.  the point of dirty ratio is
to guarantee that there are some number of non-dirty, non-pinned,
non-mlocked pages on the LRU, to (try to) avoid deadlocks where the
writeback path needs to allocate pages, which many filesystems like to
do.  if an mlocked page is clean, there's still no way to free it up,
so it should not be treated as being on the LRU at all, for write
throttling.  the ideal (IMO) dirty ratio would be

NR_DIRTY - NR_DIRTY_MLOCKED + NR_UNSTABLE_NFS
        /
NR_FREE_PAGES + NR_INACTIVE + NR_ACTIVE

obviously it's kinda useless to keep an NR_DIRTY_MLOCKED counter, any
of these mlock accounting schemes could easily be modified to update
the NR_DIRTY counter so that it only reflects dirty unpinned pages,
and not mlocked ones.

is that the only place you wanted to have an accurate mocked page count?

NATE

^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-06 22:05                 ` Nate Diller
@ 2007-02-07  8:02                   ` Christoph Lameter
  2007-02-07 18:39                     ` Nate Diller
  0 siblings, 1 reply; 26+ messages in thread
From: Christoph Lameter @ 2007-02-07  8:02 UTC (permalink / raw)
  To: Nate Diller
  Cc: Arjan van de Ven, Andrew Morton, linux-kernel, Nick Piggin,
	KAMEZAWA Hiroyuki, Rik van Riel

On Tue, 6 Feb 2007, Nate Diller wrote:

> > The dirty ratio with the ZVCS would be
> > 
> > NR_DIRTY + NR_UNSTABLE_NFS
> >         /
> > NR_FREE_PAGES + NR_INACTIVE + NR_ACTIVE + NR_MLOCK
> 
> I don't understand why you want to account mlocked pages in
> dirty_ratio.  of course mlocked pages *can* be dirty, but they have no
> relevance in the write throttling code.  the point of dirty ratio is

mlocked pages can be counted as dirty pages. So if we do not include
NR_MLOCK in the number of total pages that could be dirty then we may in 
some cases have >100% dirty pages.

> to guarantee that there are some number of non-dirty, non-pinned,
> non-mlocked pages on the LRU, to (try to) avoid deadlocks where the
> writeback path needs to allocate pages, which many filesystems like to
> do.  if an mlocked page is clean, there's still no way to free it up,
> so it should not be treated as being on the LRU at all, for write
> throttling.  the ideal (IMO) dirty ratio would be

Hmmm... I think write throttling is different from reclaim. In write 
throttling the major objective is to decouple the applications from
the physical I/O. So the dirty ratio specifies how much "buffer" space
can be used for I/O. There is an issue that too many dirty pages will
cause difficulty for reclaim because pages can only be reclaimed after
writeback is complete.

And yes this is not true for mlocked pages.

> 
> NR_DIRTY - NR_DIRTY_MLOCKED + NR_UNSTABLE_NFS
>        /
> NR_FREE_PAGES + NR_INACTIVE + NR_ACTIVE
> 
> obviously it's kinda useless to keep an NR_DIRTY_MLOCKED counter, any
> of these mlock accounting schemes could easily be modified to update
> the NR_DIRTY counter so that it only reflects dirty unpinned pages,
> and not mlocked ones.

So you would be okay with dirty_ratio possibly be >100% of mlocked pages 
are dirty?

> is that the only place you wanted to have an accurate mocked page count?

Rik had some other ideas on what to do with it. I also think we may end up 
checking for excessive high mlock counts in various tight VM situations.


^ permalink raw reply	[flat|nested] 26+ messages in thread

* Re: [RFC] Tracking mlocked pages and moving them off the LRU
  2007-02-07  8:02                   ` Christoph Lameter
@ 2007-02-07 18:39                     ` Nate Diller
  0 siblings, 0 replies; 26+ messages in thread
From: Nate Diller @ 2007-02-07 18:39 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Arjan van de Ven, Andrew Morton, linux-kernel, Nick Piggin,
	KAMEZAWA Hiroyuki, Rik van Riel

On 2/7/07, Christoph Lameter <clameter@sgi.com> wrote:
> On Tue, 6 Feb 2007, Nate Diller wrote:
>
> > > The dirty ratio with the ZVCS would be
> > >
> > > NR_DIRTY + NR_UNSTABLE_NFS
> > >         /
> > > NR_FREE_PAGES + NR_INACTIVE + NR_ACTIVE + NR_MLOCK
> >
> > I don't understand why you want to account mlocked pages in
> > dirty_ratio.  of course mlocked pages *can* be dirty, but they have no
> > relevance in the write throttling code.  the point of dirty ratio is
>
> mlocked pages can be counted as dirty pages. So if we do not include
> NR_MLOCK in the number of total pages that could be dirty then we may in
> some cases have >100% dirty pages.

unless we exclude mlocked dirty pages from NR_DIRTY accounting, which
is what i suggest should be done as part of this patch

> > to guarantee that there are some number of non-dirty, non-pinned,
> > non-mlocked pages on the LRU, to (try to) avoid deadlocks where the
> > writeback path needs to allocate pages, which many filesystems like to
> > do.  if an mlocked page is clean, there's still no way to free it up,
> > so it should not be treated as being on the LRU at all, for write
> > throttling.  the ideal (IMO) dirty ratio would be
>
> Hmmm... I think write throttling is different from reclaim. In write
> throttling the major objective is to decouple the applications from
> the physical I/O. So the dirty ratio specifies how much "buffer" space
> can be used for I/O. There is an issue that too many dirty pages will
> cause difficulty for reclaim because pages can only be reclaimed after
> writeback is complete.

NR_DIRTY is only used for write throttling, right?  well, and
reporting to user-space, but again, i suggest that user space should
get to see NR_MLOCKED as well.  would people flip out if NR_DIRTY
stopped showing pages that are mlocked, as long as a seperate
NR_MLOCKED variable was present?

> And yes this is not true for mlocked pages.
>
> >
> > NR_DIRTY - NR_DIRTY_MLOCKED + NR_UNSTABLE_NFS
> >        /
> > NR_FREE_PAGES + NR_INACTIVE + NR_ACTIVE
> >
> > obviously it's kinda useless to keep an NR_DIRTY_MLOCKED counter, any
> > of these mlock accounting schemes could easily be modified to update
> > the NR_DIRTY counter so that it only reflects dirty unpinned pages,
> > and not mlocked ones.
>
> So you would be okay with dirty_ratio possibly be >100% of mlocked pages
> are dirty?
>
> > is that the only place you wanted to have an accurate mocked page count?
>
> Rik had some other ideas on what to do with it. I also think we may end up
> checking for excessive high mlock counts in various tight VM situations.

i'd be wary of a VM algorithm that treated mlocked pages any
differently than, say, unreclaimable slab pages.  but there are no
concrete suggestions yet, so i won't comment further.

all this is not to say that i dislike the idea of keeping mlocked
pages off the LRU, quite the opposite i've been looking for this for a
while and was hoping that Stone Wang's wired list patch
(http://lkml.org/lkml/2006/3/20/128) would get further than it did.
but i don't see the need to keep strict accounting if it hurts
performance in the common case.

NATE

^ permalink raw reply	[flat|nested] 26+ messages in thread

end of thread, other threads:[~2007-02-07 18:39 UTC | newest]

Thread overview: 26+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-02-03  6:20 [RFC] Tracking mlocked pages and moving them off the LRU Christoph Lameter
2007-02-03  8:53 ` Andrew Morton
2007-02-03 17:56   ` Christoph Lameter
2007-02-03 18:04     ` Arjan van de Ven
2007-02-03 18:09       ` Christoph Lameter
2007-02-03 18:55       ` Christoph Lameter
2007-02-03 19:03       ` Christoph Lameter
2007-02-04  1:22         ` Andrew Morton
2007-02-04  1:49           ` Christoph Lameter
2007-02-04  8:16             ` Arjan van de Ven
2007-02-05  6:45               ` Christoph Lameter
2007-02-06 22:05                 ` Nate Diller
2007-02-07  8:02                   ` Christoph Lameter
2007-02-07 18:39                     ` Nate Diller
2007-02-05  7:57               ` Christoph Lameter
2007-02-05  8:39                 ` Arjan van de Ven
2007-02-05 16:38                   ` Matt Mackall
2007-02-05 17:34                     ` Christoph Lameter
2007-02-05 19:04                     ` Christoph Lameter
2007-02-05 17:33                   ` Christoph Lameter
2007-02-03 22:58   ` Nigel Cunningham
2007-02-03 10:16 ` Christoph Hellwig
2007-02-03 15:35   ` Martin J. Bligh
2007-02-03 17:59     ` Christoph Lameter
2007-02-03 17:33   ` Christoph Lameter
2007-02-03 22:56 ` Nigel Cunningham

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.