All of lore.kernel.org
 help / color / mirror / Atom feed
* [Patch] mm tracepoints
@ 2009-03-16 19:52 Larry Woodman
  2009-03-19 15:47 ` Rik van Riel
  0 siblings, 1 reply; 24+ messages in thread
From: Larry Woodman @ 2009-03-16 19:52 UTC (permalink / raw)
  To: linux-mm

[-- Attachment #1: Type: text/plain, Size: 1145 bytes --]

I've implemented several mm tracepoints to track page allocation and
freeing, various types of pagefaults and unmaps, and critical page
reclamation routines.  This is useful for debugging memory allocation
issues and system performance problems under heavy memory loads.
Thoughts?:

# tracer: mm
#
#           TASK-PID    CPU#    TIMESTAMP  FUNCTION
#              | |       |          |         |
         pdflush-624   [004]   184.293169: wb_kupdate:
(mm_pdflush_kupdate) count=3e48
         pdflush-624   [004]   184.293439: get_page_from_freelist:
(mm_page_allocation) pfn=447c27 zone_free=1940910
        events/6-33    [006]   184.962879: free_hot_cold_page:
(mm_page_free) pfn=44bba9
      irqbalance-8313  [001]   188.042951: unmap_vmas:
(mm_anon_userfree) mm=ffff88044a7300c0 address=7f9a2eb70000 pfn=24c29a
             cat-9122  [005]   191.141173: filemap_fault:
(mm_filemap_fault) primary fault: mm=ffff88024c9d8f40 address=3cea2dd000
pfn=44d68e
             cat-9122  [001]   191.143036: handle_mm_fault:
(mm_anon_fault) mm=ffff88024c8beb40 address=7fffbde99f94 pfn=24ce22
...



Signed-off-by: Larry Woodman <lwoodman@redhat.com>

[-- Attachment #2: upstream-mm_tracepoints.patch --]
[-- Type: text/x-patch, Size: 21557 bytes --]

--- linux-2.6-tip/mm/memory.c.orig	2009-03-16 12:05:59.000000000 -0400
+++ linux-2.6-tip/mm/memory.c	2009-03-16 12:10:49.000000000 -0400
@@ -57,6 +57,7 @@
 #include <linux/kallsyms.h>
 #include <linux/swapops.h>
 #include <linux/elf.h>
+#include <trace/mm.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -768,6 +769,8 @@ int copy_page_range(struct mm_struct *ds
 	return ret;
 }
 
+DEFINE_TRACE(mm_anon_userfree);
+DEFINE_TRACE(mm_filemap_userunmap);
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
@@ -822,15 +825,19 @@ static unsigned long zap_pte_range(struc
 						addr) != page->index)
 				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
-			if (PageAnon(page))
+			if (PageAnon(page)) {
 				anon_rss--;
-			else {
+				trace_mm_anon_userfree(mm, addr,
+							page_to_pfn(page));
+			} else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent) &&
 				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
 				file_rss--;
+				trace_mm_filemap_userunmap(mm, addr,
+							page_to_pfn(page));
 			}
 			page_remove_rmap(page);
 			if (unlikely(page_mapcount(page) < 0))
@@ -1879,6 +1886,8 @@ static inline void cow_user_page(struct 
 		copy_user_highpage(dst, src, va, vma);
 }
 
+DEFINE_TRACE(mm_anon_cow);
+DEFINE_TRACE(mm_filemap_cow);
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -1901,7 +1910,7 @@ static int do_wp_page(struct mm_struct *
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		spinlock_t *ptl, pte_t orig_pte)
 {
-	struct page *old_page, *new_page;
+	struct page *old_page, *new_page = NULL;
 	pte_t entry;
 	int reuse = 0, ret = 0;
 	int page_mkwrite = 0;
@@ -2031,9 +2040,14 @@ gotten:
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
+				trace_mm_filemap_cow(mm, address,
+					page_to_pfn(new_page));
 			}
-		} else
+		} else {
 			inc_mm_counter(mm, anon_rss);
+			trace_mm_anon_cow(mm, address,
+					page_to_pfn(new_page));
+		}
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2398,6 +2412,7 @@ int vmtruncate_range(struct inode *inode
 	return 0;
 }
 
+DEFINE_TRACE(mm_anon_pgin);
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2511,6 +2526,7 @@ static int do_swap_page(struct mm_struct
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
+	trace_mm_anon_pgin(mm, address, page_to_pfn(page));
 	return ret;
 out_nomap:
 	mem_cgroup_cancel_charge_swapin(ptr);
@@ -2520,6 +2536,7 @@ out_nomap:
 	return ret;
 }
 
+DEFINE_TRACE(mm_anon_fault);
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2543,6 +2560,7 @@ static int do_anonymous_page(struct mm_s
 		goto oom;
 	__SetPageUptodate(page);
 
+	trace_mm_anon_fault(mm, address, page_to_pfn(page));
 	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
 		goto oom_free_page;
 
--- linux-2.6-tip/mm/rmap.c.orig	2009-03-16 12:05:59.000000000 -0400
+++ linux-2.6-tip/mm/rmap.c	2009-03-16 12:10:49.000000000 -0400
@@ -50,6 +50,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 
@@ -978,6 +979,7 @@ static int try_to_mlock_page(struct page
 	return mlocked;
 }
 
+DEFINE_TRACE(mm_anon_unmap);
 /**
  * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
  * rmap method
@@ -1034,9 +1036,11 @@ static int try_to_unmap_anon(struct page
 	else if (ret == SWAP_MLOCK)
 		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
 
+	trace_mm_anon_unmap(page_to_pfn(page), ret == SWAP_SUCCESS);
 	return ret;
 }
 
+DEFINE_TRACE(mm_filemap_unmap);
 /**
  * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
  * @page: the page to unmap/unlock
@@ -1170,6 +1174,7 @@ out:
 		ret = SWAP_MLOCK;	/* actually mlocked the page */
 	else if (ret == SWAP_MLOCK)
 		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
+	trace_mm_filemap_unmap(page_to_pfn(page), ret == SWAP_SUCCESS);
 	return ret;
 }
 
--- linux-2.6-tip/mm/page_alloc.c.orig	2009-03-16 12:05:59.000000000 -0400
+++ linux-2.6-tip/mm/page_alloc.c	2009-03-16 12:10:49.000000000 -0400
@@ -47,6 +47,7 @@
 #include <linux/page-isolation.h>
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -994,6 +995,7 @@ void mark_free_pages(struct zone *zone)
 }
 #endif /* CONFIG_PM */
 
+DEFINE_TRACE(mm_page_free);
 /*
  * Free a 0-order page
  */
@@ -1010,6 +1012,7 @@ static void free_hot_cold_page(struct pa
 	if (free_pages_check(page))
 		return;
 
+	trace_mm_page_free(page_to_pfn(page));
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
 		debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
@@ -1399,6 +1402,7 @@ static void zlc_mark_zone_full(struct zo
 }
 #endif	/* CONFIG_NUMA */
 
+DEFINE_TRACE(mm_page_allocation);
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -1453,8 +1457,11 @@ zonelist_scan:
 		}
 
 		page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
-		if (page)
+		if (page) {
+			trace_mm_page_allocation(page_to_pfn(page),
+					zone_page_state(zone, NR_FREE_PAGES));
 			break;
+		}
 this_zone_full:
 		if (NUMA_BUILD)
 			zlc_mark_zone_full(zonelist, z);
--- linux-2.6-tip/mm/vmscan.c.orig	2009-03-16 12:05:59.000000000 -0400
+++ linux-2.6-tip/mm/vmscan.c	2009-03-16 12:10:49.000000000 -0400
@@ -40,6 +40,7 @@
 #include <linux/memcontrol.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -329,6 +330,7 @@ typedef enum {
 	PAGE_CLEAN,
 } pageout_t;
 
+DEFINE_TRACE(mm_pagereclaim_pgout);
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
@@ -407,6 +409,7 @@ static pageout_t pageout(struct page *pa
 			ClearPageReclaim(page);
 		}
 		inc_zone_page_state(page, NR_VMSCAN_WRITE);
+		trace_mm_pagereclaim_pgout(page_to_pfn(page), PageAnon(page));
 		return PAGE_SUCCESS;
 	}
 
@@ -570,6 +573,9 @@ void putback_lru_page(struct page *page)
 #endif /* CONFIG_UNEVICTABLE_LRU */
 
 
+DEFINE_TRACE(mm_pagereclaim_free);
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive_i2a);
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive_i2i);
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -758,6 +764,7 @@ free_it:
 			__pagevec_free(&freed_pvec);
 			pagevec_reinit(&freed_pvec);
 		}
+		trace_mm_pagereclaim_free(page_to_pfn(page), PageAnon(page));
 		continue;
 
 cull_mlocked:
@@ -774,10 +781,12 @@ activate_locked:
 		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
 		pgactivate++;
+		trace_mm_pagereclaim_shrinkinactive_i2a(page_to_pfn(page));
 keep_locked:
 		unlock_page(page);
 keep:
 		list_add(&page->lru, &ret_pages);
+		trace_mm_pagereclaim_shrinkinactive_i2i(page_to_pfn(page));
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 	list_splice(&ret_pages, page_list);
@@ -1036,6 +1045,7 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive);
 /*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
@@ -1170,6 +1180,7 @@ static unsigned long shrink_inactive_lis
 done:
 	local_irq_enable();
 	pagevec_release(&pvec);
+	trace_mm_pagereclaim_shrinkinactive(nr_reclaimed);
 	return nr_reclaimed;
 }
 
@@ -1187,6 +1198,9 @@ static inline void note_zone_scanning_pr
 		zone->prev_priority = priority;
 }
 
+DEFINE_TRACE(mm_pagereclaim_shrinkactive);
+DEFINE_TRACE(mm_pagereclaim_shrinkactive_a2a);
+DEFINE_TRACE(mm_pagereclaim_shrinkactive_a2i);
 /*
  * This moves pages from the active list to the inactive list.
  *
@@ -1247,6 +1261,7 @@ static void shrink_active_list(unsigned 
 
 		if (unlikely(!page_evictable(page, NULL))) {
 			putback_lru_page(page);
+			trace_mm_pagereclaim_shrinkactive_a2a(page_to_pfn(page));
 			continue;
 		}
 
@@ -1256,6 +1271,7 @@ static void shrink_active_list(unsigned 
 			pgmoved++;
 
 		list_add(&page->lru, &l_inactive);
+		trace_mm_pagereclaim_shrinkactive_a2i(page_to_pfn(page));
 	}
 
 	/*
@@ -1310,6 +1326,7 @@ static void shrink_active_list(unsigned 
 		pagevec_swap_free(&pvec);
 
 	pagevec_release(&pvec);
+	trace_mm_pagereclaim_shrinkactive(pgscanned);
 }
 
 static int inactive_anon_is_low_global(struct zone *zone)
@@ -1450,6 +1467,7 @@ static void get_scan_ratio(struct zone *
 }
 
 
+DEFINE_TRACE(mm_pagereclaim_shrinkzone);
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -1510,6 +1528,7 @@ static void shrink_zone(int priority, st
 	}
 
 	sc->nr_reclaimed = nr_reclaimed;
+	trace_mm_pagereclaim_shrinkzone(nr_reclaimed);
 
 	/*
 	 * Even if we did not try to evict anon pages at all, we want to
@@ -1521,6 +1540,7 @@ static void shrink_zone(int priority, st
 	throttle_vm_writeout(sc->gfp_mask);
 }
 
+DEFINE_TRACE(mm_directreclaim_reclaimall);
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -1569,6 +1589,7 @@ static void shrink_zones(int priority, s
 							priority);
 		}
 
+		trace_mm_directreclaim_reclaimall(priority);
 		shrink_zone(priority, zone, sc);
 	}
 }
@@ -1732,6 +1753,7 @@ unsigned long try_to_free_mem_cgroup_pag
 }
 #endif
 
+DEFINE_TRACE(mm_kswapd_runs);
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1938,6 +1960,7 @@ out:
 		goto loop_again;
 	}
 
+	trace_mm_kswapd_runs(sc.nr_reclaimed);
 	return sc.nr_reclaimed;
 }
 
@@ -2278,6 +2301,7 @@ int sysctl_min_unmapped_ratio = 1;
  */
 int sysctl_min_slab_ratio = 5;
 
+DEFINE_TRACE(mm_directreclaim_reclaimzone);
 /*
  * Try to free up some pages from this zone through reclaim.
  */
@@ -2321,6 +2345,7 @@ static int __zone_reclaim(struct zone *z
 		do {
 			note_zone_scanning_priority(zone, priority);
 			shrink_zone(priority, zone, &sc);
+			trace_mm_directreclaim_reclaimzone(priority);
 			priority--;
 		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
@@ -2352,6 +2377,7 @@ static int __zone_reclaim(struct zone *z
 
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+	trace_mm_directreclaim_reclaimzone(sc.nr_reclaimed);
 	return sc.nr_reclaimed >= nr_pages;
 }
 
--- linux-2.6-tip/mm/filemap.c.orig	2009-03-16 12:05:59.000000000 -0400
+++ linux-2.6-tip/mm/filemap.c	2009-03-16 12:10:49.000000000 -0400
@@ -34,6 +34,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <trace/mm.h>
 #include "internal.h"
 
 /*
@@ -1436,6 +1437,7 @@ static int page_cache_read(struct file *
 
 #define MMAP_LOTSAMISS  (100)
 
+DEFINE_TRACE(mm_filemap_fault);
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:	vma in which the fault was taken
@@ -1547,6 +1549,8 @@ retry_find:
 	 */
 	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	vmf->page = page;
+	trace_mm_filemap_fault(vma->vm_mm, (unsigned long)vmf->virtual_address,
+			page_to_pfn(page), vmf->flags&FAULT_FLAG_NONLINEAR);
 	return ret | VM_FAULT_LOCKED;
 
 no_cached_page:
--- linux-2.6-tip/mm/page-writeback.c.orig	2009-03-16 12:05:59.000000000 -0400
+++ linux-2.6-tip/mm/page-writeback.c	2009-03-16 12:10:49.000000000 -0400
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <trace/mm.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -677,6 +678,7 @@ void throttle_vm_writeout(gfp_t gfp_mask
         }
 }
 
+DEFINE_TRACE(mm_pdflush_bgwriteout);
 /*
  * writeback at least _min_pages, and keep writing until the amount of dirty
  * memory is less than the background threshold, or until we're all clean.
@@ -716,6 +718,7 @@ static void background_writeout(unsigned
 				break;
 		}
 	}
+	trace_mm_pdflush_bgwriteout(_min_pages);
 }
 
 /*
@@ -737,6 +740,7 @@ static void laptop_timer_fn(unsigned lon
 static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
 static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 
+DEFINE_TRACE(mm_pdflush_kupdate);
 /*
  * Periodic writeback of "old" data.
  *
@@ -776,6 +780,7 @@ static void wb_kupdate(unsigned long arg
 	nr_to_write = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	trace_mm_pdflush_kupdate(nr_to_write);
 	while (nr_to_write > 0) {
 		wbc.more_io = 0;
 		wbc.encountered_congestion = 0;
--- /dev/null	2009-03-01 15:15:06.452930698 -0500
+++ linux-2.6-tip/include/trace/mm_event_types.h	2009-03-16 12:10:49.000000000 -0400
@@ -0,0 +1,281 @@
+/* use <trace/mm.h> instead */
+#ifndef TRACE_FORMAT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mm
+
+TRACE_EVENT_FORMAT(mm_anon_fault,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_pgin,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_cow,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_userfree,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_unmap,
+	TPPROTO(unsigned long pfn, int success),
+	TPARGS(pfn, success),
+	TPFMT("%s: pfn=%lx", pfn, success ? "succeeded" : "failed"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, success, success)
+	),
+	TPRAWFMT("pfn %lx success %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_fault,
+	TPPROTO(struct mm_struct *mm, unsigned long address,
+			unsigned long pfn, int flag),
+	TPARGS(mm, address, pfn, flag),
+	TPFMT("%s: mm=%lx address=%lx pfn=%lx",
+		flag ? "pagein" : "primary fault", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, flag, flag)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx flag %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_cow,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_unmap,
+	TPPROTO(unsigned long pfn, int success),
+	TPARGS(pfn, success),
+	TPFMT("%s: pfn=%lx", pfn, success ? "succeeded" : "failed"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, success, success)
+	),
+	TPRAWFMT("pfn %lx success %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_userunmap,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_pgout,
+	TPPROTO(unsigned long pfn, int anon),
+	TPARGS(pfn, anon),
+	TPFMT("%s page: pfn=%lx", pfn, anon ? "anonymous" : "pagecache"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_free,
+	TPPROTO(unsigned long pfn, int anon),
+	TPARGS(pfn, anon),
+	TPFMT("%s page: pfn=%lx", pfn, anon ? "anonymous" : "pagecache"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pdflush_bgwriteout,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pdflush_kupdate,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_page_allocation,
+	TPPROTO(unsigned long pfn, unsigned long free),
+	TPARGS(pfn, free),
+	TPFMT("pfn=%lx zone_free=%ld", pfn, free),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(unsigned long, free, free)
+	),
+	TPRAWFMT("pfn %lx free %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_kswapd_runs,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_directreclaim_reclaimall,
+	TPPROTO(unsigned long priority),
+	TPARGS(priority),
+	TPFMT("priority=%lx", priority),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, priority, priority)
+	),
+	TPRAWFMT("priority %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_directreclaim_reclaimzone,
+	TPPROTO(unsigned long reclaimed),
+	TPARGS(reclaimed),
+	TPFMT("reclaimed=%lx", reclaimed),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, reclaimed, reclaimed)
+	),
+	TPRAWFMT("reclaimed %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkzone,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive_a2a,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive_a2i,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive_i2a,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive_i2i,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_page_free,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+#undef TRACE_SYSTEM
+
+#undef TRACE_SYSTEM
--- linux-2.6-tip/include/trace/trace_events.h.orig	2009-03-16 12:05:59.000000000 -0400
+++ linux-2.6-tip/include/trace/trace_events.h	2009-03-16 12:10:49.000000000 -0400
@@ -3,3 +3,4 @@
 #include <trace/sched.h>
 #include <trace/irq.h>
 #include <trace/lockdep.h>
+#include <trace/mm.h>
--- linux-2.6-tip/include/trace/trace_event_types.h.orig	2009-03-16 12:05:59.000000000 -0400
+++ linux-2.6-tip/include/trace/trace_event_types.h	2009-03-16 12:10:49.000000000 -0400
@@ -3,3 +3,4 @@
 #include <trace/sched_event_types.h>
 #include <trace/irq_event_types.h>
 #include <trace/lockdep_event_types.h>
+#include <trace/mm_event_types.h>
--- /dev/null	2009-03-01 15:15:06.452930698 -0500
+++ linux-2.6-tip/include/trace/mm.h	2009-03-16 12:10:49.000000000 -0400
@@ -0,0 +1,9 @@
+#ifndef _TRACE_MM_H
+#define _TRACE_MM_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+
+#include <trace/mm_event_types.h>
+
+#endif

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-16 19:52 [Patch] mm tracepoints Larry Woodman
@ 2009-03-19 15:47 ` Rik van Riel
  2009-03-19 15:55   ` Larry Woodman
  2009-03-23 18:54   ` Larry Woodman
  0 siblings, 2 replies; 24+ messages in thread
From: Rik van Riel @ 2009-03-19 15:47 UTC (permalink / raw)
  To: Larry Woodman; +Cc: linux-mm

Larry Woodman wrote:
> I've implemented several mm tracepoints to track page allocation and
> freeing, various types of pagefaults and unmaps, and critical page
> reclamation routines.  This is useful for debugging memory allocation
> issues and system performance problems under heavy memory loads.
> Thoughts?:

It looks mostly good.

I believe that the vmscan.c tracepoints could be a little
more verbose though, it would be useful to know whether we
are scanning anon or file pages and whether or not we're
doing lumpy reclaim.  Possibly the priority level, too.

-- 
All rights reversed.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-19 15:47 ` Rik van Riel
@ 2009-03-19 15:55   ` Larry Woodman
  2009-03-23 18:54   ` Larry Woodman
  1 sibling, 0 replies; 24+ messages in thread
From: Larry Woodman @ 2009-03-19 15:55 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-mm

On Thu, 2009-03-19 at 11:47 -0400, Rik van Riel wrote:
> Larry Woodman wrote:
> > I've implemented several mm tracepoints to track page allocation and
> > freeing, various types of pagefaults and unmaps, and critical page
> > reclamation routines.  This is useful for debugging memory allocation
> > issues and system performance problems under heavy memory loads.
> > Thoughts?:
> 
> It looks mostly good.
> 
> I believe that the vmscan.c tracepoints could be a little
> more verbose though, it would be useful to know whether we
> are scanning anon or file pages and whether or not we're
> doing lumpy reclaim.  Possibly the priority level, too.
> 

OK thanks, I'll address these concerns.

Larry


--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-19 15:47 ` Rik van Riel
  2009-03-19 15:55   ` Larry Woodman
@ 2009-03-23 18:54   ` Larry Woodman
  1 sibling, 0 replies; 24+ messages in thread
From: Larry Woodman @ 2009-03-23 18:54 UTC (permalink / raw)
  To: Rik van Riel; +Cc: linux-mm

[-- Attachment #1: Type: text/plain, Size: 720 bytes --]

On Thu, 2009-03-19 at 11:47 -0400, Rik van Riel wrote:
> Larry Woodman wrote:
> > I've implemented several mm tracepoints to track page allocation and
> > freeing, various types of pagefaults and unmaps, and critical page
> > reclamation routines.  This is useful for debugging memory allocation
> > issues and system performance problems under heavy memory loads.
> > Thoughts?:
> 
> It looks mostly good.
> 
> I believe that the vmscan.c tracepoints could be a little
> more verbose though, it would be useful to know whether we
> are scanning anon or file pages and whether or not we're
> doing lumpy reclaim.  Possibly the priority level, too.
> 

The attached patch addresses your concerns.  Can you have a look:



[-- Attachment #2: upstream-mm_tracepoints.patch --]
[-- Type: text/x-patch, Size: 22755 bytes --]

diff --git a/include/trace/mm.h b/include/trace/mm.h
new file mode 100644
index 0000000..a3e760e
--- /dev/null
+++ b/include/trace/mm.h
@@ -0,0 +1,9 @@
+#ifndef _TRACE_MM_H
+#define _TRACE_MM_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+
+#include <trace/mm_event_types.h>
+
+#endif
diff --git a/include/trace/mm_event_types.h b/include/trace/mm_event_types.h
new file mode 100644
index 0000000..0100d43
--- /dev/null
+++ b/include/trace/mm_event_types.h
@@ -0,0 +1,285 @@
+/* use <trace/mm.h> instead */
+#ifndef TRACE_FORMAT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mm
+
+TRACE_EVENT_FORMAT(mm_anon_fault,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_pgin,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_cow,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_userfree,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_unmap,
+	TPPROTO(unsigned long pfn, int success),
+	TPARGS(pfn, success),
+	TPFMT("%s: pfn=%lx", success ? "succeeded" : "failed", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, success, success)
+	),
+	TPRAWFMT("pfn %lx success %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_fault,
+	TPPROTO(struct mm_struct *mm, unsigned long address,
+			unsigned long pfn, int flag),
+	TPARGS(mm, address, pfn, flag),
+	TPFMT("%s: mm=%lx address=%lx pfn=%lx",
+		flag ? "pagein" : "primary fault", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, flag, flag)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx flag %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_cow,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_unmap,
+	TPPROTO(unsigned long pfn, int success),
+	TPARGS(pfn, success),
+	TPFMT("%s: pfn=%lx", success ? "succeeded" : "failed", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, success, success)
+	),
+	TPRAWFMT("pfn %lx success %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_userunmap,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_pgout,
+	TPPROTO(unsigned long pfn, int anon),
+	TPARGS(pfn, anon),
+	TPFMT("%s page: pfn=%lx", anon ? "anonymous" : "pagecache", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_free,
+	TPPROTO(unsigned long pfn, int anon),
+	TPARGS(pfn, anon),
+	TPFMT("%s page: pfn=%lx", anon ? "anonymous" : "pagecache", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pdflush_bgwriteout,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pdflush_kupdate,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_page_allocation,
+	TPPROTO(unsigned long pfn, unsigned long free),
+	TPARGS(pfn, free),
+	TPFMT("pfn=%lx zone_free=%ld", pfn, free),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(unsigned long, free, free)
+	),
+	TPRAWFMT("pfn %lx free %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_kswapd_runs,
+	TPPROTO(unsigned long reclaimed),
+	TPARGS(reclaimed),
+	TPFMT("reclaimed=%lx", reclaimed),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, reclaimed, reclaimed)
+	),
+	TPRAWFMT("reclaimed %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_directreclaim_reclaimall,
+	TPPROTO(unsigned long priority),
+	TPARGS(priority),
+	TPFMT("priority=%lx", priority),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, priority, priority)
+	),
+	TPRAWFMT("priority %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_directreclaim_reclaimzone,
+	TPPROTO(unsigned long reclaimed),
+	TPARGS(reclaimed),
+	TPFMT("reclaimed=%lx", reclaimed),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, reclaimed, reclaimed)
+	),
+	TPRAWFMT("reclaimed %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkzone,
+	TPPROTO(unsigned long reclaimed),
+	TPARGS(reclaimed),
+	TPFMT("reclaimed=%lx", reclaimed),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, reclaimed, reclaimed)
+	),
+	TPRAWFMT("reclaimed %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive,
+	TPPROTO(unsigned long scanned, int file, int priority),
+	TPARGS(scanned, file, priority),
+	TPFMT("scanned=%lx, %s, priority=%d", scanned, file ? "anonymous" : "pagecache", priority),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, scanned, scanned)
+		TRACE_FIELD(int, file, file)
+		TRACE_FIELD(int, priority, priority)
+	),
+	TPRAWFMT("scanned %lx file %x priority %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive_a2a,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive_a2i,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive,
+	TPPROTO(unsigned long reclaimed, int file, int priority),
+	TPARGS(reclaimed, file, priority),
+	TPFMT("reclaimed=%lx, %s, priority=%d", reclaimed, file ? "anonymous" : "pagecache", priority),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, reclaimed, reclaimed)
+		TRACE_FIELD(int, file, file)
+		TRACE_FIELD(int, priority, priority)
+	),
+	TPRAWFMT("reclaimed %lx file %x priority %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive_i2a,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive_i2i,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_page_free,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+#undef TRACE_SYSTEM
+
+#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
index df56f56..153d262 100644
--- a/include/trace/trace_event_types.h
+++ b/include/trace/trace_event_types.h
@@ -3,3 +3,4 @@
 #include <trace/sched_event_types.h>
 #include <trace/irq_event_types.h>
 #include <trace/lockdep_event_types.h>
+#include <trace/mm_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index fd13750..2752e7f 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -3,3 +3,4 @@
 #include <trace/sched.h>
 #include <trace/irq.h>
 #include <trace/lockdep.h>
+#include <trace/mm.h>
diff --git a/mm/filemap.c b/mm/filemap.c
index 126d397..9c68755 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <trace/mm.h>
 #include "internal.h"
 
 /*
@@ -1436,6 +1437,7 @@ static int page_cache_read(struct file *file, pgoff_t offset)
 
 #define MMAP_LOTSAMISS  (100)
 
+DEFINE_TRACE(mm_filemap_fault);
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:	vma in which the fault was taken
@@ -1547,6 +1549,8 @@ retry_find:
 	 */
 	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	vmf->page = page;
+	trace_mm_filemap_fault(vma->vm_mm, (unsigned long)vmf->virtual_address,
+			page_to_pfn(page), vmf->flags&FAULT_FLAG_NONLINEAR);
 	return ret | VM_FAULT_LOCKED;
 
 no_cached_page:
diff --git a/mm/memory.c b/mm/memory.c
index baa999e..6acc389 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -55,6 +55,7 @@
 #include <linux/kallsyms.h>
 #include <linux/swapops.h>
 #include <linux/elf.h>
+#include <trace/mm.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -758,6 +759,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return ret;
 }
 
+DEFINE_TRACE(mm_anon_userfree);
+DEFINE_TRACE(mm_filemap_userunmap);
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
@@ -812,15 +815,19 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 						addr) != page->index)
 				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
-			if (PageAnon(page))
+			if (PageAnon(page)) {
 				anon_rss--;
-			else {
+				trace_mm_anon_userfree(mm, addr,
+							page_to_pfn(page));
+			} else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent) &&
 				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
 				file_rss--;
+				trace_mm_filemap_userunmap(mm, addr,
+							page_to_pfn(page));
 			}
 			page_remove_rmap(page);
 			if (unlikely(page_mapcount(page) < 0))
@@ -1867,6 +1874,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		copy_user_highpage(dst, src, va, vma);
 }
 
+DEFINE_TRACE(mm_anon_cow);
+DEFINE_TRACE(mm_filemap_cow);
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -1889,7 +1898,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		spinlock_t *ptl, pte_t orig_pte)
 {
-	struct page *old_page, *new_page;
+	struct page *old_page, *new_page = NULL;
 	pte_t entry;
 	int reuse = 0, ret = 0;
 	int page_mkwrite = 0;
@@ -2019,9 +2028,14 @@ gotten:
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
+				trace_mm_filemap_cow(mm, address,
+					page_to_pfn(new_page));
 			}
-		} else
+		} else {
 			inc_mm_counter(mm, anon_rss);
+			trace_mm_anon_cow(mm, address,
+					page_to_pfn(new_page));
+		}
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2386,6 +2400,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 	return 0;
 }
 
+DEFINE_TRACE(mm_anon_pgin);
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2499,6 +2514,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
+	trace_mm_anon_pgin(mm, address, page_to_pfn(page));
 	return ret;
 out_nomap:
 	mem_cgroup_cancel_charge_swapin(ptr);
@@ -2508,6 +2524,7 @@ out_nomap:
 	return ret;
 }
 
+DEFINE_TRACE(mm_anon_fault);
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2531,6 +2548,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto oom;
 	__SetPageUptodate(page);
 
+	trace_mm_anon_fault(mm, address, page_to_pfn(page));
 	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
 		goto oom_free_page;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 74dc57c..7ebd33c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <trace/mm.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -677,6 +678,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
         }
 }
 
+DEFINE_TRACE(mm_pdflush_bgwriteout);
 /*
  * writeback at least _min_pages, and keep writing until the amount of dirty
  * memory is less than the background threshold, or until we're all clean.
@@ -716,6 +718,7 @@ static void background_writeout(unsigned long _min_pages)
 				break;
 		}
 	}
+	trace_mm_pdflush_bgwriteout(_min_pages);
 }
 
 /*
@@ -737,6 +740,7 @@ static void laptop_timer_fn(unsigned long unused);
 static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
 static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 
+DEFINE_TRACE(mm_pdflush_kupdate);
 /*
  * Periodic writeback of "old" data.
  *
@@ -776,6 +780,7 @@ static void wb_kupdate(unsigned long arg)
 	nr_to_write = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	trace_mm_pdflush_kupdate(nr_to_write);
 	while (nr_to_write > 0) {
 		wbc.more_io = 0;
 		wbc.encountered_congestion = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0d0633f..b088370 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -47,6 +47,7 @@
 #include <linux/page-isolation.h>
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -994,6 +995,7 @@ void mark_free_pages(struct zone *zone)
 }
 #endif /* CONFIG_PM */
 
+DEFINE_TRACE(mm_page_free);
 /*
  * Free a 0-order page
  */
@@ -1010,6 +1012,7 @@ static void free_hot_cold_page(struct page *page, int cold)
 	if (free_pages_check(page))
 		return;
 
+	trace_mm_page_free(page_to_pfn(page));
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
 		debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
@@ -1399,6 +1402,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 }
 #endif	/* CONFIG_NUMA */
 
+DEFINE_TRACE(mm_page_allocation);
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -1453,8 +1457,11 @@ zonelist_scan:
 		}
 
 		page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
-		if (page)
+		if (page) {
+			trace_mm_page_allocation(page_to_pfn(page),
+					zone_page_state(zone, NR_FREE_PAGES));
 			break;
+		}
 this_zone_full:
 		if (NUMA_BUILD)
 			zlc_mark_zone_full(zonelist, z);
diff --git a/mm/rmap.c b/mm/rmap.c
index 1652166..39a4876 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -50,6 +50,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 
@@ -978,6 +979,7 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
 	return mlocked;
 }
 
+DEFINE_TRACE(mm_anon_unmap);
 /**
  * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
  * rmap method
@@ -1034,9 +1036,11 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 	else if (ret == SWAP_MLOCK)
 		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
 
+	trace_mm_anon_unmap(page_to_pfn(page), ret == SWAP_SUCCESS);
 	return ret;
 }
 
+DEFINE_TRACE(mm_filemap_unmap);
 /**
  * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
  * @page: the page to unmap/unlock
@@ -1170,6 +1174,7 @@ out:
 		ret = SWAP_MLOCK;	/* actually mlocked the page */
 	else if (ret == SWAP_MLOCK)
 		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
+	trace_mm_filemap_unmap(page_to_pfn(page), ret == SWAP_SUCCESS);
 	return ret;
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ae6f4c1..626b91f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -40,6 +40,7 @@
 #include <linux/memcontrol.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -329,6 +330,7 @@ typedef enum {
 	PAGE_CLEAN,
 } pageout_t;
 
+DEFINE_TRACE(mm_pagereclaim_pgout);
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
@@ -407,6 +409,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 			ClearPageReclaim(page);
 		}
 		inc_zone_page_state(page, NR_VMSCAN_WRITE);
+		trace_mm_pagereclaim_pgout(page_to_pfn(page), PageAnon(page));
 		return PAGE_SUCCESS;
 	}
 
@@ -570,6 +573,9 @@ void putback_lru_page(struct page *page)
 #endif /* CONFIG_UNEVICTABLE_LRU */
 
 
+DEFINE_TRACE(mm_pagereclaim_free);
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive_i2a);
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive_i2i);
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -758,6 +764,7 @@ free_it:
 			__pagevec_free(&freed_pvec);
 			pagevec_reinit(&freed_pvec);
 		}
+		trace_mm_pagereclaim_free(page_to_pfn(page), PageAnon(page));
 		continue;
 
 cull_mlocked:
@@ -774,10 +781,12 @@ activate_locked:
 		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
 		pgactivate++;
+		trace_mm_pagereclaim_shrinkinactive_i2a(page_to_pfn(page));
 keep_locked:
 		unlock_page(page);
 keep:
 		list_add(&page->lru, &ret_pages);
+		trace_mm_pagereclaim_shrinkinactive_i2i(page_to_pfn(page));
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 	list_splice(&ret_pages, page_list);
@@ -1036,6 +1045,7 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive);
 /*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
@@ -1170,6 +1180,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 done:
 	local_irq_enable();
 	pagevec_release(&pvec);
+	trace_mm_pagereclaim_shrinkinactive(nr_reclaimed, file, priority);
 	return nr_reclaimed;
 }
 
@@ -1187,6 +1198,9 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
 		zone->prev_priority = priority;
 }
 
+DEFINE_TRACE(mm_pagereclaim_shrinkactive);
+DEFINE_TRACE(mm_pagereclaim_shrinkactive_a2a);
+DEFINE_TRACE(mm_pagereclaim_shrinkactive_a2i);
 /*
  * This moves pages from the active list to the inactive list.
  *
@@ -1247,6 +1261,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 
 		if (unlikely(!page_evictable(page, NULL))) {
 			putback_lru_page(page);
+			trace_mm_pagereclaim_shrinkactive_a2a(page_to_pfn(page));
 			continue;
 		}
 
@@ -1256,6 +1271,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			pgmoved++;
 
 		list_add(&page->lru, &l_inactive);
+		trace_mm_pagereclaim_shrinkactive_a2i(page_to_pfn(page));
 	}
 
 	/*
@@ -1310,6 +1326,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		pagevec_swap_free(&pvec);
 
 	pagevec_release(&pvec);
+	trace_mm_pagereclaim_shrinkactive(pgscanned, file, priority);
 }
 
 static int inactive_anon_is_low_global(struct zone *zone)
@@ -1450,6 +1467,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 }
 
 
+DEFINE_TRACE(mm_pagereclaim_shrinkzone);
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -1510,6 +1528,7 @@ static void shrink_zone(int priority, struct zone *zone,
 	}
 
 	sc->nr_reclaimed = nr_reclaimed;
+	trace_mm_pagereclaim_shrinkzone(nr_reclaimed);
 
 	/*
 	 * Even if we did not try to evict anon pages at all, we want to
@@ -1521,6 +1540,7 @@ static void shrink_zone(int priority, struct zone *zone,
 	throttle_vm_writeout(sc->gfp_mask);
 }
 
+DEFINE_TRACE(mm_directreclaim_reclaimall);
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -1569,6 +1589,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 							priority);
 		}
 
+		trace_mm_directreclaim_reclaimall(priority);
 		shrink_zone(priority, zone, sc);
 	}
 }
@@ -1732,6 +1753,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 }
 #endif
 
+DEFINE_TRACE(mm_kswapd_runs);
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1938,6 +1960,7 @@ out:
 		goto loop_again;
 	}
 
+	trace_mm_kswapd_runs(sc.nr_reclaimed);
 	return sc.nr_reclaimed;
 }
 
@@ -2278,6 +2301,7 @@ int sysctl_min_unmapped_ratio = 1;
  */
 int sysctl_min_slab_ratio = 5;
 
+DEFINE_TRACE(mm_directreclaim_reclaimzone);
 /*
  * Try to free up some pages from this zone through reclaim.
  */
@@ -2321,6 +2345,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		do {
 			note_zone_scanning_priority(zone, priority);
 			shrink_zone(priority, zone, &sc);
+			trace_mm_directreclaim_reclaimzone(priority);
 			priority--;
 		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
@@ -2352,6 +2377,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+	trace_mm_directreclaim_reclaimzone(sc.nr_reclaimed);
 	return sc.nr_reclaimed >= nr_pages;
 }
 

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 17:38                           ` Peter Zijlstra
                                               ` (2 preceding siblings ...)
  2009-03-06 19:06                             ` Larry Woodman
@ 2009-03-06 21:53                             ` Chris Friesen
  3 siblings, 0 replies; 24+ messages in thread
From: Chris Friesen @ 2009-03-06 21:53 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Larry Woodman, Andrew Morton, Nick Piggin,
	Steven Rostedt, KOSAKI Motohiro, linux-kernel, fweisbec

Peter Zijlstra wrote:
> On Fri, 2009-03-06 at 18:10 +0100, Ingo Molnar wrote:
>> Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
>> it would be nice to hear their opinion about these tracepoints.
>>
>> Andrew, Nick, Peter, what do you think?
> 
> Bit sad we use the struct mm_struct * as mm identifier (little %lx vs %p
> confusion there too), but I suppose there simply isn't anything better.

Could we use the tgid as an mm identifier?  Or does the possibility of 
CLONE_VM & !CLONE_THREAD preclude this?

Chris

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-05 22:16           ` [Patch] mm tracepoints Larry Woodman
@ 2009-03-06 21:16               ` Andrew Morton
  2009-03-06 21:16               ` Andrew Morton
  1 sibling, 0 replies; 24+ messages in thread
From: Andrew Morton @ 2009-03-06 21:16 UTC (permalink / raw)
  To: Larry Woodman; +Cc: linux-kernel, mingo, rostedt, peterz, fweisbec, linux-mm

On Thu, 05 Mar 2009 17:16:40 -0500
Larry Woodman <lwoodman@redhat.com> wrote:

> I've implemented several mm tracepoints to track page allocation and
> freeing, various types of pagefaults and unmaps, and critical page
> reclamation routines.  This is useful for debugging memory allocation
> issues and system performance problems under heavy memory loads:
> 
> # tracer: mm
> #
> #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
> #              | |       |          |         |
>          pdflush-624   [004]   184.293169: wb_kupdate:
> (mm_pdflush_kupdate) count=3e48
>          pdflush-624   [004]   184.293439: get_page_from_freelist:
> (mm_page_allocation) pfn=447c27 zone_free=1940910
>         events/6-33    [006]   184.962879: free_hot_cold_page:
> (mm_page_free) pfn=44bba9
>       irqbalance-8313  [001]   188.042951: unmap_vmas:
> (mm_anon_userfree) mm=ffff88044a7300c0 address=7f9a2eb70000 pfn=24c29a
>              cat-9122  [005]   191.141173: filemap_fault:
> (mm_filemap_fault) primary fault: mm=ffff88024c9d8f40 address=3cea2dd000
> pfn=44d68e
>              cat-9122  [001]   191.143036: handle_mm_fault:
> (mm_anon_fault) mm=ffff88024c8beb40 address=7fffbde99f94 pfn=24ce22
> ...

I'm struggling to think of any memory management problems which this
facility would have helped us solve.  Single-page tracing like this
isn't very interesting or useful.

What we generally are looking for when resolving MM
performance/correctness problems is a representation/visualisation of
aggregated results over a period of time.  That means synchronous or
downstream processing of large amounts of bulk data.

Now, possibly the above information could be used to generate the
needed information.  But the above rather random-looking and chaotic
data output would make it very hard to develop the needed
aggregation/representation tools.

And unless someone actually develops those tools (which is a lot of
work), there isn't much point in adding the kernel infrastructure to
generate the data for the non-existing tool.

I haven't looked at LTT in a while.  What sort of information does it
extract from the MM system?  Is it useful to MM developers?  If so, can
this newly-proposed facility do the same thing?


How about a test case - how could this patch help us (and our testers)
make some progress with the infamous
http://bugzilla.kernel.org/show_bug.cgi?id=12309 ?


Then again, maybe I'm wrong!  Maybe MM developers _do_ believe that
this tool would assist them in their work.  Given that MM develoeprs
are the target market for this feature, it would be sensible to cc the
linux-mm list, methinks?


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
@ 2009-03-06 21:16               ` Andrew Morton
  0 siblings, 0 replies; 24+ messages in thread
From: Andrew Morton @ 2009-03-06 21:16 UTC (permalink / raw)
  To: Larry Woodman; +Cc: linux-kernel, mingo, rostedt, peterz, fweisbec, linux-mm

On Thu, 05 Mar 2009 17:16:40 -0500
Larry Woodman <lwoodman@redhat.com> wrote:

> I've implemented several mm tracepoints to track page allocation and
> freeing, various types of pagefaults and unmaps, and critical page
> reclamation routines.  This is useful for debugging memory allocation
> issues and system performance problems under heavy memory loads:
> 
> # tracer: mm
> #
> #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
> #              | |       |          |         |
>          pdflush-624   [004]   184.293169: wb_kupdate:
> (mm_pdflush_kupdate) count=3e48
>          pdflush-624   [004]   184.293439: get_page_from_freelist:
> (mm_page_allocation) pfn=447c27 zone_free=1940910
>         events/6-33    [006]   184.962879: free_hot_cold_page:
> (mm_page_free) pfn=44bba9
>       irqbalance-8313  [001]   188.042951: unmap_vmas:
> (mm_anon_userfree) mm=ffff88044a7300c0 address=7f9a2eb70000 pfn=24c29a
>              cat-9122  [005]   191.141173: filemap_fault:
> (mm_filemap_fault) primary fault: mm=ffff88024c9d8f40 address=3cea2dd000
> pfn=44d68e
>              cat-9122  [001]   191.143036: handle_mm_fault:
> (mm_anon_fault) mm=ffff88024c8beb40 address=7fffbde99f94 pfn=24ce22
> ...

I'm struggling to think of any memory management problems which this
facility would have helped us solve.  Single-page tracing like this
isn't very interesting or useful.

What we generally are looking for when resolving MM
performance/correctness problems is a representation/visualisation of
aggregated results over a period of time.  That means synchronous or
downstream processing of large amounts of bulk data.

Now, possibly the above information could be used to generate the
needed information.  But the above rather random-looking and chaotic
data output would make it very hard to develop the needed
aggregation/representation tools.

And unless someone actually develops those tools (which is a lot of
work), there isn't much point in adding the kernel infrastructure to
generate the data for the non-existing tool.

I haven't looked at LTT in a while.  What sort of information does it
extract from the MM system?  Is it useful to MM developers?  If so, can
this newly-proposed facility do the same thing?


How about a test case - how could this patch help us (and our testers)
make some progress with the infamous
http://bugzilla.kernel.org/show_bug.cgi?id=12309 ?


Then again, maybe I'm wrong!  Maybe MM developers _do_ believe that
this tool would assist them in their work.  Given that MM develoeprs
are the target market for this feature, it would be sensible to cc the
linux-mm list, methinks?

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 18:01                               ` Ingo Molnar
  2009-03-06 18:20                                 ` Peter Zijlstra
@ 2009-03-06 20:01                                 ` Larry Woodman
  1 sibling, 0 replies; 24+ messages in thread
From: Larry Woodman @ 2009-03-06 20:01 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Peter Zijlstra, Andrew Morton, Nick Piggin, Steven Rostedt,
	KOSAKI Motohiro, linux-kernel, fweisbec

On Fri, 2009-03-06 at 19:01 +0100, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2009-03-06 at 18:38 +0100, Peter Zijlstra wrote:
> > > On Fri, 2009-03-06 at 18:10 +0100, Ingo Molnar wrote:
> > > > Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
> > > > it would be nice to hear their opinion about these tracepoints.
> > > > 
> > > > Andrew, Nick, Peter, what do you think?
> > > 
> > > Bit sad we use the struct mm_struct * as mm identifier (little %lx vs %p
> > > confusion there too), but I suppose there simply isn't anything better.
> > 
> > > Things missing, 
> > 
> > Why only anon and filemap, that misses out on all the funky 
> > driver ->fault() handlers.
> 
> btw., does it include shm faults? I think all of this would be 
> handled if the tracepoint was at handle_mm_fault(), right?

The problem with this approach is you cant tell what kind of fault is
being encountered and how it will be handled until you are way down in
the functions that I added the tracepoints in...

The value of these tracepoint is the data you get from they are
currently located.

Larry

> 
> 	Ingo


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 17:10                         ` Ingo Molnar
  2009-03-06 17:38                           ` Peter Zijlstra
@ 2009-03-06 19:22                           ` Larry Woodman
  1 sibling, 0 replies; 24+ messages in thread
From: Larry Woodman @ 2009-03-06 19:22 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Andrew Morton, Nick Piggin, Steven Rostedt, KOSAKI Motohiro,
	linux-kernel, peterz, fweisbec, Peter Zijlstra

On Fri, 2009-03-06 at 18:10 +0100, Ingo Molnar wrote:
> * Larry Woodman <lwoodman@redhat.com> wrote:
> 
> > > Would that be useless or controversial? We know from 
> > > vma->mapping which inode it maps to. Knowing which file is 
> > > faulting in can be useful - especially when addresses are a 
> > > moving target such as under PIE or with dlopen(), etc.
> > > 
> > > 	Ingo
> > 
> > Attached is the updated patch that applies and builds 
> > correctly (sorry I missed the lockdep tracepoints that were 
> > added at the last minute). [...]
> 
> Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
> it would be nice to hear their opinion about these tracepoints.
> 
> Andrew, Nick, Peter, what do you think?
> 
> About the motivation of these tracepoints: i suspect these 
> tracepoints reflect your years-long experience in dealing with 
> various MM regressions in the enterprise space and these 
> tracepoints would help understand such regressions 
> faster/easier?

Exactly, and without running some "debug enhanced kernel".

> 
> > [...]  As far as the filename:offset is concerned I am working 
> > on that.  Its not as simple as it looks because we have to 
> > follow a variable list of structs that can be null terminated 
> > several places along the way.
> 
> It's definitely not simple! I dont think it should be in this 
> base patch at all - it should be an add-on.
> 
> 	Ingo


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 17:38                           ` Peter Zijlstra
  2009-03-06 17:46                             ` Ingo Molnar
  2009-03-06 17:56                             ` Peter Zijlstra
@ 2009-03-06 19:06                             ` Larry Woodman
  2009-03-06 21:53                             ` Chris Friesen
  3 siblings, 0 replies; 24+ messages in thread
From: Larry Woodman @ 2009-03-06 19:06 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Ingo Molnar, Andrew Morton, Nick Piggin, Steven Rostedt,
	KOSAKI Motohiro, linux-kernel, fweisbec

On Fri, 2009-03-06 at 18:38 +0100, Peter Zijlstra wrote:
> On Fri, 2009-03-06 at 18:10 +0100, Ingo Molnar wrote:
> > Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
> > it would be nice to hear their opinion about these tracepoints.
> > 
> > Andrew, Nick, Peter, what do you think?
> 
> Bit sad we use the struct mm_struct * as mm identifier (little %lx vs %p
> confusion there too), but I suppose there simply isn't anything better.
> 
> Exposing kernel pointers like that might upset some of the security
> folks, not sure if I care though.
> 
> I'm missing the fault_filemap_read counterpart of fault_anon_pgin.

filemap_fault handles both the initial fault when the pte is zero and
pagein when the page has been reclaimed.  It was impossible to implement
them as separate handlers in __do_fault() without changing the
underlying MM code.

> 
> Once you have anon/filemap symmetric, you might consider folding these
> and doing the anon argument thing you do elsewhere.
> 
> Initially I was thinking we lacked the kswapd vs direct reclaim
> information on the pgout data, but since we log the pid:comm for each
> event...

They are separate, trace_mm_kswapd_runs() and
trace_mm_directreclaim_reclaimall().
trace_mm_directreclaim_reclaimzone() is for the zone_reclaim path where
we do local zone reclamation rather than falling off to the next zone in
the zone list.

> 
> Which brings us to mm_pdflush_*, we can already see its pdflush from
> pid:comm, then again, it fits the naming style. Same for
> mm_directreclaim*() - we already know its direct, since its not kswapd
> doing it.
> 

Like I said above there are 2 direct reclaim paths: one is teh call to
try_to_free_pages() out of __alloc_pages_internal() and the other is the
call to shrink_zone() out of __zone_reclaim().  I made a distinction
between these because the first calls shrink_zone for each zone in the
zone list when memory is really low(below min) where the second calls
shrink_zone for the local zone to prevent memory allocation from a
remote node.

> Finally, we have page_free, but not page_alloc? Oh, it is there, just
> not in the obvious place.

In order to get the zone free information it has to be in down in
get_page_from_freelist.

> 
> 
> Things missing, we trace unmap, but not mmap, mprotect, mlock?
> 
I was concentrating more on the operations that traced a page moving
throughout the system.  mmap and mprotect operate on the virtual address
space instead of the pages mapped in that address space.


Larry



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 18:20                                 ` Peter Zijlstra
@ 2009-03-06 18:24                                   ` Ingo Molnar
  0 siblings, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2009-03-06 18:24 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Larry Woodman, Andrew Morton, Nick Piggin, Steven Rostedt,
	KOSAKI Motohiro, linux-kernel, fweisbec


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2009-03-06 at 19:01 +0100, Ingo Molnar wrote:
> > * Peter Zijlstra <peterz@infradead.org> wrote:
> > 
> > > On Fri, 2009-03-06 at 18:38 +0100, Peter Zijlstra wrote:
> > > > On Fri, 2009-03-06 at 18:10 +0100, Ingo Molnar wrote:
> > > > > Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
> > > > > it would be nice to hear their opinion about these tracepoints.
> > > > > 
> > > > > Andrew, Nick, Peter, what do you think?
> > > > 
> > > > Bit sad we use the struct mm_struct * as mm identifier (little %lx vs %p
> > > > confusion there too), but I suppose there simply isn't anything better.
> > > 
> > > > Things missing, 
> > > 
> > > Why only anon and filemap, that misses out on all the funky 
> > > driver ->fault() handlers.
> > 
> > btw., does it include shm faults? I think all of this would 
> > be handled if the tracepoint was at handle_mm_fault(), 
> > right?
> 
> Partially, you wouldn't be able to do the file:offset thing 
> you asked for.

That could be done further down in filemap_fault(). I.e. have an 
all-encompassing tracepoint for all things [user-] page faults, 
and a few opt-in places for more interesting specific fault 
types.

> But yeah, also hugetlb seems to be missing.

Probably not that huge of an issue, given how rare those faults 
are ;-)

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 18:01                               ` Ingo Molnar
@ 2009-03-06 18:20                                 ` Peter Zijlstra
  2009-03-06 18:24                                   ` Ingo Molnar
  2009-03-06 20:01                                 ` Larry Woodman
  1 sibling, 1 reply; 24+ messages in thread
From: Peter Zijlstra @ 2009-03-06 18:20 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Larry Woodman, Andrew Morton, Nick Piggin, Steven Rostedt,
	KOSAKI Motohiro, linux-kernel, fweisbec

On Fri, 2009-03-06 at 19:01 +0100, Ingo Molnar wrote:
> * Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, 2009-03-06 at 18:38 +0100, Peter Zijlstra wrote:
> > > On Fri, 2009-03-06 at 18:10 +0100, Ingo Molnar wrote:
> > > > Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
> > > > it would be nice to hear their opinion about these tracepoints.
> > > > 
> > > > Andrew, Nick, Peter, what do you think?
> > > 
> > > Bit sad we use the struct mm_struct * as mm identifier (little %lx vs %p
> > > confusion there too), but I suppose there simply isn't anything better.
> > 
> > > Things missing, 
> > 
> > Why only anon and filemap, that misses out on all the funky 
> > driver ->fault() handlers.
> 
> btw., does it include shm faults? I think all of this would be 
> handled if the tracepoint was at handle_mm_fault(), right?

Partially, you wouldn't be able to do the file:offset thing you asked
for.

But yeah, also hugetlb seems to be missing.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 17:56                             ` Peter Zijlstra
@ 2009-03-06 18:01                               ` Ingo Molnar
  2009-03-06 18:20                                 ` Peter Zijlstra
  2009-03-06 20:01                                 ` Larry Woodman
  0 siblings, 2 replies; 24+ messages in thread
From: Ingo Molnar @ 2009-03-06 18:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Larry Woodman, Andrew Morton, Nick Piggin, Steven Rostedt,
	KOSAKI Motohiro, linux-kernel, fweisbec


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2009-03-06 at 18:38 +0100, Peter Zijlstra wrote:
> > On Fri, 2009-03-06 at 18:10 +0100, Ingo Molnar wrote:
> > > Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
> > > it would be nice to hear their opinion about these tracepoints.
> > > 
> > > Andrew, Nick, Peter, what do you think?
> > 
> > Bit sad we use the struct mm_struct * as mm identifier (little %lx vs %p
> > confusion there too), but I suppose there simply isn't anything better.
> 
> > Things missing, 
> 
> Why only anon and filemap, that misses out on all the funky 
> driver ->fault() handlers.

btw., does it include shm faults? I think all of this would be 
handled if the tracepoint was at handle_mm_fault(), right?

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 17:38                           ` Peter Zijlstra
  2009-03-06 17:46                             ` Ingo Molnar
@ 2009-03-06 17:56                             ` Peter Zijlstra
  2009-03-06 18:01                               ` Ingo Molnar
  2009-03-06 19:06                             ` Larry Woodman
  2009-03-06 21:53                             ` Chris Friesen
  3 siblings, 1 reply; 24+ messages in thread
From: Peter Zijlstra @ 2009-03-06 17:56 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Larry Woodman, Andrew Morton, Nick Piggin, Steven Rostedt,
	KOSAKI Motohiro, linux-kernel, fweisbec

On Fri, 2009-03-06 at 18:38 +0100, Peter Zijlstra wrote:
> On Fri, 2009-03-06 at 18:10 +0100, Ingo Molnar wrote:
> > Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
> > it would be nice to hear their opinion about these tracepoints.
> > 
> > Andrew, Nick, Peter, what do you think?
> 
> Bit sad we use the struct mm_struct * as mm identifier (little %lx vs %p
> confusion there too), but I suppose there simply isn't anything better.

> Things missing, 

Why only anon and filemap, that misses out on all the funky driver
->fault() handlers.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 17:38                           ` Peter Zijlstra
@ 2009-03-06 17:46                             ` Ingo Molnar
  2009-03-06 17:56                             ` Peter Zijlstra
                                               ` (2 subsequent siblings)
  3 siblings, 0 replies; 24+ messages in thread
From: Ingo Molnar @ 2009-03-06 17:46 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Larry Woodman, Andrew Morton, Nick Piggin, Steven Rostedt,
	KOSAKI Motohiro, linux-kernel, fweisbec


* Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, 2009-03-06 at 18:10 +0100, Ingo Molnar wrote:
> > Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
> > it would be nice to hear their opinion about these tracepoints.
> > 
> > Andrew, Nick, Peter, what do you think?
> 
> Bit sad we use the struct mm_struct * as mm identifier (little 
> %lx vs %p confusion there too), but I suppose there simply 
> isn't anything better.

the other option would be to trace the pgd physical pfn value. 
The physical address of the pagetable is a pretty fundamental 
thing so that abstraction is unlikely to change.

> Exposing kernel pointers like that might upset some of the 
> security folks, not sure if I care though.

it's admin-only.

> I'm missing the fault_filemap_read counterpart of 
> fault_anon_pgin.
> 
> Once you have anon/filemap symmetric, you might consider 
> folding these and doing the anon argument thing you do 
> elsewhere.
> 
> Initially I was thinking we lacked the kswapd vs direct 
> reclaim information on the pgout data, but since we log the 
> pid:comm for each event...
> 
> Which brings us to mm_pdflush_*, we can already see its 
> pdflush from pid:comm, then again, it fits the naming style. 
> Same for mm_directreclaim*() - we already know its direct, 
> since its not kswapd doing it.
> 
> Finally, we have page_free, but not page_alloc? Oh, it is 
> there, just not in the obvious place.
> 
> Things missing, we trace unmap, but not mmap, mprotect, mlock?
> 
> pagelock perhaps?

yeah, pagelock would be nice. In a similar way to lockdep 
tracing. Maybe it should be part of lock tracing?

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 17:10                         ` Ingo Molnar
@ 2009-03-06 17:38                           ` Peter Zijlstra
  2009-03-06 17:46                             ` Ingo Molnar
                                               ` (3 more replies)
  2009-03-06 19:22                           ` Larry Woodman
  1 sibling, 4 replies; 24+ messages in thread
From: Peter Zijlstra @ 2009-03-06 17:38 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Larry Woodman, Andrew Morton, Nick Piggin, Steven Rostedt,
	KOSAKI Motohiro, linux-kernel, fweisbec

On Fri, 2009-03-06 at 18:10 +0100, Ingo Molnar wrote:
> Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
> it would be nice to hear their opinion about these tracepoints.
> 
> Andrew, Nick, Peter, what do you think?

Bit sad we use the struct mm_struct * as mm identifier (little %lx vs %p
confusion there too), but I suppose there simply isn't anything better.

Exposing kernel pointers like that might upset some of the security
folks, not sure if I care though.

I'm missing the fault_filemap_read counterpart of fault_anon_pgin.

Once you have anon/filemap symmetric, you might consider folding these
and doing the anon argument thing you do elsewhere.

Initially I was thinking we lacked the kswapd vs direct reclaim
information on the pgout data, but since we log the pid:comm for each
event...

Which brings us to mm_pdflush_*, we can already see its pdflush from
pid:comm, then again, it fits the naming style. Same for
mm_directreclaim*() - we already know its direct, since its not kswapd
doing it.

Finally, we have page_free, but not page_alloc? Oh, it is there, just
not in the obvious place.


Things missing, we trace unmap, but not mmap, mprotect, mlock?

pagelock perhaps?




^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 16:57                       ` Larry Woodman
@ 2009-03-06 17:10                         ` Ingo Molnar
  2009-03-06 17:38                           ` Peter Zijlstra
  2009-03-06 19:22                           ` Larry Woodman
  0 siblings, 2 replies; 24+ messages in thread
From: Ingo Molnar @ 2009-03-06 17:10 UTC (permalink / raw)
  To: Larry Woodman, Andrew Morton, Nick Piggin
  Cc: Steven Rostedt, KOSAKI Motohiro, linux-kernel, peterz, fweisbec,
	Peter Zijlstra


* Larry Woodman <lwoodman@redhat.com> wrote:

> > Would that be useless or controversial? We know from 
> > vma->mapping which inode it maps to. Knowing which file is 
> > faulting in can be useful - especially when addresses are a 
> > moving target such as under PIE or with dlopen(), etc.
> > 
> > 	Ingo
> 
> Attached is the updated patch that applies and builds 
> correctly (sorry I missed the lockdep tracepoints that were 
> added at the last minute). [...]

Looks pretty good and useful to me. I've Cc:-ed more mm folks, 
it would be nice to hear their opinion about these tracepoints.

Andrew, Nick, Peter, what do you think?

About the motivation of these tracepoints: i suspect these 
tracepoints reflect your years-long experience in dealing with 
various MM regressions in the enterprise space and these 
tracepoints would help understand such regressions 
faster/easier?

> [...]  As far as the filename:offset is concerned I am working 
> on that.  Its not as simple as it looks because we have to 
> follow a variable list of structs that can be null terminated 
> several places along the way.

It's definitely not simple! I dont think it should be in this 
base patch at all - it should be an add-on.

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 13:55                     ` Ingo Molnar
@ 2009-03-06 16:57                       ` Larry Woodman
  2009-03-06 17:10                         ` Ingo Molnar
  0 siblings, 1 reply; 24+ messages in thread
From: Larry Woodman @ 2009-03-06 16:57 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, KOSAKI Motohiro, linux-kernel, peterz, fweisbec,
	Peter Zijlstra

[-- Attachment #1: Type: text/plain, Size: 3257 bytes --]

On Fri, 2009-03-06 at 14:55 +0100, Ingo Molnar wrote:
> * Larry Woodman <lwoodman@redhat.com> wrote:
> 
> > On Fri, 2009-03-06 at 12:04 +0100, Ingo Molnar wrote:
> > > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > > 
> > > > 
> > > > On Fri, 6 Mar 2009, KOSAKI Motohiro wrote:
> > > > 
> > > > > > I've implemented several mm tracepoints to track page allocation and
> > > > > > freeing, various types of pagefaults and unmaps, and critical page
> > > > > > reclamation routines.  This is useful for debugging memory allocation
> > > > > > issues and system performance problems under heavy memory loads:
> > > > > > 
> > > > > > # tracer: mm
> > > > > > #
> > > > > > #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
> > > > > > #              | |       |          |         |
> > > > > >          pdflush-624   [004]   184.293169: wb_kupdate:
> > > > > > (mm_pdflush_kupdate) count=3e48
> > > > > >          pdflush-624   [004]   184.293439: get_page_from_freelist:
> > > > > > (mm_page_allocation) pfn=447c27 zone_free=1940910
> > > > > >         events/6-33    [006]   184.962879: free_hot_cold_page:
> > > > > > (mm_page_free) pfn=44bba9
> > > > > >       irqbalance-8313  [001]   188.042951: unmap_vmas:
> > > > > > (mm_anon_userfree) mm=ffff88044a7300c0 address=7f9a2eb70000 pfn=24c29a
> > > > > >              cat-9122  [005]   191.141173: filemap_fault:
> > > > > > (mm_filemap_fault) primary fault: mm=ffff88024c9d8f40 address=3cea2dd000
> > > > > > pfn=44d68e
> > > > > >              cat-9122  [001]   191.143036: handle_mm_fault:
> > > > > > (mm_anon_fault) mm=ffff88024c8beb40 address=7fffbde99f94 pfn=24ce22
> > > > > > ...
> > > > > 
> > > > > Hi Larry,
> > > > > 
> > > > > I've started to evaluate your patch.
> > > > > 
> > > > > firstly, this patch can't apply tip/master.
> > > 
> > > yeah, would be nice to have a patch against:
> > > 
> > >   http://people.redhat.com/mingo/tip.git/README
> > 
> > Yeah I'll fix that, it is a moving target.
> > > 
> > > > > secondly, I don't think the address of mm_struct and pfn 
> > > > > help to analysis. administrator don't know the page is which 
> > > > > file's cache.
> > > > 
> > > > The mm_struct may not be helpful since there should be a 1 to 
> > > > 1 mapping between user tasks and the mm struct. Hmm, maybe 
> > > > not, due to threads?
> > > 
> > > Correct - so the mm ID looks useful.
> > > 
> > > > But the pfn is helpful since it is a unique identifier for 
> > > > what physical page was mapped.
> > > 
> > > Yeah. Nevertheless some sort of filename:offset indicator 
> > > would be nice too. (as an add-on)
> > 
> > You mean in the filemap pagefault case???
> 
> Would that be useless or controversial? We know from 
> vma->mapping which inode it maps to. Knowing which file is 
> faulting in can be useful - especially when addresses are a 
> moving target such as under PIE or with dlopen(), etc.
> 
> 	Ingo

Attached is the updated patch that applies and builds correctly(sorry I
missed the lockdep tracepoints that were added at the last minute).  As
far as the filename:offset is concerned I am working on that.  Its not
as simple as it looks because we have to follow a variable list of
structs that can be null terminated several places along the way.

Larry 



[-- Attachment #2: mm_tracepoints.diff --]
[-- Type: text/x-patch, Size: 22239 bytes --]

diff --git a/include/trace/mm.h b/include/trace/mm.h
new file mode 100644
index 0000000..a3e760e
--- /dev/null
+++ b/include/trace/mm.h
@@ -0,0 +1,9 @@
+#ifndef _TRACE_MM_H
+#define _TRACE_MM_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+
+#include <trace/mm_event_types.h>
+
+#endif
diff --git a/include/trace/mm_event_types.h b/include/trace/mm_event_types.h
new file mode 100644
index 0000000..f6fbbc5
--- /dev/null
+++ b/include/trace/mm_event_types.h
@@ -0,0 +1,281 @@
+/* use <trace/mm.h> instead */
+#ifndef TRACE_FORMAT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mm
+
+TRACE_EVENT_FORMAT(mm_anon_fault,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_pgin,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_cow,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_userfree,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_unmap,
+	TPPROTO(unsigned long pfn, int success),
+	TPARGS(pfn, success),
+	TPFMT("%s: pfn=%lx", pfn, success ? "succeeded" : "failed"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, success, success)
+	),
+	TPRAWFMT("pfn %lx success %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_fault,
+	TPPROTO(struct mm_struct *mm, unsigned long address,
+			unsigned long pfn, int flag),
+	TPARGS(mm, address, pfn, flag),
+	TPFMT("%s: mm=%lx address=%lx pfn=%lx",
+		flag ? "pagein" : "primary fault", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, flag, flag)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx flag %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_cow,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_unmap,
+	TPPROTO(unsigned long pfn, int success),
+	TPARGS(pfn, success),
+	TPFMT("%s: pfn=%lx", pfn, success ? "succeeded" : "failed"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, success, success)
+	),
+	TPRAWFMT("pfn %lx success %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_userunmap,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_pgout,
+	TPPROTO(unsigned long pfn, int anon),
+	TPARGS(pfn, anon),
+	TPFMT("%s page: pfn=%lx", pfn, anon ? "anonymous" : "pagecache"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_free,
+	TPPROTO(unsigned long pfn, int anon),
+	TPARGS(pfn, anon),
+	TPFMT("%s page: pfn=%lx", pfn, anon ? "anonymous" : "pagecache"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pdflush_bgwriteout,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pdflush_kupdate,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_page_allocation,
+	TPPROTO(unsigned long pfn, unsigned long free),
+	TPARGS(pfn, free),
+	TPFMT("pfn=%lx zone_free=%ld", pfn, free),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(unsigned long, free, free)
+	),
+	TPRAWFMT("pfn %lx free %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_kswapd_runs,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_directreclaim_reclaimall,
+	TPPROTO(unsigned long priority),
+	TPARGS(priority),
+	TPFMT("priority=%lx", priority),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, priority, priority)
+	),
+	TPRAWFMT("priority %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_directreclaim_reclaimzone,
+	TPPROTO(unsigned long reclaimed),
+	TPARGS(reclaimed),
+	TPFMT("reclaimed=%lx", reclaimed),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, reclaimed, reclaimed)
+	),
+	TPRAWFMT("reclaimed %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkzone,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive_a2a,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive_a2i,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive_i2a,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive_i2i,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_page_free,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+#undef TRACE_SYSTEM
+
+#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
index df56f56..153d262 100644
--- a/include/trace/trace_event_types.h
+++ b/include/trace/trace_event_types.h
@@ -3,3 +3,4 @@
 #include <trace/sched_event_types.h>
 #include <trace/irq_event_types.h>
 #include <trace/lockdep_event_types.h>
+#include <trace/mm_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index fd13750..2752e7f 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -3,3 +3,4 @@
 #include <trace/sched.h>
 #include <trace/irq.h>
 #include <trace/lockdep.h>
+#include <trace/mm.h>
diff --git a/mm/filemap.c b/mm/filemap.c
index 126d397..9c68755 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <trace/mm.h>
 #include "internal.h"
 
 /*
@@ -1436,6 +1437,7 @@ static int page_cache_read(struct file *file, pgoff_t offset)
 
 #define MMAP_LOTSAMISS  (100)
 
+DEFINE_TRACE(mm_filemap_fault);
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:	vma in which the fault was taken
@@ -1547,6 +1549,8 @@ retry_find:
 	 */
 	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	vmf->page = page;
+	trace_mm_filemap_fault(vma->vm_mm, (unsigned long)vmf->virtual_address,
+			page_to_pfn(page), vmf->flags&FAULT_FLAG_NONLINEAR);
 	return ret | VM_FAULT_LOCKED;
 
 no_cached_page:
diff --git a/mm/memory.c b/mm/memory.c
index baa999e..6acc389 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -55,6 +55,7 @@
 #include <linux/kallsyms.h>
 #include <linux/swapops.h>
 #include <linux/elf.h>
+#include <trace/mm.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -758,6 +759,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return ret;
 }
 
+DEFINE_TRACE(mm_anon_userfree);
+DEFINE_TRACE(mm_filemap_userunmap);
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
@@ -812,15 +815,19 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 						addr) != page->index)
 				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
-			if (PageAnon(page))
+			if (PageAnon(page)) {
 				anon_rss--;
-			else {
+				trace_mm_anon_userfree(mm, addr,
+							page_to_pfn(page));
+			} else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent) &&
 				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
 				file_rss--;
+				trace_mm_filemap_userunmap(mm, addr,
+							page_to_pfn(page));
 			}
 			page_remove_rmap(page);
 			if (unlikely(page_mapcount(page) < 0))
@@ -1867,6 +1874,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		copy_user_highpage(dst, src, va, vma);
 }
 
+DEFINE_TRACE(mm_anon_cow);
+DEFINE_TRACE(mm_filemap_cow);
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -1889,7 +1898,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		spinlock_t *ptl, pte_t orig_pte)
 {
-	struct page *old_page, *new_page;
+	struct page *old_page, *new_page = NULL;
 	pte_t entry;
 	int reuse = 0, ret = 0;
 	int page_mkwrite = 0;
@@ -2019,9 +2028,14 @@ gotten:
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
+				trace_mm_filemap_cow(mm, address,
+					page_to_pfn(new_page));
 			}
-		} else
+		} else {
 			inc_mm_counter(mm, anon_rss);
+			trace_mm_anon_cow(mm, address,
+					page_to_pfn(new_page));
+		}
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2386,6 +2400,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 	return 0;
 }
 
+DEFINE_TRACE(mm_anon_pgin);
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2499,6 +2514,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
+	trace_mm_anon_pgin(mm, address, page_to_pfn(page));
 	return ret;
 out_nomap:
 	mem_cgroup_cancel_charge_swapin(ptr);
@@ -2508,6 +2524,7 @@ out_nomap:
 	return ret;
 }
 
+DEFINE_TRACE(mm_anon_fault);
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2531,6 +2548,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto oom;
 	__SetPageUptodate(page);
 
+	trace_mm_anon_fault(mm, address, page_to_pfn(page));
 	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
 		goto oom_free_page;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 74dc57c..7ebd33c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <trace/mm.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -677,6 +678,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
         }
 }
 
+DEFINE_TRACE(mm_pdflush_bgwriteout);
 /*
  * writeback at least _min_pages, and keep writing until the amount of dirty
  * memory is less than the background threshold, or until we're all clean.
@@ -716,6 +718,7 @@ static void background_writeout(unsigned long _min_pages)
 				break;
 		}
 	}
+	trace_mm_pdflush_bgwriteout(_min_pages);
 }
 
 /*
@@ -737,6 +740,7 @@ static void laptop_timer_fn(unsigned long unused);
 static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
 static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 
+DEFINE_TRACE(mm_pdflush_kupdate);
 /*
  * Periodic writeback of "old" data.
  *
@@ -776,6 +780,7 @@ static void wb_kupdate(unsigned long arg)
 	nr_to_write = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	trace_mm_pdflush_kupdate(nr_to_write);
 	while (nr_to_write > 0) {
 		wbc.more_io = 0;
 		wbc.encountered_congestion = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0d0633f..b088370 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -47,6 +47,7 @@
 #include <linux/page-isolation.h>
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -994,6 +995,7 @@ void mark_free_pages(struct zone *zone)
 }
 #endif /* CONFIG_PM */
 
+DEFINE_TRACE(mm_page_free);
 /*
  * Free a 0-order page
  */
@@ -1010,6 +1012,7 @@ static void free_hot_cold_page(struct page *page, int cold)
 	if (free_pages_check(page))
 		return;
 
+	trace_mm_page_free(page_to_pfn(page));
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
 		debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
@@ -1399,6 +1402,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 }
 #endif	/* CONFIG_NUMA */
 
+DEFINE_TRACE(mm_page_allocation);
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -1453,8 +1457,11 @@ zonelist_scan:
 		}
 
 		page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
-		if (page)
+		if (page) {
+			trace_mm_page_allocation(page_to_pfn(page),
+					zone_page_state(zone, NR_FREE_PAGES));
 			break;
+		}
 this_zone_full:
 		if (NUMA_BUILD)
 			zlc_mark_zone_full(zonelist, z);
diff --git a/mm/rmap.c b/mm/rmap.c
index 1652166..39a4876 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -50,6 +50,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 
@@ -978,6 +979,7 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
 	return mlocked;
 }
 
+DEFINE_TRACE(mm_anon_unmap);
 /**
  * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
  * rmap method
@@ -1034,9 +1036,11 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 	else if (ret == SWAP_MLOCK)
 		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
 
+	trace_mm_anon_unmap(page_to_pfn(page), ret == SWAP_SUCCESS);
 	return ret;
 }
 
+DEFINE_TRACE(mm_filemap_unmap);
 /**
  * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
  * @page: the page to unmap/unlock
@@ -1170,6 +1174,7 @@ out:
 		ret = SWAP_MLOCK;	/* actually mlocked the page */
 	else if (ret == SWAP_MLOCK)
 		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
+	trace_mm_filemap_unmap(page_to_pfn(page), ret == SWAP_SUCCESS);
 	return ret;
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ae6f4c1..654d17f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -40,6 +40,7 @@
 #include <linux/memcontrol.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -329,6 +330,7 @@ typedef enum {
 	PAGE_CLEAN,
 } pageout_t;
 
+DEFINE_TRACE(mm_pagereclaim_pgout);
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
@@ -407,6 +409,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 			ClearPageReclaim(page);
 		}
 		inc_zone_page_state(page, NR_VMSCAN_WRITE);
+		trace_mm_pagereclaim_pgout(page_to_pfn(page), PageAnon(page));
 		return PAGE_SUCCESS;
 	}
 
@@ -570,6 +573,9 @@ void putback_lru_page(struct page *page)
 #endif /* CONFIG_UNEVICTABLE_LRU */
 
 
+DEFINE_TRACE(mm_pagereclaim_free);
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive_i2a);
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive_i2i);
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -758,6 +764,7 @@ free_it:
 			__pagevec_free(&freed_pvec);
 			pagevec_reinit(&freed_pvec);
 		}
+		trace_mm_pagereclaim_free(page_to_pfn(page), PageAnon(page));
 		continue;
 
 cull_mlocked:
@@ -774,10 +781,12 @@ activate_locked:
 		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
 		pgactivate++;
+		trace_mm_pagereclaim_shrinkinactive_i2a(page_to_pfn(page));
 keep_locked:
 		unlock_page(page);
 keep:
 		list_add(&page->lru, &ret_pages);
+		trace_mm_pagereclaim_shrinkinactive_i2i(page_to_pfn(page));
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 	list_splice(&ret_pages, page_list);
@@ -1036,6 +1045,7 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive);
 /*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
@@ -1170,6 +1180,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 done:
 	local_irq_enable();
 	pagevec_release(&pvec);
+	trace_mm_pagereclaim_shrinkinactive(nr_reclaimed);
 	return nr_reclaimed;
 }
 
@@ -1187,6 +1198,9 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
 		zone->prev_priority = priority;
 }
 
+DEFINE_TRACE(mm_pagereclaim_shrinkactive);
+DEFINE_TRACE(mm_pagereclaim_shrinkactive_a2a);
+DEFINE_TRACE(mm_pagereclaim_shrinkactive_a2i);
 /*
  * This moves pages from the active list to the inactive list.
  *
@@ -1247,6 +1261,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 
 		if (unlikely(!page_evictable(page, NULL))) {
 			putback_lru_page(page);
+			trace_mm_pagereclaim_shrinkactive_a2a(page_to_pfn(page));
 			continue;
 		}
 
@@ -1256,6 +1271,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			pgmoved++;
 
 		list_add(&page->lru, &l_inactive);
+		trace_mm_pagereclaim_shrinkactive_a2i(page_to_pfn(page));
 	}
 
 	/*
@@ -1310,6 +1326,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		pagevec_swap_free(&pvec);
 
 	pagevec_release(&pvec);
+	trace_mm_pagereclaim_shrinkactive(pgscanned);
 }
 
 static int inactive_anon_is_low_global(struct zone *zone)
@@ -1450,6 +1467,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 }
 
 
+DEFINE_TRACE(mm_pagereclaim_shrinkzone);
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -1510,6 +1528,7 @@ static void shrink_zone(int priority, struct zone *zone,
 	}
 
 	sc->nr_reclaimed = nr_reclaimed;
+	trace_mm_pagereclaim_shrinkzone(nr_reclaimed);
 
 	/*
 	 * Even if we did not try to evict anon pages at all, we want to
@@ -1521,6 +1540,7 @@ static void shrink_zone(int priority, struct zone *zone,
 	throttle_vm_writeout(sc->gfp_mask);
 }
 
+DEFINE_TRACE(mm_directreclaim_reclaimall);
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -1569,6 +1589,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 							priority);
 		}
 
+		trace_mm_directreclaim_reclaimall(priority);
 		shrink_zone(priority, zone, sc);
 	}
 }
@@ -1732,6 +1753,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 }
 #endif
 
+DEFINE_TRACE(mm_kswapd_runs);
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1938,6 +1960,7 @@ out:
 		goto loop_again;
 	}
 
+	trace_mm_kswapd_runs(sc.nr_reclaimed);
 	return sc.nr_reclaimed;
 }
 
@@ -2278,6 +2301,7 @@ int sysctl_min_unmapped_ratio = 1;
  */
 int sysctl_min_slab_ratio = 5;
 
+DEFINE_TRACE(mm_directreclaim_reclaimzone);
 /*
  * Try to free up some pages from this zone through reclaim.
  */
@@ -2321,6 +2345,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		do {
 			note_zone_scanning_priority(zone, priority);
 			shrink_zone(priority, zone, &sc);
+			trace_mm_directreclaim_reclaimzone(priority);
 			priority--;
 		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
@@ -2352,6 +2377,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+	trace_mm_directreclaim_reclaimzone(sc.nr_reclaimed);
 	return sc.nr_reclaimed >= nr_pages;
 }
 

^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 12:33                   ` Larry Woodman
@ 2009-03-06 13:55                     ` Ingo Molnar
  2009-03-06 16:57                       ` Larry Woodman
  0 siblings, 1 reply; 24+ messages in thread
From: Ingo Molnar @ 2009-03-06 13:55 UTC (permalink / raw)
  To: Larry Woodman
  Cc: Steven Rostedt, KOSAKI Motohiro, linux-kernel, peterz, fweisbec,
	Peter Zijlstra


* Larry Woodman <lwoodman@redhat.com> wrote:

> On Fri, 2009-03-06 at 12:04 +0100, Ingo Molnar wrote:
> > * Steven Rostedt <rostedt@goodmis.org> wrote:
> > 
> > > 
> > > On Fri, 6 Mar 2009, KOSAKI Motohiro wrote:
> > > 
> > > > > I've implemented several mm tracepoints to track page allocation and
> > > > > freeing, various types of pagefaults and unmaps, and critical page
> > > > > reclamation routines.  This is useful for debugging memory allocation
> > > > > issues and system performance problems under heavy memory loads:
> > > > > 
> > > > > # tracer: mm
> > > > > #
> > > > > #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
> > > > > #              | |       |          |         |
> > > > >          pdflush-624   [004]   184.293169: wb_kupdate:
> > > > > (mm_pdflush_kupdate) count=3e48
> > > > >          pdflush-624   [004]   184.293439: get_page_from_freelist:
> > > > > (mm_page_allocation) pfn=447c27 zone_free=1940910
> > > > >         events/6-33    [006]   184.962879: free_hot_cold_page:
> > > > > (mm_page_free) pfn=44bba9
> > > > >       irqbalance-8313  [001]   188.042951: unmap_vmas:
> > > > > (mm_anon_userfree) mm=ffff88044a7300c0 address=7f9a2eb70000 pfn=24c29a
> > > > >              cat-9122  [005]   191.141173: filemap_fault:
> > > > > (mm_filemap_fault) primary fault: mm=ffff88024c9d8f40 address=3cea2dd000
> > > > > pfn=44d68e
> > > > >              cat-9122  [001]   191.143036: handle_mm_fault:
> > > > > (mm_anon_fault) mm=ffff88024c8beb40 address=7fffbde99f94 pfn=24ce22
> > > > > ...
> > > > 
> > > > Hi Larry,
> > > > 
> > > > I've started to evaluate your patch.
> > > > 
> > > > firstly, this patch can't apply tip/master.
> > 
> > yeah, would be nice to have a patch against:
> > 
> >   http://people.redhat.com/mingo/tip.git/README
> 
> Yeah I'll fix that, it is a moving target.
> > 
> > > > secondly, I don't think the address of mm_struct and pfn 
> > > > help to analysis. administrator don't know the page is which 
> > > > file's cache.
> > > 
> > > The mm_struct may not be helpful since there should be a 1 to 
> > > 1 mapping between user tasks and the mm struct. Hmm, maybe 
> > > not, due to threads?
> > 
> > Correct - so the mm ID looks useful.
> > 
> > > But the pfn is helpful since it is a unique identifier for 
> > > what physical page was mapped.
> > 
> > Yeah. Nevertheless some sort of filename:offset indicator 
> > would be nice too. (as an add-on)
> 
> You mean in the filemap pagefault case???

Would that be useless or controversial? We know from 
vma->mapping which inode it maps to. Knowing which file is 
faulting in can be useful - especially when addresses are a 
moving target such as under PIE or with dlopen(), etc.

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06 11:04                 ` Ingo Molnar
@ 2009-03-06 12:33                   ` Larry Woodman
  2009-03-06 13:55                     ` Ingo Molnar
  0 siblings, 1 reply; 24+ messages in thread
From: Larry Woodman @ 2009-03-06 12:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Steven Rostedt, KOSAKI Motohiro, linux-kernel, peterz, fweisbec,
	Peter Zijlstra

On Fri, 2009-03-06 at 12:04 +0100, Ingo Molnar wrote:
> * Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> > 
> > On Fri, 6 Mar 2009, KOSAKI Motohiro wrote:
> > 
> > > > I've implemented several mm tracepoints to track page allocation and
> > > > freeing, various types of pagefaults and unmaps, and critical page
> > > > reclamation routines.  This is useful for debugging memory allocation
> > > > issues and system performance problems under heavy memory loads:
> > > > 
> > > > # tracer: mm
> > > > #
> > > > #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
> > > > #              | |       |          |         |
> > > >          pdflush-624   [004]   184.293169: wb_kupdate:
> > > > (mm_pdflush_kupdate) count=3e48
> > > >          pdflush-624   [004]   184.293439: get_page_from_freelist:
> > > > (mm_page_allocation) pfn=447c27 zone_free=1940910
> > > >         events/6-33    [006]   184.962879: free_hot_cold_page:
> > > > (mm_page_free) pfn=44bba9
> > > >       irqbalance-8313  [001]   188.042951: unmap_vmas:
> > > > (mm_anon_userfree) mm=ffff88044a7300c0 address=7f9a2eb70000 pfn=24c29a
> > > >              cat-9122  [005]   191.141173: filemap_fault:
> > > > (mm_filemap_fault) primary fault: mm=ffff88024c9d8f40 address=3cea2dd000
> > > > pfn=44d68e
> > > >              cat-9122  [001]   191.143036: handle_mm_fault:
> > > > (mm_anon_fault) mm=ffff88024c8beb40 address=7fffbde99f94 pfn=24ce22
> > > > ...
> > > 
> > > Hi Larry,
> > > 
> > > I've started to evaluate your patch.
> > > 
> > > firstly, this patch can't apply tip/master.
> 
> yeah, would be nice to have a patch against:
> 
>   http://people.redhat.com/mingo/tip.git/README

Yeah I'll fix that, it is a moving target.
> 
> > > secondly, I don't think the address of mm_struct and pfn 
> > > help to analysis. administrator don't know the page is which 
> > > file's cache.
> > 
> > The mm_struct may not be helpful since there should be a 1 to 
> > 1 mapping between user tasks and the mm struct. Hmm, maybe 
> > not, due to threads?
> 
> Correct - so the mm ID looks useful.
> 
> > But the pfn is helpful since it is a unique identifier for 
> > what physical page was mapped.
> 
> Yeah. Nevertheless some sort of filename:offset indicator would 
> be nice too. (as an add-on)

You mean in the filemap pagefault case???

Thanks, Larry

> 
> 	Ingo


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06  2:26               ` Steven Rostedt
@ 2009-03-06 11:04                 ` Ingo Molnar
  2009-03-06 12:33                   ` Larry Woodman
  0 siblings, 1 reply; 24+ messages in thread
From: Ingo Molnar @ 2009-03-06 11:04 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: KOSAKI Motohiro, Larry Woodman, linux-kernel, peterz, fweisbec,
	Peter Zijlstra


* Steven Rostedt <rostedt@goodmis.org> wrote:

> 
> On Fri, 6 Mar 2009, KOSAKI Motohiro wrote:
> 
> > > I've implemented several mm tracepoints to track page allocation and
> > > freeing, various types of pagefaults and unmaps, and critical page
> > > reclamation routines.  This is useful for debugging memory allocation
> > > issues and system performance problems under heavy memory loads:
> > > 
> > > # tracer: mm
> > > #
> > > #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
> > > #              | |       |          |         |
> > >          pdflush-624   [004]   184.293169: wb_kupdate:
> > > (mm_pdflush_kupdate) count=3e48
> > >          pdflush-624   [004]   184.293439: get_page_from_freelist:
> > > (mm_page_allocation) pfn=447c27 zone_free=1940910
> > >         events/6-33    [006]   184.962879: free_hot_cold_page:
> > > (mm_page_free) pfn=44bba9
> > >       irqbalance-8313  [001]   188.042951: unmap_vmas:
> > > (mm_anon_userfree) mm=ffff88044a7300c0 address=7f9a2eb70000 pfn=24c29a
> > >              cat-9122  [005]   191.141173: filemap_fault:
> > > (mm_filemap_fault) primary fault: mm=ffff88024c9d8f40 address=3cea2dd000
> > > pfn=44d68e
> > >              cat-9122  [001]   191.143036: handle_mm_fault:
> > > (mm_anon_fault) mm=ffff88024c8beb40 address=7fffbde99f94 pfn=24ce22
> > > ...
> > 
> > Hi Larry,
> > 
> > I've started to evaluate your patch.
> > 
> > firstly, this patch can't apply tip/master.

yeah, would be nice to have a patch against:

  http://people.redhat.com/mingo/tip.git/README

> > secondly, I don't think the address of mm_struct and pfn 
> > help to analysis. administrator don't know the page is which 
> > file's cache.
> 
> The mm_struct may not be helpful since there should be a 1 to 
> 1 mapping between user tasks and the mm struct. Hmm, maybe 
> not, due to threads?

Correct - so the mm ID looks useful.

> But the pfn is helpful since it is a unique identifier for 
> what physical page was mapped.

Yeah. Nevertheless some sort of filename:offset indicator would 
be nice too. (as an add-on)

	Ingo

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-06  2:11             ` KOSAKI Motohiro
@ 2009-03-06  2:26               ` Steven Rostedt
  2009-03-06 11:04                 ` Ingo Molnar
  0 siblings, 1 reply; 24+ messages in thread
From: Steven Rostedt @ 2009-03-06  2:26 UTC (permalink / raw)
  To: KOSAKI Motohiro; +Cc: Larry Woodman, linux-kernel, mingo, peterz, fweisbec


On Fri, 6 Mar 2009, KOSAKI Motohiro wrote:

> > I've implemented several mm tracepoints to track page allocation and
> > freeing, various types of pagefaults and unmaps, and critical page
> > reclamation routines.  This is useful for debugging memory allocation
> > issues and system performance problems under heavy memory loads:
> > 
> > # tracer: mm
> > #
> > #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
> > #              | |       |          |         |
> >          pdflush-624   [004]   184.293169: wb_kupdate:
> > (mm_pdflush_kupdate) count=3e48
> >          pdflush-624   [004]   184.293439: get_page_from_freelist:
> > (mm_page_allocation) pfn=447c27 zone_free=1940910
> >         events/6-33    [006]   184.962879: free_hot_cold_page:
> > (mm_page_free) pfn=44bba9
> >       irqbalance-8313  [001]   188.042951: unmap_vmas:
> > (mm_anon_userfree) mm=ffff88044a7300c0 address=7f9a2eb70000 pfn=24c29a
> >              cat-9122  [005]   191.141173: filemap_fault:
> > (mm_filemap_fault) primary fault: mm=ffff88024c9d8f40 address=3cea2dd000
> > pfn=44d68e
> >              cat-9122  [001]   191.143036: handle_mm_fault:
> > (mm_anon_fault) mm=ffff88024c8beb40 address=7fffbde99f94 pfn=24ce22
> > ...
> 
> Hi Larry,
> 
> I've started to evaluate your patch.
> 
> firstly, this patch can't apply tip/master.
> secondly, I don't think the address of mm_struct and pfn help to analysis.
> administrator don't know the page is which file's cache.

The mm_struct may not be helpful since there should be a 1 to 1 mapping 
between user tasks and the mm struct. Hmm, maybe not, due to threads?

But the pfn is helpful since it is a unique identifier for what physical 
page was mapped.

-- Steve


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [Patch] mm tracepoints
  2009-03-05 22:16           ` [Patch] mm tracepoints Larry Woodman
@ 2009-03-06  2:11             ` KOSAKI Motohiro
  2009-03-06  2:26               ` Steven Rostedt
  2009-03-06 21:16               ` Andrew Morton
  1 sibling, 1 reply; 24+ messages in thread
From: KOSAKI Motohiro @ 2009-03-06  2:11 UTC (permalink / raw)
  To: Larry Woodman
  Cc: kosaki.motohiro, linux-kernel, mingo, rostedt, peterz, fweisbec

> I've implemented several mm tracepoints to track page allocation and
> freeing, various types of pagefaults and unmaps, and critical page
> reclamation routines.  This is useful for debugging memory allocation
> issues and system performance problems under heavy memory loads:
> 
> # tracer: mm
> #
> #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
> #              | |       |          |         |
>          pdflush-624   [004]   184.293169: wb_kupdate:
> (mm_pdflush_kupdate) count=3e48
>          pdflush-624   [004]   184.293439: get_page_from_freelist:
> (mm_page_allocation) pfn=447c27 zone_free=1940910
>         events/6-33    [006]   184.962879: free_hot_cold_page:
> (mm_page_free) pfn=44bba9
>       irqbalance-8313  [001]   188.042951: unmap_vmas:
> (mm_anon_userfree) mm=ffff88044a7300c0 address=7f9a2eb70000 pfn=24c29a
>              cat-9122  [005]   191.141173: filemap_fault:
> (mm_filemap_fault) primary fault: mm=ffff88024c9d8f40 address=3cea2dd000
> pfn=44d68e
>              cat-9122  [001]   191.143036: handle_mm_fault:
> (mm_anon_fault) mm=ffff88024c8beb40 address=7fffbde99f94 pfn=24ce22
> ...

Hi Larry,

I've started to evaluate your patch.

firstly, this patch can't apply tip/master.
secondly, I don't think the address of mm_struct and pfn help to analysis.
administrator don't know the page is which file's cache.




^ permalink raw reply	[flat|nested] 24+ messages in thread

* [Patch] mm tracepoints
  2009-01-30  9:05         ` Nigel Cunningham
@ 2009-03-05 22:16           ` Larry Woodman
  2009-03-06  2:11             ` KOSAKI Motohiro
  2009-03-06 21:16               ` Andrew Morton
  0 siblings, 2 replies; 24+ messages in thread
From: Larry Woodman @ 2009-03-05 22:16 UTC (permalink / raw)
  To: linux-kernel; +Cc: mingo, rostedt, peterz, fweisbec

[-- Attachment #1: Type: text/plain, Size: 1134 bytes --]

I've implemented several mm tracepoints to track page allocation and
freeing, various types of pagefaults and unmaps, and critical page
reclamation routines.  This is useful for debugging memory allocation
issues and system performance problems under heavy memory loads:

# tracer: mm
#
#           TASK-PID    CPU#    TIMESTAMP  FUNCTION
#              | |       |          |         |
         pdflush-624   [004]   184.293169: wb_kupdate:
(mm_pdflush_kupdate) count=3e48
         pdflush-624   [004]   184.293439: get_page_from_freelist:
(mm_page_allocation) pfn=447c27 zone_free=1940910
        events/6-33    [006]   184.962879: free_hot_cold_page:
(mm_page_free) pfn=44bba9
      irqbalance-8313  [001]   188.042951: unmap_vmas:
(mm_anon_userfree) mm=ffff88044a7300c0 address=7f9a2eb70000 pfn=24c29a
             cat-9122  [005]   191.141173: filemap_fault:
(mm_filemap_fault) primary fault: mm=ffff88024c9d8f40 address=3cea2dd000
pfn=44d68e
             cat-9122  [001]   191.143036: handle_mm_fault:
(mm_anon_fault) mm=ffff88024c8beb40 address=7fffbde99f94 pfn=24ce22
...



Signed-off-by: Larry Woodman <lwoodman@redhat.com>

[-- Attachment #2: mm_tracepoints.diff --]
[-- Type: text/x-patch, Size: 22175 bytes --]

diff --git a/include/trace/mm.h b/include/trace/mm.h
new file mode 100644
index 0000000..a3e760e
--- /dev/null
+++ b/include/trace/mm.h
@@ -0,0 +1,9 @@
+#ifndef _TRACE_MM_H
+#define _TRACE_MM_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+
+#include <trace/mm_event_types.h>
+
+#endif
diff --git a/include/trace/mm_event_types.h b/include/trace/mm_event_types.h
new file mode 100644
index 0000000..f6fbbc5
--- /dev/null
+++ b/include/trace/mm_event_types.h
@@ -0,0 +1,281 @@
+/* use <trace/mm.h> instead */
+#ifndef TRACE_FORMAT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM mm
+
+TRACE_EVENT_FORMAT(mm_anon_fault,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_pgin,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_cow,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_userfree,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_anon_unmap,
+	TPPROTO(unsigned long pfn, int success),
+	TPARGS(pfn, success),
+	TPFMT("%s: pfn=%lx", pfn, success ? "succeeded" : "failed"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, success, success)
+	),
+	TPRAWFMT("pfn %lx success %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_fault,
+	TPPROTO(struct mm_struct *mm, unsigned long address,
+			unsigned long pfn, int flag),
+	TPARGS(mm, address, pfn, flag),
+	TPFMT("%s: mm=%lx address=%lx pfn=%lx",
+		flag ? "pagein" : "primary fault", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, flag, flag)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx flag %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_cow,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_unmap,
+	TPPROTO(unsigned long pfn, int success),
+	TPARGS(pfn, success),
+	TPFMT("%s: pfn=%lx", pfn, success ? "succeeded" : "failed"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(int, success, success)
+	),
+	TPRAWFMT("pfn %lx success %x")
+	);
+
+TRACE_EVENT_FORMAT(mm_filemap_userunmap,
+	TPPROTO(struct mm_struct *mm, unsigned long address, unsigned long pfn),
+	TPARGS(mm, address, pfn),
+	TPFMT("mm=%lx address=%lx pfn=%lx", mm, address, pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(struct mm_struct *, mm, mm)
+		TRACE_FIELD(unsigned long, address, address)
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("mm %p address %lx pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_pgout,
+	TPPROTO(unsigned long pfn, int anon),
+	TPARGS(pfn, anon),
+	TPFMT("%s page: pfn=%lx", pfn, anon ? "anonymous" : "pagecache"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_free,
+	TPPROTO(unsigned long pfn, int anon),
+	TPARGS(pfn, anon),
+	TPFMT("%s page: pfn=%lx", pfn, anon ? "anonymous" : "pagecache"),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pdflush_bgwriteout,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pdflush_kupdate,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_page_allocation,
+	TPPROTO(unsigned long pfn, unsigned long free),
+	TPARGS(pfn, free),
+	TPFMT("pfn=%lx zone_free=%ld", pfn, free),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+		TRACE_FIELD(unsigned long, free, free)
+	),
+	TPRAWFMT("pfn %lx free %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_kswapd_runs,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_directreclaim_reclaimall,
+	TPPROTO(unsigned long priority),
+	TPARGS(priority),
+	TPFMT("priority=%lx", priority),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, priority, priority)
+	),
+	TPRAWFMT("priority %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_directreclaim_reclaimzone,
+	TPPROTO(unsigned long reclaimed),
+	TPARGS(reclaimed),
+	TPFMT("reclaimed=%lx", reclaimed),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, reclaimed, reclaimed)
+	),
+	TPRAWFMT("reclaimed %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkzone,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive_a2a,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkactive_a2i,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive,
+	TPPROTO(unsigned long count),
+	TPARGS(count),
+	TPFMT("count=%lx", count),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, count, count)
+	),
+	TPRAWFMT("count %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive_i2a,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_pagereclaim_shrinkinactive_i2i,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+
+TRACE_EVENT_FORMAT(mm_page_free,
+	TPPROTO(unsigned long pfn),
+	TPARGS(pfn),
+	TPFMT("pfn=%lx", pfn),
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, pfn, pfn)
+	),
+	TPRAWFMT("pfn %lx")
+	);
+#undef TRACE_SYSTEM
+
+#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
index 33c8ed5..865a108 100644
--- a/include/trace/trace_event_types.h
+++ b/include/trace/trace_event_types.h
@@ -2,3 +2,4 @@
 
 #include <trace/sched_event_types.h>
 #include <trace/irq_event_types.h>
+#include <trace/mm_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index ea2ef20..b234596 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -2,3 +2,4 @@
 
 #include <trace/sched.h>
 #include <trace/irq.h>
+#include <trace/mm.h>
diff --git a/mm/filemap.c b/mm/filemap.c
index 126d397..9c68755 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -34,6 +34,7 @@
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
 #include <linux/mm_inline.h> /* for page_is_file_cache() */
+#include <trace/mm.h>
 #include "internal.h"
 
 /*
@@ -1436,6 +1437,7 @@ static int page_cache_read(struct file *file, pgoff_t offset)
 
 #define MMAP_LOTSAMISS  (100)
 
+DEFINE_TRACE(mm_filemap_fault);
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:	vma in which the fault was taken
@@ -1547,6 +1549,8 @@ retry_find:
 	 */
 	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	vmf->page = page;
+	trace_mm_filemap_fault(vma->vm_mm, (unsigned long)vmf->virtual_address,
+			page_to_pfn(page), vmf->flags&FAULT_FLAG_NONLINEAR);
 	return ret | VM_FAULT_LOCKED;
 
 no_cached_page:
diff --git a/mm/memory.c b/mm/memory.c
index baa999e..6acc389 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -55,6 +55,7 @@
 #include <linux/kallsyms.h>
 #include <linux/swapops.h>
 #include <linux/elf.h>
+#include <trace/mm.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -758,6 +759,8 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return ret;
 }
 
+DEFINE_TRACE(mm_anon_userfree);
+DEFINE_TRACE(mm_filemap_userunmap);
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
 				struct vm_area_struct *vma, pmd_t *pmd,
 				unsigned long addr, unsigned long end,
@@ -812,15 +815,19 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 						addr) != page->index)
 				set_pte_at(mm, addr, pte,
 					   pgoff_to_pte(page->index));
-			if (PageAnon(page))
+			if (PageAnon(page)) {
 				anon_rss--;
-			else {
+				trace_mm_anon_userfree(mm, addr,
+							page_to_pfn(page));
+			} else {
 				if (pte_dirty(ptent))
 					set_page_dirty(page);
 				if (pte_young(ptent) &&
 				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
 				file_rss--;
+				trace_mm_filemap_userunmap(mm, addr,
+							page_to_pfn(page));
 			}
 			page_remove_rmap(page);
 			if (unlikely(page_mapcount(page) < 0))
@@ -1867,6 +1874,8 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
 		copy_user_highpage(dst, src, va, vma);
 }
 
+DEFINE_TRACE(mm_anon_cow);
+DEFINE_TRACE(mm_filemap_cow);
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -1889,7 +1898,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		spinlock_t *ptl, pte_t orig_pte)
 {
-	struct page *old_page, *new_page;
+	struct page *old_page, *new_page = NULL;
 	pte_t entry;
 	int reuse = 0, ret = 0;
 	int page_mkwrite = 0;
@@ -2019,9 +2028,14 @@ gotten:
 			if (!PageAnon(old_page)) {
 				dec_mm_counter(mm, file_rss);
 				inc_mm_counter(mm, anon_rss);
+				trace_mm_filemap_cow(mm, address,
+					page_to_pfn(new_page));
 			}
-		} else
+		} else {
 			inc_mm_counter(mm, anon_rss);
+			trace_mm_anon_cow(mm, address,
+					page_to_pfn(new_page));
+		}
 		flush_cache_page(vma, address, pte_pfn(orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2386,6 +2400,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
 	return 0;
 }
 
+DEFINE_TRACE(mm_anon_pgin);
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2499,6 +2514,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
+	trace_mm_anon_pgin(mm, address, page_to_pfn(page));
 	return ret;
 out_nomap:
 	mem_cgroup_cancel_charge_swapin(ptr);
@@ -2508,6 +2524,7 @@ out_nomap:
 	return ret;
 }
 
+DEFINE_TRACE(mm_anon_fault);
 /*
  * We enter with non-exclusive mmap_sem (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -2531,6 +2548,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto oom;
 	__SetPageUptodate(page);
 
+	trace_mm_anon_fault(mm, address, page_to_pfn(page));
 	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
 		goto oom_free_page;
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 74dc57c..7ebd33c 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -34,6 +34,7 @@
 #include <linux/syscalls.h>
 #include <linux/buffer_head.h>
 #include <linux/pagevec.h>
+#include <trace/mm.h>
 
 /*
  * The maximum number of pages to writeout in a single bdflush/kupdate
@@ -677,6 +678,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
         }
 }
 
+DEFINE_TRACE(mm_pdflush_bgwriteout);
 /*
  * writeback at least _min_pages, and keep writing until the amount of dirty
  * memory is less than the background threshold, or until we're all clean.
@@ -716,6 +718,7 @@ static void background_writeout(unsigned long _min_pages)
 				break;
 		}
 	}
+	trace_mm_pdflush_bgwriteout(_min_pages);
 }
 
 /*
@@ -737,6 +740,7 @@ static void laptop_timer_fn(unsigned long unused);
 static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0);
 static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
 
+DEFINE_TRACE(mm_pdflush_kupdate);
 /*
  * Periodic writeback of "old" data.
  *
@@ -776,6 +780,7 @@ static void wb_kupdate(unsigned long arg)
 	nr_to_write = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
+	trace_mm_pdflush_kupdate(nr_to_write);
 	while (nr_to_write > 0) {
 		wbc.more_io = 0;
 		wbc.encountered_congestion = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0d0633f..b088370 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -47,6 +47,7 @@
 #include <linux/page-isolation.h>
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -994,6 +995,7 @@ void mark_free_pages(struct zone *zone)
 }
 #endif /* CONFIG_PM */
 
+DEFINE_TRACE(mm_page_free);
 /*
  * Free a 0-order page
  */
@@ -1010,6 +1012,7 @@ static void free_hot_cold_page(struct page *page, int cold)
 	if (free_pages_check(page))
 		return;
 
+	trace_mm_page_free(page_to_pfn(page));
 	if (!PageHighMem(page)) {
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
 		debug_check_no_obj_freed(page_address(page), PAGE_SIZE);
@@ -1399,6 +1402,7 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z)
 }
 #endif	/* CONFIG_NUMA */
 
+DEFINE_TRACE(mm_page_allocation);
 /*
  * get_page_from_freelist goes through the zonelist trying to allocate
  * a page.
@@ -1453,8 +1457,11 @@ zonelist_scan:
 		}
 
 		page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask);
-		if (page)
+		if (page) {
+			trace_mm_page_allocation(page_to_pfn(page),
+					zone_page_state(zone, NR_FREE_PAGES));
 			break;
+		}
 this_zone_full:
 		if (NUMA_BUILD)
 			zlc_mark_zone_full(zonelist, z);
diff --git a/mm/rmap.c b/mm/rmap.c
index 1652166..39a4876 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -50,6 +50,7 @@
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 
@@ -978,6 +979,7 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
 	return mlocked;
 }
 
+DEFINE_TRACE(mm_anon_unmap);
 /**
  * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
  * rmap method
@@ -1034,9 +1036,11 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration)
 	else if (ret == SWAP_MLOCK)
 		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
 
+	trace_mm_anon_unmap(page_to_pfn(page), ret == SWAP_SUCCESS);
 	return ret;
 }
 
+DEFINE_TRACE(mm_filemap_unmap);
 /**
  * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
  * @page: the page to unmap/unlock
@@ -1170,6 +1174,7 @@ out:
 		ret = SWAP_MLOCK;	/* actually mlocked the page */
 	else if (ret == SWAP_MLOCK)
 		ret = SWAP_AGAIN;	/* saw VM_LOCKED vma */
+	trace_mm_filemap_unmap(page_to_pfn(page), ret == SWAP_SUCCESS);
 	return ret;
 }
 
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ae6f4c1..654d17f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -40,6 +40,7 @@
 #include <linux/memcontrol.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
+#include <trace/mm.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -329,6 +330,7 @@ typedef enum {
 	PAGE_CLEAN,
 } pageout_t;
 
+DEFINE_TRACE(mm_pagereclaim_pgout);
 /*
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
@@ -407,6 +409,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
 			ClearPageReclaim(page);
 		}
 		inc_zone_page_state(page, NR_VMSCAN_WRITE);
+		trace_mm_pagereclaim_pgout(page_to_pfn(page), PageAnon(page));
 		return PAGE_SUCCESS;
 	}
 
@@ -570,6 +573,9 @@ void putback_lru_page(struct page *page)
 #endif /* CONFIG_UNEVICTABLE_LRU */
 
 
+DEFINE_TRACE(mm_pagereclaim_free);
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive_i2a);
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive_i2i);
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -758,6 +764,7 @@ free_it:
 			__pagevec_free(&freed_pvec);
 			pagevec_reinit(&freed_pvec);
 		}
+		trace_mm_pagereclaim_free(page_to_pfn(page), PageAnon(page));
 		continue;
 
 cull_mlocked:
@@ -774,10 +781,12 @@ activate_locked:
 		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
 		pgactivate++;
+		trace_mm_pagereclaim_shrinkinactive_i2a(page_to_pfn(page));
 keep_locked:
 		unlock_page(page);
 keep:
 		list_add(&page->lru, &ret_pages);
+		trace_mm_pagereclaim_shrinkinactive_i2i(page_to_pfn(page));
 		VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
 	}
 	list_splice(&ret_pages, page_list);
@@ -1036,6 +1045,7 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
+DEFINE_TRACE(mm_pagereclaim_shrinkinactive);
 /*
  * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
  * of reclaimed pages
@@ -1170,6 +1180,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
 done:
 	local_irq_enable();
 	pagevec_release(&pvec);
+	trace_mm_pagereclaim_shrinkinactive(nr_reclaimed);
 	return nr_reclaimed;
 }
 
@@ -1187,6 +1198,9 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
 		zone->prev_priority = priority;
 }
 
+DEFINE_TRACE(mm_pagereclaim_shrinkactive);
+DEFINE_TRACE(mm_pagereclaim_shrinkactive_a2a);
+DEFINE_TRACE(mm_pagereclaim_shrinkactive_a2i);
 /*
  * This moves pages from the active list to the inactive list.
  *
@@ -1247,6 +1261,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 
 		if (unlikely(!page_evictable(page, NULL))) {
 			putback_lru_page(page);
+			trace_mm_pagereclaim_shrinkactive_a2a(page_to_pfn(page));
 			continue;
 		}
 
@@ -1256,6 +1271,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 			pgmoved++;
 
 		list_add(&page->lru, &l_inactive);
+		trace_mm_pagereclaim_shrinkactive_a2i(page_to_pfn(page));
 	}
 
 	/*
@@ -1310,6 +1326,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 		pagevec_swap_free(&pvec);
 
 	pagevec_release(&pvec);
+	trace_mm_pagereclaim_shrinkactive(pgscanned);
 }
 
 static int inactive_anon_is_low_global(struct zone *zone)
@@ -1450,6 +1467,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
 }
 
 
+DEFINE_TRACE(mm_pagereclaim_shrinkzone);
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
@@ -1510,6 +1528,7 @@ static void shrink_zone(int priority, struct zone *zone,
 	}
 
 	sc->nr_reclaimed = nr_reclaimed;
+	trace_mm_pagereclaim_shrinkzone(nr_reclaimed);
 
 	/*
 	 * Even if we did not try to evict anon pages at all, we want to
@@ -1521,6 +1540,7 @@ static void shrink_zone(int priority, struct zone *zone,
 	throttle_vm_writeout(sc->gfp_mask);
 }
 
+DEFINE_TRACE(mm_directreclaim_reclaimall);
 /*
  * This is the direct reclaim path, for page-allocating processes.  We only
  * try to reclaim pages from zones which will satisfy the caller's allocation
@@ -1569,6 +1589,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
 							priority);
 		}
 
+		trace_mm_directreclaim_reclaimall(priority);
 		shrink_zone(priority, zone, sc);
 	}
 }
@@ -1732,6 +1753,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 }
 #endif
 
+DEFINE_TRACE(mm_kswapd_runs);
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
@@ -1938,6 +1960,7 @@ out:
 		goto loop_again;
 	}
 
+	trace_mm_kswapd_runs(sc.nr_reclaimed);
 	return sc.nr_reclaimed;
 }
 
@@ -2278,6 +2301,7 @@ int sysctl_min_unmapped_ratio = 1;
  */
 int sysctl_min_slab_ratio = 5;
 
+DEFINE_TRACE(mm_directreclaim_reclaimzone);
 /*
  * Try to free up some pages from this zone through reclaim.
  */
@@ -2321,6 +2345,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		do {
 			note_zone_scanning_priority(zone, priority);
 			shrink_zone(priority, zone, &sc);
+			trace_mm_directreclaim_reclaimzone(priority);
 			priority--;
 		} while (priority >= 0 && sc.nr_reclaimed < nr_pages);
 	}
@@ -2352,6 +2377,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+	trace_mm_directreclaim_reclaimzone(sc.nr_reclaimed);
 	return sc.nr_reclaimed >= nr_pages;
 }
 

^ permalink raw reply related	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2009-03-23 17:39 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-03-16 19:52 [Patch] mm tracepoints Larry Woodman
2009-03-19 15:47 ` Rik van Riel
2009-03-19 15:55   ` Larry Woodman
2009-03-23 18:54   ` Larry Woodman
  -- strict thread matches above, loose matches on Subject: below --
2009-01-26 15:38 marching through all physical memory in software Chris Friesen
2009-01-26 15:59 ` Arjan van de Ven
2009-01-27 18:29   ` Chris Friesen
2009-01-27 20:16     ` Eric W. Biederman
2009-01-28 19:38       ` Pavel Machek
2009-01-30  9:05         ` Nigel Cunningham
2009-03-05 22:16           ` [Patch] mm tracepoints Larry Woodman
2009-03-06  2:11             ` KOSAKI Motohiro
2009-03-06  2:26               ` Steven Rostedt
2009-03-06 11:04                 ` Ingo Molnar
2009-03-06 12:33                   ` Larry Woodman
2009-03-06 13:55                     ` Ingo Molnar
2009-03-06 16:57                       ` Larry Woodman
2009-03-06 17:10                         ` Ingo Molnar
2009-03-06 17:38                           ` Peter Zijlstra
2009-03-06 17:46                             ` Ingo Molnar
2009-03-06 17:56                             ` Peter Zijlstra
2009-03-06 18:01                               ` Ingo Molnar
2009-03-06 18:20                                 ` Peter Zijlstra
2009-03-06 18:24                                   ` Ingo Molnar
2009-03-06 20:01                                 ` Larry Woodman
2009-03-06 19:06                             ` Larry Woodman
2009-03-06 21:53                             ` Chris Friesen
2009-03-06 19:22                           ` Larry Woodman
2009-03-06 21:16             ` Andrew Morton
2009-03-06 21:16               ` Andrew Morton

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.