All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-18  3:23 ` Aubrey Li
  0 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-18  3:23 UTC (permalink / raw)
  To: linux-kernel, linux-mm
  Cc: Linus Torvalds, Andrew Morton, Nick Piggin,
	linux-os (Dick Johnson),
	Robin Getz

[-- Attachment #1: Type: text/plain, Size: 4731 bytes --]

Here is the newest patch against 2.6.20-rc5.
======================================================
From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001
From: Aubrey.Li <aubreylee@gmail.com>
Date: Thu, 18 Jan 2007 11:08:31 +0800
Subject: [PATCH] Add an interface to limit total vfs page cache.
By default, up to 90% of memory may be used for the page cache.

Signed-off-by: Aubrey.Li <aubreylee@gmail.com>
---
 include/linux/gfp.h     |    1 +
 include/linux/pagemap.h |    2 +-
 include/linux/sysctl.h  |    2 ++
 kernel/sysctl.c         |   11 +++++++++++
 mm/page_alloc.c         |   17 +++++++++++++++--
 5 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 00c314a..531360e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -46,6 +46,7 @@ struct vm_area_struct;
 #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use
emergency reserves */
 #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce
hardwall cpuset memory allocs */
 #define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
+#define __GFP_PAGECACHE	((__force gfp_t)0x80000u) /* Is page cache
allocation ? */

 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c3e255b..890bb23 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -62,7 +62,7 @@ static inline struct page *__page_cache_

 static inline struct page *page_cache_alloc(struct address_space *x)
 {
-	return __page_cache_alloc(mapping_gfp_mask(x));
+	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE);
 }

 static inline struct page *page_cache_alloc_cold(struct address_space *x)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 81480e6..d3c9174 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -202,6 +202,7 @@ enum
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
+	VM_PAGECACHE_RATIO=36,	/* percent of RAM to use as page cache */
 };


@@ -955,6 +956,7 @@ extern ctl_handler sysctl_string;
 extern ctl_handler sysctl_intvec;
 extern ctl_handler sysctl_jiffies;
 extern ctl_handler sysctl_ms_jiffies;
+extern int sysctl_pagecache_ratio;


 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 600b333..92db115 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1035,6 +1035,17 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 #endif
+	{
+		.ctl_name	= VM_PAGECACHE_RATIO,
+		.procname	= "pagecache_ratio",
+		.data		= &sysctl_pagecache_ratio,
+		.maxlen		= sizeof(sysctl_pagecache_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1         = &zero,
+                .extra2         = &one_hundred,
+	},
 	{ .ctl_name = 0 }
 };

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fc5b544..5802b39 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -82,6 +82,8 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z
 #endif
 };

+int sysctl_pagecache_ratio = 10;
+
 EXPORT_SYMBOL(totalram_pages);

 static char * const zone_names[MAX_NR_ZONES] = {
@@ -895,6 +897,7 @@ failed:
 #define ALLOC_HARDER		0x10 /* try to alloc harder */
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
+#define ALLOC_PAGECACHE		0x80 /* __GFP_PAGECACHE set */

 #ifdef CONFIG_FAIL_PAGE_ALLOC

@@ -998,6 +1001,9 @@ int zone_watermark_ok(struct zone *z, in
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;

+	if (alloc_flags & ALLOC_PAGECACHE)
+		min = min + (sysctl_pagecache_ratio * z->present_pages) / 100;
+
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
 		return 0;
 	for (o = 0; o < order; o++) {
@@ -1236,8 +1242,12 @@ restart:
 		return NULL;
 	}

-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	if (gfp_mask & __GFP_PAGECACHE)	
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+			zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_PAGECACHE);
+	else
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+					zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
 	if (page)
 		goto got_pg;

@@ -1273,6 +1283,9 @@ restart:
 	if (wait)
 		alloc_flags |= ALLOC_CPUSET;

+	if (gfp_mask & __GFP_PAGECACHE)
+		alloc_flags |= ALLOC_PAGECACHE;
+
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
 	 * coming from realtime tasks go deeper into reserves.
-- 
1.4.3.4
=====================================================

[-- Attachment #2: 0001-Add-an-interface-to-limit-total-vfs-page-cache.txt --]
[-- Type: text/plain, Size: 4593 bytes --]

From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001
From: Aubrey.Li <aubreylee@gmail.com>
Date: Thu, 18 Jan 2007 11:08:31 +0800
Subject: [PATCH] Add an interface to limit total vfs page cache.
By default, up to 90% of memory may be used for the page cache.

Signed-off-by: Aubrey.Li <aubreylee@gmail.com>
---
 include/linux/gfp.h     |    1 +
 include/linux/pagemap.h |    2 +-
 include/linux/sysctl.h  |    2 ++
 kernel/sysctl.c         |   11 +++++++++++
 mm/page_alloc.c         |   17 +++++++++++++++--
 5 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 00c314a..531360e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -46,6 +46,7 @@ struct vm_area_struct;
 #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use emergency reserves */
 #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce hardwall cpuset memory allocs */
 #define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
+#define __GFP_PAGECACHE	((__force gfp_t)0x80000u) /* Is page cache allocation ? */
 
 #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index c3e255b..890bb23 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -62,7 +62,7 @@ static inline struct page *__page_cache_
 
 static inline struct page *page_cache_alloc(struct address_space *x)
 {
-	return __page_cache_alloc(mapping_gfp_mask(x));
+	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE);
 }
 
 static inline struct page *page_cache_alloc_cold(struct address_space *x)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 81480e6..d3c9174 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -202,6 +202,7 @@ enum
 	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
 	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
 	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
+	VM_PAGECACHE_RATIO=36,	/* percent of RAM to use as page cache */
 };
 
 
@@ -955,6 +956,7 @@ extern ctl_handler sysctl_string;
 extern ctl_handler sysctl_intvec;
 extern ctl_handler sysctl_jiffies;
 extern ctl_handler sysctl_ms_jiffies;
+extern int sysctl_pagecache_ratio;
 
 
 /*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 600b333..92db115 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1035,6 +1035,17 @@ static ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 #endif
+	{
+		.ctl_name	= VM_PAGECACHE_RATIO,
+		.procname	= "pagecache_ratio",
+		.data		= &sysctl_pagecache_ratio,
+		.maxlen		= sizeof(sysctl_pagecache_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1         = &zero,
+                .extra2         = &one_hundred,
+	},
 	{ .ctl_name = 0 }
 };
 
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fc5b544..5802b39 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -82,6 +82,8 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z
 #endif
 };
 
+int sysctl_pagecache_ratio = 10;
+
 EXPORT_SYMBOL(totalram_pages);
 
 static char * const zone_names[MAX_NR_ZONES] = {
@@ -895,6 +897,7 @@ failed:
 #define ALLOC_HARDER		0x10 /* try to alloc harder */
 #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
 #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
+#define ALLOC_PAGECACHE		0x80 /* __GFP_PAGECACHE set */
 
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 
@@ -998,6 +1001,9 @@ int zone_watermark_ok(struct zone *z, in
 	if (alloc_flags & ALLOC_HARDER)
 		min -= min / 4;
 
+	if (alloc_flags & ALLOC_PAGECACHE)
+		min = min + (sysctl_pagecache_ratio * z->present_pages) / 100;
+
 	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
 		return 0;
 	for (o = 0; o < order; o++) {
@@ -1236,8 +1242,12 @@ restart:
 		return NULL;
 	}
 
-	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
-				zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
+	if (gfp_mask & __GFP_PAGECACHE)	
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+			zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_PAGECACHE);
+	else
+		page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order,
+					zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET);
 	if (page)
 		goto got_pg;
 
@@ -1273,6 +1283,9 @@ restart:
 	if (wait)
 		alloc_flags |= ALLOC_CPUSET;
 
+	if (gfp_mask & __GFP_PAGECACHE)
+		alloc_flags |= ALLOC_PAGECACHE;
+
 	/*
 	 * Go through the zonelist again. Let __GFP_HIGH and allocations
 	 * coming from realtime tasks go deeper into reserves.
-- 
1.4.3.4


^ permalink raw reply related	[flat|nested] 40+ messages in thread

* [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-18  3:23 ` Aubrey Li
  0 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-18  3:23 UTC (permalink / raw)
  To: linux-kernel, linux-mm
  Cc: Linus Torvalds, Andrew Morton, Nick Piggin,
	linux-os (Dick Johnson),
	Robin Getz

[-- Attachment #1: Type: text/plain, Size: 99 bytes --]

Here is the newest patch against 2.6.20-rc5.
======================================================

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-18  3:23 ` Aubrey Li
@ 2007-01-19 14:44   ` Vaidyanathan Srinivasan
  -1 siblings, 0 replies; 40+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-01-19 14:44 UTC (permalink / raw)
  To: Aubrey Li
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz


Aubrey Li wrote:
> Here is the newest patch against 2.6.20-rc5.
> ======================================================
> From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001
> From: Aubrey.Li <aubreylee@gmail.com>
> Date: Thu, 18 Jan 2007 11:08:31 +0800
> Subject: [PATCH] Add an interface to limit total vfs page cache.
> The default percent is using 90% memory for page cache.

Hi Aubrey,

I used your patch on my PPC64 box and I do not get expected
behavior.  As you had requested, I am attaching zoneinfo and meminfo
dumps:

# cat  /proc/sys/vm/pagecache_ratio
50
# cat /proc/meminfo
MemTotal:      1014600 kB << 1GB Ram
MemFree:        960336 kB << Expect to see around 500MB free after
Buffers:          8348 kB       issue of DD command
Cached:           8624 kB
SwapCached:          8 kB
Active:          20908 kB
Inactive:         5680 kB
SwapTotal:     1526164 kB
SwapFree:      1526088 kB
Dirty:             116 kB
Writeback:           0 kB
AnonPages:        9544 kB
Mapped:           7736 kB
Slab:            18920 kB
SReclaimable:     5792 kB
SUnreclaim:      13128 kB
PageTables:        972 kB
NFS_Unstable:        0 kB
Bounce:              0 kB
CommitLimit:   2033464 kB
Committed_AS:    46652 kB
VmallocTotal: 8589934592 kB
VmallocUsed:      2440 kB
VmallocChunk: 8589932152 kB
HugePages_Total:     0
HugePages_Free:      0
HugePages_Rsvd:      0
Hugepagesize:    16384 kB

# cat /proc/zoneinfo
Node 0, zone      DMA
  pages free     130474
        min      571
        low      713
        high     856
        active   5010
        inactive 775
        scanned  0 (a: 24 i: 0)
        spanned  147456
        present  145440
    nr_anon_pages 2383
    nr_mapped    1932
    nr_file_pages 3389
    nr_slab_reclaimable 1094
    nr_slab_unreclaimable 1819
    nr_page_table_pages 243
    nr_dirty     4
    nr_writeback 0
    nr_unstable  0
    nr_bounce    0
    nr_vmscan_write 34
    numa_hit     1428389
    numa_miss    0
    numa_foreign 1048457
    numa_interleave 1511
    numa_local   1428389
    numa_other   0
        protection: (0, 0)
  pagesets
    cpu: 0 pcp: 0
              count: 77
              high:  186
              batch: 31
    cpu: 0 pcp: 1
              count: 3
              high:  62
              batch: 15
  vm stats threshold: 16
    cpu: 1 pcp: 0
              count: 171
              high:  186
              batch: 31
    cpu: 1 pcp: 1
              count: 11
              high:  62
              batch: 15
  vm stats threshold: 16
  all_unreclaimable: 0
  prev_priority:     12
  start_pfn:         0
Node 1, zone      DMA
  pages free     109610
        min      444
        low      555
        high     666
        active   217
        inactive 655
        scanned  0 (a: 21 i: 0)
        spanned  114688
        present  113120
    nr_anon_pages 3
    nr_mapped    2
    nr_file_pages 869
    nr_slab_reclaimable 354
    nr_slab_unreclaimable 1454
    nr_page_table_pages 0
    nr_dirty     0
    nr_writeback 0
    nr_unstable  0
    nr_bounce    0
    nr_vmscan_write 0
    numa_hit     2220
    numa_miss    1048457
    numa_foreign 0
    numa_interleave 1519
    numa_local   0
    numa_other   1050677
        protection: (0, 0)
  pagesets
  all_unreclaimable: 0
  prev_priority:     12
  start_pfn:         147456

The test: Write 1GB file in /tmp

 # dd if=/dev/zero of=/tmp/foo bs=1M count=1024
1024+0 records in
1024+0 records out
1073741824 bytes (1.1 GB) copied, 15.2301 seconds, 70.5 MB/s

Expect around 500MB to be retained as free after the run?

# cat /proc/meminfo
MemTotal:      1014600 kB
MemFree:         14080 kB  <<<
Buffers:         11164 kB
Cached:         924536 kB  <<< Almost all memory is consumed by
SwapCached:          8 kB         pagecache
Active:          27500 kB
Inactive:       917740 kB
SwapTotal:     1526164 kB
SwapFree:      1526088 kB
Dirty:          100528 kB
Writeback:           0 kB
AnonPages:        9544 kB
Mapped:           7736 kB
Slab:            45264 kB
SReclaimable:    29652 kB
SUnreclaim:      15612 kB
PageTables:        972 kB
NFS_Unstable:        0 kB
Bounce:              0 kB
CommitLimit:   2033464 kB
Committed_AS:    47732 kB
VmallocTotal: 8589934592 kB
VmallocUsed:      2440 kB
VmallocChunk: 8589932152 kB
HugePages_Total:     0
HugePages_Free:      0
HugePages_Rsvd:      0
Hugepagesize:    16384 kB

# cat /proc/zoneinfo
Node 0, zone      DMA
  pages free     2063
        min      571
        low      713
        high     856
        active   6028
        inactive 124552
        scanned  0 (a: 5 i: 0)
        spanned  147456
        present  145440
    nr_anon_pages 2384
    nr_mapped    1932
    nr_file_pages 128191
    nr_slab_reclaimable 4312
    nr_slab_unreclaimable 2102
    nr_page_table_pages 243
    nr_dirty     13724
    nr_writeback 0
    nr_unstable  0
    nr_bounce    0
    nr_vmscan_write 34
    numa_hit     1577905
    numa_miss    0
    numa_foreign 1173147
    numa_interleave 1511
    numa_local   1577905
    numa_other   0
        protection: (0, 0)
  pagesets
    cpu: 0 pcp: 0
              count: 147
              high:  186
              batch: 31
    cpu: 0 pcp: 1
              count: 7
              high:  62
              batch: 15
  vm stats threshold: 16
    cpu: 1 pcp: 0
              count: 160
              high:  186
              batch: 31
    cpu: 1 pcp: 1
              count: 52
              high:  62
              batch: 15
  vm stats threshold: 16
  all_unreclaimable: 0
  prev_priority:     12
  start_pfn:         0
Node 1, zone      DMA
  pages free     1766
        min      444
        low      555
        high     666
        active   847
        inactive 104893
        scanned  0 (a: 27 i: 0)
        spanned  114688
        present  113120
    nr_anon_pages 2
    nr_mapped    2
    nr_file_pages 105739
    nr_slab_reclaimable 3082
    nr_slab_unreclaimable 1658
    nr_page_table_pages 0
    nr_dirty     11419
    nr_writeback 0
    nr_unstable  0
    nr_bounce    0
    nr_vmscan_write 0
    numa_hit     2220
    numa_miss    1173147
    numa_foreign 0
    numa_interleave 1519
    numa_local   0
    numa_other   1175367
        protection: (0, 0)
  pagesets
    cpu: 0 pcp: 0
              count: 1
              high:  186
              batch: 31
    cpu: 0 pcp: 1
              count: 0
              high:  62
              batch: 15
  vm stats threshold: 12
    cpu: 1 pcp: 0
              count: 35
              high:  186
              batch: 31
    cpu: 1 pcp: 1
              count: 0
              high:  62
              batch: 15
  vm stats threshold: 12
  all_unreclaimable: 0
  prev_priority:     12
  start_pfn:         147456



[snip]

Please let me know if you need any further data to help me out with
the test/experiment.

--Vaidy


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-19 14:44   ` Vaidyanathan Srinivasan
  0 siblings, 0 replies; 40+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-01-19 14:44 UTC (permalink / raw)
  To: Aubrey Li
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

Aubrey Li wrote:
> Here is the newest patch against 2.6.20-rc5.
> ======================================================
> From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001
> From: Aubrey.Li <aubreylee@gmail.com>
> Date: Thu, 18 Jan 2007 11:08:31 +0800
> Subject: [PATCH] Add an interface to limit total vfs page cache.
> The default percent is using 90% memory for page cache.

Hi Aubrey,

I used your patch on my PPC64 box and I do not get expected
behavior.  As you had requested, I am attaching zoneinfo and meminfo
dumps:

# cat  /proc/sys/vm/pagecache_ratio
50
# cat /proc/meminfo
MemTotal:      1014600 kB << 1GB Ram
MemFree:        960336 kB << Expect to see around 500MB free after
Buffers:          8348 kB       issue of DD command
Cached:           8624 kB
SwapCached:          8 kB
Active:          20908 kB
Inactive:         5680 kB
SwapTotal:     1526164 kB
SwapFree:      1526088 kB
Dirty:             116 kB
Writeback:           0 kB
AnonPages:        9544 kB
Mapped:           7736 kB
Slab:            18920 kB
SReclaimable:     5792 kB
SUnreclaim:      13128 kB
PageTables:        972 kB
NFS_Unstable:        0 kB
Bounce:              0 kB
CommitLimit:   2033464 kB
Committed_AS:    46652 kB
VmallocTotal: 8589934592 kB
VmallocUsed:      2440 kB
VmallocChunk: 8589932152 kB
HugePages_Total:     0
HugePages_Free:      0
HugePages_Rsvd:      0
Hugepagesize:    16384 kB

# cat /proc/zoneinfo
Node 0, zone      DMA
  pages free     130474
        min      571
        low      713
        high     856
        active   5010
        inactive 775
        scanned  0 (a: 24 i: 0)
        spanned  147456
        present  145440
    nr_anon_pages 2383
    nr_mapped    1932
    nr_file_pages 3389
    nr_slab_reclaimable 1094
    nr_slab_unreclaimable 1819
    nr_page_table_pages 243
    nr_dirty     4
    nr_writeback 0
    nr_unstable  0
    nr_bounce    0
    nr_vmscan_write 34
    numa_hit     1428389
    numa_miss    0
    numa_foreign 1048457
    numa_interleave 1511
    numa_local   1428389
    numa_other   0
        protection: (0, 0)
  pagesets
    cpu: 0 pcp: 0
              count: 77
              high:  186
              batch: 31
    cpu: 0 pcp: 1
              count: 3
              high:  62
              batch: 15
  vm stats threshold: 16
    cpu: 1 pcp: 0
              count: 171
              high:  186
              batch: 31
    cpu: 1 pcp: 1
              count: 11
              high:  62
              batch: 15
  vm stats threshold: 16
  all_unreclaimable: 0
  prev_priority:     12
  start_pfn:         0
Node 1, zone      DMA
  pages free     109610
        min      444
        low      555
        high     666
        active   217
        inactive 655
        scanned  0 (a: 21 i: 0)
        spanned  114688
        present  113120
    nr_anon_pages 3
    nr_mapped    2
    nr_file_pages 869
    nr_slab_reclaimable 354
    nr_slab_unreclaimable 1454
    nr_page_table_pages 0
    nr_dirty     0
    nr_writeback 0
    nr_unstable  0
    nr_bounce    0
    nr_vmscan_write 0
    numa_hit     2220
    numa_miss    1048457
    numa_foreign 0
    numa_interleave 1519
    numa_local   0
    numa_other   1050677
        protection: (0, 0)
  pagesets
  all_unreclaimable: 0
  prev_priority:     12
  start_pfn:         147456

The test: Write 1GB file in /tmp

 # dd if=/dev/zero of=/tmp/foo bs=1M count=1024
1024+0 records in
1024+0 records out
1073741824 bytes (1.1 GB) copied, 15.2301 seconds, 70.5 MB/s

Expect around 500MB to be retained as free after the run?

# cat /proc/meminfo
MemTotal:      1014600 kB
MemFree:         14080 kB  <<<
Buffers:         11164 kB
Cached:         924536 kB  <<< Almost all memory is consumed by
SwapCached:          8 kB         pagecache
Active:          27500 kB
Inactive:       917740 kB
SwapTotal:     1526164 kB
SwapFree:      1526088 kB
Dirty:          100528 kB
Writeback:           0 kB
AnonPages:        9544 kB
Mapped:           7736 kB
Slab:            45264 kB
SReclaimable:    29652 kB
SUnreclaim:      15612 kB
PageTables:        972 kB
NFS_Unstable:        0 kB
Bounce:              0 kB
CommitLimit:   2033464 kB
Committed_AS:    47732 kB
VmallocTotal: 8589934592 kB
VmallocUsed:      2440 kB
VmallocChunk: 8589932152 kB
HugePages_Total:     0
HugePages_Free:      0
HugePages_Rsvd:      0
Hugepagesize:    16384 kB

# cat /proc/zoneinfo
Node 0, zone      DMA
  pages free     2063
        min      571
        low      713
        high     856
        active   6028
        inactive 124552
        scanned  0 (a: 5 i: 0)
        spanned  147456
        present  145440
    nr_anon_pages 2384
    nr_mapped    1932
    nr_file_pages 128191
    nr_slab_reclaimable 4312
    nr_slab_unreclaimable 2102
    nr_page_table_pages 243
    nr_dirty     13724
    nr_writeback 0
    nr_unstable  0
    nr_bounce    0
    nr_vmscan_write 34
    numa_hit     1577905
    numa_miss    0
    numa_foreign 1173147
    numa_interleave 1511
    numa_local   1577905
    numa_other   0
        protection: (0, 0)
  pagesets
    cpu: 0 pcp: 0
              count: 147
              high:  186
              batch: 31
    cpu: 0 pcp: 1
              count: 7
              high:  62
              batch: 15
  vm stats threshold: 16
    cpu: 1 pcp: 0
              count: 160
              high:  186
              batch: 31
    cpu: 1 pcp: 1
              count: 52
              high:  62
              batch: 15
  vm stats threshold: 16
  all_unreclaimable: 0
  prev_priority:     12
  start_pfn:         0
Node 1, zone      DMA
  pages free     1766
        min      444
        low      555
        high     666
        active   847
        inactive 104893
        scanned  0 (a: 27 i: 0)
        spanned  114688
        present  113120
    nr_anon_pages 2
    nr_mapped    2
    nr_file_pages 105739
    nr_slab_reclaimable 3082
    nr_slab_unreclaimable 1658
    nr_page_table_pages 0
    nr_dirty     11419
    nr_writeback 0
    nr_unstable  0
    nr_bounce    0
    nr_vmscan_write 0
    numa_hit     2220
    numa_miss    1173147
    numa_foreign 0
    numa_interleave 1519
    numa_local   0
    numa_other   1175367
        protection: (0, 0)
  pagesets
    cpu: 0 pcp: 0
              count: 1
              high:  186
              batch: 31
    cpu: 0 pcp: 1
              count: 0
              high:  62
              batch: 15
  vm stats threshold: 12
    cpu: 1 pcp: 0
              count: 35
              high:  186
              batch: 31
    cpu: 1 pcp: 1
              count: 0
              high:  62
              batch: 15
  vm stats threshold: 12
  all_unreclaimable: 0
  prev_priority:     12
  start_pfn:         147456



[snip]

Please let me know if you need any further data to help me out with
the test/experiment.

--Vaidy

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href="mailto:dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-18  3:23 ` Aubrey Li
@ 2007-01-19 14:52   ` Vaidyanathan Srinivasan
  -1 siblings, 0 replies; 40+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-01-19 14:52 UTC (permalink / raw)
  To: Aubrey Li
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz



Aubrey Li wrote:
> Here is the newest patch against 2.6.20-rc5.
> ======================================================
> From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001
> From: Aubrey.Li <aubreylee@gmail.com>
> Date: Thu, 18 Jan 2007 11:08:31 +0800
> Subject: [PATCH] Add an interface to limit total vfs page cache.
> The default percent is using 90% memory for page cache.
> 
> Signed-off-by: Aubrey.Li <aubreylee@gmail.com>
> ---
>  include/linux/gfp.h     |    1 +
>  include/linux/pagemap.h |    2 +-
>  include/linux/sysctl.h  |    2 ++
>  kernel/sysctl.c         |   11 +++++++++++
>  mm/page_alloc.c         |   17 +++++++++++++++--
>  5 files changed, 30 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 00c314a..531360e 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -46,6 +46,7 @@ struct vm_area_struct;
>  #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use
> emergency reserves */
>  #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce
> hardwall cpuset memory allocs */
>  #define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
> +#define __GFP_PAGECACHE	((__force gfp_t)0x80000u) /* Is page cache
> allocation ? */
> 
>  #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
>  #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index c3e255b..890bb23 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -62,7 +62,7 @@ static inline struct page *__page_cache_
> 
>  static inline struct page *page_cache_alloc(struct address_space *x)
>  {
> -	return __page_cache_alloc(mapping_gfp_mask(x));
> +	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE);
>  }
> 
>  static inline struct page *page_cache_alloc_cold(struct address_space *x)
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 81480e6..d3c9174 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -202,6 +202,7 @@ enum
>  	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
>  	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
>  	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
> +	VM_PAGECACHE_RATIO=36,	/* percent of RAM to use as page cache */
>  };
> 
> 
> @@ -955,6 +956,7 @@ extern ctl_handler sysctl_string;
>  extern ctl_handler sysctl_intvec;
>  extern ctl_handler sysctl_jiffies;
>  extern ctl_handler sysctl_ms_jiffies;
> +extern int sysctl_pagecache_ratio;
> 
> 
>  /*
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 600b333..92db115 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1035,6 +1035,17 @@ static ctl_table vm_table[] = {
>  		.extra1		= &zero,
>  	},
>  #endif
> +	{
> +		.ctl_name	= VM_PAGECACHE_RATIO,
> +		.procname	= "pagecache_ratio",
> +		.data		= &sysctl_pagecache_ratio,
> +		.maxlen		= sizeof(sysctl_pagecache_ratio),
> +		.mode		= 0644,
> +		.proc_handler	= &proc_dointvec_minmax,
> +		.strategy	= &sysctl_intvec,
> +		.extra1         = &zero,
> +                .extra2         = &one_hundred,
> +	},
>  	{ .ctl_name = 0 }
>  };
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index fc5b544..5802b39 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -82,6 +82,8 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z
>  #endif
>  };
> 
> +int sysctl_pagecache_ratio = 10;
> +
>  EXPORT_SYMBOL(totalram_pages);
> 
>  static char * const zone_names[MAX_NR_ZONES] = {
> @@ -895,6 +897,7 @@ failed:
>  #define ALLOC_HARDER		0x10 /* try to alloc harder */
>  #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
>  #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
> +#define ALLOC_PAGECACHE		0x80 /* __GFP_PAGECACHE set */
> 
>  #ifdef CONFIG_FAIL_PAGE_ALLOC
> 
> @@ -998,6 +1001,9 @@ int zone_watermark_ok(struct zone *z, in
>  	if (alloc_flags & ALLOC_HARDER)
>  		min -= min / 4;
> 
> +	if (alloc_flags & ALLOC_PAGECACHE)
> +		min = min + (sysctl_pagecache_ratio * z->present_pages) / 100;
> +
>  	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
>  		return 0;

Hi Aubrey,

The idea of creating separate flag for pagecache in page_alloc is
interesting.  The good part is that you flag watermark low and the
zone reclaimer will do the rest of the job.

However when the zone reclaimer starts to reclaim pages, it will
remove all cold pages and not specifically pagecache pages.  This
may affect performance of applications.

One possible solution to this reclaim is to use scan control fields
and ask the shrink_page_list() and shrink_active_list() routines to
target only pagecache pages.  Pagecache pages are not mapped and
they are easy to find on the LRU list.

Please review my patch at http://lkml.org/lkml/2007/01/17/96

--Vaidy

[snip]


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-19 14:52   ` Vaidyanathan Srinivasan
  0 siblings, 0 replies; 40+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-01-19 14:52 UTC (permalink / raw)
  To: Aubrey Li
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz


Aubrey Li wrote:
> Here is the newest patch against 2.6.20-rc5.
> ======================================================
> From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001
> From: Aubrey.Li <aubreylee@gmail.com>
> Date: Thu, 18 Jan 2007 11:08:31 +0800
> Subject: [PATCH] Add an interface to limit total vfs page cache.
> The default percent is using 90% memory for page cache.
> 
> Signed-off-by: Aubrey.Li <aubreylee@gmail.com>
> ---
>  include/linux/gfp.h     |    1 +
>  include/linux/pagemap.h |    2 +-
>  include/linux/sysctl.h  |    2 ++
>  kernel/sysctl.c         |   11 +++++++++++
>  mm/page_alloc.c         |   17 +++++++++++++++--
>  5 files changed, 30 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/gfp.h b/include/linux/gfp.h
> index 00c314a..531360e 100644
> --- a/include/linux/gfp.h
> +++ b/include/linux/gfp.h
> @@ -46,6 +46,7 @@ struct vm_area_struct;
>  #define __GFP_NOMEMALLOC ((__force gfp_t)0x10000u) /* Don't use
> emergency reserves */
>  #define __GFP_HARDWALL   ((__force gfp_t)0x20000u) /* Enforce
> hardwall cpuset memory allocs */
>  #define __GFP_THISNODE	((__force gfp_t)0x40000u)/* No fallback, no policies */
> +#define __GFP_PAGECACHE	((__force gfp_t)0x80000u) /* Is page cache
> allocation ? */
> 
>  #define __GFP_BITS_SHIFT 20	/* Room for 20 __GFP_FOO bits */
>  #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
> diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
> index c3e255b..890bb23 100644
> --- a/include/linux/pagemap.h
> +++ b/include/linux/pagemap.h
> @@ -62,7 +62,7 @@ static inline struct page *__page_cache_
> 
>  static inline struct page *page_cache_alloc(struct address_space *x)
>  {
> -	return __page_cache_alloc(mapping_gfp_mask(x));
> +	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE);
>  }
> 
>  static inline struct page *page_cache_alloc_cold(struct address_space *x)
> diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
> index 81480e6..d3c9174 100644
> --- a/include/linux/sysctl.h
> +++ b/include/linux/sysctl.h
> @@ -202,6 +202,7 @@ enum
>  	VM_PANIC_ON_OOM=33,	/* panic at out-of-memory */
>  	VM_VDSO_ENABLED=34,	/* map VDSO into new processes? */
>  	VM_MIN_SLAB=35,		 /* Percent pages ignored by zone reclaim */
> +	VM_PAGECACHE_RATIO=36,	/* percent of RAM to use as page cache */
>  };
> 
> 
> @@ -955,6 +956,7 @@ extern ctl_handler sysctl_string;
>  extern ctl_handler sysctl_intvec;
>  extern ctl_handler sysctl_jiffies;
>  extern ctl_handler sysctl_ms_jiffies;
> +extern int sysctl_pagecache_ratio;
> 
> 
>  /*
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index 600b333..92db115 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -1035,6 +1035,17 @@ static ctl_table vm_table[] = {
>  		.extra1		= &zero,
>  	},
>  #endif
> +	{
> +		.ctl_name	= VM_PAGECACHE_RATIO,
> +		.procname	= "pagecache_ratio",
> +		.data		= &sysctl_pagecache_ratio,
> +		.maxlen		= sizeof(sysctl_pagecache_ratio),
> +		.mode		= 0644,
> +		.proc_handler	= &proc_dointvec_minmax,
> +		.strategy	= &sysctl_intvec,
> +		.extra1         = &zero,
> +                .extra2         = &one_hundred,
> +	},
>  	{ .ctl_name = 0 }
>  };
> 
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index fc5b544..5802b39 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -82,6 +82,8 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z
>  #endif
>  };
> 
> +int sysctl_pagecache_ratio = 10;
> +
>  EXPORT_SYMBOL(totalram_pages);
> 
>  static char * const zone_names[MAX_NR_ZONES] = {
> @@ -895,6 +897,7 @@ failed:
>  #define ALLOC_HARDER		0x10 /* try to alloc harder */
>  #define ALLOC_HIGH		0x20 /* __GFP_HIGH set */
>  #define ALLOC_CPUSET		0x40 /* check for correct cpuset */
> +#define ALLOC_PAGECACHE		0x80 /* __GFP_PAGECACHE set */
> 
>  #ifdef CONFIG_FAIL_PAGE_ALLOC
> 
> @@ -998,6 +1001,9 @@ int zone_watermark_ok(struct zone *z, in
>  	if (alloc_flags & ALLOC_HARDER)
>  		min -= min / 4;
> 
> +	if (alloc_flags & ALLOC_PAGECACHE)
> +		min = min + (sysctl_pagecache_ratio * z->present_pages) / 100;
> +
>  	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
>  		return 0;

Hi Aubrey,

The idea of creating separate flag for pagecache in page_alloc is
interesting.  The good part is that you flag watermark low and the
zone reclaimer will do the rest of the job.

However when the zone reclaimer starts to reclaim pages, it will
remove all cold pages and not specifically pagecache pages.  This
may affect performance of applications.

One possible solution to this reclaim is to use scan control fields
and ask the shrink_page_list() and shrink_active_list() routines to
target only pagecache pages.  Pagecache pages are not mapped and
they are easy to find on the LRU list.

Please review my patch at http://lkml.org/lkml/2007/01/17/96

--Vaidy

[snip]

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-19 14:44   ` Vaidyanathan Srinivasan
@ 2007-01-19 15:40     ` Aubrey Li
  -1 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-19 15:40 UTC (permalink / raw)
  To: Vaidyanathan Srinivasan
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>
> Hi Aubrey,
>
> I used your patch on my PPC64 box and I do not get expected
> behavior.  As you had requested, I am attaching zoneinfo and meminfo
> dumps:
>
> Please let me know if you need any further data to help me out with
> the test/experiment.
>

Although I have no PPC64 box in hand, I think the logic should be the same.
get_page_from_freelist() is called 5 times in __alloc_pages().

1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE;
2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE;
We should have the same result on the first two times get_page_from_freelist().

3) if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
			&& !in_interrupt())
   alloc_flags = ALLOC_NO_WATERMARKS
The case on my platform will never enter this branch. If the branch
occurs on your side,
the limit will be omitted. Because there is NO watermark, zone_watermark_ok()
will not be checked; memory will be allocated directly.

4)if (likely(did_some_progress)) {
   alloc_flags should include ALLOC_PAGECACHE.
So we should have the same result on this call.

5)	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
   alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE

This branch will not hit on my case. You may need to check it.

If 3) or 5) occurs on your platform, I think you can easily fix it.
Please confirm it and let me know the result.

Thanks,
-Aubrey

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-19 15:40     ` Aubrey Li
  0 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-19 15:40 UTC (permalink / raw)
  To: Vaidyanathan Srinivasan
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>
> Hi Aubrey,
>
> I used your patch on my PPC64 box and I do not get expected
> behavior.  As you had requested, I am attaching zoneinfo and meminfo
> dumps:
>
> Please let me know if you need any further data to help me out with
> the test/experiment.
>

Although I have no PPC64 box in hand, I think the logic should be the same.
get_page_from_freelist() is called 5 times in __alloc_pages().

1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE;
2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE;
We should have the same result on the first two times get_page_from_freelist().

3) if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
			&& !in_interrupt())
   alloc_flags = ALLOC_NO_WATERMARKS
The case on my platform will never enter this branch. If the branch
occurs on your side,
the limit will be omitted. Because there is NO watermark, zone_watermark_ok()
will not be checked; memory will be allocated directly.

4)if (likely(did_some_progress)) {
   alloc_flags should include ALLOC_PAGECACHE.
So we should have the same result on this call.

5)	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
   alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE

This branch will not hit on my case. You may need to check it.

If 3) or 5) occurs on your platform, I think you can easily fix it.
Please confirm it and let me know the result.

Thanks,
-Aubrey

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-19 14:52   ` Vaidyanathan Srinivasan
@ 2007-01-19 16:05     ` Aubrey Li
  -1 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-19 16:05 UTC (permalink / raw)
  To: Vaidyanathan Srinivasan
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>
>
> Hi Aubrey,
>
> The idea of creating separate flag for pagecache in page_alloc is
> interesting.  The good part is that you flag watermark low and the
> zone reclaimer will do the rest of the job.
>
> However when the zone reclaimer starts to reclaim pages, it will
> remove all cold pages and not specifically pagecache pages.  This
> may affect performance of applications.
>
> One possible solution to this reclaim is to use scan control fields
> and ask the shrink_page_list() and shrink_active_list() routines to
> target only pagecache pages.  Pagecache pages are not mapped and
> they are easy to find on the LRU list.
>
> Please review my patch at http://lkml.org/lkml/2007/01/17/96
>

So you mean the existing reclaimer has the same issue, doesn't it?
In your and Roy's patch, balance_pagecache() routine is called on file
backed access.
So you still want to add this checking? or change the current
reclaimer completely?

-Aubrey

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-19 16:05     ` Aubrey Li
  0 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-19 16:05 UTC (permalink / raw)
  To: Vaidyanathan Srinivasan
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>
>
> Hi Aubrey,
>
> The idea of creating separate flag for pagecache in page_alloc is
> interesting.  The good part is that you flag watermark low and the
> zone reclaimer will do the rest of the job.
>
> However when the zone reclaimer starts to reclaim pages, it will
> remove all cold pages and not specifically pagecache pages.  This
> may affect performance of applications.
>
> One possible solution to this reclaim is to use scan control fields
> and ask the shrink_page_list() and shrink_active_list() routines to
> target only pagecache pages.  Pagecache pages are not mapped and
> they are easy to find on the LRU list.
>
> Please review my patch at http://lkml.org/lkml/2007/01/17/96
>

So you mean the existing reclaimer has the same issue, doesn't it?
In your and Roy's patch, balance_pagecache() routine is called on file
backed access.
So you still want to add this checking? or change the current
reclaimer completely?

-Aubrey

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-18  3:23 ` Aubrey Li
@ 2007-01-19 18:21   ` Christoph Lameter
  -1 siblings, 0 replies; 40+ messages in thread
From: Christoph Lameter @ 2007-01-19 18:21 UTC (permalink / raw)
  To: Aubrey Li
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

On Thu, 18 Jan 2007, Aubrey Li wrote:

> +int sysctl_pagecache_ratio = 10;
> +

Pagecache ratio is the ratio of memory to be left over? Would it not be 
better to twist this around and to be able to specify how much of the
memory of a node may be used by the pagecache?

Why limit the size of the pagecache? Some kind of rationale would be 
useful. Maybe it was there in earlier incarnations of the patch that I did 
not see? It should be kept with it.

zone_reclaim already dynamically limits the size of the pagecache.

> +	if (alloc_flags & ALLOC_PAGECACHE)
> +		min = min + (sysctl_pagecache_ratio * z->present_pages) / 100;

The calculation of the multiplication / division is usually not done in 
the hot allocation path. See f.e. how min_unmapped_pages is handled in 
mm/page_alloc.c

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-19 18:21   ` Christoph Lameter
  0 siblings, 0 replies; 40+ messages in thread
From: Christoph Lameter @ 2007-01-19 18:21 UTC (permalink / raw)
  To: Aubrey Li
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

On Thu, 18 Jan 2007, Aubrey Li wrote:

> +int sysctl_pagecache_ratio = 10;
> +

Pagecache ratio is the ratio of memory to be left over? Would it not be 
better to twist this around and to be able to specify how much of the
memory of a node may be used by the pagecache?

Why limit the size of the pagecache? Some kind of rationale would be 
useful. Maybe it was there in earlier incarnations of the patch that I did 
not see? It should be kept with it.

zone_reclaim already dynamically limits the size of the pagecache.

> +	if (alloc_flags & ALLOC_PAGECACHE)
> +		min = min + (sysctl_pagecache_ratio * z->present_pages) / 100;

The calculation of the multiplication / division is usually not done in 
the hot allocation path. See f.e. how min_unmapped_pages is handled in 
mm/page_alloc.c

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-19 16:05     ` Aubrey Li
@ 2007-01-19 18:49       ` Vaidyanathan Srinivasan
  -1 siblings, 0 replies; 40+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-01-19 18:49 UTC (permalink / raw)
  To: Aubrey Li
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz



Aubrey Li wrote:
> On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>>
>> Hi Aubrey,
>>
>> The idea of creating separate flag for pagecache in page_alloc is
>> interesting.  The good part is that you flag watermark low and the
>> zone reclaimer will do the rest of the job.
>>
>> However when the zone reclaimer starts to reclaim pages, it will
>> remove all cold pages and not specifically pagecache pages.  This
>> may affect performance of applications.
>>
>> One possible solution to this reclaim is to use scan control fields
>> and ask the shrink_page_list() and shrink_active_list() routines to
>> target only pagecache pages.  Pagecache pages are not mapped and
>> they are easy to find on the LRU list.
>>
>> Please review my patch at http://lkml.org/lkml/2007/01/17/96
>>
> 
> So you mean the existing reclaimer has the same issue, doesn't it?

Well, the existing reclaimer will do the right job if the kernel
really runs out of memory and needs to recover pages for new
allocations.  The pages to be removed will be the coldest pages in
the system.  However now with the introduction of pagecache limit,
we are artificially creating a memory scarcity and forcing the
reclaimer to throw away some pages while we actually have free
usable RAM.  In this context the choice of pages picked by the
present reclaimer may not be the best ones.

If pagecache is overlimit, we expect old (cold) pagecache pages to
be thrown out and reused for new file data.  We do not expect to
drop a few text or data pages to make room for new pagecache.

> In your and Roy's patch, balance_pagecache() routine is called on file
> backed access.
> So you still want to add this checking? or change the current
> reclaimer completely?

The balance_pagecache() routine is called for file backed access
since that is when we would probably exceed the pagecache limit.
The routine checks if the limit has been exceeded and calls the reclaimer.
The reclaimer is an extension of the present reclaimer with more
checks to remove only pagecache pages and not try to unmap any
mapped pages and potentially affect application performance.

I am open to suggestions on reclaim logic.  My view is that we need
to selectively reclaim pagecache pages and not just call the
traditional reclaimer to freeup arbitrary type of pages.

--Vaidy

> -Aubrey
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-19 18:49       ` Vaidyanathan Srinivasan
  0 siblings, 0 replies; 40+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-01-19 18:49 UTC (permalink / raw)
  To: Aubrey Li
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz


Aubrey Li wrote:
> On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>>
>> Hi Aubrey,
>>
>> The idea of creating separate flag for pagecache in page_alloc is
>> interesting.  The good part is that you flag watermark low and the
>> zone reclaimer will do the rest of the job.
>>
>> However when the zone reclaimer starts to reclaim pages, it will
>> remove all cold pages and not specifically pagecache pages.  This
>> may affect performance of applications.
>>
>> One possible solution to this reclaim is to use scan control fields
>> and ask the shrink_page_list() and shrink_active_list() routines to
>> target only pagecache pages.  Pagecache pages are not mapped and
>> they are easy to find on the LRU list.
>>
>> Please review my patch at http://lkml.org/lkml/2007/01/17/96
>>
> 
> So you mean the existing reclaimer has the same issue, doesn't it?

Well, the existing reclaimer will do the right job if the kernel
really runs out of memory and needs to recover pages for new
allocations.  The pages to be removed will be the coldest pages in
the system.  However now with the introduction of pagecache limit,
we are artificially creating a memory scarcity and forcing the
reclaimer to throw away some pages while we actually have free
usable RAM.  In this context the choice of pages picked by the
present reclaimer may not be the best ones.

If pagecache is overlimit, we expect old (cold) pagecache pages to
be thrown out and reused for new file data.  We do not expect to
drop a few text or data pages to make room for new pagecache.

> In your and Roy's patch, balance_pagecache() routine is called on file
> backed access.
> So you still want to add this checking? or change the current
> reclaimer completely?

The balance_pagecache() routine is called for file backed access
since that is when we would probably exceed the pagecache limit.
The routine checks if the limit has been exceeded and calls the reclaimer.
The reclaimer is an extension of the present reclaimer with more
checks to remove only pagecache pages and not try to unmap any
mapped pages and potentially affect application performance.

I am open to suggestions on reclaim logic.  My view is that we need
to selectively reclaim pagecache pages and not just call the
traditional reclaimer to freeup arbitrary type of pages.

--Vaidy

> -Aubrey
> -
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
> 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-19 18:49       ` Vaidyanathan Srinivasan
@ 2007-01-19 19:01         ` Christoph Lameter
  -1 siblings, 0 replies; 40+ messages in thread
From: Christoph Lameter @ 2007-01-19 19:01 UTC (permalink / raw)
  To: Vaidyanathan Srinivasan
  Cc: Aubrey Li, linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

On Sat, 20 Jan 2007, Vaidyanathan Srinivasan wrote:

> >> However when the zone reclaimer starts to reclaim pages, it will
> >> remove all cold pages and not specifically pagecache pages.  This
> >> may affect performance of applications.

The reclaimer is passed a control structure that can be used to disable
write to swap (if that is the concern).

> I am open to suggestions on reclaim logic.  My view is that we need
> to selectively reclaim pagecache pages and not just call the
> traditional reclaimer to freeup arbitrary type of pages.

The traditional reclaim works fine if told what to do. Introducing another 
LRU list to do reclaim is a significant change to the VM, creates lots of
overhead etc.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-19 19:01         ` Christoph Lameter
  0 siblings, 0 replies; 40+ messages in thread
From: Christoph Lameter @ 2007-01-19 19:01 UTC (permalink / raw)
  To: Vaidyanathan Srinivasan
  Cc: Aubrey Li, linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

On Sat, 20 Jan 2007, Vaidyanathan Srinivasan wrote:

> >> However when the zone reclaimer starts to reclaim pages, it will
> >> remove all cold pages and not specifically pagecache pages.  This
> >> may affect performance of applications.

The reclaimer is passed a control structure that can be used to disable
write to swap (if that is the concern).

> I am open to suggestions on reclaim logic.  My view is that we need
> to selectively reclaim pagecache pages and not just call the
> traditional reclaimer to freeup arbitrary type of pages.

The traditional reclaim works fine if told what to do. Introducing another 
LRU list to do reclaim is a significant change to the VM, creates lots of
overhead etc.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-19 18:49       ` Vaidyanathan Srinivasan
@ 2007-01-20  2:04         ` Aubrey Li
  -1 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-20  2:04 UTC (permalink / raw)
  To: Vaidyanathan Srinivasan
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On 1/20/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>
>
> Aubrey Li wrote:
> > On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
> >>
> >> Hi Aubrey,
> >>
> >> The idea of creating separate flag for pagecache in page_alloc is
> >> interesting.  The good part is that you flag watermark low and the
> >> zone reclaimer will do the rest of the job.
> >>
> >> However when the zone reclaimer starts to reclaim pages, it will
> >> remove all cold pages and not specifically pagecache pages.  This
> >> may affect performance of applications.
> >>
> >> One possible solution to this reclaim is to use scan control fields
> >> and ask the shrink_page_list() and shrink_active_list() routines to
> >> target only pagecache pages.  Pagecache pages are not mapped and
> >> they are easy to find on the LRU list.
> >>
> >> Please review my patch at http://lkml.org/lkml/2007/01/17/96
> >>
> >
> > So you mean the existing reclaimer has the same issue, doesn't it?
>
> Well, the existing reclaimer will do the right job if the kernel
> really runs out of memory and need to recover pages for new
> allocations.  The pages to be removed will be the coldest page in
> the system.  However now with the introduction of pagecache limit,
> we are artificially creating a memory scarcity and forcing the
> reclaimer to throw away some pages while we actually have free
> usable RAM.  In this context the choice of pages picked by the
> present reclaimer may not be the best ones.
>
> If pagecache is overlimit, we expect old (cold) pagecache pages to
> be thrown out and reused for new file data.  We do not expect to
> drop a few text or data pages to make room for new pagecache.
>
Well, actually I think this is probably not necessary. Because the
reclaimer has no way to predict the behavior of user mode processes,
how do you make sure the pagecache will not be accessed again in a short
time? So I think the present reclaimer is suitable. Limiting pagecache
must affect performance of applications. The key is what do you want
to get?
In my case, I get more memory to allocate, less fragmentation, it can
solve my problem, :)

Now the problem in the idea of the patch is, when the vfs cache limit is
hit, the reclaimer doesn't reclaim all of the reclaimable pages, it just
gives a few out. So the next time a vfs pagecache request occurs, it is
quite possible the reclaimer is triggered again. That's the point in my mind affecting
the performance of the applications.

I'll continue to work on this issue to see if I can make an improvement.

-Aubrey

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-20  2:04         ` Aubrey Li
  0 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-20  2:04 UTC (permalink / raw)
  To: Vaidyanathan Srinivasan
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On 1/20/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>
>
> Aubrey Li wrote:
> > On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
> >>
> >> Hi Aubrey,
> >>
> >> The idea of creating separate flag for pagecache in page_alloc is
> >> interesting.  The good part is that you flag watermark low and the
> >> zone reclaimer will do the rest of the job.
> >>
> >> However when the zone reclaimer starts to reclaim pages, it will
> >> remove all cold pages and not specifically pagecache pages.  This
> >> may affect performance of applications.
> >>
> >> One possible solution to this reclaim is to use scan control fields
> >> and ask the shrink_page_list() and shrink_active_list() routines to
> >> target only pagecache pages.  Pagecache pages are not mapped and
> >> they are easy to find on the LRU list.
> >>
> >> Please review my patch at http://lkml.org/lkml/2007/01/17/96
> >>
> >
> > So you mean the existing reclaimer has the same issue, doesn't it?
>
> Well, the existing reclaimer will do the right job if the kernel
> really runs out of memory and need to recover pages for new
> allocations.  The pages to be removed will be the coldest page in
> the system.  However now with the introduction of pagecache limit,
> we are artificially creating a memory scarcity and forcing the
> reclaimer to throw away some pages while we actually have free
> usable RAM.  In this context the choice of pages picked by the
> present reclaimer may not be the best ones.
>
> If pagecache is overlimit, we expect old (cold) pagecache pages to
> be thrown out and reused for new file data.  We do not expect to
> drop a few text or data pages to make room for new pagecache.
>
Well, actually I think this is probably not necessary. Because the
reclaimer has no way to predict the behavior of user mode processes,
how do you make sure the pagecache will not be accessed again in a short
time? So I think the present reclaimer is suitable. Limiting pagecache
must affect performance of applications. The key is what do you want
to get?
In my case, I get more memory to allocate, less fragmentation, it can
solve my problem, :)

Now the problem in the idea of the patch is, when the vfs cache limit is
hit, the reclaimer doesn't reclaim all of the reclaimable pages, it just
gives a few out. So the next time a vfs pagecache request occurs, it is
quite possible the reclaimer is triggered again. That's the point in my mind affecting
the performance of the applications.

I'll continue to work on this issue to see if I can make an improvement.

-Aubrey

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-20  2:04         ` Aubrey Li
@ 2007-01-20  2:24           ` Nick Piggin
  -1 siblings, 0 replies; 40+ messages in thread
From: Nick Piggin @ 2007-01-20  2:24 UTC (permalink / raw)
  To: Aubrey Li
  Cc: Vaidyanathan Srinivasan, linux-kernel, linux-mm, Linus Torvalds,
	Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

Aubrey Li wrote:
> On 1/20/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:

>> If pagecache is overlimit, we expect old (cold) pagecache pages to
>> be thrown out and reused for new file data.  We do not expect to
>> drop a few text or data pages to make room for new pagecache.
>>
> Well, actually I think this probably not necessary. Because the
> reclaimer has no way to predict the behavior of user mode processes,
> how do you make sure the pagecache will not be access again in a short

It is not about predicting behaviour, it is about directing the reclaim
effort at the actual resource that is under pressure.

Even given a pagecache limiting patch which does the proper accounting
to keep pagecache pages under a % limit (unlike yours), kicking off an
undirected reclaim could (in theory) reclaim all slab and anonymous
memory pages before bringing pagecache under the limit. So I think
you need to be a bit more thorough than just assuming everything will
be OK. Page reclaim behaviour is pretty strange and complex.

Secondly, your patch isn't actually very good. It unconditionally
shrinks memory to below the given % mark each time a pagecache alloc
occurs, regardless of how much pagecache is in the system. Effectively
that seems to just reduce the amount of memory available to the system.

Luckily, there are actually good, robust solutions for your higher
order allocation problem. Do higher order allocations at boot time,
modify userspace applications, or set up otherwise-unused, or easily
reclaimable reserve pools for higher order allocations. I don't
understand why you are so resistant to all of these approaches?

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-20  2:24           ` Nick Piggin
  0 siblings, 0 replies; 40+ messages in thread
From: Nick Piggin @ 2007-01-20  2:24 UTC (permalink / raw)
  To: Aubrey Li
  Cc: Vaidyanathan Srinivasan, linux-kernel, linux-mm, Linus Torvalds,
	Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

Aubrey Li wrote:
> On 1/20/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:

>> If pagecache is overlimit, we expect old (cold) pagecache pages to
>> be thrown out and reused for new file data.  We do not expect to
>> drop a few text or data pages to make room for new pagecache.
>>
> Well, actually I think this probably not necessary. Because the
> reclaimer has no way to predict the behavior of user mode processes,
> how do you make sure the pagecache will not be access again in a short

It is not about predicting behaviour, it is about directing the reclaim
effort at the actual resource that is under pressure.

Even given a pagecache limiting patch which does the proper accounting
to keep pagecache pages under a % limit (unlike yours), kicking off an
undirected reclaim could (in theory) reclaim all slab and anonymous
memory pages before bringing pagecache under the limit. So I think
you need to be a bit more thorough than just assuming everything will
be OK. Page reclaim behaviour is pretty strange and complex.

Secondly, your patch isn't actually very good. It unconditionally
shrinks memory to below the given % mark each time a pagecache alloc
occurs, regardless of how much pagecache is in the system. Effectively
that seems to just reduce the amount of memory available to the system.

Luckily, there are actually good, robust solutions for your higher
order allocation problem. Do higher order allocations at boot time,
modify userspace applications, or set up otherwise-unused, or easily
reclaimable reserve pools for higher order allocations. I don't
understand why you are so resistant to all of these approaches?

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-20  2:24           ` Nick Piggin
@ 2007-01-20  2:35             ` Mike Frysinger
  -1 siblings, 0 replies; 40+ messages in thread
From: Mike Frysinger @ 2007-01-20  2:35 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Aubrey Li, Vaidyanathan Srinivasan, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On 1/19/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Luckily, there are actually good, robust solutions for your higher
> order allocation problem. Do higher order allocations at boot time,
> modifiy userspace applications, or set up otherwise-unused, or easily
> reclaimable reserve pools for higher order allocations. I don't
> understand why you are so resistant to all of these approaches?

in a nutshell ...

the idea is to try and generalize these things

your approach involves tweaking each end solution to maximize the performance

our approach is to teach the kernel some more tricks so that each
solution need not be tweaked

these are at obvious odds as they tackle the problem by going in
pretty much opposite directions ... yours leads to a tighter system in
the end, but ours leads to much more rapid development and deployment
-mike

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-20  2:35             ` Mike Frysinger
  0 siblings, 0 replies; 40+ messages in thread
From: Mike Frysinger @ 2007-01-20  2:35 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Aubrey Li, Vaidyanathan Srinivasan, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On 1/19/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Luckily, there are actually good, robust solutions for your higher
> order allocation problem. Do higher order allocations at boot time,
> modifiy userspace applications, or set up otherwise-unused, or easily
> reclaimable reserve pools for higher order allocations. I don't
> understand why you are so resistant to all of these approaches?

in a nutshell ...

the idea is to try and generalize these things

your approach involves tweaking each end solution to maximize the performance

our approach is to teach the kernel some more tricks so that each
solution need not be tweaked

these are at obvious odds as they tackle the problem by going in
pretty much opposite directions ... yours leads to a tighter system in
the end, but ours leads to much more rapid development and deployment
-mike

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-20  2:35             ` Mike Frysinger
@ 2007-01-20  2:49               ` Nick Piggin
  -1 siblings, 0 replies; 40+ messages in thread
From: Nick Piggin @ 2007-01-20  2:49 UTC (permalink / raw)
  To: Mike Frysinger
  Cc: Aubrey Li, Vaidyanathan Srinivasan, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

Mike Frysinger wrote:
> On 1/19/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
>> Luckily, there are actually good, robust solutions for your higher
>> order allocation problem. Do higher order allocations at boot time,
>> modifiy userspace applications, or set up otherwise-unused, or easily
>> reclaimable reserve pools for higher order allocations. I don't
>> understand why you are so resistant to all of these approaches?
> 
> 
> in a nutshell ...
> 
> the idea is to try and generalize these things
> 
> your approach involves tweaking each end solution to maximize the 
> performance

Maybe, if you are talking about my advice to fix userspace... but you
*are* going to contribute those changes back for the nommu community
to use, right? So the end result of that is _not_ actually tweaking the
end solutions.

But actually, if you take the reserved pool approach, then that will
work fine, in-kernel, and it is something that already needs to be done
for dynamic hugepage allocations which is almost exactly the same
situation. And everybody can use this as well (I think most of the code
is written already, but not merged).

> our approach is to teach the kernel some more tricks so that each
> solution need not be tweaked
> 
> these are at obvious odds as they tackle the problem by going in
> pretty much opposite directions ... yours leads to a tighter system in
> the end, but ours leads to much more rapid development and deployment

OK that's fair enough, but considering that it doesn't actually fix
the problem properly; and that it does weird and wonderful things with
our already fragile page reclaim path, then it is not a good idea to
merge it upstream.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-20  2:49               ` Nick Piggin
  0 siblings, 0 replies; 40+ messages in thread
From: Nick Piggin @ 2007-01-20  2:49 UTC (permalink / raw)
  To: Mike Frysinger
  Cc: Aubrey Li, Vaidyanathan Srinivasan, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

Mike Frysinger wrote:
> On 1/19/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> 
>> Luckily, there are actually good, robust solutions for your higher
>> order allocation problem. Do higher order allocations at boot time,
>> modifiy userspace applications, or set up otherwise-unused, or easily
>> reclaimable reserve pools for higher order allocations. I don't
>> understand why you are so resistant to all of these approaches?
> 
> 
> in a nutshell ...
> 
> the idea is to try and generalize these things
> 
> your approach involves tweaking each end solution to maximize the 
> performance

Maybe, if you are talking about my advice to fix userspace... but you
*are* going to contribute those changes back for the nommu community
to use, right? So the end result of that is _not_ actually tweaking the
end solutions.

But actually, if you take the reserved pool approach, then that will
work fine, in-kernel, and it is something that already needs to be done
for dynamic hugepage allocations which is almost exactly the same
situation. And everybody can use this as well (I think most of the code
is written already, but not merged).

> our approach is to teach the kernel some more tricks so that each
> solution need not be tweaked
> 
> these are at obvious odds as they tackle the problem by going in
> pretty much opposite directions ... yours leads to a tighter system in
> the end, but ours leads to much more rapid development and deployment

OK that's fair enough, but considering that it doesn't actually fix
the problem properly; and that it does weird and wonderful things with
our already fragile page reclaim path, then it is not a good idea to
merge it upstream.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-20  2:24           ` Nick Piggin
@ 2007-01-20  3:08             ` Aubrey Li
  -1 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-20  3:08 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Vaidyanathan Srinivasan, linux-kernel, linux-mm, Linus Torvalds,
	Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On 1/20/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Aubrey Li wrote:
> > On 1/20/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>
> >> If pagecache is overlimit, we expect old (cold) pagecache pages to
> >> be thrown out and reused for new file data.  We do not expect to
> >> drop a few text or data pages to make room for new pagecache.
> >>
> > Well, actually I think this probably not necessary. Because the
> > reclaimer has no way to predict the behavior of user mode processes,
> > how do you make sure the pagecache will not be access again in a short
>
> It is not about predicting behaviour, it is about directing the reclaim
> effort at the actual resource that is under pressure.
>
> Even given a pagecache limiting patch which does the proper accounting
> to keep pagecache pages under a % limit (unlike yours), kicking off an
> undirected reclaim could (in theory) reclaim all slab and anonymous
> memory pages before bringing pagecache under the limit. So I think
> you need to be a bit more thorough than just assuming everything will
> be OK. Page reclaim behaviour is pretty strange and complex.

So what's the right way to limit pagecache?

>
> Secondly, your patch isn't actually very good. It unconditionally
> shrinks memory to below the given % mark each time a pagecache alloc
> occurs, regardless of how much pagecache is in the system. Effectively
> that seems to just reduce the amount of memory available to the system.

It doesn't reduce the amount of memory available to the system. It
just reduces the amount of memory available to the page cache. So that
page cache is limited and the reserved memory can be allocated by the
application.

>
> Luckily, there are actually good, robust solutions for your higher
> order allocation problem. Do higher order allocations at boot time,
> modifiy userspace applications, or set up otherwise-unused, or easily
> reclaimable reserve pools for higher order allocations. I don't
> understand why you are so resistant to all of these approaches?
>

I think we have explained the reason too much. We are working on
no-mmu arch and provide a platform running linux to our customer. They
are doing very good things like mplayer, asterisk, ip camera, etc on
our platform, some applications were migrated from mmu arch. I think
that means in some cases no-mmu arch is somewhat better than mmu arch.
So we are taking effort to make the migration smooth or make no-mmu
linux stronger.
It's no way to let our customer modify their applications, we also
unwilling to do it. And we have not an existing mechanism to set up a
pools for the complex applications. So I'm trying to do some coding
hack in the kernel to satisfy these kinds of requirement.

And as you see, the patch seems to solve the problems on my side. But
I'm not sure it's the right way to limit vfs cache, So I'm asking for
comments and suggestions and help, I'm not asking to clobber the
kernel.

-Aubrey

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-20  3:08             ` Aubrey Li
  0 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-20  3:08 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Vaidyanathan Srinivasan, linux-kernel, linux-mm, Linus Torvalds,
	Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On 1/20/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Aubrey Li wrote:
> > On 1/20/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>
> >> If pagecache is overlimit, we expect old (cold) pagecache pages to
> >> be thrown out and reused for new file data.  We do not expect to
> >> drop a few text or data pages to make room for new pagecache.
> >>
> > Well, actually I think this probably not necessary. Because the
> > reclaimer has no way to predict the behavior of user mode processes,
> > how do you make sure the pagecache will not be access again in a short
>
> It is not about predicting behaviour, it is about directing the reclaim
> effort at the actual resource that is under pressure.
>
> Even given a pagecache limiting patch which does the proper accounting
> to keep pagecache pages under a % limit (unlike yours), kicking off an
> undirected reclaim could (in theory) reclaim all slab and anonymous
> memory pages before bringing pagecache under the limit. So I think
> you need to be a bit more thorough than just assuming everything will
> be OK. Page reclaim behaviour is pretty strange and complex.

So what's the right way to limit pagecache?

>
> Secondly, your patch isn't actually very good. It unconditionally
> shrinks memory to below the given % mark each time a pagecache alloc
> occurs, regardless of how much pagecache is in the system. Effectively
> that seems to just reduce the amount of memory available to the system.

It doesn't reduce the amount of memory available to the system. It
just reduces the amount of memory available to the page cache. So that
page cache is limited and the reserved memory can be allocated by the
application.

>
> Luckily, there are actually good, robust solutions for your higher
> order allocation problem. Do higher order allocations at boot time,
> modifiy userspace applications, or set up otherwise-unused, or easily
> reclaimable reserve pools for higher order allocations. I don't
> understand why you are so resistant to all of these approaches?
>

I think we have explained the reason too much. We are working on
no-mmu arch and provide a platform running linux to our customer. They
are doing very good things like mplayer, asterisk, ip camera, etc on
our platform, some applications were migrated from mmu arch. I think
that means in some cases no-mmu arch is somewhat better than mmu arch.
So we are taking effort to make the migration smooth or make no-mmu
linux stronger.
It's no way to let our customer modify their applications, we also
unwilling to do it. And we have not an existing mechanism to set up a
pools for the complex applications. So I'm trying to do some coding
hack in the kernel to satisfy these kinds of requirement.

And as you see, the patch seems to solve the problems on my side. But
I'm not sure it's the right way to limit vfs cache, So I'm asking for
comments and suggestions and help, I'm not asking to clobber the
kernel.

-Aubrey

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-20  2:49               ` Nick Piggin
@ 2007-01-20  3:40                 ` Mike Frysinger
  -1 siblings, 0 replies; 40+ messages in thread
From: Mike Frysinger @ 2007-01-20  3:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Aubrey Li, Vaidyanathan Srinivasan, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On 1/19/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Maybe, if you are talking about my advice to fix userspace... but you
> *are* going to contribute those changes back for the nommu community
> to use, right? So the end result of that is _not_ actually tweaking the
> end solutions.

not quite sure what you're referring to here, but our approach is to
contribute everything back in an acceptable form
-mike

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-20  3:40                 ` Mike Frysinger
  0 siblings, 0 replies; 40+ messages in thread
From: Mike Frysinger @ 2007-01-20  3:40 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Aubrey Li, Vaidyanathan Srinivasan, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On 1/19/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Maybe, if you are talking about my advice to fix userspace... but you
> *are* going to contribute those changes back for the nommu community
> to use, right? So the end result of that is _not_ actually tweaking the
> end solutions.

not quite sure what you're referring to here, but our approach is to
contribute everything back in an acceptable form
-mike

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-20  3:08             ` Aubrey Li
@ 2007-01-20  4:03               ` Nick Piggin
  -1 siblings, 0 replies; 40+ messages in thread
From: Nick Piggin @ 2007-01-20  4:03 UTC (permalink / raw)
  To: Aubrey Li
  Cc: Vaidyanathan Srinivasan, linux-kernel, linux-mm, Linus Torvalds,
	Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

Aubrey Li wrote:

> So what's the right way to limit pagecache?

Probably something a lot more complicated... if you can say there
is a "right way".

>> Secondly, your patch isn't actually very good. It unconditionally
>> shrinks memory to below the given % mark each time a pagecache alloc
>> occurs, regardless of how much pagecache is in the system. Effectively
>> that seems to just reduce the amount of memory available to the system.
> 
> 
> It doesn't reduce the amount of memory available to the system. It
> just reduce the amount of memory available to the page cache. So that
> page cache is limited and the reserved memory can be allocated by the
> application.

But the patch doesn't do that, as I explained.

>> Luckily, there are actually good, robust solutions for your higher
>> order allocation problem. Do higher order allocations at boot time,
>> modifiy userspace applications, or set up otherwise-unused, or easily
>> reclaimable reserve pools for higher order allocations. I don't
>> understand why you are so resistant to all of these approaches?
>>
> 
> I think we have explained the reason too much. We are working on
> no-mmu arch and provide a platform running linux to our customer. They
> are doing very good things like mplayer, asterisk, ip camera, etc on
> our platform, some applications was migrated from mmu arch. I think
> that means in some cases no-mmu arch is somewhat better than mmu arch.
> So we are taking effort to make the migration smooth or make no-mmu
> linux stronger.
> It's no way to let our customer modify their applications, we also
> unwilling to do it. And we have not an existing mechanism to set up a
> pools for the complex applications. So I'm trying to do some coding
> hack in the kernel to satisfy these kinds of requirement.

Oh, maybe you misunderstand the reserve pools idea: that is an entirely
kernel based solution where you can preallocate a large, contiguous
pool of memory at boot time which you can use to satisfy your nommu
higher order anonymous memory allocations.

This is something that will not get fragmented by pagecache, nor will
it get fragmented by any other page allocation, slab allocation. It is
a pretty good solution provided that you size the pool correctly for
your application's needs.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-20  4:03               ` Nick Piggin
  0 siblings, 0 replies; 40+ messages in thread
From: Nick Piggin @ 2007-01-20  4:03 UTC (permalink / raw)
  To: Aubrey Li
  Cc: Vaidyanathan Srinivasan, linux-kernel, linux-mm, Linus Torvalds,
	Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

Aubrey Li wrote:

> So what's the right way to limit pagecache?

Probably something a lot more complicated... if you can say there
is a "right way".

>> Secondly, your patch isn't actually very good. It unconditionally
>> shrinks memory to below the given % mark each time a pagecache alloc
>> occurs, regardless of how much pagecache is in the system. Effectively
>> that seems to just reduce the amount of memory available to the system.
> 
> 
> It doesn't reduce the amount of memory available to the system. It
> just reduce the amount of memory available to the page cache. So that
> page cache is limited and the reserved memory can be allocated by the
> application.

But the patch doesn't do that, as I explained.

>> Luckily, there are actually good, robust solutions for your higher
>> order allocation problem. Do higher order allocations at boot time,
>> modifiy userspace applications, or set up otherwise-unused, or easily
>> reclaimable reserve pools for higher order allocations. I don't
>> understand why you are so resistant to all of these approaches?
>>
> 
> I think we have explained the reason too much. We are working on
> no-mmu arch and provide a platform running linux to our customer. They
> are doing very good things like mplayer, asterisk, ip camera, etc on
> our platform, some applications was migrated from mmu arch. I think
> that means in some cases no-mmu arch is somewhat better than mmu arch.
> So we are taking effort to make the migration smooth or make no-mmu
> linux stronger.
> It's no way to let our customer modify their applications, we also
> unwilling to do it. And we have not an existing mechanism to set up a
> pools for the complex applications. So I'm trying to do some coding
> hack in the kernel to satisfy these kinds of requirement.

Oh, maybe you misunderstand the reserve pools idea: that is an entirely
kernel based solution where you can preallocate a large, contiguous
pool of memory at boot time which you can use to satisfy your nommu
higher order anonymous memory allocations.

This is something that will not get fragmented by pagecache, nor will
it get fragmented by any other page allocation, slab allocation. It is
a pretty good solution provided that you size the pool correctly for
your application's needs.

-- 
SUSE Labs, Novell Inc.
Send instant messages to your online friends http://au.messenger.yahoo.com 

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-20  4:03               ` Nick Piggin
@ 2007-01-20  4:26                 ` Aubrey Li
  -1 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-20  4:26 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Vaidyanathan Srinivasan, linux-kernel, linux-mm, Linus Torvalds,
	Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On 1/20/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Aubrey Li wrote:
>
> > So what's the right way to limit pagecache?
>
> Probably something a lot more complicated... if you can say there
> is a "right way".
>
> >> Secondly, your patch isn't actually very good. It unconditionally
> >> shrinks memory to below the given % mark each time a pagecache alloc
> >> occurs, regardless of how much pagecache is in the system. Effectively
> >> that seems to just reduce the amount of memory available to the system.
> >
> >
> > It doesn't reduce the amount of memory available to the system. It
> > just reduce the amount of memory available to the page cache. So that
> > page cache is limited and the reserved memory can be allocated by the
> > application.
>
> But the patch doesn't do that, as I explained.

I'm not sure you read the correct patch. Let me explain the logic again.

assume:
min = 123pages
pagecache_reserved = 200 pages

if( alloc_flags & ALLOC_PAGECACHE)
        watermark = min + pagecache_reserved ( 323 pages)
else
        watermark = min ( 123 pages)

So if request pagecache, when free pages < 323 pages, reclaim is triggered.
But at this time if request memory not pagecache, reclaim will be
triggered when free pages < 123 as the present reclaimer does.

I verified it on my side, why do you think it doesn't work properly?

>
> >> Luckily, there are actually good, robust solutions for your higher
> >> order allocation problem. Do higher order allocations at boot time,
> >> modifiy userspace applications, or set up otherwise-unused, or easily
> >> reclaimable reserve pools for higher order allocations. I don't
> >> understand why you are so resistant to all of these approaches?
> >>
> >
> > I think we have explained the reason too much. We are working on
> > no-mmu arch and provide a platform running linux to our customer. They
> > are doing very good things like mplayer, asterisk, ip camera, etc on
> > our platform, some applications was migrated from mmu arch. I think
> > that means in some cases no-mmu arch is somewhat better than mmu arch.
> > So we are taking effort to make the migration smooth or make no-mmu
> > linux stronger.
> > It's no way to let our customer modify their applications, we also
> > unwilling to do it. And we have not an existing mechanism to set up a
> > pools for the complex applications. So I'm trying to do some coding
> > hack in the kernel to satisfy these kinds of requirement.
>
> Oh, maybe you misunderstand the reserve pools idea: that is an entirely
> kernel based solution where you can preallocate a large, contiguous
> pool of memory at boot time which you can use to satisfy your nommu
> higher order anonymous memory allocations.
>
> This is something that will not get fragmented by pagecache, nor will
> it get fragmented by any other page allocation, slab allocation. Tt is
> a pretty good solution provided that you size the pool correctly for
> your application's needs.
>

So if application malloc(1M), how does kernel know to allocate
reserved pool not from buddy system? I didn't see any special code
about this. Is there any doc or example?

-Aubrey

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-20  4:26                 ` Aubrey Li
  0 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-20  4:26 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Vaidyanathan Srinivasan, linux-kernel, linux-mm, Linus Torvalds,
	Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On 1/20/07, Nick Piggin <nickpiggin@yahoo.com.au> wrote:
> Aubrey Li wrote:
>
> > So what's the right way to limit pagecache?
>
> Probably something a lot more complicated... if you can say there
> is a "right way".
>
> >> Secondly, your patch isn't actually very good. It unconditionally
> >> shrinks memory to below the given % mark each time a pagecache alloc
> >> occurs, regardless of how much pagecache is in the system. Effectively
> >> that seems to just reduce the amount of memory available to the system.
> >
> >
> > It doesn't reduce the amount of memory available to the system. It
> > just reduce the amount of memory available to the page cache. So that
> > page cache is limited and the reserved memory can be allocated by the
> > application.
>
> But the patch doesn't do that, as I explained.

I'm not sure you read the correct patch. Let me explain the logic again.

assume:
min = 123pages
pagecache_reserved = 200 pages

if( alloc_flags & ALLOC_PAGECACHE)
        watermark = min + pagecache_reserved ( 323 pages)
else
        watermark = min ( 123 pages)

So if request pagecache, when free pages < 323 pages, reclaim is triggered.
But at this time if request memory not pagecache, reclaim will be
triggered when free pages < 123 as the present reclaimer does.

I verified it on my side, why do you think it doesn't work properly?

>
> >> Luckily, there are actually good, robust solutions for your higher
> >> order allocation problem. Do higher order allocations at boot time,
> >> modifiy userspace applications, or set up otherwise-unused, or easily
> >> reclaimable reserve pools for higher order allocations. I don't
> >> understand why you are so resistant to all of these approaches?
> >>
> >
> > I think we have explained the reason too much. We are working on
> > no-mmu arch and provide a platform running linux to our customer. They
> > are doing very good things like mplayer, asterisk, ip camera, etc on
> > our platform, some applications was migrated from mmu arch. I think
> > that means in some cases no-mmu arch is somewhat better than mmu arch.
> > So we are taking effort to make the migration smooth or make no-mmu
> > linux stronger.
> > It's no way to let our customer modify their applications, we also
> > unwilling to do it. And we have not an existing mechanism to set up a
> > pools for the complex applications. So I'm trying to do some coding
> > hack in the kernel to satisfy these kinds of requirement.
>
> Oh, maybe you misunderstand the reserve pools idea: that is an entirely
> kernel based solution where you can preallocate a large, contiguous
> pool of memory at boot time which you can use to satisfy your nommu
> higher order anonymous memory allocations.
>
> This is something that will not get fragmented by pagecache, nor will
> it get fragmented by any other page allocation, slab allocation. Tt is
> a pretty good solution provided that you size the pool correctly for
> your application's needs.
>

So if application malloc(1M), how does kernel know to allocate
reserved pool not from buddy system? I didn't see any special code
about this. Is there any doc or example?

-Aubrey

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-20  4:03               ` Nick Piggin
@ 2007-01-22 19:15                 ` Christoph Lameter
  -1 siblings, 0 replies; 40+ messages in thread
From: Christoph Lameter @ 2007-01-22 19:15 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Aubrey Li, Vaidyanathan Srinivasan, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On Sat, 20 Jan 2007, Nick Piggin wrote:

> > It doesn't reduce the amount of memory available to the system. It
> > just reduce the amount of memory available to the page cache. So that
> > page cache is limited and the reserved memory can be allocated by the
> > application.
> 
> But the patch doesn't do that, as I explained.

The patch could do it if he would be checking NR_FILE_PAGES against 
a limit instead of the free pages.


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-22 19:15                 ` Christoph Lameter
  0 siblings, 0 replies; 40+ messages in thread
From: Christoph Lameter @ 2007-01-22 19:15 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Aubrey Li, Vaidyanathan Srinivasan, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On Sat, 20 Jan 2007, Nick Piggin wrote:

> > It doesn't reduce the amount of memory available to the system. It
> > just reduce the amount of memory available to the page cache. So that
> > page cache is limited and the reserved memory can be allocated by the
> > application.
> 
> But the patch doesn't do that, as I explained.

The patch could do it if he would be checking NR_FILE_PAGES against 
a limit instead of the free pages.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-20  4:26                 ` Aubrey Li
@ 2007-01-22 19:22                   ` Christoph Lameter
  -1 siblings, 0 replies; 40+ messages in thread
From: Christoph Lameter @ 2007-01-22 19:22 UTC (permalink / raw)
  To: Aubrey Li
  Cc: Nick Piggin, Vaidyanathan Srinivasan, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On Sat, 20 Jan 2007, Aubrey Li wrote:

> assume:
> min = 123pages
> pagecache_reserved = 200 pages
> 
> if( alloc_flags & ALLOC_PAGECACHE)
>        watermark = min + pagecache_reserved ( 323 pages)
> else
>        watermark = min ( 123 pages)
> 
> So if request pagecache, when free pages < 323 pages, reclaim is triggered.
> But at this time if request memory not pagecache, reclaim will be
> triggered when free pages < 123 as the present reclaimer does.
> 
> I verified it on my side, why do you think it doesn't work properly?

The code does not check the page cache size but the number of free pages. 
The page cache size is available via zone_page_state(zone, NR_FILE_PAGES).

In its current form your patch is making the system reclaim earlier for 
page cache allocations. And it's reclaiming regardless of the number of 
pages in the page cache. If there are no pagecache pages but only 
anonymous pages in the zone then the code will still reclaim although the 
page cache size is zero.

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-22 19:22                   ` Christoph Lameter
  0 siblings, 0 replies; 40+ messages in thread
From: Christoph Lameter @ 2007-01-22 19:22 UTC (permalink / raw)
  To: Aubrey Li
  Cc: Nick Piggin, Vaidyanathan Srinivasan, linux-kernel, linux-mm,
	Linus Torvalds, Andrew Morton, linux-os (Dick Johnson),
	Robin Getz, Hennerich, Michael

On Sat, 20 Jan 2007, Aubrey Li wrote:

> assume:
> min = 123pages
> pagecache_reserved = 200 pages
> 
> if( alloc_flags & ALLOC_PAGECACHE)
>        watermark = min + pagecache_reserved ( 323 pages)
> else
>        watermark = min ( 123 pages)
> 
> So if request pagecache, when free pages < 323 pages, reclaim is triggered.
> But at this time if request memory not pagecache, reclaim will be
> triggered when free pages < 123 as the present reclaimer does.
> 
> I verified it on my side, why do you think it doesn't work properly?

The code does not check the page cache size but the number of free pages. 
The page cache size is available via zone_page_state(zone, NR_FILE_PAGES).

In its current form your patch is making the system reclaim earlier for 
page cache allocations. And it's reclaiming regardless of the number of 
pages in the page cache. If there are no pagecache pages but only 
anonymous pages in the zone then the code will still reclaim although the 
page cache size is zero.

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-19 15:40     ` Aubrey Li
@ 2007-01-24  5:30       ` Vaidyanathan Srinivasan
  -1 siblings, 0 replies; 40+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-01-24  5:30 UTC (permalink / raw)
  To: Aubrey Li
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz



Aubrey Li wrote:
> On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>> Hi Aubrey,
>>
>> I used your patch on my PPC64 box and I do not get expected
>> behavior.  As you had requested, I am attaching zoneinfo and meminfo
>> dumps:
>>
>> Please let me know if you need any further data to help me out with
>> the test/experiment.
>>
> 
> Although I have no PPC64 box in hand, I think the logic should be the same.
> get_page_from_freelist() is called 5 times in __alloc_pages().
> 
> 1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE;
> 2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE;
> We should have the same result on the first two times get_page_from_freelist().
> 
> 3) if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
> 			&& !in_interrupt())
>    alloc_flags = ALLOC_NO_WATERMARKS
> The case on my platform will never enter this branch. If the branch
> occurs on your side,
> The limit will be omitted. Because NO watermark, zone_watermark_ok()
> will not be checked. memory will be allocated directly.
> 
> 4)if (likely(did_some_progress)) {
>    alloc_flags should include ALLOC_PAGECACHE.
> So we should have the same result on this call.
> 
> 5)	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
>    alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE
> 
> This branch will not hit on my case. You may need to check it.
> 
> If 3) or 5) occurs on your platform, I think you can easily fix it.
> Please confirm it and let me know the result.


None of the above conditions was the problem in my PPC64 box.  I
added __GFP_PAGECACHE flag in pagecache_alloc_cold() and
grab_cache_page_nowait() routines and the reclaim seemed to work.

--- linux-2.6.20-rc5.orig/include/linux/pagemap.h
+++ linux-2.6.20-rc5/include/linux/pagemap.h
@@ -62,12 +62,12 @@ static inline struct page *__page_cache_

 static inline struct page *page_cache_alloc(struct address_space *x)
 {
-	return __page_cache_alloc(mapping_gfp_mask(x));
+	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE);
 }

 static inline struct page *page_cache_alloc_cold(struct
address_space *x)
 {
-	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
+	return
__page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD|__GFP_PAGECACHE);
 }

 typedef int filler_t(void *, struct page *);

[snip]

--- linux-2.6.20-rc5.orig/mm/filemap.c
+++ linux-2.6.20-rc5/mm/filemap.c
@@ -823,7 +823,7 @@ grab_cache_page_nowait(struct address_sp
 		page_cache_release(page);
 		return NULL;
 	}
-	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
+	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS |
__GFP_PAGECACHE);
 	if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
 		page_cache_release(page);
 		page = NULL;


pagecache_alloc_cold() is used in the read-ahead path which was
being called in my case of large file operations.

--Vaidy


^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-24  5:30       ` Vaidyanathan Srinivasan
  0 siblings, 0 replies; 40+ messages in thread
From: Vaidyanathan Srinivasan @ 2007-01-24  5:30 UTC (permalink / raw)
  To: Aubrey Li
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz


Aubrey Li wrote:
> On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>> Hi Aubrey,
>>
>> I used your patch on my PPC64 box and I do not get expected
>> behavior.  As you had requested, I am attaching zoneinfo and meminfo
>> dumps:
>>
>> Please let me know if you need any further data to help me out with
>> the test/experiment.
>>
> 
> Although I have no PPC64 box in hand, I think the logic should be the same.
> get_page_from_freelist() is called 5 times in __alloc_pages().
> 
> 1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE;
> 2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE;
> We should have the same result on the first two times get_page_from_freelist().
> 
> 3) if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
> 			&& !in_interrupt())
>    alloc_flags = ALLOC_NO_WATERMARKS
> The case on my platform will never enter this branch. If the branch
> occurs on your side,
> The limit will be omitted. Because NO watermark, zone_watermark_ok()
> will not be checked. memory will be allocated directly.
> 
> 4)if (likely(did_some_progress)) {
>    alloc_flags should include ALLOC_PAGECACHE.
> So we should have the same result on this call.
> 
> 5)	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
>    alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE
> 
> This branch will not hit on my case. You may need to check it.
> 
> If 3) or 5) occurs on your platform, I think you can easily fix it.
> Please confirm it and let me know the result.


None of the above conditions was the problem in my PPC64 box.  I
added __GFP_PAGECACHE flag in pagecache_alloc_cold() and
grab_cache_page_nowait() routines and the reclaim seemed to work.

--- linux-2.6.20-rc5.orig/include/linux/pagemap.h
+++ linux-2.6.20-rc5/include/linux/pagemap.h
@@ -62,12 +62,12 @@ static inline struct page *__page_cache_

 static inline struct page *page_cache_alloc(struct address_space *x)
 {
-	return __page_cache_alloc(mapping_gfp_mask(x));
+	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE);
 }

 static inline struct page *page_cache_alloc_cold(struct
address_space *x)
 {
-	return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
+	return
__page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD|__GFP_PAGECACHE);
 }

 typedef int filler_t(void *, struct page *);

[snip]

--- linux-2.6.20-rc5.orig/mm/filemap.c
+++ linux-2.6.20-rc5/mm/filemap.c
@@ -823,7 +823,7 @@ grab_cache_page_nowait(struct address_sp
 		page_cache_release(page);
 		return NULL;
 	}
-	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
+	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS |
__GFP_PAGECACHE);
 	if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
 		page_cache_release(page);
 		page = NULL;


pagecache_alloc_cold() is used in the read-ahead path which was
being called in my case of large file operations.

--Vaidy

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
  2007-01-24  5:30       ` Vaidyanathan Srinivasan
@ 2007-01-24  5:53         ` Aubrey Li
  -1 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-24  5:53 UTC (permalink / raw)
  To: Vaidyanathan Srinivasan
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

On 1/24/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>
>
> Aubrey Li wrote:
> > On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
> >> Hi Aubrey,
> >>
> >> I used your patch on my PPC64 box and I do not get expected
> >> behavior.  As you had requested, I am attaching zoneinfo and meminfo
> >> dumps:
> >>
> >> Please let me know if you need any further data to help me out with
> >> the test/experiment.
> >>
> >
> > Although I have no PPC64 box in hand, I think the logic should be the same.
> > get_page_from_freelist() is called 5 times in __alloc_pages().
> >
> > 1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE;
> > 2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE;
> > We should have the same result on the first two times get_page_from_freelist().
> >
> > 3) if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
> >                       && !in_interrupt())
> >    alloc_flags = ALLOC_NO_WATERMARKS
> > The case on my platform will never enter this branch. If the branch
> > occurs on your side,
> > The limit will be omitted. Because NO watermark, zone_watermark_ok()
> > will not be checked. memory will be allocated directly.
> >
> > 4)if (likely(did_some_progress)) {
> >    alloc_flags should include ALLOC_PAGECACHE.
> > So we should have the same result on this call.
> >
> > 5)    } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
> >    alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE
> >
> > This branch will not hit on my case. You may need to check it.
> >
> > If 3) or 5) occurs on your platform, I think you can easily fix it.
> > Please confirm it and let me know the result.
>
>
> None of the above condition was the problem in my PPC64 box.  I
> added __GFP_PAGECACHE flag in pagecache_alloc_cold() and
> grab_cache_page_nowait() routines and the reclaim seemed to work.
>
> --- linux-2.6.20-rc5.orig/include/linux/pagemap.h
> +++ linux-2.6.20-rc5/include/linux/pagemap.h
> @@ -62,12 +62,12 @@ static inline struct page *__page_cache_
>
>  static inline struct page *page_cache_alloc(struct address_space *x)
>  {
> -       return __page_cache_alloc(mapping_gfp_mask(x));
> +       return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE);
>  }
>
>  static inline struct page *page_cache_alloc_cold(struct
> address_space *x)
>  {
> -       return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
> +       return
> __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD|__GFP_PAGECACHE);
>  }
>
>  typedef int filler_t(void *, struct page *);
>
> [snip]
>
> --- linux-2.6.20-rc5.orig/mm/filemap.c
> +++ linux-2.6.20-rc5/mm/filemap.c
> @@ -823,7 +823,7 @@ grab_cache_page_nowait(struct address_sp
>                 page_cache_release(page);
>                 return NULL;
>         }
> -       page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
> +       page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS |
> __GFP_PAGECACHE);
>         if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
>                 page_cache_release(page);
>                 page = NULL;
>
>
> pagecache_alloc_cold() is used in the read-ahead path which was
> being called in my case of large file operations.
>
> --Vaidy
>
Thanks for pointing it out. There is another patch on the LKML which I
think is better.
Checking the zone->max_pagecache  in the get_page_from_freelist() is
better than checking the watermark in zone_watermark_ok(). Let me know
if it works for you.

Thanks,
-Aubrey

^ permalink raw reply	[flat|nested] 40+ messages in thread

* Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
@ 2007-01-24  5:53         ` Aubrey Li
  0 siblings, 0 replies; 40+ messages in thread
From: Aubrey Li @ 2007-01-24  5:53 UTC (permalink / raw)
  To: Vaidyanathan Srinivasan
  Cc: linux-kernel, linux-mm, Linus Torvalds, Andrew Morton,
	Nick Piggin, linux-os (Dick Johnson),
	Robin Getz

On 1/24/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
>
>
> Aubrey Li wrote:
> > On 1/19/07, Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com> wrote:
> >> Hi Aubrey,
> >>
> >> I used your patch on my PPC64 box and I do not get expected
> >> behavior.  As you had requested, I am attaching zoneinfo and meminfo
> >> dumps:
> >>
> >> Please let me know if you need any further data to help me out with
> >> the test/experiment.
> >>
> >
> > Although I have no PPC64 box in hand, I think the logic should be the same.
> > get_page_from_freelist() is called 5 times in __alloc_pages().
> >
> > 1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE;
> > 2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE;
> > We should have the same result on the first two times get_page_from_freelist().
> >
> > 3) if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
> >                       && !in_interrupt())
> >    alloc_flags = ALLOC_NO_WATERMARKS
> > The case on my platform will never enter this branch. If the branch
> > occurs on your side,
> > The limit will be omitted. Because NO watermark, zone_watermark_ok()
> > will not be checked. memory will be allocated directly.
> >
> > 4)if (likely(did_some_progress)) {
> >    alloc_flags should include ALLOC_PAGECACHE.
> > So we should have the same result on this call.
> >
> > 5)    } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
> >    alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE
> >
> > This branch will not hit on my case. You may need to check it.
> >
> > If 3) or 5) occurs on your platform, I think you can easily fix it.
> > Please confirm it and let me know the result.
>
>
> None of the above condition was the problem in my PPC64 box.  I
> added __GFP_PAGECACHE flag in pagecache_alloc_cold() and
> grab_cache_page_nowait() routines and the reclaim seemed to work.
>
> --- linux-2.6.20-rc5.orig/include/linux/pagemap.h
> +++ linux-2.6.20-rc5/include/linux/pagemap.h
> @@ -62,12 +62,12 @@ static inline struct page *__page_cache_
>
>  static inline struct page *page_cache_alloc(struct address_space *x)
>  {
> -       return __page_cache_alloc(mapping_gfp_mask(x));
> +       return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE);
>  }
>
>  static inline struct page *page_cache_alloc_cold(struct
> address_space *x)
>  {
> -       return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD);
> +       return
> __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD|__GFP_PAGECACHE);
>  }
>
>  typedef int filler_t(void *, struct page *);
>
> [snip]
>
> --- linux-2.6.20-rc5.orig/mm/filemap.c
> +++ linux-2.6.20-rc5/mm/filemap.c
> @@ -823,7 +823,7 @@ grab_cache_page_nowait(struct address_sp
>                 page_cache_release(page);
>                 return NULL;
>         }
> -       page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
> +       page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS |
> __GFP_PAGECACHE);
>         if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
>                 page_cache_release(page);
>                 page = NULL;
>
>
> pagecache_alloc_cold() is used in the read-ahead path which was
> being called in my case of large file operations.
>
> --Vaidy
>
Thanks for pointing it out. There is another patch on the LKML which I
think is better.
Checking the zone->max_pagecache  in the get_page_from_freelist() is
better than checking the watermark in zone_watermark_ok(). Let me know
if it works for you.

Thanks,
-Aubrey

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 40+ messages in thread

end of thread, other threads:[~2007-01-24  5:53 UTC | newest]

Thread overview: 40+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2007-01-18  3:23 [RPC][PATCH 2.6.20-rc5] limit total vfs page cache Aubrey Li
2007-01-18  3:23 ` Aubrey Li
2007-01-19 14:44 ` Vaidyanathan Srinivasan
2007-01-19 14:44   ` Vaidyanathan Srinivasan
2007-01-19 15:40   ` Aubrey Li
2007-01-19 15:40     ` Aubrey Li
2007-01-24  5:30     ` Vaidyanathan Srinivasan
2007-01-24  5:30       ` Vaidyanathan Srinivasan
2007-01-24  5:53       ` Aubrey Li
2007-01-24  5:53         ` Aubrey Li
2007-01-19 14:52 ` Vaidyanathan Srinivasan
2007-01-19 14:52   ` Vaidyanathan Srinivasan
2007-01-19 16:05   ` Aubrey Li
2007-01-19 16:05     ` Aubrey Li
2007-01-19 18:49     ` Vaidyanathan Srinivasan
2007-01-19 18:49       ` Vaidyanathan Srinivasan
2007-01-19 19:01       ` Christoph Lameter
2007-01-19 19:01         ` Christoph Lameter
2007-01-20  2:04       ` Aubrey Li
2007-01-20  2:04         ` Aubrey Li
2007-01-20  2:24         ` Nick Piggin
2007-01-20  2:24           ` Nick Piggin
2007-01-20  2:35           ` Mike Frysinger
2007-01-20  2:35             ` Mike Frysinger
2007-01-20  2:49             ` Nick Piggin
2007-01-20  2:49               ` Nick Piggin
2007-01-20  3:40               ` Mike Frysinger
2007-01-20  3:40                 ` Mike Frysinger
2007-01-20  3:08           ` Aubrey Li
2007-01-20  3:08             ` Aubrey Li
2007-01-20  4:03             ` Nick Piggin
2007-01-20  4:03               ` Nick Piggin
2007-01-20  4:26               ` Aubrey Li
2007-01-20  4:26                 ` Aubrey Li
2007-01-22 19:22                 ` Christoph Lameter
2007-01-22 19:22                   ` Christoph Lameter
2007-01-22 19:15               ` Christoph Lameter
2007-01-22 19:15                 ` Christoph Lameter
2007-01-19 18:21 ` Christoph Lameter
2007-01-19 18:21   ` Christoph Lameter

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.