linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] per-zone kswapd process
@ 2002-09-13  3:33 Dave Hansen
  2002-09-13  4:06 ` Andrew Morton
  0 siblings, 1 reply; 28+ messages in thread
From: Dave Hansen @ 2002-09-13  3:33 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Martin J. Bligh, William Lee Irwin III, linux-kernel, linux-mm

[-- Attachment #1: Type: text/plain, Size: 1119 bytes --]

This patch implements a kswapd process for each memory zone.  The original code 
came from Bill Irwin, but the current VM is quite a bit different from the one 
that he wrote it for, so not much remains.  The current kswapd interface is much 
simpler than before because there is a single waitqueue and there is a 
single place where it is emptied.

kswapd_can_sleep() and kswapd_balance() are simpler now that the extra pgdat 
level of indirection is gone.

Tested on 8-way PIII with highmem off and then 4GB support.  With 4GB support, I 
did 20 parallel greps through a 10GB fileset while some other processes 
allocated and freed 1-2GB chunks of memory.  That gave kswapd a good workout, 
and I observed it running the zone Highmem and zone Normal kswapd threads.  So, 
it survives my torture test.  It also removes more code than it adds.

include/linux/mmzone.h |    2 +
include/linux/swap.h   |    1
mm/page_alloc.c        |   11 +++++-
mm/vmscan.c            |   88 +++++++++++++++++--------------------------------
4 files changed, 42 insertions(+), 60 deletions(-)

-- 
Dave Hansen
haveblue@us.ibm.com




[-- Attachment #2: per-zone-kswapd-2.5.34-mm2-3.patch --]
[-- Type: text/plain, Size: 6092 bytes --]

# This is a BitKeeper generated patch for the following project:
# Project Name: Linux kernel tree
# This patch format is intended for GNU patch command version 2.5 or higher.
# This patch includes the following deltas:
#	           ChangeSet	1.625   -> 1.628  
#	include/linux/mmzone.h	1.19    -> 1.20   
#	include/linux/swap.h	1.57    -> 1.58   
#	     mm/page_alloc.c	1.98    -> 1.101  
#	         mm/vmscan.c	1.102   -> 1.105  
#
# The following is the BitKeeper ChangeSet Log
# --------------------------------------------
# 02/09/12	haveblue@elm3b96.(none)	1.626
# add per-zone kswapd
# --------------------------------------------
# 02/09/12	haveblue@elm3b96.(none)	1.627
# fix some wli-indicated formatting bits
# --------------------------------------------
# 02/09/12	haveblue@elm3b96.(none)	1.628
# move waitqueue init to a more appropriate place 
# --------------------------------------------
#
diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h
--- a/include/linux/mmzone.h	Thu Sep 12 20:24:39 2002
+++ b/include/linux/mmzone.h	Thu Sep 12 20:24:39 2002
@@ -108,6 +108,8 @@
 	unsigned long		wait_table_size;
 	unsigned long		wait_table_bits;
 
+	wait_queue_head_t       kswapd_wait;	
+	
 	/*
 	 * Discontig memory support fields.
 	 */
diff -Nru a/include/linux/swap.h b/include/linux/swap.h
--- a/include/linux/swap.h	Thu Sep 12 20:24:39 2002
+++ b/include/linux/swap.h	Thu Sep 12 20:24:39 2002
@@ -162,7 +162,6 @@
 extern void swap_setup(void);
 
 /* linux/mm/vmscan.c */
-extern wait_queue_head_t kswapd_wait;
 extern int try_to_free_pages(struct zone *, unsigned int, unsigned int);
 
 /* linux/mm/page_io.c */
diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c
--- a/mm/page_alloc.c	Thu Sep 12 20:24:39 2002
+++ b/mm/page_alloc.c	Thu Sep 12 20:24:39 2002
@@ -345,8 +345,15 @@
 	classzone->need_balance = 1;
 	mb();
 	/* we're somewhat low on memory, failed to find what we needed */
-	if (waitqueue_active(&kswapd_wait))
-		wake_up_interruptible(&kswapd_wait);
+	for (i = 0; zones[i] != NULL; i++) {
+		struct zone *z = zones[i];
+
+		/* We don't want to go swapping on zones that aren't actually
+		 * low.  This accounts for "incremental min" from last loop */
+		if (z->free_pages <= z->pages_low &&
+		    waitqueue_active(&z->kswapd_wait)) 
+			wake_up_interruptible(&z->kswapd_wait);
+	}
 
 	/* Go through the zonelist again, taking __GFP_HIGH into account */
 	min = 1UL << order;
@@ -874,6 +881,8 @@
 		for(i = 0; i < zone->wait_table_size; ++i)
 			init_waitqueue_head(zone->wait_table + i);
 
+		init_waitqueue_head(&zone->kswapd_wait);
+		
 		pgdat->nr_zones = j+1;
 
 		mask = (realsize / zone_balance_ratio[j]);
diff -Nru a/mm/vmscan.c b/mm/vmscan.c
--- a/mm/vmscan.c	Thu Sep 12 20:24:39 2002
+++ b/mm/vmscan.c	Thu Sep 12 20:24:39 2002
@@ -713,8 +713,6 @@
 	return 0;
 }
 
-DECLARE_WAIT_QUEUE_HEAD(kswapd_wait);
-
 static int check_classzone_need_balance(struct zone *classzone)
 {
 	struct zone *first_classzone;
@@ -728,71 +726,33 @@
 	return 1;
 }
 
-static int kswapd_balance_pgdat(pg_data_t * pgdat)
+static int kswapd_balance_zone(struct zone *zone)
 {
-	int need_more_balance = 0, i;
-	struct zone *zone;
-
-	for (i = pgdat->nr_zones-1; i >= 0; i--) {
-		zone = pgdat->node_zones + i;
+	int need_more_balance = 0;
+	
+	do {
 		cond_resched();
 		if (!zone->need_balance)
-			continue;
+			break;
 		if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) {
 			zone->need_balance = 0;
 			__set_current_state(TASK_INTERRUPTIBLE);
 			schedule_timeout(HZ);
-			continue;
+			break;
 		}
 		if (check_classzone_need_balance(zone))
 			need_more_balance = 1;
 		else
 			zone->need_balance = 0;
-	}
-
-	return need_more_balance;
-}
-
-static void kswapd_balance(void)
-{
-	int need_more_balance;
-	pg_data_t * pgdat;
-
-	do {
-		need_more_balance = 0;
-		pgdat = pgdat_list;
-		do
-			need_more_balance |= kswapd_balance_pgdat(pgdat);
-		while ((pgdat = pgdat->pgdat_next));
 	} while (need_more_balance);
-}
 
-static int kswapd_can_sleep_pgdat(pg_data_t * pgdat)
-{
-	struct zone *zone;
-	int i;
-
-	for (i = pgdat->nr_zones-1; i >= 0; i--) {
-		zone = pgdat->node_zones + i;
-		if (!zone->need_balance)
-			continue;
-		return 0;
-	}
-
-	return 1;
+	return 0;
 }
 
-static int kswapd_can_sleep(void)
+static int kswapd_can_sleep_zone(struct zone *zone)
 {
-	pg_data_t * pgdat;
-
-	pgdat = pgdat_list;
-	do {
-		if (kswapd_can_sleep_pgdat(pgdat))
-			continue;
-		return 0;
-	} while ((pgdat = pgdat->pgdat_next));
-
+	if (zone->need_balance)
+		return 0;	
 	return 1;
 }
 
@@ -809,13 +769,18 @@
  * If there are applications that are active memory-allocators
  * (most normal use), this basically shouldn't matter.
  */
-int kswapd(void *unused)
+int kswapd_zone(void *p)
 {
+	struct zone *zone = (struct zone *)p;
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
+	
+	printk( "kswapd%d starting for %s\n", 
+			zone - zone->zone_pgdat->node_zones, 
+			zone->name);
 
 	daemonize();
-	strcpy(tsk->comm, "kswapd");
+	sprintf(tsk->comm, "kswapd%d", zone - zone->zone_pgdat->node_zones);
 	sigfillset(&tsk->blocked);
 	
 	/*
@@ -839,30 +804,37 @@
 		if (current->flags & PF_FREEZE)
 			refrigerator(PF_IOTHREAD);
 		__set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue(&kswapd_wait, &wait);
+		add_wait_queue(&zone->kswapd_wait, &wait);
 
 		mb();
-		if (kswapd_can_sleep())
+		if (kswapd_can_sleep_zone(zone))
 			schedule();
 
 		__set_current_state(TASK_RUNNING);
-		remove_wait_queue(&kswapd_wait, &wait);
+		remove_wait_queue(&zone->kswapd_wait, &wait);
 
 		/*
 		 * If we actually get into a low-memory situation,
 		 * the processes needing more memory will wake us
 		 * up on a more timely basis.
 		 */
-		kswapd_balance();
+		kswapd_balance_zone(zone);
 		blk_run_queues();
 	}
 }
 
 static int __init kswapd_init(void)
 {
+	struct zone* zone;
+
 	printk("Starting kswapd\n");
 	swap_setup();
-	kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	for_each_zone(zone)
+		if (zone->size)
+			kernel_thread(kswapd_zone, 
+				      zone, 
+				      CLONE_FS | CLONE_FILES | CLONE_SIGNAL);
+	
 	return 0;
 }
 

^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [PATCH] recognize MAP_LOCKED in mmap() call
@ 2002-09-18 19:18 Mark_H_Johnson
  2002-09-18 19:39 ` Rik van Riel
                   ` (2 more replies)
  0 siblings, 3 replies; 28+ messages in thread
From: Mark_H_Johnson @ 2002-09-18 19:18 UTC (permalink / raw)
  To: Andrew Morton; +Cc: linux-kernel, linux-mm, owner-linux-mm


Andrew Morton wrote:
>(SuS really only anticipates that mmap needs to look at prior mlocks
>in force against the address range.  It also says
>
>     Process memory locking does apply to shared memory regions,
>
>and we don't do that either.  I think we should; can't see why SuS
>requires this.)

Let me make sure I read what you said correctly. Does this mean that Linux
2.4 (or 2.5) kernels do not lock shared memory regions if a process uses
mlockall?

If not, that is *really bad* for our real time applications. We don't want
to take a page fault while running some 80hz task, just because some
non-real time application tried to use what little physical memory we allow
for the kernel and all other applications.

I asked a related question about a week ago on linux-mm and didn't get a
response. Basically, I was concerned that top did not show RSS == Size when
mlockall(MCL_CURRENT|MCL_FUTURE) was called. Could this explain the
difference or is there something else that I'm missing here?

Thanks.
--Mark H Johnson
  <mailto:Mark_H_Johnson@raytheon.com>




^ permalink raw reply	[flat|nested] 28+ messages in thread
* Re: [PATCH] recognize MAP_LOCKED in mmap() call
@ 2002-09-25 16:57 Mark_H_Johnson
  0 siblings, 0 replies; 28+ messages in thread
From: Mark_H_Johnson @ 2002-09-25 16:57 UTC (permalink / raw)
  To: frankeh; +Cc: Andrew Morton, linux-kernel, linux-mm


>This is what the manpage says...
>
>       mlockall  disables  paging  for  all pages mapped into the
>       address space of the calling process.  This  includes  the
>       pages  of  the  code,  data  and stack segment, as well as
>       shared libraries, user space kernel  data,  shared  memory
>       and  memory  mapped files. All mapped pages are guaranteed
>       to be resident  in  RAM  when  the  mlockall  system  call
>       returns  successfully  and  they are guaranteed to stay in
>       RAM until the pages  are  unlocked  again  by  munlock  or
>       munlockall  or  until  the  process  terminates  or starts
>       another program with exec.  Child processes do not inherit
>       page locks across a fork.
>
>Do you read that all pages must be faulted in a priori?
>Or is it sufficient to make sure none of the currently mapped
>pages are swapped out and future swapout is prohibited.
>
The key phrase is that "...all mapped pages are guaranteed to be resident
in RAM when the mlockall system call returns successfully..." (third
sentence) In that way I would expect the segments containing the code,
heap, and current stack allocations to be resident. I do not expect
the full stack allocation (e.g., 2M for each thread if that is the
stack size) to be mapped (nor resident) unless I take special action
to grow the stack that large.

We happen to have special code to grow each stack and allocate heap
variables to account for what we expect to use prior to mlockall.

That does raise a question though - are there other segments (e.g.,
debug information) that may be in the total size calculations that
are mapped only when some special action is taken (e.g., I run the
debugger)? That would explain the difference - the measures I reported
on were with executables built with debug symbols.

That might also explain a possible problem we have had when trying
to debug such an application after an hour of run time or so. If
running gdb triggers a growth in locked memory (and we don't have
enough) - we would likely get an error condition that isn't normally
expected by gdb.

>This still allows for page faults on pages that have not been
>mapped in the specified range or process. If required the
>app could touch these and they wouldn't be swapped later.
>

I don't think touching the pages is enough - they have to be allocated
and the maps generated (e.g., calls to mmap, malloc). That is a possibly
expensive operation when real time is active and something we try to
avoid whenever possible.

--
--Mark H Johnson
  <mailto:Mark_H_Johnson@raytheon.com>



^ permalink raw reply	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2002-09-25 17:05 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2002-09-13  3:33 [PATCH] per-zone kswapd process Dave Hansen
2002-09-13  4:06 ` Andrew Morton
2002-09-13  4:59   ` William Lee Irwin III
2002-09-13  5:10     ` Martin J. Bligh
     [not found]       ` <3D8232DE.9090000@us.ibm.com>
     [not found]         ` <3D823702.8E29AB4F@digeo.com>
     [not found]           ` <3D8251D6.3060704@us.ibm.com>
     [not found]             ` <3D82566B.EB2939D5@digeo.com>
2002-09-13 22:52               ` [PATCH] per-zone^Wnode " Dave Hansen
2002-09-13 23:24                 ` Matthew Dobson
2002-09-13 23:29                 ` Matthew Dobson
2002-09-13 23:46                 ` William Lee Irwin III
2002-09-14  0:02                   ` Andrew Morton
2002-09-14  0:12                     ` William Lee Irwin III
2002-09-14  1:19                       ` Andrew Morton
2002-09-13  5:46     ` [PATCH] per-zone " Andrew Morton
2002-09-13  5:38       ` Martin J. Bligh
2002-09-13  6:03         ` Andrew Morton
2002-09-13 13:05     ` Alan Cox
2002-09-13 21:30       ` William Lee Irwin III
2002-09-18 16:07         ` [PATCH] recognize MAP_LOCKED in mmap() call Hubertus Franke
2002-09-18 16:29           ` Andrew Morton
2002-09-16  5:44     ` [PATCH] per-zone kswapd process Daniel Phillips
2002-09-16  7:46       ` William Lee Irwin III
2002-09-16 15:12         ` Rik van Riel
2002-09-18 19:18 [PATCH] recognize MAP_LOCKED in mmap() call Mark_H_Johnson
2002-09-18 19:39 ` Rik van Riel
2002-09-18 19:54 ` Andrew Morton
2002-09-25 15:42   ` Hubertus Franke
2002-09-25 16:35     ` Andrew Morton
2002-09-25 15:36 ` Hubertus Franke
2002-09-25 16:57 Mark_H_Johnson

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).