[-- Attachment #1: Type: text/plain, Size: 1119 bytes --] This patch implements a kswapd process for each memory zone. The original code came from Bill Irwin, but the current VM is quite a bit different from the one that he wrote it for, so not much remains. The current kswapd interface is much more simple than before because there is a single waitqueue and there is a single place where it is emptied. kswapd_can_sleep() and kswapd_balance() are simpler now that the extra pgdat level of indirection is gone. Tested on 8-way PIII with highmem off and then 4GB support. With 4GB support, I did 20 parallel greps through a 10GB fileset while some other processes allocated and freed 1-2GB chunks of memory. That gave kswapd a good workout, and I observed it running the zone Highmem and zone Normal kswapd threads. So, it survives my torture test. It also removes more code than it adds. include/linux/mmzone.h | 2 + include/linux/swap.h | 1 mm/page_alloc.c | 11 +++++- mm/vmscan.c | 88 +++++++++++++++++-------------------------------- 4 files changed, 42 insertions(+), 60 deletions(-) -- Dave Hansen haveblue@us.ibm.com [-- Attachment #2: per-zone-kswapd-2.5.34-mm2-3.patch --] [-- Type: text/plain, Size: 6092 bytes --] # This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. 
# This patch includes the following deltas: # ChangeSet 1.625 -> 1.628 # include/linux/mmzone.h 1.19 -> 1.20 # include/linux/swap.h 1.57 -> 1.58 # mm/page_alloc.c 1.98 -> 1.101 # mm/vmscan.c 1.102 -> 1.105 # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 02/09/12 haveblue@elm3b96.(none) 1.626 # add per-zone kswapd # -------------------------------------------- # 02/09/12 haveblue@elm3b96.(none) 1.627 # fix some wli-indicated formatting bits # -------------------------------------------- # 02/09/12 haveblue@elm3b96.(none) 1.628 # move waitqueue init to a more appropriate place # -------------------------------------------- # diff -Nru a/include/linux/mmzone.h b/include/linux/mmzone.h --- a/include/linux/mmzone.h Thu Sep 12 20:24:39 2002 +++ b/include/linux/mmzone.h Thu Sep 12 20:24:39 2002 @@ -108,6 +108,8 @@ unsigned long wait_table_size; unsigned long wait_table_bits; + wait_queue_head_t kswapd_wait; + /* * Discontig memory support fields. */ diff -Nru a/include/linux/swap.h b/include/linux/swap.h --- a/include/linux/swap.h Thu Sep 12 20:24:39 2002 +++ b/include/linux/swap.h Thu Sep 12 20:24:39 2002 @@ -162,7 +162,6 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern wait_queue_head_t kswapd_wait; extern int try_to_free_pages(struct zone *, unsigned int, unsigned int); /* linux/mm/page_io.c */ diff -Nru a/mm/page_alloc.c b/mm/page_alloc.c --- a/mm/page_alloc.c Thu Sep 12 20:24:39 2002 +++ b/mm/page_alloc.c Thu Sep 12 20:24:39 2002 @@ -345,8 +345,15 @@ classzone->need_balance = 1; mb(); /* we're somewhat low on memory, failed to find what we needed */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + + /* We don't want to go swapping on zones that aren't actually + * low. 
This accounts for "incremental min" from last loop */ + if (z->free_pages <= z->pages_low && + waitqueue_active(&z->kswapd_wait)) + wake_up_interruptible(&z->kswapd_wait); + } /* Go through the zonelist again, taking __GFP_HIGH into account */ min = 1UL << order; @@ -874,6 +881,8 @@ for(i = 0; i < zone->wait_table_size; ++i) init_waitqueue_head(zone->wait_table + i); + init_waitqueue_head(&zone->kswapd_wait); + pgdat->nr_zones = j+1; mask = (realsize / zone_balance_ratio[j]); diff -Nru a/mm/vmscan.c b/mm/vmscan.c --- a/mm/vmscan.c Thu Sep 12 20:24:39 2002 +++ b/mm/vmscan.c Thu Sep 12 20:24:39 2002 @@ -713,8 +713,6 @@ return 0; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - static int check_classzone_need_balance(struct zone *classzone) { struct zone *first_classzone; @@ -728,71 +726,33 @@ return 1; } -static int kswapd_balance_pgdat(pg_data_t * pgdat) +static int kswapd_balance_zone(struct zone *zone) { - int need_more_balance = 0, i; - struct zone *zone; - - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; + int need_more_balance = 0; + + do { cond_resched(); if (!zone->need_balance) - continue; + break; if (!try_to_free_pages(zone, GFP_KSWAPD, 0)) { zone->need_balance = 0; __set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ); - continue; + break; } if (check_classzone_need_balance(zone)) need_more_balance = 1; else zone->need_balance = 0; - } - - return need_more_balance; -} - -static void kswapd_balance(void) -{ - int need_more_balance; - pg_data_t * pgdat; - - do { - need_more_balance = 0; - pgdat = pgdat_list; - do - need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->pgdat_next)); } while (need_more_balance); -} -static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) -{ - struct zone *zone; - int i; - - for (i = pgdat->nr_zones-1; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!zone->need_balance) - continue; - return 0; - } - - return 1; + return 0; } -static int kswapd_can_sleep(void) +static int 
kswapd_can_sleep_zone(struct zone *zone) { - pg_data_t * pgdat; - - pgdat = pgdat_list; - do { - if (kswapd_can_sleep_pgdat(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->pgdat_next)); - + if (zone->need_balance) + return 0; return 1; } @@ -809,13 +769,18 @@ * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ -int kswapd(void *unused) +int kswapd_zone(void *p) { + struct zone *zone = (struct zone *)p; struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); + + printk( "kswapd%d starting for %s\n", + zone - zone->zone_pgdat->node_zones, + zone->name); daemonize(); - strcpy(tsk->comm, "kswapd"); + sprintf(tsk->comm, "kswapd%d", zone - zone->zone_pgdat->node_zones); sigfillset(&tsk->blocked); /* @@ -839,30 +804,37 @@ if (current->flags & PF_FREEZE) refrigerator(PF_IOTHREAD); __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + add_wait_queue(&zone->kswapd_wait, &wait); mb(); - if (kswapd_can_sleep()) + if (kswapd_can_sleep_zone(zone)) schedule(); __set_current_state(TASK_RUNNING); - remove_wait_queue(&kswapd_wait, &wait); + remove_wait_queue(&zone->kswapd_wait, &wait); /* * If we actually get into a low-memory situation, * the processes needing more memory will wake us * up on a more timely basis. */ - kswapd_balance(); + kswapd_balance_zone(zone); blk_run_queues(); } } static int __init kswapd_init(void) { + struct zone* zone; + printk("Starting kswapd\n"); swap_setup(); - kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + for_each_zone(zone) + if (zone->size) + kernel_thread(kswapd_zone, + zone, + CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + return 0; }
Dave Hansen wrote:
>
> This patch implements a kswapd process for each memory zone.
I still don't see why it's per zone and not per node. It seems strange
that a wee little laptop would be running two kswapds?
kswapd can get a ton of work done in the development VM and one per
node would, I expect, suffice?
Also, I'm wondering why the individual kernel threads don't have
their affinity masks set to make them run on the CPUs to which the
zone (or zones) are local?
Isn't it the case that with this code you could end up with a kswapd
on node 0 crunching on node 1's pages while a kswapd on node 1 crunches
on node 0's pages?
If I'm not totally out to lunch on this, I'd have thought that a
better approach would be
int sys_kswapd(int nid)
{
return kernel_thread(kswapd, ...);
}
Userspace could then set up the CPU affinity based on some topology
or config information and would then parent a kswapd instance. That
kswapd instance would then be bound to the CPUs which were on the
node identified by `nid'.
Or something like that?
On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote: > I still don't see why it's per zone and not per node. It seems strange > that a wee little laptop would be running two kswapds? > kswapd can get a ton of work done in the development VM and one per > node would, I expect, suffice? Machines without observable NUMA effects can benefit from it if it's per-zone. It also follows that if there's more than one task doing this, page replacement is less likely to block entirely. Last, but not least, when I devised it, "per-zone" was the theme. On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote: > Also, I'm wondering why the individual kernel threads don't have > their affinity masks set to make them run on the CPUs to which the > zone (or zones) are local? > Isn't it the case that with this code you could end up with a kswapd > on node 0 crunching on node 1's pages while a kswapd on node 1 crunches > on node 0's pages? Without some architecture-neutral method of topology detection, there's no way to do this. A follow-up when it's there should fix it. On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote: > If I'm not totally out to lunch on this, I'd have thought that a > better approach would be > int sys_kswapd(int nid) > { > return kernel_thread(kswapd, ...); > } > Userspace could then set up the CPU affinity based on some topology > or config information and would then parent a kswapd instance. That > kswapd instance would then be bound to the CPUs which were on the > node identified by `nid'. > Or something like that? I'm very very scared of handing things like that to userspace, largely because I don't trust userspace at all. At this point, we need to enumerate nodes and provide a cpu to node correspondence to userspace, and the kernel can obey, aside from the question of "What do we do if we need to scan a node without a kswapd started yet?". I think mbligh recently got the long-needed arch code in for cpu to node... 
But I'm just not able to make the leap of faith that memory detection is something that can ever comfortably be given to userspace. Cheers, Bill
>> Also, I'm wondering why the individual kernel threads don't have >> their affinity masks set to make them run on the CPUs to which the >> zone (or zones) are local? >> Isn't it the case that with this code you could end up with a kswapd >> on node 0 crunching on node 1's pages while a kswapd on node 1 crunches >> on node 0's pages? > > Without some architecture-neutral method of topology detection, there's > no way to do this. A follow-up when it's there should fix it. Every discontigmem arch should implement cpu_to_node, with a generic fallback mechanism that returns 0 or something. Not that we do right now, but that's easy to fix. There should also be a node_to_cpus call that returns a bitmask of which cpus are in that node. Matt ... want to sneak in the first bit of the topology patch, or whatever lump this fell under? Seems like an appropriate juncture. We have the code already somewhere, just need to fish it out. >> If I'm not totally out to lunch on this, I'd have thought that a >> better approach would be >> int sys_kswapd(int nid) >> { >> return kernel_thread(kswapd, ...); >> } >> Userspace could then set up the CPU affinity based on some topology >> or config information and would then parent a kswapd instance. That >> kswapd instance would then be bound to the CPUs which were on the >> node identified by `nid'. >> Or something like that? > > I'm very very scared of handing things like that to userspace, largely > because I don't trust userspace at all. > > At this point, we need to enumerate nodes and provide a cpu to node > correspondence to userspace, and the kernel can obey, aside from the > question of "What do we do if we need to scan a node without a kswapd > started yet?". I think mbligh recently got the long-needed arch code in > for cpu to node... But I'm just not able to make the leap of faith that > memory detection is something that can ever comfortably be given to > userspace. 
I don't think the userspace stuff is necessary - we can do this all in the kernel dead easily I think. Just need a couple of definitions, which are trivially small functions. M.
> Sorry, I don't buy that. > > a) It does not need to be architecture neutral. > > b) You surely need a way of communicating the discovered topology > to userspace anyway. > > c) $EDITOR /etc/numa-layout.conf > > d) $EDITOR /etc/kswapd.conf I guess you could do that, but it seems overly complicated to me. >> I think mbligh recently got the long-needed arch code in >> for cpu to node... But I'm just not able to make the leap of faith that >> memory detection is something that can ever comfortably be given to >> userspace. > > A simple syscall which allows you to launch a kswapd instance against > a group of zones on any group of CPUs provides complete generality > and flexibility to userspace. And it is architecture neutral. > > If it really is incredibly hard to divine the topology from userspace > then you need to fix that up. Provide the topology to userspace. > Which has the added benefit of providing, umm, the topology to userspace ;) Can we make a simple default of 1 per node, which is what 99% of people want, and then make it more complicated later if people complain? It's really pretty easy: for (node = 0; node < numnodes; ++node) { kswapd = kick_off_kswapd_for_node(node); kswapd->cpus_allowed = node_to_cpus(node); } Or whatever the current cpus_allowed method is. All we seem to need is node_to_cpus ... I can give that to you tomorrow with no problem, it's trivial. M.
William Lee Irwin III wrote: > > On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote: > > I still don't see why it's per zone and not per node. It seems strange > > that a wee little laptop would be running two kswapds? > > kswapd can get a ton of work done in the development VM and one per > > node would, I expect, suffice? > > Machines without observable NUMA effects can benefit from it if it's > per-zone. It also follows that if there's more than one task doing this, > page replacement is less likely to block entirely. Last, but not least, > when I devised it, "per-zone" was the theme. Maybe, marginally. You could pass a gfp mask to sys_kswapd to select the zones if that's really a benefit. But if this _is_ a benefit then it's a VM bug. Because if a single kswapd cannot service three zones then it cannot service one zone. (Maybe. We need to do per-zone throttling soon to fix your OOM problems properly, but then, that shouldn't throttle kswapd). > On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote: > > Also, I'm wondering why the individual kernel threads don't have > > their affinity masks set to make them run on the CPUs to which the > > zone (or zones) are local? > > Isn't it the case that with this code you could end up with a kswapd > > on node 0 crunching on node 1's pages while a kswapd on node 1 crunches > > on node 0's pages? > > Without some architecture-neutral method of topology detection, there's > no way to do this. A follow-up when it's there should fix it. Sorry, I don't buy that. a) It does not need to be architecture neutral. b) You surely need a way of communicating the discovered topology to userspace anyway. 
c) $EDITOR /etc/numa-layout.conf d) $EDITOR /etc/kswapd.conf > On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote: > > If I'm not totally out to lunch on this, I'd have thought that a > > better approach would be > > int sys_kswapd(int nid) > > { > > return kernel_thread(kswapd, ...); > > } > > Userspace could then set up the CPU affinity based on some topology > > or config information and would then parent a kswapd instance. That > > kswapd instance would then be bound to the CPUs which were on the > > node identified by `nid'. > > Or something like that? > > I'm very very scared of handing things like that to userspace, largely > because I don't trust userspace at all. Me either. I've seen workloads in which userspace consumes over 50% of the CPU resources. It should be banned! > At this point, we need to enumerate nodes and provide a cpu to node > correspondence to userspace, and the kernel can obey, aside from the > question of "What do we do if we need to scan a node without a kswapd > started yet?". kswapd is completely optional. Put a `do_exit(0)' into the current one and watch. You'll get crappy dbench numbers, but it stays up. > I think mbligh recently got the long-needed arch code in > for cpu to node... But I'm just not able to make the leap of faith that > memory detection is something that can ever comfortably be given to > userspace. A simple syscall which allows you to launch a kswapd instance against a group of zones on any group of CPUs provides complete generality and flexibility to userspace. And it is architecture neutral. If it really is incredibly hard to divine the topology from userspace then you need to fix that up. Provide the topology to userspace. Which has the added benefit of providing, umm, the topology to userspace ;)
"Martin J. Bligh" wrote: > > .. > Can we make a simple default of 1 per node, which is what 99% > of people want, and then make it more complicated later if people > complain? It's really pretty easy: > > for (node = 0; node < numnodes; ++node) { > kswapd = kick_off_kswapd_for_node(node); > kswapd->cpus_allowed = node_to_cpus(node); > } Seems sane. > Or whatever the current cpus_allowed method is. All we seem to need > is node_to_cpus ... I can give that to you tomorrow with no problem, > it's trivial. Tomorrow sounds too early - it'd be nice to get some before-n-after performance testing to go along with that patch ;)
On Fri, 2002-09-13 at 05:59, William Lee Irwin III wrote:
> Machines without observable NUMA effects can benefit from it if it's
> per-zone. It also follows that if there's more than one task doing this,
> page replacement is less likely to block entirely. Last, but not least,
> when I devised it, "per-zone" was the theme.
It will also increase the amount of disk head thrashing surely ?
On Fri, 2002-09-13 at 05:59, William Lee Irwin III wrote: >> Machines without observable NUMA effects can benefit from it if it's >> per-zone. It also follows that if there's more than one task doing this, >> page replacement is less likely to block entirely. Last, but not least, >> when I devised it, "per-zone" was the theme. On Fri, Sep 13, 2002 at 02:05:52PM +0100, Alan Cox wrote: > It will also increase the amount of disk head thrashing surely ? I doubt it. Writeout isn't really supposed to happen there in 2.4 either, except under duress. OTOH I've not been doing much with this directly since rmap10c. Cheers, Bill
[-- Attachment #1: Type: text/plain, Size: 233 bytes --] Here's a per-node kswapd. It's actually per-pg_data_t, but I guess that they're equivalent. Matt is going to follow up his topology API with something to bind these to their respective nodes. -- Dave Hansen haveblue@us.ibm.com [-- Attachment #2: per-node-kswapd-2.5.34-mm2-0.patch --] [-- Type: text/plain, Size: 4764 bytes --] diff -ur linux-2.5.34-mm2-clean/include/linux/mmzone.h linux-2.5.34-mm2-per-node-kswapd/include/linux/mmzone.h --- linux-2.5.34-mm2-clean/include/linux/mmzone.h Fri Sep 13 14:32:02 2002 +++ linux-2.5.34-mm2-per-node-kswapd/include/linux/mmzone.h Fri Sep 13 15:07:02 2002 @@ -168,6 +168,7 @@ unsigned long node_size; int node_id; struct pglist_data *pgdat_next; + wait_queue_head_t kswapd_wait; } pg_data_t; extern int numnodes; diff -ur linux-2.5.34-mm2-clean/include/linux/swap.h linux-2.5.34-mm2-per-node-kswapd/include/linux/swap.h --- linux-2.5.34-mm2-clean/include/linux/swap.h Fri Sep 13 14:32:02 2002 +++ linux-2.5.34-mm2-per-node-kswapd/include/linux/swap.h Fri Sep 13 15:05:32 2002 @@ -162,7 +162,6 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern wait_queue_head_t kswapd_wait; extern int try_to_free_pages(struct zone *, unsigned int, unsigned int); /* linux/mm/page_io.c */ diff -ur linux-2.5.34-mm2-clean/mm/page_alloc.c linux-2.5.34-mm2-per-node-kswapd/mm/page_alloc.c --- linux-2.5.34-mm2-clean/mm/page_alloc.c Fri Sep 13 14:32:15 2002 +++ linux-2.5.34-mm2-per-node-kswapd/mm/page_alloc.c Fri Sep 13 15:09:42 2002 @@ -345,8 +345,12 @@ classzone->need_balance = 1; mb(); /* we're somewhat low on memory, failed to find what we needed */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + if (z->free_pages <= z->pages_low && + waitqueue_active(&z->zone_pgdat->kswapd_wait)) + wake_up_interruptible(&z->zone_pgdat->kswapd_wait); + } /* Go through the zonelist again, taking __GFP_HIGH into 
account */ min = 1UL << order; @@ -833,6 +837,8 @@ unsigned long zone_start_pfn = pgdat->node_start_pfn; pgdat->nr_zones = 0; + init_waitqueue_head(&pgdat->kswapd_wait); + local_offset = 0; /* offset within lmem_map */ for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; diff -ur linux-2.5.34-mm2-clean/mm/vmscan.c linux-2.5.34-mm2-per-node-kswapd/mm/vmscan.c --- linux-2.5.34-mm2-clean/mm/vmscan.c Fri Sep 13 14:32:15 2002 +++ linux-2.5.34-mm2-per-node-kswapd/mm/vmscan.c Fri Sep 13 15:06:18 2002 @@ -713,8 +713,6 @@ return 0; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - static int check_classzone_need_balance(struct zone *classzone) { struct zone *first_classzone; @@ -753,20 +751,6 @@ return need_more_balance; } -static void kswapd_balance(void) -{ - int need_more_balance; - pg_data_t * pgdat; - - do { - need_more_balance = 0; - pgdat = pgdat_list; - do - need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->pgdat_next)); - } while (need_more_balance); -} - static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) { struct zone *zone; @@ -774,28 +758,13 @@ for (i = pgdat->nr_zones-1; i >= 0; i--) { zone = pgdat->node_zones + i; - if (!zone->need_balance) - continue; - return 0; + if (zone->need_balance) + return 0; } return 1; } -static int kswapd_can_sleep(void) -{ - pg_data_t * pgdat; - - pgdat = pgdat_list; - do { - if (kswapd_can_sleep_pgdat(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->pgdat_next)); - - return 1; -} - /* * The background pageout daemon, started as a kernel thread * from the init process. @@ -809,13 +778,14 @@ * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. 
*/ -int kswapd(void *unused) +int kswapd(void *p) { + pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); daemonize(); - strcpy(tsk->comm, "kswapd"); + sprintf(tsk->comm, "kswapd%d", pgdat->node_id); sigfillset(&tsk->blocked); /* @@ -839,30 +809,34 @@ if (current->flags & PF_FREEZE) refrigerator(PF_IOTHREAD); __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + add_wait_queue(&pgdat->kswapd_wait, &wait); mb(); - if (kswapd_can_sleep()) + if (kswapd_can_sleep_pgdat(pgdat)) schedule(); __set_current_state(TASK_RUNNING); - remove_wait_queue(&kswapd_wait, &wait); + remove_wait_queue(&pgdat->kswapd_wait, &wait); /* * If we actually get into a low-memory situation, * the processes needing more memory will wake us * up on a more timely basis. */ - kswapd_balance(); + kswapd_balance_pgdat(pgdat); blk_run_queues(); } } static int __init kswapd_init(void) { + pg_data_t *pgdat; printk("Starting kswapd\n"); swap_setup(); - kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + for_each_pgdat(pgdat) + kernel_thread(kswapd, + pgdat, + CLONE_FS | CLONE_FILES | CLONE_SIGNAL); return 0; }
[-- Attachment #1: Type: text/plain, Size: 413 bytes --] Dave Hansen wrote: > Here's a per-node kswapd. It's actually per-pg_data_t, but I guess that > they're equivalent. Matt is going to follow up his topology API with > something to bind these to their respective nodes. Yep.. Here it is... rolled together with dave's patch.. It is only a 2 line addition. It applies cleanly on top of mm3 and topology (which I'll be resending momentarily). Cheers! -Matt [-- Attachment #2: per_node_kswapd.patch --] [-- Type: text/plain, Size: 5107 bytes --] diff -Nur linux-2.5.34-mm3+topo/include/linux/mmzone.h linux-2.5.34-mm3+topo+kswapd/include/linux/mmzone.h --- linux-2.5.34-mm3+topo/include/linux/mmzone.h Fri Sep 13 14:59:21 2002 +++ linux-2.5.34-mm3+topo+kswapd/include/linux/mmzone.h Fri Sep 13 16:06:50 2002 @@ -168,6 +168,7 @@ unsigned long node_size; int node_id; struct pglist_data *pgdat_next; + wait_queue_head_t kswapd_wait; } pg_data_t; extern int numnodes; diff -Nur linux-2.5.34-mm3+topo/include/linux/swap.h linux-2.5.34-mm3+topo+kswapd/include/linux/swap.h --- linux-2.5.34-mm3+topo/include/linux/swap.h Mon Sep 9 10:35:01 2002 +++ linux-2.5.34-mm3+topo+kswapd/include/linux/swap.h Fri Sep 13 16:06:50 2002 @@ -162,7 +162,6 @@ extern void swap_setup(void); /* linux/mm/vmscan.c */ -extern wait_queue_head_t kswapd_wait; extern int try_to_free_pages(struct zone *, unsigned int, unsigned int); /* linux/mm/page_io.c */ diff -Nur linux-2.5.34-mm3+topo/mm/page_alloc.c linux-2.5.34-mm3+topo+kswapd/mm/page_alloc.c --- linux-2.5.34-mm3+topo/mm/page_alloc.c Fri Sep 13 14:58:05 2002 +++ linux-2.5.34-mm3+topo+kswapd/mm/page_alloc.c Fri Sep 13 16:06:50 2002 @@ -346,8 +346,12 @@ classzone->need_balance = 1; mb(); /* we're somewhat low on memory, failed to find what we needed */ - if (waitqueue_active(&kswapd_wait)) - wake_up_interruptible(&kswapd_wait); + for (i = 0; zones[i] != NULL; i++) { + struct zone *z = zones[i]; + if (z->free_pages <= z->pages_low && + 
waitqueue_active(&z->zone_pgdat->kswapd_wait)) + wake_up_interruptible(&z->zone_pgdat->kswapd_wait); + } /* Go through the zonelist again, taking __GFP_HIGH into account */ min = 1UL << order; @@ -845,6 +849,8 @@ unsigned long zone_start_pfn = pgdat->node_start_pfn; pgdat->nr_zones = 0; + init_waitqueue_head(&pgdat->kswapd_wait); + local_offset = 0; /* offset within lmem_map */ for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; diff -Nur linux-2.5.34-mm3+topo/mm/vmscan.c linux-2.5.34-mm3+topo+kswapd/mm/vmscan.c --- linux-2.5.34-mm3+topo/mm/vmscan.c Fri Sep 13 14:58:05 2002 +++ linux-2.5.34-mm3+topo+kswapd/mm/vmscan.c Fri Sep 13 16:11:26 2002 @@ -28,10 +28,11 @@ #include <linux/pagevec.h> #include <linux/backing-dev.h> #include <linux/rmap-locking.h> +#include <linux/swapops.h> #include <asm/pgalloc.h> #include <asm/tlbflush.h> -#include <linux/swapops.h> +#include <asm/topology.h> /* * The "priority" of VM scanning is how much of the queues we @@ -713,8 +714,6 @@ return 0; } -DECLARE_WAIT_QUEUE_HEAD(kswapd_wait); - static int check_classzone_need_balance(struct zone *classzone) { struct zone *first_classzone; @@ -753,20 +752,6 @@ return need_more_balance; } -static void kswapd_balance(void) -{ - int need_more_balance; - pg_data_t * pgdat; - - do { - need_more_balance = 0; - pgdat = pgdat_list; - do - need_more_balance |= kswapd_balance_pgdat(pgdat); - while ((pgdat = pgdat->pgdat_next)); - } while (need_more_balance); -} - static int kswapd_can_sleep_pgdat(pg_data_t * pgdat) { struct zone *zone; @@ -774,28 +759,13 @@ for (i = pgdat->nr_zones-1; i >= 0; i--) { zone = pgdat->node_zones + i; - if (!zone->need_balance) - continue; - return 0; + if (zone->need_balance) + return 0; } return 1; } -static int kswapd_can_sleep(void) -{ - pg_data_t * pgdat; - - pgdat = pgdat_list; - do { - if (kswapd_can_sleep_pgdat(pgdat)) - continue; - return 0; - } while ((pgdat = pgdat->pgdat_next)); - - return 1; -} - /* * The background pageout daemon, 
started as a kernel thread * from the init process. @@ -809,13 +779,16 @@ * If there are applications that are active memory-allocators * (most normal use), this basically shouldn't matter. */ -int kswapd(void *unused) +int kswapd(void *p) { + pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); + set_cpus_allowed(tsk, __node_to_cpu_mask(p->node_id)); + daemonize(); - strcpy(tsk->comm, "kswapd"); + sprintf(tsk->comm, "kswapd%d", pgdat->node_id); sigfillset(&tsk->blocked); /* @@ -839,30 +812,34 @@ if (current->flags & PF_FREEZE) refrigerator(PF_IOTHREAD); __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&kswapd_wait, &wait); + add_wait_queue(&pgdat->kswapd_wait, &wait); mb(); - if (kswapd_can_sleep()) + if (kswapd_can_sleep_pgdat(pgdat)) schedule(); __set_current_state(TASK_RUNNING); - remove_wait_queue(&kswapd_wait, &wait); + remove_wait_queue(&pgdat->kswapd_wait, &wait); /* * If we actually get into a low-memory situation, * the processes needing more memory will wake us * up on a more timely basis. */ - kswapd_balance(); + kswapd_balance_pgdat(pgdat); blk_run_queues(); } } static int __init kswapd_init(void) { + pg_data_t *pgdat; printk("Starting kswapd\n"); swap_setup(); - kernel_thread(kswapd, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); + for_each_pgdat(pgdat) + kernel_thread(kswapd, + pgdat, + CLONE_FS | CLONE_FILES | CLONE_SIGNAL); return 0; }
[-- Attachment #1: Type: text/plain, Size: 172 bytes --] As promised, here is the latest and greatest (heh) topology API patch. Should apply cleanly on top of mm3. As always, comment, criticize, and flame away! Cheers -Matt [-- Attachment #2: simple_topo-v0.5-in_kernel-2.5.34-mm3.patch --] [-- Type: text/plain, Size: 14861 bytes --] diff -Nur linux-2.5.34-mm3/include/asm-alpha/mmzone.h linux-2.5.34-mm3+topo/include/asm-alpha/mmzone.h --- linux-2.5.34-mm3/include/asm-alpha/mmzone.h Fri Sep 13 14:58:03 2002 +++ linux-2.5.34-mm3+topo/include/asm-alpha/mmzone.h Fri Sep 13 14:59:21 2002 @@ -107,15 +107,6 @@ #ifdef CONFIG_NUMA_SCHED #define NODE_SCHEDULE_DATA(nid) (&((PLAT_NODE_DATA(nid))->schedule_data)) #endif - -#ifdef CONFIG_ALPHA_WILDFIRE -/* With wildfire assume 4 CPUs per node */ -#define cputonode(cpu) ((cpu) >> 2) -#else -#define cputonode(cpu) 0 -#endif /* CONFIG_ALPHA_WILDFIRE */ - -#define numa_node_id() cputonode(smp_processor_id()) #endif /* CONFIG_NUMA */ #endif /* CONFIG_DISCONTIGMEM */ diff -Nur linux-2.5.34-mm3/include/asm-alpha/topology.h linux-2.5.34-mm3+topo/include/asm-alpha/topology.h --- linux-2.5.34-mm3/include/asm-alpha/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-alpha/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,14 @@ +#ifndef _ASM_ALPHA_TOPOLOGY_H +#define _ASM_ALPHA_TOPOLOGY_H + +#ifdef CONFIG_NUMA +#ifdef CONFIG_ALPHA_WILDFIRE +/* With wildfire assume 4 CPUs per node */ +#define __cpu_to_node(cpu) ((cpu) >> 2) +#endif /* CONFIG_ALPHA_WILDFIRE */ +#endif /* CONFIG_NUMA */ + +/* Get the rest of the topology definitions */ +#include <asm-generic/topology.h> + +#endif /* _ASM_ALPHA_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-arm/topology.h linux-2.5.34-mm3+topo/include/asm-arm/topology.h --- linux-2.5.34-mm3/include/asm-arm/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-arm/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_ARM_TOPOLOGY_H +#define 
_ASM_ARM_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_ARM_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-cris/topology.h linux-2.5.34-mm3+topo/include/asm-cris/topology.h --- linux-2.5.34-mm3/include/asm-cris/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-cris/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_CRIS_TOPOLOGY_H +#define _ASM_CRIS_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_CRIS_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-generic/topology.h linux-2.5.34-mm3+topo/include/asm-generic/topology.h --- linux-2.5.34-mm3/include/asm-generic/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-generic/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,51 @@ +/* + * linux/include/asm-generic/topology.h + * + * Written by: Matthew Dobson, IBM Corporation + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send feedback to <colpatch@us.ibm.com> + */ +#ifndef _ASM_GENERIC_TOPOLOGY_H +#define _ASM_GENERIC_TOPOLOGY_H + +/* Other architectures wishing to use this simple topology API should fill + in the below functions as appropriate in their own <asm/topology.h> file. 
*/ +#ifndef __cpu_to_node +#define __cpu_to_node(cpu) (0) +#endif +#ifndef __memblk_to_node +#define __memblk_to_node(memblk) (0) +#endif +#ifndef __parent_node +#define __parent_node(nid) (0) +#endif +#ifndef __node_to_first_cpu +#define __node_to_first_cpu(node) (0) +#endif +#ifndef __node_to_cpu_mask +#define __node_to_cpu_mask(node) (cpu_online_map) +#endif +#ifndef __node_to_memblk +#define __node_to_memblk(node) (0) +#endif + +#endif /* _ASM_GENERIC_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-i386/topology.h linux-2.5.34-mm3+topo/include/asm-i386/topology.h --- linux-2.5.34-mm3/include/asm-i386/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-i386/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,92 @@ +/* + * linux/include/asm-i386/topology.h + * + * Written by: Matthew Dobson, IBM Corporation + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ * + * Send feedback to <colpatch@us.ibm.com> + */ +#ifndef _ASM_I386_TOPOLOGY_H +#define _ASM_I386_TOPOLOGY_H + +#ifdef CONFIG_X86_NUMAQ + +#include <asm/smpboot.h> + +/* Returns the number of the node containing CPU 'cpu' */ +#define __cpu_to_node(cpu) (cpu_to_logical_apicid(cpu) >> 4) + +/* Returns the number of the node containing MemBlk 'memblk' */ +#define __memblk_to_node(memblk) (memblk) + +/* Returns the number of the node containing Node 'nid'. This architecture is flat, + so it is a pretty simple function! */ +#define __parent_node(nid) (nid) + +/* Returns the number of the first CPU on Node 'node'. + * This should be changed to a set of cached values + * but this will do for now. + */ +static inline int __node_to_first_cpu(int node) +{ + int i, cpu, logical_apicid = node << 4; + + for(i = 1; i < 16; i <<= 1) + /* check to see if the cpu is in the system */ + if ((cpu = logical_apicid_to_cpu(logical_apicid | i)) >= 0) + /* if yes, return it to caller */ + return cpu; + + return 0; +} + +/* Returns a bitmask of CPUs on Node 'node'. + * This should be changed to a set of cached bitmasks + * but this will do for now. + */ +static inline unsigned long __node_to_cpu_mask(int node) +{ + int i, cpu, logical_apicid = node << 4; + unsigned long mask; + + for(i = 1; i < 16; i <<= 1) + /* check to see if the cpu is in the system */ + if ((cpu = logical_apicid_to_cpu(logical_apicid | i)) >= 0) + /* if yes, add to bitmask */ + mask |= 1 << cpu; + + return mask; +} + +/* Returns the number of the first MemBlk on Node 'node' */ +#define __node_to_memblk(node) (node) + +#else /* !CONFIG_X86_NUMAQ */ +/* + * Other i386 platforms should define their own version of the + * above macros here. 
+ */ + +#include <asm-generic/topology.h> + +#endif /* CONFIG_X86_NUMAQ */ + +#endif /* _ASM_I386_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-ia64/topology.h linux-2.5.34-mm3+topo/include/asm-ia64/topology.h --- linux-2.5.34-mm3/include/asm-ia64/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-ia64/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_IA64_TOPOLOGY_H +#define _ASM_IA64_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_IA64_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-m68k/topology.h linux-2.5.34-mm3+topo/include/asm-m68k/topology.h --- linux-2.5.34-mm3/include/asm-m68k/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-m68k/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_M68K_TOPOLOGY_H +#define _ASM_M68K_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_M68K_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-mips/topology.h linux-2.5.34-mm3+topo/include/asm-mips/topology.h --- linux-2.5.34-mm3/include/asm-mips/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-mips/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_MIPS_TOPOLOGY_H +#define _ASM_MIPS_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_MIPS_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-mips64/mmzone.h linux-2.5.34-mm3+topo/include/asm-mips64/mmzone.h --- linux-2.5.34-mm3/include/asm-mips64/mmzone.h Fri Sep 13 14:58:03 2002 +++ linux-2.5.34-mm3+topo/include/asm-mips64/mmzone.h Fri Sep 13 14:59:21 2002 @@ -28,8 +28,6 @@ #define PLAT_NODE_DATA_LOCALNR(p, n) \ (((p) >> PAGE_SHIFT) - PLAT_NODE_DATA(n)->gendata.node_start_pfn) -#define numa_node_id() cputocnode(current->processor) - #ifdef CONFIG_DISCONTIGMEM /* diff -Nur linux-2.5.34-mm3/include/asm-mips64/topology.h linux-2.5.34-mm3+topo/include/asm-mips64/topology.h --- linux-2.5.34-mm3/include/asm-mips64/topology.h Wed Dec 31 16:00:00 1969 +++ 
linux-2.5.34-mm3+topo/include/asm-mips64/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,11 @@ +#ifndef _ASM_MIPS64_TOPOLOGY_H +#define _ASM_MIPS64_TOPOLOGY_H + +#include <asm/mmzone.h> + +#define __cpu_to_node(cpu) (cputocnode(cpu)) + +/* Get the rest of the topology definitions */ +#include <asm-generic/topology.h> + +#endif /* _ASM_MIPS64_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-parisc/topology.h linux-2.5.34-mm3+topo/include/asm-parisc/topology.h --- linux-2.5.34-mm3/include/asm-parisc/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-parisc/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_PARISC_TOPOLOGY_H +#define _ASM_PARISC_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_PARISC_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-ppc/topology.h linux-2.5.34-mm3+topo/include/asm-ppc/topology.h --- linux-2.5.34-mm3/include/asm-ppc/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-ppc/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_PPC_TOPOLOGY_H +#define _ASM_PPC_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_PPC_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-ppc64/mmzone.h linux-2.5.34-mm3+topo/include/asm-ppc64/mmzone.h --- linux-2.5.34-mm3/include/asm-ppc64/mmzone.h Fri Sep 13 14:58:03 2002 +++ linux-2.5.34-mm3+topo/include/asm-ppc64/mmzone.h Fri Sep 13 14:59:21 2002 @@ -82,13 +82,5 @@ (ADDR_TO_MAPBASE(kaddr) + LOCAL_MAP_NR(kaddr)); \ }) -#ifdef CONFIG_NUMA - -/* XXX grab this from the device tree - Anton */ -#define cputonode(cpu) ((cpu) >> CPU_SHIFT_BITS) - -#define numa_node_id() cputonode(smp_processor_id()) - -#endif /* CONFIG_NUMA */ #endif /* CONFIG_DISCONTIGMEM */ #endif /* _ASM_MMZONE_H_ */ diff -Nur linux-2.5.34-mm3/include/asm-ppc64/topology.h linux-2.5.34-mm3+topo/include/asm-ppc64/topology.h --- linux-2.5.34-mm3/include/asm-ppc64/topology.h Wed Dec 31 16:00:00 1969 +++ 
linux-2.5.34-mm3+topo/include/asm-ppc64/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,14 @@ +#ifndef _ASM_PPC64_TOPOLOGY_H +#define _ASM_PPC64_TOPOLOGY_H + +#include <asm/mmzone.h> + +#ifdef CONFIG_NUMA +/* XXX grab this from the device tree - Anton */ +#define __cpu_to_node(cpu) ((cpu) >> CPU_SHIFT_BITS) +#endif /* CONFIG_NUMA */ + +/* Get the rest of the topology definitions */ +#include <asm-generic/topology.h> + +#endif /* _ASM_PPC64_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-s390/topology.h linux-2.5.34-mm3+topo/include/asm-s390/topology.h --- linux-2.5.34-mm3/include/asm-s390/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-s390/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_S390_TOPOLOGY_H +#define _ASM_S390_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_S390_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-s390x/topology.h linux-2.5.34-mm3+topo/include/asm-s390x/topology.h --- linux-2.5.34-mm3/include/asm-s390x/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-s390x/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_S390X_TOPOLOGY_H +#define _ASM_S390X_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_S390X_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-sh/topology.h linux-2.5.34-mm3+topo/include/asm-sh/topology.h --- linux-2.5.34-mm3/include/asm-sh/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-sh/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_SH_TOPOLOGY_H +#define _ASM_SH_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_SH_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-sparc/topology.h linux-2.5.34-mm3+topo/include/asm-sparc/topology.h --- linux-2.5.34-mm3/include/asm-sparc/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-sparc/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_SPARC_TOPOLOGY_H +#define 
_ASM_SPARC_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_SPARC_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-sparc64/topology.h linux-2.5.34-mm3+topo/include/asm-sparc64/topology.h --- linux-2.5.34-mm3/include/asm-sparc64/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-sparc64/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_SPARC64_TOPOLOGY_H +#define _ASM_SPARC64_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_SPARC64_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/asm-x86_64/topology.h linux-2.5.34-mm3+topo/include/asm-x86_64/topology.h --- linux-2.5.34-mm3/include/asm-x86_64/topology.h Wed Dec 31 16:00:00 1969 +++ linux-2.5.34-mm3+topo/include/asm-x86_64/topology.h Fri Sep 13 14:59:21 2002 @@ -0,0 +1,6 @@ +#ifndef _ASM_X86_64_TOPOLOGY_H +#define _ASM_X86_64_TOPOLOGY_H + +#include <asm-generic/topology.h> + +#endif /* _ASM_X86_64_TOPOLOGY_H */ diff -Nur linux-2.5.34-mm3/include/linux/mmzone.h linux-2.5.34-mm3+topo/include/linux/mmzone.h --- linux-2.5.34-mm3/include/linux/mmzone.h Fri Sep 13 14:58:04 2002 +++ linux-2.5.34-mm3+topo/include/linux/mmzone.h Fri Sep 13 14:59:21 2002 @@ -248,13 +248,23 @@ #define for_each_zone(zone) \ for (zone = pgdat_list->node_zones; zone; zone = next_zone(zone)) +#ifdef CONFIG_NUMA +#define MAX_NR_MEMBLKS BITS_PER_LONG /* Max number of Memory Blocks */ +#else /* !CONFIG_NUMA */ +#define MAX_NR_MEMBLKS 1 +#endif /* CONFIG_NUMA */ + +#include <asm/topology.h> +/* Returns the number of the current Node. */ +#define numa_node_id() (__cpu_to_node(smp_processor_id())) + #ifndef CONFIG_DISCONTIGMEM #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map #define MAX_NR_NODES 1 -#else /* !CONFIG_DISCONTIGMEM */ +#else /* CONFIG_DISCONTIGMEM */ #include <asm/mmzone.h>
On Fri, Sep 13, 2002 at 03:52:21PM -0700, Dave Hansen wrote: > Here's a per-node kswapd. It's actually per-pg_data_t, but I guess that > they're equivalent. Matt is going to follow up his topology API with > something to bind these to their respective nodes. >From 64 parallel tiobench 64's (higher counts livelock in fork() etc.): 38 root 15 0 0 0 0 RW 23.0 0.0 1:11 kswapd0 4779 wli 22 0 4460 3588 1648 R 17.9 0.0 0:16 top ... 4779 wli 25 0 4460 3592 1648 R 14.1 0.0 0:27 top 38 root 15 0 0 0 0 DW 3.5 0.0 1:31 kswapd0 ... OTOH, with such a small task count, ZONE_NORMAL is the only thing feeling pressure anyway. There's also very little cpu pressure: CPU states: 0.2% user, 6.4% system, 0.0% nice, 93.4% idle Cheers, Bill
William Lee Irwin III wrote:
>
> On Fri, Sep 13, 2002 at 03:52:21PM -0700, Dave Hansen wrote:
> > Here's a per-node kswapd. It's actually per-pg_data_t, but I guess that
> > they're equivalent. Matt is going to follow up his topology API with
> > something to bind these to their respective nodes.
>
> From 64 parallel tiobench 64's (higher counts livelock in fork() etc.):
>
> 38 root 15 0 0 0 0 RW 23.0 0.0 1:11 kswapd0
> 4779 wli 22 0 4460 3588 1648 R 17.9 0.0 0:16 top
>
> ...
>
> 4779 wli 25 0 4460 3592 1648 R 14.1 0.0 0:27 top
> 38 root 15 0 0 0 0 DW 3.5 0.0 1:31 kswapd0
>
Why do I see only one kswapd here?
Are you claiming an overall 4x improvement, or what?
I'll add some instrumentation which tells us how many pages
kswapd is reclaiming versus direct reclaim.
William Lee Irwin III wrote: >> From 64 parallel tiobench 64's (higher counts livelock in fork() etc.): >> 38 root 15 0 0 0 0 RW 23.0 0.0 1:11 kswapd0 >> 4779 wli 22 0 4460 3588 1648 R 17.9 0.0 0:16 top >> ... >> 4779 wli 25 0 4460 3592 1648 R 14.1 0.0 0:27 top >> 38 root 15 0 0 0 0 DW 3.5 0.0 1:31 kswapd0 On Fri, Sep 13, 2002 at 05:02:56PM -0700, Andrew Morton wrote: > Why do I see only one kswapd here? > Are you claiming an overall 4x improvement, or what? > I'll add some instrumentation whch tells us how many pages > kswapd is reclaiming versus direct reclaim. I can catch the others running if I refresh more often: 38 root 15 0 0 0 0 DW 4.8 0.0 1:57 kswapd0 36 root 15 0 0 0 0 SW 2.7 0.0 0:16 kswapd2 4779 wli 22 0 4476 3604 1648 R 9.2 0.0 0:58 top 37 root 15 0 0 0 0 SW 2.6 0.0 0:16 kswapd1 38 root 15 0 0 0 0 DW 2.9 0.0 2:12 kswapd0 36 root 15 0 0 0 0 SW 1.8 0.0 0:22 kswapd2 4779 wli 25 0 4476 3600 1648 R 7.4 0.0 1:18 top 37 root 15 0 0 0 0 SW 2.7 0.0 0:21 kswapd1 4779 wli 24 0 4476 3600 1648 R 37.5 0.0 1:49 top 37 root 16 0 0 0 0 RW 11.1 0.0 0:23 kswapd1 4779 wli 25 0 4476 3600 1648 R 14.1 0.0 1:51 top 35 root 15 0 0 0 0 SW 6.9 0.0 0:24 kswapd3 38 root 15 0 0 0 0 RW 2.9 0.0 2:29 kswapd0 37 root 16 0 0 0 0 SW 1.4 0.0 0:28 kswapd1 etc. Not sure about baselines. I'm happier because there's more cpu utilization. kswapd0 is relatively busy so the other ones take some load off of it. The benchmark isn't quite done yet. I think four dbench 512's in parallel might be easier to extract results from. tiobench also looks like it's getting some cpu: procs memory io system cpu r b w swpd free buff cache bi bo in cs us sy id 7 3649 2 0 12516 13460 15365328 32 11209 1349 3334 3 8 90 0 3474 3 0 11216 9648 15354584 760 10333 1360 3089 2 10 88 7 3222 3 0 11872 10744 15367528 3883 8841 1371 2362 1 8 91 1 2958 2 0 12572 10304 15373820 569 10617 1347 2214 2 8 90 Cheers, Bill
William Lee Irwin III wrote:
>
> ...
> On Fri, Sep 13, 2002 at 05:02:56PM -0700, Andrew Morton wrote:
> > Why do I see only one kswapd here?
> > Are you claiming an overall 4x improvement, or what?
> > I'll add some instrumentation which tells us how many pages
> > kswapd is reclaiming versus direct reclaim.
>
> I can catch the others running if I refresh more often:
>
> 38 root 15 0 0 0 0 DW 4.8 0.0 1:57 kswapd0
> 36 root 15 0 0 0 0 SW 2.7 0.0 0:16 kswapd2
>
> 4779 wli 22 0 4476 3604 1648 R 9.2 0.0 0:58 top
> 37 root 15 0 0 0 0 SW 2.6 0.0 0:16 kswapd1
>
> 38 root 15 0 0 0 0 DW 2.9 0.0 2:12 kswapd0
> 36 root 15 0 0 0 0 SW 1.8 0.0 0:22 kswapd2
>
> 4779 wli 25 0 4476 3600 1648 R 7.4 0.0 1:18 top
> 37 root 15 0 0 0 0 SW 2.7 0.0 0:21 kswapd1
>
> 4779 wli 24 0 4476 3600 1648 R 37.5 0.0 1:49 top
> 37 root 16 0 0 0 0 RW 11.1 0.0 0:23 kswapd1
>
> 4779 wli 25 0 4476 3600 1648 R 14.1 0.0 1:51 top
> 35 root 15 0 0 0 0 SW 6.9 0.0 0:24 kswapd3
>
> 38 root 15 0 0 0 0 RW 2.9 0.0 2:29 kswapd0
> 37 root 16 0 0 0 0 SW 1.4 0.0 0:28 kswapd1
>
> etc.
>
> Not sure about baselines. I'm happier because there's more cpu
> utilization. kswapd0 is relatively busy so the other ones take some
> load off of it. The benchmark isn't quite done yet. I think four
> dbench 512's in parallel might be easier to extract results from.
> tiobench also looks like it's getting some cpu:
>
OK, thanks.
Could you please go into /proc/<pid_of_kswapd>/cpu and double
check that each kswapd is only racking up points on the CPUs
which it is supposed to be running on? As a little sanity check...
(hm. Why isn't cpus_allowed displayed in there?)
Also, we need to double check that I'm not completely full of
unmentionables, and that kswapd really is doing useful work there.
I can check that.
I'll do an mm4 in a mo (as soon as I work out who did the dud patch
which stops it booting) and we can see what the kwapd-versus-direct-reclaim
ratio looks like.
On Friday 13 September 2002 06:59, William Lee Irwin III wrote:
> On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote:
> > I still don't see why it's per zone and not per node. It seems strange
> > that a wee little laptop would be running two kswapds?
> > kswapd can get a ton of work done in the development VM and one per
> > node would, I expect, suffice?
>
> Machines without observable NUMA effects can benefit from it if it's
> per-zone.
How?
--
Daniel
On Thu, Sep 12, 2002 at 09:06:20PM -0700, Andrew Morton wrote: >>> I still don't see why it's per zone and not per node. It seems strange >>> that a wee little laptop would be running two kswapds? >>> kswapd can get a ton of work done in the development VM and one per >>> node would, I expect, suffice? On Friday 13 September 2002 06:59, William Lee Irwin III wrote: >> Machines without observable NUMA effects can benefit from it if it's >> per-zone. On Mon, Sep 16, 2002 at 07:44:30AM +0200, Daniel Phillips wrote: > How? The notion was that some level of parallelism would be bestowed on the single-node case by using separate worker threads on a per-zone basis, as they won't have more than one node to spawn worker threads for at all. This notion apparently got shot down somewhere, and I don't care to rise to its defense. I've lost enough debates this release to know better than to try. Bill
On Mon, 16 Sep 2002, William Lee Irwin III wrote: > This notion apparently got shot down somewhere, and I don't care to rise > to its defense. I've lost enough debates this release to know better > than to try. Don't worry about this, there are bigger fish around, lower hanging sea fruit, so to say. ;) Rik -- Bravely reimplemented by the knights who say "NIH". http://www.surriel.com/ http://distro.conectiva.com/ Spamtraps of the month: september@surriel.com trac@trac.org
[-- Attachment #1: Type: text/plain, Size: 1017 bytes --] Andrew, at the current time an mmap() ignores a MAP_LOCKED passed to it. The only way we can get VM_LOCKED associated with the newly created VMA is to have previously called mlockall() on the process which sets the mm->def_flags != VM_LOCKED or subsequently call mlock() on the newly created VMA. The attached patch checks for MAP_LOCKED being passed and if so checks the capabilities of the process. Limit checks were already in place. -- -- Hubertus Franke (frankeh@watson.ibm.com) --------------------------------< PATCH >------------------------------ --- linux-2.5.35/mm/mmap.c Wed Sep 18 11:12:13 2002 +++ linux-2.5.35-fix/mm/mmap.c Wed Sep 18 11:44:32 2002 @@ -461,6 +461,11 @@ */ vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + if (flags & MAP_LOCKED) { + if (!capable(CAP_IPC_LOCK)) + return -EPERM; + vm_flags |= VM_LOCKED; + } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { unsigned long locked = mm->locked_vm << PAGE_SHIFT; [-- Attachment #2: patch.2.5.35.mmap_locked --] [-- Type: text/x-diff, Size: 452 bytes --] --- linux-2.5.35/mm/mmap.c Wed Sep 18 11:12:13 2002 +++ linux-2.5.35-fix/mm/mmap.c Wed Sep 18 11:44:32 2002 @@ -461,6 +461,11 @@ */ vm_flags = calc_vm_flags(prot,flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + if (flags & MAP_LOCKED) { + if (!capable(CAP_IPC_LOCK)) + return -EPERM; + vm_flags |= VM_LOCKED; + } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { unsigned long locked = mm->locked_vm << PAGE_SHIFT;
Hubertus Franke wrote:
>
> Andrew, at the current time an mmap() ignores a MAP_LOCKED passed to it.
> The only way we can get VM_LOCKED associated with the newly created VMA
> is to have previously called mlockall() on the process which sets the
> mm->def_flags != VM_LOCKED or subsequently call mlock() on the
> newly created VMA.
>
> The attached patch checks for MAP_LOCKED being passed and if so checks
> the capabilities of the process. Limit checks were already in place.
Looks sane, thanks.
It appears that MAP_LOCKED is a Linux-special, so presumably it
_used_ to work. I wonder when it broke?
You patch applies to 2.4 as well; it would be useful to give that
a sanity test and send a copy to Marcelo.
(SuS really only anticipates that mmap needs to look at prior mlocks
in force against the address range. It also says
Process memory locking does apply to shared memory regions,
and we don't do that either. I think we should; can't see why SuS
requires this.)