linux-kernel.vger.kernel.org archive mirror
* [GIT PULL] scheduler fixes
@ 2009-05-18 14:27 Ingo Molnar
  2009-05-18 16:13 ` Linus Torvalds
  2009-05-18 16:55 ` [GIT PULL, v2] scheduler fixes Ingo Molnar
  0 siblings, 2 replies; 57+ messages in thread
From: Ingo Molnar @ 2009-05-18 14:27 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Andrew Morton, Peter Zijlstra

Linus,

Please pull the latest sched-fixes-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-fixes-for-linus

 Thanks,

	Ingo

------------------>
Ron (1):
      sched: Fix fallback sched_clock()'s offset when using jiffies

Rusty Russell (1):
      sched: avoid flexible array member inside struct (gcc extension)


 kernel/sched.c       |   28 +++++++++++++++-------------
 kernel/sched_clock.c |    3 ++-
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa47..d1ef62c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7756,22 +7756,24 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
  * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
  * for nr_cpu_ids < CONFIG_NR_CPUS.
  */
-struct static_sched_group {
+union static_sched_group {
 	struct sched_group sg;
-	DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
+	char _sg_and_cpus[sizeof(struct sched_group) +
+			  BITS_TO_LONGS(CONFIG_NR_CPUS) * sizeof(long)];
 };
 
-struct static_sched_domain {
+union static_sched_domain {
 	struct sched_domain sd;
-	DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+	char _sd_and_cpus[sizeof(struct sched_domain) +
+			  BITS_TO_LONGS(CONFIG_NR_CPUS) * sizeof(long)];
 };
 
 /*
  * SMT sched-domains:
  */
 #ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_cpus);
+static DEFINE_PER_CPU(union static_sched_domain, cpu_domains);
+static DEFINE_PER_CPU(union static_sched_group, sched_group_cpus);
 
 static int
 cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
@@ -7787,8 +7789,8 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
  * multi-core sched-domains:
  */
 #ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
+static DEFINE_PER_CPU(union static_sched_domain, core_domains);
+static DEFINE_PER_CPU(union static_sched_group, sched_group_core);
 #endif /* CONFIG_SCHED_MC */
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -7815,8 +7817,8 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
 }
 #endif
 
-static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
+static DEFINE_PER_CPU(union static_sched_domain, phys_domains);
+static DEFINE_PER_CPU(union static_sched_group, sched_group_phys);
 
 static int
 cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
@@ -7843,11 +7845,11 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
  * groups, so roll our own. Now each node has its own list of groups which
  * gets dynamically allocated.
  */
-static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
+static DEFINE_PER_CPU(union static_sched_domain, node_domains);
 static struct sched_group ***sched_group_nodes_bycpu;
 
-static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
+static DEFINE_PER_CPU(union static_sched_domain, allnodes_domains);
+static DEFINE_PER_CPU(union static_sched_group, sched_group_allnodes);
 
 static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
 				 struct sched_group **sg,
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 819f17a..e1d16c9 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -38,7 +38,8 @@
  */
 unsigned long long __attribute__((weak)) sched_clock(void)
 {
-	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
+					* (NSEC_PER_SEC / HZ);
 }
 
 static __read_mostly int sched_clock_running;
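
( Side note on the fix above: jiffies is deliberately initialised to
  INITIAL_JIFFIES, about five minutes before wrap, so that wrap bugs show up
  early. Without the subtraction, the jiffies-based fallback clock therefore
  starts at a huge value instead of at 0. A small standalone sketch - plain
  userspace C, HZ assumed to be 1000 - showing the before/after values: )

#include <stdio.h>

#define HZ		1000
#define NSEC_PER_SEC	1000000000ULL
/* jiffies starts ~5 minutes before wrap so that wrap bugs are hit early */
#define INITIAL_JIFFIES	((unsigned long)(unsigned int)(-300 * HZ))

int main(void)
{
	unsigned long jiffies = INITIAL_JIFFIES;	/* value right after boot */

	unsigned long long before = (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
	unsigned long long after  = (unsigned long long)(jiffies - INITIAL_JIFFIES)
						* (NSEC_PER_SEC / HZ);

	printf("old fallback starts at %llu ns (~%llu days)\n",
	       before, before / NSEC_PER_SEC / 86400);
	printf("new fallback starts at %llu ns\n", after);	/* 0, as expected */
	return 0;
}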


* Re: [GIT PULL] scheduler fixes
  2009-05-18 14:27 [GIT PULL] scheduler fixes Ingo Molnar
@ 2009-05-18 16:13 ` Linus Torvalds
  2009-05-18 16:49   ` Ingo Molnar
  2009-05-19  8:31   ` [tip:sched/core] sched: properly define the sched_group::cpumask and sched_domain::span fields tip-bot for Ingo Molnar
  2009-05-18 16:55 ` [GIT PULL, v2] scheduler fixes Ingo Molnar
  1 sibling, 2 replies; 57+ messages in thread
From: Linus Torvalds @ 2009-05-18 16:13 UTC (permalink / raw)
  To: Ingo Molnar, Rusty Russell
  Cc: Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra



On Mon, 18 May 2009, Ingo Molnar wrote:
> 
> Rusty Russell (1):
>       sched: avoid flexible array member inside struct (gcc extension)

I'm not pulling this one either.

It makes no sense what-so-ever. It's uglier code, so calling it a cleanup 
is just wrong.

Now apart from being ugly and pointless, it is also FUNDAMENTALLY 
INCORRECT.

It is in no way true that "a union is the Right Way to do this", 
especially not the way THAT PIECE OF UTTER CRAP does it.

This is the original data structure:

	struct static_sched_group {
		struct sched_group sg;
		DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
	};

and it is fine (the fact that "sg" isn't fine is a totally different 
issue).

The new one:

	union static_sched_group {
		struct sched_group sg;
		char _sg_and_cpus[sizeof(struct sched_group) +
				  BITS_TO_LONGS(CONFIG_NR_CPUS) * sizeof(long)];
	};

claimed to be a "cleanup" (hah - what a f*cking joke! Anybody looking at 
it for half a second would see that that is a clear lie) is just a 
horrible bug waiting to happen.

You can't do that. Doing a character array with "sizeof" IS NOT VALID. It 
doesn't take things like different alignment into account. It might 
_work_, but it is still UTTER SH*T.

Yes, I'm upset. It's -rc6 and now two "please pull" requests have been 
totally unacceptable in very fundamental and obvious forms. 

I'm also upset because that obvious PIECE OF CRAP got two Acked-by's from 
people who should know better. 

If you want to fix this up, then just fix "struct sched_group sg" instead. 
Here's a suggested better fix, but I'd suggest somebody also write a 
honking big COMMENT in addition to this.

But note how simple this attached patch is? Note how it's not adding 
totally ugly and horrible code? THIS is a fix (and yes, zero-sized arrays 
are a gcc extension too, but we've used them a lot).

I'm sure there are other ways to fix it too, and I'm open to them, but 
that union with character arrays etc I'm not open to.

			Linus
---
 include/linux/sched.h |    2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc..e0c9733 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -838,7 +838,7 @@ struct sched_group {
 	 */
 	u32 reciprocal_cpu_power;
 
-	unsigned long cpumask[];
+	unsigned long cpumask[0];
 };
 
 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
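
( For readers following the [] vs [0] distinction: both forms mean "storage
  hangs off the end of the struct", but standard C forbids a struct that ends
  in a flexible array member from being embedded in another struct, which is
  exactly what the static_sched_group wrappers in kernel/sched.c do - and,
  per this thread, what Sparse and LLVM choke on. A rough sketch with made-up
  names: )

/* C99 flexible array member - the form sched_group used before this patch.
 * Embedding a struct that ends like this inside another struct is a
 * constraint violation in C99 (6.7.2.1), hence the tool complaints. */
struct group_fam {
	int weight;
	unsigned long mask[];
};

/* GCC zero-length array - the form the one-liner above switches to.  Also
 * an extension, but a long-established one that the tools accept. */
struct group_zla {
	int weight;
	unsigned long mask[0];
};

/* The embedding pattern from kernel/sched.c, sketched: storage for the
 * trailing mask[] is simply placed right after the struct. */
struct static_group {
	struct group_zla	g;
	unsigned long		mask_storage[4];	/* backs g.mask[] */
};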


* Re: [GIT PULL] scheduler fixes
  2009-05-18 16:13 ` Linus Torvalds
@ 2009-05-18 16:49   ` Ingo Molnar
  2009-05-18 16:58     ` Linus Torvalds
  2009-05-19  8:31   ` [tip:sched/core] sched: properly define the sched_group::cpumask and sched_domain::span fields tip-bot for Ingo Molnar
  1 sibling, 1 reply; 57+ messages in thread
From: Ingo Molnar @ 2009-05-18 16:49 UTC (permalink / raw)
  To: Linus Torvalds, Jeff Garzik, Alexander Viro
  Cc: Rusty Russell, Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, 18 May 2009, Ingo Molnar wrote:
> > 
> > Rusty Russell (1):
> >       sched: avoid flexible array member inside struct (gcc extension)
> 
> I'm not pulling this one either.
> 
> It makes no sense what-so-ever. It's uglier code, so calling it a 
> cleanup is just wrong.

hm - i've Cc:-ed Jeff & Viro. The background is that Sparse and LLVM 
barfed on the current construct and Al strongly advocated this 
solution, see:

  "[RFC PATCH 2/2] kernel/sched.c: VLA in middle of struct"

See that particular reply below.

	Ingo

----- Forwarded message from Al Viro <viro@ZenIV.linux.org.uk> -----

Date: Tue, 12 May 2009 15:03:44 +0100
From: Al Viro <viro@ZenIV.linux.org.uk>
To: Rusty Russell <rusty@rustcorp.com.au>
Subject: Re: [RFC PATCH 2/2] kernel/sched.c: VLA in middle of struct
Cc: Jeff Garzik <jeff@garzik.org>, Ingo Molnar <mingo@elte.hu>,
	Peter Zijlstra <a.p.zijlstra@chello.nl>,
	Mike Travis <travis@sgi.com>, LKML <linux-kernel@vger.kernel.org>,
	Andrew Morton <akpm@linux-foundation.org>, roland@redhat.com

On Tue, May 12, 2009 at 11:04:51PM +0930, Rusty Russell wrote:
> On Mon, 11 May 2009 12:39:54 am Jeff Garzik wrote:
> > On Sun, May 10, 2009 at 06:19:40PM +0930, Rusty Russell wrote:
> > > Yeah, it's kinda nasty.  Generally, sched_group is dynamically allocated,
> > > so we just allocate sizeof(struct sched_group) + size of nr_cpu_ids bits.
> > >
> > > These ones are static, and it was easier to put this hack in than make
> > > them dynamic.  There's nothing wrong with it, until we really want
> > > NR_CPUS == bignum, or we want to get rid of NR_CPUS altogether for
> > > CONFIG_CPUMASKS_OFFSTACK (which would be very clean, but not clearly
> > > worthwhile).
> >
> > Nothing wrong with it, except
> >
> > - C99 only defines variable-length automatic arrays
> > - VLA in the middle of a struct are difficult to optimize
> > - gcc's VLA handling WILL change, as gcc docs state
> > - other compilers -- and sparse -- puke all over VLAs, making
> >   static analysis impossible for all code with this weirdism
> 
> Jeff, you seem confused.  In my copy of the standard, you'd know this is called 
> a "flexible array member"; it's not a variable length array.  The only GCC 
> specific issue I can find here is that you're not normally allowed to embed 
> structs with them in another struct (according to the gcc docs; I can't 
> actually find this clearly stated in the standard).

6.7.2.1p2.  It's a separate issue from the revolting gcc extensions that *do*
allow VLA-in-the-middle-of-struct.  And I mean real VLA, not flex array
member :-/

> Anyway, since [] is C99, I thought it preferable to [0] which is a gcc 
> extension.  However, if C99 is really so braindead as to disallow this fairly 
> standard trick, I'm happy to go with the gcc extension.[1]

No.  There's a standard way to do that in C99; you can put a struct with
that thing into a union.  So the correct way to force enough storage for
such an object is:

union {
	struct has_flex_array_member foo;
	char pad[how_much_space_do_I_want];
} bar;

Unions with overlapping members are fine.  Structures are not.

----- End forwarded message -----
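
( For completeness, a compilable version of the union trick Al describes,
  with made-up identifiers; this is the pattern Linus rejects for
  kernel/sched.c above, shown here only to illustrate the C99 argument: )

#include <stdio.h>

#define MAX_IDS 128	/* arbitrary capacity for the example */

struct has_flex_array_member {
	int nr;
	unsigned long ids[];		/* C99 flexible array member */
};

/*
 * The char array member forces the union to be big enough for the struct
 * plus MAX_IDS trailing entries; overlapping members are fine in a union.
 */
union flex_storage {
	struct has_flex_array_member foo;
	char pad[sizeof(struct has_flex_array_member) +
		 MAX_IDS * sizeof(unsigned long)];
};

static union flex_storage bar;

int main(void)
{
	bar.foo.nr = 1;
	bar.foo.ids[MAX_IDS - 1] = 42;	/* fits: pad[] reserves the space */
	printf("sizeof(bar) = %zu\n", sizeof(bar));
	return 0;
}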


* [GIT PULL, v2] scheduler fixes
  2009-05-18 14:27 [GIT PULL] scheduler fixes Ingo Molnar
  2009-05-18 16:13 ` Linus Torvalds
@ 2009-05-18 16:55 ` Ingo Molnar
  1 sibling, 0 replies; 57+ messages in thread
From: Ingo Molnar @ 2009-05-18 16:55 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: linux-kernel, Andrew Morton, Peter Zijlstra


Linus,

Please pull the latest sched-fixes-for-linus-2 git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-fixes-for-linus-2

This excludes the "sched: avoid flexible array member inside struct 
(gcc extension)" commit you objected to. Did a test-build and a 
test-boot of this, just in case...

 Thanks,

	Ingo

------------------>
Ron (1):
      sched: Fix fallback sched_clock()'s offset when using jiffies


 kernel/sched_clock.c |    3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 819f17a..e1d16c9 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -38,7 +38,8 @@
  */
 unsigned long long __attribute__((weak)) sched_clock(void)
 {
-	return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
+					* (NSEC_PER_SEC / HZ);
 }
 
 static __read_mostly int sched_clock_running;


* Re: [GIT PULL] scheduler fixes
  2009-05-18 16:49   ` Ingo Molnar
@ 2009-05-18 16:58     ` Linus Torvalds
  2009-05-18 17:09       ` Ingo Molnar
  0 siblings, 1 reply; 57+ messages in thread
From: Linus Torvalds @ 2009-05-18 16:58 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jeff Garzik, Alexander Viro, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra



On Mon, 18 May 2009, Ingo Molnar wrote:
> 
> hm - i've Cc:-ed Jeff & Viro. The background is that Sparse and LLVM 
> barfed on the current construct and Al strongly advocated this 
> solution, see:

I know the background.

Did you read my email?

Did you see my one-line patch that fixes the same problem WITHOUT THE 
INSANITY?

Yes, Al is mostly right. In this case he is wrong. (Ab-)Using unions for 
something like this is crazy, since the code doesn't want a union of 
overlapping data, it wants two consecutive data structures.

And I'm not saying that my one-liner is necessarily the only way to fix 
it. But it's a _better_ way than the crazy way you merged.

		Linus


* Re: [GIT PULL] scheduler fixes
  2009-05-18 16:58     ` Linus Torvalds
@ 2009-05-18 17:09       ` Ingo Molnar
  2009-05-18 19:03         ` Ingo Molnar
  0 siblings, 1 reply; 57+ messages in thread
From: Ingo Molnar @ 2009-05-18 17:09 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeff Garzik, Alexander Viro, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, 18 May 2009, Ingo Molnar wrote:
> > 
> > hm - i've Cc:-ed Jeff & Viro. The background is that Sparse and LLVM 
> > barfed on the current construct and Al strongly advocated this 
> > solution, see:
> 
> I know the background.
> 
> Did you read my email?
> 
> Did you see my one-line patch that fixes the same problem WITHOUT 
> THE INSANITY?

Yes, i even proposed that very patch in the original discussion, in 
my first reply:

    http://lkml.org/lkml/2009/5/8/378

but then got sidetracked by the 'this is C-correct' claim. Should 
have noticed the bogosity, sorry about that.

	Ingo


* Re: [GIT PULL] scheduler fixes
  2009-05-18 17:09       ` Ingo Molnar
@ 2009-05-18 19:03         ` Ingo Molnar
  2009-05-18 19:16           ` Linus Torvalds
  0 siblings, 1 reply; 57+ messages in thread
From: Ingo Molnar @ 2009-05-18 19:03 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Jeff Garzik, Alexander Viro, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra


* Ingo Molnar <mingo@elte.hu> wrote:

> * Linus Torvalds <torvalds@linux-foundation.org> wrote:
> 
> > On Mon, 18 May 2009, Ingo Molnar wrote:
> > > 
> > > hm - i've Cc:-ed Jeff & Viro. The background is that Sparse and LLVM 
> > > barfed on the current construct and Al strongly advocated this 
> > > solution, see:
> > 
> > I know the background.
> > 
> > Did you read my email?
> > 
> > Did you see my one-line patch that fixes the same problem WITHOUT 
> > THE INSANITY?
> 
> Yes, i even proposed that very patch in the original discussion, in 
> my first reply:
> 
>     http://lkml.org/lkml/2009/5/8/378

Something like the patch below. It also fixes ->span[] which has a 
similar problem.

But ... i think this needs further clean-ups really. Either go fully 
static, or go fully dynamic.

I'd suggest we go fully dynamic: the static structures are never 
used directly anyway, they are pointer-ized during sched domain 
setup.

The reason for this duality is allocation bootstrap: there's no 
generic early-capable allocator that allocates something and 
switches from bootmem to kmalloc transparently once the SLAB has 
been set up.

Would be nice if bootmem_alloc() was extended with such properties - 
if SLAB is up (and bootmem is down) it would return 
kmalloc(GFP_KERNEL) memory buffers.

	Ingo

-------------->
Subject: sched: properly define the sched_group::cpumask and sched_domain::span fields
From: Ingo Molnar <mingo@elte.hu>

Properly document the variable-size structure tricks we are doing
wrt. struct sched_group and sched_domain, and use the field[0] GCC
extension instead of defining a VLA.

Don't use unions for this, as pointed out by Linus.

This also un-confuses Sparse and LLVM.

Reported-by: Jeff Garzik <jeff@garzik.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <alpine.LFD.2.01.0905180850110.3301@localhost.localdomain>
Not-Yet-Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |   25 ++++++++++++++++++++++---
 kernel/sched.c        |    5 +++--
 2 files changed, 25 insertions(+), 5 deletions(-)

Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -846,7 +846,17 @@ struct sched_group {
 	 */
 	u32 reciprocal_cpu_power;
 
-	unsigned long cpumask[];
+	/*
+	 * The CPUs this group covers.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It can also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_group' in kernel/sched.c)
+	 */
+	unsigned long cpumask[0];
 };
 
 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -932,8 +942,17 @@ struct sched_domain {
 	char *name;
 #endif
 
-	/* span of all CPUs in this domain */
-	unsigned long span[];
+	/*
+	 * Span of all CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It can also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_domain' in kernel/sched.c)
+	 */
+	unsigned long span[0];
 };
 
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -8049,8 +8049,9 @@ int sched_smt_power_savings = 0, sched_m
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;


* Re: [GIT PULL] scheduler fixes
  2009-05-18 19:03         ` Ingo Molnar
@ 2009-05-18 19:16           ` Linus Torvalds
  2009-05-18 20:20             ` Ingo Molnar
  0 siblings, 1 reply; 57+ messages in thread
From: Linus Torvalds @ 2009-05-18 19:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Jeff Garzik, Alexander Viro, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra



On Mon, 18 May 2009, Ingo Molnar wrote:
> 
> Something like the patch below. It also fixes ->span[] which has a 
> similar problem.

Patch looks good to me.

> But ... i think this needs further clean-ups really. Either go fully 
> static, or go fully dynamic.

I do agree that it would probably be good to try to avoid this static 
allocation, and allocate these data structures dynamically. However, if we 
end up having to use two different allocators anyway (one for bootup, and 
one for regular uptimes), then I think that would be an overall loss 
(compared to just the simplicity of statically doing this in a couple of 
places), rather than an overall win.

> Would be nice if bootmem_alloc() was extended with such properties - 
> if SLAB is up (and bootmem is down) it would return 
> kmalloc(GFP_KERNEL) memory buffers.

I would rather say the other way around: no "bootmem_alloc()" at all, but 
just have a regular alloc() that ends up working like the "SMP 
alternatives" code, but instead of being about SMP, it would be about how 
early in the boot sequence it is.

That said, if there are just a couple of places like this that care, I 
don't think it's worth it. The static allocation isn't that horrible. I'd 
rather have a few ugly static allocations with comments about _why_ they 
look the way they do, than try to over-design things to look "clean".

Simplicity is a good thing - even if it can then end up meaning special 
cases like this.

That said, if we could move the kmalloc initialization up some more (and 
get at least the "boot node" data structures set up), and avoid any bootmem 
alloc issues _entirely_, then that would be good.

I hate that stupid bootmem allocator. I suspect we seriously over-use it, 
and that we _should_ be able to do the SL*B init earlier.

			Linus


* Re: [GIT PULL] scheduler fixes
  2009-05-18 19:16           ` Linus Torvalds
@ 2009-05-18 20:20             ` Ingo Molnar
  2009-05-18 22:06               ` Linus Torvalds
       [not found]               ` <4A12E759.6040806@kernel.org>
  0 siblings, 2 replies; 57+ messages in thread
From: Ingo Molnar @ 2009-05-18 20:20 UTC (permalink / raw)
  To: Linus Torvalds, H. Peter Anvin, Pekka Enberg, Yinghai Lu
  Cc: Jeff Garzik, Alexander Viro, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Mon, 18 May 2009, Ingo Molnar wrote:
> > 
> > Something like the patch below. It also fixes ->span[] which has 
> > a similar problem.
> 
> Patch looks good to me.

ok. I've queued it up for .31, with your Acked-by. (which i assume 
your reply implies?)

> > But ... i think this needs further clean-ups really. Either go 
> > fully static, or go fully dynamic.
> 
> I do agree that it would probably be good to try to avoid this 
> static allocation, and allocate these data structures dynamically. 
> However, if we end up having to use two different allocators 
> anyway (one for bootup, and one for regular uptimes), then I think 
> that would be an overall loss (compared to just the simplicity of 
> statically doing this in a couple of places), rather than an 
> overall win.
> 
> > Would be nice if bootmem_alloc() was extended with such 
> > properties - if SLAB is up (and bootmem is down) it would return 
> > kmalloc(GFP_KERNEL) memory buffers.
> 
> I would rather say the other way around: no "bootmem_alloc()" at 
> all, but just have a regular alloc() that ends up working like the 
> "SMP alternatives" code, but instead of being about SMP, it would 
> be about how early in the boot sequence it is.
> 
> That said, if there are just a couple of places like this that 
> care, I don't think it's worth it. The static allocation isn't 
> that horrible. I'd rather have a few ugly static allocations with 
> comments about _why_ they look the way they do, than try to 
> over-design things to look "clean".
> 
> Simplicity is a good thing - even if it can then end up meaning 
> special cases like this.
> 
> That said, if we could move the kmalloc initialization up some 
> more (and get at least the "boot node" data structures set up, and 
> avoid any bootmem alloc issues _entirely_, then that would be 
> good.
> 
> I hate that stupid bootmem allocator. I suspect we seriously 
> over-use it, and that we _should_ be able to do the SL*B init 
> earlier.

Hm, tempting thought - not sure how to pull it off though.

One of the biggest users of bootmem is the mem_map[] hierarchies and 
the page allocator bitmaps. Not sure we can get rid of bootmem there 
- those areas are really large, physical memory is often fragmented 
and we need a good NUMA sense for them as well.

We might also have a 22-architectures-to-fix problem before 
we can get rid of bootmem:

  $ git grep alloc_bootmem arch/ | wc -l
  168

On x86 we recently switched some (but not all) early-pagetable 
allocations to the 'early brk' method (which is an utterly simple 
early linear allocator, for limited early dynamic allocations), but 
even with that we still have ugly bootmem use - for example see the 
after_bootmem hacks in arch/x86/mm/init_64.c.
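
( For reference, the 'early brk' layer mentioned above is used roughly like
  this - a sketch modelled on the 2.6.30-era x86 code, with illustrative
  names and sizes: )

#include <linux/init.h>
#include <asm/setup.h>

/* Reserve brk space at link time; extend_brk() then hands pieces of it out
 * long before bootmem or the page allocator exist. */
RESERVE_BRK(early_pgt_space, 4 * PAGE_SIZE);

static void __init alloc_early_pagetable(void)
{
	/* extend_brk(size, alignment): a purely linear, never-freed allocator */
	void *pt = extend_brk(PAGE_SIZE, PAGE_SIZE);

	/* ... wire pt into the early page tables ... */
	(void)pt;
}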

So we have these increasingly more complete layers of allocators, 
which bootstrap each other gradually:

  - static, build-time allocations

  - early-brk (see extend_brk(), RESERVE_BRK and direct use of 
    _brk_end in assembly code)

  - e820 based early allocator (reserve_early()) to bootstrap bootmem

  - bootmem - to bootstrap the page allocator [NUMA aware]

  - page allocator - to bootstrap SLAB

  - SLAB

that's 5 layers until we get to SLAB. Each layer has to be aware of 
its own limits, has to interact with pagetable setup and has to end 
up with NUMA-aware dynamic allocations as early as possible.

And all this complexity definitely _feels_ utterly wrong, as we 
really know it pretty early on what kind of memory we have, how it's 
laid out amongst nodes. In the end we really just want to have the 
page allocator and SL[AOQU]B.

Looks daunting.

	Ingo


* Re: [GIT PULL] scheduler fixes
  2009-05-18 20:20             ` Ingo Molnar
@ 2009-05-18 22:06               ` Linus Torvalds
  2009-05-19 12:27                 ` Rusty Russell
  2009-05-24 16:13                 ` Pekka J Enberg
       [not found]               ` <4A12E759.6040806@kernel.org>
  1 sibling, 2 replies; 57+ messages in thread
From: Linus Torvalds @ 2009-05-18 22:06 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: H. Peter Anvin, Pekka Enberg, Yinghai Lu, Jeff Garzik,
	Alexander Viro, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra



On Mon, 18 May 2009, Ingo Molnar wrote:
> 
> ok. I've queued it up for .31, with your Acked-by. (which i assume 
> your reply implies?)

Yes.

> > I hate that stupid bootmem allocator. I suspect we seriously 
> > over-use it, and that we _should_ be able to do the SL*B init 
> > earlier.
> 
> Hm, tempting thought - not sure how to pull it off though.

As far as I can recall, one of the things that historically made us want 
to use the bootmem allocator even relatively late was that the real SLAB 
allocator had to wait until all the node information etc was initialized. 

That's pretty damn late. And I wonder if SLUB (and SLOB) might not need a 
lot less initialization, and work much earlier. Something like that might 
be the final nail in the coffin for SLAB, and convince me to just say 
'we don't support it any more'.

That said, for the case of things like 'static_sched_group' and 
'static_sched_domain', the problem might well be not just about the 
allocation itself, but simply about the use of those variables. Maybe they 
themselves are needed before we've done all the CPU setup?

> One of the biggest user of bootmem is the mem_map[] hierarchies and 
> the page allocator bitmaps. Not sure we can get rid of bootmem there 
> - those areas are really large, physical memory is often fragmented 
> and we need a good NUMA sense for them as well.

I think that's the _valid_ kind of use of a bootmem allocator.

But for something like the scheduler data structures? Not so much.

		Linus


* [tip:sched/core] sched: properly define the sched_group::cpumask and sched_domain::span fields
  2009-05-18 16:13 ` Linus Torvalds
  2009-05-18 16:49   ` Ingo Molnar
@ 2009-05-19  8:31   ` tip-bot for Ingo Molnar
  1 sibling, 0 replies; 57+ messages in thread
From: tip-bot for Ingo Molnar @ 2009-05-19  8:31 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, torvalds, jeff, tglx, mingo

Commit-ID:  4200efd9acda4accf24640f1e77d24fdcdb524df
Gitweb:     http://git.kernel.org/tip/4200efd9acda4accf24640f1e77d24fdcdb524df
Author:     Ingo Molnar <mingo@elte.hu>
AuthorDate: Tue, 19 May 2009 09:22:19 +0200
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Tue, 19 May 2009 09:22:19 +0200

sched: properly define the sched_group::cpumask and sched_domain::span fields

Properly document the variable-size structure tricks we are doing
wrt. struct sched_group and sched_domain, and use the field[0] GCC
extension instead of defining a VLA.

Don't use unions for this, as pointed out by Linus.

[ Impact: cleanup, un-confuse Sparse and LLVM ]

Reported-by: Jeff Garzik <jeff@garzik.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <alpine.LFD.2.01.0905180850110.3301@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 include/linux/sched.h |   25 ++++++++++++++++++++++---
 kernel/sched.c        |    5 +++--
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index de7b3b2..dbb1043 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -839,7 +839,17 @@ struct sched_group {
 	 */
 	u32 reciprocal_cpu_power;
 
-	unsigned long cpumask[];
+	/*
+	 * The CPUs this group covers.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It can also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_group' in kernel/sched.c)
+	 */
+	unsigned long cpumask[0];
 };
 
 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -925,8 +935,17 @@ struct sched_domain {
 	char *name;
 #endif
 
-	/* span of all CPUs in this domain */
-	unsigned long span[];
+	/*
+	 * Span of all CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It can also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_domain' in kernel/sched.c)
+	 */
+	unsigned long span[0];
 };
 
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
diff --git a/kernel/sched.c b/kernel/sched.c
index 497c09b..228acae 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7948,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;


* Re: [GIT PULL] scheduler fixes
  2009-05-18 22:06               ` Linus Torvalds
@ 2009-05-19 12:27                 ` Rusty Russell
  2009-05-24 16:13                 ` Pekka J Enberg
  1 sibling, 0 replies; 57+ messages in thread
From: Rusty Russell @ 2009-05-19 12:27 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, H. Peter Anvin, Pekka Enberg, Yinghai Lu,
	Jeff Garzik, Alexander Viro, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra

On Tue, 19 May 2009 07:36:26 am Linus Torvalds wrote:
> > One of the biggest user of bootmem is the mem_map[] hierarchies and
> > the page allocator bitmaps. Not sure we can get rid of bootmem there
> > - those areas are really large, physical memory is often fragmented
> > and we need a good NUMA sense for them as well.
>
> I think that's the _valid_ kind of use of a bootmem allocator.
>
> But for something like the scheduler data structures? Not so much.

Yeah, and it feels dirty to use slab_is_available() to figure if the code 
should kmalloc or alloc_bootmem.  

Ideally kmalloc/kfree would "always work".  But at least we could get closer.

Thanks,
Rusty.
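
( The pattern Rusty is calling dirty is the one the patches later in this
  thread remove from the irq and scheduler code; roughly, and not any
  specific kernel function: )

#include <linux/slab.h>
#include <linux/bootmem.h>

/* Callers that may run either before or after the slab allocator is up end
 * up open-coding a switch like this - which is what feels dirty. */
static void * __init_refok early_or_late_zalloc(size_t size, int node)
{
	if (slab_is_available())
		return kzalloc_node(size, GFP_ATOMIC, node);

	/* too early for slab: fall back to the bootmem allocator */
	return alloc_bootmem_node(NODE_DATA(node), size);
}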


* [PATCH] x86: enable_update_mptable should MACRO
       [not found]                 ` <20090520071900.GB11952@elte.hu>
@ 2009-05-20  7:37                   ` Yinghai Lu
  2009-05-28  0:00                     ` [tip:irq/numa] x86: enable_update_mptable should be a macro tip-bot for Yinghai Lu
  0 siblings, 1 reply; 57+ messages in thread
From: Yinghai Lu @ 2009-05-20  7:37 UTC (permalink / raw)
  To: Ingo Molnar, Thomas Gleixner, H. Peter Anvin; +Cc: linux-kernel



Define it as a macro instead of declaring it as an inline function,
because in the other (CONFIG_X86_MPPARSE) case it is a variable.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/include/asm/mpspec.h |   11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

Index: linux-2.6/arch/x86/include/asm/mpspec.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/mpspec.h
+++ linux-2.6/arch/x86/include/asm/mpspec.h
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
 #ifdef CONFIG_X86_MPPARSE
 extern void find_smp_config(void);
 extern void early_reserve_e820_mpc_new(void);
+extern int enable_update_mptable;
 #else
 static inline void find_smp_config(void) { }
 static inline void early_reserve_e820_mpc_new(void) { }
+#define enable_update_mptable 0
 #endif
 
 void __cpuinit generic_processor_info(int apicid, int version);
@@ -87,15 +89,6 @@ static inline int acpi_probe_gsi(void)
 }
 #endif /* CONFIG_ACPI */
 
-#ifdef CONFIG_X86_MPPARSE
-extern int enable_update_mptable;
-#else
-static inline int enable_update_mptable(void)
-{
-	return 0;
-}
-#endif
-
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
 
 struct physid_mask {
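
( The point of the macro, spelled out: with CONFIG_X86_MPPARSE set,
  enable_update_mptable is a plain int variable that callers test as a
  value, so the !MPPARSE stub has to have the same shape.  A sketch with a
  hypothetical caller: )

#include <linux/init.h>
#include <linux/kernel.h>

#ifdef CONFIG_X86_MPPARSE
extern int enable_update_mptable;	/* real variable, set by a boot parameter */
#else
#define enable_update_mptable	0	/* same expression shape for callers */
#endif

static void __init maybe_update_mptable(void)
{
	/* compiles in both configs; with the old inline-function stub this
	 * test would have needed "enable_update_mptable()" instead */
	if (enable_update_mptable)
		pr_info("MP table updates enabled\n");
}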


* Re: [GIT PULL] scheduler fixes
  2009-05-18 22:06               ` Linus Torvalds
  2009-05-19 12:27                 ` Rusty Russell
@ 2009-05-24 16:13                 ` Pekka J Enberg
  2009-05-24 18:18                   ` Linus Torvalds
  2009-05-24 18:34                   ` Yinghai Lu
  1 sibling, 2 replies; 57+ messages in thread
From: Pekka J Enberg @ 2009-05-24 16:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, H. Peter Anvin, Yinghai Lu, Jeff Garzik,
	Alexander Viro, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra

On Mon, 18 May 2009, Linus Torvalds wrote:
> > > I hate that stupid bootmem allocator. I suspect we seriously 
> > > over-use it, and that we _should_ be able to do the SL*B init 
> > > earlier.
> > 
> > Hm, tempting thought - not sure how to pull it off though.
> 
> As far as I can recall, one of the things that historically made us want 
> to use the bootmem allocator even relatively late was that the real SLAB 
> allocator had to wait until all the node information etc was initialized. 
> 
> That's pretty damn late. And I wonder if SLUB (and SLOB) might not need a 
> lot less initialization, and work much earlier. Something like that might 
> be the final nail in the coffin for SLAB, and convince me to just say 
> 'we don't support it any more".

Ingo, here's a patch that boots a UMA+SMP+SLUB x86-64 kernel on qemu all 
the way to userspace. It probably breaks a bunch of things for now, but it's 
something for you to play with if you want.

			Pekka

diff --git a/init/main.c b/init/main.c
index 3bbf93b..856afa9 100644
--- a/init/main.c
+++ b/init/main.c
@@ -575,6 +575,22 @@ asmlinkage void __init start_kernel(void)
 	setup_nr_cpu_ids();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
+	build_all_zonelists();
+	page_alloc_init();
+
+	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
+	parse_early_param();
+	parse_args("Booting kernel", static_command_line, __start___param,
+		   __stop___param - __start___param,
+		   &unknown_bootoption);
+	/*
+	 * Setup kernel memory allocators
+	 */
+	pidhash_init();
+	vmalloc_init();
+	vfs_caches_init_early();
+	mem_init();
+	kmem_cache_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -586,13 +602,6 @@ asmlinkage void __init start_kernel(void)
 	 * fragile until we cpu_idle() for the first time.
 	 */
 	preempt_disable();
-	build_all_zonelists();
-	page_alloc_init();
-	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
-	parse_early_param();
-	parse_args("Booting kernel", static_command_line, __start___param,
-		   __stop___param - __start___param,
-		   &unknown_bootoption);
 	if (!irqs_disabled()) {
 		printk(KERN_WARNING "start_kernel(): bug: interrupts were "
 				"enabled *very* early, fixing it\n");
@@ -604,7 +613,6 @@ asmlinkage void __init start_kernel(void)
 	/* init some links before init_ISA_irqs() */
 	early_irq_init();
 	init_IRQ();
-	pidhash_init();
 	init_timers();
 	hrtimers_init();
 	softirq_init();
@@ -646,14 +654,10 @@ asmlinkage void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
-	vmalloc_init();
-	vfs_caches_init_early();
 	cpuset_init_early();
 	page_cgroup_init();
-	mem_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
-	kmem_cache_init();
 	kmemtrace_init();
 	debug_objects_mem_init();
 	idr_init_cache();
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 26e0875..702a696 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -18,7 +18,7 @@
 #include <linux/rculist.h>
 #include <linux/hash.h>
 #include <trace/irq.h>
-#include <linux/bootmem.h>
+#include <linux/slab.h>
 
 #include "internals.h"
 
@@ -44,7 +44,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
 static void __init init_irq_default_affinity(void)
 {
-	alloc_bootmem_cpumask_var(&irq_default_affinity);
+	alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
 	cpumask_setall(irq_default_affinity);
 }
 #else
@@ -158,12 +158,12 @@ int __init early_irq_init(void)
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
 
 	/* allocate irq_desc_ptrs array based on nr_irqs */
-	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+	irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
 
 	/* allocate based on nr_cpu_ids */
 	/* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
-	kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
-					  sizeof(int));
+	kstat_irqs_legacy = kzalloc(NR_IRQS_LEGACY * nr_cpu_ids *
+					  sizeof(int), GFP_NOWAIT);
 
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa47..b403536 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -68,7 +68,6 @@
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
@@ -7525,21 +7524,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 {
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(rd, 0, sizeof(*rd));
 
-	if (bootmem) {
-		alloc_bootmem_cpumask_var(&def_root_domain.span);
-		alloc_bootmem_cpumask_var(&def_root_domain.online);
-		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-		cpupri_init(&rd->cpupri, true);
-		return 0;
-	}
+	if (bootmem)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->span, gfp))
 		goto out;
-	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->online, gfp))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
 		goto free_online;
 
 	if (cpupri_init(&rd->cpupri, false) != 0)
@@ -8860,12 +8856,8 @@ void __init sched_init(void)
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
-	/*
-	 * As sched_init() is called before page_alloc is setup,
-	 * we use alloc_bootmem().
-	 */
 	if (alloc_size) {
-		ptr = (unsigned long)alloc_bootmem(alloc_size);
+		ptr = (unsigned long) kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.se = (struct sched_entity **)ptr;
@@ -9051,12 +9043,12 @@ void __init sched_init(void)
 	current->sched_class = &fair_sched_class;
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
 #endif
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
 	scheduler_running = 1;
diff --git a/mm/slub.c b/mm/slub.c
index 65ffda5..0ead807 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2557,13 +2557,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
 	if (gfp_flags & SLUB_DMA)
 		flags = SLAB_CACHE_DMA;
 
-	down_write(&slub_lock);
+	/*
+	 * This function is called with IRQs disabled during early-boot on
+	 * single CPU so there's no need to take slub_lock here.
+	 */
 	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
 								flags, NULL))
 		goto panic;
 
 	list_add(&s->list, &slab_caches);
-	up_write(&slub_lock);
+
 	if (sysfs_slab_add(s))
 		goto panic;
 	return s;
@@ -3021,7 +3024,7 @@ void __init kmem_cache_init(void)
 	 * kmem_cache_open for slab_state == DOWN.
 	 */
 	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
-		sizeof(struct kmem_cache_node), GFP_KERNEL);
+		sizeof(struct kmem_cache_node), GFP_NOWAIT);
 	kmalloc_caches[0].refcount = -1;
 	caches++;
 
@@ -3034,16 +3037,16 @@ void __init kmem_cache_init(void)
 	/* Caches that are not of the two-to-the-power-of size */
 	if (KMALLOC_MIN_SIZE <= 64) {
 		create_kmalloc_cache(&kmalloc_caches[1],
-				"kmalloc-96", 96, GFP_KERNEL);
+				"kmalloc-96", 96, GFP_NOWAIT);
 		caches++;
 		create_kmalloc_cache(&kmalloc_caches[2],
-				"kmalloc-192", 192, GFP_KERNEL);
+				"kmalloc-192", 192, GFP_NOWAIT);
 		caches++;
 	}
 
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
-			"kmalloc", 1 << i, GFP_KERNEL);
+			"kmalloc", 1 << i, GFP_NOWAIT);
 		caches++;
 	}
 
@@ -3080,7 +3083,7 @@ void __init kmem_cache_init(void)
 	/* Provide the correct kmalloc names now that the caches are up */
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
 		kmalloc_caches[i]. name =
-			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
+			kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
 
 #ifdef CONFIG_SMP
 	register_cpu_notifier(&slab_notifier);


* Re: [GIT PULL] scheduler fixes
  2009-05-24 16:13                 ` Pekka J Enberg
@ 2009-05-24 18:18                   ` Linus Torvalds
  2009-05-24 19:13                     ` Pekka Enberg
  2009-05-25  5:16                     ` Benjamin Herrenschmidt
  2009-05-24 18:34                   ` Yinghai Lu
  1 sibling, 2 replies; 57+ messages in thread
From: Linus Torvalds @ 2009-05-24 18:18 UTC (permalink / raw)
  To: Pekka J Enberg
  Cc: Ingo Molnar, H. Peter Anvin, Yinghai Lu, Jeff Garzik,
	Alexander Viro, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra



On Sun, 24 May 2009, Pekka J Enberg wrote:
>
> Ingo, here's a patch that boots UMA+SMP+SLUB x86-64 kernel on qemu all 
> the way to userspace. It probably breaks bunch of things for now but 
> something for you to play with if you want.

Just looking at the patch (not actually trying it out in any way or 
looking at some bigger context), this absolutely looks like the right 
direction to go in.

Our order in init/main.c is largely totally historical, and I think it 
makes tons of sense to move the memory allocation initialization up much 
earlier. 

In fact, it would be nice to perhaps try to move it even earlier. Now you 
moved it to before the scheduler init (good!), but I do wonder if it could 
be moved up to even before the setup_per_cpu_areas() etc crud. 

I realize that the allocator wants to use the per-CPU area, but if we have 
just the boot CPU area set up statically at that point, since it's only 
the boot CPU running, maybe we could do those per-cpu area allocations 
without the bootmem allocator too?

But even just getting bootmem out of the scheduler setup is a big 
improvement, I think. So this patch looks very promising as is.

Did you test whether the other allocators were ok with this too?

		Linus


* Re: [GIT PULL] scheduler fixes
  2009-05-24 16:13                 ` Pekka J Enberg
  2009-05-24 18:18                   ` Linus Torvalds
@ 2009-05-24 18:34                   ` Yinghai Lu
  2009-05-24 19:15                     ` Pekka Enberg
  2009-05-25  2:53                     ` Ingo Molnar
  1 sibling, 2 replies; 57+ messages in thread
From: Yinghai Lu @ 2009-05-24 18:34 UTC (permalink / raw)
  To: Pekka J Enberg, Linus Torvalds, Ingo Molnar
  Cc: H. Peter Anvin, Jeff Garzik, Alexander Viro, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra

Pekka J Enberg wrote:
> On Mon, 18 May 2009, Linus Torvalds wrote:
>>>> I hate that stupid bootmem allocator. I suspect we seriously 
>>>> over-use it, and that we _should_ be able to do the SL*B init 
>>>> earlier.
>>> Hm, tempting thought - not sure how to pull it off though.
>> As far as I can recall, one of the things that historically made us want 
>> to use the bootmem allocator even relatively late was that the real SLAB 
>> allocator had to wait until all the node information etc was initialized. 
>>
>> That's pretty damn late. And I wonder if SLUB (and SLOB) might not need a 
>> lot less initialization, and work much earlier. Something like that might 
>> be the final nail in the coffin for SLAB, and convince me to just say 
>> 'we don't support it any more".
> 
> Ingo, here's a patch that boots UMA+SMP+SLUB x86-64 kernel on qemu all 
> the way to userspace. It probably breaks bunch of things for now but 
> something for you to play with if you want.
> 

Updated against tip/master; also added a change to cpupri_init(),
otherwise we get:
[    0.000000] Memory: 523096612k/537526272k available (10461k kernel code, 656156k absent, 13773504k reserved, 7186k data, 2548k init)
[    0.000000] SLUB: Genslabs=14, HWalign=64, Order=0-3, MinObjects=0, CPUs=32, Nodes=8
[    0.000000] ------------[ cut here ]------------
[    0.000000] WARNING: at kernel/lockdep.c:2282 lockdep_trace_alloc+0xaf/0xee()
[    0.000000] Hardware name: Sun Fire X4600 M2
[    0.000000] Modules linked in:
[    0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-rc6-tip-01778-g0afdd0f-dirty #259
[    0.000000] Call Trace:
[    0.000000]  [<ffffffff810a0274>] ? lockdep_trace_alloc+0xaf/0xee
[    0.000000]  [<ffffffff81075ab0>] warn_slowpath_common+0x88/0xcb
[    0.000000]  [<ffffffff81075b15>] warn_slowpath_null+0x22/0x38
[    0.000000]  [<ffffffff810a0274>] lockdep_trace_alloc+0xaf/0xee
[    0.000000]  [<ffffffff8110301b>] kmem_cache_alloc_node+0x38/0x14d
[    0.000000]  [<ffffffff813ec548>] ? alloc_cpumask_var_node+0x4a/0x10a
[    0.000000]  [<ffffffff8109eb61>] ? lockdep_init_map+0xb9/0x564
[    0.000000]  [<ffffffff813ec548>] alloc_cpumask_var_node+0x4a/0x10a
[    0.000000]  [<ffffffff813ec62c>] alloc_cpumask_var+0x24/0x3a
[    0.000000]  [<ffffffff819e6306>] cpupri_init+0x7f/0x112
[    0.000000]  [<ffffffff819e5a30>] init_rootdomain+0x72/0xb7
[    0.000000]  [<ffffffff821facce>] sched_init+0x109/0x660
[    0.000000]  [<ffffffff82203082>] ? kmem_cache_init+0x193/0x1b2
[    0.000000]  [<ffffffff821dfd7a>] start_kernel+0x218/0x3f3
[    0.000000]  [<ffffffff821df2a9>] x86_64_start_reservations+0xb9/0xd4
[    0.000000]  [<ffffffff821df3b2>] x86_64_start_kernel+0xee/0x109
[    0.000000] ---[ end trace a7919e7f17c0a725 ]---

Works on an 8-socket NUMA AMD64 box.

YH

---
 init/main.c           |   28 ++++++++++++++++------------
 kernel/irq/handle.c   |   23 ++++++++---------------
 kernel/sched.c        |   34 +++++++++++++---------------------
 kernel/sched_cpupri.c |    9 ++++++---
 mm/slub.c             |   17 ++++++++++-------
 5 files changed, 53 insertions(+), 58 deletions(-)

Index: linux-2.6/init/main.c
===================================================================
--- linux-2.6.orig/init/main.c
+++ linux-2.6/init/main.c
@@ -576,6 +576,22 @@ asmlinkage void __init start_kernel(void
 	setup_nr_cpu_ids();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
+	build_all_zonelists();
+	page_alloc_init();
+
+	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
+	parse_early_param();
+	parse_args("Booting kernel", static_command_line, __start___param,
+		   __stop___param - __start___param,
+		   &unknown_bootoption);
+	/*
+	 * Setup kernel memory allocators
+	 */
+	pidhash_init();
+	vmalloc_init();
+	vfs_caches_init_early();
+	mem_init();
+	kmem_cache_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -587,13 +603,6 @@ asmlinkage void __init start_kernel(void
 	 * fragile until we cpu_idle() for the first time.
 	 */
 	preempt_disable();
-	build_all_zonelists();
-	page_alloc_init();
-	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
-	parse_early_param();
-	parse_args("Booting kernel", static_command_line, __start___param,
-		   __stop___param - __start___param,
-		   &unknown_bootoption);
 	if (!irqs_disabled()) {
 		printk(KERN_WARNING "start_kernel(): bug: interrupts were "
 				"enabled *very* early, fixing it\n");
@@ -605,7 +614,6 @@ asmlinkage void __init start_kernel(void
 	/* init some links before init_ISA_irqs() */
 	early_irq_init();
 	init_IRQ();
-	pidhash_init();
 	init_timers();
 	hrtimers_init();
 	softirq_init();
@@ -647,14 +655,10 @@ asmlinkage void __init start_kernel(void
 		initrd_start = 0;
 	}
 #endif
-	vmalloc_init();
-	vfs_caches_init_early();
 	cpuset_init_early();
 	page_cgroup_init();
-	mem_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
-	kmem_cache_init();
 	kmemtrace_init();
 	debug_objects_mem_init();
 	idr_init_cache();
Index: linux-2.6/kernel/irq/handle.c
===================================================================
--- linux-2.6.orig/kernel/irq/handle.c
+++ linux-2.6/kernel/irq/handle.c
@@ -18,7 +18,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
-#include <linux/bootmem.h>
+#include <linux/slab.h>
 #include <trace/events/irq.h>
 
 #include "internals.h"
@@ -45,7 +45,7 @@ void handle_bad_irq(unsigned int irq, st
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
 static void __init init_irq_default_affinity(void)
 {
-	alloc_bootmem_cpumask_var(&irq_default_affinity);
+	alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
 	cpumask_setall(irq_default_affinity);
 }
 #else
@@ -86,12 +86,8 @@ void __ref init_kstat_irqs(struct irq_de
 {
 	void *ptr;
 
-	if (slab_is_available())
-		ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
-				   GFP_ATOMIC, node);
-	else
-		ptr = alloc_bootmem_node(NODE_DATA(node),
-				nr * sizeof(*desc->kstat_irqs));
+	ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+			   GFP_ATOMIC, node);
 
 	/*
 	 * don't overwite if can not get new one
@@ -162,12 +158,12 @@ int __init early_irq_init(void)
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
 
 	/* allocate irq_desc_ptrs array based on nr_irqs */
-	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+	irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
 
 	/* allocate based on nr_cpu_ids */
 	/* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
-	kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
-					  sizeof(int));
+	kstat_irqs_legacy = kzalloc(NR_IRQS_LEGACY * nr_cpu_ids *
+					  sizeof(int), GFP_NOWAIT);
 
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
@@ -214,10 +210,7 @@ struct irq_desc * __ref irq_to_desc_allo
 	if (desc)
 		goto out_unlock;
 
-	if (slab_is_available())
-		desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-	else
-		desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
 
 	printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
 	if (!desc) {
Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -69,7 +69,6 @@
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
@@ -7821,24 +7820,21 @@ static void rq_attach_root(struct rq *rq
 
 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 {
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(rd, 0, sizeof(*rd));
 
-	if (bootmem) {
-		alloc_bootmem_cpumask_var(&def_root_domain.span);
-		alloc_bootmem_cpumask_var(&def_root_domain.online);
-		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-		cpupri_init(&rd->cpupri, true);
-		return 0;
-	}
+	if (bootmem)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->span, gfp))
 		goto out;
-	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->online, gfp))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
 		goto free_online;
 
-	if (cpupri_init(&rd->cpupri, false) != 0)
+	if (cpupri_init(&rd->cpupri, bootmem) != 0)
 		goto free_rto_mask;
 	return 0;
 
@@ -9157,12 +9153,8 @@ void __init sched_init(void)
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
-	/*
-	 * As sched_init() is called before page_alloc is setup,
-	 * we use alloc_bootmem().
-	 */
 	if (alloc_size) {
-		ptr = (unsigned long)alloc_bootmem(alloc_size);
+		ptr = (unsigned long) kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.se = (struct sched_entity **)ptr;
@@ -9353,13 +9345,13 @@ void __init sched_init(void)
 	current->sched_class = &fair_sched_class;
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
-	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
+	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
 	perf_counter_init();
Index: linux-2.6/mm/slub.c
===================================================================
--- linux-2.6.orig/mm/slub.c
+++ linux-2.6/mm/slub.c
@@ -2582,13 +2582,16 @@ static struct kmem_cache *create_kmalloc
 	if (gfp_flags & SLUB_DMA)
 		flags = SLAB_CACHE_DMA;
 
-	down_write(&slub_lock);
+	/*
+	 * This function is called with IRQs disabled during early-boot on
+	 * single CPU so there's no need to take slub_lock here.
+	 */
 	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
 								flags, NULL))
 		goto panic;
 
 	list_add(&s->list, &slab_caches);
-	up_write(&slub_lock);
+
 	if (sysfs_slab_add(s))
 		goto panic;
 	return s;
@@ -3048,7 +3051,7 @@ void __init kmem_cache_init(void)
 	 * kmem_cache_open for slab_state == DOWN.
 	 */
 	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
-		sizeof(struct kmem_cache_node), GFP_KERNEL);
+		sizeof(struct kmem_cache_node), GFP_NOWAIT);
 	kmalloc_caches[0].refcount = -1;
 	caches++;
 
@@ -3061,16 +3064,16 @@ void __init kmem_cache_init(void)
 	/* Caches that are not of the two-to-the-power-of size */
 	if (KMALLOC_MIN_SIZE <= 64) {
 		create_kmalloc_cache(&kmalloc_caches[1],
-				"kmalloc-96", 96, GFP_KERNEL);
+				"kmalloc-96", 96, GFP_NOWAIT);
 		caches++;
 		create_kmalloc_cache(&kmalloc_caches[2],
-				"kmalloc-192", 192, GFP_KERNEL);
+				"kmalloc-192", 192, GFP_NOWAIT);
 		caches++;
 	}
 
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
-			"kmalloc", 1 << i, GFP_KERNEL);
+			"kmalloc", 1 << i, GFP_NOWAIT);
 		caches++;
 	}
 
@@ -3107,7 +3110,7 @@ void __init kmem_cache_init(void)
 	/* Provide the correct kmalloc names now that the caches are up */
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
 		kmalloc_caches[i]. name =
-			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
+			kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
 
 #ifdef CONFIG_SMP
 	register_cpu_notifier(&slab_notifier);
Index: linux-2.6/kernel/sched_cpupri.c
===================================================================
--- linux-2.6.orig/kernel/sched_cpupri.c
+++ linux-2.6/kernel/sched_cpupri.c
@@ -156,16 +156,19 @@ int __init_refok cpupri_init(struct cpup
 {
 	int i;
 
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(cp, 0, sizeof(*cp));
 
+	if (bootmem)
+		gfp = GFP_NOWAIT;
+
 	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
 
 		spin_lock_init(&vec->lock);
 		vec->count = 0;
-		if (bootmem)
-			alloc_bootmem_cpumask_var(&vec->mask);
-		else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
+		if (!alloc_cpumask_var(&vec->mask, gfp))
 			goto cleanup;
 	}
 

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-24 18:18                   ` Linus Torvalds
@ 2009-05-24 19:13                     ` Pekka Enberg
  2009-05-25  5:16                     ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 57+ messages in thread
From: Pekka Enberg @ 2009-05-24 19:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Ingo Molnar, H. Peter Anvin, Yinghai Lu, Jeff Garzik,
	Alexander Viro, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra, Christoph Lameter, Nick Piggin,
	Matt Mackall

Hi Linus,

On Sun, 24 May 2009, Pekka J Enberg wrote:
>> Ingo, here's a patch that boots UMA+SMP+SLUB x86-64 kernel on qemu all
>> the way to userspace. It probably breaks bunch of things for now but
>> something for you to play with if you want.

On Sun, May 24, 2009 at 9:18 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> In fact, it would be nice to perhaps try to move it even earlier. Now you
> moved it to before the scheduler init (good!), but I do wonder if it could
> be moved up to even before the setup_per_cpu_areas() etc crud.

Oh, sure, we can look into that. I just wanted to take the
conservative approach because I worry about breaking a bunch of
configurations I cannot test. I suspect it's going to get pretty hairy
if we do kmem_cache_init() even earlier. Furthermore, SLUB does sysfs
setup in kmem_cache_init(), so we probably need to split slab
initialization into two stages.
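
Roughly what I have in mind, as a sketch only (kmem_cache_init_late() is
a name I'm inventing here, nothing like it exists yet): keep the early
part of kmem_cache_init() safe to run with IRQs disabled, and defer the
sysfs registration to a second stage in mm/slub.c that runs once sysfs
is up:

    void __init kmem_cache_init_late(void)
    {
            struct kmem_cache *s;

            /*
             * Second bootstrap stage: register in sysfs the caches that
             * the early stage had to skip because sysfs wasn't up yet.
             * (Error handling omitted in this sketch.)
             */
            down_write(&slub_lock);
            list_for_each_entry(s, &slab_caches, list)
                    sysfs_slab_add(s);
            up_write(&slub_lock);
    }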

On Sun, May 24, 2009 at 9:18 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> I realize that the allocator wants to use the per-CPU area, but if we have
> just the boot CPU area set up statically at that point, since it's only
> the boot CPU running, maybe we could do those per-cpu area allocations
> without the bootmem allocator too?

We probably can. I don't see any fundamental reason why slab
allocators can't bootstrap early in the boot sequence after we've set
up the page allocator.

On Sun, May 24, 2009 at 9:18 PM, Linus Torvalds
<torvalds@linux-foundation.org> wrote:
> But even just getting bootmem out of the scheduler setup is a big
> improvement, I think. So this patch looks very promising as is.
>
> Did you test whether the other allocators were ok with this too?

SLUB and SLOB are fine, but SLAB explodes. I haven't investigated it
yet, but it's probably because SLAB expects interrupts to be enabled
when kmem_cache_init() is called.
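
To show the sort of assumption I mean, this is roughly what SLAB's
bootstrap path does today (condensed from init_list() in mm/slab.c): it
disables and re-enables interrupts around the bootstrap copies, which no
longer makes sense once kmem_cache_init() itself runs with IRQs off:

    static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
                          int nodeid)
    {
            struct kmem_list3 *ptr;

            ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
            BUG_ON(!ptr);

            local_irq_disable();            /* assumes IRQs were on... */
            memcpy(ptr, list, sizeof(struct kmem_list3));
            /* do not assume spinlocks can be initialized via memcpy: */
            spin_lock_init(&ptr->list_lock);

            MAKE_ALL_LISTS(cachep, ptr, nodeid);
            cachep->nodelists[nodeid] = ptr;
            local_irq_enable();             /* ...and turns them back on too early */
    }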

                       Pekka

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-24 18:34                   ` Yinghai Lu
@ 2009-05-24 19:15                     ` Pekka Enberg
  2009-05-25  2:53                     ` Ingo Molnar
  1 sibling, 0 replies; 57+ messages in thread
From: Pekka Enberg @ 2009-05-24 19:15 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Linus Torvalds, Ingo Molnar, H. Peter Anvin, Jeff Garzik,
	Alexander Viro, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra

On Sun, May 24, 2009 at 9:34 PM, Yinghai Lu <yinghai@kernel.org> wrote:
>> Ingo, here's a patch that boots UMA+SMP+SLUB x86-64 kernel on qemu all
>> the way to userspace. It probably breaks bunch of things for now but
>> something for you to play with if you want.
>>
>
> updated with tip/master. also add change to cpupri_init
> otherwise will get
> [    0.000000] Memory: 523096612k/537526272k available (10461k kernel code, 656156k absent, 13773504k reserved, 7186k data, 2548k init)
> [    0.000000] SLUB: Genslabs=14, HWalign=64, Order=0-3, MinObjects=0, CPUs=32, Nodes=8
> [    0.000000] ------------[ cut here ]------------
> [    0.000000] WARNING: at kernel/lockdep.c:2282 lockdep_trace_alloc+0xaf/0xee()
> [    0.000000] Hardware name: Sun Fire X4600 M2
> [    0.000000] Modules linked in:
> [    0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-rc6-tip-01778-g0afdd0f-dirty #259
> [    0.000000] Call Trace:
> [    0.000000]  [<ffffffff810a0274>] ? lockdep_trace_alloc+0xaf/0xee
> [    0.000000]  [<ffffffff81075ab0>] warn_slowpath_common+0x88/0xcb
> [    0.000000]  [<ffffffff81075b15>] warn_slowpath_null+0x22/0x38
> [    0.000000]  [<ffffffff810a0274>] lockdep_trace_alloc+0xaf/0xee
> [    0.000000]  [<ffffffff8110301b>] kmem_cache_alloc_node+0x38/0x14d
> [    0.000000]  [<ffffffff813ec548>] ? alloc_cpumask_var_node+0x4a/0x10a
> [    0.000000]  [<ffffffff8109eb61>] ? lockdep_init_map+0xb9/0x564
> [    0.000000]  [<ffffffff813ec548>] alloc_cpumask_var_node+0x4a/0x10a
> [    0.000000]  [<ffffffff813ec62c>] alloc_cpumask_var+0x24/0x3a
> [    0.000000]  [<ffffffff819e6306>] cpupri_init+0x7f/0x112
> [    0.000000]  [<ffffffff819e5a30>] init_rootdomain+0x72/0xb7
> [    0.000000]  [<ffffffff821facce>] sched_init+0x109/0x660
> [    0.000000]  [<ffffffff82203082>] ? kmem_cache_init+0x193/0x1b2
> [    0.000000]  [<ffffffff821dfd7a>] start_kernel+0x218/0x3f3
> [    0.000000]  [<ffffffff821df2a9>] x86_64_start_reservations+0xb9/0xd4
> [    0.000000]  [<ffffffff821df3b2>] x86_64_start_kernel+0xee/0x109
> [    0.000000] ---[ end trace a7919e7f17c0a725 ]---
>
> works with 8 sockets numa amd64 box.

Oh, cool. Thanks for testing! I'll rebase on top of tip and merge your
changes into my patch.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-24 18:34                   ` Yinghai Lu
  2009-05-24 19:15                     ` Pekka Enberg
@ 2009-05-25  2:53                     ` Ingo Molnar
  2009-05-25  4:45                       ` Yinghai Lu
                                         ` (3 more replies)
  1 sibling, 4 replies; 57+ messages in thread
From: Ingo Molnar @ 2009-05-25  2:53 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Pekka J Enberg, Linus Torvalds, H. Peter Anvin, Jeff Garzik,
	Alexander Viro, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra


* Yinghai Lu <yinghai@kernel.org> wrote:

> Pekka J Enberg wrote:
> > On Mon, 18 May 2009, Linus Torvalds wrote:
> >>>> I hate that stupid bootmem allocator. I suspect we seriously 
> >>>> over-use it, and that we _should_ be able to do the SL*B init 
> >>>> earlier.
> >>> Hm, tempting thought - not sure how to pull it off though.
> >> As far as I can recall, one of the things that historically made us want 
> >> to use the bootmem allocator even relatively late was that the real SLAB 
> >> allocator had to wait until all the node information etc was initialized. 
> >>
> >> That's pretty damn late. And I wonder if SLUB (and SLOB) might not need a 
> >> lot less initialization, and work much earlier. Something like that might 
> >> be the final nail in the coffin for SLAB, and convince me to just say 
> >> 'we don't support it any more".
> > 
> > Ingo, here's a patch that boots UMA+SMP+SLUB x86-64 kernel on qemu all 
> > the way to userspace. It probably breaks bunch of things for now but 
> > something for you to play with if you want.
> > 
> 
> updated with tip/master. also add change to cpupri_init
> otherwise will get 
> [    0.000000] Memory: 523096612k/537526272k available (10461k kernel code, 656156k absent, 13773504k reserved, 7186k data, 2548k init)
> [    0.000000] SLUB: Genslabs=14, HWalign=64, Order=0-3, MinObjects=0, CPUs=32, Nodes=8
> [    0.000000] ------------[ cut here ]------------
> [    0.000000] WARNING: at kernel/lockdep.c:2282 lockdep_trace_alloc+0xaf/0xee()
> [    0.000000] Hardware name: Sun Fire X4600 M2
> [    0.000000] Modules linked in:
> [    0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-rc6-tip-01778-g0afdd0f-dirty #259
> [    0.000000] Call Trace:
> [    0.000000]  [<ffffffff810a0274>] ? lockdep_trace_alloc+0xaf/0xee
> [    0.000000]  [<ffffffff81075ab0>] warn_slowpath_common+0x88/0xcb
> [    0.000000]  [<ffffffff81075b15>] warn_slowpath_null+0x22/0x38
> [    0.000000]  [<ffffffff810a0274>] lockdep_trace_alloc+0xaf/0xee
> [    0.000000]  [<ffffffff8110301b>] kmem_cache_alloc_node+0x38/0x14d
> [    0.000000]  [<ffffffff813ec548>] ? alloc_cpumask_var_node+0x4a/0x10a
> [    0.000000]  [<ffffffff8109eb61>] ? lockdep_init_map+0xb9/0x564
> [    0.000000]  [<ffffffff813ec548>] alloc_cpumask_var_node+0x4a/0x10a
> [    0.000000]  [<ffffffff813ec62c>] alloc_cpumask_var+0x24/0x3a
> [    0.000000]  [<ffffffff819e6306>] cpupri_init+0x7f/0x112
> [    0.000000]  [<ffffffff819e5a30>] init_rootdomain+0x72/0xb7
> [    0.000000]  [<ffffffff821facce>] sched_init+0x109/0x660
> [    0.000000]  [<ffffffff82203082>] ? kmem_cache_init+0x193/0x1b2
> [    0.000000]  [<ffffffff821dfd7a>] start_kernel+0x218/0x3f3
> [    0.000000]  [<ffffffff821df2a9>] x86_64_start_reservations+0xb9/0xd4
> [    0.000000]  [<ffffffff821df3b2>] x86_64_start_kernel+0xee/0x109
> [    0.000000] ---[ end trace a7919e7f17c0a725 ]---
> 
> works with 8 sockets numa amd64 box.
> 
> YH
> 
> ---
>  init/main.c           |   28 ++++++++++++++++------------
>  kernel/irq/handle.c   |   23 ++++++++---------------
>  kernel/sched.c        |   34 +++++++++++++---------------------
>  kernel/sched_cpupri.c |    9 ++++++---
>  mm/slub.c             |   17 ++++++++++-------
>  5 files changed, 53 insertions(+), 58 deletions(-)

Very nice!

Would it be possible to restructure things to move kmalloc init to 
before IRQ init as well? We have a couple of uglinesses there too.

Conceptually, memory should be the first thing set up in general, in 
a kernel. It does not need IRQs, timers, the scheduler or any of the 
IO facilities and abstractions. All of them need memory though - and 
as Linux scales to more and more hardware via the same single image, 
so will we get more and more dynamic concepts like cpumask_var_t and 
sparse-irqs, which want to allocate very early.

setup_arch() is one huge function that sets up all architecture 
details at once - but if we split a separate setup_arch_mem() out of 
it, and left the rest in setup_arch (and moved it further down), we 
could remove much of bootmem (especially the ugly uses).

This might even be doable realistically, and we could thus librarize 
bootmem and eliminate it from x86 at least. Perhaps.
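
Something along these lines, purely as a sketch of the intended
ordering (setup_arch_mem()/setup_arch_late() do not exist - the names
and the exact split are invented here, and most of start_kernel() is
omitted):

    asmlinkage void __init start_kernel(void)
    {
            char *command_line;

            /* memory first: the split-out, memory-only part of setup_arch() */
            setup_arch_mem(&command_line);
            build_all_zonelists();
            page_alloc_init();
            mem_init();
            kmem_cache_init();              /* kmalloc() usable from here on */

            sched_init();                   /* no bootmem needed any more */
            early_irq_init();
            init_IRQ();

            /* the rest of today's setup_arch(), moved further down */
            setup_arch_late(&command_line);

            rest_init();
    }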

	Ingo

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  2:53                     ` Ingo Molnar
@ 2009-05-25  4:45                       ` Yinghai Lu
  2009-05-25  5:15                         ` Ingo Molnar
  2009-05-25  4:52                       ` H. Peter Anvin
                                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 57+ messages in thread
From: Yinghai Lu @ 2009-05-25  4:45 UTC (permalink / raw)
  To: Ingo Molnar, Pekka J Enberg, Rusty Russell
  Cc: Linus Torvalds, H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra

Ingo Molnar wrote:
> * Yinghai Lu <yinghai@kernel.org> wrote:
> 
>> Pekka J Enberg wrote:
>>> On Mon, 18 May 2009, Linus Torvalds wrote:
>>>>>> I hate that stupid bootmem allocator. I suspect we seriously 
>>>>>> over-use it, and that we _should_ be able to do the SL*B init 
>>>>>> earlier.
>>>>> Hm, tempting thought - not sure how to pull it off though.
>>>> As far as I can recall, one of the things that historically made us want 
>>>> to use the bootmem allocator even relatively late was that the real SLAB 
>>>> allocator had to wait until all the node information etc was initialized. 
>>>>
>>>> That's pretty damn late. And I wonder if SLUB (and SLOB) might not need a 
>>>> lot less initialization, and work much earlier. Something like that might 
>>>> be the final nail in the coffin for SLAB, and convince me to just say 
>>>> 'we don't support it any more".
>>> Ingo, here's a patch that boots UMA+SMP+SLUB x86-64 kernel on qemu all 
>>> the way to userspace. It probably breaks bunch of things for now but 
>>> something for you to play with if you want.
>>>
>> updated with tip/master. also add change to cpupri_init
>> otherwise will get 
>> [    0.000000] Memory: 523096612k/537526272k available (10461k kernel code, 656156k absent, 13773504k reserved, 7186k data, 2548k init)
>> [    0.000000] SLUB: Genslabs=14, HWalign=64, Order=0-3, MinObjects=0, CPUs=32, Nodes=8
>> [    0.000000] ------------[ cut here ]------------
>> [    0.000000] WARNING: at kernel/lockdep.c:2282 lockdep_trace_alloc+0xaf/0xee()
>> [    0.000000] Hardware name: Sun Fire X4600 M2
>> [    0.000000] Modules linked in:
>> [    0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-rc6-tip-01778-g0afdd0f-dirty #259
>> [    0.000000] Call Trace:
>> [    0.000000]  [<ffffffff810a0274>] ? lockdep_trace_alloc+0xaf/0xee
>> [    0.000000]  [<ffffffff81075ab0>] warn_slowpath_common+0x88/0xcb
>> [    0.000000]  [<ffffffff81075b15>] warn_slowpath_null+0x22/0x38
>> [    0.000000]  [<ffffffff810a0274>] lockdep_trace_alloc+0xaf/0xee
>> [    0.000000]  [<ffffffff8110301b>] kmem_cache_alloc_node+0x38/0x14d
>> [    0.000000]  [<ffffffff813ec548>] ? alloc_cpumask_var_node+0x4a/0x10a
>> [    0.000000]  [<ffffffff8109eb61>] ? lockdep_init_map+0xb9/0x564
>> [    0.000000]  [<ffffffff813ec548>] alloc_cpumask_var_node+0x4a/0x10a
>> [    0.000000]  [<ffffffff813ec62c>] alloc_cpumask_var+0x24/0x3a
>> [    0.000000]  [<ffffffff819e6306>] cpupri_init+0x7f/0x112
>> [    0.000000]  [<ffffffff819e5a30>] init_rootdomain+0x72/0xb7
>> [    0.000000]  [<ffffffff821facce>] sched_init+0x109/0x660
>> [    0.000000]  [<ffffffff82203082>] ? kmem_cache_init+0x193/0x1b2
>> [    0.000000]  [<ffffffff821dfd7a>] start_kernel+0x218/0x3f3
>> [    0.000000]  [<ffffffff821df2a9>] x86_64_start_reservations+0xb9/0xd4
>> [    0.000000]  [<ffffffff821df3b2>] x86_64_start_kernel+0xee/0x109
>> [    0.000000] ---[ end trace a7919e7f17c0a725 ]---
>>
>> works with 8 sockets numa amd64 box.
>>
>> YH
>>
>> ---
>>  init/main.c           |   28 ++++++++++++++++------------
>>  kernel/irq/handle.c   |   23 ++++++++---------------
>>  kernel/sched.c        |   34 +++++++++++++---------------------
>>  kernel/sched_cpupri.c |    9 ++++++---
>>  mm/slub.c             |   17 ++++++++++-------
>>  5 files changed, 53 insertions(+), 58 deletions(-)
> 
> Very nice!
> 
> Would it be possible to restructure things to move kmalloc init to 
> before IRQ init as well? We have a couple of uglinesses there too.
> 
> Conceptually, memory should be the first thing set up in general, in 
> a kernel. It does not need IRQs, timers, the scheduler or any of the 
> IO facilities and abstractions. All of them need memory though - and 
> as Linux scales to more and more hardware via the same single image, 
> so will we get more and more dynamic concepts like cpumask_var_t and 
> sparse-irqs, which want to allocate very early.

Pekka's patch already makes kmalloc available before early_irq_init()/init_IRQ()...

We can clean up alloc_desc_masks(), and
alloc_cpumask_var_node() could be much simplified too.

[PATCH] x86: remove some alloc_bootmem_cpumask_var() calls

except for some that are called from setup_per_cpu_areas()...

Signed-off-by: Yinghai Lu <yinghai@kernel.org>

---
 arch/x86/kernel/apic/io_apic.c |    4 ++--
 include/linux/irq.h            |   18 +++++++-----------
 kernel/cpuset.c                |    2 +-
 kernel/profile.c               |    6 ------
 lib/cpumask.c                  |   11 ++---------
 5 files changed, 12 insertions(+), 29 deletions(-)

Index: linux-2.6/include/linux/irq.h
===================================================================
--- linux-2.6.orig/include/linux/irq.h
+++ linux-2.6/include/linux/irq.h
@@ -430,23 +430,19 @@ extern int set_irq_msi(unsigned int irq,
  * Returns true if successful (or not required).
  */
 static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
-								bool boot)
+							bool boot)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	if (boot) {
-		alloc_bootmem_cpumask_var(&desc->affinity);
+	gfp_t gfp = GFP_ATOMIC;
 
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-		alloc_bootmem_cpumask_var(&desc->pending_mask);
-#endif
-		return true;
-	}
+	if (boot)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
 		return false;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+	if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
 		free_cpumask_var(desc->affinity);
 		return false;
 	}
Index: linux-2.6/lib/cpumask.c
===================================================================
--- linux-2.6.orig/lib/cpumask.c
+++ linux-2.6/lib/cpumask.c
@@ -92,15 +92,8 @@ int cpumask_any_but(const struct cpumask
  */
 bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 {
-	if (likely(slab_is_available()))
-		*mask = kmalloc_node(cpumask_size(), flags, node);
-	else {
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-		printk(KERN_ERR
-			"=> alloc_cpumask_var: kmalloc not available!\n");
-#endif
-		*mask = NULL;
-	}
+	*mask = kmalloc_node(cpumask_size(), flags, node);
+
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 	if (!*mask) {
 		printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
Index: linux-2.6/arch/x86/kernel/apic/io_apic.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/apic/io_apic.c
+++ linux-2.6/arch/x86/kernel/apic/io_apic.c
@@ -185,8 +185,8 @@ int __init arch_early_irq_init(void)
 	for (i = 0; i < count; i++) {
 		desc = irq_to_desc(i);
 		desc->chip_data = &cfg[i];
-		alloc_bootmem_cpumask_var(&cfg[i].domain);
-		alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+		alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT);
+		alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT);
 		if (i < NR_IRQS_LEGACY)
 			cpumask_setall(cfg[i].domain);
 	}
Index: linux-2.6/kernel/cpuset.c
===================================================================
--- linux-2.6.orig/kernel/cpuset.c
+++ linux-2.6/kernel/cpuset.c
@@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = {
 
 int __init cpuset_init_early(void)
 {
-	alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
+	alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
 
 	top_cpuset.mems_generation = cpuset_mems_generation++;
 	return 0;
Index: linux-2.6/kernel/profile.c
===================================================================
--- linux-2.6.orig/kernel/profile.c
+++ linux-2.6/kernel/profile.c
@@ -111,12 +111,6 @@ int __ref profile_init(void)
 	/* only text is profiled */
 	prof_len = (_etext - _stext) >> prof_shift;
 	buffer_bytes = prof_len*sizeof(atomic_t);
-	if (!slab_is_available()) {
-		prof_buffer = alloc_bootmem(buffer_bytes);
-		alloc_bootmem_cpumask_var(&prof_cpu_mask);
-		cpumask_copy(prof_cpu_mask, cpu_possible_mask);
-		return 0;
-	}
 
 	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
 		return -ENOMEM;



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  2:53                     ` Ingo Molnar
  2009-05-25  4:45                       ` Yinghai Lu
@ 2009-05-25  4:52                       ` H. Peter Anvin
  2009-05-25  5:05                         ` Ingo Molnar
  2009-05-25  5:13                         ` Yinghai Lu
  2009-05-25  5:19                       ` Benjamin Herrenschmidt
  2009-05-25  7:16                       ` Rusty Russell
  3 siblings, 2 replies; 57+ messages in thread
From: H. Peter Anvin @ 2009-05-25  4:52 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Yinghai Lu, Pekka J Enberg, Linus Torvalds, Jeff Garzik,
	Alexander Viro, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra

Ingo Molnar wrote:
> 
> Would it be possible to restructure things to move kmalloc init to 
> before IRQ init as well? We have a couple of uglinesses there too.
> 
> Conceptually, memory should be the first thing set up in general, in 
> a kernel. It does not need IRQs, timers, the scheduler or any of the 
> IO facilities and abstractions. All of them need memory though - and 
> as Linux scales to more and more hardware via the same single image, 
> so will we get more and more dynamic concepts like cpumask_var_t and 
> sparse-irqs, which want to allocate very early.
> 
> setup_arch() is one huge function that sets up all architecture 
> details at once - but if we split a separate setup_arch_mem() out of 
> it, and left the rest in setup_arch (and moved it further down), we 
> could remove much of bootmem (especially the ugly uses).
> 
> This might even be doable realistically, and we could thus librarize 
> bootmem and eliminate it from x86 at least. Perhaps.
> 

The only thing that might make sense to set up before memory would be
exceptions (as opposed to interrupts), but both of those should be
doable very, very early.

	-hpa

-- 
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  4:52                       ` H. Peter Anvin
@ 2009-05-25  5:05                         ` Ingo Molnar
  2009-05-25  5:13                         ` Yinghai Lu
  1 sibling, 0 replies; 57+ messages in thread
From: Ingo Molnar @ 2009-05-25  5:05 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Yinghai Lu, Pekka J Enberg, Linus Torvalds, Jeff Garzik,
	Alexander Viro, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra


* H. Peter Anvin <hpa@zytor.com> wrote:

> Ingo Molnar wrote:
> > 
> > Would it be possible to restructure things to move kmalloc init to 
> > before IRQ init as well? We have a couple of uglinesses there too.
> > 
> > Conceptually, memory should be the first thing set up in general, in 
> > a kernel. It does not need IRQs, timers, the scheduler or any of the 
> > IO facilities and abstractions. All of them need memory though - and 
> > as Linux scales to more and more hardware via the same single image, 
> > so will we get more and more dynamic concepts like cpumask_var_t and 
> > sparse-irqs, which want to allocate very early.
> > 
> > setup_arch() is one huge function that sets up all architecture 
> > details at once - but if we split a separate setup_arch_mem() out of 
> > it, and left the rest in setup_arch (and moved it further down), we 
> > could remove much of bootmem (especially the ugly uses).
> > 
> > This might even be doable realistically, and we could thus librarize 
> > bootmem and eliminate it from x86 at least. Perhaps.
> > 
> 
> The only thing that might make sense to set up before memory might 
> be exceptions (as opposed to interrupts), but both of those should 
> be doable very very early.

Yeah. And we already have early exception handlers to help debugging 
so there's practically no dependency on memory init. (other than the 
full kernel image being executable by the CPU - so some very minimal 
memory/paging init is necessary)

	Ingo

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  4:52                       ` H. Peter Anvin
  2009-05-25  5:05                         ` Ingo Molnar
@ 2009-05-25  5:13                         ` Yinghai Lu
  1 sibling, 0 replies; 57+ messages in thread
From: Yinghai Lu @ 2009-05-25  5:13 UTC (permalink / raw)
  To: H. Peter Anvin
  Cc: Ingo Molnar, Pekka J Enberg, Linus Torvalds, Jeff Garzik,
	Alexander Viro, Rusty Russell, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra

H. Peter Anvin wrote:
> Ingo Molnar wrote:
>> Would it be possible to restructure things to move kmalloc init to 
>> before IRQ init as well? We have a couple of uglinesses there too.
>>
>> Conceptually, memory should be the first thing set up in general, in 
>> a kernel. It does not need IRQs, timers, the scheduler or any of the 
>> IO facilities and abstractions. All of them need memory though - and 
>> as Linux scales to more and more hardware via the same single image, 
>> so will we get more and more dynamic concepts like cpumask_var_t and 
>> sparse-irqs, which want to allocate very early.
>>
>> setup_arch() is one huge function that sets up all architecture 
>> details at once - but if we split a separate setup_arch_mem() out of 
>> it, and left the rest in setup_arch (and moved it further down), we 
>> could remove much of bootmem (especially the ugly uses).
>>
>> This might even be doable realistically, and we could thus librarize 
>> bootmem and eliminate it from x86 at least. Perhaps.
>>
> 
> The only thing that might make sense to set up before memory might be
> exceptions (as opposed to interrupts), but both of those should be
> doable very very early.
> 

put trap_init() right after setup_arch() in start_kernel()?

YH

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  4:45                       ` Yinghai Lu
@ 2009-05-25  5:15                         ` Ingo Molnar
  2009-05-25  5:54                           ` Yinghai Lu
  2009-05-25  8:47                           ` Pekka J Enberg
  0 siblings, 2 replies; 57+ messages in thread
From: Ingo Molnar @ 2009-05-25  5:15 UTC (permalink / raw)
  To: Yinghai Lu
  Cc: Pekka J Enberg, Rusty Russell, Linus Torvalds, H. Peter Anvin,
	Jeff Garzik, Alexander Viro, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra


* Yinghai Lu <yinghai@kernel.org> wrote:

> Ingo Molnar wrote:
> > * Yinghai Lu <yinghai@kernel.org> wrote:
> > 
> >> Pekka J Enberg wrote:
> >>> On Mon, 18 May 2009, Linus Torvalds wrote:
> >>>>>> I hate that stupid bootmem allocator. I suspect we seriously 
> >>>>>> over-use it, and that we _should_ be able to do the SL*B init 
> >>>>>> earlier.
> >>>>> Hm, tempting thought - not sure how to pull it off though.
> >>>> As far as I can recall, one of the things that historically made us want 
> >>>> to use the bootmem allocator even relatively late was that the real SLAB 
> >>>> allocator had to wait until all the node information etc was initialized. 
> >>>>
> >>>> That's pretty damn late. And I wonder if SLUB (and SLOB) might not need a 
> >>>> lot less initialization, and work much earlier. Something like that might 
> >>>> be the final nail in the coffin for SLAB, and convince me to just say 
> >>>> 'we don't support it any more".
> >>> Ingo, here's a patch that boots UMA+SMP+SLUB x86-64 kernel on qemu all 
> >>> the way to userspace. It probably breaks bunch of things for now but 
> >>> something for you to play with if you want.
> >>>
> >> updated with tip/master. also add change to cpupri_init
> >> otherwise will get 
> >> [    0.000000] Memory: 523096612k/537526272k available (10461k kernel code, 656156k absent, 13773504k reserved, 7186k data, 2548k init)
> >> [    0.000000] SLUB: Genslabs=14, HWalign=64, Order=0-3, MinObjects=0, CPUs=32, Nodes=8
> >> [    0.000000] ------------[ cut here ]------------
> >> [    0.000000] WARNING: at kernel/lockdep.c:2282 lockdep_trace_alloc+0xaf/0xee()
> >> [    0.000000] Hardware name: Sun Fire X4600 M2
> >> [    0.000000] Modules linked in:
> >> [    0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-rc6-tip-01778-g0afdd0f-dirty #259
> >> [    0.000000] Call Trace:
> >> [    0.000000]  [<ffffffff810a0274>] ? lockdep_trace_alloc+0xaf/0xee
> >> [    0.000000]  [<ffffffff81075ab0>] warn_slowpath_common+0x88/0xcb
> >> [    0.000000]  [<ffffffff81075b15>] warn_slowpath_null+0x22/0x38
> >> [    0.000000]  [<ffffffff810a0274>] lockdep_trace_alloc+0xaf/0xee
> >> [    0.000000]  [<ffffffff8110301b>] kmem_cache_alloc_node+0x38/0x14d
> >> [    0.000000]  [<ffffffff813ec548>] ? alloc_cpumask_var_node+0x4a/0x10a
> >> [    0.000000]  [<ffffffff8109eb61>] ? lockdep_init_map+0xb9/0x564
> >> [    0.000000]  [<ffffffff813ec548>] alloc_cpumask_var_node+0x4a/0x10a
> >> [    0.000000]  [<ffffffff813ec62c>] alloc_cpumask_var+0x24/0x3a
> >> [    0.000000]  [<ffffffff819e6306>] cpupri_init+0x7f/0x112
> >> [    0.000000]  [<ffffffff819e5a30>] init_rootdomain+0x72/0xb7
> >> [    0.000000]  [<ffffffff821facce>] sched_init+0x109/0x660
> >> [    0.000000]  [<ffffffff82203082>] ? kmem_cache_init+0x193/0x1b2
> >> [    0.000000]  [<ffffffff821dfd7a>] start_kernel+0x218/0x3f3
> >> [    0.000000]  [<ffffffff821df2a9>] x86_64_start_reservations+0xb9/0xd4
> >> [    0.000000]  [<ffffffff821df3b2>] x86_64_start_kernel+0xee/0x109
> >> [    0.000000] ---[ end trace a7919e7f17c0a725 ]---
> >>
> >> works with 8 sockets numa amd64 box.
> >>
> >> YH
> >>
> >> ---
> >>  init/main.c           |   28 ++++++++++++++++------------
> >>  kernel/irq/handle.c   |   23 ++++++++---------------
> >>  kernel/sched.c        |   34 +++++++++++++---------------------
> >>  kernel/sched_cpupri.c |    9 ++++++---
> >>  mm/slub.c             |   17 ++++++++++-------
> >>  5 files changed, 53 insertions(+), 58 deletions(-)
> > 
> > Very nice!
> > 
> > Would it be possible to restructure things to move kmalloc init to 
> > before IRQ init as well? We have a couple of uglinesses there too.
> > 
> > Conceptually, memory should be the first thing set up in general, in 
> > a kernel. It does not need IRQs, timers, the scheduler or any of the 
> > IO facilities and abstractions. All of them need memory though - and 
> > as Linux scales to more and more hardware via the same single image, 
> > so will we get more and more dynamic concepts like cpumask_var_t and 
> > sparse-irqs, which want to allocate very early.
> 
> Pekka's patch already made kmalloc before early_irq_init()/init_IRQ...
> 
> we can clean up alloc_desc_masks and
> alloc_cpumask_var_node could be much simplified too.

That's nice!

Ok, i think this all looks pretty realistic - but there's quite a 
bit of layering on top of pending changes in the x86 and irq trees. 
We could do this on top of those topic branches in -tip, and rebase 
in the merge window. Or delay it to .32.

... plus i think we are _very_ close to being able to remove all of 
bootmem on x86 (with some compatibility/migration mechanism in 
place). Which bootmem calls do we have before kmalloc init with 
Pekka's patch applied? I think it's mostly the page table init code.
 
( beyond the page allocator internal use - where we could use 
  straight e820 based APIs that clip memory off from the beginning 
  of existing e820 RAM ranges - enriched with NUMA/SRAT locality 
  info. )
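
To make that concrete - a sketch only, the wrapper name is invented and
the NUMA/SRAT part is hand-waved, but it would sit on the early_res
bits that already exist in arch/x86/kernel/e820.c:

    /* clip 'size' bytes off an existing e820 RAM range, early in boot */
    static u64 __init early_alloc_from_e820(u64 size, u64 align)
    {
            u64 addr = find_e820_area(0, (u64)max_pfn_mapped << PAGE_SHIFT,
                                      size, align);

            if (addr == -1ULL)
                    return 0;
            reserve_early(addr, addr + size, "early alloc");
            return addr;    /* physical address; use __va() as needed */
    }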

	Ingo

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-24 18:18                   ` Linus Torvalds
  2009-05-24 19:13                     ` Pekka Enberg
@ 2009-05-25  5:16                     ` Benjamin Herrenschmidt
  1 sibling, 0 replies; 57+ messages in thread
From: Benjamin Herrenschmidt @ 2009-05-25  5:16 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Pekka J Enberg, Ingo Molnar, H. Peter Anvin, Yinghai Lu,
	Jeff Garzik, Alexander Viro, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra

On Sun, 2009-05-24 at 11:18 -0700, Linus Torvalds wrote:
> In fact, it would be nice to perhaps try to move it even earlier. Now you 
> moved it to before the scheduler init (good!), but I do wonder if it could 
> be moved up to even before the setup_per_cpu_areas() etc crud. 
> 
> I realize that the allocator wants to use the per-CPU area, but if we have 
> just the boot CPU area set up statically at that point, since it's only 
> the boot CPU running, maybe we could do those per-cpu area allocations 
> without the bootmem allocator too?

Well, we want at least node information since we want per-cpu areas
to be allocated on the right node etc...

But then, bootmem has them, so we should be able to feed them off
to SL*B early.

One thing I'm wondering... Most archs I see have their own allocator
for before bootmem is even available. On PowerPC and Sparc, we call it
LMB and it's in fact in generic code now. x86 seems to have several
layers, but the e820 early allocator seems to fit a similar bill.

I wonder if we could try to shoot bootmem that way.

With a blend of Pekka's approach, which can drastically reduce how much
we need bootmem, for the remaining bits such as the SL*B allocators' own
data structures and the mem_map, the arch would be responsible for
providing a simple API for node-local allocations that is roughly
equivalent to whatever bits of bootmem remain and are needed.

That API wraps on top of whatever the arch already has for early boot
stuff.
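
Roughly, the contract could be as small as this (a sketch - the names
are invented; on powerpc/sparc it would sit on top of LMB, on x86 on
the e820/early_res code):

    /* arch provides this for "before the page and slab allocators are up" */
    void * __init arch_early_alloc_node(unsigned long size, int nid);

    void * __init early_alloc_node(unsigned long size, int nid)
    {
            if (slab_is_available())
                    return kmalloc_node(size, GFP_NOWAIT, nid);

            return arch_early_alloc_node(size, nid);
    }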

Finally, we can keep bootmem around in lib/ or such for archs that
don't want to convert or don't have an existing suitable early
allocator.

Cheers,
Ben.





^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  2:53                     ` Ingo Molnar
  2009-05-25  4:45                       ` Yinghai Lu
  2009-05-25  4:52                       ` H. Peter Anvin
@ 2009-05-25  5:19                       ` Benjamin Herrenschmidt
  2009-05-25  7:16                       ` Rusty Russell
  3 siblings, 0 replies; 57+ messages in thread
From: Benjamin Herrenschmidt @ 2009-05-25  5:19 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Yinghai Lu, Pekka J Enberg, Linus Torvalds, H. Peter Anvin,
	Jeff Garzik, Alexander Viro, Rusty Russell,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra

On Mon, 2009-05-25 at 04:53 +0200, Ingo Molnar wrote:
> 
> Would it be possible to restructure things to move kmalloc init to 
> before IRQ init as well? We have a couple of uglinesses there too.

Amen :-)

> Conceptually, memory should be the first thing set up in general, in 
> a kernel. It does not need IRQs, timers, the scheduler or any of the 
> IO facilities and abstractions. All of them need memory though - and 
> as Linux scales to more and more hardware via the same single image, 
> so will we get more and more dynamic concepts like cpumask_var_t and 
> sparse-irqs, which want to allocate very early.
> 
> setup_arch() is one huge function that sets up all architecture 
> details at once - but if we split a separate setup_arch_mem() out of 
> it, and left the rest in setup_arch (and moved it further down), we 
> could remove much of bootmem (especially the ugly uses).
> 
> This might even be doable realistically, and we could thus librarize 
> bootmem and eliminate it from x86 at least. Perhaps.

Yup, see my earlier email. Archs like x86 and powerpc already have a low
level allocator they can use to allocate the mem_map etc... so bootmem
really becomes redundant.

Cheers
Ben.



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  5:15                         ` Ingo Molnar
@ 2009-05-25  5:54                           ` Yinghai Lu
  2009-05-25  8:47                           ` Pekka J Enberg
  1 sibling, 0 replies; 57+ messages in thread
From: Yinghai Lu @ 2009-05-25  5:54 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Pekka J Enberg, Rusty Russell, Linus Torvalds, H. Peter Anvin,
	Jeff Garzik, Alexander Viro, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra

Ingo Molnar wrote:
> 
> Ok, i think this all looks pretty realistic - but there's quite a 
> bit of layering on top of pending changes in the x86 and irq trees. 
> We could do this on top of those topic branches in -tip, and rebase 
> in the merge window. Or delay it to .32.

We would have to move setup_per_cpu_areas() after mem_init(), and
somehow limit the bootmem-related calls in setup_arch().

> 
> ... plus i think we are _very_ close to being able to remove all of 
> bootmem on x86 (with some compatibility/migration mechanism in 
> place). Which bootmem calls do we have before kmalloc init with 
> Pekka's patch applied? I think it's mostly the page table init code.

We need to decide what should go into setup_arch_mem() and what into
setup_arch_rest():
  before initmem_init()  ==>  setup_arch_mem()
  after initmem_init(): the reserve_bootmem-related calls should stay in
setup_arch_mem(); try to move the other calls in setup_arch() into
_rest(), and then setup_arch_rest() would be called after mem_init().

>  
> ( beyond the page allocator internal use - where we could use 
>   straight e820 based APIs that clip memory off from the beginning 
>   of existing e820 RAM ranges - enriched with NUMA/SRAT locality 
>   info. )

Yes, it is there. We need to make the early_res array dynamic.

YH

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  2:53                     ` Ingo Molnar
                                         ` (2 preceding siblings ...)
  2009-05-25  5:19                       ` Benjamin Herrenschmidt
@ 2009-05-25  7:16                       ` Rusty Russell
  3 siblings, 0 replies; 57+ messages in thread
From: Rusty Russell @ 2009-05-25  7:16 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Yinghai Lu, Pekka J Enberg, Linus Torvalds, H. Peter Anvin,
	Jeff Garzik, Alexander Viro, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra

On Mon, 25 May 2009 12:23:53 pm Ingo Molnar wrote:
> setup_arch() is one huge function that sets up all architecture
> details at once - but if we split a separate setup_arch_mem() out of
> it, and left the rest in setup_arch (and moved it further down), we
> could remove much of bootmem (especially the ugly uses).

cmdline parsing needs to be available even before mem setup.  I waded
into this earlier and broke almost every arch, but if there's renewed 
enthusiasm I could dig out those patches.

(Search for "arch_get_boot_cmdline()").
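
Something along these lines, just to illustrate the idea (a sketch, not
necessarily what those old patches actually did):

    /* arch hands over the command line before any memory setup;
     * weak default for archs that don't need anything special */
    char * __weak __init arch_get_boot_cmdline(void)
    {
            return boot_command_line;
    }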

Cheers,
Rusty.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  5:15                         ` Ingo Molnar
  2009-05-25  5:54                           ` Yinghai Lu
@ 2009-05-25  8:47                           ` Pekka J Enberg
  2009-05-25 11:25                             ` Nick Piggin
                                               ` (2 more replies)
  1 sibling, 3 replies; 57+ messages in thread
From: Pekka J Enberg @ 2009-05-25  8:47 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Yinghai Lu, Rusty Russell, Linus Torvalds, H. Peter Anvin,
	Jeff Garzik, Alexander Viro, Linux Kernel Mailing List,
	Andrew Morton, Peter Zijlstra, cl, npiggin, mpm

On Mon, 25 May 2009, Ingo Molnar wrote:
> Ok, i think this all looks pretty realistic - but there's quite a 
> bit of layering on top of pending changes in the x86 and irq trees. 
> We could do this on top of those topic branches in -tip, and rebase 
> in the merge window. Or delay it to .32.
> 
> ... plus i think we are _very_ close to being able to remove all of 
> bootmem on x86 (with some compatibility/migration mechanism in 
> place). Which bootmem calls do we have before kmalloc init with 
> Pekka's patch applied? I think it's mostly the page table init code.
>  
> ( beyond the page allocator internal use - where we could use 
>   straight e820 based APIs that clip memory off from the beginning 
>   of existing e820 RAM ranges - enriched with NUMA/SRAT locality 
>   info. )

OK, here's a version of the patch with Yinghai's rebasing on top of 
tip/master. All in-kernel memory allocators boot cleanly now on my 
configuration (I did not try SLQB but it probably needs fixing). I would 
appreciate it if someone gave a SLAB+NUMA configuration a try.

Ingo, I don't really want to carry this in slab.git so perhaps you could 
put this and Yinghai's irq init cleanup in an "earlyslab" branch in tip for 
testing?

And oh, Christoph/Matt, can I have your NAK/ACK for this patch?

			Pekka

From f5338db5f1b959cb82ed811037a545e19c7b2b7b Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Mon, 25 May 2009 09:44:39 +0300
Subject: [PATCH] slab: setup allocators earlier in the boot sequence

This patch makes kmalloc() available earlier in the boot sequence so we can get
rid of some bootmem allocations. The bulk of the changes are due to
kmem_cache_init() being called with interrupts disabled, which requires some
changes to allocator bootstrap code.

[ yinghai@kernel.org: rebase to tip ]
Cc: Nick Piggin <npiggin@suse.de>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 init/main.c           |   28 +++++++++-------
 kernel/irq/handle.c   |   23 +++++---------
 kernel/sched.c        |   34 +++++++------------
 kernel/sched_cpupri.c |    9 +++--
 mm/slab.c             |   85 ++++++++++++++++++++++++++-----------------------
 mm/slub.c             |   17 ++++++----
 6 files changed, 98 insertions(+), 98 deletions(-)

diff --git a/init/main.c b/init/main.c
index 33ce929..e7a9c18 100644
--- a/init/main.c
+++ b/init/main.c
@@ -576,6 +576,22 @@ asmlinkage void __init start_kernel(void)
 	setup_nr_cpu_ids();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
+	build_all_zonelists();
+	page_alloc_init();
+
+	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
+	parse_early_param();
+	parse_args("Booting kernel", static_command_line, __start___param,
+		   __stop___param - __start___param,
+		   &unknown_bootoption);
+	/*
+	 * Setup kernel memory allocators
+	 */
+	pidhash_init();
+	vmalloc_init();
+	vfs_caches_init_early();
+	mem_init();
+	kmem_cache_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -587,13 +603,6 @@ asmlinkage void __init start_kernel(void)
 	 * fragile until we cpu_idle() for the first time.
 	 */
 	preempt_disable();
-	build_all_zonelists();
-	page_alloc_init();
-	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
-	parse_early_param();
-	parse_args("Booting kernel", static_command_line, __start___param,
-		   __stop___param - __start___param,
-		   &unknown_bootoption);
 	if (!irqs_disabled()) {
 		printk(KERN_WARNING "start_kernel(): bug: interrupts were "
 				"enabled *very* early, fixing it\n");
@@ -605,7 +614,6 @@ asmlinkage void __init start_kernel(void)
 	/* init some links before init_ISA_irqs() */
 	early_irq_init();
 	init_IRQ();
-	pidhash_init();
 	init_timers();
 	hrtimers_init();
 	softirq_init();
@@ -647,14 +655,10 @@ asmlinkage void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
-	vmalloc_init();
-	vfs_caches_init_early();
 	cpuset_init_early();
 	page_cgroup_init();
-	mem_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
-	kmem_cache_init();
 	kmemtrace_init();
 	debug_objects_mem_init();
 	idr_init_cache();
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a600184..6cc19a9 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -18,7 +18,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
-#include <linux/bootmem.h>
+#include <linux/slab.h>
 #include <trace/events/irq.h>
 
 #include "internals.h"
@@ -45,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
 static void __init init_irq_default_affinity(void)
 {
-	alloc_bootmem_cpumask_var(&irq_default_affinity);
+	alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
 	cpumask_setall(irq_default_affinity);
 }
 #else
@@ -86,12 +86,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
 {
 	void *ptr;
 
-	if (slab_is_available())
-		ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
-				   GFP_ATOMIC, node);
-	else
-		ptr = alloc_bootmem_node(NODE_DATA(node),
-				nr * sizeof(*desc->kstat_irqs));
+	ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+			   GFP_ATOMIC, node);
 
 	/*
 	 * don't overwite if can not get new one
@@ -162,12 +158,12 @@ int __init early_irq_init(void)
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
 
 	/* allocate irq_desc_ptrs array based on nr_irqs */
-	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+	irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
 
 	/* allocate based on nr_cpu_ids */
 	/* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
-	kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
-					  sizeof(int));
+	kstat_irqs_legacy = kzalloc(NR_IRQS_LEGACY * nr_cpu_ids *
+					  sizeof(int), GFP_NOWAIT);
 
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
@@ -214,10 +210,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
 	if (desc)
 		goto out_unlock;
 
-	if (slab_is_available())
-		desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-	else
-		desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
 
 	printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
 	if (!desc) {
diff --git a/kernel/sched.c b/kernel/sched.c
index f93305b..645280c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -69,7 +69,6 @@
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
@@ -7821,24 +7820,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 {
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(rd, 0, sizeof(*rd));
 
-	if (bootmem) {
-		alloc_bootmem_cpumask_var(&def_root_domain.span);
-		alloc_bootmem_cpumask_var(&def_root_domain.online);
-		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-		cpupri_init(&rd->cpupri, true);
-		return 0;
-	}
+	if (bootmem)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->span, gfp))
 		goto out;
-	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->online, gfp))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
 		goto free_online;
 
-	if (cpupri_init(&rd->cpupri, false) != 0)
+	if (cpupri_init(&rd->cpupri, bootmem) != 0)
 		goto free_rto_mask;
 	return 0;
 
@@ -9157,12 +9153,8 @@ void __init sched_init(void)
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
-	/*
-	 * As sched_init() is called before page_alloc is setup,
-	 * we use alloc_bootmem().
-	 */
 	if (alloc_size) {
-		ptr = (unsigned long)alloc_bootmem(alloc_size);
+		ptr = (unsigned long) kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.se = (struct sched_entity **)ptr;
@@ -9353,13 +9345,13 @@ void __init sched_init(void)
 	current->sched_class = &fair_sched_class;
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
-	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
+	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
 	perf_counter_init();
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index cdd3c89..5ba8e32 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -156,16 +156,19 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
 {
 	int i;
 
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(cp, 0, sizeof(*cp));
 
+	if (bootmem)
+		gfp = GFP_NOWAIT;
+
 	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
 
 		spin_lock_init(&vec->lock);
 		vec->count = 0;
-		if (bootmem)
-			alloc_bootmem_cpumask_var(&vec->mask);
-		else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
+		if (!alloc_cpumask_var(&vec->mask, gfp))
 			goto cleanup;
 	}
 
diff --git a/mm/slab.c b/mm/slab.c
index 762acc2..ab5039a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -316,7 +316,7 @@ static int drain_freelist(struct kmem_cache *cache,
 			struct kmem_list3 *l3, int tofree);
 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 			int node);
-static int enable_cpucache(struct kmem_cache *cachep);
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
 static void cache_reap(struct work_struct *unused);
 
 /*
@@ -878,12 +878,12 @@ static void __cpuinit start_cpu_timer(int cpu)
 }
 
 static struct array_cache *alloc_arraycache(int node, int entries,
-					    int batchcount)
+					    int batchcount, gfp_t gfp)
 {
 	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 	struct array_cache *nc = NULL;
 
-	nc = kmalloc_node(memsize, GFP_KERNEL, node);
+	nc = kmalloc_node(memsize, gfp, node);
 	if (nc) {
 		nc->avail = 0;
 		nc->limit = entries;
@@ -923,7 +923,7 @@ static int transfer_objects(struct array_cache *to,
 #define drain_alien_cache(cachep, alien) do { } while (0)
 #define reap_alien(cachep, l3) do { } while (0)
 
-static inline struct array_cache **alloc_alien_cache(int node, int limit)
+static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
 	return (struct array_cache **)BAD_ALIEN_MAGIC;
 }
@@ -954,7 +954,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
-static struct array_cache **alloc_alien_cache(int node, int limit)
+static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
 	struct array_cache **ac_ptr;
 	int memsize = sizeof(void *) * nr_node_ids;
@@ -962,14 +962,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)
 
 	if (limit > 1)
 		limit = 12;
-	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+	ac_ptr = kmalloc_node(memsize, gfp, node);
 	if (ac_ptr) {
 		for_each_node(i) {
 			if (i == node || !node_online(i)) {
 				ac_ptr[i] = NULL;
 				continue;
 			}
-			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
+			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
 			if (!ac_ptr[i]) {
 				for (i--; i >= 0; i--)
 					kfree(ac_ptr[i]);
@@ -1202,20 +1202,20 @@ static int __cpuinit cpuup_prepare(long cpu)
 		struct array_cache **alien = NULL;
 
 		nc = alloc_arraycache(node, cachep->limit,
-					cachep->batchcount);
+					cachep->batchcount, GFP_KERNEL);
 		if (!nc)
 			goto bad;
 		if (cachep->shared) {
 			shared = alloc_arraycache(node,
 				cachep->shared * cachep->batchcount,
-				0xbaadf00d);
+				0xbaadf00d, GFP_KERNEL);
 			if (!shared) {
 				kfree(nc);
 				goto bad;
 			}
 		}
 		if (use_alien_caches) {
-			alien = alloc_alien_cache(node, cachep->limit);
+			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
 			if (!alien) {
 				kfree(shared);
 				kfree(nc);
@@ -1319,10 +1319,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 {
 	struct kmem_list3 *ptr;
 
-	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
 	BUG_ON(!ptr);
 
-	local_irq_disable();
 	memcpy(ptr, list, sizeof(struct kmem_list3));
 	/*
 	 * Do not assume that spinlocks can be initialized via memcpy:
@@ -1331,7 +1330,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 
 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
 	cachep->nodelists[nodeid] = ptr;
-	local_irq_enable();
 }
 
 /*
@@ -1495,9 +1493,8 @@ void __init kmem_cache_init(void)
 	{
 		struct array_cache *ptr;
 
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
 
-		local_irq_disable();
 		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
 		memcpy(ptr, cpu_cache_get(&cache_cache),
 		       sizeof(struct arraycache_init));
@@ -1507,11 +1504,9 @@ void __init kmem_cache_init(void)
 		spin_lock_init(&ptr->lock);
 
 		cache_cache.array[smp_processor_id()] = ptr;
-		local_irq_enable();
 
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
 
-		local_irq_disable();
 		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
 		       != &initarray_generic.cache);
 		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1523,7 +1518,6 @@ void __init kmem_cache_init(void)
 
 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
 		    ptr;
-		local_irq_enable();
 	}
 	/* 5) Replace the bootstrap kmem_list3's */
 	{
@@ -1547,7 +1541,7 @@ void __init kmem_cache_init(void)
 		struct kmem_cache *cachep;
 		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next)
-			if (enable_cpucache(cachep))
+			if (enable_cpucache(cachep, GFP_NOWAIT))
 				BUG();
 		mutex_unlock(&cache_chain_mutex);
 	}
@@ -1996,10 +1990,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 	return left_over;
 }
 
-static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
+static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 {
 	if (g_cpucache_up == FULL)
-		return enable_cpucache(cachep);
+		return enable_cpucache(cachep, gfp);
 
 	if (g_cpucache_up == NONE) {
 		/*
@@ -2021,7 +2015,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
 			g_cpucache_up = PARTIAL_AC;
 	} else {
 		cachep->array[smp_processor_id()] =
-			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+			kmalloc(sizeof(struct arraycache_init), gfp);
 
 		if (g_cpucache_up == PARTIAL_AC) {
 			set_up_list3s(cachep, SIZE_L3);
@@ -2085,6 +2079,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 {
 	size_t left_over, slab_size, ralign;
 	struct kmem_cache *cachep = NULL, *pc;
+	gfp_t gfp;
 
 	/*
 	 * Sanity checks... these are all serious usage bugs.
@@ -2100,8 +2095,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * We use cache_chain_mutex to ensure a consistent view of
 	 * cpu_online_mask as well.  Please see cpuup_callback
 	 */
-	get_online_cpus();
-	mutex_lock(&cache_chain_mutex);
+	if (slab_is_available()) {
+		get_online_cpus();
+		mutex_lock(&cache_chain_mutex);
+	}
 
 	list_for_each_entry(pc, &cache_chain, next) {
 		char tmp;
@@ -2210,8 +2207,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 */
 	align = ralign;
 
+	if (slab_is_available())
+		gfp = GFP_KERNEL;
+	else
+		gfp = GFP_NOWAIT;
+
 	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
+	cachep = kmem_cache_zalloc(&cache_cache, gfp);
 	if (!cachep)
 		goto oops;
 
@@ -2314,7 +2316,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->ctor = ctor;
 	cachep->name = name;
 
-	if (setup_cpu_cache(cachep)) {
+	if (setup_cpu_cache(cachep, gfp)) {
 		__kmem_cache_destroy(cachep);
 		cachep = NULL;
 		goto oops;
@@ -2326,8 +2328,10 @@ oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
-	mutex_unlock(&cache_chain_mutex);
-	put_online_cpus();
+	if (slab_is_available()) {
+		mutex_unlock(&cache_chain_mutex);
+		put_online_cpus();
+	}
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -3742,7 +3746,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
 /*
  * This initializes kmem_list3 or resizes various caches for all nodes.
  */
-static int alloc_kmemlist(struct kmem_cache *cachep)
+static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
 {
 	int node;
 	struct kmem_list3 *l3;
@@ -3752,7 +3756,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 	for_each_online_node(node) {
 
                 if (use_alien_caches) {
-                        new_alien = alloc_alien_cache(node, cachep->limit);
+                        new_alien = alloc_alien_cache(node, cachep->limit, gfp);
                         if (!new_alien)
                                 goto fail;
                 }
@@ -3761,7 +3765,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 		if (cachep->shared) {
 			new_shared = alloc_arraycache(node,
 				cachep->shared*cachep->batchcount,
-					0xbaadf00d);
+					0xbaadf00d, gfp);
 			if (!new_shared) {
 				free_alien_cache(new_alien);
 				goto fail;
@@ -3790,7 +3794,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 			free_alien_cache(new_alien);
 			continue;
 		}
-		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+		l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
 		if (!l3) {
 			free_alien_cache(new_alien);
 			kfree(new_shared);
@@ -3846,18 +3850,18 @@ static void do_ccupdate_local(void *info)
 
 /* Always called with the cache_chain_mutex held */
 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
-				int batchcount, int shared)
+				int batchcount, int shared, gfp_t gfp)
 {
 	struct ccupdate_struct *new;
 	int i;
 
-	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	new = kzalloc(sizeof(*new), gfp);
 	if (!new)
 		return -ENOMEM;
 
 	for_each_online_cpu(i) {
 		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
-						batchcount);
+						batchcount, gfp);
 		if (!new->new[i]) {
 			for (i--; i >= 0; i--)
 				kfree(new->new[i]);
@@ -3884,11 +3888,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 		kfree(ccold);
 	}
 	kfree(new);
-	return alloc_kmemlist(cachep);
+	return alloc_kmemlist(cachep, gfp);
 }
 
 /* Called with cache_chain_mutex held always */
-static int enable_cpucache(struct kmem_cache *cachep)
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
 {
 	int err;
 	int limit, shared;
@@ -3934,7 +3938,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
 	if (limit > 32)
 		limit = 32;
 #endif
-	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
+	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
 	if (err)
 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
 		       cachep->name, -err);
@@ -4240,7 +4244,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 				res = 0;
 			} else {
 				res = do_tune_cpucache(cachep, limit,
-						       batchcount, shared);
+						       batchcount, shared,
+						       GFP_KERNEL);
 			}
 			break;
 		}
diff --git a/mm/slub.c b/mm/slub.c
index 44bc77c..3cc5e98 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2582,13 +2582,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
 	if (gfp_flags & SLUB_DMA)
 		flags = SLAB_CACHE_DMA;
 
-	down_write(&slub_lock);
+	/*
+	 * This function is called with IRQs disabled during early-boot on
+	 * single CPU so there's no need to take slub_lock here.
+	 */
 	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
 								flags, NULL))
 		goto panic;
 
 	list_add(&s->list, &slab_caches);
-	up_write(&slub_lock);
+
 	if (sysfs_slab_add(s))
 		goto panic;
 	return s;
@@ -3048,7 +3051,7 @@ void __init kmem_cache_init(void)
 	 * kmem_cache_open for slab_state == DOWN.
 	 */
 	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
-		sizeof(struct kmem_cache_node), GFP_KERNEL);
+		sizeof(struct kmem_cache_node), GFP_NOWAIT);
 	kmalloc_caches[0].refcount = -1;
 	caches++;
 
@@ -3061,16 +3064,16 @@ void __init kmem_cache_init(void)
 	/* Caches that are not of the two-to-the-power-of size */
 	if (KMALLOC_MIN_SIZE <= 64) {
 		create_kmalloc_cache(&kmalloc_caches[1],
-				"kmalloc-96", 96, GFP_KERNEL);
+				"kmalloc-96", 96, GFP_NOWAIT);
 		caches++;
 		create_kmalloc_cache(&kmalloc_caches[2],
-				"kmalloc-192", 192, GFP_KERNEL);
+				"kmalloc-192", 192, GFP_NOWAIT);
 		caches++;
 	}
 
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
-			"kmalloc", 1 << i, GFP_KERNEL);
+			"kmalloc", 1 << i, GFP_NOWAIT);
 		caches++;
 	}
 
@@ -3107,7 +3110,7 @@ void __init kmem_cache_init(void)
 	/* Provide the correct kmalloc names now that the caches are up */
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
 		kmalloc_caches[i]. name =
-			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
+			kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
 
 #ifdef CONFIG_SMP
 	register_cpu_notifier(&slab_notifier);
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  8:47                           ` Pekka J Enberg
@ 2009-05-25 11:25                             ` Nick Piggin
  2009-05-25 11:37                               ` Pekka Enberg
  2009-05-25 14:55                             ` Matt Mackall
  2009-05-26 14:27                             ` Christoph Lameter
  2 siblings, 1 reply; 57+ messages in thread
From: Nick Piggin @ 2009-05-25 11:25 UTC (permalink / raw)
  To: Pekka J Enberg
  Cc: Ingo Molnar, Yinghai Lu, Rusty Russell, Linus Torvalds,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

On Mon, May 25, 2009 at 11:47:40AM +0300, Pekka Enberg wrote:
> On Mon, 25 May 2009, Ingo Molnar wrote:
> > Ok, i think this all looks pretty realistic - but there's quite a 
> > bit of layering on top of pending changes in the x86 and irq trees. 
> > We could do this on top of those topic branches in -tip, and rebase 
> > in the merge window. Or delay it to .32.
> > 
> > ... plus i think we are _very_ close to being able to remove all of 
> > bootmem on x86 (with some compatibility/migration mechanism in 
> > place). Which bootmem calls do we have before kmalloc init with 
> > Pekka's patch applied? I think it's mostly the page table init code.
> >  
> > ( beyond the page allocator internal use - where we could use 
> >   straight e820 based APIs that clip memory off from the beginning 
> >   of existing e820 RAM ranges - enriched with NUMA/SRAT locality 
> >   info. )
> 
> OK, here's a version of the patch with Yinghai's rebasing on top of 
> tip/master. All in-kernel memory allocators boot cleanly now on my 
> configuration (I did not try SLQB but it probably needs fixing). I would 
> appreciate it if someone gave the SLAB+NUMA configuration a try.
> 
> Ingo, I don't really want to carry this in slab.git so perhaps you could 
> put this and Yinghai's irq init cleanup in an "earlyslab" branch in tip for 
> testing?
> 
> And oh, Christoph/Matt, can I have your NAK/ACK for this patch?
> 
> 			Pekka
> 
> >From f5338db5f1b959cb82ed811037a545e19c7b2b7b Mon Sep 17 00:00:00 2001
> From: Pekka Enberg <penberg@cs.helsinki.fi>
> Date: Mon, 25 May 2009 09:44:39 +0300
> Subject: [PATCH] slab: setup allocators earlier in the boot sequence
> 
> This patch makes kmalloc() available earlier in the boot sequence so we can get
> rid of some bootmem allocations. The bulk of the changes are due to
> kmem_cache_init() being called with interrupts disabled, which requires some
> changes to allocator bootstrap code.
> 
> [ yinghai@kernel.org: rebase to tip ]
> Cc: Nick Piggin <npiggin@suse.de>
> Cc: Matt Mackall <mpm@selenic.com>
> Cc: Christoph Lameter <cl@linux-foundation.com>
> Cc: Linus Torvalds <torvals@linux-foundation.org>
> Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
> ---
>  init/main.c           |   28 +++++++++-------
>  kernel/irq/handle.c   |   23 +++++---------
>  kernel/sched.c        |   34 +++++++------------
>  kernel/sched_cpupri.c |    9 +++--
>  mm/slab.c             |   85 ++++++++++++++++++++++++++-----------------------
>  mm/slub.c             |   17 ++++++----
>  6 files changed, 98 insertions(+), 98 deletions(-)
> 
> diff --git a/init/main.c b/init/main.c
> index 33ce929..e7a9c18 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -576,6 +576,22 @@ asmlinkage void __init start_kernel(void)
>  	setup_nr_cpu_ids();
>  	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
>  
> +	build_all_zonelists();
> +	page_alloc_init();
> +
> +	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
> +	parse_early_param();
> +	parse_args("Booting kernel", static_command_line, __start___param,
> +		   __stop___param - __start___param,
> +		   &unknown_bootoption);
> +	/*
> +	 * Setup kernel memory allocators
> +	 */
> +	pidhash_init();
> +	vmalloc_init();
> +	vfs_caches_init_early();
> +	mem_init();
> +	kmem_cache_init();

Looks quite OK to me. The comment above may be misleading?

The bootmem allocations of course are required because some hashes may
need to be larger than MAX_ORDER without using vmalloc.
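
(As a rough illustration, assuming the usual MAX_ORDER of 11 with 4 KiB
pages: the buddy allocator tops out at 2^(MAX_ORDER-1) = 1024 contiguous
pages, i.e. 4 MiB, so any system hash bigger than that has to come from
bootmem or vmalloc.)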

kmem_cache_init (and mem_init, partially) sets up the kernel memory
allocators...

I can take a look at the SLQB side.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 11:25                             ` Nick Piggin
@ 2009-05-25 11:37                               ` Pekka Enberg
  2009-05-25 11:41                                 ` Nick Piggin
  0 siblings, 1 reply; 57+ messages in thread
From: Pekka Enberg @ 2009-05-25 11:37 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Yinghai Lu, Rusty Russell, Linus Torvalds,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

On Mon, May 25, 2009 at 2:25 PM, Nick Piggin <npiggin@suse.de> wrote:
>> @@ -576,6 +576,22 @@ asmlinkage void __init start_kernel(void)
>>       setup_nr_cpu_ids();
>>       smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */
>>
>> +     build_all_zonelists();
>> +     page_alloc_init();
>> +
>> +     printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
>> +     parse_early_param();
>> +     parse_args("Booting kernel", static_command_line, __start___param,
>> +                __stop___param - __start___param,
>> +                &unknown_bootoption);
>> +     /*
>> +      * Setup kernel memory allocators
>> +      */
>> +     pidhash_init();
>> +     vmalloc_init();
>> +     vfs_caches_init_early();
>> +     mem_init();
>> +     kmem_cache_init();

On Mon, May 25, 2009 at 2:25 PM, Nick Piggin <npiggin@suse.de> wrote:
> Looks quite OK to me. The comment above may be misleading?
>
> The bootmem allocations of course are required because some hashes may
> need to be larger than MAX_ORDER without using vmalloc.
>
> kmem_cache_init (and mem_init, partially) sets up the kernel memory
> allocators...

Which comment are you talking about? The "setup kernel memory
allocators" one? Yeah, I should probably move it couple of lines down.
It's just that we absolutely need to do the other ones before calling
mem_init() and then kmem_cache_init().

On Mon, May 25, 2009 at 2:25 PM, Nick Piggin <npiggin@suse.de> wrote:
> I can take a look at the SLQB side.

Great!

                        Pekka

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 11:37                               ` Pekka Enberg
@ 2009-05-25 11:41                                 ` Nick Piggin
  2009-05-25 11:44                                   ` Pekka J Enberg
  2009-05-25 12:04                                   ` Pekka J Enberg
  0 siblings, 2 replies; 57+ messages in thread
From: Nick Piggin @ 2009-05-25 11:41 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Ingo Molnar, Yinghai Lu, Rusty Russell, Linus Torvalds,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

On Mon, May 25, 2009 at 02:37:59PM +0300, Pekka Enberg wrote:
> On Mon, May 25, 2009 at 2:25 PM, Nick Piggin <npiggin@suse.de> wrote:
> > The bootmem allocations of course are required because some hashes may
> > need to be larger than MAX_ORDER without using vmalloc.
> >
> > kmem_cache_init (and mem_init, partially) sets up the kernel memory
> > allocators...
> 
> Which comment are you talking about? The "setup kernel memory
> allocators" one? Yeah, I should probably move it couple of lines down.

Yes, just that one.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 11:41                                 ` Nick Piggin
@ 2009-05-25 11:44                                   ` Pekka J Enberg
  2009-05-25 15:01                                     ` Matt Mackall
  2009-05-25 16:39                                     ` Linus Torvalds
  2009-05-25 12:04                                   ` Pekka J Enberg
  1 sibling, 2 replies; 57+ messages in thread
From: Pekka J Enberg @ 2009-05-25 11:44 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Yinghai Lu, Rusty Russell, Linus Torvalds,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

On Mon, May 25, 2009 at 2:25 PM, Nick Piggin <npiggin@suse.de> wrote:
> > > The bootmem allocations of course are required because some hashes may
> > > need to be larger than MAX_ORDER without using vmalloc.
> > >
> > > kmem_cache_init (and mem_init, partially) sets up the kernel memory
> > > allocators...
> > 

On Mon, May 25, 2009 at 02:37:59PM +0300, Pekka Enberg wrote:
> > Which comment are you talking about? The "setup kernel memory
> > allocators" one? Yeah, I should probably move it couple of lines down.

On Mon, 25 May 2009, Nick Piggin wrote:
> Yes, just that one.

Here's an updated patch that does that. Thanks!

			Pekka

>From fbd1061276b607d6eca65d4bc50d7d5c0d2e27ce Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Mon, 25 May 2009 09:44:39 +0300
Subject: [PATCH] slab: setup allocators earlier in the boot sequence

This patch makes kmalloc() available earlier in the boot sequence so we can get
rid of some bootmem allocations. The bulk of the changes are due to
kmem_cache_init() being called with interrupts disabled, which requires some
changes to allocator bootstrap code.

[ yinghai@kernel.org: rebase to tip ]
Cc: Nick Piggin <npiggin@suse.de>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Christoph Lameter <cl@linux-foundation.com>
Cc: Linus Torvalds <torvals@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 init/main.c           |   28 +++++++++-------
 kernel/irq/handle.c   |   23 +++++---------
 kernel/sched.c        |   34 +++++++------------
 kernel/sched_cpupri.c |    9 +++--
 mm/slab.c             |   85 ++++++++++++++++++++++++++-----------------------
 mm/slub.c             |   17 ++++++----
 6 files changed, 98 insertions(+), 98 deletions(-)

diff --git a/init/main.c b/init/main.c
index 33ce929..fb0e004 100644
--- a/init/main.c
+++ b/init/main.c
@@ -576,6 +576,22 @@ asmlinkage void __init start_kernel(void)
 	setup_nr_cpu_ids();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
+	build_all_zonelists();
+	page_alloc_init();
+
+	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
+	parse_early_param();
+	parse_args("Booting kernel", static_command_line, __start___param,
+		   __stop___param - __start___param,
+		   &unknown_bootoption);
+	pidhash_init();
+	vmalloc_init();
+	vfs_caches_init_early();
+	/*
+	 * Set up kernel memory allocators
+	 */
+	mem_init();
+	kmem_cache_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -587,13 +603,6 @@ asmlinkage void __init start_kernel(void)
 	 * fragile until we cpu_idle() for the first time.
 	 */
 	preempt_disable();
-	build_all_zonelists();
-	page_alloc_init();
-	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
-	parse_early_param();
-	parse_args("Booting kernel", static_command_line, __start___param,
-		   __stop___param - __start___param,
-		   &unknown_bootoption);
 	if (!irqs_disabled()) {
 		printk(KERN_WARNING "start_kernel(): bug: interrupts were "
 				"enabled *very* early, fixing it\n");
@@ -605,7 +614,6 @@ asmlinkage void __init start_kernel(void)
 	/* init some links before init_ISA_irqs() */
 	early_irq_init();
 	init_IRQ();
-	pidhash_init();
 	init_timers();
 	hrtimers_init();
 	softirq_init();
@@ -647,14 +655,10 @@ asmlinkage void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
-	vmalloc_init();
-	vfs_caches_init_early();
 	cpuset_init_early();
 	page_cgroup_init();
-	mem_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
-	kmem_cache_init();
 	kmemtrace_init();
 	debug_objects_mem_init();
 	idr_init_cache();
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a600184..6cc19a9 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -18,7 +18,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
-#include <linux/bootmem.h>
+#include <linux/slab.h>
 #include <trace/events/irq.h>
 
 #include "internals.h"
@@ -45,7 +45,7 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
 static void __init init_irq_default_affinity(void)
 {
-	alloc_bootmem_cpumask_var(&irq_default_affinity);
+	alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
 	cpumask_setall(irq_default_affinity);
 }
 #else
@@ -86,12 +86,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
 {
 	void *ptr;
 
-	if (slab_is_available())
-		ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
-				   GFP_ATOMIC, node);
-	else
-		ptr = alloc_bootmem_node(NODE_DATA(node),
-				nr * sizeof(*desc->kstat_irqs));
+	ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+			   GFP_ATOMIC, node);
 
 	/*
 	 * don't overwite if can not get new one
@@ -162,12 +158,12 @@ int __init early_irq_init(void)
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
 
 	/* allocate irq_desc_ptrs array based on nr_irqs */
-	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+	irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
 
 	/* allocate based on nr_cpu_ids */
 	/* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
-	kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
-					  sizeof(int));
+	kstat_irqs_legacy = kzalloc(NR_IRQS_LEGACY * nr_cpu_ids *
+					  sizeof(int), GFP_NOWAIT);
 
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
@@ -214,10 +210,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
 	if (desc)
 		goto out_unlock;
 
-	if (slab_is_available())
-		desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-	else
-		desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
+	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
 
 	printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
 	if (!desc) {
diff --git a/kernel/sched.c b/kernel/sched.c
index f93305b..645280c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -69,7 +69,6 @@
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
@@ -7821,24 +7820,21 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 {
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(rd, 0, sizeof(*rd));
 
-	if (bootmem) {
-		alloc_bootmem_cpumask_var(&def_root_domain.span);
-		alloc_bootmem_cpumask_var(&def_root_domain.online);
-		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-		cpupri_init(&rd->cpupri, true);
-		return 0;
-	}
+	if (bootmem)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->span, gfp))
 		goto out;
-	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->online, gfp))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
 		goto free_online;
 
-	if (cpupri_init(&rd->cpupri, false) != 0)
+	if (cpupri_init(&rd->cpupri, bootmem) != 0)
 		goto free_rto_mask;
 	return 0;
 
@@ -9157,12 +9153,8 @@ void __init sched_init(void)
 #ifdef CONFIG_CPUMASK_OFFSTACK
 	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
-	/*
-	 * As sched_init() is called before page_alloc is setup,
-	 * we use alloc_bootmem().
-	 */
 	if (alloc_size) {
-		ptr = (unsigned long)alloc_bootmem(alloc_size);
+		ptr = (unsigned long) kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.se = (struct sched_entity **)ptr;
@@ -9353,13 +9345,13 @@ void __init sched_init(void)
 	current->sched_class = &fair_sched_class;
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
-	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
+	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
 	perf_counter_init();
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index cdd3c89..5ba8e32 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -156,16 +156,19 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
 {
 	int i;
 
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(cp, 0, sizeof(*cp));
 
+	if (bootmem)
+		gfp = GFP_NOWAIT;
+
 	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
 		struct cpupri_vec *vec = &cp->pri_to_cpu[i];
 
 		spin_lock_init(&vec->lock);
 		vec->count = 0;
-		if (bootmem)
-			alloc_bootmem_cpumask_var(&vec->mask);
-		else if (!alloc_cpumask_var(&vec->mask, GFP_KERNEL))
+		if (!alloc_cpumask_var(&vec->mask, gfp))
 			goto cleanup;
 	}
 
diff --git a/mm/slab.c b/mm/slab.c
index 762acc2..ab5039a 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -316,7 +316,7 @@ static int drain_freelist(struct kmem_cache *cache,
 			struct kmem_list3 *l3, int tofree);
 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 			int node);
-static int enable_cpucache(struct kmem_cache *cachep);
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
 static void cache_reap(struct work_struct *unused);
 
 /*
@@ -878,12 +878,12 @@ static void __cpuinit start_cpu_timer(int cpu)
 }
 
 static struct array_cache *alloc_arraycache(int node, int entries,
-					    int batchcount)
+					    int batchcount, gfp_t gfp)
 {
 	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 	struct array_cache *nc = NULL;
 
-	nc = kmalloc_node(memsize, GFP_KERNEL, node);
+	nc = kmalloc_node(memsize, gfp, node);
 	if (nc) {
 		nc->avail = 0;
 		nc->limit = entries;
@@ -923,7 +923,7 @@ static int transfer_objects(struct array_cache *to,
 #define drain_alien_cache(cachep, alien) do { } while (0)
 #define reap_alien(cachep, l3) do { } while (0)
 
-static inline struct array_cache **alloc_alien_cache(int node, int limit)
+static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
 	return (struct array_cache **)BAD_ALIEN_MAGIC;
 }
@@ -954,7 +954,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
-static struct array_cache **alloc_alien_cache(int node, int limit)
+static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
 	struct array_cache **ac_ptr;
 	int memsize = sizeof(void *) * nr_node_ids;
@@ -962,14 +962,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)
 
 	if (limit > 1)
 		limit = 12;
-	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+	ac_ptr = kmalloc_node(memsize, gfp, node);
 	if (ac_ptr) {
 		for_each_node(i) {
 			if (i == node || !node_online(i)) {
 				ac_ptr[i] = NULL;
 				continue;
 			}
-			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
+			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
 			if (!ac_ptr[i]) {
 				for (i--; i >= 0; i--)
 					kfree(ac_ptr[i]);
@@ -1202,20 +1202,20 @@ static int __cpuinit cpuup_prepare(long cpu)
 		struct array_cache **alien = NULL;
 
 		nc = alloc_arraycache(node, cachep->limit,
-					cachep->batchcount);
+					cachep->batchcount, GFP_KERNEL);
 		if (!nc)
 			goto bad;
 		if (cachep->shared) {
 			shared = alloc_arraycache(node,
 				cachep->shared * cachep->batchcount,
-				0xbaadf00d);
+				0xbaadf00d, GFP_KERNEL);
 			if (!shared) {
 				kfree(nc);
 				goto bad;
 			}
 		}
 		if (use_alien_caches) {
-			alien = alloc_alien_cache(node, cachep->limit);
+			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
 			if (!alien) {
 				kfree(shared);
 				kfree(nc);
@@ -1319,10 +1319,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 {
 	struct kmem_list3 *ptr;
 
-	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
 	BUG_ON(!ptr);
 
-	local_irq_disable();
 	memcpy(ptr, list, sizeof(struct kmem_list3));
 	/*
 	 * Do not assume that spinlocks can be initialized via memcpy:
@@ -1331,7 +1330,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 
 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
 	cachep->nodelists[nodeid] = ptr;
-	local_irq_enable();
 }
 
 /*
@@ -1495,9 +1493,8 @@ void __init kmem_cache_init(void)
 	{
 		struct array_cache *ptr;
 
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
 
-		local_irq_disable();
 		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
 		memcpy(ptr, cpu_cache_get(&cache_cache),
 		       sizeof(struct arraycache_init));
@@ -1507,11 +1504,9 @@ void __init kmem_cache_init(void)
 		spin_lock_init(&ptr->lock);
 
 		cache_cache.array[smp_processor_id()] = ptr;
-		local_irq_enable();
 
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
 
-		local_irq_disable();
 		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
 		       != &initarray_generic.cache);
 		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1523,7 +1518,6 @@ void __init kmem_cache_init(void)
 
 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
 		    ptr;
-		local_irq_enable();
 	}
 	/* 5) Replace the bootstrap kmem_list3's */
 	{
@@ -1547,7 +1541,7 @@ void __init kmem_cache_init(void)
 		struct kmem_cache *cachep;
 		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next)
-			if (enable_cpucache(cachep))
+			if (enable_cpucache(cachep, GFP_NOWAIT))
 				BUG();
 		mutex_unlock(&cache_chain_mutex);
 	}
@@ -1996,10 +1990,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 	return left_over;
 }
 
-static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
+static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 {
 	if (g_cpucache_up == FULL)
-		return enable_cpucache(cachep);
+		return enable_cpucache(cachep, gfp);
 
 	if (g_cpucache_up == NONE) {
 		/*
@@ -2021,7 +2015,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
 			g_cpucache_up = PARTIAL_AC;
 	} else {
 		cachep->array[smp_processor_id()] =
-			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+			kmalloc(sizeof(struct arraycache_init), gfp);
 
 		if (g_cpucache_up == PARTIAL_AC) {
 			set_up_list3s(cachep, SIZE_L3);
@@ -2085,6 +2079,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 {
 	size_t left_over, slab_size, ralign;
 	struct kmem_cache *cachep = NULL, *pc;
+	gfp_t gfp;
 
 	/*
 	 * Sanity checks... these are all serious usage bugs.
@@ -2100,8 +2095,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * We use cache_chain_mutex to ensure a consistent view of
 	 * cpu_online_mask as well.  Please see cpuup_callback
 	 */
-	get_online_cpus();
-	mutex_lock(&cache_chain_mutex);
+	if (slab_is_available()) {
+		get_online_cpus();
+		mutex_lock(&cache_chain_mutex);
+	}
 
 	list_for_each_entry(pc, &cache_chain, next) {
 		char tmp;
@@ -2210,8 +2207,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 */
 	align = ralign;
 
+	if (slab_is_available())
+		gfp = GFP_KERNEL;
+	else
+		gfp = GFP_NOWAIT;
+
 	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
+	cachep = kmem_cache_zalloc(&cache_cache, gfp);
 	if (!cachep)
 		goto oops;
 
@@ -2314,7 +2316,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->ctor = ctor;
 	cachep->name = name;
 
-	if (setup_cpu_cache(cachep)) {
+	if (setup_cpu_cache(cachep, gfp)) {
 		__kmem_cache_destroy(cachep);
 		cachep = NULL;
 		goto oops;
@@ -2326,8 +2328,10 @@ oops:
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
-	mutex_unlock(&cache_chain_mutex);
-	put_online_cpus();
+	if (slab_is_available()) {
+		mutex_unlock(&cache_chain_mutex);
+		put_online_cpus();
+	}
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -3742,7 +3746,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
 /*
  * This initializes kmem_list3 or resizes various caches for all nodes.
  */
-static int alloc_kmemlist(struct kmem_cache *cachep)
+static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
 {
 	int node;
 	struct kmem_list3 *l3;
@@ -3752,7 +3756,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 	for_each_online_node(node) {
 
                 if (use_alien_caches) {
-                        new_alien = alloc_alien_cache(node, cachep->limit);
+                        new_alien = alloc_alien_cache(node, cachep->limit, gfp);
                         if (!new_alien)
                                 goto fail;
                 }
@@ -3761,7 +3765,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 		if (cachep->shared) {
 			new_shared = alloc_arraycache(node,
 				cachep->shared*cachep->batchcount,
-					0xbaadf00d);
+					0xbaadf00d, gfp);
 			if (!new_shared) {
 				free_alien_cache(new_alien);
 				goto fail;
@@ -3790,7 +3794,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 			free_alien_cache(new_alien);
 			continue;
 		}
-		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+		l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
 		if (!l3) {
 			free_alien_cache(new_alien);
 			kfree(new_shared);
@@ -3846,18 +3850,18 @@ static void do_ccupdate_local(void *info)
 
 /* Always called with the cache_chain_mutex held */
 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
-				int batchcount, int shared)
+				int batchcount, int shared, gfp_t gfp)
 {
 	struct ccupdate_struct *new;
 	int i;
 
-	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	new = kzalloc(sizeof(*new), gfp);
 	if (!new)
 		return -ENOMEM;
 
 	for_each_online_cpu(i) {
 		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
-						batchcount);
+						batchcount, gfp);
 		if (!new->new[i]) {
 			for (i--; i >= 0; i--)
 				kfree(new->new[i]);
@@ -3884,11 +3888,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 		kfree(ccold);
 	}
 	kfree(new);
-	return alloc_kmemlist(cachep);
+	return alloc_kmemlist(cachep, gfp);
 }
 
 /* Called with cache_chain_mutex held always */
-static int enable_cpucache(struct kmem_cache *cachep)
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
 {
 	int err;
 	int limit, shared;
@@ -3934,7 +3938,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
 	if (limit > 32)
 		limit = 32;
 #endif
-	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
+	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
 	if (err)
 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
 		       cachep->name, -err);
@@ -4240,7 +4244,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 				res = 0;
 			} else {
 				res = do_tune_cpucache(cachep, limit,
-						       batchcount, shared);
+						       batchcount, shared,
+						       GFP_KERNEL);
 			}
 			break;
 		}
diff --git a/mm/slub.c b/mm/slub.c
index 44bc77c..3cc5e98 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2582,13 +2582,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
 	if (gfp_flags & SLUB_DMA)
 		flags = SLAB_CACHE_DMA;
 
-	down_write(&slub_lock);
+	/*
+	 * This function is called with IRQs disabled during early-boot on
+	 * single CPU so there's no need to take slub_lock here.
+	 */
 	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
 								flags, NULL))
 		goto panic;
 
 	list_add(&s->list, &slab_caches);
-	up_write(&slub_lock);
+
 	if (sysfs_slab_add(s))
 		goto panic;
 	return s;
@@ -3048,7 +3051,7 @@ void __init kmem_cache_init(void)
 	 * kmem_cache_open for slab_state == DOWN.
 	 */
 	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
-		sizeof(struct kmem_cache_node), GFP_KERNEL);
+		sizeof(struct kmem_cache_node), GFP_NOWAIT);
 	kmalloc_caches[0].refcount = -1;
 	caches++;
 
@@ -3061,16 +3064,16 @@ void __init kmem_cache_init(void)
 	/* Caches that are not of the two-to-the-power-of size */
 	if (KMALLOC_MIN_SIZE <= 64) {
 		create_kmalloc_cache(&kmalloc_caches[1],
-				"kmalloc-96", 96, GFP_KERNEL);
+				"kmalloc-96", 96, GFP_NOWAIT);
 		caches++;
 		create_kmalloc_cache(&kmalloc_caches[2],
-				"kmalloc-192", 192, GFP_KERNEL);
+				"kmalloc-192", 192, GFP_NOWAIT);
 		caches++;
 	}
 
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
-			"kmalloc", 1 << i, GFP_KERNEL);
+			"kmalloc", 1 << i, GFP_NOWAIT);
 		caches++;
 	}
 
@@ -3107,7 +3110,7 @@ void __init kmem_cache_init(void)
 	/* Provide the correct kmalloc names now that the caches are up */
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
 		kmalloc_caches[i]. name =
-			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
+			kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
 
 #ifdef CONFIG_SMP
 	register_cpu_notifier(&slab_notifier);
-- 
1.6.0.4

 

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 11:41                                 ` Nick Piggin
  2009-05-25 11:44                                   ` Pekka J Enberg
@ 2009-05-25 12:04                                   ` Pekka J Enberg
  2009-05-25 12:12                                     ` Nick Piggin
  1 sibling, 1 reply; 57+ messages in thread
From: Pekka J Enberg @ 2009-05-25 12:04 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Ingo Molnar, Yinghai Lu, Rusty Russell, Linus Torvalds,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

> On Mon, May 25, 2009 at 02:37:59PM +0300, Pekka Enberg wrote:
> > On Mon, May 25, 2009 at 2:25 PM, Nick Piggin <npiggin@suse.de> wrote:
> > > The bootmem allocations of course are required because some hashes may
> > > need to be larger than MAX_ORDER without using vmalloc.
> > >
> > > kmem_cache_init (and mem_init, partially) sets up the kernel memory
> > > allocators...
> > 
> > Which comment are you talking about? The "setup kernel memory
> > allocators" one? Yeah, I should probably move it couple of lines down.
 
On Mon, 25 May 2009, Nick Piggin wrote:
> Yes, just that one.

And oh, we probably want to do this too while at it. Nick?

>From b211497261670ff6f9af6b8b0fab429e848ccc87 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Mon, 25 May 2009 15:01:35 +0300
Subject: [PATCH] vmalloc: use kzalloc() instead of alloc_bootmem()

We can call vmalloc_init() after kmem_cache_init() and use kzalloc() instead of
the bootmem allocator when initializing vmalloc data structures.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 init/main.c  |    2 +-
 mm/vmalloc.c |    3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/init/main.c b/init/main.c
index fb0e004..766194c 100644
--- a/init/main.c
+++ b/init/main.c
@@ -585,13 +585,13 @@ asmlinkage void __init start_kernel(void)
 		   __stop___param - __start___param,
 		   &unknown_bootoption);
 	pidhash_init();
-	vmalloc_init();
 	vfs_caches_init_early();
 	/*
 	 * Set up kernel memory allocators
 	 */
 	mem_init();
 	kmem_cache_init();
+	vmalloc_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 083716e..3235138 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,7 +23,6 @@
 #include <linux/rbtree.h>
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
-#include <linux/bootmem.h>
 #include <linux/pfn.h>
 
 #include <asm/atomic.h>
@@ -1032,7 +1031,7 @@ void __init vmalloc_init(void)
 
 	/* Import existing vmlist entries. */
 	for (tmp = vmlist; tmp; tmp = tmp->next) {
-		va = alloc_bootmem(sizeof(struct vmap_area));
+		va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
 		va->flags = tmp->flags | VM_VM_AREA;
 		va->va_start = (unsigned long)tmp->addr;
 		va->va_end = va->va_start + tmp->size;
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 12:04                                   ` Pekka J Enberg
@ 2009-05-25 12:12                                     ` Nick Piggin
  0 siblings, 0 replies; 57+ messages in thread
From: Nick Piggin @ 2009-05-25 12:12 UTC (permalink / raw)
  To: Pekka J Enberg
  Cc: Ingo Molnar, Yinghai Lu, Rusty Russell, Linus Torvalds,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

On Mon, May 25, 2009 at 03:04:20PM +0300, Pekka Enberg wrote:
> > On Mon, May 25, 2009 at 02:37:59PM +0300, Pekka Enberg wrote:
> > > On Mon, May 25, 2009 at 2:25 PM, Nick Piggin <npiggin@suse.de> wrote:
> > > > The bootmem allocations of course are required because some hashes may
> > > > need to be larger than MAX_ORDER without using vmalloc.
> > > >
> > > > kmem_cache_init (and mem_init, partially) sets up the kernel memory
> > > > allocators...
> > > 
> > > Which comment are you talking about? The "setup kernel memory
> > > allocators" one? Yeah, I should probably move it couple of lines down.
>  
> On Mon, 25 May 2009, Nick Piggin wrote:
> > Yes, just that one.
> 
> And oh, we probably want to do this too while at it. Nick?
> 
> >From b211497261670ff6f9af6b8b0fab429e848ccc87 Mon Sep 17 00:00:00 2001
> From: Pekka Enberg <penberg@cs.helsinki.fi>
> Date: Mon, 25 May 2009 15:01:35 +0300
> Subject: [PATCH] vmalloc: use kzalloc() instead of alloc_bootmem()
> 
> We can call vmalloc_init() after kmem_cache_init() and use kzalloc() instead of
> the bootmem allocator when initializing vmalloc data structures.
> 
> Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
> ---
>  init/main.c  |    2 +-
>  mm/vmalloc.c |    3 +--
>  2 files changed, 2 insertions(+), 3 deletions(-)
> 
> diff --git a/init/main.c b/init/main.c
> index fb0e004..766194c 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -585,13 +585,13 @@ asmlinkage void __init start_kernel(void)
>  		   __stop___param - __start___param,
>  		   &unknown_bootoption);
>  	pidhash_init();
> -	vmalloc_init();
>  	vfs_caches_init_early();
>  	/*
>  	 * Set up kernel memory allocators
>  	 */
>  	mem_init();
>  	kmem_cache_init();
> +	vmalloc_init();

vmalloc won't work fully without slab and page allocators anyway, so this
is conceptually a bugfix I think. Ack.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  8:47                           ` Pekka J Enberg
  2009-05-25 11:25                             ` Nick Piggin
@ 2009-05-25 14:55                             ` Matt Mackall
  2009-05-25 14:58                               ` Pekka Enberg
  2009-05-26 17:19                               ` Christoph Lameter
  2009-05-26 14:27                             ` Christoph Lameter
  2 siblings, 2 replies; 57+ messages in thread
From: Matt Mackall @ 2009-05-25 14:55 UTC (permalink / raw)
  To: Pekka J Enberg
  Cc: Ingo Molnar, Yinghai Lu, Rusty Russell, Linus Torvalds,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	npiggin

On Mon, 2009-05-25 at 11:47 +0300, Pekka J Enberg wrote:
> On Mon, 25 May 2009, Ingo Molnar wrote:
> > Ok, i think this all looks pretty realistic - but there's quite a 
> > bit of layering on top of pending changes in the x86 and irq trees. 
> > We could do this on top of those topic branches in -tip, and rebase 
> > in the merge window. Or delay it to .32.
> > 
> > ... plus i think we are _very_ close to being able to remove all of 
> > bootmem on x86 (with some compatibility/migration mechanism in 
> > place). Which bootmem calls do we have before kmalloc init with 
> > Pekka's patch applied? I think it's mostly the page table init code.
> >  
> > ( beyond the page allocator internal use - where we could use 
> >   straight e820 based APIs that clip memory off from the beginning 
> >   of existing e820 RAM ranges - enriched with NUMA/SRAT locality 
> >   info. )
> 
> OK, here's a version of the patch with Yinghai's rebasing on top of 
> tip/master. All in-kernel memory allocators boot cleanly now on my 
> configuration (I did not try SLQB but it probably needs fixing). I would 
> appreciate it if someone gave the SLAB+NUMA configuration a try.
> 
> Ingo, I don't really want to carry this in slab.git so perhaps you could 
> put this and Yinghai's irq init cleanup in an "earlyslab" branch in tip for 
> testing?
> 
> And oh, Christoph/Matt, can I have your NAK/ACK for this patch?

Looks ok to me, though I'd probably split it into three parts (slab,
slub, init).

For future reference, 0xbaadf00d ought to move into poison.h.
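
Something like the below, say — name made up and untested, just to sketch
what I mean:

        /* include/linux/poison.h */
        #define ARRAYCACHE_POISON_BATCH 0xbaadf00d /* bogus batchcount for shared/alien array caches */

        /* then e.g. in mm/slab.c's alloc_kmemlist(): */
        new_shared = alloc_arraycache(node,
                        cachep->shared * cachep->batchcount,
                        ARRAYCACHE_POISON_BATCH, gfp);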

-- 
http://selenic.com : development and support for Mercurial and Linux



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 14:55                             ` Matt Mackall
@ 2009-05-25 14:58                               ` Pekka Enberg
  2009-05-26 17:19                               ` Christoph Lameter
  1 sibling, 0 replies; 57+ messages in thread
From: Pekka Enberg @ 2009-05-25 14:58 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Ingo Molnar, Yinghai Lu, Rusty Russell, Linus Torvalds,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	npiggin

On Mon, 2009-05-25 at 09:55 -0500, Matt Mackall wrote:
> > And oh, Christoph/Matt, can I have your NAK/ACK for this patch?
> 
> Looks ok to me, though I'd probably split it into three parts (slab,
> slub, init).

Yeah, the patch is pretty big but I don't think we can do that. It would
break git bisect.

			Pekka


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 11:44                                   ` Pekka J Enberg
@ 2009-05-25 15:01                                     ` Matt Mackall
  2009-05-25 16:39                                     ` Linus Torvalds
  1 sibling, 0 replies; 57+ messages in thread
From: Matt Mackall @ 2009-05-25 15:01 UTC (permalink / raw)
  To: Pekka J Enberg
  Cc: Nick Piggin, Ingo Molnar, Yinghai Lu, Rusty Russell,
	Linus Torvalds, H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl

On Mon, 2009-05-25 at 14:44 +0300, Pekka J Enberg wrote:
> On Mon, May 25, 2009 at 2:25 PM, Nick Piggin <npiggin@suse.de> wrote:
> > > > The bootmem allocations of course are required because some hashes may
> > > > need to be larger than MAX_ORDER without using vmalloc.
> > > >
> > > > kmem_cache_init (and mem_init, partially) sets up the kernel memory
> > > > allocators...
> > > 
> 
> On Mon, May 25, 2009 at 02:37:59PM +0300, Pekka Enberg wrote:
> > > Which comment are you talking about? The "setup kernel memory
> > > allocators" one? Yeah, I should probably move it couple of lines down.
> 
> On Mon, 25 May 2009, Nick Piggin wrote:
> > Yes, just that one.
> 
> Here's an updated patch that does that. Thanks!

If we're gonna do that, we probably ought to have a comment.

/* These use large bootmem allocations and must precede kmem_cache_init */

-- 
http://selenic.com : development and support for Mercurial and Linux



^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 11:44                                   ` Pekka J Enberg
  2009-05-25 15:01                                     ` Matt Mackall
@ 2009-05-25 16:39                                     ` Linus Torvalds
  2009-05-25 18:39                                       ` Pekka Enberg
  2009-05-26  7:33                                       ` Nick Piggin
  1 sibling, 2 replies; 57+ messages in thread
From: Linus Torvalds @ 2009-05-25 16:39 UTC (permalink / raw)
  To: Pekka J Enberg
  Cc: Nick Piggin, Ingo Molnar, Yinghai Lu, Rusty Russell,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm



On Mon, 25 May 2009, Pekka J Enberg wrote:
> diff --git a/init/main.c b/init/main.c
> index 33ce929..fb0e004 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -576,6 +576,22 @@ asmlinkage void __init start_kernel(void)
>  	setup_nr_cpu_ids();
>  	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
>  
> +	build_all_zonelists();
> +	page_alloc_init();
> +
> +	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
> +	parse_early_param();
> +	parse_args("Booting kernel", static_command_line, __start___param,
> +		   __stop___param - __start___param,
> +		   &unknown_bootoption);
> +	pidhash_init();
> +	vmalloc_init();
> +	vfs_caches_init_early();
> +	/*
> +	 * Set up kernel memory allocators
> +	 */
> +	mem_init();
> +	kmem_cache_init();

So what strikes me is a question:

 - why do we want to do pidhash_init and vfs_caches_init_early() so early?

Yes, pidhash_init() now uses alloc_bootmem. It's an allocation that is not 
trivially small, but it's not humongous either (max 4096 hash list heads, 
one pointer each).
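
(On 64-bit that works out to at most 4096 * 8 bytes = 32 KiB for the table
itself, so it's comfortably within what the page or slab allocators could
hand out later, too.)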

And vfs_caches_init_early() is actually doing some rather strange things, 
like doing a "alloc_large_system_hash()" but not unconditionally: it does 
it in the "late" initialization too, if not done early. inode_init_early 
does something very similar (i.e. a _conditional_ early init).

So none of this seems to really get a huge advantage from the early init. 
There seem to be some subtle NUMA issues, but do we really want that? I 
get the feeling that nobody ever wanted to do it early, and then the NUMA 
people said "I don't wnt to do this early, but I don't want to touch the 
non-NUMA case, so I'll do it early for non-numa, and late for numa".

I'm also not entirely sure we really need to do vmalloc_init() that early, 
but I dunno. It also uses alloc_bootmem().

		Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 16:39                                     ` Linus Torvalds
@ 2009-05-25 18:39                                       ` Pekka Enberg
  2009-05-25 19:14                                         ` Linus Torvalds
  2009-05-26  7:38                                         ` Nick Piggin
  2009-05-26  7:33                                       ` Nick Piggin
  1 sibling, 2 replies; 57+ messages in thread
From: Pekka Enberg @ 2009-05-25 18:39 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Nick Piggin, Ingo Molnar, Yinghai Lu, Rusty Russell,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

Hi Linus,

Linus Torvalds wrote:
> On Mon, 25 May 2009, Pekka J Enberg wrote:
>> diff --git a/init/main.c b/init/main.c
>> index 33ce929..fb0e004 100644
>> --- a/init/main.c
>> +++ b/init/main.c
>> @@ -576,6 +576,22 @@ asmlinkage void __init start_kernel(void)
>>  	setup_nr_cpu_ids();
>>  	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
>>  
>> +	build_all_zonelists();
>> +	page_alloc_init();
>> +
>> +	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
>> +	parse_early_param();
>> +	parse_args("Booting kernel", static_command_line, __start___param,
>> +		   __stop___param - __start___param,
>> +		   &unknown_bootoption);
>> +	pidhash_init();
>> +	vmalloc_init();
>> +	vfs_caches_init_early();
>> +	/*
>> +	 * Set up kernel memory allocators
>> +	 */
>> +	mem_init();
>> +	kmem_cache_init();
> 
> So what strikes me is a question:
> 
>  - why do we want to do pidhash_init and vfs_caches_init_early() so early?
> 
> Yes, pidhash_init() now uses alloc_bootmem. It's an allocation that is not 
> trivially small, but it's not humongous either (max 4096 hash list heads, 
> one pointer each).

I can certainly fix that up to use kmalloc() or vmalloc(). I moved it 
because I wasn't sure how much it's actually allocating and wanted to do 
the conservative thing here.
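
(The conversion itself ought to be the same one-liner pattern as the rest of
the patch — roughly, and untested; the sizing logic in pidhash_init() would
stay as it is, only the allocation call changes:

        pid_hash = kzalloc(pidhash_size * sizeof(*pid_hash), GFP_NOWAIT);

where pidhash_size here just stands for however many buckets it computes.)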

Linus Torvalds wrote:
> And vfs_caches_init_early() is actually doing some rather strange things, 
> like doing a "alloc_large_system_hash()" but not unconditionally: it does 
> it in the "late" initialization too, if not done early. inode_init_early 
> does something very similar (i.e. a _conditional_ early init).
> 
> So none of this seems to really get a huge advantage from the early init. 
> There seem to be some subtle NUMA issues, but do we really want that? I 
> get the feeling that nobody ever wanted to do it early, and then the NUMA 
> people said "I don't wnt to do this early, but I don't want to touch the 
> non-NUMA case, so I'll do it early for non-numa, and late for numa".

SLUB does sysfs setup in kmem_cache_init() and I saw some oopses if I 
don't call vfs_caches_init_early() first. I didn't look too closely, though.

Linus Torvalds wrote:
> I'm also not entirely sure we really need to do vmalloc_init() that early, 
> but I dunno. It also uses alloc_bootmem().

We can do that later but then we need to fix up vmalloc_init(). There's 
actually a patch floating around to do that.

			Pekka

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 19:14                                         ` Linus Torvalds
@ 2009-05-25 19:13                                           ` Pekka Enberg
  2009-05-26  1:50                                             ` Yinghai Lu
  0 siblings, 1 reply; 57+ messages in thread
From: Pekka Enberg @ 2009-05-25 19:13 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Nick Piggin, Ingo Molnar, Yinghai Lu, Rusty Russell,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

Hi Linus,

Linus Torvalds wrote:
> Hey, all my suggestions were definitely for the "do it later". I'd hate to 
> have this blow up to one large patch. I'm perfectly ok with the patch I 
> replied to, I was more thinking that maybe there is then room for further 
> (independent) fixups too.
> 
> So [vmalloc|pidhash|vfs_caches_early]_init all fall under the "let's look 
> at that too". If we start off with just the scheduler init ordering, that's 
> good.

OK, makes sense. I'll try to sneak the current patch past Ingo first and 
then do the ones you pointed out as incremental patches.

			Pekka

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 18:39                                       ` Pekka Enberg
@ 2009-05-25 19:14                                         ` Linus Torvalds
  2009-05-25 19:13                                           ` Pekka Enberg
  2009-05-26  7:38                                         ` Nick Piggin
  1 sibling, 1 reply; 57+ messages in thread
From: Linus Torvalds @ 2009-05-25 19:14 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Nick Piggin, Ingo Molnar, Yinghai Lu, Rusty Russell,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm



On Mon, 25 May 2009, Pekka Enberg wrote:
> 
> We can do that later but then we need to fix up vmalloc_init(). There's
> actually a patch floating around to do that.

Hey, all my suggestions were definitely for the "do it later". I'd hate to 
have this blow up to one large patch. I'm perfectly ok with the patch I 
replied to, I was more thinking that maybe there is then room for further 
(independent) fixups too.

So [vmalloc|pidhash|vfs_caches_early]_init all fall under the "let's look 
at that too". If we start off with just the scheduler init ordering, that's 
good.

			Linus

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 19:13                                           ` Pekka Enberg
@ 2009-05-26  1:50                                             ` Yinghai Lu
  0 siblings, 0 replies; 57+ messages in thread
From: Yinghai Lu @ 2009-05-26  1:50 UTC (permalink / raw)
  To: Pekka Enberg, Linus Torvalds, Ingo Molnar, Tejun Heo
  Cc: Nick Piggin, Rusty Russell, H. Peter Anvin, Jeff Garzik,
	Alexander Viro, Linux Kernel Mailing List, Andrew Morton,
	Peter Zijlstra, cl, mpm

trying to put setup_percpu_areas() after mem_init/kmem_cache_init...

http://git.kernel.org/?p=linux/kernel/git/yinghai/linux-2.6-yinghai.git;a=summary
git://git.kernel.org/pub/scm/linux/kernel/git/yinghai/linux-2.6-yinghai.git

based on tip/master, on AMD 8 sockets, NUMA + SLUB.

got

[   49.962048] Freeing unused kernel memory: 2564k freed
INIT: [   49.984095] BUG: unable to handle kernel paging request at ffff888687090860
[   49.988020] IP: [<ffffffff81135249>] __slab_free+0x61/0x2ee
[   49.988020] PGD a067 PUD 0 
[   49.988020] Oops: 0002 [#1] SMP 
[   49.988020] last sysfs file: 
[   49.988020] CPU 0 
[   49.988020] Modules linked in:
[   49.988020] Pid: 1, comm: init Not tainted 2.6.30-rc7-tip-01641-g4c0d14e-dirty #285 Sun Fire X4600 M2
[   49.988020] RIP: 0010:[<ffffffff81135249>]  [<ffffffff81135249>] __slab_free+0x61/0x2ee
[   49.988020] RSP: 0018:ffff888021be5a08  EFLAGS: 00010046
[   49.988020] RAX: 00000000cccccccc RBX: ffffffff8251ce60 RCX: ffffffff81185e56
[   49.988020] RDX: 0000000000000000 RSI: ffffea0340d42110 RDI: ffffffff82247000
[   49.988020] RBP: ffff888021be5a58 R08: 00000000cccccccc R09: 0000000000000000
[   49.988020] R10: ffff885021be6200 R11: 000000002cce478a R12: ffffea0340d42110
[   49.988020] R13: ffff888020a2a200 R14: ffffffff82247000 R15: ffffffff82247000
[   49.988020] FS:  00000000006d1880(0063) GS:ffffc90000000000(0000) knlGS:0000000000000000
[   49.988020] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[   49.988020] CR2: ffff888687090860 CR3: 0000001020576000 CR4: 00000000000006f0
[   49.988020] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   49.988020] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[   49.988020] Process init (pid: 1, threadinfo ffff888021be4000, task ffff887021be8000)
[   49.988020] Stack:
[   49.988020]  ffff888021be5a58 cccccccc810ae203 ffffffff81185e56 000000002cce478a
[   49.988020]  000000002cce478a ffffffff8251ce60 ffffea0340d42110 ffff888020a2a200
[   49.988020]  0000000000000286 ffffffff82247000 ffff888021be5aa8 ffffffff8113603d
[   49.988020] Call Trace:
[   49.988020]  [<ffffffff81185e56>] ? free_proc_entry+0x79/0x94
[   49.988020]  [<ffffffff8113603d>] kfree+0x10c/0x14a
[   49.988020]  [<ffffffff81185e56>] ? free_proc_entry+0x79/0x94
[   49.988020]  [<ffffffff81185e56>] free_proc_entry+0x79/0x94
[   49.988020]  [<ffffffff81186094>] remove_proc_entry+0x223/0x249
[   49.988020]  [<ffffffff810524d5>] ? mask_IO_APIC_irq_desc+0x36/0x112
[   49.988020]  [<ffffffff81a8a887>] ? _spin_unlock_irqrestore+0x50/0x76
[   49.988020]  [<ffffffff810acd60>] ? trace_hardirqs_off+0x20/0x36
[   49.988020]  [<ffffffff81a8a887>] ? _spin_unlock_irqrestore+0x50/0x76
[   49.988020]  [<ffffffff810add3b>] ? mark_held_locks+0x60/0x96
[   49.988020]  [<ffffffff81a8a88e>] ? _spin_unlock_irqrestore+0x57/0x76
[   49.988020]  [<ffffffff810ae065>] ? trace_hardirqs_on_caller+0x13b/0x17d
[   49.988020]  [<ffffffff810cef38>] unregister_handler_proc+0x46/0x61
[   49.988020]  [<ffffffff810cd36e>] __free_irq+0x135/0x191
[   49.988020]  [<ffffffff810cd3eb>] free_irq+0x21/0x3f
[   49.988020]  [<ffffffff814f3bee>] serial8250_shutdown+0x130/0x170
[   49.988020]  [<ffffffff814ef6c2>] uart_shutdown+0xd0/0x124
[   49.988020]  [<ffffffff814f0c41>] uart_close+0x14c/0x1f7
[   49.988020]  [<ffffffff814bbffa>] ? tty_fasync+0x11b/0x13e
[   49.988020]  [<ffffffff814befe7>] tty_release_dev+0x1d3/0x51c
[   49.988020]  [<ffffffff8113c5f2>] ? sys_close+0x45/0x112
[   49.988020]  [<ffffffff814bf35d>] tty_release+0x2d/0x4f
[   49.988020]  [<ffffffff8113fd24>] __fput+0xff/0x1d1
[   49.988020]  [<ffffffff8113fe26>] fput+0x30/0x46
[   49.988020]  [<ffffffff8113c58c>] filp_close+0x78/0x99
[   49.988020]  [<ffffffff81a8a8e7>] ? _spin_unlock+0x3a/0x55
[   49.988020]  [<ffffffff8113c66a>] sys_close+0xbd/0x112
[   49.988020]  [<ffffffff81034edb>] system_call_fastpath+0x16/0x1b
[   49.988020] Code: eb 09 f3 90 41 f6 04 24 01 75 f7 f0 41 0f ba 2c 24 00 19 c0 85 c0 75 ec 41 f6 04 24 02 0f 85 ae 00 00 00 8b 45 bc 49 8b 54 24 50 <49> 89 54 c5 00 41 8b 44 24 0c 4d 89 6c 24 50 8d 48 ff 49 8b 04 
[   49.988020] RIP  [<ffffffff81135249>] __slab_free+0x61/0x2ee
[   49.988020]  RSP <ffff888021be5a08>
[   49.988020] CR2: ffff888687090860
[   49.988020] ---[ end trace 4dde5b4c04e715b6 ]---

^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 16:39                                     ` Linus Torvalds
  2009-05-25 18:39                                       ` Pekka Enberg
@ 2009-05-26  7:33                                       ` Nick Piggin
  1 sibling, 0 replies; 57+ messages in thread
From: Nick Piggin @ 2009-05-26  7:33 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Pekka J Enberg, Ingo Molnar, Yinghai Lu, Rusty Russell,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

On Mon, May 25, 2009 at 09:39:36AM -0700, Linus Torvalds wrote:
> 
> 
> On Mon, 25 May 2009, Pekka J Enberg wrote:
> > diff --git a/init/main.c b/init/main.c
> > index 33ce929..fb0e004 100644
> > --- a/init/main.c
> > +++ b/init/main.c
> > @@ -576,6 +576,22 @@ asmlinkage void __init start_kernel(void)
> >  	setup_nr_cpu_ids();
> >  	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
> >  
> > +	build_all_zonelists();
> > +	page_alloc_init();
> > +
> > +	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
> > +	parse_early_param();
> > +	parse_args("Booting kernel", static_command_line, __start___param,
> > +		   __stop___param - __start___param,
> > +		   &unknown_bootoption);
> > +	pidhash_init();
> > +	vmalloc_init();
> > +	vfs_caches_init_early();
> > +	/*
> > +	 * Set up kernel memory allocators
> > +	 */
> > +	mem_init();
> > +	kmem_cache_init();
> 
> So what strikes me is a question:
> 
>  - why do we want to do pidhash_init and vfs_caches_init_early() so early?
> 
> Yes, pidhash_init() now uses alloc_bootmem. It's an allocation that is not 
> trivially small, but it's not humongous either (max 4096 hash list heads, 
> one pointer each).

It would be nice to use the regular page allocator for pidhash_init. For
my part, I have had a patch floating around for a long time that makes this
hash (among other things) dynamically resizable without locking, and
avoiding the special case for the bootmem-allocated hash would be good.
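
A minimal sketch of the direction I mean, assuming the table is still sized
at boot but taken from the regular page allocator once it is up (the
pidhash_init_late/pid_hash names here are illustrative, not the actual patch):

#include <linux/init.h>
#include <linux/list.h>
#include <linux/gfp.h>
#include <linux/log2.h>
#include <linux/mm.h>

/*
 * Sketch only: allocate the pid hash from the page allocator instead of
 * alloc_bootmem(), so it becomes an ordinary allocation that a later
 * resize could swap out without a bootmem special case.
 */
static struct hlist_head *pid_hash;
static unsigned int pidhash_shift;

static void __init pidhash_init_late(void)
{
        unsigned int i, count = 4096;   /* upper bound used by the current code */

        pidhash_shift = ilog2(count);
        pid_hash = (void *)__get_free_pages(GFP_KERNEL,
                                get_order(count * sizeof(*pid_hash)));
        for (i = 0; i < count; i++)
                INIT_HLIST_HEAD(&pid_hash[i]);
}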


> And vfs_caches_init_early() is actually doing some rather strange things, 
> like doing an "alloc_large_system_hash()" but not unconditionally: it does 
> it in the "late" initialization too, if not done early. inode_init_early 
> does something very similar (ie a _conditional_ early init).
> 
> So none of this seems to really get a huge advantage from the early init. 
> There seem to be some subtle NUMA issues, but do we really want that? I 
> get the feeling that nobody ever wanted to do it early, and then the NUMA 
> people said "I don't want to do this early, but I don't want to touch the 
> non-NUMA case, so I'll do it early for non-numa, and late for numa".
 
vfs_caches_init_early wants to allocate with bootmem so it can get a
>= MAX_ORDER hash in the kernel direct mapping. In the NUMA case it is
more important to spread the memory usage over the nodes, I guess, so
vmalloc is used for that. Bootmem and vmalloc are not available at the
same time, so it has to be two cases.
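
To put the same constraint in code form (just a sketch of the tradeoff,
not the real alloc_large_system_hash()):

#include <linux/bootmem.h>
#include <linux/vmalloc.h>
#include <linux/gfp.h>
#include <linux/mm.h>

/*
 * Sketch: a big boot-time hash has to pick its allocator depending on
 * when it runs and whether its pages should be spread across NUMA nodes.
 */
static void * __init large_hash_alloc(unsigned long bytes, int early, int spread)
{
        if (early)
                return alloc_bootmem(bytes);    /* only allocator up this early */
        if (spread)
                /* built page by page, so it can exceed MAX_ORDER and spread over nodes */
                return vmalloc(bytes);
        /* physically contiguous, direct-mapped, limited to MAX_ORDER */
        return (void *)__get_free_pages(GFP_KERNEL, get_order(bytes));
}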


> I'm also not entirely sure we really need to do vmalloc_init() that early, 
> but I dunno. It also uses alloc_bootmem().

Probably not. vmalloc doesn't really work without the page allocator and
slab allocator already up, so it can probably be moved after them.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 18:39                                       ` Pekka Enberg
  2009-05-25 19:14                                         ` Linus Torvalds
@ 2009-05-26  7:38                                         ` Nick Piggin
  2009-05-28 12:06                                           ` Pekka Enberg
  1 sibling, 1 reply; 57+ messages in thread
From: Nick Piggin @ 2009-05-26  7:38 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Linus Torvalds, Ingo Molnar, Yinghai Lu, Rusty Russell,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

On Mon, May 25, 2009 at 09:39:08PM +0300, Pekka Enberg wrote:
> Linus Torvalds wrote:
> >And vfs_caches_init_early() is actually doing some rather strange things, 
> >like doing an "alloc_large_system_hash()" but not unconditionally: it does 
> >it in the "late" initialization too, if not done early. inode_init_early 
> >does something very similar (ie a _conditional_ early init).
> >
> >So none of this seems to really get a huge advantage from the early init. 
> >There seem to be some subtle NUMA issues, but do we really want that? I 
> >get the feeling that nobody ever wanted to do it early, and then the NUMA 
> >people said "I don't want to do this early, but I don't want to touch the 
> >non-NUMA case, so I'll do it early for non-numa, and late for numa".
> 
> SLUB does sysfs setup in kmem_cache_init() and I saw some oopses if I 
> don't call vfs_caches_init_early() first. I didn't look too closely, though.

Did you also test the NUMA/hashdist case? vfs_caches_init_early doesn't
do much in that case.

I would say it is much more robust to do sysfs setup later if we move
the slab setup so early. Probably it is just quite lucky not to explode
in the !numa case because the vfs needs quite a bit of setting up...


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25  8:47                           ` Pekka J Enberg
  2009-05-25 11:25                             ` Nick Piggin
  2009-05-25 14:55                             ` Matt Mackall
@ 2009-05-26 14:27                             ` Christoph Lameter
  2 siblings, 0 replies; 57+ messages in thread
From: Christoph Lameter @ 2009-05-26 14:27 UTC (permalink / raw)
  To: Pekka J Enberg
  Cc: Ingo Molnar, Yinghai Lu, Rusty Russell, Linus Torvalds,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra,
	npiggin, mpm

On Mon, 25 May 2009, Pekka J Enberg wrote:

> And oh, Christoph/Matt, can I have your NAK/ACK for this patch?

You are dinking with something very sensitive here. There are numerous
dependencies on the exact sequence in the boot code, and that sequence may
vary from arch to arch. It won't be easy to ack.

Argh. Meeting. I hope to get back to this later.


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-25 14:55                             ` Matt Mackall
  2009-05-25 14:58                               ` Pekka Enberg
@ 2009-05-26 17:19                               ` Christoph Lameter
  2009-05-28 12:14                                 ` Pekka Enberg
  1 sibling, 1 reply; 57+ messages in thread
From: Christoph Lameter @ 2009-05-26 17:19 UTC (permalink / raw)
  To: Matt Mackall
  Cc: Pekka J Enberg, Ingo Molnar, Yinghai Lu, Rusty Russell,
	Linus Torvalds, H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra,
	npiggin

On Mon, 25 May 2009, Matt Mackall wrote:

> > And oh, Christoph/Matt, can I have your NAK/ACK for this patch?
>
> Looks ok to me, though I'd probably split it into three parts (slab,
> slub, init).

Could we document the core bootstrap procedure some more? I think we will
have slight breakage in various parts if we change the order. If we had
some design to guide us, it might be easier to bring all the components
into order.

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [tip:irq/numa] x86: enable_update_mptable should be a macro
  2009-05-20  7:37                   ` [PATCH] x86: enable_update_mptable should MACRO Yinghai Lu
@ 2009-05-28  0:00                     ` tip-bot for Yinghai Lu
  0 siblings, 0 replies; 57+ messages in thread
From: tip-bot for Yinghai Lu @ 2009-05-28  0:00 UTC (permalink / raw)
  To: linux-tip-commits; +Cc: linux-kernel, hpa, mingo, yinghai, tglx, mingo

Commit-ID:  abfe0af9813153bae8c85d9bac966bafcb8ddab1
Gitweb:     http://git.kernel.org/tip/abfe0af9813153bae8c85d9bac966bafcb8ddab1
Author:     Yinghai Lu <yinghai@kernel.org>
AuthorDate: Wed, 20 May 2009 00:37:40 -0700
Committer:  Ingo Molnar <mingo@elte.hu>
CommitDate: Thu, 28 May 2009 01:59:05 +0200

x86: enable_update_mptable should be a macro

instead of declaring one variant as an inline function, because the other
case is a variable.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4A13B344.7030307@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>


---
 arch/x86/include/asm/mpspec.h |   11 ++---------
 1 files changed, 2 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 3dcbaaa..e2a1bb6 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
 #ifdef CONFIG_X86_MPPARSE
 extern void find_smp_config(void);
 extern void early_reserve_e820_mpc_new(void);
+extern int enable_update_mptable;
 #else
 static inline void find_smp_config(void) { }
 static inline void early_reserve_e820_mpc_new(void) { }
+#define enable_update_mptable 0
 #endif
 
 void __cpuinit generic_processor_info(int apicid, int version);
@@ -87,15 +89,6 @@ static inline int acpi_probe_gsi(void)
 }
 #endif /* CONFIG_ACPI */
 
-#ifdef CONFIG_X86_MPPARSE
-extern int enable_update_mptable;
-#else
-static inline int enable_update_mptable(void)
-{
-	return 0;
-}
-#endif
-
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
 
 struct physid_mask {

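For reference, the call-site shape that forces the macro form rather than a
stub inline function is roughly this (an illustrative caller, not a
particular file in the tree):

#ifdef CONFIG_X86_MPPARSE
extern int enable_update_mptable;       /* a real int variable in this config */
#else
#define enable_update_mptable 0         /* compile-time constant, same spelling */
#endif

static void maybe_update_mptable(void)
{
        /*
         * The same token has to work as an rvalue in both configurations.
         * A stub inline function would force callers to write
         * enable_update_mptable(), which clashes with the variable form.
         */
        if (enable_update_mptable) {
                /* rewrite the MP table here */
        }
}
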
^ permalink raw reply related	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-26  7:38                                         ` Nick Piggin
@ 2009-05-28 12:06                                           ` Pekka Enberg
  2009-05-28 12:12                                             ` Nick Piggin
  0 siblings, 1 reply; 57+ messages in thread
From: Pekka Enberg @ 2009-05-28 12:06 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linus Torvalds, Ingo Molnar, Yinghai Lu, Rusty Russell,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

On Tue, 2009-05-26 at 09:38 +0200, Nick Piggin wrote:
> On Mon, May 25, 2009 at 09:39:08PM +0300, Pekka Enberg wrote:
> > Linus Torvalds wrote:
> > >And vfs_caches_init_early() is actually doing some rather strange things, 
> > >like doing an "alloc_large_system_hash()" but not unconditionally: it does 
> > >it in the "late" initialization too, if not done early. inode_init_early 
> > >does something very similar (ie a _conditional_ early init).
> > >
> > >So none of this seems to really get a huge advantage from the early init. 
> > >There seem to be some subtle NUMA issues, but do we really want that? I 
> > >get the feeling that nobody ever wanted to do it early, and then the NUMA 
> > >people said "I don't want to do this early, but I don't want to touch the 
> > >non-NUMA case, so I'll do it early for non-numa, and late for numa".
> > 
> > SLUB does sysfs setup in kmem_cache_init() and I saw some oopses if I 
> > don't call vfs_caches_init_early() first. I didn't look too closely, though.
> 
> Did you also test the NUMA/hashdist case? vfs_caches_init_early doesn't
> do much in that case.

No, I tested UMA only.

On Tue, 2009-05-26 at 09:38 +0200, Nick Piggin wrote:
> I would say it is much more robust to do sysfs setup later if we move
> the slab setup so early. Probably it is just quite lucky not to explode
> in the !numa case because the vfs needs quite a bit of setting up...

That should not be an issue. SLUB already defers sysfs registration
until the slab_sysfs_init() initcall has run. So my patches make zero
change to how SLUB interacts with sysfs, actually.
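
The deferral looks roughly like this (a sketch of the pattern, not the
mm/slub.c code verbatim; the my_* names are made up):

#include <linux/init.h>
#include <linux/list.h>

struct my_cache {
        struct list_head list;
        const char *name;
};

static LIST_HEAD(pending_caches);
static int sysfs_ready;

/* Would create the sysfs directory for the cache; stubbed in this sketch. */
static void my_sysfs_add(struct my_cache *s)
{
}

/* Caches created before sysfs is usable are only queued. */
static void my_cache_register(struct my_cache *s)
{
        if (!sysfs_ready) {
                list_add(&s->list, &pending_caches);
                return;
        }
        my_sysfs_add(s);
}

/* Runs as an initcall, long after kmem_cache_init(). */
static int __init my_slab_sysfs_init(void)
{
        struct my_cache *s;

        sysfs_ready = 1;
        list_for_each_entry(s, &pending_caches, list)
                my_sysfs_add(s);
        return 0;
}
__initcall(my_slab_sysfs_init);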

			Pekka


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-28 12:06                                           ` Pekka Enberg
@ 2009-05-28 12:12                                             ` Nick Piggin
  2009-05-28 12:24                                               ` Pekka Enberg
  0 siblings, 1 reply; 57+ messages in thread
From: Nick Piggin @ 2009-05-28 12:12 UTC (permalink / raw)
  To: Pekka Enberg
  Cc: Linus Torvalds, Ingo Molnar, Yinghai Lu, Rusty Russell,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

On Thu, May 28, 2009 at 03:06:40PM +0300, Pekka Enberg wrote:
> On Tue, 2009-05-26 at 09:38 +0200, Nick Piggin wrote:
> > On Mon, May 25, 2009 at 09:39:08PM +0300, Pekka Enberg wrote:
> > > Linus Torvalds wrote:
> > > >And vfs_caches_init_early() is actually doing some rather strange things, 
> > > >like doing an "alloc_large_system_hash()" but not unconditionally: it does 
> > > >it in the "late" initialization too, if not done early. inode_init_early 
> > > >does something very similar (ie a _conditional_ early init).
> > > >
> > > >So none of this seems to really get a huge advantage from the early init. 
> > > >There seem to be some subtle NUMA issues, but do we really want that? I 
> > > >get the feeling that nobody ever wanted to do it early, and then the NUMA 
> > > >people said "I don't want to do this early, but I don't want to touch the 
> > > >non-NUMA case, so I'll do it early for non-numa, and late for numa".
> > > 
> > > SLUB does sysfs setup in kmem_cache_init() and I saw some oopses if I 
> > > don't call vfs_caches_init_early() first. I didn't look too closely, though.
> > 
> > Did you also test the NUMA/hashdist case? vfs_caches_init_early doesn't
> > do much in that case.
> 
> No, I tested UMA only.
> 
> On Tue, 2009-05-26 at 09:38 +0200, Nick Piggin wrote:
> > I would say it is much more robust to do sysfs setup later if we move
> > the slab setup so early. Probably it is just quite lucky not to explode
> > in the !numa case because the vfs needs quite a bit of setting up...
> 
> That should not be an issue. SLUB already defers sysfs registration
> until the slab_sysfs_init() initcall has run. So my patches make zero
> change to how SLUB interacts with sysfs, actually.

Oh right, I didn't actually look. An initcall should be fine, but
I wonder why it is crashing if you move it before vfs_caches_init_early?
Those just allocate inode and dentry hash tables...


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-26 17:19                               ` Christoph Lameter
@ 2009-05-28 12:14                                 ` Pekka Enberg
  0 siblings, 0 replies; 57+ messages in thread
From: Pekka Enberg @ 2009-05-28 12:14 UTC (permalink / raw)
  To: Christoph Lameter
  Cc: Matt Mackall, Ingo Molnar, Yinghai Lu, Rusty Russell,
	Linus Torvalds, H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra,
	npiggin

On Tue, 2009-05-26 at 13:19 -0400, Christoph Lameter wrote:
> Could we document the core boot strap procedure some more? I think we will
> have slight breakage in various parts if we change the order. If we had
> some design to guide then it may be easier to bring all the components in
> order.

Hmm. I am avoiding all the hard issues by letting all the
architecture-specific early boot code run before I set up the kernel
memory allocators. The only thing that changes order is init_IRQ(),
which should be fine.

			Pekka


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL] scheduler fixes
  2009-05-28 12:12                                             ` Nick Piggin
@ 2009-05-28 12:24                                               ` Pekka Enberg
  0 siblings, 0 replies; 57+ messages in thread
From: Pekka Enberg @ 2009-05-28 12:24 UTC (permalink / raw)
  To: Nick Piggin
  Cc: Linus Torvalds, Ingo Molnar, Yinghai Lu, Rusty Russell,
	H. Peter Anvin, Jeff Garzik, Alexander Viro,
	Linux Kernel Mailing List, Andrew Morton, Peter Zijlstra, cl,
	mpm

On Thu, 2009-05-28 at 14:12 +0200, Nick Piggin wrote:
> Oh right, I didn't actually look. An initcall should be fine, but
> I wonder why it is crashing if you move it before vfs_caches_init_early?
> Those just allocate inode and dentry hash tables...

That's because we pass HASH_EARLY to alloc_large_system_hash(). Should
be easy enough to fix, though.
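
Something along these lines would cover it (a sketch only; slab_is_available()
is one plausible gate, and the real callers are the dentry/inode hash setup
paths):

#include <linux/bootmem.h>
#include <linux/slab.h>

/*
 * Sketch: only request the bootmem (HASH_EARLY) path while the page
 * allocator is not usable yet, so running the VFS hash setup after
 * kmem_cache_init() stops being a problem.
 */
static inline int vfs_hash_flags(void)
{
        return slab_is_available() ? 0 : HASH_EARLY;
}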


^ permalink raw reply	[flat|nested] 57+ messages in thread

* Re: [GIT PULL v2] scheduler fixes
  2019-11-17 10:41     ` [GIT PULL v2] " Ingo Molnar
@ 2019-11-17 16:35       ` pr-tracker-bot
  0 siblings, 0 replies; 57+ messages in thread
From: pr-tracker-bot @ 2019-11-17 16:35 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: Linus Torvalds, Valentin Schneider, Linux Kernel Mailing List,
	Peter Zijlstra, Thomas Gleixner, Andrew Morton

The pull request you sent on Sun, 17 Nov 2019 11:41:12 +0100:

> git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/cbb104f91dfec8ae5bc67ff6dc67b824330a0919

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker

^ permalink raw reply	[flat|nested] 57+ messages in thread

* [GIT PULL v2] scheduler fixes
  2019-11-17  0:10   ` Linus Torvalds
@ 2019-11-17 10:41     ` Ingo Molnar
  2019-11-17 16:35       ` pr-tracker-bot
  0 siblings, 1 reply; 57+ messages in thread
From: Ingo Molnar @ 2019-11-17 10:41 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: Valentin Schneider, Linux Kernel Mailing List, Peter Zijlstra,
	Thomas Gleixner, Andrew Morton


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sat, Nov 16, 2019 at 2:44 PM Valentin Schneider
> <valentin.schneider@arm.com> wrote:
> >
> > > Valentin Schneider (2):
> > >       sched/uclamp: Fix overzealous type replacement
> >
> > This one got a v2 (was missing one location), acked by Vincent:
> >
> >   20191115103908.27610-1-valentin.schneider@arm.com
> >
> > >       sched/topology, cpuset: Account for housekeeping CPUs to avoid empty cpumasks
> >
> > And this one is no longer needed, as Michal & I understood (IOW the fix in
> > rc6 is sufficient), see:
> >
> >   c425c5cb-ba8a-e5f6-d91c-5479779cfb7a@arm.com
> 
> Ingo, what do you want me to do? Pull it anyway and send updates
> later? Or skip this pull request?
> 
> I'll leave it pending for now,

We ended up zapping the final two commits from sched/urgent.

Please pull the latest sched-urgent-for-linus git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched-urgent-for-linus

   # HEAD: 6e1ff0773f49c7d38e8b4a9df598def6afb9f415 sched/uclamp: Fix incorrect condition

Misc fixes:

 - Fix potential deadlock under CONFIG_DEBUG_OBJECTS=y
 - PELT metrics update ordering fix
 - uclamp logic fix

 Thanks,

	Ingo

------------------>
Peter Zijlstra (1):
      sched/core: Avoid spurious lock dependencies

Qais Yousef (1):
      sched/uclamp: Fix incorrect condition

Vincent Guittot (1):
      sched/pelt: Fix update of blocked PELT ordering


 kernel/sched/core.c |  5 +++--
 kernel/sched/fair.c | 29 ++++++++++++++++++++---------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0f2eb3629070..44123b4d14e8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1065,7 +1065,7 @@ uclamp_update_active(struct task_struct *p, enum uclamp_id clamp_id)
 	 * affecting a valid clamp bucket, the next time it's enqueued,
 	 * it will already see the updated clamp bucket value.
 	 */
-	if (!p->uclamp[clamp_id].active) {
+	if (p->uclamp[clamp_id].active) {
 		uclamp_rq_dec_id(rq, p, clamp_id);
 		uclamp_rq_inc_id(rq, p, clamp_id);
 	}
@@ -6019,10 +6019,11 @@ void init_idle(struct task_struct *idle, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
 
+	__sched_fork(0, idle);
+
 	raw_spin_lock_irqsave(&idle->pi_lock, flags);
 	raw_spin_lock(&rq->lock);
 
-	__sched_fork(0, idle);
 	idle->state = TASK_RUNNING;
 	idle->se.exec_start = sched_clock();
 	idle->flags |= PF_IDLE;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 22a2fed29054..69a81a5709ff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7547,6 +7547,19 @@ static void update_blocked_averages(int cpu)
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
 
+	/*
+	 * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
+	 * that RT, DL and IRQ signals have been updated before updating CFS.
+	 */
+	curr_class = rq->curr->sched_class;
+	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
+	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
+	update_irq_load_avg(rq, 0);
+
+	/* Don't need periodic decay once load/util_avg are null */
+	if (others_have_blocked(rq))
+		done = false;
+
 	/*
 	 * Iterates the task_group tree in a bottom up fashion, see
 	 * list_add_leaf_cfs_rq() for details.
@@ -7574,14 +7587,6 @@ static void update_blocked_averages(int cpu)
 			done = false;
 	}
 
-	curr_class = rq->curr->sched_class;
-	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
-	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
-	update_irq_load_avg(rq, 0);
-	/* Don't need periodic decay once load/util_avg are null */
-	if (others_have_blocked(rq))
-		done = false;
-
 	update_blocked_load_status(rq, !done);
 	rq_unlock_irqrestore(rq, &rf);
 }
@@ -7642,12 +7647,18 @@ static inline void update_blocked_averages(int cpu)
 
 	rq_lock_irqsave(rq, &rf);
 	update_rq_clock(rq);
-	update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
 
+	/*
+	 * update_cfs_rq_load_avg() can call cpufreq_update_util(). Make sure
+	 * that RT, DL and IRQ signals have been updated before updating CFS.
+	 */
 	curr_class = rq->curr->sched_class;
 	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &rt_sched_class);
 	update_dl_rq_load_avg(rq_clock_pelt(rq), rq, curr_class == &dl_sched_class);
 	update_irq_load_avg(rq, 0);
+
+	update_cfs_rq_load_avg(cfs_rq_clock_pelt(cfs_rq), cfs_rq);
+
 	update_blocked_load_status(rq, cfs_rq_has_blocked(cfs_rq) || others_have_blocked(rq));
 	rq_unlock_irqrestore(rq, &rf);
 }

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [GIT PULL, v2] scheduler fixes
  2011-09-30 18:36 [GIT PULL] " Ingo Molnar
@ 2011-10-01  7:38 ` Ingo Molnar
  0 siblings, 0 replies; 57+ messages in thread
From: Ingo Molnar @ 2011-10-01  7:38 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Peter Zijlstra, Thomas Gleixner, Andrew Morton

Linus,

Please pull the latest sched-urgent-for-linus git tree from:

   git://tesla.tglx.de/git/linux-2.6-tip.git sched-urgent-for-linus

[ Note, the diffstat is different from the v1 version, a new fix was 
  added.]

 Thanks,

	Ingo

------------------>
Peter Zijlstra (1):
      posix-cpu-timers: Cure SMP wobbles

Shawn Bohrer (1):
      sched/rt: Migrate equal priority tasks to available CPUs

Simon Kirby (1):
      sched: Fix up wchan borkage


 include/linux/sched.h     |    1 -
 kernel/posix-cpu-timers.c |    5 +++--
 kernel/sched.c            |   26 +-------------------------
 kernel/sched_rt.c         |    4 ++--
 4 files changed, 6 insertions(+), 30 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ac2c05..41d0237 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1956,7 +1956,6 @@ static inline void disable_sched_clock_irqtime(void) {}
 
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
-extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 58f405b..c8008dd 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 	do {
 		times->utime = cputime_add(times->utime, t->utime);
 		times->stime = cputime_add(times->stime, t->stime);
-		times->sum_exec_runtime += t->se.sum_exec_runtime;
+		times->sum_exec_runtime += task_sched_runtime(t);
 	} while_each_thread(tsk, t);
 out:
 	rcu_read_unlock();
@@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = thread_group_sched_runtime(p);
+		thread_group_cputime(p, &cputime);
+		cpu->sched = cputime.sum_exec_runtime;
 		break;
 	}
 	return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index ec5f472..b50b0f0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3725,30 +3725,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 }
 
 /*
- * Return sum_exec_runtime for the thread group.
- * In case the task is currently running, return the sum plus current's
- * pending runtime that have not been accounted yet.
- *
- * Note that the thread group might have other running tasks as well,
- * so the return value not includes other pending runtime that other
- * running tasks might have.
- */
-unsigned long long thread_group_sched_runtime(struct task_struct *p)
-{
-	struct task_cputime totals;
-	unsigned long flags;
-	struct rq *rq;
-	u64 ns;
-
-	rq = task_rq_lock(p, &flags);
-	thread_group_cputime(p, &totals);
-	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, p, &flags);
-
-	return ns;
-}
-
-/*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to
  * @cputime: the cpu time spent in user space since the last update
@@ -4372,7 +4348,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 		blk_schedule_flush_plug(tsk);
 }
 
-asmlinkage void schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *tsk = current;
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 97540f0..af11778 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1050,7 +1050,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 	 */
 	if (curr && unlikely(rt_task(curr)) &&
 	    (curr->rt.nr_cpus_allowed < 2 ||
-	     curr->prio < p->prio) &&
+	     curr->prio <= p->prio) &&
 	    (p->rt.nr_cpus_allowed > 1)) {
 		int target = find_lowest_rq(p);
 
@@ -1581,7 +1581,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
 	    p->rt.nr_cpus_allowed > 1 &&
 	    rt_task(rq->curr) &&
 	    (rq->curr->rt.nr_cpus_allowed < 2 ||
-	     rq->curr->prio < p->prio))
+	     rq->curr->prio <= p->prio))
 		push_rt_tasks(rq);
 }
 

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [GIT PULL, v2] scheduler fixes
  2010-12-19 20:45 ` Linus Torvalds
@ 2010-12-19 22:30   ` Ingo Molnar
  0 siblings, 0 replies; 57+ messages in thread
From: Ingo Molnar @ 2010-12-19 22:30 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Peter Zijlstra, Mike Galbraith, Thomas Gleixner,
	Andrew Morton


* Linus Torvalds <torvalds@linux-foundation.org> wrote:

> On Sun, Dec 19, 2010 at 7:27 AM, Ingo Molnar <mingo@elte.hu> wrote:
> > @@ -3943,6 +4128,7 @@ need_resched_nonpreemptible:
> >                rq->nr_switches++;
> >                rq->curr = next;
> >                ++*switch_count;
> > +               WARN_ON_ONCE(test_tsk_need_resched(next));
> >
> >                context_switch(rq, prev, next); /* unlocks the rq */
> >                /*
> > diff --git a/kernel/timer.c b/kern
> 
> Please don't add things like this in an -rc. It looks like it's some
> debugging aid; I don't think it should have gone in now.

Yeah - sorry about that. The warning is bogus - please pull the updated tree below. 
(Can also be pulled on top of the previous one.)

Thanks,

	Ingo

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-fixes-for-linus

------------------>
Heiko Carstens (1):
      nohz: Fix get_next_timer_interrupt() vs cpu hotplug

Ingo Molnar (1):
      sched: Remove debugging check

Mike Galbraith (1):
      Sched: fix skip_clock_update optimization

Peter Zijlstra (3):
      sched: Cure more NO_HZ load average woes
      sched: Fix the irqtime code to deal with u64 wraps
      sched: Fix the irqtime code for 32bit


 include/linux/sched.h |    2 +-
 kernel/fork.c         |    1 +
 kernel/sched.c        |  287 ++++++++++++++++++++++++++++++++++++++++---------
 kernel/timer.c        |    8 ++-
 4 files changed, 245 insertions(+), 53 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2c79e92..2238745 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -143,7 +143,7 @@ extern unsigned long nr_iowait_cpu(int cpu);
 extern unsigned long this_cpu_load(void);
 
 
-extern void calc_global_load(void);
+extern void calc_global_load(unsigned long ticks);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 3b159c5..5447dc7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -273,6 +273,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 
 	setup_thread_stack(tsk, orig);
 	clear_user_return_notifier(tsk);
+	clear_tsk_need_resched(tsk);
 	stackend = end_of_stack(tsk);
 	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 
diff --git a/kernel/sched.c b/kernel/sched.c
index dc91a4d..297d1a0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -636,22 +636,18 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-static u64 irq_time_cpu(int cpu);
-static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+static void update_rq_clock_task(struct rq *rq, s64 delta);
 
-inline void update_rq_clock(struct rq *rq)
+static void update_rq_clock(struct rq *rq)
 {
-	if (!rq->skip_clock_update) {
-		int cpu = cpu_of(rq);
-		u64 irq_time;
+	s64 delta;
 
-		rq->clock = sched_clock_cpu(cpu);
-		irq_time = irq_time_cpu(cpu);
-		if (rq->clock - irq_time > rq->clock_task)
-			rq->clock_task = rq->clock - irq_time;
+	if (rq->skip_clock_update)
+		return;
 
-		sched_irq_time_avg_update(rq, irq_time);
-	}
+	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
+	rq->clock += delta;
+	update_rq_clock_task(rq, delta);
 }
 
 /*
@@ -1924,10 +1920,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
  * They are read and saved off onto struct rq in update_rq_clock().
  * This may result in other CPU reading this CPU's irq time and can
  * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value (or semi updated value on 32 bit) with a side effect of
- * accounting a slice of irq time to wrong task when irq is in progress
- * while we read rq->clock. That is a worthy compromise in place of having
- * locks on each irq in account_system_time.
+ * or new value with a side effect of accounting a slice of irq time to wrong
+ * task when irq is in progress while we read rq->clock. That is a worthy
+ * compromise in place of having locks on each irq in account_system_time.
  */
 static DEFINE_PER_CPU(u64, cpu_hardirq_time);
 static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1945,19 +1940,58 @@ void disable_sched_clock_irqtime(void)
 	sched_clock_irqtime = 0;
 }
 
-static u64 irq_time_cpu(int cpu)
+#ifndef CONFIG_64BIT
+static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
+
+static inline void irq_time_write_begin(void)
 {
-	if (!sched_clock_irqtime)
-		return 0;
+	__this_cpu_inc(irq_time_seq.sequence);
+	smp_wmb();
+}
+
+static inline void irq_time_write_end(void)
+{
+	smp_wmb();
+	__this_cpu_inc(irq_time_seq.sequence);
+}
+
+static inline u64 irq_time_read(int cpu)
+{
+	u64 irq_time;
+	unsigned seq;
 
+	do {
+		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
+		irq_time = per_cpu(cpu_softirq_time, cpu) +
+			   per_cpu(cpu_hardirq_time, cpu);
+	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
+
+	return irq_time;
+}
+#else /* CONFIG_64BIT */
+static inline void irq_time_write_begin(void)
+{
+}
+
+static inline void irq_time_write_end(void)
+{
+}
+
+static inline u64 irq_time_read(int cpu)
+{
 	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
 }
+#endif /* CONFIG_64BIT */
 
+/*
+ * Called before incrementing preempt_count on {soft,}irq_enter
+ * and before decrementing preempt_count on {soft,}irq_exit.
+ */
 void account_system_vtime(struct task_struct *curr)
 {
 	unsigned long flags;
+	s64 delta;
 	int cpu;
-	u64 now, delta;
 
 	if (!sched_clock_irqtime)
 		return;
@@ -1965,9 +1999,10 @@ void account_system_vtime(struct task_struct *curr)
 	local_irq_save(flags);
 
 	cpu = smp_processor_id();
-	now = sched_clock_cpu(cpu);
-	delta = now - per_cpu(irq_start_time, cpu);
-	per_cpu(irq_start_time, cpu) = now;
+	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
+	__this_cpu_add(irq_start_time, delta);
+
+	irq_time_write_begin();
 	/*
 	 * We do not account for softirq time from ksoftirqd here.
 	 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1975,33 +2010,55 @@ void account_system_vtime(struct task_struct *curr)
 	 * that do not consume any time, but still wants to run.
 	 */
 	if (hardirq_count())
-		per_cpu(cpu_hardirq_time, cpu) += delta;
+		__this_cpu_add(cpu_hardirq_time, delta);
 	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
-		per_cpu(cpu_softirq_time, cpu) += delta;
+		__this_cpu_add(cpu_softirq_time, delta);
 
+	irq_time_write_end();
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-	if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
-		u64 delta_irq = curr_irq_time - rq->prev_irq_time;
-		rq->prev_irq_time = curr_irq_time;
-		sched_rt_avg_update(rq, delta_irq);
-	}
+	s64 irq_delta;
+
+	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
+
+	/*
+	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
+	 * this case when a previous update_rq_clock() happened inside a
+	 * {soft,}irq region.
+	 *
+	 * When this happens, we stop ->clock_task and only update the
+	 * prev_irq_time stamp to account for the part that fit, so that a next
+	 * update will consume the rest. This ensures ->clock_task is
+	 * monotonic.
+	 *
+	 * It does however cause some slight miss-attribution of {soft,}irq
+	 * time, a more accurate solution would be to update the irq_time using
+	 * the current rq->clock timestamp, except that would require using
+	 * atomic ops.
+	 */
+	if (irq_delta > delta)
+		irq_delta = delta;
+
+	rq->prev_irq_time += irq_delta;
+	delta -= irq_delta;
+	rq->clock_task += delta;
+
+	if (irq_delta && sched_feat(NONIRQ_POWER))
+		sched_rt_avg_update(rq, irq_delta);
 }
 
-#else
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
-static u64 irq_time_cpu(int cpu)
+static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
-	return 0;
+	rq->clock_task += delta;
 }
 
-static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
-
-#endif
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2129,7 +2186,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * A queue event has occurred, and we're going to schedule.  In
 	 * this case, we can save a useless back to back clock update.
 	 */
-	if (test_tsk_need_resched(rq->curr))
+	if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 
@@ -3119,6 +3176,15 @@ static long calc_load_fold_active(struct rq *this_rq)
 	return delta;
 }
 
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	load += 1UL << (FSHIFT - 1);
+	return load >> FSHIFT;
+}
+
 #ifdef CONFIG_NO_HZ
 /*
  * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3148,6 +3214,128 @@ static long calc_load_fold_idle(void)
 
 	return delta;
 }
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+	unsigned long result = 1UL << frac_bits;
+
+	if (n) for (;;) {
+		if (n & 1) {
+			result *= x;
+			result += 1UL << (frac_bits - 1);
+			result >>= frac_bits;
+		}
+		n >>= 1;
+		if (!n)
+			break;
+		x *= x;
+		x += 1UL << (frac_bits - 1);
+		x >>= frac_bits;
+	}
+
+	return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+	    unsigned long active, unsigned int n)
+{
+
+	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+	long delta, active, n;
+
+	if (time_before(jiffies, calc_load_update))
+		return;
+
+	/*
+	 * If we crossed a calc_load_update boundary, make sure to fold
+	 * any pending idle changes, the respective CPUs might have
+	 * missed the tick driven calc_load_account_active() update
+	 * due to NO_HZ.
+	 */
+	delta = calc_load_fold_idle();
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+
+	/*
+	 * If we were idle for multiple load cycles, apply them.
+	 */
+	if (ticks >= LOAD_FREQ) {
+		n = ticks / LOAD_FREQ;
+
+		active = atomic_long_read(&calc_load_tasks);
+		active = active > 0 ? active * FIXED_1 : 0;
+
+		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+		calc_load_update += n * LOAD_FREQ;
+	}
+
+	/*
+	 * Its possible the remainder of the above division also crosses
+	 * a LOAD_FREQ period, the regular check in calc_global_load()
+	 * which comes after this will take care of that.
+	 *
+	 * Consider us being 11 ticks before a cycle completion, and us
+	 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+	 * age us 4 cycles, and the test in calc_global_load() will
+	 * pick up the final one.
+	 */
+}
 #else
 static void calc_load_account_idle(struct rq *this_rq)
 {
@@ -3157,6 +3345,10 @@ static inline long calc_load_fold_idle(void)
 {
 	return 0;
 }
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
 #endif
 
 /**
@@ -3174,24 +3366,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 	loads[2] = (avenrun[2] + offset) << shift;
 }
 
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-	load *= exp;
-	load += active * (FIXED_1 - exp);
-	return load >> FSHIFT;
-}
-
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
  */
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
 {
-	unsigned long upd = calc_load_update + 10;
 	long active;
 
-	if (time_before(jiffies, upd))
+	calc_global_nohz(ticks);
+
+	if (time_before(jiffies, calc_load_update + 10))
 		return;
 
 	active = atomic_long_read(&calc_load_tasks);
@@ -3845,7 +4030,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
 {
 	if (prev->se.on_rq)
 		update_rq_clock(rq);
-	rq->skip_clock_update = 0;
 	prev->sched_class->put_prev_task(rq, prev);
 }
 
@@ -3903,7 +4087,6 @@ need_resched_nonpreemptible:
 		hrtick_clear(rq);
 
 	raw_spin_lock_irq(&rq->lock);
-	clear_tsk_need_resched(prev);
 
 	switch_count = &prev->nivcsw;
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3935,6 +4118,8 @@ need_resched_nonpreemptible:
 
 	put_prev_task(rq, prev);
 	next = pick_next_task(rq);
+	clear_tsk_need_resched(prev);
+	rq->skip_clock_update = 0;
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
diff --git a/kernel/timer.c b/kernel/timer.c
index 68a9ae7..353b922 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1252,6 +1252,12 @@ unsigned long get_next_timer_interrupt(unsigned long now)
 	struct tvec_base *base = __get_cpu_var(tvec_bases);
 	unsigned long expires;
 
+	/*
+	 * Pretend that there is no timer pending if the cpu is offline.
+	 * Possible pending timers will be migrated later to an active cpu.
+	 */
+	if (cpu_is_offline(smp_processor_id()))
+		return now + NEXT_TIMER_MAX_DELTA;
 	spin_lock(&base->lock);
 	if (time_before_eq(base->next_timer, base->timer_jiffies))
 		base->next_timer = __next_timer_interrupt(base);
@@ -1319,7 +1325,7 @@ void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
 	update_wall_time();
-	calc_global_load();
+	calc_global_load(ticks);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM

^ permalink raw reply related	[flat|nested] 57+ messages in thread

* [GIT PULL, v2] scheduler fixes
  2009-10-23 14:43 [GIT PULL] " Ingo Molnar
@ 2009-10-23 14:46 ` Ingo Molnar
  0 siblings, 0 replies; 57+ messages in thread
From: Ingo Molnar @ 2009-10-23 14:46 UTC (permalink / raw)
  To: Linus Torvalds
  Cc: linux-kernel, Peter Zijlstra, Mike Galbraith, Andrew Morton


* Ingo Molnar <mingo@elte.hu> wrote:

> Linus,
> 
> Please pull the latest sched-fixes-for-linus git tree from:
> 
>    git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-fixes-for-linus

Sorry - the tail commit was bad and I forgot to exclude it. Updated pull 
request (with a different URI):

Please pull the latest sched-fixes-for-linus-2 git tree from:

   git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git sched-fixes-for-linus-2

 Thanks,

	Ingo

------------------>
Peter Zijlstra (2):
      sched: Disable SD_PREFER_LOCAL for MC/CPU domains
      sched: Do less agressive buddy clearing


 arch/x86/include/asm/topology.h |    1 +
 include/linux/topology.h        |    4 ++--
 kernel/sched_fair.c             |   27 +++++++++++++--------------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 25a9284..d823c24 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -143,6 +143,7 @@ extern unsigned long node_remap_size[];
 				| 1*SD_BALANCE_FORK			\
 				| 0*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
+				| 1*SD_PREFER_LOCAL			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_POWERSAVINGS_BALANCE		\
 				| 0*SD_SHARE_PKG_RESOURCES		\
diff --git a/include/linux/topology.h b/include/linux/topology.h
index fc0bf3e..57e6357 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -129,7 +129,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_FORK			\
 				| 0*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
-				| 1*SD_PREFER_LOCAL			\
+				| 0*SD_PREFER_LOCAL			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 1*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
@@ -162,7 +162,7 @@ int arch_update_cpu_topology(void);
 				| 1*SD_BALANCE_FORK			\
 				| 0*SD_BALANCE_WAKE			\
 				| 1*SD_WAKE_AFFINE			\
-				| 1*SD_PREFER_LOCAL			\
+				| 0*SD_PREFER_LOCAL			\
 				| 0*SD_SHARE_CPUPOWER			\
 				| 0*SD_SHARE_PKG_RESOURCES		\
 				| 0*SD_SERIALIZE			\
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b4..c32c3e6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -861,12 +861,21 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = __pick_next_entity(cfs_rq);
+	struct sched_entity *buddy;
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
-		return cfs_rq->next;
+	if (cfs_rq->next) {
+		buddy = cfs_rq->next;
+		cfs_rq->next = NULL;
+		if (wakeup_preempt_entity(buddy, se) < 1)
+			return buddy;
+	}
 
-	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
-		return cfs_rq->last;
+	if (cfs_rq->last) {
+		buddy = cfs_rq->last;
+		cfs_rq->last = NULL;
+		if (wakeup_preempt_entity(buddy, se) < 1)
+			return buddy;
+	}
 
 	return se;
 }
@@ -1654,16 +1663,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 
 	do {
 		se = pick_next_entity(cfs_rq);
-		/*
-		 * If se was a buddy, clear it so that it will have to earn
-		 * the favour again.
-		 *
-		 * If se was not a buddy, clear the buddies because neither
-		 * was elegible to run, let them earn it again.
-		 *
-		 * IOW. unconditionally clear buddies.
-		 */
-		__clear_buddies(cfs_rq, NULL);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);

^ permalink raw reply related	[flat|nested] 57+ messages in thread

end of thread, other threads:[~2019-11-17 16:35 UTC | newest]

Thread overview: 57+ messages
2009-05-18 14:27 [GIT PULL] scheduler fixes Ingo Molnar
2009-05-18 16:13 ` Linus Torvalds
2009-05-18 16:49   ` Ingo Molnar
2009-05-18 16:58     ` Linus Torvalds
2009-05-18 17:09       ` Ingo Molnar
2009-05-18 19:03         ` Ingo Molnar
2009-05-18 19:16           ` Linus Torvalds
2009-05-18 20:20             ` Ingo Molnar
2009-05-18 22:06               ` Linus Torvalds
2009-05-19 12:27                 ` Rusty Russell
2009-05-24 16:13                 ` Pekka J Enberg
2009-05-24 18:18                   ` Linus Torvalds
2009-05-24 19:13                     ` Pekka Enberg
2009-05-25  5:16                     ` Benjamin Herrenschmidt
2009-05-24 18:34                   ` Yinghai Lu
2009-05-24 19:15                     ` Pekka Enberg
2009-05-25  2:53                     ` Ingo Molnar
2009-05-25  4:45                       ` Yinghai Lu
2009-05-25  5:15                         ` Ingo Molnar
2009-05-25  5:54                           ` Yinghai Lu
2009-05-25  8:47                           ` Pekka J Enberg
2009-05-25 11:25                             ` Nick Piggin
2009-05-25 11:37                               ` Pekka Enberg
2009-05-25 11:41                                 ` Nick Piggin
2009-05-25 11:44                                   ` Pekka J Enberg
2009-05-25 15:01                                     ` Matt Mackall
2009-05-25 16:39                                     ` Linus Torvalds
2009-05-25 18:39                                       ` Pekka Enberg
2009-05-25 19:14                                         ` Linus Torvalds
2009-05-25 19:13                                           ` Pekka Enberg
2009-05-26  1:50                                             ` Yinghai Lu
2009-05-26  7:38                                         ` Nick Piggin
2009-05-28 12:06                                           ` Pekka Enberg
2009-05-28 12:12                                             ` Nick Piggin
2009-05-28 12:24                                               ` Pekka Enberg
2009-05-26  7:33                                       ` Nick Piggin
2009-05-25 12:04                                   ` Pekka J Enberg
2009-05-25 12:12                                     ` Nick Piggin
2009-05-25 14:55                             ` Matt Mackall
2009-05-25 14:58                               ` Pekka Enberg
2009-05-26 17:19                               ` Christoph Lameter
2009-05-28 12:14                                 ` Pekka Enberg
2009-05-26 14:27                             ` Christoph Lameter
2009-05-25  4:52                       ` H. Peter Anvin
2009-05-25  5:05                         ` Ingo Molnar
2009-05-25  5:13                         ` Yinghai Lu
2009-05-25  5:19                       ` Benjamin Herrenschmidt
2009-05-25  7:16                       ` Rusty Russell
     [not found]               ` <4A12E759.6040806@kernel.org>
     [not found]                 ` <20090520071900.GB11952@elte.hu>
2009-05-20  7:37                   ` [PATCH] x86: enable_update_mptable should MACRO Yinghai Lu
2009-05-28  0:00                     ` [tip:irq/numa] x86: enable_update_mptable should be a macro tip-bot for Yinghai Lu
2009-05-19  8:31   ` [tip:sched/core] sched: properly define the sched_group::cpumask and sched_domain::span fields tip-bot for Ingo Molnar
2009-05-18 16:55 ` [GIT PULL, v2] scheduler fixes Ingo Molnar
2009-10-23 14:43 [GIT PULL] " Ingo Molnar
2009-10-23 14:46 ` [GIT PULL, v2] " Ingo Molnar
2010-12-19 15:27 [GIT PULL] " Ingo Molnar
2010-12-19 20:45 ` Linus Torvalds
2010-12-19 22:30   ` [GIT PULL, v2] " Ingo Molnar
2011-09-30 18:36 [GIT PULL] " Ingo Molnar
2011-10-01  7:38 ` [GIT PULL, v2] " Ingo Molnar
2019-11-16 21:37 [GIT PULL] " Ingo Molnar
2019-11-16 22:44 ` Valentin Schneider
2019-11-17  0:10   ` Linus Torvalds
2019-11-17 10:41     ` [GIT PULL v2] " Ingo Molnar
2019-11-17 16:35       ` pr-tracker-bot
