[v6,15/17] static_call: Allow early init
diff mbox series

Message ID 20200710134337.036717190@infradead.org
State New
Headers show
Series
  • Add static_call
Related show

Commit Message

Peter Zijlstra July 10, 2020, 1:38 p.m. UTC
In order to use static_call() to wire up x86_pmu, we need to
initialize earlier; copy some of the tricks from jump_label to enable
this.

Primarily we overload key->next to store a sites pointer when there
are no modules, this avoids having to use kmalloc() to initialize the
sites and allows us to run much earlier.

(arguably, this is much much earlier than needed for perf, but it
might allow other uses.)

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/setup.c       |    2 +
 arch/x86/kernel/static_call.c |    8 +++++-
 include/linux/static_call.h   |   15 +++++++++--
 kernel/static_call.c          |   55 +++++++++++++++++++++++++++++++++++++++---
 4 files changed, 74 insertions(+), 6 deletions(-)

Comments

Steven Rostedt July 11, 2020, 1:14 a.m. UTC | #1
On Fri, 10 Jul 2020 15:38:46 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> In order to use static_call() to wire up x86_pmu, we need to
> initialize earlier; copy some of the tricks from jump_label to enable
> this.
> 
> Primarily we overload key->next to store a sites pointer when there
> are no modules, this avoids having to use kmalloc() to initialize the
> sites and allows us to run much earlier.
> 

I'm confused. What was the need to have key->next store site pointers
in order to move it up earlier?

-- Steve


> (arguably, this is much much earlier than needed for perf, but it
> might allow other uses.)
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/kernel/setup.c       |    2 +
>  arch/x86/kernel/static_call.c |    8 +++++-
>  include/linux/static_call.h   |   15 +++++++++--
>  kernel/static_call.c          |   55 +++++++++++++++++++++++++++++++++++++++---
>  4 files changed, 74 insertions(+), 6 deletions(-)
> 
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -19,6 +19,7 @@
>  #include <linux/hugetlb.h>
>  #include <linux/tboot.h>
>  #include <linux/usb/xhci-dbgp.h>
> +#include <linux/static_call.h>
>  
>  #include <uapi/linux/mount.h>
>  
> @@ -848,6 +849,7 @@ void __init setup_arch(char **cmdline_p)
>  	early_cpu_init();
>  	arch_init_ideal_nops();
>  	jump_label_init();
> +	static_call_init();
>  	early_ioremap_init();
>  
>  	setup_olpc_ofw_pgd();
> --- a/arch/x86/kernel/static_call.c
> +++ b/arch/x86/kernel/static_call.c
> @@ -11,7 +11,7 @@ enum insn_type {
>  	RET = 3,  /* tramp / site cond-tail-call */
>  };
>  
> -static void __static_call_transform(void *insn, enum insn_type type, void *func)
> +static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
>  {
>  	int size = CALL_INSN_SIZE;
>  	const void *code;
> @@ -33,11 +33,17 @@ static void __static_call_transform(void
>  		code = text_gen_insn(RET_INSN_OPCODE, insn, func);
>  		size = RET_INSN_SIZE;
>  		break;
> +
> +	default: /* GCC is a moron -- it figures @code can be uninitialized below */
> +		BUG();
>  	}
>  
>  	if (memcmp(insn, code, size) == 0)
>  		return;
>  
> +	if (unlikely(system_state == SYSTEM_BOOTING))
> +		return text_poke_early(insn, code, size);
> +
>  	text_poke_bp(insn, code, size, NULL);
>  }
>  
> --- a/include/linux/static_call.h
> +++ b/include/linux/static_call.h
> @@ -99,6 +99,8 @@ extern void arch_static_call_transform(v
>  
>  #ifdef CONFIG_HAVE_STATIC_CALL_INLINE
>  
> +extern void __init static_call_init(void);
> +
>  struct static_call_mod {
>  	struct static_call_mod *next;
>  	struct module *mod; /* for vmlinux, mod == NULL */
> @@ -107,7 +109,12 @@ struct static_call_mod {
>  
>  struct static_call_key {
>  	void *func;
> -	struct static_call_mod *mods;
> +	union {
> +		/* bit 0: 0 = mods, 1 = sites */
> +		unsigned long type;
> +		struct static_call_mod *mods;
> +		struct static_call_site *sites;
> +	};
>  };
>  
>  extern void __static_call_update(struct static_call_key *key, void *tramp, void *func);
> @@ -118,7 +125,7 @@ extern int static_call_text_reserved(voi
>  	DECLARE_STATIC_CALL(name, _func);				\
>  	struct static_call_key STATIC_CALL_KEY(name) = {		\
>  		.func = _func,						\
> -		.mods = NULL,						\
> +		.type = 1,						\
>  	};								\
>  	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
>  
> @@ -143,6 +150,8 @@ extern int static_call_text_reserved(voi
>  
>  #elif defined(CONFIG_HAVE_STATIC_CALL)
>  
> +static inline void static_call_init(void) { }
> +
>  struct static_call_key {
>  	void *func;
>  };
> @@ -188,6 +197,8 @@ static inline int static_call_text_reser
>  
>  #else /* Generic implementation */
>  
> +static inline void static_call_init(void) { }
> +
>  struct static_call_key {
>  	void *func;
>  };
> --- a/kernel/static_call.c
> +++ b/kernel/static_call.c
> @@ -94,10 +94,31 @@ static inline void static_call_sort_entr
>  	     static_call_site_cmp, static_call_site_swap);
>  }
>  
> +static inline bool static_call_key_has_mods(struct static_call_key *key)
> +{
> +	return !(key->type & 1);
> +}
> +
> +static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
> +{
> +	if (!static_call_key_has_mods(key))
> +		return NULL;
> +
> +	return key->mods;
> +}
> +
> +static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
> +{
> +	if (static_call_key_has_mods(key))
> +		return NULL;
> +
> +	return (struct static_call_site *)(key->type & ~1);
> +}
> +
>  void __static_call_update(struct static_call_key *key, void *tramp, void *func)
>  {
>  	struct static_call_site *site, *stop;
> -	struct static_call_mod *site_mod;
> +	struct static_call_mod *site_mod, first;
>  
>  	cpus_read_lock();
>  	static_call_lock();
> @@ -116,13 +137,22 @@ void __static_call_update(struct static_
>  	if (WARN_ON_ONCE(!static_call_initialized))
>  		goto done;
>  
> -	for (site_mod = key->mods; site_mod; site_mod = site_mod->next) {
> +	first = (struct static_call_mod){
> +		.next = static_call_key_next(key),
> +		.mod = NULL,
> +		.sites = static_call_key_sites(key),
> +	};
> +
> +	for (site_mod = &first; site_mod; site_mod = site_mod->next) {
>  		struct module *mod = site_mod->mod;
>  
>  		if (!site_mod->sites) {
>  			/*
>  			 * This can happen if the static call key is defined in
>  			 * a module which doesn't use it.
> +			 *
> +			 * It also happens in the has_mods case, where the
> +			 * 'first' entry has no sites associated with it.
>  			 */
>  			continue;
>  		}
> @@ -192,16 +222,35 @@ static int __static_call_init(struct mod
>  		if (key != prev_key) {
>  			prev_key = key;
>  
> +			if (!mod) {
> +				key->sites = site;
> +				key->type |= 1;
> +				goto do_transform;
> +			}
> +
>  			site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
>  			if (!site_mod)
>  				return -ENOMEM;
>  
> +			if (!static_call_key_has_mods(key)) {
> +				site_mod->mod = NULL;
> +				site_mod->next = NULL;
> +				site_mod->sites = static_call_key_sites(key);
> +
> +				key->mods = site_mod;
> +
> +				site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
> +				if (!site_mod)
> +					return -ENOMEM;
> +			}
> +
>  			site_mod->mod = mod;
>  			site_mod->sites = site;
>  			site_mod->next = key->mods;
>  			key->mods = site_mod;
>  		}
>  
> +do_transform:
>  		arch_static_call_transform(site_addr, NULL, key->func,
>  				static_call_is_tail(site));
>  	}
> @@ -344,7 +393,7 @@ int static_call_text_reserved(void *star
>  	return ret;
>  }
>  
> -static void __init static_call_init(void)
> +void __init static_call_init(void)
>  {
>  	int ret;
>  
>
Peter Zijlstra July 11, 2020, 5:08 a.m. UTC | #2
On Fri, Jul 10, 2020 at 09:14:26PM -0400, Steven Rostedt wrote:
> On Fri, 10 Jul 2020 15:38:46 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > In order to use static_call() to wire up x86_pmu, we need to
> > initialize earlier; copy some of the tricks from jump_label to enable
> > this.
> > 
> > Primarily we overload key->next to store a sites pointer when there
> > are no modules, this avoids having to use kmalloc() to initialize the
> > sites and allows us to run much earlier.
> > 
> 
> I'm confused. What was the need to have key->next store site pointers
> in order to move it up earlier?

The critical part was to not need an allocation.
Steven Rostedt July 13, 2020, 8:24 p.m. UTC | #3
On Sat, 11 Jul 2020 07:08:31 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Fri, Jul 10, 2020 at 09:14:26PM -0400, Steven Rostedt wrote:
> > On Fri, 10 Jul 2020 15:38:46 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> >   
> > > In order to use static_call() to wire up x86_pmu, we need to
> > > initialize earlier; copy some of the tricks from jump_label to enable
> > > this.
> > > 
> > > Primarily we overload key->next to store a sites pointer when there
> > > are no modules, this avoids having to use kmalloc() to initialize the
> > > sites and allows us to run much earlier.
> > >   
> > 
> > I'm confused. What was the need to have key->next store site pointers
> > in order to move it up earlier?  
> 
> The critical part was to not need an allocation.

Why is an allocation needed? What's different about calling it early
that we need an allocation or this trick?

The two paragraphs above seem totally disconnected.

"In order to use static_call() to wire up x86_pmu, we need to
initialize earlier; copy some of the tricks from jump_label to enable
this."

What tricks were copied?

"Primarily we overload key->next to store a sites pointer when there
are no modules, this avoids having to use kmalloc() to initialize the
sites and allows us to run much earlier."

Why is kmalloc() (or this trick) needed to initialize the sites?

-- Steve
Peter Zijlstra July 14, 2020, 9:51 a.m. UTC | #4
On Mon, Jul 13, 2020 at 04:24:19PM -0400, Steven Rostedt wrote:
> On Sat, 11 Jul 2020 07:08:31 +0200
> Peter Zijlstra <peterz@infradead.org> wrote:
> 
> > On Fri, Jul 10, 2020 at 09:14:26PM -0400, Steven Rostedt wrote:
> > > On Fri, 10 Jul 2020 15:38:46 +0200
> > > Peter Zijlstra <peterz@infradead.org> wrote:
> > >   
> > > > In order to use static_call() to wire up x86_pmu, we need to
> > > > initialize earlier; copy some of the tricks from jump_label to enable
> > > > this.
> > > > 
> > > > Primarily we overload key->next to store a sites pointer when there
> > > > are no modules, this avoids having to use kmalloc() to initialize the
> > > > sites and allows us to run much earlier.
> > > >   
> > > 
> > > I'm confused. What was the need to have key->next store site pointers
> > > in order to move it up earlier?  
> > 
> > The critical part was to not need an allocation.
> 
> Why is an allocation needed? What's different about calling it early
> that we need an allocation or this trick?
> 
> The two paragraphs above seem totally disconnected.
> 
> "In order to use static_call() to wire up x86_pmu, we need to
> initialize earlier; copy some of the tricks from jump_label to enable
> this."
> 
> What tricks were copied?
> 
> "Primarily we overload key->next to store a sites pointer when there

^^ this trick...

+	union {
+		/* bit 0: 0 = mods, 1 = sites */
+		unsigned long type;
+		struct static_call_mod *mods;
+		struct static_call_site *sites;
+	};

If that isn't a trick, I don't know ;-)

> are no modules, this avoids having to use kmalloc() to initialize the
> sites and allows us to run much earlier."
> 
> Why is kmalloc() (or this trick) needed to initialize the sites?

That's just how the code was; it treated vmlinux as the NULL module, and
as such needed a static_call_mod allocated to host the static_call_sites
pointer.

That is, the static_call_key has a single linked list pointer to
static_call_mod, and every module has an entry on that list with a
pointer to their sites. Very simple and straight forward.

Except it requires an allocation to set up, which is a pain is you want
it initialized very early.
Steven Rostedt July 14, 2020, 2:16 p.m. UTC | #5
On Tue, 14 Jul 2020 11:51:17 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Mon, Jul 13, 2020 at 04:24:19PM -0400, Steven Rostedt wrote:
> > On Sat, 11 Jul 2020 07:08:31 +0200
> > Peter Zijlstra <peterz@infradead.org> wrote:
> >   
> > > On Fri, Jul 10, 2020 at 09:14:26PM -0400, Steven Rostedt wrote:  
> > > > On Fri, 10 Jul 2020 15:38:46 +0200
> > > > Peter Zijlstra <peterz@infradead.org> wrote:
> > > >     
> > > > > In order to use static_call() to wire up x86_pmu, we need to
> > > > > initialize earlier; copy some of the tricks from jump_label to enable
> > > > > this.
> > > > > 
> > > > > Primarily we overload key->next to store a sites pointer when there
> > > > > are no modules, this avoids having to use kmalloc() to initialize the
> > > > > sites and allows us to run much earlier.
> > > > >     
> > > > 
> > > > I'm confused. What was the need to have key->next store site pointers
> > > > in order to move it up earlier?    
> > > 
> > > The critical part was to not need an allocation.  
> > 
> > Why is an allocation needed? What's different about calling it early
> > that we need an allocation or this trick?
> > 
> > The two paragraphs above seem totally disconnected.
> > 
> > "In order to use static_call() to wire up x86_pmu, we need to
> > initialize earlier; copy some of the tricks from jump_label to enable
> > this."
> > 
> > What tricks were copied?
> > 
> > "Primarily we overload key->next to store a sites pointer when there  
> 
> ^^ this trick...
> 
> +	union {
> +		/* bit 0: 0 = mods, 1 = sites */
> +		unsigned long type;
> +		struct static_call_mod *mods;
> +		struct static_call_site *sites;
> +	};
> 
> If that isn't a trick, I don't know ;-)

Ah, that makes more sense. The way it was worded didn't quite put 2 and
2 together like that.

> 
> > are no modules, this avoids having to use kmalloc() to initialize the
> > sites and allows us to run much earlier."
> > 
> > Why is kmalloc() (or this trick) needed to initialize the sites?  
> 
> That's just how the code was; it treated vmlinux as the NULL module, and
> as such needed a static_call_mod allocated to host the static_call_sites
> pointer.
> 
> That is, the static_call_key has a single linked list pointer to
> static_call_mod, and every module has an entry on that list with a
> pointer to their sites. Very simple and straight forward.
> 
> Except it requires an allocation to set up, which is a pain is you want
> it initialized very early.

OK, I'm still missing something, but having a hard time explaining
exactly what that is. ;-)

I guess that is, why did moving the initialization early require an
allocation where initializing it later did not? What allocation are we
avoiding?

I'm not seeing why this trick is needed when we moved the init early as
compared to doing the same thing later on.

Is it this?

> @@ -192,16 +222,35 @@ static int __static_call_init(struct mod
>                 if (key != prev_key) {
>                         prev_key = key;
>  
> +                       if (!mod) {
> +                               key->sites = site;
> +                               key->type |= 1;
> +                               goto do_transform;
> +                       }
> +

We want to avoid calling kzalloc() early?

If so, this should have a comment here stating so and why.

>                         site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
>                         if (!site_mod)
>                                 return -ENOMEM;

-- Steve
Peter Zijlstra July 14, 2020, 3:54 p.m. UTC | #6
On Tue, Jul 14, 2020 at 10:16:36AM -0400, Steven Rostedt wrote:

> > That's just how the code was; it treated vmlinux as the NULL module, and
> > as such needed a static_call_mod allocated to host the static_call_sites
> > pointer.
> > 
> > That is, the static_call_key has a single linked list pointer to
> > static_call_mod, and every module has an entry on that list with a
> > pointer to their sites. Very simple and straight forward.
> > 
> > Except it requires an allocation to set up, which is a pain is you want
> > it initialized very early.
> 
> OK, I'm still missing something, but having a hard time explaining
> exactly what that is. ;-)
> 
> I guess that is, why did moving the initialization early require an
> allocation where initializing it later did not? What allocation are we
> avoiding?

The other way around. Before this patch initialization required an
allocation, with this patch init no longer requires one.

> I'm not seeing why this trick is needed when we moved the init early as
> compared to doing the same thing later on.

We use the trick to avoid the alloc, which is what enables early init.

So, before:
 - init required alloc
 - alloc prohibits early use

after:
 - init no longer require alloc
   - because horrible pointer reuse
 - early use possible

> > @@ -192,16 +222,35 @@ static int __static_call_init(struct mod
> >                 if (key != prev_key) {
> >                         prev_key = key;
> >  
> > +                       if (!mod) {
> > +                               key->sites = site;
> > +                               key->type |= 1;
> > +                               goto do_transform;
> > +                       }
> > +
> 
> We want to avoid calling kzalloc() early?
> 
> If so, this should have a comment here stating so and why.

Fair enough; and I think I even see a further optimiation.

How's this:

---
Subject: static_call: Allow early init
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 4 Oct 17:21:10 CEST 2019

In order to use static_call() to wire up x86_pmu, we need to
initialize earlier; copy some of the tricks from jump_label to enable
this.

Primarily we overload key->next to store a sites pointer when there
are no modules, this avoids having to use kmalloc() to initialize the
sites and allows us to run much earlier.

(arguably, this is much much earlier than needed for perf, but it
might allow other uses.)

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/setup.c       |    2 +
 arch/x86/kernel/static_call.c |    8 ++++-
 include/linux/static_call.h   |   15 ++++++++-
 kernel/static_call.c          |   67 +++++++++++++++++++++++++++++++++++++++---
 4 files changed, 85 insertions(+), 7 deletions(-)

--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -19,6 +19,7 @@
 #include <linux/hugetlb.h>
 #include <linux/tboot.h>
 #include <linux/usb/xhci-dbgp.h>
+#include <linux/static_call.h>
 
 #include <uapi/linux/mount.h>
 
@@ -848,6 +849,7 @@ void __init setup_arch(char **cmdline_p)
 	early_cpu_init();
 	arch_init_ideal_nops();
 	jump_label_init();
+	static_call_init();
 	early_ioremap_init();
 
 	setup_olpc_ofw_pgd();
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -11,7 +11,7 @@ enum insn_type {
 	RET = 3,  /* tramp / site cond-tail-call */
 };
 
-static void __static_call_transform(void *insn, enum insn_type type, void *func)
+static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
 {
 	int size = CALL_INSN_SIZE;
 	const void *code;
@@ -33,11 +33,17 @@ static void __static_call_transform(void
 		code = text_gen_insn(RET_INSN_OPCODE, insn, func);
 		size = RET_INSN_SIZE;
 		break;
+
+	default: /* GCC is a moron -- it figures @code can be uninitialized below */
+		BUG();
 	}
 
 	if (memcmp(insn, code, size) == 0)
 		return;
 
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return text_poke_early(insn, code, size);
+
 	text_poke_bp(insn, code, size, NULL);
 }
 
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -136,6 +136,8 @@ extern void arch_static_call_transform(v
 
 #ifdef CONFIG_HAVE_STATIC_CALL_INLINE
 
+extern void __init static_call_init(void);
+
 struct static_call_mod {
 	struct static_call_mod *next;
 	struct module *mod; /* for vmlinux, mod == NULL */
@@ -144,7 +146,12 @@ struct static_call_mod {
 
 struct static_call_key {
 	void *func;
-	struct static_call_mod *mods;
+	union {
+		/* bit 0: 0 = mods, 1 = sites */
+		unsigned long type;
+		struct static_call_mod *mods;
+		struct static_call_site *sites;
+	};
 };
 
 extern void __static_call_update(struct static_call_key *key, void *tramp, void *func);
@@ -155,7 +162,7 @@ extern int static_call_text_reserved(voi
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_KEY(name) = {		\
 		.func = _func,						\
-		.mods = NULL,						\
+		.type = 1,						\
 	};								\
 	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
 
@@ -180,6 +187,8 @@ extern int static_call_text_reserved(voi
 
 #elif defined(CONFIG_HAVE_STATIC_CALL)
 
+static inline void static_call_init(void) { }
+
 struct static_call_key {
 	void *func;
 };
@@ -225,6 +234,8 @@ static inline int static_call_text_reser
 
 #else /* Generic implementation */
 
+static inline void static_call_init(void) { }
+
 struct static_call_key {
 	void *func;
 };
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -94,10 +94,31 @@ static inline void static_call_sort_entr
 	     static_call_site_cmp, static_call_site_swap);
 }
 
+static inline bool static_call_key_has_mods(struct static_call_key *key)
+{
+	return !(key->type & 1);
+}
+
+static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
+{
+	if (!static_call_key_has_mods(key))
+		return NULL;
+
+	return key->mods;
+}
+
+static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
+{
+	if (static_call_key_has_mods(key))
+		return NULL;
+
+	return (struct static_call_site *)(key->type & ~1);
+}
+
 void __static_call_update(struct static_call_key *key, void *tramp, void *func)
 {
 	struct static_call_site *site, *stop;
-	struct static_call_mod *site_mod;
+	struct static_call_mod *site_mod, first;
 
 	cpus_read_lock();
 	static_call_lock();
@@ -116,13 +137,22 @@ void __static_call_update(struct static_
 	if (WARN_ON_ONCE(!static_call_initialized))
 		goto done;
 
-	for (site_mod = key->mods; site_mod; site_mod = site_mod->next) {
+	first = (struct static_call_mod){
+		.next = static_call_key_next(key),
+		.mod = NULL,
+		.sites = static_call_key_sites(key),
+	};
+
+	for (site_mod = &first; site_mod; site_mod = site_mod->next) {
 		struct module *mod = site_mod->mod;
 
 		if (!site_mod->sites) {
 			/*
 			 * This can happen if the static call key is defined in
 			 * a module which doesn't use it.
+			 *
+			 * It also happens in the has_mods case, where the
+			 * 'first' entry has no sites associated with it.
 			 */
 			continue;
 		}
@@ -192,16 +222,45 @@ static int __static_call_init(struct mod
 		if (key != prev_key) {
 			prev_key = key;
 
+			/*
+			 * For vmlinux (!mod) avoid the allocation by storing
+			 * the sites pointer in the key itself. Also see
+			 * __static_call_update()'s @first.
+			 */
+			if (!mod) {
+				key->sites = site;
+				key->type |= 1;
+				goto do_transform;
+			}
+
 			site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
 			if (!site_mod)
 				return -ENOMEM;
 
+			/*
+			 * When the key has a direct sites pointer, extract
+			 * that into an explicit struct static_call_mod, so we
+			 * can have a list of modules.
+			 */
+			if (static_call_key_sites(key)) {
+				site_mod->mod = NULL;
+				site_mod->next = NULL;
+				site_mod->sites = static_call_key_sites(key);
+
+				key->mods = site_mod;
+
+				site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+				if (!site_mod)
+					return -ENOMEM;
+			}
+
 			site_mod->mod = mod;
 			site_mod->sites = site;
-			site_mod->next = key->mods;
+			site_mod->next = static_call_key_next(key);
 			key->mods = site_mod;
 		}
 
+do_transform:
 		arch_static_call_transform(site_addr, NULL, key->func,
 				static_call_is_tail(site));
 	}
@@ -348,7 +407,7 @@ int static_call_text_reserved(void *star
 	return __static_call_mod_text_reserved(start, end);
 }
 
-static void __init static_call_init(void)
+void __init static_call_init(void)
 {
 	int ret;
Steven Rostedt July 14, 2020, 4:07 p.m. UTC | #7
On Tue, 14 Jul 2020 17:54:17 +0200
Peter Zijlstra <peterz@infradead.org> wrote:
> > I guess that is, why did moving the initialization early require an
> > allocation where initializing it later did not? What allocation are we
> > avoiding?  
> 
> The other way around. Before this patch initialization required an
> allocation, with this patch init no longer requires one.
> 

Yeah, that's what I figured. Just didn't express myself that way.

> > I'm not seeing why this trick is needed when we moved the init early as
> > compared to doing the same thing later on.  
> 
> We use the trick to avoid the alloc, which is what enables early init.
> 
> So, before:
>  - init required alloc
>  - alloc prohibits early use
> 
> after:
>  - init no longer require alloc
>    - because horrible pointer reuse
>  - early use possible
> 
> > > @@ -192,16 +222,35 @@ static int __static_call_init(struct mod
> > >                 if (key != prev_key) {
> > >                         prev_key = key;
> > >  
> > > +                       if (!mod) {
> > > +                               key->sites = site;
> > > +                               key->type |= 1;
> > > +                               goto do_transform;
> > > +                       }
> > > +  
> > 
> > We want to avoid calling kzalloc() early?
> > 
> > If so, this should have a comment here stating so and why.  
> 
> Fair enough; and I think I even see a further optimiation.
> 
> How's this:
> 
> ---
> Subject: static_call: Allow early init
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Fri, 4 Oct 17:21:10 CEST 2019
> 
> In order to use static_call() to wire up x86_pmu, we need to
> initialize earlier; copy some of the tricks from jump_label to enable
> this.
> 
> Primarily we overload key->next to store a sites pointer when there
> are no modules, this avoids having to use kmalloc() to initialize the
> sites and allows us to run much earlier.

Can we add a statement that says something like: "Because x86 now calls
static_call_init() before the setup of the memory allocator, we must
avoid using kmalloc() and friends for core kernel static calls." ?

This was the missing piece for me.

> 
> (arguably, this is much much earlier than needed for perf, but it
> might allow other uses.)
> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---
>  arch/x86/kernel/setup.c       |    2 +
>  arch/x86/kernel/static_call.c |    8 ++++-
>  include/linux/static_call.h   |   15 ++++++++-
>  kernel/static_call.c          |   67 +++++++++++++++++++++++++++++++++++++++---
>  4 files changed, 85 insertions(+), 7 deletions(-)
> 
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -19,6 +19,7 @@
>  #include <linux/hugetlb.h>
>  #include <linux/tboot.h>
>  #include <linux/usb/xhci-dbgp.h>
> +#include <linux/static_call.h>
>  
>  #include <uapi/linux/mount.h>
>  
> @@ -848,6 +849,7 @@ void __init setup_arch(char **cmdline_p)
>  	early_cpu_init();
>  	arch_init_ideal_nops();
>  	jump_label_init();
> +	static_call_init();
>  	early_ioremap_init();
>  
>  	setup_olpc_ofw_pgd();
> --- a/arch/x86/kernel/static_call.c
> +++ b/arch/x86/kernel/static_call.c
> @@ -11,7 +11,7 @@ enum insn_type {
>  	RET = 3,  /* tramp / site cond-tail-call */
>  };
>  
> -static void __static_call_transform(void *insn, enum insn_type type, void *func)
> +static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
>  {
>  	int size = CALL_INSN_SIZE;
>  	const void *code;
> @@ -33,11 +33,17 @@ static void __static_call_transform(void
>  		code = text_gen_insn(RET_INSN_OPCODE, insn, func);
>  		size = RET_INSN_SIZE;
>  		break;
> +
> +	default: /* GCC is a moron -- it figures @code can be uninitialized below */
> +		BUG();
>  	}
>  
>  	if (memcmp(insn, code, size) == 0)
>  		return;
>  
> +	if (unlikely(system_state == SYSTEM_BOOTING))
> +		return text_poke_early(insn, code, size);
> +
>  	text_poke_bp(insn, code, size, NULL);
>  }
>  
> --- a/include/linux/static_call.h
> +++ b/include/linux/static_call.h
> @@ -136,6 +136,8 @@ extern void arch_static_call_transform(v
>  
>  #ifdef CONFIG_HAVE_STATIC_CALL_INLINE
>  
> +extern void __init static_call_init(void);
> +
>  struct static_call_mod {
>  	struct static_call_mod *next;
>  	struct module *mod; /* for vmlinux, mod == NULL */
> @@ -144,7 +146,12 @@ struct static_call_mod {
>  
>  struct static_call_key {
>  	void *func;
> -	struct static_call_mod *mods;
> +	union {
> +		/* bit 0: 0 = mods, 1 = sites */
> +		unsigned long type;
> +		struct static_call_mod *mods;
> +		struct static_call_site *sites;
> +	};
>  };
>  
>  extern void __static_call_update(struct static_call_key *key, void *tramp, void *func);
> @@ -155,7 +162,7 @@ extern int static_call_text_reserved(voi
>  	DECLARE_STATIC_CALL(name, _func);				\
>  	struct static_call_key STATIC_CALL_KEY(name) = {		\
>  		.func = _func,						\
> -		.mods = NULL,						\
> +		.type = 1,						\
>  	};								\
>  	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
>  
> @@ -180,6 +187,8 @@ extern int static_call_text_reserved(voi
>  
>  #elif defined(CONFIG_HAVE_STATIC_CALL)
>  
> +static inline void static_call_init(void) { }
> +
>  struct static_call_key {
>  	void *func;
>  };
> @@ -225,6 +234,8 @@ static inline int static_call_text_reser
>  
>  #else /* Generic implementation */
>  
> +static inline void static_call_init(void) { }
> +
>  struct static_call_key {
>  	void *func;
>  };
> --- a/kernel/static_call.c
> +++ b/kernel/static_call.c
> @@ -94,10 +94,31 @@ static inline void static_call_sort_entr
>  	     static_call_site_cmp, static_call_site_swap);
>  }
>  
> +static inline bool static_call_key_has_mods(struct static_call_key *key)
> +{
> +	return !(key->type & 1);
> +}
> +
> +static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
> +{
> +	if (!static_call_key_has_mods(key))
> +		return NULL;
> +
> +	return key->mods;
> +}
> +
> +static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
> +{
> +	if (static_call_key_has_mods(key))
> +		return NULL;
> +
> +	return (struct static_call_site *)(key->type & ~1);
> +}
> +
>  void __static_call_update(struct static_call_key *key, void *tramp, void *func)
>  {
>  	struct static_call_site *site, *stop;
> -	struct static_call_mod *site_mod;
> +	struct static_call_mod *site_mod, first;
>  
>  	cpus_read_lock();
>  	static_call_lock();
> @@ -116,13 +137,22 @@ void __static_call_update(struct static_
>  	if (WARN_ON_ONCE(!static_call_initialized))
>  		goto done;
>  
> -	for (site_mod = key->mods; site_mod; site_mod = site_mod->next) {
> +	first = (struct static_call_mod){
> +		.next = static_call_key_next(key),
> +		.mod = NULL,
> +		.sites = static_call_key_sites(key),
> +	};
> +
> +	for (site_mod = &first; site_mod; site_mod = site_mod->next) {
>  		struct module *mod = site_mod->mod;
>  
>  		if (!site_mod->sites) {
>  			/*
>  			 * This can happen if the static call key is defined in
>  			 * a module which doesn't use it.
> +			 *
> +			 * It also happens in the has_mods case, where the
> +			 * 'first' entry has no sites associated with it.
>  			 */
>  			continue;
>  		}
> @@ -192,16 +222,45 @@ static int __static_call_init(struct mod
>  		if (key != prev_key) {
>  			prev_key = key;
>  
> +			/*

Can we add to this comment:

			 * Some architectures (x86) call this before the memory
			 * allocator is set up.
> +			 * For vmlinux (!mod) avoid the allocation by storing
> +			 * the sites pointer in the key itself. Also see
> +			 * __static_call_update()'s @first.

-- Steve

> +			 */
> +			if (!mod) {
> +				key->sites = site;
> +				key->type |= 1;
> +				goto do_transform;
> +			}
> +
>  			site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
>  			if (!site_mod)
>  				return -ENOMEM;
>  
> +			/*
> +			 * When the key has a direct sites pointer, extract
> +			 * that into an explicit struct static_call_mod, so we
> +			 * can have a list of modules.
> +			 */
> +			if (static_call_key_sites(key)) {
> +				site_mod->mod = NULL;
> +				site_mod->next = NULL;
> +				site_mod->sites = static_call_key_sites(key);
> +
> +				key->mods = site_mod;
> +
> +				site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
> +				if (!site_mod)
> +					return -ENOMEM;
> +			}
> +
>  			site_mod->mod = mod;
>  			site_mod->sites = site;
> -			site_mod->next = key->mods;
> +			site_mod->next = static_call_key_next(key);
>  			key->mods = site_mod;
>  		}
>  
> +do_transform:
>  		arch_static_call_transform(site_addr, NULL, key->func,
>  				static_call_is_tail(site));
>  	}
> @@ -348,7 +407,7 @@ int static_call_text_reserved(void *star
>  	return __static_call_mod_text_reserved(start, end);
>  }
>  
> -static void __init static_call_init(void)
> +void __init static_call_init(void)
>  {
>  	int ret;
>
Peter Zijlstra July 14, 2020, 6:31 p.m. UTC | #8
On Tue, Jul 14, 2020 at 12:07:01PM -0400, Steven Rostedt wrote:
> Can we add a statement that says something like: "Because x86 now calls
> static_call_init() before the setup of the memory allocator, we must
> avoid using kmalloc() and friends for core kernel static calls." ?
> 
> This was the missing piece for me.

It now reads like this.

---
Subject: static_call: Allow early init
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 4 Oct 17:21:10 CEST 2019

In order to use static_call() to wire up x86_pmu, we need to
initialize earlier, specifically before memory allocation works; copy
some of the tricks from jump_label to enable this.

Primarily we overload key->next to store a sites pointer when there
are no modules, this avoids having to use kmalloc() to initialize the
sites and allows us to run much earlier.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 arch/x86/kernel/setup.c       |    2 +
 arch/x86/kernel/static_call.c |    8 ++++
 include/linux/static_call.h   |   15 +++++++--
 kernel/static_call.c          |   70 +++++++++++++++++++++++++++++++++++++++---
 4 files changed, 88 insertions(+), 7 deletions(-)

--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -19,6 +19,7 @@
 #include <linux/hugetlb.h>
 #include <linux/tboot.h>
 #include <linux/usb/xhci-dbgp.h>
+#include <linux/static_call.h>

 #include <uapi/linux/mount.h>

@@ -848,6 +849,7 @@ void __init setup_arch(char **cmdline_p)
 	early_cpu_init();
 	arch_init_ideal_nops();
 	jump_label_init();
+	static_call_init();
 	early_ioremap_init();

 	setup_olpc_ofw_pgd();
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -11,7 +11,7 @@ enum insn_type {
 	RET = 3,  /* tramp / site cond-tail-call */
 };

-static void __static_call_transform(void *insn, enum insn_type type, void *func)
+static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
 {
 	int size = CALL_INSN_SIZE;
 	const void *code;
@@ -33,11 +33,17 @@ static void __static_call_transform(void
 		code = text_gen_insn(RET_INSN_OPCODE, insn, func);
 		size = RET_INSN_SIZE;
 		break;
+
+	default: /* GCC is a moron -- it figures @code can be uninitialized below */
+		BUG();
 	}

 	if (memcmp(insn, code, size) == 0)
 		return;

+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return text_poke_early(insn, code, size);
+
 	text_poke_bp(insn, code, size, NULL);
 }

--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -136,6 +136,8 @@ extern void arch_static_call_transform(v

 #ifdef CONFIG_HAVE_STATIC_CALL_INLINE

+extern void __init static_call_init(void);
+
 struct static_call_mod {
 	struct static_call_mod *next;
 	struct module *mod; /* for vmlinux, mod == NULL */
@@ -144,7 +146,12 @@ struct static_call_mod {

 struct static_call_key {
 	void *func;
-	struct static_call_mod *mods;
+	union {
+		/* bit 0: 0 = mods, 1 = sites */
+		unsigned long type;
+		struct static_call_mod *mods;
+		struct static_call_site *sites;
+	};
 };

 extern void __static_call_update(struct static_call_key *key, void *tramp, void *func);
@@ -155,7 +162,7 @@ extern int static_call_text_reserved(voi
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_KEY(name) = {		\
 		.func = _func,						\
-		.mods = NULL,						\
+		.type = 1,						\
 	};								\
 	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)

@@ -180,6 +187,8 @@ extern int static_call_text_reserved(voi

 #elif defined(CONFIG_HAVE_STATIC_CALL)

+static inline void static_call_init(void) { }
+
 struct static_call_key {
 	void *func;
 };
@@ -225,6 +234,8 @@ static inline int static_call_text_reser

 #else /* Generic implementation */

+static inline void static_call_init(void) { }
+
 struct static_call_key {
 	void *func;
 };
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -94,10 +94,31 @@ static inline void static_call_sort_entr
 	     static_call_site_cmp, static_call_site_swap);
 }

+static inline bool static_call_key_has_mods(struct static_call_key *key)
+{
+	return !(key->type & 1);
+}
+
+static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
+{
+	if (!static_call_key_has_mods(key))
+		return NULL;
+
+	return key->mods;
+}
+
+static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
+{
+	if (static_call_key_has_mods(key))
+		return NULL;
+
+	return (struct static_call_site *)(key->type & ~1);
+}
+
 void __static_call_update(struct static_call_key *key, void *tramp, void *func)
 {
 	struct static_call_site *site, *stop;
-	struct static_call_mod *site_mod;
+	struct static_call_mod *site_mod, first;

 	cpus_read_lock();
 	static_call_lock();
@@ -116,13 +137,22 @@ void __static_call_update(struct static_
 	if (WARN_ON_ONCE(!static_call_initialized))
 		goto done;

-	for (site_mod = key->mods; site_mod; site_mod = site_mod->next) {
+	first = (struct static_call_mod){
+		.next = static_call_key_next(key),
+		.mod = NULL,
+		.sites = static_call_key_sites(key),
+	};
+
+	for (site_mod = &first; site_mod; site_mod = site_mod->next) {
 		struct module *mod = site_mod->mod;

 		if (!site_mod->sites) {
 			/*
 			 * This can happen if the static call key is defined in
 			 * a module which doesn't use it.
+			 *
+			 * It also happens in the has_mods case, where the
+			 * 'first' entry has no sites associated with it.
 			 */
 			continue;
 		}
@@ -192,16 +222,48 @@ static int __static_call_init(struct mod
 		if (key != prev_key) {
 			prev_key = key;

+			/*
+			 * For vmlinux (!mod) avoid the allocation by storing
+			 * the sites pointer in the key itself. Also see
+			 * __static_call_update()'s @first.
+			 *
+			 * This allows architectures (eg. x86) to call
+			 * static_call_init() before memory allocation works.
+			 */
+			if (!mod) {
+				key->sites = site;
+				key->type |= 1;
+				goto do_transform;
+			}
+
 			site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
 			if (!site_mod)
 				return -ENOMEM;

+			/*
+			 * When the key has a direct sites pointer, extract
+			 * that into an explicit struct static_call_mod, so we
+			 * can have a list of modules.
+			 */
+			if (static_call_key_sites(key)) {
+				site_mod->mod = NULL;
+				site_mod->next = NULL;
+				site_mod->sites = static_call_key_sites(key);
+
+				key->mods = site_mod;
+
+				site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+				if (!site_mod)
+					return -ENOMEM;
+			}
+
 			site_mod->mod = mod;
 			site_mod->sites = site;
-			site_mod->next = key->mods;
+			site_mod->next = static_call_key_next(key);
 			key->mods = site_mod;
 		}

+do_transform:
 		arch_static_call_transform(site_addr, NULL, key->func,
 				static_call_is_tail(site));
 	}
@@ -348,7 +410,7 @@ int static_call_text_reserved(void *star
 	return __static_call_mod_text_reserved(start, end);
 }

-static void __init static_call_init(void)
+void __init static_call_init(void)
 {
 	int ret;
Steven Rostedt July 14, 2020, 7:38 p.m. UTC | #9
On Tue, 14 Jul 2020 20:31:43 +0200
Peter Zijlstra <peterz@infradead.org> wrote:

> On Tue, Jul 14, 2020 at 12:07:01PM -0400, Steven Rostedt wrote:
> > Can we add a statement that says something like: "Because x86 now calls
> > static_call_init() before the setup of the memory allocator, we must
> > avoid using kmalloc() and friends for core kernel static calls." ?
> > 
> > This was the missing piece for me.  
> 
> It now reads like this.
> 
> ---
> Subject: static_call: Allow early init
> From: Peter Zijlstra <peterz@infradead.org>
> Date: Fri, 4 Oct 17:21:10 CEST 2019
> 
> In order to use static_call() to wire up x86_pmu, we need to
> initialize earlier, specifically before memory allocation works; copy
> some of the tricks from jump_label to enable this.
> 
> Primarily we overload key->next to store a sites pointer when there
> are no modules, this avoids having to use kmalloc() to initialize the
> sites and allows us to run much earlier.

All the pieces lie in place. Thanks!

Reviewed-by: Steven Rostedt (VMware) <rostedt@goodmis.org>

-- Steve

> 
> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
> ---

Patch
diff mbox series

--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -19,6 +19,7 @@ 
 #include <linux/hugetlb.h>
 #include <linux/tboot.h>
 #include <linux/usb/xhci-dbgp.h>
+#include <linux/static_call.h>
 
 #include <uapi/linux/mount.h>
 
@@ -848,6 +849,7 @@  void __init setup_arch(char **cmdline_p)
 	early_cpu_init();
 	arch_init_ideal_nops();
 	jump_label_init();
+	static_call_init();
 	early_ioremap_init();
 
 	setup_olpc_ofw_pgd();
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -11,7 +11,7 @@  enum insn_type {
 	RET = 3,  /* tramp / site cond-tail-call */
 };
 
-static void __static_call_transform(void *insn, enum insn_type type, void *func)
+static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
 {
 	int size = CALL_INSN_SIZE;
 	const void *code;
@@ -33,11 +33,17 @@  static void __static_call_transform(void
 		code = text_gen_insn(RET_INSN_OPCODE, insn, func);
 		size = RET_INSN_SIZE;
 		break;
+
+	default: /* GCC is a moron -- it figures @code can be uninitialized below */
+		BUG();
 	}
 
 	if (memcmp(insn, code, size) == 0)
 		return;
 
+	if (unlikely(system_state == SYSTEM_BOOTING))
+		return text_poke_early(insn, code, size);
+
 	text_poke_bp(insn, code, size, NULL);
 }
 
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -99,6 +99,8 @@  extern void arch_static_call_transform(v
 
 #ifdef CONFIG_HAVE_STATIC_CALL_INLINE
 
+extern void __init static_call_init(void);
+
 struct static_call_mod {
 	struct static_call_mod *next;
 	struct module *mod; /* for vmlinux, mod == NULL */
@@ -107,7 +109,12 @@  struct static_call_mod {
 
 struct static_call_key {
 	void *func;
-	struct static_call_mod *mods;
+	union {
+		/* bit 0: 0 = mods, 1 = sites */
+		unsigned long type;
+		struct static_call_mod *mods;
+		struct static_call_site *sites;
+	};
 };
 
 extern void __static_call_update(struct static_call_key *key, void *tramp, void *func);
@@ -118,7 +125,7 @@  extern int static_call_text_reserved(voi
 	DECLARE_STATIC_CALL(name, _func);				\
 	struct static_call_key STATIC_CALL_KEY(name) = {		\
 		.func = _func,						\
-		.mods = NULL,						\
+		.type = 1,						\
 	};								\
 	ARCH_DEFINE_STATIC_CALL_TRAMP(name, _func)
 
@@ -143,6 +150,8 @@  extern int static_call_text_reserved(voi
 
 #elif defined(CONFIG_HAVE_STATIC_CALL)
 
+static inline void static_call_init(void) { }
+
 struct static_call_key {
 	void *func;
 };
@@ -188,6 +197,8 @@  static inline int static_call_text_reser
 
 #else /* Generic implementation */
 
+static inline void static_call_init(void) { }
+
 struct static_call_key {
 	void *func;
 };
--- a/kernel/static_call.c
+++ b/kernel/static_call.c
@@ -94,10 +94,31 @@  static inline void static_call_sort_entr
 	     static_call_site_cmp, static_call_site_swap);
 }
 
+static inline bool static_call_key_has_mods(struct static_call_key *key)
+{
+	return !(key->type & 1);
+}
+
+static inline struct static_call_mod *static_call_key_next(struct static_call_key *key)
+{
+	if (!static_call_key_has_mods(key))
+		return NULL;
+
+	return key->mods;
+}
+
+static inline struct static_call_site *static_call_key_sites(struct static_call_key *key)
+{
+	if (static_call_key_has_mods(key))
+		return NULL;
+
+	return (struct static_call_site *)(key->type & ~1);
+}
+
 void __static_call_update(struct static_call_key *key, void *tramp, void *func)
 {
 	struct static_call_site *site, *stop;
-	struct static_call_mod *site_mod;
+	struct static_call_mod *site_mod, first;
 
 	cpus_read_lock();
 	static_call_lock();
@@ -116,13 +137,22 @@  void __static_call_update(struct static_
 	if (WARN_ON_ONCE(!static_call_initialized))
 		goto done;
 
-	for (site_mod = key->mods; site_mod; site_mod = site_mod->next) {
+	first = (struct static_call_mod){
+		.next = static_call_key_next(key),
+		.mod = NULL,
+		.sites = static_call_key_sites(key),
+	};
+
+	for (site_mod = &first; site_mod; site_mod = site_mod->next) {
 		struct module *mod = site_mod->mod;
 
 		if (!site_mod->sites) {
 			/*
 			 * This can happen if the static call key is defined in
 			 * a module which doesn't use it.
+			 *
+			 * It also happens in the has_mods case, where the
+			 * 'first' entry has no sites associated with it.
 			 */
 			continue;
 		}
@@ -192,16 +222,35 @@  static int __static_call_init(struct mod
 		if (key != prev_key) {
 			prev_key = key;
 
+			if (!mod) {
+				key->sites = site;
+				key->type |= 1;
+				goto do_transform;
+			}
+
 			site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
 			if (!site_mod)
 				return -ENOMEM;
 
+			if (!static_call_key_has_mods(key)) {
+				site_mod->mod = NULL;
+				site_mod->next = NULL;
+				site_mod->sites = static_call_key_sites(key);
+
+				key->mods = site_mod;
+
+				site_mod = kzalloc(sizeof(*site_mod), GFP_KERNEL);
+				if (!site_mod)
+					return -ENOMEM;
+			}
+
 			site_mod->mod = mod;
 			site_mod->sites = site;
 			site_mod->next = key->mods;
 			key->mods = site_mod;
 		}
 
+do_transform:
 		arch_static_call_transform(site_addr, NULL, key->func,
 				static_call_is_tail(site));
 	}
@@ -344,7 +393,7 @@  int static_call_text_reserved(void *star
 	return ret;
 }
 
-static void __init static_call_init(void)
+void __init static_call_init(void)
 {
 	int ret;