linux-arm-kernel.lists.infradead.org archive mirror
 help / color / mirror / Atom feed
* [PATCH RFC] ARM: option for loading modules into vmalloc area
@ 2014-11-18 16:21 Konstantin Khlebnikov
  2014-11-18 17:34 ` Russell King - ARM Linux
  0 siblings, 1 reply; 22+ messages in thread
From: Konstantin Khlebnikov @ 2014-11-18 16:21 UTC (permalink / raw)
  To: linux-arm-kernel

Usually modules are loaded into small area prior to the kernel
text because they are linked with the kernel using short calls.
Compile-time instrumentation like GCOV or KASAN bloats code a lot,
and as a result huge modules no longer fit into reserved area.

This patch adds option CONFIG_MODULES_USE_VMALLOC which lifts
limitation on amount of loaded modules. It links modules using
long-calls (option -mlong-calls) and loads them into vmalloc area.

In few places exported symbols are called from inline assembly.
This patch adds macro for such call sites: __asmbl and __asmbl_clobber.
Call turns into single 'bl' or sequence 'movw; movt; blx' depending on
context and state of config option.

Unfortunately this option isn't compatible with CONFIG_FUNCTION_TRACER.
Compiler emits short calls to profiling function despite of -mlong-calls.
This is a bug in GCC, but ftrace anyway needs an update to handle this.

Signed-off-by: Konstantin Khlebnikov <k.khlebnikov@samsung.com>
---
 arch/arm/Kconfig                |   20 ++++++++++++++++++++
 arch/arm/Makefile               |    4 ++++
 arch/arm/include/asm/compiler.h |   13 +++++++++++++
 arch/arm/include/asm/div64.h    |    2 +-
 arch/arm/include/asm/memory.h   |   11 +++++++++++
 arch/arm/include/asm/uaccess.h  |   16 ++++++++--------
 arch/arm/kernel/module.c        |    2 ++
 arch/arm/mm/dump.c              |   10 +++++++++-
 arch/arm/mm/init.c              |    2 ++
 arch/arm/mm/mmu.c               |    7 +++----
 arch/arm/mm/pgd.c               |    5 +++--
 11 files changed, 76 insertions(+), 16 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 89c4b5c..7fc4b22 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1686,6 +1686,26 @@ config HIGHPTE
 	bool "Allocate 2nd-level pagetables from highmem"
 	depends on HIGHMEM
 
+config MODULES_USE_LONG_CALLS
+	bool
+	help
+	  Use long calls for calling exported symbols.
+
+config MODULES_USE_VMALLOC
+	bool "Put modules into vmalloc area"
+	select MODULES_USE_LONG_CALLS
+	depends on MMU && MODULES
+	depends on !XIP_KERNEL
+	depends on !FUNCTION_TRACER
+	help
+	  Usually modules are loaded into small area prior to the kernel text
+	  because they are linked with the kernel using short calls.
+
+	  This option enables long calls and moves modules into vmalloc area.
+	  This allows loading more modules but adds some performance penalty.
+
+	  If unsure, say n.
+
 config HW_PERF_EVENTS
 	bool "Enable hardware performance counter support for perf events"
 	depends on PERF_EVENTS
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 034a949..64541db 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -117,6 +117,10 @@ CFLAGS_ISA	:=$(call cc-option,-marm,)
 AFLAGS_ISA	:=$(CFLAGS_ISA)
 endif
 
+ifeq ($(CONFIG_MODULES_USE_LONG_CALLS),y)
+CFLAGS_MODULE	+= -mlong-calls
+endif
+
 # Need -Uarm for gcc < 3.x
 KBUILD_CFLAGS	+=$(CFLAGS_ABI) $(CFLAGS_ISA) $(arch-y) $(tune-y) $(call cc-option,-mshort-load-bytes,$(call cc-option,-malignment-traps,)) -msoft-float -Uarm
 KBUILD_AFLAGS	+=$(CFLAGS_ABI) $(AFLAGS_ISA) $(arch-y) $(tune-y) -include asm/unified.h -msoft-float
diff --git a/arch/arm/include/asm/compiler.h b/arch/arm/include/asm/compiler.h
index 8155db2..d953067 100644
--- a/arch/arm/include/asm/compiler.h
+++ b/arch/arm/include/asm/compiler.h
@@ -11,5 +11,18 @@
  */
 #define __asmeq(x, y)  ".ifnc " x "," y " ; .err ; .endif\n\t"
 
+/*
+ * This is used for calling exported symbols from inline assembly code.
+ */
+#if defined(MODULE) && defined(CONFIG_MODULES_USE_LONG_CALLS)
+#define __asmbl(cond, reg, target) \
+	"movw	" reg ", #:lower16:" target "\n\t" \
+	"movt	" reg ", #:upper16:" target "\n\t" \
+	"blx" cond "	" reg "\n\t"
+#define __asmbl_clobber(reg)	,reg
+#else
+#define __asmbl(cond, reg, target) "bl" cond "	" target"\n\t"
+#define __asmbl_clobber(reg)
+#endif
 
 #endif /* __ASM_ARM_COMPILER_H */
diff --git a/arch/arm/include/asm/div64.h b/arch/arm/include/asm/div64.h
index 662c7bd..fc7548d 100644
--- a/arch/arm/include/asm/div64.h
+++ b/arch/arm/include/asm/div64.h
@@ -38,7 +38,7 @@
 		__asmeq("%1", "r2")				\
 		__asmeq("%2", "r0")				\
 		__asmeq("%3", "r4")				\
-		"bl	__do_div64"				\
+		__asmbl("", "ip",  "__do_div64")		\
 		: "=r" (__rem), "=r" (__res)			\
 		: "r" (__n), "r" (__base)			\
 		: "ip", "lr", "cc");				\
diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h
index e731018..17745c2 100644
--- a/arch/arm/include/asm/memory.h
+++ b/arch/arm/include/asm/memory.h
@@ -47,6 +47,15 @@
  */
 #define TASK_SIZE_26		(UL(1) << 26)
 
+#ifdef CONFIG_MODULES_USE_VMALLOC
+/*
+ * Modules might be anywhere in the vmalloc area.
+ */
+#define MODULES_VADDR		VMALLOC_START
+#define MODULES_END		VMALLOC_END
+
+#else /* CONFIG_MODULES_USE_VMALLOC */
+
 /*
  * The module space lives between the addresses given by TASK_SIZE
  * and PAGE_OFFSET - it must be within 32MB of the kernel text.
@@ -71,6 +80,8 @@
 #define MODULES_END		(PAGE_OFFSET)
 #endif
 
+#endif /* CONFIG_MODULES_USE_VMALLOC */
+
 /*
  * The XIP kernel gets mapped at the bottom of the module vm area.
  * Since we use sections to map it, this macro replaces the physical address
diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h
index 4767eb9..c4c8d26 100644
--- a/arch/arm/include/asm/uaccess.h
+++ b/arch/arm/include/asm/uaccess.h
@@ -113,21 +113,21 @@ extern int __get_user_64t_1(void *);
 extern int __get_user_64t_2(void *);
 extern int __get_user_64t_4(void *);
 
-#define __GUP_CLOBBER_1	"lr", "cc"
+#define __GUP_CLOBBER_1	"lr", "cc" __asmbl_clobber("ip")
 #ifdef CONFIG_CPU_USE_DOMAINS
 #define __GUP_CLOBBER_2	"ip", "lr", "cc"
 #else
-#define __GUP_CLOBBER_2 "lr", "cc"
+#define __GUP_CLOBBER_2 "lr", "cc" __asmbl_clobber("ip")
 #endif
-#define __GUP_CLOBBER_4	"lr", "cc"
-#define __GUP_CLOBBER_32t_8 "lr", "cc"
-#define __GUP_CLOBBER_8	"lr", "cc"
+#define __GUP_CLOBBER_4	"lr", "cc" __asmbl_clobber("ip")
+#define __GUP_CLOBBER_32t_8 "lr", "cc" __asmbl_clobber("ip")
+#define __GUP_CLOBBER_8	"lr", "cc" __asmbl_clobber("ip")
 
 #define __get_user_x(__r2,__p,__e,__l,__s)				\
 	   __asm__ __volatile__ (					\
 		__asmeq("%0", "r0") __asmeq("%1", "r2")			\
 		__asmeq("%3", "r1")					\
-		"bl	__get_user_" #__s				\
+		__asmbl("", "ip", "__get_user_" #__s)			\
 		: "=&r" (__e), "=r" (__r2)				\
 		: "0" (__p), "r" (__l)					\
 		: __GUP_CLOBBER_##__s)
@@ -149,7 +149,7 @@ extern int __get_user_64t_4(void *);
 	   __asm__ __volatile__ (					\
 		__asmeq("%0", "r0") __asmeq("%1", "r2")			\
 		__asmeq("%3", "r1")					\
-		"bl	__get_user_64t_" #__s				\
+		__asmbl("", "ip", "__get_user_64t_" #__s)		\
 		: "=&r" (__e), "=r" (__r2)				\
 		: "0" (__p), "r" (__l)					\
 		: __GUP_CLOBBER_##__s)
@@ -211,7 +211,7 @@ extern int __put_user_8(void *, unsigned long long);
 	   __asm__ __volatile__ (					\
 		__asmeq("%0", "r0") __asmeq("%2", "r2")			\
 		__asmeq("%3", "r1")					\
-		"bl	__put_user_" #__s				\
+		__asmbl("", "ip", "__put_user_" #__s)			\
 		: "=&r" (__e)						\
 		: "0" (__p), "r" (__r2), "r" (__l)			\
 		: "ip", "lr", "cc")
diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
index 6a4dffe..081da90 100644
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -38,12 +38,14 @@
 #endif
 
 #ifdef CONFIG_MMU
+#ifndef CONFIG_MODULES_USE_VMALLOC
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
 				GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
+#endif /* CONFIG_MODULES_USE_VMALLOC */
 #endif
 
 int
diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c
index 5942493..d4d4f75d 100644
--- a/arch/arm/mm/dump.c
+++ b/arch/arm/mm/dump.c
@@ -19,6 +19,7 @@
 
 #include <asm/fixmap.h>
 #include <asm/pgtable.h>
+#include <asm/highmem.h>
 
 struct addr_marker {
 	unsigned long start_address;
@@ -26,7 +27,12 @@ struct addr_marker {
 };
 
 static struct addr_marker address_markers[] = {
+#ifndef CONFIG_MODULES_USE_VMALLOC
 	{ MODULES_VADDR,	"Modules" },
+#endif
+#ifdef CONFIG_HIGHMEM
+	{ PKMAP_BASE,		"Page kmap" },
+#endif
 	{ PAGE_OFFSET,		"Kernel Mapping" },
 	{ 0,			"vmalloc() Area" },
 	{ VMALLOC_END,		"vmalloc() End" },
@@ -356,7 +362,9 @@ static int ptdump_init(void)
 			for (j = 0; j < pg_level[i].num; j++)
 				pg_level[i].mask |= pg_level[i].bits[j].mask;
 
-	address_markers[2].start_address = VMALLOC_START;
+	i = 1 + !IS_ENABLED(CONFIG_MODULES_USE_VMALLOC) +
+		!!IS_ENABLED(CONFIG_HIGHMEM);
+	address_markers[i].start_address = VMALLOC_START;
 
 	pe = debugfs_create_file("kernel_page_tables", 0400, NULL, NULL,
 				 &ptdump_fops);
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 9481f85..985aed8 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -595,7 +595,9 @@ void __init mem_init(void)
 	 * be detected at build time already.
 	 */
 #ifdef CONFIG_MMU
+#ifndef CONFIG_MODULES_USE_VMALLOC
 	BUILD_BUG_ON(TASK_SIZE				> MODULES_VADDR);
+#endif
 	BUG_ON(TASK_SIZE 				> MODULES_VADDR);
 #endif
 
diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
index 9d2cdda..9e0c4f4 100644
--- a/arch/arm/mm/mmu.c
+++ b/arch/arm/mm/mmu.c
@@ -1161,16 +1161,15 @@ void __init sanity_check_meminfo(void)
 
 static inline void prepare_page_table(void)
 {
-	unsigned long addr;
+	unsigned long addr = 0;
 	phys_addr_t end;
 
 	/*
 	 * Clear out all the mappings below the kernel image.
 	 */
-	for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
-		pmd_clear(pmd_off_k(addr));
-
 #ifdef CONFIG_XIP_KERNEL
+	for ( ; addr < MODULES_VADDR; addr += PMD_SIZE)
+		pmd_clear(pmd_off_k(addr));
 	/* The XIP kernel is mapped in the module area -- skip over it */
 	addr = ((unsigned long)_etext + PMD_SIZE - 1) & PMD_MASK;
 #endif
diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c
index 3fbcb5a..ce23923 100644
--- a/arch/arm/mm/pgd.c
+++ b/arch/arm/mm/pgd.c
@@ -57,11 +57,11 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	clean_dcache_area(new_pgd, TTBR0_PTRS_PER_PGD * sizeof(pgd_t));
 
 #ifdef CONFIG_ARM_LPAE
+#if defined(CONFIG_HIGHMEM) || !defined(CONFIG_MODULES_USE_VMALLOC)
 	/*
 	 * Allocate PMD table for modules and pkmap mappings.
 	 */
-	new_pud = pud_alloc(mm, new_pgd + pgd_index(MODULES_VADDR),
-			    MODULES_VADDR);
+	new_pud = pud_alloc(mm, new_pgd + pgd_index(PKMAP_BASE), PKMAP_BASE);
 	if (!new_pud)
 		goto no_pud;
 
@@ -69,6 +69,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	if (!new_pmd)
 		goto no_pmd;
 #endif
+#endif
 
 	if (!vectors_high()) {
 		/*

^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-18 16:21 [PATCH RFC] ARM: option for loading modules into vmalloc area Konstantin Khlebnikov
@ 2014-11-18 17:34 ` Russell King - ARM Linux
  2014-11-18 18:13   ` Konstantin Khlebnikov
  0 siblings, 1 reply; 22+ messages in thread
From: Russell King - ARM Linux @ 2014-11-18 17:34 UTC (permalink / raw)
  To: linux-arm-kernel

On Tue, Nov 18, 2014 at 08:21:46PM +0400, Konstantin Khlebnikov wrote:
> Usually modules are loaded into small area prior to the kernel
> text because they are linked with the kernel using short calls.
> Compile-time instrumentation like GCOV or KASAN bloats code a lot,
> and as a result huge modules no longer fit into reserved area.
> 
> This patch adds option CONFIG_MODULES_USE_VMALLOC which lifts
> limitation on amount of loaded modules. It links modules using
> long-calls (option -mlong-calls) and loads them into vmalloc area.
> 
> In few places exported symbols are called from inline assembly.
> This patch adds macro for such call sites: __asmbl and __asmbl_clobber.
> Call turns into single 'bl' or sequence 'movw; movt; blx' depending on
> context and state of config option.
> 
> Unfortunately this option isn't compatible with CONFIG_FUNCTION_TRACER.
> Compiler emits short calls to profiling function despite of -mlong-calls.
> This is a bug in GCC, but ftrace anyway needs an update to handle this.

It also isn't compatible with the older architectures which don't have
"blx".

> diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
> index 9d2cdda..9e0c4f4 100644
> --- a/arch/arm/mm/mmu.c
> +++ b/arch/arm/mm/mmu.c
> @@ -1161,16 +1161,15 @@ void __init sanity_check_meminfo(void)
>  
>  static inline void prepare_page_table(void)
>  {
> -	unsigned long addr;
> +	unsigned long addr = 0;
>  	phys_addr_t end;
>  
>  	/*
>  	 * Clear out all the mappings below the kernel image.
>  	 */
> -	for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
> -		pmd_clear(pmd_off_k(addr));
> -
>  #ifdef CONFIG_XIP_KERNEL
> +	for ( ; addr < MODULES_VADDR; addr += PMD_SIZE)
> +		pmd_clear(pmd_off_k(addr));
>  	/* The XIP kernel is mapped in the module area -- skip over it */
>  	addr = ((unsigned long)_etext + PMD_SIZE - 1) & PMD_MASK;
>  #endif

This seems to be an unrelated change.  In any case, it looks wrong to
me.  Please describe what you are trying to achieve here.

-- 
FTTC broadband for 0.8mile line: currently at 9.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-18 17:34 ` Russell King - ARM Linux
@ 2014-11-18 18:13   ` Konstantin Khlebnikov
  2014-11-19 13:40     ` Arnd Bergmann
  0 siblings, 1 reply; 22+ messages in thread
From: Konstantin Khlebnikov @ 2014-11-18 18:13 UTC (permalink / raw)
  To: linux-arm-kernel


On 2014-11-18 20:34, Russell King - ARM Linux wrote:
> On Tue, Nov 18, 2014 at 08:21:46PM +0400, Konstantin Khlebnikov wrote:
>> Usually modules are loaded into small area prior to the kernel
>> text because they are linked with the kernel using short calls.
>> Compile-time instrumentation like GCOV or KASAN bloats code a lot,
>> and as a result huge modules no longer fit into reserved area.
>>
>> This patch adds option CONFIG_MODULES_USE_VMALLOC which lifts
>> limitation on amount of loaded modules. It links modules using
>> long-calls (option -mlong-calls) and loads them into vmalloc area.
>>
>> In few places exported symbols are called from inline assembly.
>> This patch adds macro for such call sites: __asmbl and __asmbl_clobber.
>> Call turns into single 'bl' or sequence 'movw; movt; blx' depending on
>> context and state of config option.
>>
>> Unfortunately this option isn't compatible with CONFIG_FUNCTION_TRACER.
>> Compiler emits short calls to profiling function despite of -mlong-calls.
>> This is a bug in GCC, but ftrace anyway needs an update to handle this.
> It also isn't compatible with the older architectures which don't have
> "blx".

Ok, I'll add "depends on CPU_V6 || CPU_V7" I don't think that it is 
necessary for older cpus.

>> diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c
>> index 9d2cdda..9e0c4f4 100644
>> --- a/arch/arm/mm/mmu.c
>> +++ b/arch/arm/mm/mmu.c
>> @@ -1161,16 +1161,15 @@ void __init sanity_check_meminfo(void)
>>   
>>   static inline void prepare_page_table(void)
>>   {
>> -	unsigned long addr;
>> +	unsigned long addr = 0;
>>   	phys_addr_t end;
>>   
>>   	/*
>>   	 * Clear out all the mappings below the kernel image.
>>   	 */
>> -	for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
>> -		pmd_clear(pmd_off_k(addr));
>> -
>>   #ifdef CONFIG_XIP_KERNEL
>> +	for ( ; addr < MODULES_VADDR; addr += PMD_SIZE)
>> +		pmd_clear(pmd_off_k(addr));
>>   	/* The XIP kernel is mapped in the module area -- skip over it */
>>   	addr = ((unsigned long)_etext + PMD_SIZE - 1) & PMD_MASK;
>>   #endif
> This seems to be an unrelated change.  In any case, it looks wrong to
> me.  Please describe what you are trying to achieve here.
>

My patch changes  MODULES_VADDR, if option enabled it becomes alias for 
VMALLOC_START

It's used as border between two loops, before patch this code looks like:

     /*
      * Clear out all the mappings below the kernel image.
      */
     for (addr = 0; addr < MODULES_VADDR; addr += PMD_SIZE)
         pmd_clear(pmd_off_k(addr));

#ifdef CONFIG_XIP_KERNEL
     /* The XIP kernel is mapped in the module area -- skip over it */
     addr = ((unsigned long)_etext + PMD_SIZE - 1) & PMD_MASK;
#endif
     for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE)
         pmd_clear(pmd_off_k(addr));

It's written with assumption that MODULES_VADDR < PAGE_OFFSET,
but this no longer true if modules are placed inside vmalloc area.


After patch:

     unsigned long addr = 0;

     /*
      * Clear out all the mappings below the kernel image.
      */
#ifdef CONFIG_XIP_KERNEL
     for ( ; addr < MODULES_VADDR; addr += PMD_SIZE)
         pmd_clear(pmd_off_k(addr));
     /* The XIP kernel is mapped in the module area -- skip over it */
     addr = ((unsigned long)_etext + PMD_SIZE - 1) & PMD_MASK;
#endif
     for ( ; addr < PAGE_OFFSET; addr += PMD_SIZE)
         pmd_clear(pmd_off_k(addr));


Without XIP there is no reason for splitting it into two loops.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-18 18:13   ` Konstantin Khlebnikov
@ 2014-11-19 13:40     ` Arnd Bergmann
  2014-11-19 14:54       ` Ard Biesheuvel
  0 siblings, 1 reply; 22+ messages in thread
From: Arnd Bergmann @ 2014-11-19 13:40 UTC (permalink / raw)
  To: linux-arm-kernel

On Tuesday 18 November 2014 21:13:56 Konstantin Khlebnikov wrote:
> On 2014-11-18 20:34, Russell King - ARM Linux wrote:
> > On Tue, Nov 18, 2014 at 08:21:46PM +0400, Konstantin Khlebnikov wrote:
> >> Usually modules are loaded into small area prior to the kernel
> >> text because they are linked with the kernel using short calls.
> >> Compile-time instrumentation like GCOV or KASAN bloats code a lot,
> >> and as a result huge modules no longer fit into reserved area.
> >>
> >> This patch adds option CONFIG_MODULES_USE_VMALLOC which lifts
> >> limitation on amount of loaded modules. It links modules using
> >> long-calls (option -mlong-calls) and loads them into vmalloc area.
> >>
> >> In few places exported symbols are called from inline assembly.
> >> This patch adds macro for such call sites: __asmbl and __asmbl_clobber.
> >> Call turns into single 'bl' or sequence 'movw; movt; blx' depending on
> >> context and state of config option.
> >>
> >> Unfortunately this option isn't compatible with CONFIG_FUNCTION_TRACER.
> >> Compiler emits short calls to profiling function despite of -mlong-calls.
> >> This is a bug in GCC, but ftrace anyway needs an update to handle this.
> > It also isn't compatible with the older architectures which don't have
> > "blx".
> 
> Ok, I'll add "depends on CPU_V6 || CPU_V7" I don't think that it is 
> necessary for older cpus.

Why not just use a different branch instruction for the older CPUs?

	Arnd

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 13:40     ` Arnd Bergmann
@ 2014-11-19 14:54       ` Ard Biesheuvel
  2014-11-19 15:52         ` Konstantin Khlebnikov
  0 siblings, 1 reply; 22+ messages in thread
From: Ard Biesheuvel @ 2014-11-19 14:54 UTC (permalink / raw)
  To: linux-arm-kernel

On 19 November 2014 14:40, Arnd Bergmann <arnd@arndb.de> wrote:
> On Tuesday 18 November 2014 21:13:56 Konstantin Khlebnikov wrote:
>> On 2014-11-18 20:34, Russell King - ARM Linux wrote:
>> > On Tue, Nov 18, 2014 at 08:21:46PM +0400, Konstantin Khlebnikov wrote:
>> >> Usually modules are loaded into small area prior to the kernel
>> >> text because they are linked with the kernel using short calls.
>> >> Compile-time instrumentation like GCOV or KASAN bloats code a lot,
>> >> and as a result huge modules no longer fit into reserved area.
>> >>
>> >> This patch adds option CONFIG_MODULES_USE_VMALLOC which lifts
>> >> limitation on amount of loaded modules. It links modules using
>> >> long-calls (option -mlong-calls) and loads them into vmalloc area.
>> >>
>> >> In few places exported symbols are called from inline assembly.
>> >> This patch adds macro for such call sites: __asmbl and __asmbl_clobber.
>> >> Call turns into single 'bl' or sequence 'movw; movt; blx' depending on
>> >> context and state of config option.
>> >>
>> >> Unfortunately this option isn't compatible with CONFIG_FUNCTION_TRACER.
>> >> Compiler emits short calls to profiling function despite of -mlong-calls.
>> >> This is a bug in GCC, but ftrace anyway needs an update to handle this.
>> > It also isn't compatible with the older architectures which don't have
>> > "blx".
>>
>> Ok, I'll add "depends on CPU_V6 || CPU_V7" I don't think that it is
>> necessary for older cpus.
>
> Why not just use a different branch instruction for the older CPUs?
>

ARMv6 doesn't support movw/movt so this will only work on v7.

What about doing 'mov lr, pc; ldr pc,=symbol' instead? You clearly
don't care about performance in this case, so the performance hit (due
to the dcache access and interfering with the return stack predictors)
should be tolerable. The only thing to be careful about is thumb2
kernels: you would need to set the thumb bit in lr manually but only
if the call is made /from/ thumb. You would probably be better off
just depending on !THUMB2_KERNEL.

-- 
Ard.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 14:54       ` Ard Biesheuvel
@ 2014-11-19 15:52         ` Konstantin Khlebnikov
  2014-11-19 16:02           ` Ard Biesheuvel
  0 siblings, 1 reply; 22+ messages in thread
From: Konstantin Khlebnikov @ 2014-11-19 15:52 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 19, 2014 at 5:54 PM, Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> On 19 November 2014 14:40, Arnd Bergmann <arnd@arndb.de> wrote:
>> On Tuesday 18 November 2014 21:13:56 Konstantin Khlebnikov wrote:
>>> On 2014-11-18 20:34, Russell King - ARM Linux wrote:
>>> > On Tue, Nov 18, 2014 at 08:21:46PM +0400, Konstantin Khlebnikov wrote:
>>> >> Usually modules are loaded into small area prior to the kernel
>>> >> text because they are linked with the kernel using short calls.
>>> >> Compile-time instrumentation like GCOV or KASAN bloats code a lot,
>>> >> and as a result huge modules no longer fit into reserved area.
>>> >>
>>> >> This patch adds option CONFIG_MODULES_USE_VMALLOC which lifts
>>> >> limitation on amount of loaded modules. It links modules using
>>> >> long-calls (option -mlong-calls) and loads them into vmalloc area.
>>> >>
>>> >> In few places exported symbols are called from inline assembly.
>>> >> This patch adds macro for such call sites: __asmbl and __asmbl_clobber.
>>> >> Call turns into single 'bl' or sequence 'movw; movt; blx' depending on
>>> >> context and state of config option.
>>> >>
>>> >> Unfortunately this option isn't compatible with CONFIG_FUNCTION_TRACER.
>>> >> Compiler emits short calls to profiling function despite of -mlong-calls.
>>> >> This is a bug in GCC, but ftrace anyway needs an update to handle this.
>>> > It also isn't compatible with the older architectures which don't have
>>> > "blx".
>>>
>>> Ok, I'll add "depends on CPU_V6 || CPU_V7" I don't think that it is
>>> necessary for older cpus.
>>
>> Why not just use a different branch instruction for the older CPUs?
>>
>
> ARMv6 doesn't support movw/movt so this will only work on v7.
>
> What about doing 'mov lr, pc; ldr pc,=symbol' instead? You clearly
> don't care about performance in this case, so the performance hit (due
> to the dcache access and interfering with the return stack predictors)
> should be tolerable. The only thing to be careful about is thumb2
> kernels: you would need to set the thumb bit in lr manually but only
> if the call is made /from/ thumb. You would probably be better off
> just depending on !THUMB2_KERNEL.

Do you mean ldr pc, =symbol ?

In this case I get this error:

/tmp/ccAHtONU.s: Assembler messages:
/tmp/ccAHtONU.s:220: Error: invalid literal constant: pool needs to be closer

Probably constant pool doesn't work well in inline assembly.


Something like this seems work:

add     lr, pc, #4
ldr       pc, [pc, #-4]
.long symbol

> Ard.
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 15:52         ` Konstantin Khlebnikov
@ 2014-11-19 16:02           ` Ard Biesheuvel
  2014-11-19 16:07             ` Russell King - ARM Linux
  0 siblings, 1 reply; 22+ messages in thread
From: Ard Biesheuvel @ 2014-11-19 16:02 UTC (permalink / raw)
  To: linux-arm-kernel

On 19 November 2014 16:52, Konstantin Khlebnikov <koct9i@gmail.com> wrote:
> On Wed, Nov 19, 2014 at 5:54 PM, Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
>> On 19 November 2014 14:40, Arnd Bergmann <arnd@arndb.de> wrote:
>>> On Tuesday 18 November 2014 21:13:56 Konstantin Khlebnikov wrote:
>>>> On 2014-11-18 20:34, Russell King - ARM Linux wrote:
>>>> > On Tue, Nov 18, 2014 at 08:21:46PM +0400, Konstantin Khlebnikov wrote:
>>>> >> Usually modules are loaded into small area prior to the kernel
>>>> >> text because they are linked with the kernel using short calls.
>>>> >> Compile-time instrumentation like GCOV or KASAN bloats code a lot,
>>>> >> and as a result huge modules no longer fit into reserved area.
>>>> >>
>>>> >> This patch adds option CONFIG_MODULES_USE_VMALLOC which lifts
>>>> >> limitation on amount of loaded modules. It links modules using
>>>> >> long-calls (option -mlong-calls) and loads them into vmalloc area.
>>>> >>
>>>> >> In few places exported symbols are called from inline assembly.
>>>> >> This patch adds macro for such call sites: __asmbl and __asmbl_clobber.
>>>> >> Call turns into single 'bl' or sequence 'movw; movt; blx' depending on
>>>> >> context and state of config option.
>>>> >>
>>>> >> Unfortunately this option isn't compatible with CONFIG_FUNCTION_TRACER.
>>>> >> Compiler emits short calls to profiling function despite of -mlong-calls.
>>>> >> This is a bug in GCC, but ftrace anyway needs an update to handle this.
>>>> > It also isn't compatible with the older architectures which don't have
>>>> > "blx".
>>>>
>>>> Ok, I'll add "depends on CPU_V6 || CPU_V7" I don't think that it is
>>>> necessary for older cpus.
>>>
>>> Why not just use a different branch instruction for the older CPUs?
>>>
>>
>> ARMv6 doesn't support movw/movt so this will only work on v7.
>>
>> What about doing 'mov lr, pc; ldr pc,=symbol' instead? You clearly
>> don't care about performance in this case, so the performance hit (due
>> to the dcache access and interfering with the return stack predictors)
>> should be tolerable. The only thing to be careful about is thumb2
>> kernels: you would need to set the thumb bit in lr manually but only
>> if the call is made /from/ thumb. You would probably be better off
>> just depending on !THUMB2_KERNEL.
>
> Do you mean ldr pc, =symbol ?
>
> In this case I get this error:
>
> /tmp/ccAHtONU.s: Assembler messages:
> /tmp/ccAHtONU.s:220: Error: invalid literal constant: pool needs to be closer
>
> Probably constant pool doesn't work well in inline assembly.
>
>
> Something like this seems work:
>
> add     lr, pc, #4
> ldr       pc, [pc, #-4]
> .long symbol
>

You can add a '.ltorg' instruction which tells the assembler to dump
the literal pool, but you still need to jump over it, i.e.,

adr lr, 0f
ldr pc, =symbol
.ltorg
0:
-- 
Ard.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:02           ` Ard Biesheuvel
@ 2014-11-19 16:07             ` Russell King - ARM Linux
  2014-11-19 16:25               ` Ard Biesheuvel
  2014-11-19 16:37               ` Nicolas Pitre
  0 siblings, 2 replies; 22+ messages in thread
From: Russell King - ARM Linux @ 2014-11-19 16:07 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 19, 2014 at 05:02:40PM +0100, Ard Biesheuvel wrote:
> On 19 November 2014 16:52, Konstantin Khlebnikov <koct9i@gmail.com> wrote:
> > Do you mean ldr pc, =symbol ?
> >
> > In this case I get this error:
> >
> > /tmp/ccAHtONU.s: Assembler messages:
> > /tmp/ccAHtONU.s:220: Error: invalid literal constant: pool needs to be closer
> >
> > Probably constant pool doesn't work well in inline assembly.
> >
> >
> > Something like this seems work:
> >
> > add     lr, pc, #4
> > ldr       pc, [pc, #-4]
> > .long symbol
> >
> 
> You can add a '.ltorg' instruction which tells the assembler to dump
> the literal pool, but you still need to jump over it, i.e.,
> 
> adr lr, 0f
> ldr pc, =symbol
> .ltorg
> 0:

Which is not a good idea either, because the compiler needs to know how
far away its own manually generated literal pool is from the instructions
which reference it.  The .ltorg statement can end up emitting any number
of literals at that point, which makes it indeterminant how many words
are contained within the asm() statement.

Yes, it isn't desirable to waste an entire data cache line per indirect
call like the original quote above, but I don't see a practical
alternative.

-- 
FTTC broadband for 0.8mile line: currently at 9.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:07             ` Russell King - ARM Linux
@ 2014-11-19 16:25               ` Ard Biesheuvel
  2014-11-19 16:32                 ` Konstantin Khlebnikov
  2014-11-19 16:41                 ` Russell King - ARM Linux
  2014-11-19 16:37               ` Nicolas Pitre
  1 sibling, 2 replies; 22+ messages in thread
From: Ard Biesheuvel @ 2014-11-19 16:25 UTC (permalink / raw)
  To: linux-arm-kernel

On 19 November 2014 17:07, Russell King - ARM Linux
<linux@arm.linux.org.uk> wrote:
> On Wed, Nov 19, 2014 at 05:02:40PM +0100, Ard Biesheuvel wrote:
>> On 19 November 2014 16:52, Konstantin Khlebnikov <koct9i@gmail.com> wrote:
>> > Do you mean ldr pc, =symbol ?
>> >
>> > In this case I get this error:
>> >
>> > /tmp/ccAHtONU.s: Assembler messages:
>> > /tmp/ccAHtONU.s:220: Error: invalid literal constant: pool needs to be closer
>> >
>> > Probably constant pool doesn't work well in inline assembly.
>> >
>> >
>> > Something like this seems work:
>> >
>> > add     lr, pc, #4
>> > ldr       pc, [pc, #-4]
>> > .long symbol
>> >
>>
>> You can add a '.ltorg' instruction which tells the assembler to dump
>> the literal pool, but you still need to jump over it, i.e.,
>>
>> adr lr, 0f
>> ldr pc, =symbol
>> .ltorg
>> 0:
>
> Which is not a good idea either, because the compiler needs to know how
> far away its own manually generated literal pool is from the instructions
> which reference it.  The .ltorg statement can end up emitting any number
> of literals at that point, which makes it indeterminant how many words
> are contained within the asm() statement.
>

That applies to any inline asm statement in general: the compiler
assumes that the expanded size will not interfere with its ability to
emit literals after the function's return instruction.
Sometimes it will put a literal pool in the middle of the function if
it is very large, and I am not sure if an inline asm by itself would
ever trigger that heuristic to kick in.

But by the same logic, i.e., due to the fact that GCC manages its own
literals, the literal pool at the assembly level is unlikely to be so
large that you will actually hit this condition.

> Yes, it isn't desirable to waste an entire data cache line per indirect
> call like the original quote above, but I don't see a practical
> alternative.
>

We could at least add some labels instead of doing explicit pc arithmetic, i.e.,

adr lr, 1f
ldr pc, 0f
0: .long symbol
1:

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:25               ` Ard Biesheuvel
@ 2014-11-19 16:32                 ` Konstantin Khlebnikov
  2014-11-19 16:38                   ` Ard Biesheuvel
  2014-11-19 16:41                 ` Russell King - ARM Linux
  1 sibling, 1 reply; 22+ messages in thread
From: Konstantin Khlebnikov @ 2014-11-19 16:32 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 19, 2014 at 7:25 PM, Ard Biesheuvel
<ard.biesheuvel@linaro.org> wrote:
> On 19 November 2014 17:07, Russell King - ARM Linux
> <linux@arm.linux.org.uk> wrote:
>> On Wed, Nov 19, 2014 at 05:02:40PM +0100, Ard Biesheuvel wrote:
>>> On 19 November 2014 16:52, Konstantin Khlebnikov <koct9i@gmail.com> wrote:
>>> > Do you mean ldr pc, =symbol ?
>>> >
>>> > In this case I get this error:
>>> >
>>> > /tmp/ccAHtONU.s: Assembler messages:
>>> > /tmp/ccAHtONU.s:220: Error: invalid literal constant: pool needs to be closer
>>> >
>>> > Probably constant pool doesn't work well in inline assembly.
>>> >
>>> >
>>> > Something like this seems to work:
>>> >
>>> > add     lr, pc, #4
>>> > ldr       pc, [pc, #-4]
>>> > .long symbol
>>> >
>>>
>>> You can add a '.ltorg' instruction which tells the assembler to dump
>>> the literal pool, but you still need to jump over it, i.e.,
>>>
>>> adr lr, 0f
>>> ldr pc, =symbol
>>> .ltorg
>>> 0:
>>
>> Which is not a good idea either, because the compiler needs to know how
>> far away its own manually generated literal pool is from the instructions
>> which reference it.  The .ltorg statement can end up emitting any number
>> of literals at that point, which makes it indeterminate how many words
>> are contained within the asm() statement.
>>
>
> That applies to any inline asm statement in general: the compiler
> assumes that the expanded size will not interfere with its ability to
> emit literals after the function's return instruction.
> Sometimes it will put a literal pool in the middle of the function if
> it is very large, and I am not sure if an inline asm by itself would
> ever cause that heuristic to kick in.
>
> But by the same logic, i.e., due to the fact that GCC manages its own
> literals, the literal pool at the assembly level is unlikely to be so
> large that you will actually hit this condition.
>
>> Yes, it isn't desirable to waste an entire data cache line per indirect
>> call like the original quote above, but I don't see a practical
>> alternative.
>>
>
> We could at least add some labels instead of doing explicit pc arithmetic, i.e.,
>
> adr lr, 1f
> ldr pc, 0f
> 0: .long symbol
> 1:

I think we need some unique prefix here; this macro is used inside
bigger inline assembly constructions, and probably inside other macros.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:07             ` Russell King - ARM Linux
  2014-11-19 16:25               ` Ard Biesheuvel
@ 2014-11-19 16:37               ` Nicolas Pitre
  2014-11-19 16:41                 ` Ard Biesheuvel
  2014-11-19 16:49                 ` Russell King - ARM Linux
  1 sibling, 2 replies; 22+ messages in thread
From: Nicolas Pitre @ 2014-11-19 16:37 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:

> On Wed, Nov 19, 2014 at 05:02:40PM +0100, Ard Biesheuvel wrote:
> > On 19 November 2014 16:52, Konstantin Khlebnikov <koct9i@gmail.com> wrote:
> > > Do you mean ldr pc, =symbol ?
> > >
> > > In this case I get this error:
> > >
> > > /tmp/ccAHtONU.s: Assembler messages:
> > > /tmp/ccAHtONU.s:220: Error: invalid literal constant: pool needs to be closer
> > >
> > > Probably constant pool doesn't work well in inline assembly.
> > >
> > >
> > > Something like this seems to work:
> > >
> > > add     lr, pc, #4
> > > ldr       pc, [pc, #-4]
> > > .long symbol
> > >
> > 
> > You can add a '.ltorg' instruction which tells the assembler to dump
> > the literal pool, but you still need to jump over it, i.e.,
> > 
> > adr lr, 0f
> > ldr pc, =symbol
> > .ltorg
> > 0:
> 
> Which is not a good idea either, because the compiler needs to know how
> far away its own manually generated literal pool is from the instructions
> which reference it.  The .ltorg statement can end up emitting any number
> of literals at that point, which makes it indeterminate how many words
> are contained within the asm() statement.
> 
> Yes, it isn't desirable to waste an entire data cache line per indirect
> call like the original quote above, but I don't see a practical
> alternative.

Modules could be built without far calls by default, and then the module 
linker would only have to redirect those calls whose destination is too 
far away to a dynamically created trampoline table.

If I remember correctly you even posted some patches to that effect a 
couple years ago.  Maybe those could be salvaged?

I would largely recommend a solution where the link process could deal 
with it automatically and as needed rather than sprinkling yet more 
manually maintained macros into assembly code.


Nicolas

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:32                 ` Konstantin Khlebnikov
@ 2014-11-19 16:38                   ` Ard Biesheuvel
  2014-11-19 16:55                     ` Russell King - ARM Linux
  0 siblings, 1 reply; 22+ messages in thread
From: Ard Biesheuvel @ 2014-11-19 16:38 UTC (permalink / raw)
  To: linux-arm-kernel

On 19 November 2014 17:32, Konstantin Khlebnikov <koct9i@gmail.com> wrote:
> On Wed, Nov 19, 2014 at 7:25 PM, Ard Biesheuvel
> <ard.biesheuvel@linaro.org> wrote:
>> On 19 November 2014 17:07, Russell King - ARM Linux
>> <linux@arm.linux.org.uk> wrote:
>>> On Wed, Nov 19, 2014 at 05:02:40PM +0100, Ard Biesheuvel wrote:
>>>> On 19 November 2014 16:52, Konstantin Khlebnikov <koct9i@gmail.com> wrote:
>>>> > Do you mean ldr pc, =symbol ?
>>>> >
>>>> > In this case I get this error:
>>>> >
>>>> > /tmp/ccAHtONU.s: Assembler messages:
>>>> > /tmp/ccAHtONU.s:220: Error: invalid literal constant: pool needs to be closer
>>>> >
>>>> > Probably constant pool doesn't work well in inline assembly.
>>>> >
>>>> >
>>>> > Something like this seems to work:
>>>> >
>>>> > add     lr, pc, #4
>>>> > ldr       pc, [pc, #-4]
>>>> > .long symbol
>>>> >
>>>>
>>>> You can add a '.ltorg' instruction which tells the assembler to dump
>>>> the literal pool, but you still need to jump over it, i.e.,
>>>>
>>>> adr lr, 0f
>>>> ldr pc, =symbol
>>>> .ltorg
>>>> 0:
>>>
>>> Which is not a good idea either, because the compiler needs to know how
>>> far away its own manually generated literal pool is from the instructions
>>> which reference it.  The .ltorg statement can end up emitting any number
>>> of literals at that point, which makes it indeterminate how many words
>>> are contained within the asm() statement.
>>>
>>
>> That applies to any inline asm statement in general: the compiler
>> assumes that the expanded size will not interfere with its ability to
>> emit literals after the function's return instruction.
>> Sometimes it will put a literal pool in the middle of the function if
>> it is very large, and I am not sure if an inline asm by itself would
>> ever cause that heuristic to kick in.
>>
>> But by the same logic, i.e., due to the fact that GCC manages its own
>> literals, the literal pool at the assembly level is unlikely to be so
>> large that you will actually hit this condition.
>>
>>> Yes, it isn't desirable to waste an entire data cache line per indirect
>>> call like the original quote above, but I don't see a practical
>>> alternative.
>>>
>>
>> We could at least add some labels instead of doing explicit pc arithmetic, i.e.,
>>
>> adr lr, 1f
>> ldr pc, 0f
>> 0: .long symbol
>> 1:
>
> I think we need some unique prefix here; this macro is used inside
> bigger inline assembly constructions, and probably inside other macros.

Numbers are disambiguated by the f and b suffixes, so they can be
reused in the same .s file. So as long as you use a strictly numerical
prefix, you can deal correctly with the case where, for instance,
do_div() is called twice in the same compilation unit, and still not
clash with other inline asm

-- 
Ard.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:25               ` Ard Biesheuvel
  2014-11-19 16:32                 ` Konstantin Khlebnikov
@ 2014-11-19 16:41                 ` Russell King - ARM Linux
  1 sibling, 0 replies; 22+ messages in thread
From: Russell King - ARM Linux @ 2014-11-19 16:41 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 19, 2014 at 05:25:41PM +0100, Ard Biesheuvel wrote:
> On 19 November 2014 17:07, Russell King - ARM Linux
> <linux@arm.linux.org.uk> wrote:
> > On Wed, Nov 19, 2014 at 05:02:40PM +0100, Ard Biesheuvel wrote:
> > Which is not a good idea either, because the compiler needs to know how
> > far away its own manually generated literal pool is from the instructions
> > which reference it.  The .ltorg statement can end up emitting any number
> > of literals at that point, which makes it indeterminate how many words
> > are contained within the asm() statement.
> >
> 
> That applies to any inline asm statement in general: the compiler
> assumes that the expanded size will not interfere with its ability to
> emit literals after the function's return instruction.
> Sometimes it will put a literal pool in the middle of the function if
> it is very large, and I am not sure if an inline asm by itself would
> ever cause that heuristic to kick in.

The compiler works it out by counting the number of assembler delimiters
(iow, semicolons or newlines) in the asm() statement, and using that to
track how many instructions are present.

> > Yes, it isn't desirable to waste an entire data cache line per indirect
> > call like the original quote above, but I don't see a practical
> > alternative.
> 
> We could at least add some labels instead of doing explicit pc arithmetic, i.e.,
> 
> adr lr, 1f
> ldr pc, 0f
> 0: .long symbol
> 1:

Yes, but this doesn't get away from the performance impact of having one
word used in a D-cache line scattered throughout the code.  This is the
reason why I never looked at this as a serious option for kernel modules,
and decided to put the kernel modules below the kernel itself instead.

In older kernels, when we had the linking done by userspace insmod, I was
able to be much more clever in this regard - I was able to detect which
relocations were out of range, and I generated trampolines for each such
symbol.  What this relied upon was being able to parse the relocations
before allocating module space, so we knew what the maximum size of
trampolines needed for a particular module would be.

We don't have that luxury with the current approach - the earliest we get
to see the module is after the module space has been allocated, and the
module has been copied to that memory.  That leaves no room to extend the
allocation for the trampolines.

-- 
FTTC broadband for 0.8mile line: currently at 9.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:37               ` Nicolas Pitre
@ 2014-11-19 16:41                 ` Ard Biesheuvel
  2014-11-19 16:49                 ` Russell King - ARM Linux
  1 sibling, 0 replies; 22+ messages in thread
From: Ard Biesheuvel @ 2014-11-19 16:41 UTC (permalink / raw)
  To: linux-arm-kernel

On 19 November 2014 17:37, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
> On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:
>
>> On Wed, Nov 19, 2014 at 05:02:40PM +0100, Ard Biesheuvel wrote:
>> > On 19 November 2014 16:52, Konstantin Khlebnikov <koct9i@gmail.com> wrote:
>> > > Do you mean ldr pc, =symbol ?
>> > >
>> > > In this case I get this error:
>> > >
>> > > /tmp/ccAHtONU.s: Assembler messages:
>> > > /tmp/ccAHtONU.s:220: Error: invalid literal constant: pool needs to be closer
>> > >
>> > > Probably constant pool doesn't work well in inline assembly.
>> > >
>> > >
>> > > Something like this seems to work:
>> > >
>> > > add     lr, pc, #4
>> > > ldr       pc, [pc, #-4]
>> > > .long symbol
>> > >
>> >
>> > You can add a '.ltorg' instruction which tells the assembler to dump
>> > the literal pool, but you still need to jump over it, i.e.,
>> >
>> > adr lr, 0f
>> > ldr pc, =symbol
>> > .ltorg
>> > 0:
>>
>> Which is not a good idea either, because the compiler needs to know how
>> far away its own manually generated literal pool is from the instructions
>> which reference it.  The .ltorg statement can end up emitting any number
>> of literals at that point, which makes it indeterminate how many words
>> are contained within the asm() statement.
>>
>> Yes, it isn't desirable to waste an entire data cache line per indirect
>> call like the original quote above, but I don't see a practical
>> alternative.
>
> Modules could be built without far calls by default, and then the module
> linker would only have to redirect those calls whose destination is too
> far away to a dynamically created trampoline table.
>
> If I remember correctly you even posted some patches to that effect a
> couple years ago.  Maybe those could be salvaged?
>
> I would largely recommend a solution where the link process could deal
> with it automatically and as needed rather than sprinkling yet more
> manually maintained macros into assembly code.
>

Yes, that would be preferable. I played around with 'bl symbol@PLT'
instead of plain 'bl symbol' but unfortunately, our .ko's are not
proper ELF shared libraries, so that doesn't work.
But essentially, we just need a (eager) PLT.

-- 
Ard.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:37               ` Nicolas Pitre
  2014-11-19 16:41                 ` Ard Biesheuvel
@ 2014-11-19 16:49                 ` Russell King - ARM Linux
  2014-11-19 16:57                   ` Nicolas Pitre
  1 sibling, 1 reply; 22+ messages in thread
From: Russell King - ARM Linux @ 2014-11-19 16:49 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 19, 2014 at 11:37:47AM -0500, Nicolas Pitre wrote:
> On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:
> > Which is not a good idea either, because the compiler needs to know how
> > far away its own manually generated literal pool is from the instructions
> > which reference it.  The .ltorg statement can end up emitting any number
> > of literals at that point, which makes it indeterminate how many words
> > are contained within the asm() statement.
> > 
> > Yes, it isn't desirable to waste an entire data cache line per indirect
> > call like the original quote above, but I don't see a practical
> > alternative.
> 
> Modules could be built without far calls by default, and then the module 
> linker would only have to redirect those calls whose destination is too 
> far away to a dynamically created trampoline table.
> 
> If I remember correctly you even posted some patches to that effect a 
> couple years ago.  Maybe those could be salvaged?

I don't think I ever did, because it's pretty much impossible to do as I
explained in a follow up to this thread.

We _used_ to do this with the userspace insmod methods, but since we got
this kernel-side linker, it's been pretty much impossible to do without
rewriting the module code.  That's not going to happen on account of one
quirky architecture which Linus doesn't particularly like.

-- 
FTTC broadband for 0.8mile line: currently at 9.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:38                   ` Ard Biesheuvel
@ 2014-11-19 16:55                     ` Russell King - ARM Linux
  2014-11-19 16:59                       ` Nicolas Pitre
  0 siblings, 1 reply; 22+ messages in thread
From: Russell King - ARM Linux @ 2014-11-19 16:55 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 19, 2014 at 05:38:57PM +0100, Ard Biesheuvel wrote:
> Numbers are disambiguated by the f and b suffixes, so they can be
> reused in the same .s file. So as long as you use a strictly numerical
> prefix, you can deal correctly with the case where, for instance,
> do_div() is called twice in the same compilation unit, and still not
> clash with other inline asm

What's not particularly nice though is to hide these in a macro,
which itself may be part of a larger macro or code fragment also
using small numbers.

We probably ought to be a bit more intelligent about how we choose
these numbers inside macros, rather than just randomly picking some
and hoping that they don't clash.

-- 
FTTC broadband for 0.8mile line: currently at 9.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:49                 ` Russell King - ARM Linux
@ 2014-11-19 16:57                   ` Nicolas Pitre
  2014-11-19 16:59                     ` Russell King - ARM Linux
  0 siblings, 1 reply; 22+ messages in thread
From: Nicolas Pitre @ 2014-11-19 16:57 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:

> On Wed, Nov 19, 2014 at 11:37:47AM -0500, Nicolas Pitre wrote:
> > On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:
> > > Which is not a good idea either, because the compiler needs to know how
> > > far away its own manually generated literal pool is from the instructions
> > > which reference it.  The .ltorg statement can end up emitting any number
> > > of literals at that point, which makes it indeterminate how many words
> > > are contained within the asm() statement.
> > > 
> > > Yes, it isn't desirable to waste an entire data cache line per indirect
> > > call like the original quote above, but I don't see a practical
> > > alternative.
> > 
> > Modules could be built without far calls by default, and then the module 
> > linker would only have to redirect those calls whose destination is too 
> > far away to a dynamically created trampoline table.
> > 
> > If I remember correctly you even posted some patches to that effect a 
> > couple years ago.  Maybe those could be salvaged?
> 
> I don't think I ever did, because it's pretty much impossible to do as I
> explained in a follow up to this thread.
> 
> We _used_ to do this with the userspace insmod methods, but since we got
> this kernel-side linker, it's been pretty much impossible to do without
> rewriting the module code.  That's not going to happen on account of one
> quirky architecture which Linus doesn't particularly like.

Still... We could try adding a hook in the generic module linker code 
for a pre-relocation pass.  Maybe only ARM would use it, but if the need 
to load big modules is real then I imagine Linus could be amenable to a 
compromise.


Nicolas

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:55                     ` Russell King - ARM Linux
@ 2014-11-19 16:59                       ` Nicolas Pitre
  0 siblings, 0 replies; 22+ messages in thread
From: Nicolas Pitre @ 2014-11-19 16:59 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:

> On Wed, Nov 19, 2014 at 05:38:57PM +0100, Ard Biesheuvel wrote:
> > Numbers are disambiguated by the f and b suffixes, so they can be
> > reused in the same .s file. So as long as you use a strictly numerical
> > prefix, you can deal correctly with the case where, for instance,
> > do_div() is called twice in the same compilation unit, and still not
> > clash with other inline asm
> 
> What's not particularly nice though is to hide these in a macro,
> which itself may be part of a larger macro or code fragment also
> using small numbers.
> 
> We probably ought to be a bit more intelligent about how we choose
> these numbers inside macros, rather than just randomly picking some
> and hoping that they don't clash.

Maybe in this case, the macro content is so simple that labels are not 
warranted?  Getting away without them certainly solves the issue.


Nicolas

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:57                   ` Nicolas Pitre
@ 2014-11-19 16:59                     ` Russell King - ARM Linux
  2014-11-19 17:12                       ` Nicolas Pitre
  0 siblings, 1 reply; 22+ messages in thread
From: Russell King - ARM Linux @ 2014-11-19 16:59 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, Nov 19, 2014 at 11:57:15AM -0500, Nicolas Pitre wrote:
> On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:
> > I don't think I ever did, because it's pretty much impossible to do as I
> > explained in a follow up to this thread.
> > 
> > We _used_ to do this with the userspace insmod methods, but since we got
> > this kernel-side linker, it's been pretty much impossible to do without
> > rewriting the module code.  That's not going to happen on account of one
> > quirky architecture which Linus doesn't particularly like.
> 
> Still... We could try adding a hook in the generic module linker code 
> for a pre-relocation pass.  Maybe only ARM would use it, but if the need 
> to load big modules is real then I imagine Linus could be amenable to a 
> compromise.

So, how big a table would you allocate for the trampolines, based upon
not knowing anything about the module being loaded?  4K?  8K?  64K?

-- 
FTTC broadband for 0.8mile line: currently at 9.5Mbps down 400kbps up
according to speedtest.net.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 16:59                     ` Russell King - ARM Linux
@ 2014-11-19 17:12                       ` Nicolas Pitre
  2014-11-19 17:59                         ` Ard Biesheuvel
  0 siblings, 1 reply; 22+ messages in thread
From: Nicolas Pitre @ 2014-11-19 17:12 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:

> On Wed, Nov 19, 2014 at 11:57:15AM -0500, Nicolas Pitre wrote:
> > On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:
> > > I don't think I ever did, because it's pretty much impossible to do as I
> > > explained in a follow up to this thread.
> > > 
> > > We _used_ to do this with the userspace insmod methods, but since we got
> > > this kernel-side linker, it's been pretty much impossible to do without
> > > rewriting the module code.  That's not going to happen on account of one
> > > quirky architecture which Linus doesn't particularly like.
> > 
> > Still... We could try adding a hook in the generic module linker code 
> > for a pre-relocation pass.  Maybe only ARM would use it, but if the need 
> > to load big modules is real then I imagine Linus could be amenable to a 
> > compromise.
> 
> So, how big a table would you allocate for the trampolines, based upon
> not knowing anything about the module being loaded?  4K?  8K?  64K?

The idea of a pre-relocation pass is to determine that.  That could be 
something similar to calling apply_relocate() twice: once to determine 
the number of trampoline entries, and a second time to perform the 
actual relocation.


Nicolas

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 17:12                       ` Nicolas Pitre
@ 2014-11-19 17:59                         ` Ard Biesheuvel
  2014-11-19 18:22                           ` Nicolas Pitre
  0 siblings, 1 reply; 22+ messages in thread
From: Ard Biesheuvel @ 2014-11-19 17:59 UTC (permalink / raw)
  To: linux-arm-kernel

On 19 November 2014 18:12, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
> On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:
>
>> On Wed, Nov 19, 2014 at 11:57:15AM -0500, Nicolas Pitre wrote:
>> > On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:
>> > > I don't think I ever did, because it's pretty much impossible to do as I
>> > > explained in a follow up to this thread.
>> > >
>> > > We _used_ to do this with the userspace insmod methods, but since we got
>> > > this kernel-side linker, it's been pretty much impossible to do without
>> > > rewriting the module code.  That's not going to happen on account of one
>> > > quirky architecture which Linus doesn't particularly like.
>> >
>> > Still... We could try adding a hook in the generic module linker code
>> > for a pre-relocation pass.  Maybe only ARM would use it, but if the need
>> > to load big modules is real then I imagine Linus could be amenable to a
>> > compromise.
>>
>> So, how big a table would you allocate for the trampolines, based upon
>> not knowing anything about the module being loaded?  4K?  8K?  64K?
>
> The idea of a pre-relocation pass is to determine that.  That could be
> something similar to calling apply_relocate() twice: once to determine
> the number of trampoline entries, and a second time to perform the
> actual relocation.
>

Well, the veneers shouldn't take more than 3 words each, right?

ldr ip, [pc]
bx ip
.long symbol

and you would need at most one veneer per unique external symbol
referenced by one or more R_ARM_CALL relocations. Is there no way to
just add that to the static mem footprint as padding, and let the
loader populate it as needed at module relocation time?

-- 
Ard.

^ permalink raw reply	[flat|nested] 22+ messages in thread

* [PATCH RFC] ARM: option for loading modules into vmalloc area
  2014-11-19 17:59                         ` Ard Biesheuvel
@ 2014-11-19 18:22                           ` Nicolas Pitre
  0 siblings, 0 replies; 22+ messages in thread
From: Nicolas Pitre @ 2014-11-19 18:22 UTC (permalink / raw)
  To: linux-arm-kernel

On Wed, 19 Nov 2014, Ard Biesheuvel wrote:

> On 19 November 2014 18:12, Nicolas Pitre <nicolas.pitre@linaro.org> wrote:
> > On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:
> >
> >> On Wed, Nov 19, 2014 at 11:57:15AM -0500, Nicolas Pitre wrote:
> >> > On Wed, 19 Nov 2014, Russell King - ARM Linux wrote:
> >> > > I don't think I ever did, because it's pretty much impossible to do as I
> >> > > explained in a follow up to this thread.
> >> > >
> >> > > We _used_ to do this with the userspace insmod methods, but since we got
> >> > > this kernel-side linker, it's been pretty much impossible to do without
> >> > > rewriting the module code.  That's not going to happen on account of one
> >> > > quirky architecture which Linus doesn't particularly like.
> >> >
> >> > Still... We could try adding a hook in the generic module linker code
> >> > for a pre-relocation pass.  Maybe only ARM would use it, but if the need
> >> > to load big modules is real then I imagine Linus could be amenable to a
> >> > compromise.
> >>
> >> So, how big a table would you allocate for the trampolines, based upon
> >> not knowing anything about the module being loaded?  4K?  8K?  64K?
> >
> > The idea of a pre-relocation pass is to determine that.  That could be
> > something similar to calling apply_relocate() twice: once to determine
> > the number of trampoline entries, and a second time to perform the
> > actual relocation.
> >
> 
> Well, the veneers shouldn't take more than 3 words each, right?
> 
> ldr ip, [pc]
> bx ip
> .long symbol

You could possibly do:

 ldr pc, [pc, #-4]
 .long symbol

Or, as RMK suggested a while ago:

.rep 8
ldr pc, [pc, #(32 - 8)]
.endr
.long sym1, sym2, sym3, sym4, sym5, sym6, sym7, sym8

The latter is much nicer on the i and d caches.

> and you would need at most one veneer per unique external symbol
> referenced by one or more R_ARM_CALL relocations. Is there no way to
> just add that to the static mem footprint as padding, and let the
> loader populate it as needed at module relocation time?

That's the actual question: how much padding do you need?  Everything 
converges to that very problem.  We need to determine it without too much
impact on the generic module loader code.


Nicolas

^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2014-11-19 18:22 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-18 16:21 [PATCH RFC] ARM: option for loading modules into vmalloc area Konstantin Khlebnikov
2014-11-18 17:34 ` Russell King - ARM Linux
2014-11-18 18:13   ` Konstantin Khlebnikov
2014-11-19 13:40     ` Arnd Bergmann
2014-11-19 14:54       ` Ard Biesheuvel
2014-11-19 15:52         ` Konstantin Khlebnikov
2014-11-19 16:02           ` Ard Biesheuvel
2014-11-19 16:07             ` Russell King - ARM Linux
2014-11-19 16:25               ` Ard Biesheuvel
2014-11-19 16:32                 ` Konstantin Khlebnikov
2014-11-19 16:38                   ` Ard Biesheuvel
2014-11-19 16:55                     ` Russell King - ARM Linux
2014-11-19 16:59                       ` Nicolas Pitre
2014-11-19 16:41                 ` Russell King - ARM Linux
2014-11-19 16:37               ` Nicolas Pitre
2014-11-19 16:41                 ` Ard Biesheuvel
2014-11-19 16:49                 ` Russell King - ARM Linux
2014-11-19 16:57                   ` Nicolas Pitre
2014-11-19 16:59                     ` Russell King - ARM Linux
2014-11-19 17:12                       ` Nicolas Pitre
2014-11-19 17:59                         ` Ard Biesheuvel
2014-11-19 18:22                           ` Nicolas Pitre

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).