* [PATCH/RFC] mm: add and use batched version of __tlb_remove_table()
From: Nikita Yushchenko @ 2021-12-17  8:19 UTC (permalink / raw)
  To: Will Deacon, Aneesh Kumar K.V, Andrew Morton, Nick Piggin,
	Peter Zijlstra, Catalin Marinas, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, David S. Miller, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Arnd Bergmann
  Cc: x86, linux-kernel, linux-arch, linux-mm, linuxppc-dev,
	linux-s390, sparclinux, kernel

When batched page table freeing via struct mmu_table_batch is used, the
final freeing in __tlb_remove_table_free() executes a loop, calling
arch hook __tlb_remove_table() to free each table individually.

Shift that loop down to archs. This allows archs to optimize it, by
freeing multiple tables in a single release_pages() call. This is
faster than individual put_page() calls, especially with memcg
accounting enabled.

Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
Signed-off-by: Nikita Yushchenko <nikita.yushchenko@virtuozzo.com>
---
 arch/arm/include/asm/tlb.h                   |  5 ++++
 arch/arm64/include/asm/tlb.h                 |  5 ++++
 arch/powerpc/include/asm/book3s/32/pgalloc.h |  8 +++++++
 arch/powerpc/include/asm/book3s/64/pgalloc.h |  1 +
 arch/powerpc/include/asm/nohash/pgalloc.h    |  8 +++++++
 arch/powerpc/mm/book3s64/pgtable.c           |  8 +++++++
 arch/s390/include/asm/tlb.h                  |  1 +
 arch/s390/mm/pgalloc.c                       |  8 +++++++
 arch/sparc/include/asm/pgalloc_64.h          |  8 +++++++
 arch/x86/include/asm/tlb.h                   |  5 ++++
 include/asm-generic/tlb.h                    |  2 +-
 include/linux/swap.h                         |  5 +++-
 mm/mmu_gather.c                              |  6 +----
 mm/swap_state.c                              | 24 +++++++++++++++-----
 14 files changed, 81 insertions(+), 13 deletions(-)

diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index b8cbe03ad260..37f8a5193581 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -34,6 +34,11 @@ static inline void __tlb_remove_table(void *_table)
 	free_page_and_swap_cache((struct page *)_table);
 }
 
+static inline void __tlb_remove_tables(void **tables, int nr)
+{
+	free_pages_and_swap_cache_nolru((struct page **)tables, nr);
+}
+
 #include <asm-generic/tlb.h>
 
 static inline void
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index c995d1f4594f..c70dd428e1f6 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -16,6 +16,11 @@ static inline void __tlb_remove_table(void *_table)
 	free_page_and_swap_cache((struct page *)_table);
 }
 
+static inline void __tlb_remove_tables(void **tables, int nr)
+{
+	free_pages_and_swap_cache_nolru((struct page **)tables, nr);
+}
+
 #define tlb_flush tlb_flush
 static void tlb_flush(struct mmu_gather *tlb);
 
diff --git a/arch/powerpc/include/asm/book3s/32/pgalloc.h b/arch/powerpc/include/asm/book3s/32/pgalloc.h
index dc5c039eb28e..880369de688a 100644
--- a/arch/powerpc/include/asm/book3s/32/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/32/pgalloc.h
@@ -66,6 +66,14 @@ static inline void __tlb_remove_table(void *_table)
 	pgtable_free(table, shift);
 }
 
+static inline void __tlb_remove_tables(void **tables, int nr)
+{
+	int i;
+
+	for (i = 0; i < nr; i++)
+		__tlb_remove_table(tables[i]);
+}
+
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
 				  unsigned long address)
 {
diff --git a/arch/powerpc/include/asm/book3s/64/pgalloc.h b/arch/powerpc/include/asm/book3s/64/pgalloc.h
index e1af0b394ceb..f3dcd735e4ce 100644
--- a/arch/powerpc/include/asm/book3s/64/pgalloc.h
+++ b/arch/powerpc/include/asm/book3s/64/pgalloc.h
@@ -20,6 +20,7 @@ extern pmd_t *pmd_fragment_alloc(struct mm_struct *, unsigned long);
 extern void pmd_fragment_free(unsigned long *);
 extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
 extern void __tlb_remove_table(void *_table);
+extern void __tlb_remove_tables(void **tables, int nr);
 void pte_frag_destroy(void *pte_frag);
 
 static inline pgd_t *radix__pgd_alloc(struct mm_struct *mm)
diff --git a/arch/powerpc/include/asm/nohash/pgalloc.h b/arch/powerpc/include/asm/nohash/pgalloc.h
index 29c43665a753..170f5fda3dc1 100644
--- a/arch/powerpc/include/asm/nohash/pgalloc.h
+++ b/arch/powerpc/include/asm/nohash/pgalloc.h
@@ -63,6 +63,14 @@ static inline void __tlb_remove_table(void *_table)
 	pgtable_free(table, shift);
 }
 
+static inline void __tlb_remove_tables(void **tables, int nr)
+{
+	int i;
+
+	for (i = 0; i < nr; i++)
+		__tlb_remove_table(tables[i]);
+}
+
 static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t table,
 				  unsigned long address)
 {
diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c
index 9e16c7b1a6c5..f95fb42fadfa 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -412,6 +412,14 @@ void __tlb_remove_table(void *_table)
 	return pgtable_free(table, index);
 }
 
+void __tlb_remove_tables(void **tables, int nr)
+{
+	int i;
+
+	for (i = 0; i < nr; i++)
+		__tlb_remove_table(tables[i]);
+}
+
 #ifdef CONFIG_PROC_FS
 atomic_long_t direct_pages_count[MMU_PAGE_COUNT];
 
diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h
index fe6407f0eb1b..144d3db1441e 100644
--- a/arch/s390/include/asm/tlb.h
+++ b/arch/s390/include/asm/tlb.h
@@ -23,6 +23,7 @@
  */
 
 void __tlb_remove_table(void *_table);
+void __tlb_remove_tables(void **tables, int nr);
 static inline void tlb_flush(struct mmu_gather *tlb);
 static inline bool __tlb_remove_page_size(struct mmu_gather *tlb,
 					  struct page *page, int page_size);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 781965f7210e..6a685a895fdb 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -315,6 +315,14 @@ void __tlb_remove_table(void *_table)
 	}
 }
 
+void __tlb_remove_tables(void **tables, int nr)
+{
+	int i;
+
+	for (i = 0; i < nr; i++)
+		__tlb_remove_table(tables[i]);
+}
+
 /*
  * Base infrastructure required to generate basic asces, region, segment,
  * and page tables that do not make use of enhanced features like EDAT1.
diff --git a/arch/sparc/include/asm/pgalloc_64.h b/arch/sparc/include/asm/pgalloc_64.h
index 7b5561d17ab1..eb7c9bf46747 100644
--- a/arch/sparc/include/asm/pgalloc_64.h
+++ b/arch/sparc/include/asm/pgalloc_64.h
@@ -92,6 +92,14 @@ static inline void __tlb_remove_table(void *_table)
 		is_page = true;
 	pgtable_free(table, is_page);
 }
+
+static inline void __tlb_remove_tables(void **tables, int nr)
+{
+	int i;
+
+	for (i = 0; i < nr; i++)
+		__tlb_remove_table(tables[i]);
+}
 #else /* CONFIG_SMP */
 static inline void pgtable_free_tlb(struct mmu_gather *tlb, void *table, bool is_page)
 {
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 1bfe979bb9bc..253a62be888c 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -37,4 +37,9 @@ static inline void __tlb_remove_table(void *table)
 	free_page_and_swap_cache(table);
 }
 
+static inline void __tlb_remove_tables(void **tables, int nr)
+{
+	free_pages_and_swap_cache_nolru((struct page **)tables, nr);
+}
+
 #endif /* _ASM_X86_TLB_H */
diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index 2c68a545ffa7..923c65d986dc 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -148,7 +148,7 @@
  *  Useful if your architecture has non-page page directories.
  *
  *  When used, an architecture is expected to provide __tlb_remove_table()
- *  which does the actual freeing of these pages.
+ *  and __tlb_remove_tables() which do the actual freeing of these pages.
  *
  *  MMU_GATHER_RCU_TABLE_FREE
  *
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d1ea44b31f19..86a1b0a61889 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -460,6 +460,7 @@ extern void clear_shadow_from_swap_cache(int type, unsigned long begin,
 extern void free_swap_cache(struct page *);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
+extern void free_pages_and_swap_cache_nolru(struct page **, int);
 extern struct page *lookup_swap_cache(swp_entry_t entry,
 				      struct vm_area_struct *vma,
 				      unsigned long addr);
@@ -565,7 +566,9 @@ static inline struct address_space *swap_address_space(swp_entry_t entry)
 #define free_page_and_swap_cache(page) \
 	put_page(page)
 #define free_pages_and_swap_cache(pages, nr) \
-	release_pages((pages), (nr));
+	release_pages((pages), (nr))
+#define free_pages_and_swap_cache_nolru(pages, nr) \
+	release_pages((pages), (nr))
 
 static inline void free_swap_cache(struct page *page)
 {
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 1b9837419bf9..2faa0d59aeca 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -95,11 +95,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 
 static void __tlb_remove_table_free(struct mmu_table_batch *batch)
 {
-	int i;
-
-	for (i = 0; i < batch->nr; i++)
-		__tlb_remove_table(batch->tables[i]);
-
+	__tlb_remove_tables(batch->tables, batch->nr);
 	free_page((unsigned long)batch);
 }
 
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8d4104242100..76c3d4a756a3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -307,17 +307,29 @@ void free_page_and_swap_cache(struct page *page)
 
 /*
  * Passed an array of pages, drop them all from swapcache and then release
- * them.  They are removed from the LRU and freed if this is their last use.
+ * them.  They are optionally removed from the LRU and freed if this is their
+ * last use.
  */
-void free_pages_and_swap_cache(struct page **pages, int nr)
+static void __free_pages_and_swap_cache(struct page **pages, int nr,
+		bool do_lru)
 {
-	struct page **pagep = pages;
 	int i;
 
-	lru_add_drain();
+	if (do_lru)
+		lru_add_drain();
 	for (i = 0; i < nr; i++)
-		free_swap_cache(pagep[i]);
-	release_pages(pagep, nr);
+		free_swap_cache(pages[i]);
+	release_pages(pages, nr);
+}
+
+void free_pages_and_swap_cache(struct page **pages, int nr)
+{
+	__free_pages_and_swap_cache(pages, nr, true);
+}
+
+void free_pages_and_swap_cache_nolru(struct page **pages, int nr)
+{
+	__free_pages_and_swap_cache(pages, nr, false);
 }
 
 static inline bool swap_use_vma_readahead(void)
-- 
2.30.2



* Re: [PATCH/RFC] mm: add and use batched version of __tlb_remove_table()
From: Dave Hansen @ 2021-12-17 18:26 UTC (permalink / raw)
  To: Nikita Yushchenko, Will Deacon, Aneesh Kumar K.V, Andrew Morton,
	Nick Piggin, Peter Zijlstra, Catalin Marinas, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, David S. Miller,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	Arnd Bergmann
  Cc: x86, linux-kernel, linux-arch, linux-mm, linuxppc-dev,
	linux-s390, sparclinux, kernel

On 12/17/21 12:19 AM, Nikita Yushchenko wrote:
> When batched page table freeing via struct mmu_table_batch is used, the
> final freeing in __tlb_remove_table_free() executes a loop, calling
> arch hook __tlb_remove_table() to free each table individually.
> 
> Shift that loop down to archs. This allows archs to optimize it, by
> freeing multiple tables in a single release_pages() call. This is
> faster than individual put_page() calls, especially with memcg
> accounting enabled.

Could we quantify "faster"?  There's a non-trivial amount of code being
added here and it would be nice to back it up with some cold, hard numbers.

> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -95,11 +95,7 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
>  
>  static void __tlb_remove_table_free(struct mmu_table_batch *batch)
>  {
> -	int i;
> -
> -	for (i = 0; i < batch->nr; i++)
> -		__tlb_remove_table(batch->tables[i]);
> -
> +	__tlb_remove_tables(batch->tables, batch->nr);
>  	free_page((unsigned long)batch);
>  }

This leaves a single call-site for __tlb_remove_table():

> static void tlb_remove_table_one(void *table)
> {
>         tlb_remove_table_sync_one();
>         __tlb_remove_table(table);
> }

Is that worth it, or could it just be:

	__tlb_remove_tables(&table, 1);

?

> -void free_pages_and_swap_cache(struct page **pages, int nr)
> +static void __free_pages_and_swap_cache(struct page **pages, int nr,
> +		bool do_lru)
>  {
> -	struct page **pagep = pages;
>  	int i;
>  
> -	lru_add_drain();
> +	if (do_lru)
> +		lru_add_drain();
>  	for (i = 0; i < nr; i++)
> -		free_swap_cache(pagep[i]);
> -	release_pages(pagep, nr);
> +		free_swap_cache(pages[i]);
> +	release_pages(pages, nr);
> +}
> +
> +void free_pages_and_swap_cache(struct page **pages, int nr)
> +{
> +	__free_pages_and_swap_cache(pages, nr, true);
> +}
> +
> +void free_pages_and_swap_cache_nolru(struct page **pages, int nr)
> +{
> +	__free_pages_and_swap_cache(pages, nr, false);
>  }

This went unmentioned in the changelog.  But, it seems like there's a
specific optimization here.  In the exiting code,
free_pages_and_swap_cache() is wasteful if no page in pages[] is on the
LRU.  It doesn't need the lru_add_drain().

Any code that knows it is freeing all non-LRU pages can call
free_pages_and_swap_cache_nolru() which should perform better than
free_pages_and_swap_cache().

Should we add this to the for loop in __free_pages_and_swap_cache()?

	for (i = 0; i < nr; i++) {
		if (!do_lru)
			VM_WARN_ON_ONCE_PAGE(PageLRU(pages[i]),
					     pages[i]);
		free_swap_cache(...);
	}

But, even more than that, do all the architectures even need the
free_swap_cache()?  PageSwapCache() will always be false on x86, which
makes the loop kinda silly.  x86 could, for instance, just do:

static inline void __tlb_remove_tables(void **tables, int nr)
{
	release_pages((struct page **)tables, nr);
}

I _think_ this will work everywhere that has whole pages as page tables.
 Taking that one step further, what if we only had one generic:

static inline void tlb_remove_tables(void **tables, int nr)
{
#ifdef ARCH_PAGE_TABLES_ARE_FULL_PAGE
	release_pages((struct page **)tables, nr);
#else
	arch_tlb_remove_tables(tables, nr);
#endif
}

Architectures that set ARCH_PAGE_TABLES_ARE_FULL_PAGE (or whatever)
don't need to implement __tlb_remove_table() at all *and* can do
release_pages() directly.

This avoids all the confusion with the swap cache and LRU naming.


* Re: [PATCH/RFC] mm: add and use batched version of __tlb_remove_table()
From: Peter Zijlstra @ 2021-12-18  0:37 UTC (permalink / raw)
  To: Nikita Yushchenko
  Cc: Will Deacon, Aneesh Kumar K.V, Andrew Morton, Nick Piggin,
	Catalin Marinas, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, David S. Miller, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Arnd Bergmann, x86,
	linux-kernel, linux-arch, linux-mm, linuxppc-dev, linux-s390,
	sparclinux, kernel

On Fri, Dec 17, 2021 at 11:19:10AM +0300, Nikita Yushchenko wrote:
> When batched page table freeing via struct mmu_table_batch is used, the
> final freeing in __tlb_remove_table_free() executes a loop, calling
> arch hook __tlb_remove_table() to free each table individually.
> 
> Shift that loop down to archs. This allows archs to optimize it, by
> freeing multiple tables in a single release_pages() call. This is
> faster than individual put_page() calls, especially with memcg
> accounting enabled.
> 
> Signed-off-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
> Signed-off-by: Nikita Yushchenko <nikita.yushchenko@virtuozzo.com>
> ---
>  arch/arm/include/asm/tlb.h                   |  5 ++++
>  arch/arm64/include/asm/tlb.h                 |  5 ++++
>  arch/powerpc/include/asm/book3s/32/pgalloc.h |  8 +++++++
>  arch/powerpc/include/asm/book3s/64/pgalloc.h |  1 +
>  arch/powerpc/include/asm/nohash/pgalloc.h    |  8 +++++++
>  arch/powerpc/mm/book3s64/pgtable.c           |  8 +++++++
>  arch/s390/include/asm/tlb.h                  |  1 +
>  arch/s390/mm/pgalloc.c                       |  8 +++++++
>  arch/sparc/include/asm/pgalloc_64.h          |  8 +++++++
>  arch/x86/include/asm/tlb.h                   |  5 ++++
>  include/asm-generic/tlb.h                    |  2 +-
>  include/linux/swap.h                         |  5 +++-
>  mm/mmu_gather.c                              |  6 +----
>  mm/swap_state.c                              | 24 +++++++++++++++-----
>  14 files changed, 81 insertions(+), 13 deletions(-)

Oh gawd, that's terrible. Never, ever duplicate code like that.

I'm thinking the below does the same? But yes, please do as Dave said,
give us actual numbers that show this is worth it.

---
 arch/Kconfig                 |  4 ++++
 arch/arm/Kconfig             |  1 +
 arch/arm/include/asm/tlb.h   |  5 -----
 arch/arm64/Kconfig           |  1 +
 arch/arm64/include/asm/tlb.h |  5 -----
 arch/x86/Kconfig             |  1 +
 arch/x86/include/asm/tlb.h   |  4 ----
 mm/mmu_gather.c              | 22 +++++++++++++++++++---
 8 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 26b8ed11639d..f2bd3f5af2b1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -415,6 +415,10 @@ config HAVE_ARCH_JUMP_LABEL_RELATIVE
 config MMU_GATHER_TABLE_FREE
 	bool
 
+config MMU_GATHER_TABLE_PAGE
+	bool
+	depends on MMU_GATHER_TABLE_FREE
+
 config MMU_GATHER_RCU_TABLE_FREE
 	bool
 	select MMU_GATHER_TABLE_FREE
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f0f9e8bec83a..11baaa5719c2 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -110,6 +110,7 @@ config ARM
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE
+	select MMU_GATHER_TABLE_PAGE if MMU
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RSEQ
 	select HAVE_STACKPROTECTOR
diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h
index b8cbe03ad260..9d9b21649ca0 100644
--- a/arch/arm/include/asm/tlb.h
+++ b/arch/arm/include/asm/tlb.h
@@ -29,11 +29,6 @@
 #include <linux/swap.h>
 #include <asm/tlbflush.h>
 
-static inline void __tlb_remove_table(void *_table)
-{
-	free_page_and_swap_cache((struct page *)_table);
-}
-
 #include <asm-generic/tlb.h>
 
 static inline void
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c4207cf9bb17..4aa28fb03f4f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -196,6 +196,7 @@ config ARM64
 	select HAVE_FUNCTION_ARG_ACCESS_API
 	select HAVE_FUTEX_CMPXCHG if FUTEX
 	select MMU_GATHER_RCU_TABLE_FREE
+	select MMU_GATHER_TABLE_PAGE
 	select HAVE_RSEQ
 	select HAVE_STACKPROTECTOR
 	select HAVE_SYSCALL_TRACEPOINTS
diff --git a/arch/arm64/include/asm/tlb.h b/arch/arm64/include/asm/tlb.h
index c995d1f4594f..401826260a5c 100644
--- a/arch/arm64/include/asm/tlb.h
+++ b/arch/arm64/include/asm/tlb.h
@@ -11,11 +11,6 @@
 #include <linux/pagemap.h>
 #include <linux/swap.h>
 
-static inline void __tlb_remove_table(void *_table)
-{
-	free_page_and_swap_cache((struct page *)_table);
-}
-
 #define tlb_flush tlb_flush
 static void tlb_flush(struct mmu_gather *tlb);
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b9281fab4e3e..a22e653f4d0e 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -235,6 +235,7 @@ config X86
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select MMU_GATHER_RCU_TABLE_FREE		if PARAVIRT
+	select MMU_GATHER_TABLE_PAGE
 	select HAVE_POSIX_CPU_TIMERS_TASK_WORK
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
index 1bfe979bb9bc..dec5ffa3042a 100644
--- a/arch/x86/include/asm/tlb.h
+++ b/arch/x86/include/asm/tlb.h
@@ -32,9 +32,5 @@ static inline void tlb_flush(struct mmu_gather *tlb)
  * below 'ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE' in include/asm-generic/tlb.h
  * for more details.
  */
-static inline void __tlb_remove_table(void *table)
-{
-	free_page_and_swap_cache(table);
-}
 
 #endif /* _ASM_X86_TLB_H */
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 1b9837419bf9..0195d0f13ed3 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -93,13 +93,29 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 
 #ifdef CONFIG_MMU_GATHER_TABLE_FREE
 
-static void __tlb_remove_table_free(struct mmu_table_batch *batch)
+#ifdef CONFIG_MMU_GATHER_TABLE_PAGE
+static inline void __tlb_remove_table(void *table)
+{
+	free_page_and_swap_cache(table);
+}
+
+static inline void __tlb_remove_tables(void **tables, int nr)
+{
+	free_pages_and_swap_cache_nolru((struct page **)tables, nr);
+}
+#else
+static inline void __tlb_remove_tables(void **tables, int nr)
 {
 	int i;
 
-	for (i = 0; i < batch->nr; i++)
-		__tlb_remove_table(batch->tables[i]);
+	for (i = 0; i < nr; i++)
+		__tlb_remove_table(tables[i]);
+}
+#endif
 
+static void __tlb_remove_table_free(struct mmu_table_batch *batch)
+{
+	__tlb_remove_tables(batch->tables, batch->nr);
 	free_page((unsigned long)batch);
 }
 


* Re: [PATCH/RFC] mm: add and use batched version of __tlb_remove_table()
From: Nikita Yushchenko @ 2021-12-18 13:35 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Will Deacon, Aneesh Kumar K.V, Andrew Morton, Nick Piggin,
	Catalin Marinas, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, David S. Miller, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Arnd Bergmann, x86,
	linux-kernel, linux-arch, linux-mm, linuxppc-dev, linux-s390,
	sparclinux, kernel

> Oh gawd, that's terrible. Never, ever duplicate code like that.

What the patch does is:
- formally shift the loop one level down in the call graph, adding instances of __tlb_remove_tables() 
exactly where instances of __tlb_remove_table() already exist,
- on architectures where __tlb_remove_tables() would have resulted in calling free_page_and_swap_cache() 
in a loop, call the batched free_page_and_swap_cache_nolru() instead,
- in other places, keep the loop as is - perhaps as a target for future optimizations.

The extra duplication added by this patch just highlights the already existing duplication of 
__tlb_remove_table() implementations.

OK, let's follow your suggestion instead. AFAIU, that is:
- remove the free_page_and_swap_cache()-based implementation from archs,
- instead, add it into mm/mmu_gather.c, ifdef-ed by a new Kconfig option, and select that option in the 
archs that use it,
- then, keep the optimization inside mm/mmu_gather.c.

Indeed, the overall change becomes smaller that way. Thanks for the idea. Will post patches doing that soon.

Nikita


* Re: [PATCH/RFC] mm: add and use batched version of __tlb_remove_table()
From: Nikita Yushchenko @ 2021-12-18 13:38 UTC (permalink / raw)
  To: Sam Ravnborg
  Cc: Will Deacon, Aneesh Kumar K.V, Andrew Morton, Nick Piggin,
	Peter Zijlstra, Catalin Marinas, Heiko Carstens, Vasily Gorbik,
	Christian Borntraeger, David S. Miller, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Arnd Bergmann, x86,
	linux-kernel, linux-arch, linux-mm, linuxppc-dev, linux-s390,
	sparclinux, kernel

17.12.2021 21:39, Sam Ravnborg wrote:
> Hi Nikita,
> 
> How about adding the following to tlb.h:
> 
> #ifndef __tlb_remove_tables
> static void __tlb_remove_tables(...)
> {
> 	....
> }
> #endif
> 
> And then the few archs that want to override __tlb_remove_tables
> need to do a
> #define __tlb_remove_tables __tlb_remove_tables

Hi Sam.

Thanks for your suggestion.

I think that what Peter suggested in the other reply is even better. I will follow that approach.

Nikita


* Re: [PATCH/RFC] mm: add and use batched version of __tlb_remove_table()
From: Nikita Yushchenko @ 2021-12-18 14:31 UTC (permalink / raw)
  To: Dave Hansen, Will Deacon, Aneesh Kumar K.V, Andrew Morton,
	Nick Piggin, Peter Zijlstra, Catalin Marinas, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, David S. Miller,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	Arnd Bergmann
  Cc: x86, linux-kernel, linux-arch, linux-mm, linuxppc-dev,
	linux-s390, sparclinux, kernel

>> This allows archs to optimize it, by
>> freeing multiple tables in a single release_pages() call. This is
>> faster than individual put_page() calls, especially with memcg
>> accounting enabled.
> 
> Could we quantify "faster"?  There's a non-trivial amount of code being
> added here and it would be nice to back it up with some cold-hard numbers.

I currently don't have numbers for this patch taken alone. This patch originates from work done some 
years ago to reduce the cost of memory accounting, and an x86-only version of this patch has been in the 
virtuozzo/openvz kernel since then. Other patches from that work have been upstreamed, but this one was 
missed.

Still, it's obvious that release_pages() should be faster than a loop calling put_page() - isn't that 
exactly the reason why release_pages() exists and is different from a loop calling put_page()?
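
Concretely, with this patch the per-batch work in __tlb_remove_table_free() goes from one 
put_page()-backed free per table to a single batched call (simplified from the diff above; this is the 
x86/arm/arm64 variant of the hook):

	/* before: free_page_and_swap_cache() -> put_page(), once per table */
	for (i = 0; i < batch->nr; i++)
		__tlb_remove_table(batch->tables[i]);

	/* after: the whole batch is handed to release_pages() in one go */
	free_pages_and_swap_cache_nolru((struct page **)batch->tables, batch->nr);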

>>   static void __tlb_remove_table_free(struct mmu_table_batch *batch)
>>   {
>> -	int i;
>> -
>> -	for (i = 0; i < batch->nr; i++)
>> -		__tlb_remove_table(batch->tables[i]);
>> -
>> +	__tlb_remove_tables(batch->tables, batch->nr);
>>   	free_page((unsigned long)batch);
>>   }
> 
> This leaves a single call-site for __tlb_remove_table():
> 
>> static void tlb_remove_table_one(void *table)
>> {
>>          tlb_remove_table_sync_one();
>>          __tlb_remove_table(table);
>> }
> 
> Is that worth it, or could it just be:
> 
> 	__tlb_remove_tables(&table, 1);

I was considering that while preparing the patch; however, that resulted in an even larger change in 
the archs, due to the removal of the non-batched call, and I decided not to go that way.

And Peter's suggestion to integrate the free_page_and_swap_cache()-based implementation of __tlb_remove_table() 
into mm/mmu_gather.c under an ifdef, and then do the optimization locally in mm/mmu_gather.c, looks better.

>> +void free_pages_and_swap_cache_nolru(struct page **pages, int nr)
>> +{
>> +	__free_pages_and_swap_cache(pages, nr, false);
>>   }
> 
> This went unmentioned in the changelog.  But, it seems like there's a
> specific optimization here.  In the existing code,
> free_pages_and_swap_cache() is wasteful if no page in pages[] is on the
> LRU.  It doesn't need the lru_add_drain().

This is a somewhat different topic.

In the scope of this patch, the _nolru version was added because there was no lru draining in the looped 
call to __tlb_remove_table(). Adding draining to the batched version, although it won't break anything, 
does add overhead that was not there before, which is in direct conflict with the original goal.

If the version that drains the lru is indeed not needed, it can be cleaned up in the scope of a separate 
patchset.

> 		if (!do_lru)
> 			VM_WARN_ON_ONCE_PAGE(PageLRU(pages[i]),
> 					     pages[i]);
> 		free_swap_cache(...);

This looks like a good safety measure, will add it.
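
For v2 that would look something like this (untested sketch, simply folding your check into the helper 
added by this patch):

static void __free_pages_and_swap_cache(struct page **pages, int nr,
		bool do_lru)
{
	int i;

	if (do_lru)
		lru_add_drain();
	for (i = 0; i < nr; i++) {
		if (!do_lru)
			VM_WARN_ON_ONCE_PAGE(PageLRU(pages[i]), pages[i]);
		free_swap_cache(pages[i]);
	}
	release_pages(pages, nr);
}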

> But, even more than that, do all the architectures even need the
> free_swap_cache()?

I was under the impression that process page tables are a valid target for swapping out, although I may 
be wrong here.

Nikita


* Re: [PATCH/RFC] mm: add and use batched version of __tlb_remove_table()
From: Dave Hansen @ 2021-12-19  1:34 UTC (permalink / raw)
  To: Nikita Yushchenko, Will Deacon, Aneesh Kumar K.V, Andrew Morton,
	Nick Piggin, Peter Zijlstra, Catalin Marinas, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, David S. Miller,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	Arnd Bergmann
  Cc: x86, linux-kernel, linux-arch, linux-mm, linuxppc-dev,
	linux-s390, sparclinux, kernel

On 12/18/21 6:31 AM, Nikita Yushchenko wrote:
>>> This allows archs to optimize it, by
>>> freeing multiple tables in a single release_pages() call. This is
>>> faster than individual put_page() calls, especially with memcg
>>> accounting enabled.
>>
>> Could we quantify "faster"?  There's a non-trivial amount of code being
>> added here and it would be nice to back it up with some cold-hard
>> numbers.
> 
> I currently don't have numbers for this patch taken alone. This patch
> originates from work done some years ago to reduce the cost of memory
> accounting, and an x86-only version of this patch has been in the
> virtuozzo/openvz kernel since then. Other patches from that work have
> been upstreamed,
> but this one was missed.
> 
> Still, it's obvious that release_pages() should be faster than a loop
> calling put_page() - isn't that exactly the reason why release_pages()
> exists and is different from a loop calling put_page()?

Yep, but this patch does a bunch of stuff to some really hot paths.  It
would be greatly appreciated if you could put in the effort to actually
put some numbers behind this.  Plenty of weird stuff happens on
computers that we suck at predicting.

I'd be happy with even a quick little micro.  My favorite is:

	https://github.com/antonblanchard/will-it-scale

Although, I do wonder if anything will even be measurable.  Please at
least try.
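
Even something dumb like the below (hypothetical and untested; assumes x86-64's 2MB PMD span) would 
hammer the path in question: fault in one byte per 2MB of a big anonymous mapping so each 2MB chunk 
gets its own PTE page, then munmap() the whole thing and time the loop.

#include <stdio.h>
#include <sys/mman.h>
#include <time.h>

#define PMD_SPAN	(2UL << 20)	/* one PTE table per 2MB on x86-64 */
#define NR_TABLES	4096UL		/* page tables allocated per iteration */
#define MAP_SIZE	(PMD_SPAN * NR_TABLES)
#define ITERS		50UL

int main(void)
{
	struct timespec t0, t1;
	unsigned long i, iter;
	double secs;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	for (iter = 0; iter < ITERS; iter++) {
		char *p = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
		if (p == MAP_FAILED) {
			perror("mmap");
			return 1;
		}
		/* touch one byte per 2MB chunk: allocates one PTE page each */
		for (i = 0; i < MAP_SIZE; i += PMD_SPAN)
			p[i] = 1;
		/* teardown frees all of those page tables via mmu_gather */
		munmap(p, MAP_SIZE);
	}
	clock_gettime(CLOCK_MONOTONIC, &t1);

	secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
	printf("%lu page-table frees in %.3f s\n", NR_TABLES * ITERS, secs);
	return 0;
}

The delta may well be in the noise, but that would be useful to know too.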

...
>> But, even more than that, do all the architectures even need the
>> free_swap_cache()?
> 
> I was under the impression that process page tables are a valid target for
> swapping out, although I may be wrong here.

It's not out of the realm of possibilities.  But, last I checked, the
only path we free page tables in was when VMAs are being torn down.  I
have a longstanding TODO item to reclaim them if they're empty (all
zeros) or to zero them out if they're mapping page cache.


* Re: [PATCH/RFC] mm: add and use batched version of __tlb_remove_table()
From: Nikita Yushchenko @ 2021-12-23  9:55 UTC (permalink / raw)
  To: Dave Hansen, Will Deacon, Aneesh Kumar K.V, Andrew Morton,
	Nick Piggin, Peter Zijlstra, Catalin Marinas, Heiko Carstens,
	Vasily Gorbik, Christian Borntraeger, David S. Miller,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	Arnd Bergmann
  Cc: x86, linux-kernel, linux-arch, linux-mm, linuxppc-dev,
	linux-s390, sparclinux, kernel


>> I currently don't have numbers for this patch taken alone. This patch
>> originates from work done some years ago to reduce the cost of memory
>> accounting, and an x86-only version of this patch has been in the
>> virtuozzo/openvz kernel since then. Other patches from that work have
>> been upstreamed,
>> but this one was missed.
>>
>> Still, it's obvious that release_pages() should be faster than a loop
>> calling put_page() - isn't that exactly the reason why release_pages()
>> exists and is different from a loop calling put_page()?
> 
> Yep, but this patch does a bunch of stuff to some really hot paths.  It
> would be greatly appreciated if you could put in the effort to actually
> put some numbers behind this.  Plenty of weird stuff happens on
> computers that we suck at predicting.

I found the original report about the high cost of memory accounting, and tried to repeat the test 
described there, with and without the patch.

The test is: run a script in 30 openvz containers in parallel and measure the average time per execution. 
The script is attached.

I'm getting a measurable improvement in the average time per execution: 15360 ms without the patch, 
15170 ms with it. And this difference is reliably reproducible.

Nikita

[-- Attachment #2: calcprimes.sh --]
[-- Type: application/x-sh, Size: 468 bytes --]

