linux-mm.kvack.org archive mirror
* [PATCH 0/3] send tlb_remove_table_smp_sync IPI only to necessary CPUs
@ 2023-04-04 13:42 Yair Podemsky
  2023-04-04 13:42 ` [PATCH 1/3] arch: Introduce ARCH_HAS_CPUMASK_BITS Yair Podemsky
                   ` (2 more replies)
  0 siblings, 3 replies; 37+ messages in thread
From: Yair Podemsky @ 2023-04-04 13:42 UTC (permalink / raw)
  To: linux, mpe, npiggin, christophe.leroy, hca, gor, agordeev,
	borntraeger, svens, davem, tglx, mingo, bp, dave.hansen, x86,
	hpa, will, aneesh.kumar, akpm, peterz, arnd, keescook, paulmck,
	jpoimboe, samitolvanen, frederic, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, vschneid, dhildenb
  Cc: ypodemsk, alougovs

Currently the tlb_remove_table_smp_sync IPI is sent to all CPUs
indiscriminately. This causes unnecessary work and delays, which are
notable in real-time use-cases and on isolated CPUs.
By limiting the IPI to CPUs that reference the affected mm and are
currently in kernel mode, latency is improved.
A config option that marks architectures which maintain mm_cpumask
allows this feature to be used safely only where it is supported.
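
In code terms the sync call changes roughly as follows (simplified
sketch; the actual patches below carry the full context):

/* before: interrupt every online CPU unconditionally */
smp_call_function(tlb_remove_table_smp_sync, NULL, 1);

/* after: only CPUs that currently have this mm loaded (patch 2) and are
 * executing in kernel mode (patch 3), on architectures that maintain
 * mm_cpumask (patch 1) */
on_each_cpu_cond_mask(cpu_in_kernel, tlb_remove_table_smp_sync,
		      NULL, true, mm_cpumask(mm));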

Yair Podemsky (3):
  arch: Introduce ARCH_HAS_CPUMASK_BITS
  mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to MM CPUs
  mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in
    kernel mode

-- 
2.31.1



^ permalink raw reply	[flat|nested] 37+ messages in thread

* [PATCH 1/3] arch: Introduce ARCH_HAS_CPUMASK_BITS
  2023-04-04 13:42 [PATCH 0/3] send tlb_remove_table_smp_sync IPI only to necessary CPUs Yair Podemsky
@ 2023-04-04 13:42 ` Yair Podemsky
  2023-04-04 13:47   ` David Hildenbrand
  2023-04-04 13:42 ` [PATCH 2/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to MM CPUs Yair Podemsky
  2023-04-04 13:42 ` [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode Yair Podemsky
  2 siblings, 1 reply; 37+ messages in thread
From: Yair Podemsky @ 2023-04-04 13:42 UTC (permalink / raw)
  To: linux, mpe, npiggin, christophe.leroy, hca, gor, agordeev,
	borntraeger, svens, davem, tglx, mingo, bp, dave.hansen, x86,
	hpa, will, aneesh.kumar, akpm, peterz, arnd, keescook, paulmck,
	jpoimboe, samitolvanen, frederic, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, vschneid, dhildenb
  Cc: ypodemsk, alougovs

Some architectures set and maintain the mm_cpumask bits when loading a
process onto a CPU or removing it.
This Kconfig option marks those architectures, allowing behavior to
differ between kernels that maintain the mm_cpumask and those that do not.

Signed-off-by: Yair Podemsky <ypodemsk@redhat.com>
---
 arch/Kconfig         | 8 ++++++++
 arch/arm/Kconfig     | 1 +
 arch/powerpc/Kconfig | 1 +
 arch/s390/Kconfig    | 1 +
 arch/sparc/Kconfig   | 1 +
 arch/x86/Kconfig     | 1 +
 6 files changed, 13 insertions(+)

diff --git a/arch/Kconfig b/arch/Kconfig
index e3511afbb7f2..ec5559779e9f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1434,6 +1434,14 @@ config ARCH_HAS_NONLEAF_PMD_YOUNG
 	  address translations. Page table walkers that clear the accessed bit
 	  may use this capability to reduce their search space.
 
+config ARCH_HAS_CPUMASK_BITS
+	bool
+	help
+	  Architectures that select this option set bits on the mm_cpumask
+	  to mark which CPUs have loaded the mm. The mask can then be used
+	  to control mm-specific actions such as tlb_flush.
+
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index e24a9820e12f..6111059a68a3 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -70,6 +70,7 @@ config ARM
 	select GENERIC_SCHED_CLOCK
 	select GENERIC_SMP_IDLE_THREAD
 	select HARDIRQS_SW_RESEND
+	select ARCH_HAS_CPUMASK_BITS
 	select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT
 	select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6
 	select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index a6c4407d3ec8..2fd0160f4f8e 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -144,6 +144,7 @@ config PPC
 	select ARCH_HAS_TICK_BROADCAST		if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAS_UACCESS_FLUSHCACHE
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
+	select ARCH_HAS_CPUMASK_BITS
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_KEEP_MEMBLOCK
 	select ARCH_MIGHT_HAVE_PC_PARPORT
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 9809c74e1240..b2de5ee07faf 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -86,6 +86,7 @@ config S390
 	select ARCH_HAS_SYSCALL_WRAPPER
 	select ARCH_HAS_UBSAN_SANITIZE_ALL
 	select ARCH_HAS_VDSO_DATA
+	select ARCH_HAS_CPUMASK_BITS
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
 	select ARCH_INLINE_READ_LOCK
 	select ARCH_INLINE_READ_LOCK_BH
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 84437a4c6545..f9e0cf26d447 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -98,6 +98,7 @@ config SPARC64
 	select ARCH_HAS_PTE_SPECIAL
 	select PCI_DOMAINS if PCI
 	select ARCH_HAS_GIGANTIC_PAGE
+	select ARCH_HAS_CPUMASK_BITS
 	select HAVE_SOFTIRQ_ON_OWN_STACK
 	select HAVE_SETUP_PER_CPU_AREA
 	select NEED_PER_CPU_EMBED_FIRST_CHUNK
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a825bf031f49..d98dfdf9c6b4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -183,6 +183,7 @@ config X86
 	select HAVE_ARCH_THREAD_STRUCT_WHITELIST
 	select HAVE_ARCH_STACKLEAK
 	select HAVE_ARCH_TRACEHOOK
+	select ARCH_HAS_CPUMASK_BITS
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD if X86_64
 	select HAVE_ARCH_USERFAULTFD_WP         if X86_64 && USERFAULTFD
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 2/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to MM CPUs
  2023-04-04 13:42 [PATCH 0/3] send tlb_remove_table_smp_sync IPI only to necessary CPUs Yair Podemsky
  2023-04-04 13:42 ` [PATCH 1/3] arch: Introduce ARCH_HAS_CPUMASK_BITS Yair Podemsky
@ 2023-04-04 13:42 ` Yair Podemsky
  2023-04-04 14:57   ` Peter Zijlstra
  2023-04-04 13:42 ` [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode Yair Podemsky
  2 siblings, 1 reply; 37+ messages in thread
From: Yair Podemsky @ 2023-04-04 13:42 UTC (permalink / raw)
  To: linux, mpe, npiggin, christophe.leroy, hca, gor, agordeev,
	borntraeger, svens, davem, tglx, mingo, bp, dave.hansen, x86,
	hpa, will, aneesh.kumar, akpm, peterz, arnd, keescook, paulmck,
	jpoimboe, samitolvanen, frederic, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, vschneid, dhildenb
  Cc: ypodemsk, alougovs

Currently the tlb_remove_table_smp_sync IPI is sent to all CPUs
indiscriminately. This causes unnecessary work and delays, which are
notable in real-time use-cases and on isolated CPUs.
This patch limits the IPI on systems with ARCH_HAS_CPUMASK_BITS,
where it is only sent to CPUs referencing the affected mm.

Signed-off-by: Yair Podemsky <ypodemsk@redhat.com>
Suggested-by: David Hildenbrand <david@redhat.com>
---
 include/asm-generic/tlb.h |  4 ++--
 mm/khugepaged.c           |  4 ++--
 mm/mmu_gather.c           | 17 ++++++++++++-----
 3 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h
index b46617207c93..0b6ba17cc8d3 100644
--- a/include/asm-generic/tlb.h
+++ b/include/asm-generic/tlb.h
@@ -222,7 +222,7 @@ extern void tlb_remove_table(struct mmu_gather *tlb, void *table);
 #define tlb_needs_table_invalidate() (true)
 #endif
 
-void tlb_remove_table_sync_one(void);
+void tlb_remove_table_sync_one(struct mm_struct *mm);
 
 #else
 
@@ -230,7 +230,7 @@ void tlb_remove_table_sync_one(void);
 #error tlb_needs_table_invalidate() requires MMU_GATHER_RCU_TABLE_FREE
 #endif
 
-static inline void tlb_remove_table_sync_one(void) { }
+static inline void tlb_remove_table_sync_one(struct mm_struct *mm) { }
 
 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 92e6f56a932d..2b4e6ca1f38e 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1070,7 +1070,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
 	_pmd = pmdp_collapse_flush(vma, address, pmd);
 	spin_unlock(pmd_ptl);
 	mmu_notifier_invalidate_range_end(&range);
-	tlb_remove_table_sync_one();
+	tlb_remove_table_sync_one(mm);
 
 	spin_lock(pte_ptl);
 	result =  __collapse_huge_page_isolate(vma, address, pte, cc,
@@ -1427,7 +1427,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v
 				addr + HPAGE_PMD_SIZE);
 	mmu_notifier_invalidate_range_start(&range);
 	pmd = pmdp_collapse_flush(vma, addr, pmdp);
-	tlb_remove_table_sync_one();
+	tlb_remove_table_sync_one(mm);
 	mmu_notifier_invalidate_range_end(&range);
 	mm_dec_nr_ptes(mm);
 	page_table_check_pte_clear_range(mm, addr, pmd);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 2b93cf6ac9ae..5ea9be6fb87c 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -191,7 +191,13 @@ static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
-void tlb_remove_table_sync_one(void)
+#ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
+#define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
+#else
+#define REMOVE_TABLE_IPI_MASK NULL
+#endif /* CONFIG_ARCH_HAS_CPUMASK_BITS */
+
+void tlb_remove_table_sync_one(struct mm_struct *mm)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -200,7 +206,8 @@ void tlb_remove_table_sync_one(void)
 	 * It is however sufficient for software page-table walkers that rely on
 	 * IRQ disabling.
 	 */
-	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
+			NULL, true);
 }
 
 static void tlb_remove_table_rcu(struct rcu_head *head)
@@ -237,9 +244,9 @@ static inline void tlb_table_invalidate(struct mmu_gather *tlb)
 	}
 }
 
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(struct mm_struct *mm, void *table)
 {
-	tlb_remove_table_sync_one();
+	tlb_remove_table_sync_one(mm);
 	__tlb_remove_table(table);
 }
 
@@ -262,7 +269,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {
 			tlb_table_invalidate(tlb);
-			tlb_remove_table_one(table);
+			tlb_remove_table_one(tlb->mm, table);
 			return;
 		}
 		(*batch)->nr = 0;
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-04 13:42 [PATCH 0/3] send tlb_remove_table_smp_sync IPI only to necessary CPUs Yair Podemsky
  2023-04-04 13:42 ` [PATCH 1/3] arch: Introduce ARCH_HAS_CPUMASK_BITS Yair Podemsky
  2023-04-04 13:42 ` [PATCH 2/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to MM CPUs Yair Podemsky
@ 2023-04-04 13:42 ` Yair Podemsky
  2023-04-04 14:03   ` David Hildenbrand
                     ` (2 more replies)
  2 siblings, 3 replies; 37+ messages in thread
From: Yair Podemsky @ 2023-04-04 13:42 UTC (permalink / raw)
  To: linux, mpe, npiggin, christophe.leroy, hca, gor, agordeev,
	borntraeger, svens, davem, tglx, mingo, bp, dave.hansen, x86,
	hpa, will, aneesh.kumar, akpm, peterz, arnd, keescook, paulmck,
	jpoimboe, samitolvanen, frederic, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, vschneid, dhildenb
  Cc: ypodemsk, alougovs

The tlb_remove_table_smp_sync IPI is used to ensure that an outdated
page-table page is no longer being accessed, so that it can be freed.
That is the case once all CPUs have left the lockless gup code section.
If they re-enter the page table walk, the pointers will already refer to
the new pages.
Therefore the IPI is only needed for CPUs currently in kernel mode.
By preventing the IPI from being sent to CPUs not in kernel mode,
latencies are reduced.

Race condition considerations:
The context state check is vulnerable to races between the moment the
context state is read and the moment the IPI is sent (or not sent).

Here are the possible scenarios.
case 1:
CPU-A                                             CPU-B

                                                  state == CONTEXT_KERNEL
int state = atomic_read(&ct->state);
                                                  Kernel-exit:
                                                  state == CONTEXT_USER
if (state & CT_STATE_MASK == CONTEXT_KERNEL)

In this case, the IPI will be sent to CPU-B even though it is no longer
in the kernel. The consequence is an unnecessary IPI being handled by
CPU-B, adding latency. Without this patch this would have been the case
every time.

case 2:
CPU-A                                             CPU-B

modify pagetables
tlb_flush (memory barrier)
                                                  state == CONTEXT_USER
int state = atomic_read(&ct->state);
                                                  Kernel-enter:
                                                  state == CONTEXT_KERNEL
                                                  READ(pagetable values)
if (state & CT_STATE_MASK == CONTEXT_USER)

In this case, the IPI will not be sent to CPU-B even though it returns
to the kernel and even reads the pagetable.
However, since CPU-B enters the pagetable walk after the modification,
it reads the new, safe values.

The only case in which the IPI is truly necessary is when CPU-B entered
the lockless gup code section before the pagetable modification and has
yet to exit it, in which case it is still in the kernel.

Signed-off-by: Yair Podemsky <ypodemsk@redhat.com>
---
 mm/mmu_gather.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 5ea9be6fb87c..731d955e152d 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -9,6 +9,7 @@
 #include <linux/smp.h>
 #include <linux/swap.h>
 #include <linux/rmap.h>
+#include <linux/context_tracking_state.h>
 
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
@@ -191,6 +192,20 @@ static void tlb_remove_table_smp_sync(void *arg)
 	/* Simply deliver the interrupt */
 }
 
+
+#ifdef CONFIG_CONTEXT_TRACKING
+static bool cpu_in_kernel(int cpu, void *info)
+{
+	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
+	int state = atomic_read(&ct->state);
+	/* will return true only for cpus in kernel space */
+	return state & CT_STATE_MASK == CONTEXT_KERNEL;
+}
+#define CONTEXT_PREDICATE cpu_in_kernel
+#else
+#define CONTEXT_PREDICATE NULL
+#endif /* CONFIG_CONTEXT_TRACKING */
+
 #ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
 #define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
 #else
@@ -206,8 +221,8 @@ void tlb_remove_table_sync_one(struct mm_struct *mm)
 	 * It is however sufficient for software page-table walkers that rely on
 	 * IRQ disabling.
 	 */
-	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
-			NULL, true);
+	on_each_cpu_cond_mask(CONTEXT_PREDICATE, tlb_remove_table_smp_sync,
+			NULL, true, REMOVE_TABLE_IPI_MASK);
 }
 
 static void tlb_remove_table_rcu(struct rcu_head *head)
-- 
2.31.1



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH 1/3] arch: Introduce ARCH_HAS_CPUMASK_BITS
  2023-04-04 13:42 ` [PATCH 1/3] arch: Introduce ARCH_HAS_CPUMASK_BITS Yair Podemsky
@ 2023-04-04 13:47   ` David Hildenbrand
  0 siblings, 0 replies; 37+ messages in thread
From: David Hildenbrand @ 2023-04-04 13:47 UTC (permalink / raw)
  To: Yair Podemsky, linux, mpe, npiggin, christophe.leroy, hca, gor,
	agordeev, borntraeger, svens, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, will, aneesh.kumar, akpm, peterz, arnd,
	keescook, paulmck, jpoimboe, samitolvanen, frederic, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, vschneid, dhildenb
  Cc: alougovs

On 04.04.23 15:42, Yair Podemsky wrote:
> Some architectures set and maintain the mm_cpumask bits when loading
> or removing process from cpu.
> This Kconfig will mark those to allow different behavior between
> kernels that maintain the mm_cpumask and those that do not.
> 

I was wondering if we should do something along the lines of:

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 0722859c3647..1f5c15d8e8ed 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -767,11 +767,13 @@ struct mm_struct {
  #endif /* CONFIG_LRU_GEN */
         } __randomize_layout;

+#ifdef CONFIG_MM_CPUMASK
         /*
          * The mm_cpumask needs to be at the end of mm_struct, because it
          * is dynamically sized based on nr_cpu_ids.
          */
         unsigned long cpu_bitmap[];
+#endif
  };

But that would, of course, require additional changes to make it
compile. What concerns me a bit is that we have an mm_cpumask() usage
in mm/rmap.c. But it's glued to
CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH ... shaky.

At least if we properly fenced it, there would be no
accidental abuse anymore.


> Signed-off-by: Yair Podemsky <ypodemsk@redhat.com>
> ---
>   arch/Kconfig         | 8 ++++++++
>   arch/arm/Kconfig     | 1 +
>   arch/powerpc/Kconfig | 1 +
>   arch/s390/Kconfig    | 1 +
>   arch/sparc/Kconfig   | 1 +
>   arch/x86/Kconfig     | 1 +

As Valentin says, there are other architectures that do the same.

-- 
Thanks,

David / dhildenb



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-04 13:42 ` [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode Yair Podemsky
@ 2023-04-04 14:03   ` David Hildenbrand
  2023-04-04 15:12   ` Peter Zijlstra
  2023-04-05 10:43   ` Frederic Weisbecker
  2 siblings, 0 replies; 37+ messages in thread
From: David Hildenbrand @ 2023-04-04 14:03 UTC (permalink / raw)
  To: Yair Podemsky, linux, mpe, npiggin, christophe.leroy, hca, gor,
	agordeev, borntraeger, svens, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, will, aneesh.kumar, akpm, peterz, arnd,
	keescook, paulmck, jpoimboe, samitolvanen, frederic, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, vschneid, dhildenb
  Cc: alougovs

On 04.04.23 15:42, Yair Podemsky wrote:
> The tlb_remove_table_smp_sync IPI is used to ensure the outdated tlb page
> is not currently being accessed and can be cleared.
> This occurs once all CPUs have left the lockless gup code section.
> If they reenter the page table walk, the pointers will be to the new
> pages.
> Therefore the IPI is only needed for CPUs in kernel mode.
> By preventing the IPI from being sent to CPUs not in kernel mode,
> latencies are reduced.
> 
> Race conditions considerations:
> The context state check is vulnerable to race conditions between the
> moment the context state is read to when the IPI is sent (or not).
> 
> Here are these scenarios.
> case 1:
> CPU-A                                             CPU-B
> 
>                                                    state == CONTEXT_KERNEL
> int state = atomic_read(&ct->state);
>                                                    Kernel-exit:
>                                                    state == CONTEXT_USER
> if (state & CT_STATE_MASK == CONTEXT_KERNEL)
> 
> In this case, the IPI will be sent to CPU-B even though it is no longer
> in the kernel. The consequence is an unnecessary IPI being handled by
> CPU-B, adding latency. Without this patch this would have been the case
> every time.
> 
> case 2:
> CPU-A                                             CPU-B
> 
> modify pagetables
> tlb_flush (memory barrier)
>                                                    state == CONTEXT_USER
> int state = atomic_read(&ct->state);
>                                                    Kernel-enter:
>                                                    state == CONTEXT_KERNEL
>                                                    READ(pagetable values)
> if (state & CT_STATE_MASK == CONTEXT_USER)
> 
> In this case, the IPI will not be sent to CPU-B despite it returning to
> the kernel and even reading the pagetable.
> However since this CPU-B has entered the pagetable after the
> modification it is reading the new, safe values.
> 
> The only case when this IPI is truly necessary is when CPU-B has entered
> the lockless gup code section before the pagetable modifications and
> has yet to exit them, in which case it is still in the kernel.
> 
> Signed-off-by: Yair Podemsky <ypodemsk@redhat.com>
> ---
>   mm/mmu_gather.c | 19 +++++++++++++++++--
>   1 file changed, 17 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index 5ea9be6fb87c..731d955e152d 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -9,6 +9,7 @@
>   #include <linux/smp.h>
>   #include <linux/swap.h>
>   #include <linux/rmap.h>
> +#include <linux/context_tracking_state.h>
>   
>   #include <asm/pgalloc.h>
>   #include <asm/tlb.h>
> @@ -191,6 +192,20 @@ static void tlb_remove_table_smp_sync(void *arg)
>   	/* Simply deliver the interrupt */
>   }
>   
> +
> +#ifdef CONFIG_CONTEXT_TRACKING
> +static bool cpu_in_kernel(int cpu, void *info)
> +{
> +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> +	int state = atomic_read(&ct->state);
> +	/* will return true only for cpus in kernel space */
> +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> +}
> +#define CONTEXT_PREDICATE cpu_in_kernel
> +#else
> +#define CONTEXT_PREDICATE NULL
> +#endif /* CONFIG_CONTEXT_TRACKING */
> +
>   #ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
>   #define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
>   #else
> @@ -206,8 +221,8 @@ void tlb_remove_table_sync_one(struct mm_struct *mm)
>   	 * It is however sufficient for software page-table walkers that rely on
>   	 * IRQ disabling.
>   	 */
> -	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
> -			NULL, true);
> +	on_each_cpu_cond_mask(CONTEXT_PREDICATE, tlb_remove_table_smp_sync,
> +			NULL, true, REMOVE_TABLE_IPI_MASK);
>   }
>   
>   static void tlb_remove_table_rcu(struct rcu_head *head)


Maybe it's a bit cleaner to avoid CONTEXT_PREDICATE, though still not completely
nice (an empty dummy function "cpu_maybe_in_kernel" might be cleanest but would
be slightly slower for !CONFIG_CONTEXT_TRACKING):

#ifdef CONFIG_CONTEXT_TRACKING
static bool cpu_in_kernel(int cpu, void *info)
{
	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
	int state = atomic_read(&ct->state);
	/* will return true only for cpus in kernel space */
	return state & CT_STATE_MASK == CONTEXT_KERNEL;
}
#endif /* CONFIG_CONTEXT_TRACKING */


...
#ifdef CONFIG_CONTEXT_TRACKING
	on_each_cpu_cond_mask(cpu_in_kernel, tlb_remove_table_smp_sync,
			      NULL, true, REMOVE_TABLE_IPI_MASK);
#else /* CONFIG_CONTEXT_TRACKING */
	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
			 NULL, true);
#endif /* CONFIG_CONTEXT_TRACKING */


-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 2/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to MM CPUs
  2023-04-04 13:42 ` [PATCH 2/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to MM CPUs Yair Podemsky
@ 2023-04-04 14:57   ` Peter Zijlstra
  0 siblings, 0 replies; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-04 14:57 UTC (permalink / raw)
  To: Yair Podemsky
  Cc: linux, mpe, npiggin, christophe.leroy, hca, gor, agordeev,
	borntraeger, svens, davem, tglx, mingo, bp, dave.hansen, x86,
	hpa, will, aneesh.kumar, akpm, arnd, keescook, paulmck, jpoimboe,
	samitolvanen, frederic, ardb, juerg.haefliger, rmk+kernel,
	geert+renesas, tony, linus.walleij, sebastian.reichel,
	nick.hawkins, linux-kernel, linux-arm-kernel, linuxppc-dev,
	linux-s390, sparclinux, linux-arch, linux-mm, mtosatti, vschneid,
	dhildenb, alougovs

On Tue, Apr 04, 2023 at 04:42:23PM +0300, Yair Podemsky wrote:
> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index 2b93cf6ac9ae..5ea9be6fb87c 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -191,7 +191,13 @@ static void tlb_remove_table_smp_sync(void *arg)
>  	/* Simply deliver the interrupt */
>  }
>  
> -void tlb_remove_table_sync_one(void)
> +#ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
> +#define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
> +#else
> +#define REMOVE_TABLE_IPI_MASK NULL
> +#endif /* CONFIG_ARCH_HAS_CPUMASK_BITS */
> +
> +void tlb_remove_table_sync_one(struct mm_struct *mm)
>  {
>  	/*
>  	 * This isn't an RCU grace period and hence the page-tables cannot be
> @@ -200,7 +206,8 @@ void tlb_remove_table_sync_one(void)
>  	 * It is however sufficient for software page-table walkers that rely on
>  	 * IRQ disabling.
>  	 */
> -	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
> +	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
> +			NULL, true);
>  }

Uhh, I don't think NULL is a valid @mask argument. Should that not be
something like:

#ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
#define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
#else
#define REMOVE_TABLE_IPI_MASK cpu_online_mask
#endif

	preempt_disable();
	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync, NULL, true);
	preempt_enable();


?


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-04 13:42 ` [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode Yair Podemsky
  2023-04-04 14:03   ` David Hildenbrand
@ 2023-04-04 15:12   ` Peter Zijlstra
  2023-04-04 16:00     ` Peter Zijlstra
  2023-04-05 10:43   ` Frederic Weisbecker
  2 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-04 15:12 UTC (permalink / raw)
  To: Yair Podemsky
  Cc: linux, mpe, npiggin, christophe.leroy, hca, gor, agordeev,
	borntraeger, svens, davem, tglx, mingo, bp, dave.hansen, x86,
	hpa, will, aneesh.kumar, akpm, arnd, keescook, paulmck, jpoimboe,
	samitolvanen, frederic, ardb, juerg.haefliger, rmk+kernel,
	geert+renesas, tony, linus.walleij, sebastian.reichel,
	nick.hawkins, linux-kernel, linux-arm-kernel, linuxppc-dev,
	linux-s390, sparclinux, linux-arch, linux-mm, mtosatti, vschneid,
	dhildenb, alougovs, Frederic Weisbecker

On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> The tlb_remove_table_smp_sync IPI is used to ensure the outdated tlb page
> is not currently being accessed and can be cleared.
> This occurs once all CPUs have left the lockless gup code section.
> If they reenter the page table walk, the pointers will be to the new
> pages.
> Therefore the IPI is only needed for CPUs in kernel mode.
> By preventing the IPI from being sent to CPUs not in kernel mode,
> latencies are reduced.
> 
> Race conditions considerations:
> The context state check is vulnerable to race conditions between the
> moment the context state is read to when the IPI is sent (or not).
> 
> Here are these scenarios.
> case 1:
> CPU-A                                             CPU-B
> 
>                                                   state == CONTEXT_KERNEL
> int state = atomic_read(&ct->state);
>                                                   Kernel-exit:
>                                                   state == CONTEXT_USER
> if (state & CT_STATE_MASK == CONTEXT_KERNEL)
> 
> In this case, the IPI will be sent to CPU-B even though it is no longer
> in the kernel. The consequence is an unnecessary IPI being handled by
> CPU-B, adding latency. Without this patch this would have been the case
> every time.
> 
> case 2:
> CPU-A                                             CPU-B
> 
> modify pagetables
> tlb_flush (memory barrier)
>                                                   state == CONTEXT_USER
> int state = atomic_read(&ct->state);
>                                                   Kernel-enter:
>                                                   state == CONTEXT_KERNEL
>                                                   READ(pagetable values)
> if (state & CT_STATE_MASK == CONTEXT_USER)
> 
> In this case, the IPI will not be sent to CPU-B despite it returning to
> the kernel and even reading the pagetable.
> However since this CPU-B has entered the pagetable after the
> modification it is reading the new, safe values.
> 
> The only case when this IPI is truly necessary is when CPU-B has entered
> the lockless gup code section before the pagetable modifications and
> has yet to exit them, in which case it is still in the kernel.
> 
> Signed-off-by: Yair Podemsky <ypodemsk@redhat.com>
> ---
>  mm/mmu_gather.c | 19 +++++++++++++++++--
>  1 file changed, 17 insertions(+), 2 deletions(-)
> 
> diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
> index 5ea9be6fb87c..731d955e152d 100644
> --- a/mm/mmu_gather.c
> +++ b/mm/mmu_gather.c
> @@ -9,6 +9,7 @@
>  #include <linux/smp.h>
>  #include <linux/swap.h>
>  #include <linux/rmap.h>
> +#include <linux/context_tracking_state.h>
>  
>  #include <asm/pgalloc.h>
>  #include <asm/tlb.h>
> @@ -191,6 +192,20 @@ static void tlb_remove_table_smp_sync(void *arg)
>  	/* Simply deliver the interrupt */
>  }
>  
> +
> +#ifdef CONFIG_CONTEXT_TRACKING
> +static bool cpu_in_kernel(int cpu, void *info)
> +{
> +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> +	int state = atomic_read(&ct->state);
> +	/* will return true only for cpus in kernel space */
> +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> +}
> +#define CONTEXT_PREDICATE cpu_in_kernel
> +#else
> +#define CONTEXT_PREDICATE NULL
> +#endif /* CONFIG_CONTEXT_TRACKING */
> +
>  #ifdef CONFIG_ARCH_HAS_CPUMASK_BITS
>  #define REMOVE_TABLE_IPI_MASK mm_cpumask(mm)
>  #else
> @@ -206,8 +221,8 @@ void tlb_remove_table_sync_one(struct mm_struct *mm)
>  	 * It is however sufficient for software page-table walkers that rely on
>  	 * IRQ disabling.
>  	 */
> -	on_each_cpu_mask(REMOVE_TABLE_IPI_MASK, tlb_remove_table_smp_sync,
> -			NULL, true);
> +	on_each_cpu_cond_mask(CONTEXT_PREDICATE, tlb_remove_table_smp_sync,
> +			NULL, true, REMOVE_TABLE_IPI_MASK);
>  }

I think this is correct; but... I would like much of the changelog
included in a comment above cpu_in_kernel(). I'm sure someone will try
and read this code and wonder about those race conditions.

Of crucial importance is the fact that the page-table modification comes
before the tlbi.
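
Something along these lines, perhaps (only a sketch, with the wording
lifted from the changelog; also note the added parentheses, '==' binds
tighter than '&'):

/*
 * cpu_in_kernel() - racy check whether @cpu currently runs in kernel mode.
 *
 * The check can race with the remote CPU changing context between the
 * ct->state read and the IPI being sent (or skipped):
 *
 *  - CPU seen in kernel mode but exiting to userspace before the IPI
 *    arrives: only a spurious interruption, which is what happened
 *    unconditionally before this optimization.
 *
 *  - CPU seen in user mode but entering the kernel before the IPI would
 *    have been sent: it enters after the page-table modification (the
 *    caller modifies the page-table and flushes the TLB before reading
 *    ct->state), so any page-table walk it starts sees the new entries.
 *
 * The IPI is only required for CPUs that entered the lockless GUP
 * section before the page-table modification and have not left it yet;
 * those are by definition still in kernel mode.
 */
static bool cpu_in_kernel(int cpu, void *info)
{
	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
	int state = atomic_read(&ct->state);

	return (state & CT_STATE_MASK) == CONTEXT_KERNEL;
}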

Also, do we really not already have this helper function somewhere? It
seems like an obvious thing to already have, Frederic?




^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-04 15:12   ` Peter Zijlstra
@ 2023-04-04 16:00     ` Peter Zijlstra
  2023-04-05  0:53       ` Hillf Danton
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-04 16:00 UTC (permalink / raw)
  To: Yair Podemsky
  Cc: linux, mpe, npiggin, christophe.leroy, hca, gor, agordeev,
	borntraeger, svens, davem, tglx, mingo, bp, dave.hansen, x86,
	hpa, will, aneesh.kumar, akpm, arnd, keescook, paulmck, jpoimboe,
	samitolvanen, frederic, ardb, juerg.haefliger, rmk+kernel,
	geert+renesas, tony, linus.walleij, sebastian.reichel,
	nick.hawkins, linux-kernel, linux-arm-kernel, linuxppc-dev,
	linux-s390, sparclinux, linux-arch, linux-mm, mtosatti, vschneid,
	dhildenb, alougovs, Frederic Weisbecker

On Tue, Apr 04, 2023 at 05:12:17PM +0200, Peter Zijlstra wrote:
> > case 2:
> > CPU-A                                             CPU-B
> > 
> > modify pagetables
> > tlb_flush (memory barrier)
> >                                                   state == CONTEXT_USER
> > int state = atomic_read(&ct->state);
> >                                                   Kernel-enter:
> >                                                   state == CONTEXT_KERNEL
> >                                                   READ(pagetable values)
> > if (state & CT_STATE_MASK == CONTEXT_USER)
> > 


Hmm, hold up; what about memory ordering? We need a store-load ordering
between the page-table write and the context tracking load, and a
store-load order on the context tracking update and software page-table
walker loads.

Now, iirc page-table modification is done under pte_lock (or
page_table_lock) and that only provides a RELEASE barrier on this end,
which is insufficient to order against a later load.

Is there anything else?

On the state tracking side, we have ct_state_inc() which is
atomic_add_return() which should provide full barrier and is sufficient.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-04 16:00     ` Peter Zijlstra
@ 2023-04-05  0:53       ` Hillf Danton
  0 siblings, 0 replies; 37+ messages in thread
From: Hillf Danton @ 2023-04-05  0:53 UTC (permalink / raw)
  To: Yair Podemsky
  Cc: Peter Zijlstra, npiggin, dave.hansen, will, linux-kernel,
	linux-mm, dhildenb, Frederic Weisbecker

On Tue, 4 Apr 2023 18:00:38 +0200 Peter Zijlstra <peterz@infradead.org>
> On Tue, Apr 04, 2023 at 05:12:17PM +0200, Peter Zijlstra wrote:
> > > case 2:
> > > CPU-A                                             CPU-B
> > > 
> > > modify pagetables
> > > tlb_flush (memory barrier)
> > >                                                   state == CONTEXT_USER
> > > int state = atomic_read(&ct->state);
> > >                                                   Kernel-enter:
> > >                                                   state == CONTEXT_KERNEL
> > >                                                   READ(pagetable values)
> > > if (state & CT_STATE_MASK == CONTEXT_USER)
> > > 
> 
> Hmm, hold up; what about memory ordering? We need a store-load ordering
> between the page-table write and the context tracking load, and a
> store-load order on the context tracking update and software page-table
> walker loads.
> 
> Now, iirc page-table modification is done under pte_lock (or
> page_table_lock) and that only provides a RELEASE barrier on this end,
> which is insufficient to order against a later load.
> 
> Is there anything else?

Hmm, Yair, could you specify what you meant by the memory barrier after tlb_flush
in the diagram above?

> 
> On the state tracking side, we have ct_state_inc() which is
> atomic_add_return() which should provide full barrier and is sufficient.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-04 13:42 ` [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode Yair Podemsky
  2023-04-04 14:03   ` David Hildenbrand
  2023-04-04 15:12   ` Peter Zijlstra
@ 2023-04-05 10:43   ` Frederic Weisbecker
  2023-04-05 11:10     ` Frederic Weisbecker
  2023-04-05 19:43     ` Marcelo Tosatti
  2 siblings, 2 replies; 37+ messages in thread
From: Frederic Weisbecker @ 2023-04-05 10:43 UTC (permalink / raw)
  To: Yair Podemsky
  Cc: linux, mpe, npiggin, christophe.leroy, hca, gor, agordeev,
	borntraeger, svens, davem, tglx, mingo, bp, dave.hansen, x86,
	hpa, will, aneesh.kumar, akpm, peterz, arnd, keescook, paulmck,
	jpoimboe, samitolvanen, ardb, juerg.haefliger, rmk+kernel,
	geert+renesas, tony, linus.walleij, sebastian.reichel,
	nick.hawkins, linux-kernel, linux-arm-kernel, linuxppc-dev,
	linux-s390, sparclinux, linux-arch, linux-mm, mtosatti, vschneid,
	dhildenb, alougovs

On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> @@ -191,6 +192,20 @@ static void tlb_remove_table_smp_sync(void *arg)
>  	/* Simply deliver the interrupt */
>  }
>  
> +
> +#ifdef CONFIG_CONTEXT_TRACKING
> +static bool cpu_in_kernel(int cpu, void *info)
> +{
> +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);

Like Peter said, an smp_mb() is required here before the read (unless there is
already one between the page table modification and that ct->state read?).

So that you have this pairing:


           WRITE page_table                  WRITE ct->state
	   smp_mb()                          smp_mb() // implied by atomic_fetch_or
           READ ct->state                    READ page_table

> +	int state = atomic_read(&ct->state);
> +	/* will return true only for cpus in kernel space */
> +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> +}
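
For illustration, a minimal sketch with the barrier placed in the
predicate itself (not from the posted patch; a single smp_mb() before
the on_each_cpu_cond_mask() call would also work and avoid repeating it
per target CPU):

static bool cpu_in_kernel(int cpu, void *info)
{
	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
	int state;

	/*
	 * Order the caller's page-table write before the ct->state read;
	 * pairs with the full barrier in the remote CPU's context
	 * tracking state update.
	 */
	smp_mb();
	state = atomic_read(&ct->state);

	return (state & CT_STATE_MASK) == CONTEXT_KERNEL;
}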

Also note that this doesn't strictly prevent userspace from being interrupted.
You may well observe the CPU in kernel but it may receive the IPI later after
switching to userspace.

We could arrange to avoid that by marking ct->state with a pending work bit
to flush upon user entry/exit, but that's a bit more overhead, so I first need to
know about your expectations here, i.e.: can you tolerate such an occasional
interruption or not?

Thanks.



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 10:43   ` Frederic Weisbecker
@ 2023-04-05 11:10     ` Frederic Weisbecker
  2023-04-05 11:41       ` Peter Zijlstra
  2023-04-05 19:45       ` Marcelo Tosatti
  2023-04-05 19:43     ` Marcelo Tosatti
  1 sibling, 2 replies; 37+ messages in thread
From: Frederic Weisbecker @ 2023-04-05 11:10 UTC (permalink / raw)
  To: Yair Podemsky
  Cc: linux, mpe, npiggin, christophe.leroy, hca, gor, agordeev,
	borntraeger, svens, davem, tglx, mingo, bp, dave.hansen, x86,
	hpa, will, aneesh.kumar, akpm, peterz, arnd, keescook, paulmck,
	jpoimboe, samitolvanen, ardb, juerg.haefliger, rmk+kernel,
	geert+renesas, tony, linus.walleij, sebastian.reichel,
	nick.hawkins, linux-kernel, linux-arm-kernel, linuxppc-dev,
	linux-s390, sparclinux, linux-arch, linux-mm, mtosatti, vschneid,
	dhildenb, alougovs

On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > +	int state = atomic_read(&ct->state);
> > +	/* will return true only for cpus in kernel space */
> > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > +}
> 
> Also note that this doesn't strictly prevent userspace from being interrupted.
> You may well observe the CPU in kernel but it may receive the IPI later after
> switching to userspace.
> 
> We could arrange for avoiding that with marking ct->state with a pending work bit
> to flush upon user entry/exit but that's a bit more overhead so I first need to
> know about your expectations here, ie: can you tolerate such an occasional
> interruption or not?

Bah, actually what can we do to prevent that racy IPI? Not much, I fear...



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 11:10     ` Frederic Weisbecker
@ 2023-04-05 11:41       ` Peter Zijlstra
  2023-04-05 12:00         ` David Hildenbrand
  2023-04-05 12:05         ` Frederic Weisbecker
  2023-04-05 19:45       ` Marcelo Tosatti
  1 sibling, 2 replies; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-05 11:41 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Yair Podemsky, linux, mpe, npiggin, christophe.leroy, hca, gor,
	agordeev, borntraeger, svens, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, will, aneesh.kumar, akpm, arnd, keescook,
	paulmck, jpoimboe, samitolvanen, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, vschneid, dhildenb, alougovs

On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
> On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> > On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > > +	int state = atomic_read(&ct->state);
> > > +	/* will return true only for cpus in kernel space */
> > > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > > +}
> > 
> > Also note that this doesn't strictly prevent userspace from being interrupted.
> > You may well observe the CPU in kernel but it may receive the IPI later after
> > switching to userspace.
> > 
> > We could arrange for avoiding that with marking ct->state with a pending work bit
> > to flush upon user entry/exit but that's a bit more overhead so I first need to
> > know about your expectations here, ie: can you tolerate such an occasional
> > interruption or not?
> 
> Bah, actually what can we do to prevent from that racy IPI? Not much I fear...

Yeah, so I don't think that's actually a problem. The premise is that
*IFF* NOHZ_FULL stays in userspace, then it will never observe the IPI.

If it violates this by doing syscalls or other kernel entries; it gets
to keep the pieces.




^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 11:41       ` Peter Zijlstra
@ 2023-04-05 12:00         ` David Hildenbrand
  2023-04-05 12:05         ` Frederic Weisbecker
  1 sibling, 0 replies; 37+ messages in thread
From: David Hildenbrand @ 2023-04-05 12:00 UTC (permalink / raw)
  To: Peter Zijlstra, Frederic Weisbecker
  Cc: Yair Podemsky, linux, mpe, npiggin, christophe.leroy, hca, gor,
	agordeev, borntraeger, svens, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, will, aneesh.kumar, akpm, arnd, keescook,
	paulmck, jpoimboe, samitolvanen, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, vschneid, dhildenb, alougovs

On 05.04.23 13:41, Peter Zijlstra wrote:
> On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
>> On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
>>> On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
>>>> +	int state = atomic_read(&ct->state);
>>>> +	/* will return true only for cpus in kernel space */
>>>> +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
>>>> +}
>>>
>>> Also note that this doesn't strictly prevent userspace from being interrupted.
>>> You may well observe the CPU in kernel but it may receive the IPI later after
>>> switching to userspace.
>>>
>>> We could arrange for avoiding that with marking ct->state with a pending work bit
>>> to flush upon user entry/exit but that's a bit more overhead so I first need to
>>> know about your expectations here, ie: can you tolerate such an occasional
>>> interruption or not?
>>
>> Bah, actually what can we do to prevent from that racy IPI? Not much I fear...
> 
> Yeah, so I don't think that's actually a problem. The premise is that
> *IFF* NOHZ_FULL stays in userspace, then it will never observe the IPI.
> 
> If it violates this by doing syscalls or other kernel entries; it gets
> to keep the pieces.

Yair is currently on vacation, so I'm replying on his behalf.

Indeed, RT userspace is supposed to not call into the kernel, that's the 
premise.

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 11:41       ` Peter Zijlstra
  2023-04-05 12:00         ` David Hildenbrand
@ 2023-04-05 12:05         ` Frederic Weisbecker
  2023-04-05 12:31           ` Frederic Weisbecker
  2023-04-05 12:45           ` Valentin Schneider
  1 sibling, 2 replies; 37+ messages in thread
From: Frederic Weisbecker @ 2023-04-05 12:05 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Yair Podemsky, linux, mpe, npiggin, christophe.leroy, hca, gor,
	agordeev, borntraeger, svens, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, will, aneesh.kumar, akpm, arnd, keescook,
	paulmck, jpoimboe, samitolvanen, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, vschneid, dhildenb, alougovs

On Wed, Apr 05, 2023 at 01:41:48PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
> > On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> > > On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > > > +	int state = atomic_read(&ct->state);
> > > > +	/* will return true only for cpus in kernel space */
> > > > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > > > +}
> > > 
> > > Also note that this doesn't strictly prevent userspace from being interrupted.
> > > You may well observe the CPU in kernel but it may receive the IPI later after
> > > switching to userspace.
> > > 
> > > We could arrange for avoiding that with marking ct->state with a pending work bit
> > > to flush upon user entry/exit but that's a bit more overhead so I first need to
> > > know about your expectations here, ie: can you tolerate such an occasional
> > > interruption or not?
> > 
> > Bah, actually what can we do to prevent from that racy IPI? Not much I fear...
> 
> Yeah, so I don't think that's actually a problem. The premise is that
> *IFF* NOHZ_FULL stays in userspace, then it will never observe the IPI.
> 
> If it violates this by doing syscalls or other kernel entries; it gets
> to keep the pieces.

Ok so how about the following (only build tested)?

Two things:

1) It has the advantage of checking context tracking _after_ the llist_add(), so
   it really can't be misused ordering-wise.

2) The IPI callback is always enqueued and then executed upon return
   from userland. The ordering makes sure it will either IPI or execute
   upon return to userspace.

diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 4a4d56f77180..dc4b56da1747 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -137,10 +137,23 @@ static __always_inline int ct_state(void)
 	return ret;
 }
 
+static __always_inline int ct_state_cpu(int cpu)
+{
+	struct context_tracking *ct;
+
+	if (!context_tracking_enabled())
+		return CONTEXT_DISABLED;
+
+	ct = per_cpu_ptr(&context_tracking, cpu);
+
+	return atomic_read(&ct->state) & CT_STATE_MASK;
+}
+
 #else
 static __always_inline bool context_tracking_enabled(void) { return false; }
 static __always_inline bool context_tracking_enabled_cpu(int cpu) { return false; }
 static __always_inline bool context_tracking_enabled_this_cpu(void) { return false; }
+static inline int ct_state_cpu(int cpu) { return CONTEXT_DISABLED; }
 #endif /* CONFIG_CONTEXT_TRACKING_USER */
 
 #endif
diff --git a/kernel/entry/common.c b/kernel/entry/common.c
index 846add8394c4..cdc7e8a59acc 100644
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -10,6 +10,7 @@
 #include <linux/audit.h>
 #include <linux/tick.h>
 
+#include "../kernel/sched/smp.h"
 #include "common.h"
 
 #define CREATE_TRACE_POINTS
@@ -27,6 +28,10 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs)
 	instrumentation_begin();
 	kmsan_unpoison_entry_regs(regs);
 	trace_hardirqs_off_finish();
+
+	/* Flush delayed IPI queue on nohz_full */
+	if (context_tracking_enabled_this_cpu())
+		flush_smp_call_function_queue();
 	instrumentation_end();
 }
 
diff --git a/kernel/smp.c b/kernel/smp.c
index 06a413987a14..14b25d25ef3a 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -878,6 +878,8 @@ EXPORT_SYMBOL_GPL(smp_call_function_any);
  */
 #define SCF_WAIT	(1U << 0)
 #define SCF_RUN_LOCAL	(1U << 1)
+#define SCF_NO_USER	(1U << 2)
+
 
 static void smp_call_function_many_cond(const struct cpumask *mask,
 					smp_call_func_t func, void *info,
@@ -946,10 +948,13 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
 #endif
 			cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
 			if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
-				__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
-				nr_cpus++;
-				last_cpu = cpu;
-
+				if (!(scf_flags & SCF_NO_USER) ||
+				    !IS_ENABLED(CONFIG_GENERIC_ENTRY) ||
+				     ct_state_cpu(cpu) != CONTEXT_USER) {
+					__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
+					nr_cpus++;
+					last_cpu = cpu;
+				}
 				cfd_seq_store(pcpu->seq_ipi, this_cpu, cpu, CFD_SEQ_IPI);
 			} else {
 				cfd_seq_store(pcpu->seq_noipi, this_cpu, cpu, CFD_SEQ_NOIPI);
@@ -1121,6 +1126,24 @@ void __init smp_init(void)
 	smp_cpus_done(setup_max_cpus);
 }
 
+static void __on_each_cpu_cond_mask(smp_cond_func_t cond_func,
+				    smp_call_func_t func,
+				    void *info, bool wait, bool nouser,
+				    const struct cpumask *mask)
+{
+	unsigned int scf_flags = SCF_RUN_LOCAL;
+
+	if (wait)
+		scf_flags |= SCF_WAIT;
+
+	if (nouser)
+		scf_flags |= SCF_NO_USER;
+
+	preempt_disable();
+	smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
+	preempt_enable();
+}
+
 /*
  * on_each_cpu_cond(): Call a function on each processor for which
  * the supplied function cond_func returns true, optionally waiting
@@ -1146,17 +1169,18 @@ void __init smp_init(void)
 void on_each_cpu_cond_mask(smp_cond_func_t cond_func, smp_call_func_t func,
 			   void *info, bool wait, const struct cpumask *mask)
 {
-	unsigned int scf_flags = SCF_RUN_LOCAL;
-
-	if (wait)
-		scf_flags |= SCF_WAIT;
-
-	preempt_disable();
-	smp_call_function_many_cond(mask, func, info, scf_flags, cond_func);
-	preempt_enable();
+	__on_each_cpu_cond_mask(cond_func, func, info, wait, false, mask);
 }
 EXPORT_SYMBOL(on_each_cpu_cond_mask);
 
+void on_each_cpu_cond_nouser_mask(smp_cond_func_t cond_func,
+				  smp_call_func_t func,
+				  void *info, bool wait,
+				  const struct cpumask *mask)
+{
+	__on_each_cpu_cond_mask(cond_func, func, info, wait, true, mask);
+}
+
 static void do_nothing(void *unused)
 {
 }



^ permalink raw reply related	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 12:05         ` Frederic Weisbecker
@ 2023-04-05 12:31           ` Frederic Weisbecker
  2023-04-05 12:45           ` Valentin Schneider
  1 sibling, 0 replies; 37+ messages in thread
From: Frederic Weisbecker @ 2023-04-05 12:31 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Yair Podemsky, linux, mpe, npiggin, christophe.leroy, hca, gor,
	agordeev, borntraeger, svens, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, will, aneesh.kumar, akpm, arnd, keescook,
	paulmck, jpoimboe, samitolvanen, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, vschneid, dhildenb, alougovs

On Wed, Apr 05, 2023 at 02:05:13PM +0200, Frederic Weisbecker wrote:
> On Wed, Apr 05, 2023 at 01:41:48PM +0200, Peter Zijlstra wrote:
> 1) It has the advantage to check context tracking _after_ the llist_add(), so
>    it really can't be misused ordering-wise.
> 
> 2) The IPI callback is always enqueued and then executed upon return
>    from userland. The ordering makes sure it will either IPI or execute
>    upon return to userspace.

*from userspace


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 12:05         ` Frederic Weisbecker
  2023-04-05 12:31           ` Frederic Weisbecker
@ 2023-04-05 12:45           ` Valentin Schneider
  2023-04-06 13:38             ` Peter Zijlstra
  1 sibling, 1 reply; 37+ messages in thread
From: Valentin Schneider @ 2023-04-05 12:45 UTC (permalink / raw)
  To: Frederic Weisbecker, Peter Zijlstra
  Cc: Yair Podemsky, linux, mpe, npiggin, christophe.leroy, hca, gor,
	agordeev, borntraeger, svens, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, will, aneesh.kumar, akpm, arnd, keescook,
	paulmck, jpoimboe, samitolvanen, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, dhildenb, alougovs

On 05/04/23 14:05, Frederic Weisbecker wrote:
>  static void smp_call_function_many_cond(const struct cpumask *mask,
>                                       smp_call_func_t func, void *info,
> @@ -946,10 +948,13 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
>  #endif
>                       cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
>                       if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
> -				__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
> -				nr_cpus++;
> -				last_cpu = cpu;
> -
> +				if (!(scf_flags & SCF_NO_USER) ||
> +				    !IS_ENABLED(CONFIG_GENERIC_ENTRY) ||
> +				     ct_state_cpu(cpu) != CONTEXT_USER) {
> +					__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
> +					nr_cpus++;
> +					last_cpu = cpu;
> +				}

I've been hacking on something like this (CSD deferral for NOHZ-full),
and unfortunately this uses the CPU-local cfd_data storage thing, which
means any further smp_call_function() from the same CPU to the same
destination will spin on csd_lock_wait(), waiting for the target CPU to
come out of userspace and flush the queue - and we've just spent extra
effort into *not* disturbing it, so that'll take a while :(

I don't have much that is in a shareable state yet (though I'm supposed to
talk some more about it at OSPM in <2 weeks, so I'll have to get there),
but ATM I'm playing with
o a bitmask (like in [1]) for coalescable stuff such as do_sync_core() for
  x86 instruction patching
o a CSD-like queue for things that need to pass data around, using
  statically-allocated storage (so with a limit on how much it can be used) - the
  alternative being allocating a struct on sending, since you don't have a
  bound on how much crap you can queue on an undisturbed NOHZ-full CPU...

[1]: https://lore.kernel.org/all/20210929152429.067060646@infradead.org/



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 10:43   ` Frederic Weisbecker
  2023-04-05 11:10     ` Frederic Weisbecker
@ 2023-04-05 19:43     ` Marcelo Tosatti
  2023-04-05 19:54       ` Peter Zijlstra
  1 sibling, 1 reply; 37+ messages in thread
From: Marcelo Tosatti @ 2023-04-05 19:43 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Yair Podemsky, linux, mpe, npiggin, christophe.leroy, hca, gor,
	agordeev, borntraeger, svens, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, will, aneesh.kumar, akpm, peterz, arnd,
	keescook, paulmck, jpoimboe, samitolvanen, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs

On Wed, Apr 05, 2023 at 12:43:58PM +0200, Frederic Weisbecker wrote:
> On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > @@ -191,6 +192,20 @@ static void tlb_remove_table_smp_sync(void *arg)
> >  	/* Simply deliver the interrupt */
> >  }
> >  
> > +
> > +#ifdef CONFIG_CONTEXT_TRACKING
> > +static bool cpu_in_kernel(int cpu, void *info)
> > +{
> > +	struct context_tracking *ct = per_cpu_ptr(&context_tracking, cpu);
> 
> Like Peter said, an smp_mb() is required here before the read (unless there is
> already one between the page table modification and that ct->state read?).
> 
> So that you have this pairing:
> 
> 
>            WRITE page_table                  WRITE ct->state
> 	   smp_mb()                          smp_mb() // implied by atomic_fetch_or
>            READ ct->state                    READ page_table
> 
> > +	int state = atomic_read(&ct->state);
> > +	/* will return true only for cpus in kernel space */
> > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > +}
> 
> Also note that this doesn't strictly prevent userspace from being interrupted.
> You may well observe the CPU in kernel but it may receive the IPI later after
> switching to userspace.
> 
> We could arrange for avoiding that with marking ct->state with a pending work bit
> to flush upon user entry/exit but that's a bit more overhead so I first need to
> know about your expectations here, ie: can you tolerate such an occasional
> interruption or not?

Two points:

1) For a virtualized system, the overhead is not only that of executing the
IPI but:

	VM-exit
	run VM-exit code in host
	handle IPI
	run VM-entry code in host
	VM-entry

2) Depends on the application and the definition of "occasional".

For certain types of applications (for example PLC software or
RAN processing), upon occurrence of an event, it is necessary to
complete a certain task in a maximum amount of time (deadline).

One way to express this requirement is with a pair of numbers,
deadline time and execution time, where:

       	* deadline time: length of time between event and deadline.
       	* execution time: length of time it takes for processing of event
                          to occur on a particular hardware platform
                          (uninterrupted).





^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 11:10     ` Frederic Weisbecker
  2023-04-05 11:41       ` Peter Zijlstra
@ 2023-04-05 19:45       ` Marcelo Tosatti
  2023-04-05 19:52         ` Peter Zijlstra
  1 sibling, 1 reply; 37+ messages in thread
From: Marcelo Tosatti @ 2023-04-05 19:45 UTC (permalink / raw)
  To: Frederic Weisbecker
  Cc: Yair Podemsky, linux, mpe, npiggin, christophe.leroy, hca, gor,
	agordeev, borntraeger, svens, davem, tglx, mingo, bp,
	dave.hansen, x86, hpa, will, aneesh.kumar, akpm, peterz, arnd,
	keescook, paulmck, jpoimboe, samitolvanen, ardb, juerg.haefliger,
	rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs

On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
> On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> > On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > > +	int state = atomic_read(&ct->state);
> > > +	/* will return true only for cpus in kernel space */
> > > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > > +}
> > 
> > Also note that this doesn't strictly prevent userspace from being interrupted.
> > You may well observe the CPU in kernel but it may receive the IPI later after
> > switching to userspace.
> > 
> > We could arrange for avoiding that with marking ct->state with a pending work bit
> > to flush upon user entry/exit but that's a bit more overhead so I first need to
> > know about your expectations here, ie: can you tolerate such an occasional
> > interruption or not?
> 
> Bah, actually what can we do to prevent from that racy IPI? Not much I fear...

Use a mechanism other than an IPI to ensure an in-progress
get_user_pages_fast() walk has finished execution.

Isn't this codepath slow-path enough that it can use
synchronize_rcu_expedited?
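
A rough sketch of that suggestion, purely for illustration (whether an
expedited grace period is a valid substitute depends on the GUP-fast
IRQs-off section counting as a read-side critical section on all
configurations of interest, and the expedited machinery can itself IPI
non-idle CPUs - both are open questions here, not established facts):

void tlb_remove_table_sync_one(void)
{
	if (IS_ENABLED(CONFIG_PREEMPT_RT))	/* placeholder condition */
		synchronize_rcu_expedited();
	else
		smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
}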



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 19:45       ` Marcelo Tosatti
@ 2023-04-05 19:52         ` Peter Zijlstra
  2023-04-06 12:38           ` Marcelo Tosatti
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-05 19:52 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs

On Wed, Apr 05, 2023 at 04:45:32PM -0300, Marcelo Tosatti wrote:
> On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
> > On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> > > On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > > > +	int state = atomic_read(&ct->state);
> > > > +	/* will return true only for cpus in kernel space */
> > > > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > > > +}
> > > 
> > > Also note that this doesn't strictly prevent userspace from being interrupted.
> > > You may well observe the CPU in kernel but it may receive the IPI later after
> > > switching to userspace.
> > > 
> > > We could arrange for avoiding that with marking ct->state with a pending work bit
> > > to flush upon user entry/exit but that's a bit more overhead so I first need to
> > > know about your expectations here, ie: can you tolerate such an occasional
> > > interruption or not?
> > 
> > Bah, actually what can we do to prevent from that racy IPI? Not much I fear...
> 
> Use a mechanism other than an IPI to ensure an in-progress
> get_user_pages_fast() walk has finished execution.
> 
> Isn't this codepath slow-path enough that it can use
> synchronize_rcu_expedited?

To actually hit this path you're doing something really dodgy.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 19:43     ` Marcelo Tosatti
@ 2023-04-05 19:54       ` Peter Zijlstra
  2023-04-06 12:49         ` Marcelo Tosatti
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-05 19:54 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs

On Wed, Apr 05, 2023 at 04:43:14PM -0300, Marcelo Tosatti wrote:

> Two points:
> 
> 1) For a virtualized system, the overhead is not only that of executing the
> IPI but:
> 
> 	VM-exit
> 	run VM-exit code in host
> 	handle IPI
> 	run VM-entry code in host
> 	VM-entry

I thought we could do IPIs without VMexit these days? Also virt... /me
walks away.

> 2) Depends on the application and the definition of "occasional".
> 
> For certain types of applications (for example PLC software or
> RAN processing), upon occurrence of an event, it is necessary to
> complete a certain task in a maximum amount of time (deadline).

If the application is properly NOHZ_FULL and never does a kernel entry,
it will never get that IPI. If it is a pile of shit and does kernel
entries while it pretends to be NOHZ_FULL it gets to keep the pieces and
no amount of crying will get me to care.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 19:52         ` Peter Zijlstra
@ 2023-04-06 12:38           ` Marcelo Tosatti
  2023-04-06 13:29             ` Peter Zijlstra
  0 siblings, 1 reply; 37+ messages in thread
From: Marcelo Tosatti @ 2023-04-06 12:38 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs

On Wed, Apr 05, 2023 at 09:52:26PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 05, 2023 at 04:45:32PM -0300, Marcelo Tosatti wrote:
> > On Wed, Apr 05, 2023 at 01:10:07PM +0200, Frederic Weisbecker wrote:
> > > On Wed, Apr 05, 2023 at 12:44:04PM +0200, Frederic Weisbecker wrote:
> > > > On Tue, Apr 04, 2023 at 04:42:24PM +0300, Yair Podemsky wrote:
> > > > > +	int state = atomic_read(&ct->state);
> > > > > +	/* will return true only for cpus in kernel space */
> > > > > +	return state & CT_STATE_MASK == CONTEXT_KERNEL;
> > > > > +}
> > > > 
> > > > Also note that this doesn't strictly prevent userspace from being interrupted.
> > > > You may well observe the CPU in kernel but it may receive the IPI later after
> > > > switching to userspace.
> > > > 
> > > > We could arrange for avoiding that with marking ct->state with a pending work bit
> > > > to flush upon user entry/exit but that's a bit more overhead so I first need to
> > > > know about your expectations here, ie: can you tolerate such an occasional
> > > > interruption or not?
> > > 
> > > Bah, actually what can we do to prevent from that racy IPI? Not much I fear...
> > 
> > Use a mechanism other than an IPI to ensure an in-progress
> > get_user_pages_fast() walk has finished execution.
> > 
> > Isn't this codepath slow-path enough that it can use
> > synchronize_rcu_expedited?
> 
> To actually hit this path you're doing something really dodgy.

Apparently khugepaged is using the same infrastructure:

$ grep tlb_remove_table khugepaged.c 
	tlb_remove_table_sync_one();
	tlb_remove_table_sync_one();

So just enabling khugepaged will hit that path.



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 19:54       ` Peter Zijlstra
@ 2023-04-06 12:49         ` Marcelo Tosatti
  2023-04-06 13:32           ` Peter Zijlstra
  0 siblings, 1 reply; 37+ messages in thread
From: Marcelo Tosatti @ 2023-04-06 12:49 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs

On Wed, Apr 05, 2023 at 09:54:57PM +0200, Peter Zijlstra wrote:
> On Wed, Apr 05, 2023 at 04:43:14PM -0300, Marcelo Tosatti wrote:
> 
> > Two points:
> > 
> > 1) For a virtualized system, the overhead is not only that of executing the
> > IPI but:
> > 
> > 	VM-exit
> > 	run VM-exit code in host
> > 	handle IPI
> > 	run VM-entry code in host
> > 	VM-entry
> 
> I thought we could do IPIs without VMexit these days? 

Yes, IPIs to vCPU (guest context). In this case we can consider
an IPI to the host pCPU (which requires VM-exit from guest context).

> Also virt... /me walks away.
> 
> > 2) Depends on the application and the definition of "occasional".
> > 
> > For certain types of applications (for example PLC software or
> > RAN processing), upon occurrence of an event, it is necessary to
> > complete a certain task in a maximum amount of time (deadline).
> 
> If the application is properly NOHZ_FULL and never does a kernel entry,
> it will never get that IPI. If it is a pile of shit and does kernel
> entries while it pretends to be NOHZ_FULL it gets to keep the pieces and
> no amount of crying will get me to care.

I suppose it's common practice to use certain system calls in
latency-sensitive applications, for example nanosleep. Some examples:

1) cyclictest		(nanosleep)
2) PLC programs		(nanosleep)

A system call does not necessarily have to take locks, does it?

Or even if the application makes system calls, but runs under a VM,
then you are requiring it to never VM-exit.

This reduces the flexibility of developing such applications.





^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 12:38           ` Marcelo Tosatti
@ 2023-04-06 13:29             ` Peter Zijlstra
  2023-04-06 14:04               ` Peter Zijlstra
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-06 13:29 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs

On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:

> > To actually hit this path you're doing something really dodgy.
> 
> Apparently khugepaged is using the same infrastructure:
> 
> $ grep tlb_remove_table khugepaged.c 
> 	tlb_remove_table_sync_one();
> 	tlb_remove_table_sync_one();
> 
> So just enabling khugepaged will hit that path.

Urgh, WTF..

Let me go read that stuff :/


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 12:49         ` Marcelo Tosatti
@ 2023-04-06 13:32           ` Peter Zijlstra
  2023-04-19 11:01             ` Marcelo Tosatti
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-06 13:32 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs

On Thu, Apr 06, 2023 at 09:49:22AM -0300, Marcelo Tosatti wrote:

> > > 2) Depends on the application and the definition of "occasional".
> > > 
> > > For certain types of applications (for example PLC software or
> > > RAN processing), upon occurrence of an event, it is necessary to
> > > complete a certain task in a maximum amount of time (deadline).
> > 
> > If the application is properly NOHZ_FULL and never does a kernel entry,
> > it will never get that IPI. If it is a pile of shit and does kernel
> > entries while it pretends to be NOHZ_FULL it gets to keep the pieces and
> > no amount of crying will get me to care.
> 
> I suppose it's common practice to use certain system calls in
> latency-sensitive applications, for example nanosleep. Some examples:
> 
> 1) cyclictest		(nanosleep)

cyclictest is not a NOHZ_FULL application; if you think it is, you're
deluded.

> 2) PLC programs		(nanosleep)

What's a PLC? Programmable Logic Circuit?

> A system call does not necessarily have to take locks, does it?

This all is unrelated to locks

> Or even if the application makes system calls, but runs under a VM,
> then you are requiring it to never VM-exit.

That seems to be a goal for performance anyway.

> This reduces the flexibility of developing such applications.

Yeah, that's the cards you're dealt, deal with it.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-05 12:45           ` Valentin Schneider
@ 2023-04-06 13:38             ` Peter Zijlstra
  2023-04-06 14:11               ` Valentin Schneider
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-06 13:38 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, dhildenb, alougovs

On Wed, Apr 05, 2023 at 01:45:02PM +0100, Valentin Schneider wrote:
> On 05/04/23 14:05, Frederic Weisbecker wrote:
> >  static void smp_call_function_many_cond(const struct cpumask *mask,
> >                                       smp_call_func_t func, void *info,
> > @@ -946,10 +948,13 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
> >  #endif
> >                       cfd_seq_store(pcpu->seq_queue, this_cpu, cpu, CFD_SEQ_QUEUE);
> >                       if (llist_add(&csd->node.llist, &per_cpu(call_single_queue, cpu))) {
> > -				__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
> > -				nr_cpus++;
> > -				last_cpu = cpu;
> > -
> > +				if (!(scf_flags & SCF_NO_USER) ||
> > +				    !IS_ENABLED(CONFIG_GENERIC_ENTRY) ||
> > +				     ct_state_cpu(cpu) != CONTEXT_USER) {
> > +					__cpumask_set_cpu(cpu, cfd->cpumask_ipi);
> > +					nr_cpus++;
> > +					last_cpu = cpu;
> > +				}
> 
> I've been hacking on something like this (CSD deferral for NOHZ-full),
> and unfortunately this uses the CPU-local cfd_data storage thing, which
> means any further smp_call_function() from the same CPU to the same
> destination will spin on csd_lock_wait(), waiting for the target CPU to
> come out of userspace and flush the queue - and we've just spent extra
> effort into *not* disturbing it, so that'll take a while :(

I'm not sure I buy into deferring stuff.. a NOHZ_FULL cpu might 'never'
come back. Queueing data just in case it does seems wasteful.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 13:29             ` Peter Zijlstra
@ 2023-04-06 14:04               ` Peter Zijlstra
  2023-04-06 14:42                 ` David Hildenbrand
  2023-04-06 15:02                 ` Peter Zijlstra
  0 siblings, 2 replies; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-06 14:04 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs, jannh

On Thu, Apr 06, 2023 at 03:29:28PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:
> 
> > > To actually hit this path you're doing something really dodgy.
> > 
> > Apparently khugepaged is using the same infrastructure:
> > 
> > $ grep tlb_remove_table khugepaged.c 
> > 	tlb_remove_table_sync_one();
> > 	tlb_remove_table_sync_one();
> > 
> > So just enabling khugepaged will hit that path.
> 
> Urgh, WTF..
> 
> Let me go read that stuff :/

At the very least the one on collapse_and_free_pmd() could easily become
a call_rcu() based free.

I'm not sure I'm following what collapse_huge_page() does just yet.
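
For reference, a minimal sketch of what a call_rcu() based free of the
collapsed page table might look like, assuming pgtable_t is a struct page
pointer as on x86 (the real change would have to go through the
architecture's pte_free() path and keep the mm_dec_nr_ptes() accounting
intact):

static void pte_free_rcu(struct rcu_head *head)
{
	struct page *page = container_of(head, struct page, rcu_head);

	pgtable_pte_page_dtor(page);
	__free_page(page);
}

/* ... and in collapse_and_free_pmd(), instead of
 * tlb_remove_table_sync_one() followed by an immediate pte_free(): */
call_rcu(&pgtable->rcu_head, pte_free_rcu);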


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 13:38             ` Peter Zijlstra
@ 2023-04-06 14:11               ` Valentin Schneider
  2023-04-06 14:39                 ` Peter Zijlstra
  0 siblings, 1 reply; 37+ messages in thread
From: Valentin Schneider @ 2023-04-06 14:11 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, dhildenb, alougovs

On 06/04/23 15:38, Peter Zijlstra wrote:
> On Wed, Apr 05, 2023 at 01:45:02PM +0100, Valentin Schneider wrote:
>>
>> I've been hacking on something like this (CSD deferral for NOHZ-full),
>> and unfortunately this uses the CPU-local cfd_data storage thing, which
>> means any further smp_call_function() from the same CPU to the same
>> destination will spin on csd_lock_wait(), waiting for the target CPU to
>> come out of userspace and flush the queue - and we've just spent extra
>> effort into *not* disturbing it, so that'll take a while :(
>
> I'm not sure I buy into deferring stuff.. a NOHZ_FULL cpu might 'never'
> come back. Queueing data just in case it does seems wasteful.

Putting those callbacks straight into the bin would make my life much
easier!

Unfortunately, even if they really should, I don't believe all of the
things being crammed onto NOHZ_FULL CPUs have the same definition of
'never' as we do :/



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 14:11               ` Valentin Schneider
@ 2023-04-06 14:39                 ` Peter Zijlstra
  0 siblings, 0 replies; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-06 14:39 UTC (permalink / raw)
  To: Valentin Schneider
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	mtosatti, dhildenb, alougovs

On Thu, Apr 06, 2023 at 03:11:52PM +0100, Valentin Schneider wrote:
> On 06/04/23 15:38, Peter Zijlstra wrote:
> > On Wed, Apr 05, 2023 at 01:45:02PM +0100, Valentin Schneider wrote:
> >>
> >> I've been hacking on something like this (CSD deferral for NOHZ-full),
> >> and unfortunately this uses the CPU-local cfd_data storage thing, which
> >> means any further smp_call_function() from the same CPU to the same
> >> destination will spin on csd_lock_wait(), waiting for the target CPU to
> >> come out of userspace and flush the queue - and we've just spent extra
> >> effort into *not* disturbing it, so that'll take a while :(
> >
> > I'm not sure I buy into deferring stuff.. a NOHZ_FULL cpu might 'never'
> > come back. Queueing data just in case it does seems wasteful.
> 
> Putting those callbacks straight into the bin would make my life much
> easier!

Well, it's either they get inhibited at the source like the parent patch
does, or they go through. I really don't see a sane middle way here.

> Unfortunately, even if they really should, I don't believe all of the
> things being crammed onto NOHZ_FULL CPUs have the same definition of
> 'never' as we do :/

That's not entirely the point; the point is that there are proper
NOHZ_FULL users that won't return to the kernel until the machine shuts
down. Buffering stuff for them is more or less a direct memory leak.



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 14:04               ` Peter Zijlstra
@ 2023-04-06 14:42                 ` David Hildenbrand
  2023-04-06 15:06                   ` Peter Zijlstra
  2023-04-06 15:02                 ` Peter Zijlstra
  1 sibling, 1 reply; 37+ messages in thread
From: David Hildenbrand @ 2023-04-06 14:42 UTC (permalink / raw)
  To: Peter Zijlstra, Marcelo Tosatti
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs, jannh

On 06.04.23 16:04, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 03:29:28PM +0200, Peter Zijlstra wrote:
>> On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:
>>
>>>> To actually hit this path you're doing something really dodgy.
>>>
>>> Apparently khugepaged is using the same infrastructure:
>>>
>>> $ grep tlb_remove_table khugepaged.c
>>> 	tlb_remove_table_sync_one();
>>> 	tlb_remove_table_sync_one();
>>>
>>> So just enabling khugepaged will hit that path.
>>
>> Urgh, WTF..
>>
>> Let me go read that stuff :/
> 
> At the very least the one on collapse_and_free_pmd() could easily become
> a call_rcu() based free.
> 
> I'm not sure I'm following what collapse_huge_page() does just yet.

It wants to replace a leaf page table by a THP (Transparent Huge Page 
mapped by a PMD). So we want to rip out a leaf page table while other 
code (GUP-fast) might still be walking it. In contrast to freeing the 
page table, we put it into a list where it can be reused when having to
PTE-map a THP again.

Now, similar to after freeing the page table, someone else could reuse 
that page table and modify it.

If we have GUP-fast walking the page table while that is happening, 
we're in trouble. So we have to make sure GUP-fast is done before 
enqueuing the now-free page table.

That's why the tlb_remove_table_sync_one() was recently added (by Jann 
IIRC).
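
Spelled out, the property being relied on is that GUP-fast walks the page
tables with interrupts disabled, so a synchronous IPI broadcast cannot
complete while such a walk is in flight. Simplified from mm/gup.c (the real
function also handles seqcount retries and the various GUP-fast config
options):

static unsigned long lockless_pages_from_mm(unsigned long start,
					    unsigned long end,
					    unsigned int gup_flags,
					    struct page **pages)
{
	unsigned long flags;
	int nr_pinned = 0;

	/* This IRQs-off section is what tlb_remove_table_sync_one()'s IPI
	 * broadcast waits out before the old page table is reused. */
	local_irq_save(flags);
	gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
	local_irq_restore(flags);

	return nr_pinned;
}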

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 14:04               ` Peter Zijlstra
  2023-04-06 14:42                 ` David Hildenbrand
@ 2023-04-06 15:02                 ` Peter Zijlstra
  2023-04-06 15:51                   ` David Hildenbrand
  1 sibling, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-06 15:02 UTC (permalink / raw)
  To: Marcelo Tosatti
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs, jannh

On Thu, Apr 06, 2023 at 04:04:23PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 03:29:28PM +0200, Peter Zijlstra wrote:
> > On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:
> > 
> > > > To actually hit this path you're doing something really dodgy.
> > > 
> > > Apparently khugepaged is using the same infrastructure:
> > > 
> > > $ grep tlb_remove_table khugepaged.c 
> > > 	tlb_remove_table_sync_one();
> > > 	tlb_remove_table_sync_one();
> > > 
> > > So just enabling khugepaged will hit that path.
> > 
> > Urgh, WTF..
> > 
> > Let me go read that stuff :/
> 
> At the very least the one on collapse_and_free_pmd() could easily become
> a call_rcu() based free.
> 
> I'm not sure I'm following what collapse_huge_page() does just yet.

DavidH, what do you think about reviving Jann's patches here:

  https://bugs.chromium.org/p/project-zero/issues/detail?id=2365#c1

Those are far more invasive, but afaict they seem to do the right thing.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 14:42                 ` David Hildenbrand
@ 2023-04-06 15:06                   ` Peter Zijlstra
  0 siblings, 0 replies; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-06 15:06 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Marcelo Tosatti, Frederic Weisbecker, Yair Podemsky, linux, mpe,
	npiggin, christophe.leroy, hca, gor, agordeev, borntraeger,
	svens, davem, tglx, mingo, bp, dave.hansen, x86, hpa, will,
	aneesh.kumar, akpm, arnd, keescook, paulmck, jpoimboe,
	samitolvanen, ardb, juerg.haefliger, rmk+kernel, geert+renesas,
	tony, linus.walleij, sebastian.reichel, nick.hawkins,
	linux-kernel, linux-arm-kernel, linuxppc-dev, linux-s390,
	sparclinux, linux-arch, linux-mm, vschneid, dhildenb, alougovs,
	jannh

On Thu, Apr 06, 2023 at 04:42:02PM +0200, David Hildenbrand wrote:
> On 06.04.23 16:04, Peter Zijlstra wrote:
> > On Thu, Apr 06, 2023 at 03:29:28PM +0200, Peter Zijlstra wrote:
> > > On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:
> > > 
> > > > > To actually hit this path you're doing something really dodgy.
> > > > 
> > > > Apparently khugepaged is using the same infrastructure:
> > > > 
> > > > $ grep tlb_remove_table khugepaged.c
> > > > 	tlb_remove_table_sync_one();
> > > > 	tlb_remove_table_sync_one();
> > > > 
> > > > So just enabling khugepaged will hit that path.
> > > 
> > > Urgh, WTF..
> > > 
> > > Let me go read that stuff :/
> > 
> > At the very least the one on collapse_and_free_pmd() could easily become
> > a call_rcu() based free.
> > 
> > I'm not sure I'm following what collapse_huge_page() does just yet.
> 
> It wants to replace a leaf page table by a THP (Transparent Huge Page mapped
> by a PMD). So we want to rip out a leaf page table while other code
> (GUP-fast) might still be walking it. 

Right, I got that far.

> In contrast to freeing the page table,
> we put it into a list where it can be reused when having to PTE-map a THP
> again.

Yeah, this is the bit I couldn't find, that code is a bit of a maze.

> Now, similar to after freeing the page table, someone else could reuse that
> page table and modify it.

So ideally we'll RCU free the page instead of sticking it on that list.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 15:02                 ` Peter Zijlstra
@ 2023-04-06 15:51                   ` David Hildenbrand
  2023-04-06 18:27                     ` Peter Zijlstra
  0 siblings, 1 reply; 37+ messages in thread
From: David Hildenbrand @ 2023-04-06 15:51 UTC (permalink / raw)
  To: Peter Zijlstra, Marcelo Tosatti
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs, jannh, Yang Shi

On 06.04.23 17:02, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 04:04:23PM +0200, Peter Zijlstra wrote:
>> On Thu, Apr 06, 2023 at 03:29:28PM +0200, Peter Zijlstra wrote:
>>> On Thu, Apr 06, 2023 at 09:38:50AM -0300, Marcelo Tosatti wrote:
>>>
>>>>> To actually hit this path you're doing something really dodgy.
>>>>
>>>> Apparently khugepaged is using the same infrastructure:
>>>>
>>>> $ grep tlb_remove_table khugepaged.c
>>>> 	tlb_remove_table_sync_one();
>>>> 	tlb_remove_table_sync_one();
>>>>
>>>> So just enabling khugepaged will hit that path.
>>>
>>> Urgh, WTF..
>>>
>>> Let me go read that stuff :/
>>
>> At the very least the one on collapse_and_free_pmd() could easily become
>> a call_rcu() based free.
>>
>> I'm not sure I'm following what collapse_huge_page() does just yet.
> 
> DavidH, what do you think about reviving Jann's patches here:
> 
>    https://bugs.chromium.org/p/project-zero/issues/detail?id=2365#c1
> 
> Those are far more invasive, but afaict they seem to do the right thing.
> 

I recall seeing those while discussed on security@kernel.org. What we 
currently have was (IMHO for good reasons) deemed better to fix the 
issue, especially when caring about backports and getting it right.

The alternative that was discussed in that context IIRC was to simply 
allocate a fresh page table, place the fresh page table into the list 
instead, and simply free the old page table (then using common machinery).

TBH, I'd wish (and recently raised) that we could just stop wasting 
memory on page tables for THPs that are maybe never going to get 
PTE-mapped ... and eventually just allocate on demand (with some 
caching?) and handle the places where we're OOM and cannot PTE-map a THP 
in some decent way.

... instead of trying to figure out how to deal with these page tables 
we cannot free but have to special-case simply because of GUP-fast.
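
A rough sketch of that alternative, with the caveat that the exact call site
and locking context in khugepaged are assumptions here:

	/* Deposit a freshly allocated page table for a later PTE-mapping of
	 * the THP, and free the old one through the normal mmu_gather path,
	 * which already knows how to coexist with GUP-fast. */
	pgtable_t fresh = pte_alloc_one(mm);

	if (fresh) {
		pgtable_trans_huge_deposit(mm, pmdp, fresh);
		tlb_remove_table(tlb, old_pgtable);
	} else {
		/* Fall back to today's behaviour under memory pressure. */
		tlb_remove_table_sync_one();
		pgtable_trans_huge_deposit(mm, pmdp, old_pgtable);
	}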

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 15:51                   ` David Hildenbrand
@ 2023-04-06 18:27                     ` Peter Zijlstra
  2023-04-19 11:30                       ` David Hildenbrand
  0 siblings, 1 reply; 37+ messages in thread
From: Peter Zijlstra @ 2023-04-06 18:27 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Marcelo Tosatti, Frederic Weisbecker, Yair Podemsky, linux, mpe,
	npiggin, christophe.leroy, hca, gor, agordeev, borntraeger,
	svens, davem, tglx, mingo, bp, dave.hansen, x86, hpa, will,
	aneesh.kumar, akpm, arnd, keescook, paulmck, jpoimboe,
	samitolvanen, ardb, juerg.haefliger, rmk+kernel, geert+renesas,
	tony, linus.walleij, sebastian.reichel, nick.hawkins,
	linux-kernel, linux-arm-kernel, linuxppc-dev, linux-s390,
	sparclinux, linux-arch, linux-mm, vschneid, dhildenb, alougovs,
	jannh, Yang Shi

On Thu, Apr 06, 2023 at 05:51:52PM +0200, David Hildenbrand wrote:
> On 06.04.23 17:02, Peter Zijlstra wrote:

> > DavidH, what do you think about reviving Jann's patches here:
> > 
> >    https://bugs.chromium.org/p/project-zero/issues/detail?id=2365#c1
> > 
> > Those are far more invasive, but afaict they seem to do the right thing.
> > 
> 
> I recall seeing those while discussed on security@kernel.org. What we
> currently have was (IMHO for good reasons) deemed better to fix the issue,
> especially when caring about backports and getting it right.

Yes, and I think that was the right call. However, we can now revisit
without having the pressure of a known defect and backport
considerations.

> The alternative that was discussed in that context IIRC was to simply
> allocate a fresh page table, place the fresh page table into the list
> instead, and simply free the old page table (then using common machinery).
> 
> TBH, I'd wish (and recently raised) that we could just stop wasting memory
> on page tables for THPs that are maybe never going to get PTE-mapped ... and
> eventually just allocate on demand (with some caching?) and handle the
> places where we're OOM and cannot PTE-map a THP in some decent way.
> 
> ... instead of trying to figure out how to deal with these page tables we
> cannot free but have to special-case simply because of GUP-fast.

Not keeping them around sounds good to me, but I'm not *that* familiar
with the THP code, most of that happened after I stopped tracking mm. So
I'm not sure how feasible it is.

But it does look entirely feasible to rework this page-table freeing
along the lines Jann did.


^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 13:32           ` Peter Zijlstra
@ 2023-04-19 11:01             ` Marcelo Tosatti
  0 siblings, 0 replies; 37+ messages in thread
From: Marcelo Tosatti @ 2023-04-19 11:01 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Frederic Weisbecker, Yair Podemsky, linux, mpe, npiggin,
	christophe.leroy, hca, gor, agordeev, borntraeger, svens, davem,
	tglx, mingo, bp, dave.hansen, x86, hpa, will, aneesh.kumar, akpm,
	arnd, keescook, paulmck, jpoimboe, samitolvanen, ardb,
	juerg.haefliger, rmk+kernel, geert+renesas, tony, linus.walleij,
	sebastian.reichel, nick.hawkins, linux-kernel, linux-arm-kernel,
	linuxppc-dev, linux-s390, sparclinux, linux-arch, linux-mm,
	vschneid, dhildenb, alougovs

On Thu, Apr 06, 2023 at 03:32:06PM +0200, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 09:49:22AM -0300, Marcelo Tosatti wrote:
> 
> > > > 2) Depends on the application and the definition of "occasional".
> > > > 
> > > > For certain types of applications (for example PLC software or
> > > > RAN processing), upon occurrence of an event, it is necessary to
> > > > complete a certain task in a maximum amount of time (deadline).
> > > 
> > > If the application is properly NOHZ_FULL and never does a kernel entry,
> > > it will never get that IPI. If it is a pile of shit and does kernel
> > > entries while it pretends to be NOHZ_FULL it gets to keep the pieces and
> > > no amount of crying will get me to care.
> > 
> > I suppose it's common practice to use certain system calls in
> > latency-sensitive applications, for example nanosleep. Some examples:
> > 
> > 1) cyclictest		(nanosleep)
> 
> cyclictest is not a NOHZ_FULL application; if you think it is, you're
> deluded.

In the field (what end-users do in production):

cyclictest runs on NOHZ_FULL cores.
PLC type programs run on NOHZ_FULL cores.

So according to the physical reality I observe, I am not deluded.

> > 2) PLC programs		(nanosleep)
> 
> What's a PLC? Programmable Logic Circuit?

Programmable logic controller.

> > A system call does not necessarily have to take locks, does it?
> 
> This all is unrelated to locks

OK.

> > Or even if the application makes system calls, but runs under a VM,
> > then you are requiring it to never VM-exit.
> 
> That seems to be a goal for performance anyway.

Not sure what you mean.

> > This reduces the flexibility of developing such applications.
> 
> Yeah, that's the cards you're dealt, deal with it.

This is not what happens in the field.



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-06 18:27                     ` Peter Zijlstra
@ 2023-04-19 11:30                       ` David Hildenbrand
  2023-04-19 11:39                         ` Marcelo Tosatti
  0 siblings, 1 reply; 37+ messages in thread
From: David Hildenbrand @ 2023-04-19 11:30 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Marcelo Tosatti, Frederic Weisbecker, Yair Podemsky, linux, mpe,
	npiggin, christophe.leroy, hca, gor, agordeev, borntraeger,
	svens, davem, tglx, mingo, bp, dave.hansen, x86, hpa, will,
	aneesh.kumar, akpm, arnd, keescook, paulmck, jpoimboe,
	samitolvanen, ardb, juerg.haefliger, rmk+kernel, geert+renesas,
	tony, linus.walleij, sebastian.reichel, nick.hawkins,
	linux-kernel, linux-arm-kernel, linuxppc-dev, linux-s390,
	sparclinux, linux-arch, linux-mm, vschneid, dhildenb, alougovs,
	jannh, Yang Shi

On 06.04.23 20:27, Peter Zijlstra wrote:
> On Thu, Apr 06, 2023 at 05:51:52PM +0200, David Hildenbrand wrote:
>> On 06.04.23 17:02, Peter Zijlstra wrote:
> 
>>> DavidH, what do you think about reviving Jann's patches here:
>>>
>>>     https://bugs.chromium.org/p/project-zero/issues/detail?id=2365#c1
>>>
>>> Those are far more invasive, but afaict they seem to do the right thing.
>>>
>>
>> I recall seeing those while discussed on security@kernel.org. What we
>> currently have was (IMHO for good reasons) deemed better to fix the issue,
>> especially when caring about backports and getting it right.
> 
> Yes, and I think that was the right call. However, we can now revisit
> without having the pressure of a known defect and backport
> considerations.
> 
>> The alternative that was discussed in that context IIRC was to simply
>> allocate a fresh page table, place the fresh page table into the list
>> instead, and simply free the old page table (then using common machinery).
>>
>> TBH, I'd wish (and recently raised) that we could just stop wasting memory
>> on page tables for THPs that are maybe never going to get PTE-mapped ... and
>> eventually just allocate on demand (with some caching?) and handle the
>> places where we're OOM and cannot PTE-map a THP in some decent way.
>>
>> ... instead of trying to figure out how to deal with these page tables we
>> cannot free but have to special-case simply because of GUP-fast.
> 
> Not keeping them around sounds good to me, but I'm not *that* familiar
> with the THP code, most of that happened after I stopped tracking mm. So
> I'm not sure how feasible it is.
> 
> But it does look entirely feasible to rework this page-table freeing
> along the lines Jann did.

It's most probably more feasible, although the easiest would be to just 
allocate a fresh page table to deposit and free the old one using the 
mmu gatherer.

This way we can avoid the khugepaged use of tlb_remove_table_smp_sync(),
but not the tlb_remove_table_one() usage. I suspect khugepaged isn't really
relevant in RT kernels (IIRC, most RT setups disable THP completely).

tlb_remove_table_one() only triggers if __get_free_page(GFP_NOWAIT | 
__GFP_NOWARN) fails. IIUC, that can happen easily under memory pressure
because it doesn't wait for direct reclaim.

I don't know much about RT workloads (so I'd appreciate some feedback), 
but I guess we can run into memory pressure as well due to some !rt
housekeeping task on the system?
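
For reference, the path being described, simplified from mm/mmu_gather.c as
of the kernels under discussion:

void tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	struct mmu_table_batch **batch = &tlb->batch;

	if (*batch == NULL) {
		/* Opportunistic allocation, does not enter direct reclaim ... */
		*batch = (struct mmu_table_batch *)
			__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
		if (*batch == NULL) {
			/* ... so under memory pressure we fall back to the
			 * IPI-synchronised single-table free. */
			tlb_table_invalidate(tlb);
			tlb_remove_table_one(table);
			return;
		}
		(*batch)->nr = 0;
	}

	(*batch)->tables[(*batch)->nr++] = table;
	if ((*batch)->nr == MAX_TABLE_BATCH)
		tlb_table_flush(tlb);
}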

-- 
Thanks,

David / dhildenb



^ permalink raw reply	[flat|nested] 37+ messages in thread

* Re: [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode
  2023-04-19 11:30                       ` David Hildenbrand
@ 2023-04-19 11:39                         ` Marcelo Tosatti
  0 siblings, 0 replies; 37+ messages in thread
From: Marcelo Tosatti @ 2023-04-19 11:39 UTC (permalink / raw)
  To: David Hildenbrand
  Cc: Peter Zijlstra, Frederic Weisbecker, Yair Podemsky, linux, mpe,
	npiggin, christophe.leroy, hca, gor, agordeev, borntraeger,
	svens, davem, tglx, mingo, bp, dave.hansen, x86, hpa, will,
	aneesh.kumar, akpm, arnd, keescook, paulmck, jpoimboe,
	samitolvanen, ardb, juerg.haefliger, rmk+kernel, geert+renesas,
	tony, linus.walleij, sebastian.reichel, nick.hawkins,
	linux-kernel, linux-arm-kernel, linuxppc-dev, linux-s390,
	sparclinux, linux-arch, linux-mm, vschneid, dhildenb, alougovs,
	jannh, Yang Shi

On Wed, Apr 19, 2023 at 01:30:57PM +0200, David Hildenbrand wrote:
> On 06.04.23 20:27, Peter Zijlstra wrote:
> > On Thu, Apr 06, 2023 at 05:51:52PM +0200, David Hildenbrand wrote:
> > > On 06.04.23 17:02, Peter Zijlstra wrote:
> > 
> > > > DavidH, what do you think about reviving Jann's patches here:
> > > > 
> > > >     https://bugs.chromium.org/p/project-zero/issues/detail?id=2365#c1
> > > > 
> > > > Those are far more invasive, but afaict they seem to do the right thing.
> > > > 
> > > 
> > > I recall seeing those while discussed on security@kernel.org. What we
> > > currently have was (IMHO for good reasons) deemed better to fix the issue,
> > > especially when caring about backports and getting it right.
> > 
> > Yes, and I think that was the right call. However, we can now revisit
> > without having the pressure of a known defect and backport
> > considerations.
> > 
> > > The alternative that was discussed in that context IIRC was to simply
> > > allocate a fresh page table, place the fresh page table into the list
> > > instead, and simply free the old page table (then using common machinery).
> > > 
> > > TBH, I'd wish (and recently raised) that we could just stop wasting memory
> > > on page tables for THPs that are maybe never going to get PTE-mapped ... and
> > > eventually just allocate on demand (with some caching?) and handle the
> > > places where we're OOM and cannot PTE-map a THP in some decent way.
> > > 
> > > ... instead of trying to figure out how to deal with these page tables we
> > > cannot free but have to special-case simply because of GUP-fast.
> > 
> > Not keeping them around sounds good to me, but I'm not *that* familiar
> > with the THP code, most of that happened after I stopped tracking mm. So
> > I'm not sure how feasible it is.
> > 
> > But it does look entirely feasible to rework this page-table freeing
> > along the lines Jann did.
> 
> It's most probably more feasible, although the easiest would be to just
> allocate a fresh page table to deposit and free the old one using the mmu
> gatherer.
> 
> This way we can avoid the khugepaged use of tlb_remove_table_smp_sync(),
> but not the tlb_remove_table_one() usage. I suspect khugepaged isn't really
> relevant in RT kernels (IIRC, most RT setups disable THP completely).

People will disable khugepaged because it causes IPIs (and the fact that one
has to disable khugepaged is a configuration overhead, and a source of
headaches when configuring the realtime system, since one can forget to
do that, etc).

But people do want to run non-RT applications along with RT applications
(in case you have a single box in a privileged location, for example).

> 
> tlb_remove_table_one() only triggers if __get_free_page(GFP_NOWAIT |
> __GFP_NOWARN) fails. IIUC, that can happen easily under memory pressure
> because it doesn't wait for direct reclaim.
> 
> I don't know much about RT workloads (so I'd appreciate some feedback), but
> I guess we can run into memory pressure as well due to some !rt housekeeping
> task on the system?

Yes, exactly (memory for -RT app will be mlocked).



^ permalink raw reply	[flat|nested] 37+ messages in thread

end of thread, other threads:[~2023-04-19 11:41 UTC | newest]

Thread overview: 37+ messages
2023-04-04 13:42 [PATCH 0/3] send tlb_remove_table_smp_sync IPI only to necessary CPUs Yair Podemsky
2023-04-04 13:42 ` [PATCH 1/3] arch: Introduce ARCH_HAS_CPUMASK_BITS Yair Podemsky
2023-04-04 13:47   ` David Hildenbrand
2023-04-04 13:42 ` [PATCH 2/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to MM CPUs Yair Podemsky
2023-04-04 14:57   ` Peter Zijlstra
2023-04-04 13:42 ` [PATCH 3/3] mm/mmu_gather: send tlb_remove_table_smp_sync IPI only to CPUs in kernel mode Yair Podemsky
2023-04-04 14:03   ` David Hildenbrand
2023-04-04 15:12   ` Peter Zijlstra
2023-04-04 16:00     ` Peter Zijlstra
2023-04-05  0:53       ` Hillf Danton
2023-04-05 10:43   ` Frederic Weisbecker
2023-04-05 11:10     ` Frederic Weisbecker
2023-04-05 11:41       ` Peter Zijlstra
2023-04-05 12:00         ` David Hildenbrand
2023-04-05 12:05         ` Frederic Weisbecker
2023-04-05 12:31           ` Frederic Weisbecker
2023-04-05 12:45           ` Valentin Schneider
2023-04-06 13:38             ` Peter Zijlstra
2023-04-06 14:11               ` Valentin Schneider
2023-04-06 14:39                 ` Peter Zijlstra
2023-04-05 19:45       ` Marcelo Tosatti
2023-04-05 19:52         ` Peter Zijlstra
2023-04-06 12:38           ` Marcelo Tosatti
2023-04-06 13:29             ` Peter Zijlstra
2023-04-06 14:04               ` Peter Zijlstra
2023-04-06 14:42                 ` David Hildenbrand
2023-04-06 15:06                   ` Peter Zijlstra
2023-04-06 15:02                 ` Peter Zijlstra
2023-04-06 15:51                   ` David Hildenbrand
2023-04-06 18:27                     ` Peter Zijlstra
2023-04-19 11:30                       ` David Hildenbrand
2023-04-19 11:39                         ` Marcelo Tosatti
2023-04-05 19:43     ` Marcelo Tosatti
2023-04-05 19:54       ` Peter Zijlstra
2023-04-06 12:49         ` Marcelo Tosatti
2023-04-06 13:32           ` Peter Zijlstra
2023-04-19 11:01             ` Marcelo Tosatti
