All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v3 1/5] stacktrace: move filter_irq_stacks() to kernel/stacktrace.c
@ 2021-09-23 10:47 ` Marco Elver
  0 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 10:47 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

filter_irq_stacks() has little to do with the stackdepot implementation,
except that it is usually used by users (such as KASAN) of stackdepot to
reduce the stack trace.

However, filter_irq_stacks() itself is not useful without a stack trace
as obtained by stack_trace_save() and friends.

Therefore, move filter_irq_stacks() to kernel/stacktrace.c, so that new
users of filter_irq_stacks() do not have to start depending on
STACKDEPOT only for filter_irq_stacks().

Signed-off-by: Marco Elver <elver@google.com>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
---
v3:
* Rebase to -next due to conflicting stackdepot changes.

v2:
* New patch.
---
 include/linux/stackdepot.h |  2 --
 include/linux/stacktrace.h |  1 +
 kernel/stacktrace.c        | 30 ++++++++++++++++++++++++++++++
 lib/stackdepot.c           | 24 ------------------------
 4 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index ee03f11bb51a..c34b55a6e554 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -30,8 +30,6 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
 
 void stack_depot_print(depot_stack_handle_t stack);
 
-unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
-
 #ifdef CONFIG_STACKDEPOT
 int stack_depot_init(void);
 #else
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 9edecb494e9e..bef158815e83 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
 unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
 				   unsigned int size, unsigned int skipnr);
 unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
 
 /* Internal interfaces. Do not use in generic code */
 #ifdef CONFIG_ARCH_STACKWALK
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9f8117c7cfdd..9c625257023d 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -13,6 +13,7 @@
 #include <linux/export.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
+#include <linux/interrupt.h>
 
 /**
  * stack_trace_print - Print the entries in the stack trace
@@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
 #endif /* CONFIG_USER_STACKTRACE_SUPPORT */
 
 #endif /* !CONFIG_ARCH_STACKWALK */
+
+static inline bool in_irqentry_text(unsigned long ptr)
+{
+	return (ptr >= (unsigned long)&__irqentry_text_start &&
+		ptr < (unsigned long)&__irqentry_text_end) ||
+		(ptr >= (unsigned long)&__softirqentry_text_start &&
+		 ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+/**
+ * filter_irq_stacks - Find first IRQ stack entry in trace
+ * @entries:	Pointer to stack trace array
+ * @nr_entries:	Number of entries in the storage array
+ *
+ * Return: Number of trace entries until IRQ stack starts.
+ */
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries)
+{
+	unsigned int i;
+
+	for (i = 0; i < nr_entries; i++) {
+		if (in_irqentry_text(entries[i])) {
+			/* Include the irqentry function into the stack. */
+			return i + 1;
+		}
+	}
+	return nr_entries;
+}
+EXPORT_SYMBOL_GPL(filter_irq_stacks);
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 69c8c9b0d8d7..b437ae79aca1 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -20,7 +20,6 @@
  */
 
 #include <linux/gfp.h>
-#include <linux/interrupt.h>
 #include <linux/jhash.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -417,26 +416,3 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 	return __stack_depot_save(entries, nr_entries, alloc_flags, true);
 }
 EXPORT_SYMBOL_GPL(stack_depot_save);
-
-static inline int in_irqentry_text(unsigned long ptr)
-{
-	return (ptr >= (unsigned long)&__irqentry_text_start &&
-		ptr < (unsigned long)&__irqentry_text_end) ||
-		(ptr >= (unsigned long)&__softirqentry_text_start &&
-		 ptr < (unsigned long)&__softirqentry_text_end);
-}
-
-unsigned int filter_irq_stacks(unsigned long *entries,
-					     unsigned int nr_entries)
-{
-	unsigned int i;
-
-	for (i = 0; i < nr_entries; i++) {
-		if (in_irqentry_text(entries[i])) {
-			/* Include the irqentry function into the stack. */
-			return i + 1;
-		}
-	}
-	return nr_entries;
-}
-EXPORT_SYMBOL_GPL(filter_irq_stacks);
-- 
2.33.0.464.g1972c5931b-goog


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v3 1/5] stacktrace: move filter_irq_stacks() to kernel/stacktrace.c
@ 2021-09-23 10:47 ` Marco Elver
  0 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 10:47 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

filter_irq_stacks() has little to do with the stackdepot implementation,
except that it is usually used by users (such as KASAN) of stackdepot to
reduce the stack trace.

However, filter_irq_stacks() itself is not useful without a stack trace
as obtained by stack_trace_save() and friends.

Therefore, move filter_irq_stacks() to kernel/stacktrace.c, so that new
users of filter_irq_stacks() do not have to start depending on
STACKDEPOT only for filter_irq_stacks().

Signed-off-by: Marco Elver <elver@google.com>
Acked-by: Dmitry Vyukov <dvyukov@google.com>
---
v3:
* Rebase to -next due to conflicting stackdepot changes.

v2:
* New patch.
---
 include/linux/stackdepot.h |  2 --
 include/linux/stacktrace.h |  1 +
 kernel/stacktrace.c        | 30 ++++++++++++++++++++++++++++++
 lib/stackdepot.c           | 24 ------------------------
 4 files changed, 31 insertions(+), 26 deletions(-)

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index ee03f11bb51a..c34b55a6e554 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -30,8 +30,6 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
 
 void stack_depot_print(depot_stack_handle_t stack);
 
-unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
-
 #ifdef CONFIG_STACKDEPOT
 int stack_depot_init(void);
 #else
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index 9edecb494e9e..bef158815e83 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
 unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
 				   unsigned int size, unsigned int skipnr);
 unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
 
 /* Internal interfaces. Do not use in generic code */
 #ifdef CONFIG_ARCH_STACKWALK
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9f8117c7cfdd..9c625257023d 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -13,6 +13,7 @@
 #include <linux/export.h>
 #include <linux/kallsyms.h>
 #include <linux/stacktrace.h>
+#include <linux/interrupt.h>
 
 /**
  * stack_trace_print - Print the entries in the stack trace
@@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
 #endif /* CONFIG_USER_STACKTRACE_SUPPORT */
 
 #endif /* !CONFIG_ARCH_STACKWALK */
+
+static inline bool in_irqentry_text(unsigned long ptr)
+{
+	return (ptr >= (unsigned long)&__irqentry_text_start &&
+		ptr < (unsigned long)&__irqentry_text_end) ||
+		(ptr >= (unsigned long)&__softirqentry_text_start &&
+		 ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+/**
+ * filter_irq_stacks - Find first IRQ stack entry in trace
+ * @entries:	Pointer to stack trace array
+ * @nr_entries:	Number of entries in the storage array
+ *
+ * Return: Number of trace entries until IRQ stack starts.
+ */
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries)
+{
+	unsigned int i;
+
+	for (i = 0; i < nr_entries; i++) {
+		if (in_irqentry_text(entries[i])) {
+			/* Include the irqentry function into the stack. */
+			return i + 1;
+		}
+	}
+	return nr_entries;
+}
+EXPORT_SYMBOL_GPL(filter_irq_stacks);
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 69c8c9b0d8d7..b437ae79aca1 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -20,7 +20,6 @@
  */
 
 #include <linux/gfp.h>
-#include <linux/interrupt.h>
 #include <linux/jhash.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
@@ -417,26 +416,3 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 	return __stack_depot_save(entries, nr_entries, alloc_flags, true);
 }
 EXPORT_SYMBOL_GPL(stack_depot_save);
-
-static inline int in_irqentry_text(unsigned long ptr)
-{
-	return (ptr >= (unsigned long)&__irqentry_text_start &&
-		ptr < (unsigned long)&__irqentry_text_end) ||
-		(ptr >= (unsigned long)&__softirqentry_text_start &&
-		 ptr < (unsigned long)&__softirqentry_text_end);
-}
-
-unsigned int filter_irq_stacks(unsigned long *entries,
-					     unsigned int nr_entries)
-{
-	unsigned int i;
-
-	for (i = 0; i < nr_entries; i++) {
-		if (in_irqentry_text(entries[i])) {
-			/* Include the irqentry function into the stack. */
-			return i + 1;
-		}
-	}
-	return nr_entries;
-}
-EXPORT_SYMBOL_GPL(filter_irq_stacks);
-- 
2.33.0.464.g1972c5931b-goog



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v3 2/5] kfence: count unexpectedly skipped allocations
  2021-09-23 10:47 ` Marco Elver
@ 2021-09-23 10:48   ` Marco Elver
  -1 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 10:48 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

Maintain a counter to count allocations that are skipped due to being
incompatible (oversized, incompatible gfp flags) or no capacity.

This is to compute the fraction of allocations that could not be
serviced by KFENCE, which we expect to be rare.

Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
---
v2:
* Do not count deadlock-avoidance skips.
---
 mm/kfence/core.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 7a97db8bc8e7..249d75b7e5ee 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -112,6 +112,8 @@ enum kfence_counter_id {
 	KFENCE_COUNTER_FREES,
 	KFENCE_COUNTER_ZOMBIES,
 	KFENCE_COUNTER_BUGS,
+	KFENCE_COUNTER_SKIP_INCOMPAT,
+	KFENCE_COUNTER_SKIP_CAPACITY,
 	KFENCE_COUNTER_COUNT,
 };
 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -121,6 +123,8 @@ static const char *const counter_names[] = {
 	[KFENCE_COUNTER_FREES]		= "total frees",
 	[KFENCE_COUNTER_ZOMBIES]	= "zombie allocations",
 	[KFENCE_COUNTER_BUGS]		= "total bugs",
+	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
+	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
 };
 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
 
@@ -271,8 +275,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 		list_del_init(&meta->list);
 	}
 	raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
-	if (!meta)
+	if (!meta) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
 		return NULL;
+	}
 
 	if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
 		/*
@@ -740,8 +746,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	 * Perform size check before switching kfence_allocation_gate, so that
 	 * we don't disable KFENCE without making an allocation.
 	 */
-	if (size > PAGE_SIZE)
+	if (size > PAGE_SIZE) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
 		return NULL;
+	}
 
 	/*
 	 * Skip allocations from non-default zones, including DMA. We cannot
@@ -749,8 +757,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	 * properties (e.g. reside in DMAable memory).
 	 */
 	if ((flags & GFP_ZONEMASK) ||
-	    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
+	    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
 		return NULL;
+	}
 
 	/*
 	 * allocation_gate only needs to become non-zero, so it doesn't make
-- 
2.33.0.464.g1972c5931b-goog


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v3 2/5] kfence: count unexpectedly skipped allocations
@ 2021-09-23 10:48   ` Marco Elver
  0 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 10:48 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

Maintain a counter to count allocations that are skipped due to being
incompatible (oversized, incompatible gfp flags) or no capacity.

This is to compute the fraction of allocations that could not be
serviced by KFENCE, which we expect to be rare.

Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
---
v2:
* Do not count deadlock-avoidance skips.
---
 mm/kfence/core.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 7a97db8bc8e7..249d75b7e5ee 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -112,6 +112,8 @@ enum kfence_counter_id {
 	KFENCE_COUNTER_FREES,
 	KFENCE_COUNTER_ZOMBIES,
 	KFENCE_COUNTER_BUGS,
+	KFENCE_COUNTER_SKIP_INCOMPAT,
+	KFENCE_COUNTER_SKIP_CAPACITY,
 	KFENCE_COUNTER_COUNT,
 };
 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -121,6 +123,8 @@ static const char *const counter_names[] = {
 	[KFENCE_COUNTER_FREES]		= "total frees",
 	[KFENCE_COUNTER_ZOMBIES]	= "zombie allocations",
 	[KFENCE_COUNTER_BUGS]		= "total bugs",
+	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
+	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
 };
 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
 
@@ -271,8 +275,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 		list_del_init(&meta->list);
 	}
 	raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
-	if (!meta)
+	if (!meta) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
 		return NULL;
+	}
 
 	if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
 		/*
@@ -740,8 +746,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	 * Perform size check before switching kfence_allocation_gate, so that
 	 * we don't disable KFENCE without making an allocation.
 	 */
-	if (size > PAGE_SIZE)
+	if (size > PAGE_SIZE) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
 		return NULL;
+	}
 
 	/*
 	 * Skip allocations from non-default zones, including DMA. We cannot
@@ -749,8 +757,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	 * properties (e.g. reside in DMAable memory).
 	 */
 	if ((flags & GFP_ZONEMASK) ||
-	    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
+	    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
 		return NULL;
+	}
 
 	/*
 	 * allocation_gate only needs to become non-zero, so it doesn't make
-- 
2.33.0.464.g1972c5931b-goog



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v3 3/5] kfence: move saving stack trace of allocations into __kfence_alloc()
  2021-09-23 10:47 ` Marco Elver
@ 2021-09-23 10:48   ` Marco Elver
  -1 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 10:48 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

Move the saving of the stack trace of allocations into __kfence_alloc(),
so that the stack entries array can be used outside of
kfence_guarded_alloc() and we avoid potentially unwinding the stack
multiple times.

Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
---
v2:
* New patch.
---
 mm/kfence/core.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 249d75b7e5ee..db01814f8ff0 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -187,19 +187,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m
  * Update the object's metadata state, including updating the alloc/free stacks
  * depending on the state transition.
  */
-static noinline void metadata_update_state(struct kfence_metadata *meta,
-					   enum kfence_object_state next)
+static noinline void
+metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
+		      unsigned long *stack_entries, size_t num_stack_entries)
 {
 	struct kfence_track *track =
 		next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
 
 	lockdep_assert_held(&meta->lock);
 
-	/*
-	 * Skip over 1 (this) functions; noinline ensures we do not accidentally
-	 * skip over the caller by never inlining.
-	 */
-	track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+	if (stack_entries) {
+		memcpy(track->stack_entries, stack_entries,
+		       num_stack_entries * sizeof(stack_entries[0]));
+	} else {
+		/*
+		 * Skip over 1 (this) functions; noinline ensures we do not
+		 * accidentally skip over the caller by never inlining.
+		 */
+		num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+	}
+	track->num_stack_entries = num_stack_entries;
 	track->pid = task_pid_nr(current);
 	track->cpu = raw_smp_processor_id();
 	track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
@@ -261,7 +268,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
 	}
 }
 
-static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
+				  unsigned long *stack_entries, size_t num_stack_entries)
 {
 	struct kfence_metadata *meta = NULL;
 	unsigned long flags;
@@ -320,7 +328,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	addr = (void *)meta->addr;
 
 	/* Update remaining metadata. */
-	metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
+	metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
 	/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
 	WRITE_ONCE(meta->cache, cache);
 	meta->size = size;
@@ -400,7 +408,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 		memzero_explicit(addr, meta->size);
 
 	/* Mark the object as freed. */
-	metadata_update_state(meta, KFENCE_OBJECT_FREED);
+	metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
 
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
@@ -742,6 +750,9 @@ void kfence_shutdown_cache(struct kmem_cache *s)
 
 void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 {
+	unsigned long stack_entries[KFENCE_STACK_DEPTH];
+	size_t num_stack_entries;
+
 	/*
 	 * Perform size check before switching kfence_allocation_gate, so that
 	 * we don't disable KFENCE without making an allocation.
@@ -786,7 +797,9 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	if (!READ_ONCE(kfence_enabled))
 		return NULL;
 
-	return kfence_guarded_alloc(s, size, flags);
+	num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
+
+	return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
 }
 
 size_t kfence_ksize(const void *addr)
-- 
2.33.0.464.g1972c5931b-goog


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v3 3/5] kfence: move saving stack trace of allocations into __kfence_alloc()
@ 2021-09-23 10:48   ` Marco Elver
  0 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 10:48 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

Move the saving of the stack trace of allocations into __kfence_alloc(),
so that the stack entries array can be used outside of
kfence_guarded_alloc() and we avoid potentially unwinding the stack
multiple times.

Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
---
v2:
* New patch.
---
 mm/kfence/core.c | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 249d75b7e5ee..db01814f8ff0 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -187,19 +187,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m
  * Update the object's metadata state, including updating the alloc/free stacks
  * depending on the state transition.
  */
-static noinline void metadata_update_state(struct kfence_metadata *meta,
-					   enum kfence_object_state next)
+static noinline void
+metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
+		      unsigned long *stack_entries, size_t num_stack_entries)
 {
 	struct kfence_track *track =
 		next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
 
 	lockdep_assert_held(&meta->lock);
 
-	/*
-	 * Skip over 1 (this) functions; noinline ensures we do not accidentally
-	 * skip over the caller by never inlining.
-	 */
-	track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+	if (stack_entries) {
+		memcpy(track->stack_entries, stack_entries,
+		       num_stack_entries * sizeof(stack_entries[0]));
+	} else {
+		/*
+		 * Skip over 1 (this) functions; noinline ensures we do not
+		 * accidentally skip over the caller by never inlining.
+		 */
+		num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
+	}
+	track->num_stack_entries = num_stack_entries;
 	track->pid = task_pid_nr(current);
 	track->cpu = raw_smp_processor_id();
 	track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
@@ -261,7 +268,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
 	}
 }
 
-static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
+				  unsigned long *stack_entries, size_t num_stack_entries)
 {
 	struct kfence_metadata *meta = NULL;
 	unsigned long flags;
@@ -320,7 +328,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	addr = (void *)meta->addr;
 
 	/* Update remaining metadata. */
-	metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
+	metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
 	/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
 	WRITE_ONCE(meta->cache, cache);
 	meta->size = size;
@@ -400,7 +408,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 		memzero_explicit(addr, meta->size);
 
 	/* Mark the object as freed. */
-	metadata_update_state(meta, KFENCE_OBJECT_FREED);
+	metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
 
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
@@ -742,6 +750,9 @@ void kfence_shutdown_cache(struct kmem_cache *s)
 
 void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 {
+	unsigned long stack_entries[KFENCE_STACK_DEPTH];
+	size_t num_stack_entries;
+
 	/*
 	 * Perform size check before switching kfence_allocation_gate, so that
 	 * we don't disable KFENCE without making an allocation.
@@ -786,7 +797,9 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	if (!READ_ONCE(kfence_enabled))
 		return NULL;
 
-	return kfence_guarded_alloc(s, size, flags);
+	num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
+
+	return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
 }
 
 size_t kfence_ksize(const void *addr)
-- 
2.33.0.464.g1972c5931b-goog



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
  2021-09-23 10:47 ` Marco Elver
@ 2021-09-23 10:48   ` Marco Elver
  -1 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 10:48 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

One of KFENCE's main design principles is that with increasing uptime,
allocation coverage increases sufficiently to detect previously
undetected bugs.

We have observed that frequent long-lived allocations of the same
source (e.g. pagecache) tend to permanently fill up the KFENCE pool
with increasing system uptime, thus breaking the above requirement.
The workaround thus far had been increasing the sample interval and/or
increasing the KFENCE pool size, but is no reliable solution.

To ensure diverse coverage of allocations, limit currently covered
allocations of the same source once pool utilization reaches 75%
(configurable via `kfence.skip_covered_thresh`) or above. The effect is
retaining reasonable allocation coverage when the pool is close to full.

A side-effect is that this also limits frequent long-lived allocations
of the same source filling up the pool permanently.

Uniqueness of an allocation for coverage purposes is based on its
(partial) allocation stack trace (the source). A Counting Bloom filter
is used to check if an allocation is covered; if the allocation is
currently covered, the allocation is skipped by KFENCE.

Testing was done using:

	(a) a synthetic workload that performs frequent long-lived
	    allocations (default config values; sample_interval=1;
	    num_objects=63), and

	(b) normal desktop workloads on an otherwise idle machine where
	    the problem was first reported after a few days of uptime
	    (default config values).

In both test cases the sampled allocation rate no longer drops to zero
at any point. In the case of (b) we observe (after 2 days uptime) 15%
unique allocations in the pool, 77% pool utilization, with 20% "skipped
allocations (covered)".

Signed-off-by: Marco Elver <elver@google.com>
---
v3:
* Remove unneeded !alloc_stack_hash checks.
* Remove unneeded meta->alloc_stack_hash=0 in kfence_guarded_free().

v2:
* Switch to counting bloom filter to guarantee currently covered
  allocations being skipped.
* Use a module param for skip_covered threshold.
* Use kfence pool address as hash entropy.
* Use filter_irq_stacks().
---
 mm/kfence/core.c   | 103 ++++++++++++++++++++++++++++++++++++++++++++-
 mm/kfence/kfence.h |   2 +
 2 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index db01814f8ff0..58a0f6f1acc5 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -11,11 +11,13 @@
 #include <linux/bug.h>
 #include <linux/debugfs.h>
 #include <linux/irq_work.h>
+#include <linux/jhash.h>
 #include <linux/kcsan-checks.h>
 #include <linux/kfence.h>
 #include <linux/kmemleak.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
+#include <linux/log2.h>
 #include <linux/memblock.h>
 #include <linux/moduleparam.h>
 #include <linux/random.h>
@@ -82,6 +84,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
 };
 module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
 
+/* Pool usage% threshold when currently covered allocations are skipped. */
+static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
+module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+
 /* The pool of pages used for guard pages and objects. */
 char *__kfence_pool __ro_after_init;
 EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
@@ -105,6 +111,25 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
 /* Gates the allocation, ensuring only one succeeds in a given period. */
 atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
 
+/*
+ * A Counting Bloom filter of allocation coverage: limits currently covered
+ * allocations of the same source filling up the pool.
+ *
+ * Assuming a range of 15%-85% unique allocations in the pool at any point in
+ * time, the below parameters provide a probablity of 0.02-0.33 for false
+ * positive hits respectively:
+ *
+ *	P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
+ */
+#define ALLOC_COVERED_HNUM	2
+#define ALLOC_COVERED_SIZE	(1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
+#define ALLOC_COVERED_HNEXT(h)	(1664525 * (h) + 1013904223)
+#define ALLOC_COVERED_MASK	(ALLOC_COVERED_SIZE - 1)
+static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
+
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH 8UL
+
 /* Statistics counters for debugfs. */
 enum kfence_counter_id {
 	KFENCE_COUNTER_ALLOCATED,
@@ -114,6 +139,7 @@ enum kfence_counter_id {
 	KFENCE_COUNTER_BUGS,
 	KFENCE_COUNTER_SKIP_INCOMPAT,
 	KFENCE_COUNTER_SKIP_CAPACITY,
+	KFENCE_COUNTER_SKIP_COVERED,
 	KFENCE_COUNTER_COUNT,
 };
 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -125,11 +151,60 @@ static const char *const counter_names[] = {
 	[KFENCE_COUNTER_BUGS]		= "total bugs",
 	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
 	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
+	[KFENCE_COUNTER_SKIP_COVERED]	= "skipped allocations (covered)",
 };
 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
 
 /* === Internals ============================================================ */
 
+static inline bool should_skip_covered(void)
+{
+	unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
+
+	return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
+}
+
+static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
+{
+	/* Some randomness across reboots / different machines. */
+	u32 seed = (u32)((unsigned long)__kfence_pool >> (BITS_PER_LONG - 32));
+
+	num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
+	num_entries = filter_irq_stacks(stack_entries, num_entries);
+	return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), seed);
+}
+
+/*
+ * Adds (or subtracts) count @val for allocation stack trace hash
+ * @alloc_stack_hash from Counting Bloom filter.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+	int i;
+
+	for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+		atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+		alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+	}
+}
+
+/*
+ * Returns true if the allocation stack trace hash @alloc_stack_hash is
+ * currently contained (non-zero count) in Counting Bloom filter.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+	int i;
+
+	for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+		if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
+			return false;
+		alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+	}
+
+	return true;
+}
+
 static bool kfence_protect(unsigned long addr)
 {
 	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
@@ -269,7 +344,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
 }
 
 static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
-				  unsigned long *stack_entries, size_t num_stack_entries)
+				  unsigned long *stack_entries, size_t num_stack_entries,
+				  u32 alloc_stack_hash)
 {
 	struct kfence_metadata *meta = NULL;
 	unsigned long flags;
@@ -332,6 +408,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
 	WRITE_ONCE(meta->cache, cache);
 	meta->size = size;
+	meta->alloc_stack_hash = alloc_stack_hash;
+
 	for_each_canary(meta, set_canary_byte);
 
 	/* Set required struct page fields. */
@@ -344,6 +422,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
+	alloc_covered_add(alloc_stack_hash, 1);
+
 	/* Memory initialization. */
 
 	/*
@@ -412,6 +492,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
+	alloc_covered_add(meta->alloc_stack_hash, -1);
+
 	/* Protect to detect use-after-frees. */
 	kfence_protect((unsigned long)addr);
 
@@ -752,6 +834,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 {
 	unsigned long stack_entries[KFENCE_STACK_DEPTH];
 	size_t num_stack_entries;
+	u32 alloc_stack_hash;
 
 	/*
 	 * Perform size check before switching kfence_allocation_gate, so that
@@ -799,7 +882,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 
 	num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
 
-	return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
+	/*
+	 * Do expensive check for coverage of allocation in slow-path after
+	 * allocation_gate has already become non-zero, even though it might
+	 * mean not making any allocation within a given sample interval.
+	 *
+	 * This ensures reasonable allocation coverage when the pool is almost
+	 * full, including avoiding long-lived allocations of the same source
+	 * filling up the pool (e.g. pagecache allocations).
+	 */
+	alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
+	if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+		return NULL;
+	}
+
+	return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
+				    alloc_stack_hash);
 }
 
 size_t kfence_ksize(const void *addr)
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index c1f23c61e5f9..2a2d5de9d379 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -87,6 +87,8 @@ struct kfence_metadata {
 	/* Allocation and free stack information. */
 	struct kfence_track alloc_track;
 	struct kfence_track free_track;
+	/* For updating alloc_covered on frees. */
+	u32 alloc_stack_hash;
 };
 
 extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
-- 
2.33.0.464.g1972c5931b-goog


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
@ 2021-09-23 10:48   ` Marco Elver
  0 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 10:48 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

One of KFENCE's main design principles is that with increasing uptime,
allocation coverage increases sufficiently to detect previously
undetected bugs.

We have observed that frequent long-lived allocations of the same
source (e.g. pagecache) tend to permanently fill up the KFENCE pool
with increasing system uptime, thus breaking the above requirement.
The workaround thus far had been increasing the sample interval and/or
increasing the KFENCE pool size, but is no reliable solution.

To ensure diverse coverage of allocations, limit currently covered
allocations of the same source once pool utilization reaches 75%
(configurable via `kfence.skip_covered_thresh`) or above. The effect is
retaining reasonable allocation coverage when the pool is close to full.

A side-effect is that this also limits frequent long-lived allocations
of the same source filling up the pool permanently.

Uniqueness of an allocation for coverage purposes is based on its
(partial) allocation stack trace (the source). A Counting Bloom filter
is used to check if an allocation is covered; if the allocation is
currently covered, the allocation is skipped by KFENCE.

Testing was done using:

	(a) a synthetic workload that performs frequent long-lived
	    allocations (default config values; sample_interval=1;
	    num_objects=63), and

	(b) normal desktop workloads on an otherwise idle machine where
	    the problem was first reported after a few days of uptime
	    (default config values).

In both test cases the sampled allocation rate no longer drops to zero
at any point. In the case of (b) we observe (after 2 days uptime) 15%
unique allocations in the pool, 77% pool utilization, with 20% "skipped
allocations (covered)".

Signed-off-by: Marco Elver <elver@google.com>
---
v3:
* Remove unneeded !alloc_stack_hash checks.
* Remove unneeded meta->alloc_stack_hash=0 in kfence_guarded_free().

v2:
* Switch to counting bloom filter to guarantee currently covered
  allocations being skipped.
* Use a module param for skip_covered threshold.
* Use kfence pool address as hash entropy.
* Use filter_irq_stacks().
---
 mm/kfence/core.c   | 103 ++++++++++++++++++++++++++++++++++++++++++++-
 mm/kfence/kfence.h |   2 +
 2 files changed, 103 insertions(+), 2 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index db01814f8ff0..58a0f6f1acc5 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -11,11 +11,13 @@
 #include <linux/bug.h>
 #include <linux/debugfs.h>
 #include <linux/irq_work.h>
+#include <linux/jhash.h>
 #include <linux/kcsan-checks.h>
 #include <linux/kfence.h>
 #include <linux/kmemleak.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
+#include <linux/log2.h>
 #include <linux/memblock.h>
 #include <linux/moduleparam.h>
 #include <linux/random.h>
@@ -82,6 +84,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
 };
 module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
 
+/* Pool usage% threshold when currently covered allocations are skipped. */
+static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
+module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+
 /* The pool of pages used for guard pages and objects. */
 char *__kfence_pool __ro_after_init;
 EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
@@ -105,6 +111,25 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
 /* Gates the allocation, ensuring only one succeeds in a given period. */
 atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
 
+/*
+ * A Counting Bloom filter of allocation coverage: limits currently covered
+ * allocations of the same source filling up the pool.
+ *
+ * Assuming a range of 15%-85% unique allocations in the pool at any point in
+ * time, the below parameters provide a probablity of 0.02-0.33 for false
+ * positive hits respectively:
+ *
+ *	P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
+ */
+#define ALLOC_COVERED_HNUM	2
+#define ALLOC_COVERED_SIZE	(1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
+#define ALLOC_COVERED_HNEXT(h)	(1664525 * (h) + 1013904223)
+#define ALLOC_COVERED_MASK	(ALLOC_COVERED_SIZE - 1)
+static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
+
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH 8UL
+
 /* Statistics counters for debugfs. */
 enum kfence_counter_id {
 	KFENCE_COUNTER_ALLOCATED,
@@ -114,6 +139,7 @@ enum kfence_counter_id {
 	KFENCE_COUNTER_BUGS,
 	KFENCE_COUNTER_SKIP_INCOMPAT,
 	KFENCE_COUNTER_SKIP_CAPACITY,
+	KFENCE_COUNTER_SKIP_COVERED,
 	KFENCE_COUNTER_COUNT,
 };
 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -125,11 +151,60 @@ static const char *const counter_names[] = {
 	[KFENCE_COUNTER_BUGS]		= "total bugs",
 	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
 	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
+	[KFENCE_COUNTER_SKIP_COVERED]	= "skipped allocations (covered)",
 };
 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
 
 /* === Internals ============================================================ */
 
+static inline bool should_skip_covered(void)
+{
+	unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
+
+	return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
+}
+
+static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
+{
+	/* Some randomness across reboots / different machines. */
+	u32 seed = (u32)((unsigned long)__kfence_pool >> (BITS_PER_LONG - 32));
+
+	num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
+	num_entries = filter_irq_stacks(stack_entries, num_entries);
+	return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), seed);
+}
+
+/*
+ * Adds (or subtracts) count @val for allocation stack trace hash
+ * @alloc_stack_hash from Counting Bloom filter.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+	int i;
+
+	for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+		atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+		alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+	}
+}
+
+/*
+ * Returns true if the allocation stack trace hash @alloc_stack_hash is
+ * currently contained (non-zero count) in Counting Bloom filter.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+	int i;
+
+	for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+		if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
+			return false;
+		alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+	}
+
+	return true;
+}
+
 static bool kfence_protect(unsigned long addr)
 {
 	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
@@ -269,7 +344,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
 }
 
 static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
-				  unsigned long *stack_entries, size_t num_stack_entries)
+				  unsigned long *stack_entries, size_t num_stack_entries,
+				  u32 alloc_stack_hash)
 {
 	struct kfence_metadata *meta = NULL;
 	unsigned long flags;
@@ -332,6 +408,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
 	WRITE_ONCE(meta->cache, cache);
 	meta->size = size;
+	meta->alloc_stack_hash = alloc_stack_hash;
+
 	for_each_canary(meta, set_canary_byte);
 
 	/* Set required struct page fields. */
@@ -344,6 +422,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
+	alloc_covered_add(alloc_stack_hash, 1);
+
 	/* Memory initialization. */
 
 	/*
@@ -412,6 +492,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
+	alloc_covered_add(meta->alloc_stack_hash, -1);
+
 	/* Protect to detect use-after-frees. */
 	kfence_protect((unsigned long)addr);
 
@@ -752,6 +834,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 {
 	unsigned long stack_entries[KFENCE_STACK_DEPTH];
 	size_t num_stack_entries;
+	u32 alloc_stack_hash;
 
 	/*
 	 * Perform size check before switching kfence_allocation_gate, so that
@@ -799,7 +882,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 
 	num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
 
-	return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
+	/*
+	 * Do expensive check for coverage of allocation in slow-path after
+	 * allocation_gate has already become non-zero, even though it might
+	 * mean not making any allocation within a given sample interval.
+	 *
+	 * This ensures reasonable allocation coverage when the pool is almost
+	 * full, including avoiding long-lived allocations of the same source
+	 * filling up the pool (e.g. pagecache allocations).
+	 */
+	alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
+	if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+		return NULL;
+	}
+
+	return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
+				    alloc_stack_hash);
 }
 
 size_t kfence_ksize(const void *addr)
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index c1f23c61e5f9..2a2d5de9d379 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -87,6 +87,8 @@ struct kfence_metadata {
 	/* Allocation and free stack information. */
 	struct kfence_track alloc_track;
 	struct kfence_track free_track;
+	/* For updating alloc_covered on frees. */
+	u32 alloc_stack_hash;
 };
 
 extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
-- 
2.33.0.464.g1972c5931b-goog



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v3 5/5] kfence: add note to documentation about skipping covered allocations
  2021-09-23 10:47 ` Marco Elver
@ 2021-09-23 10:48   ` Marco Elver
  -1 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 10:48 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

Add a note briefly mentioning the new policy about "skipping currently
covered allocations if pool close to full." Since this has a notable
impact on KFENCE's bug-detection ability on systems with large uptimes,
it is worth pointing out the feature.

Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
---
v2:
* Rewrite.
---
 Documentation/dev-tools/kfence.rst | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
index 0fbe3308bf37..d45f952986ae 100644
--- a/Documentation/dev-tools/kfence.rst
+++ b/Documentation/dev-tools/kfence.rst
@@ -269,6 +269,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
 first, and the chances of detecting use-after-frees of recently freed objects
 is increased.
 
+If pool utilization reaches 75% (default) or above, to reduce the risk of the
+pool eventually being fully occupied by allocated objects yet ensure diverse
+coverage of allocations, KFENCE limits currently covered allocations of the
+same source from further filling up the pool. The "source" of an allocation is
+based on its partial allocation stack trace. A side-effect is that this also
+limits frequent long-lived allocations (e.g. pagecache) of the same source
+filling up the pool permanently, which is the most common risk for the pool
+becoming full and the sampled allocation rate dropping to zero. The threshold
+at which to start limiting currently covered allocations can be configured via
+the boot parameter ``kfence.skip_covered_thresh`` (pool usage%).
+
 Interface
 ---------
 
-- 
2.33.0.464.g1972c5931b-goog


^ permalink raw reply related	[flat|nested] 28+ messages in thread

* [PATCH v3 5/5] kfence: add note to documentation about skipping covered allocations
@ 2021-09-23 10:48   ` Marco Elver
  0 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 10:48 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

Add a note briefly mentioning the new policy about "skipping currently
covered allocations if pool close to full." Since this has a notable
impact on KFENCE's bug-detection ability on systems with large uptimes,
it is worth pointing out the feature.

Signed-off-by: Marco Elver <elver@google.com>
Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
---
v2:
* Rewrite.
---
 Documentation/dev-tools/kfence.rst | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
index 0fbe3308bf37..d45f952986ae 100644
--- a/Documentation/dev-tools/kfence.rst
+++ b/Documentation/dev-tools/kfence.rst
@@ -269,6 +269,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
 first, and the chances of detecting use-after-frees of recently freed objects
 is increased.
 
+If pool utilization reaches 75% (default) or above, to reduce the risk of the
+pool eventually being fully occupied by allocated objects yet ensure diverse
+coverage of allocations, KFENCE limits currently covered allocations of the
+same source from further filling up the pool. The "source" of an allocation is
+based on its partial allocation stack trace. A side-effect is that this also
+limits frequent long-lived allocations (e.g. pagecache) of the same source
+filling up the pool permanently, which is the most common risk for the pool
+becoming full and the sampled allocation rate dropping to zero. The threshold
+at which to start limiting currently covered allocations can be configured via
+the boot parameter ``kfence.skip_covered_thresh`` (pool usage%).
+
 Interface
 ---------
 
-- 
2.33.0.464.g1972c5931b-goog



^ permalink raw reply related	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/5] stacktrace: move filter_irq_stacks() to kernel/stacktrace.c
  2021-09-23 10:47 ` Marco Elver
@ 2021-09-23 11:14   ` Alexander Potapenko
  -1 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 11:14 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 12:48 PM Marco Elver <elver@google.com> wrote:
>
> filter_irq_stacks() has little to do with the stackdepot implementation,
> except that it is usually used by users (such as KASAN) of stackdepot to
> reduce the stack trace.
>
> However, filter_irq_stacks() itself is not useful without a stack trace
> as obtained by stack_trace_save() and friends.
>
> Therefore, move filter_irq_stacks() to kernel/stacktrace.c, so that new
> users of filter_irq_stacks() do not have to start depending on
> STACKDEPOT only for filter_irq_stacks().
>
> Signed-off-by: Marco Elver <elver@google.com>
> Acked-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Alexander Potapenko <glider@google.com>

> ---
> v3:
> * Rebase to -next due to conflicting stackdepot changes.
>
> v2:
> * New patch.
> ---
>  include/linux/stackdepot.h |  2 --
>  include/linux/stacktrace.h |  1 +
>  kernel/stacktrace.c        | 30 ++++++++++++++++++++++++++++++
>  lib/stackdepot.c           | 24 ------------------------
>  4 files changed, 31 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
> index ee03f11bb51a..c34b55a6e554 100644
> --- a/include/linux/stackdepot.h
> +++ b/include/linux/stackdepot.h
> @@ -30,8 +30,6 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
>
>  void stack_depot_print(depot_stack_handle_t stack);
>
> -unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
> -
>  #ifdef CONFIG_STACKDEPOT
>  int stack_depot_init(void);
>  #else
> diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
> index 9edecb494e9e..bef158815e83 100644
> --- a/include/linux/stacktrace.h
> +++ b/include/linux/stacktrace.h
> @@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
>  unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
>                                    unsigned int size, unsigned int skipnr);
>  unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
> +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
>
>  /* Internal interfaces. Do not use in generic code */
>  #ifdef CONFIG_ARCH_STACKWALK
> diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
> index 9f8117c7cfdd..9c625257023d 100644
> --- a/kernel/stacktrace.c
> +++ b/kernel/stacktrace.c
> @@ -13,6 +13,7 @@
>  #include <linux/export.h>
>  #include <linux/kallsyms.h>
>  #include <linux/stacktrace.h>
> +#include <linux/interrupt.h>
>
>  /**
>   * stack_trace_print - Print the entries in the stack trace
> @@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
>  #endif /* CONFIG_USER_STACKTRACE_SUPPORT */
>
>  #endif /* !CONFIG_ARCH_STACKWALK */
> +
> +static inline bool in_irqentry_text(unsigned long ptr)
> +{
> +       return (ptr >= (unsigned long)&__irqentry_text_start &&
> +               ptr < (unsigned long)&__irqentry_text_end) ||
> +               (ptr >= (unsigned long)&__softirqentry_text_start &&
> +                ptr < (unsigned long)&__softirqentry_text_end);
> +}
> +
> +/**
> + * filter_irq_stacks - Find first IRQ stack entry in trace
> + * @entries:   Pointer to stack trace array
> + * @nr_entries:        Number of entries in the storage array
> + *
> + * Return: Number of trace entries until IRQ stack starts.
> + */
> +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries)
> +{
> +       unsigned int i;
> +
> +       for (i = 0; i < nr_entries; i++) {
> +               if (in_irqentry_text(entries[i])) {
> +                       /* Include the irqentry function into the stack. */
> +                       return i + 1;
> +               }
> +       }
> +       return nr_entries;
> +}
> +EXPORT_SYMBOL_GPL(filter_irq_stacks);
> diff --git a/lib/stackdepot.c b/lib/stackdepot.c
> index 69c8c9b0d8d7..b437ae79aca1 100644
> --- a/lib/stackdepot.c
> +++ b/lib/stackdepot.c
> @@ -20,7 +20,6 @@
>   */
>
>  #include <linux/gfp.h>
> -#include <linux/interrupt.h>
>  #include <linux/jhash.h>
>  #include <linux/kernel.h>
>  #include <linux/mm.h>
> @@ -417,26 +416,3 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
>         return __stack_depot_save(entries, nr_entries, alloc_flags, true);
>  }
>  EXPORT_SYMBOL_GPL(stack_depot_save);
> -
> -static inline int in_irqentry_text(unsigned long ptr)
> -{
> -       return (ptr >= (unsigned long)&__irqentry_text_start &&
> -               ptr < (unsigned long)&__irqentry_text_end) ||
> -               (ptr >= (unsigned long)&__softirqentry_text_start &&
> -                ptr < (unsigned long)&__softirqentry_text_end);
> -}
> -
> -unsigned int filter_irq_stacks(unsigned long *entries,
> -                                            unsigned int nr_entries)
> -{
> -       unsigned int i;
> -
> -       for (i = 0; i < nr_entries; i++) {
> -               if (in_irqentry_text(entries[i])) {
> -                       /* Include the irqentry function into the stack. */
> -                       return i + 1;
> -               }
> -       }
> -       return nr_entries;
> -}
> -EXPORT_SYMBOL_GPL(filter_irq_stacks);
> --
> 2.33.0.464.g1972c5931b-goog
>


-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 1/5] stacktrace: move filter_irq_stacks() to kernel/stacktrace.c
@ 2021-09-23 11:14   ` Alexander Potapenko
  0 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 11:14 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 12:48 PM Marco Elver <elver@google.com> wrote:
>
> filter_irq_stacks() has little to do with the stackdepot implementation,
> except that it is usually used by users (such as KASAN) of stackdepot to
> reduce the stack trace.
>
> However, filter_irq_stacks() itself is not useful without a stack trace
> as obtained by stack_trace_save() and friends.
>
> Therefore, move filter_irq_stacks() to kernel/stacktrace.c, so that new
> users of filter_irq_stacks() do not have to start depending on
> STACKDEPOT only for filter_irq_stacks().
>
> Signed-off-by: Marco Elver <elver@google.com>
> Acked-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Alexander Potapenko <glider@google.com>

> ---
> v3:
> * Rebase to -next due to conflicting stackdepot changes.
>
> v2:
> * New patch.
> ---
>  include/linux/stackdepot.h |  2 --
>  include/linux/stacktrace.h |  1 +
>  kernel/stacktrace.c        | 30 ++++++++++++++++++++++++++++++
>  lib/stackdepot.c           | 24 ------------------------
>  4 files changed, 31 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
> index ee03f11bb51a..c34b55a6e554 100644
> --- a/include/linux/stackdepot.h
> +++ b/include/linux/stackdepot.h
> @@ -30,8 +30,6 @@ int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size,
>
>  void stack_depot_print(depot_stack_handle_t stack);
>
> -unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
> -
>  #ifdef CONFIG_STACKDEPOT
>  int stack_depot_init(void);
>  #else
> diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
> index 9edecb494e9e..bef158815e83 100644
> --- a/include/linux/stacktrace.h
> +++ b/include/linux/stacktrace.h
> @@ -21,6 +21,7 @@ unsigned int stack_trace_save_tsk(struct task_struct *task,
>  unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
>                                    unsigned int size, unsigned int skipnr);
>  unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
> +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
>
>  /* Internal interfaces. Do not use in generic code */
>  #ifdef CONFIG_ARCH_STACKWALK
> diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
> index 9f8117c7cfdd..9c625257023d 100644
> --- a/kernel/stacktrace.c
> +++ b/kernel/stacktrace.c
> @@ -13,6 +13,7 @@
>  #include <linux/export.h>
>  #include <linux/kallsyms.h>
>  #include <linux/stacktrace.h>
> +#include <linux/interrupt.h>
>
>  /**
>   * stack_trace_print - Print the entries in the stack trace
> @@ -373,3 +374,32 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size)
>  #endif /* CONFIG_USER_STACKTRACE_SUPPORT */
>
>  #endif /* !CONFIG_ARCH_STACKWALK */
> +
> +static inline bool in_irqentry_text(unsigned long ptr)
> +{
> +       return (ptr >= (unsigned long)&__irqentry_text_start &&
> +               ptr < (unsigned long)&__irqentry_text_end) ||
> +               (ptr >= (unsigned long)&__softirqentry_text_start &&
> +                ptr < (unsigned long)&__softirqentry_text_end);
> +}
> +
> +/**
> + * filter_irq_stacks - Find first IRQ stack entry in trace
> + * @entries:   Pointer to stack trace array
> + * @nr_entries:        Number of entries in the storage array
> + *
> + * Return: Number of trace entries until IRQ stack starts.
> + */
> +unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries)
> +{
> +       unsigned int i;
> +
> +       for (i = 0; i < nr_entries; i++) {
> +               if (in_irqentry_text(entries[i])) {
> +                       /* Include the irqentry function into the stack. */
> +                       return i + 1;
> +               }
> +       }
> +       return nr_entries;
> +}
> +EXPORT_SYMBOL_GPL(filter_irq_stacks);
> diff --git a/lib/stackdepot.c b/lib/stackdepot.c
> index 69c8c9b0d8d7..b437ae79aca1 100644
> --- a/lib/stackdepot.c
> +++ b/lib/stackdepot.c
> @@ -20,7 +20,6 @@
>   */
>
>  #include <linux/gfp.h>
> -#include <linux/interrupt.h>
>  #include <linux/jhash.h>
>  #include <linux/kernel.h>
>  #include <linux/mm.h>
> @@ -417,26 +416,3 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
>         return __stack_depot_save(entries, nr_entries, alloc_flags, true);
>  }
>  EXPORT_SYMBOL_GPL(stack_depot_save);
> -
> -static inline int in_irqentry_text(unsigned long ptr)
> -{
> -       return (ptr >= (unsigned long)&__irqentry_text_start &&
> -               ptr < (unsigned long)&__irqentry_text_end) ||
> -               (ptr >= (unsigned long)&__softirqentry_text_start &&
> -                ptr < (unsigned long)&__softirqentry_text_end);
> -}
> -
> -unsigned int filter_irq_stacks(unsigned long *entries,
> -                                            unsigned int nr_entries)
> -{
> -       unsigned int i;
> -
> -       for (i = 0; i < nr_entries; i++) {
> -               if (in_irqentry_text(entries[i])) {
> -                       /* Include the irqentry function into the stack. */
> -                       return i + 1;
> -               }
> -       }
> -       return nr_entries;
> -}
> -EXPORT_SYMBOL_GPL(filter_irq_stacks);
> --
> 2.33.0.464.g1972c5931b-goog
>


-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 2/5] kfence: count unexpectedly skipped allocations
  2021-09-23 10:48   ` Marco Elver
@ 2021-09-23 11:15     ` Alexander Potapenko
  -1 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 11:15 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 12:48 PM Marco Elver <elver@google.com> wrote:
>
> Maintain a counter to count allocations that are skipped due to being
> incompatible (oversized, incompatible gfp flags) or no capacity.
>
> This is to compute the fraction of allocations that could not be
> serviced by KFENCE, which we expect to be rare.
>
> Signed-off-by: Marco Elver <elver@google.com>
> Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Alexander Potapenko <glider@google.com>

> ---
> v2:
> * Do not count deadlock-avoidance skips.
> ---
>  mm/kfence/core.c | 16 +++++++++++++---
>  1 file changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index 7a97db8bc8e7..249d75b7e5ee 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -112,6 +112,8 @@ enum kfence_counter_id {
>         KFENCE_COUNTER_FREES,
>         KFENCE_COUNTER_ZOMBIES,
>         KFENCE_COUNTER_BUGS,
> +       KFENCE_COUNTER_SKIP_INCOMPAT,
> +       KFENCE_COUNTER_SKIP_CAPACITY,
>         KFENCE_COUNTER_COUNT,
>  };
>  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> @@ -121,6 +123,8 @@ static const char *const counter_names[] = {
>         [KFENCE_COUNTER_FREES]          = "total frees",
>         [KFENCE_COUNTER_ZOMBIES]        = "zombie allocations",
>         [KFENCE_COUNTER_BUGS]           = "total bugs",
> +       [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
> +       [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
>  };
>  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
>
> @@ -271,8 +275,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>                 list_del_init(&meta->list);
>         }
>         raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
> -       if (!meta)
> +       if (!meta) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
>                 return NULL;
> +       }
>
>         if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
>                 /*
> @@ -740,8 +746,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>          * Perform size check before switching kfence_allocation_gate, so that
>          * we don't disable KFENCE without making an allocation.
>          */
> -       if (size > PAGE_SIZE)
> +       if (size > PAGE_SIZE) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
>                 return NULL;
> +       }
>
>         /*
>          * Skip allocations from non-default zones, including DMA. We cannot
> @@ -749,8 +757,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>          * properties (e.g. reside in DMAable memory).
>          */
>         if ((flags & GFP_ZONEMASK) ||
> -           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
> +           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
>                 return NULL;
> +       }
>
>         /*
>          * allocation_gate only needs to become non-zero, so it doesn't make
> --
> 2.33.0.464.g1972c5931b-goog
>


-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 2/5] kfence: count unexpectedly skipped allocations
@ 2021-09-23 11:15     ` Alexander Potapenko
  0 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 11:15 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 12:48 PM Marco Elver <elver@google.com> wrote:
>
> Maintain a counter to count allocations that are skipped due to being
> incompatible (oversized, incompatible gfp flags) or no capacity.
>
> This is to compute the fraction of allocations that could not be
> serviced by KFENCE, which we expect to be rare.
>
> Signed-off-by: Marco Elver <elver@google.com>
> Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Alexander Potapenko <glider@google.com>

> ---
> v2:
> * Do not count deadlock-avoidance skips.
> ---
>  mm/kfence/core.c | 16 +++++++++++++---
>  1 file changed, 13 insertions(+), 3 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index 7a97db8bc8e7..249d75b7e5ee 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -112,6 +112,8 @@ enum kfence_counter_id {
>         KFENCE_COUNTER_FREES,
>         KFENCE_COUNTER_ZOMBIES,
>         KFENCE_COUNTER_BUGS,
> +       KFENCE_COUNTER_SKIP_INCOMPAT,
> +       KFENCE_COUNTER_SKIP_CAPACITY,
>         KFENCE_COUNTER_COUNT,
>  };
>  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> @@ -121,6 +123,8 @@ static const char *const counter_names[] = {
>         [KFENCE_COUNTER_FREES]          = "total frees",
>         [KFENCE_COUNTER_ZOMBIES]        = "zombie allocations",
>         [KFENCE_COUNTER_BUGS]           = "total bugs",
> +       [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
> +       [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
>  };
>  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
>
> @@ -271,8 +275,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>                 list_del_init(&meta->list);
>         }
>         raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
> -       if (!meta)
> +       if (!meta) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
>                 return NULL;
> +       }
>
>         if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
>                 /*
> @@ -740,8 +746,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>          * Perform size check before switching kfence_allocation_gate, so that
>          * we don't disable KFENCE without making an allocation.
>          */
> -       if (size > PAGE_SIZE)
> +       if (size > PAGE_SIZE) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
>                 return NULL;
> +       }
>
>         /*
>          * Skip allocations from non-default zones, including DMA. We cannot
> @@ -749,8 +757,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>          * properties (e.g. reside in DMAable memory).
>          */
>         if ((flags & GFP_ZONEMASK) ||
> -           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
> +           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
>                 return NULL;
> +       }
>
>         /*
>          * allocation_gate only needs to become non-zero, so it doesn't make
> --
> 2.33.0.464.g1972c5931b-goog
>


-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
  2021-09-23 10:48   ` Marco Elver
@ 2021-09-23 11:18     ` Dmitry Vyukov
  -1 siblings, 0 replies; 28+ messages in thread
From: Dmitry Vyukov @ 2021-09-23 11:18 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Alexander Potapenko, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

On Thu, 23 Sept 2021 at 12:48, Marco Elver <elver@google.com> wrote:
>
> One of KFENCE's main design principles is that with increasing uptime,
> allocation coverage increases sufficiently to detect previously
> undetected bugs.
>
> We have observed that frequent long-lived allocations of the same
> source (e.g. pagecache) tend to permanently fill up the KFENCE pool
> with increasing system uptime, thus breaking the above requirement.
> The workaround thus far had been increasing the sample interval and/or
> increasing the KFENCE pool size, but is no reliable solution.
>
> To ensure diverse coverage of allocations, limit currently covered
> allocations of the same source once pool utilization reaches 75%
> (configurable via `kfence.skip_covered_thresh`) or above. The effect is
> retaining reasonable allocation coverage when the pool is close to full.
>
> A side-effect is that this also limits frequent long-lived allocations
> of the same source filling up the pool permanently.
>
> Uniqueness of an allocation for coverage purposes is based on its
> (partial) allocation stack trace (the source). A Counting Bloom filter
> is used to check if an allocation is covered; if the allocation is
> currently covered, the allocation is skipped by KFENCE.
>
> Testing was done using:
>
>         (a) a synthetic workload that performs frequent long-lived
>             allocations (default config values; sample_interval=1;
>             num_objects=63), and
>
>         (b) normal desktop workloads on an otherwise idle machine where
>             the problem was first reported after a few days of uptime
>             (default config values).
>
> In both test cases the sampled allocation rate no longer drops to zero
> at any point. In the case of (b) we observe (after 2 days uptime) 15%
> unique allocations in the pool, 77% pool utilization, with 20% "skipped
> allocations (covered)".
>
> Signed-off-by: Marco Elver <elver@google.com>

Reviewed-by: Dmitry Vyukov <dvyukov@google.com>

> ---
> v3:
> * Remove unneeded !alloc_stack_hash checks.
> * Remove unneeded meta->alloc_stack_hash=0 in kfence_guarded_free().
>
> v2:
> * Switch to counting bloom filter to guarantee currently covered
>   allocations being skipped.
> * Use a module param for skip_covered threshold.
> * Use kfence pool address as hash entropy.
> * Use filter_irq_stacks().
> ---
>  mm/kfence/core.c   | 103 ++++++++++++++++++++++++++++++++++++++++++++-
>  mm/kfence/kfence.h |   2 +
>  2 files changed, 103 insertions(+), 2 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index db01814f8ff0..58a0f6f1acc5 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -11,11 +11,13 @@
>  #include <linux/bug.h>
>  #include <linux/debugfs.h>
>  #include <linux/irq_work.h>
> +#include <linux/jhash.h>
>  #include <linux/kcsan-checks.h>
>  #include <linux/kfence.h>
>  #include <linux/kmemleak.h>
>  #include <linux/list.h>
>  #include <linux/lockdep.h>
> +#include <linux/log2.h>
>  #include <linux/memblock.h>
>  #include <linux/moduleparam.h>
>  #include <linux/random.h>
> @@ -82,6 +84,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
>  };
>  module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
>
> +/* Pool usage% threshold when currently covered allocations are skipped. */
> +static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
> +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
> +
>  /* The pool of pages used for guard pages and objects. */
>  char *__kfence_pool __ro_after_init;
>  EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
> @@ -105,6 +111,25 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
>  /* Gates the allocation, ensuring only one succeeds in a given period. */
>  atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
>
> +/*
> + * A Counting Bloom filter of allocation coverage: limits currently covered
> + * allocations of the same source filling up the pool.
> + *
> + * Assuming a range of 15%-85% unique allocations in the pool at any point in
> + * time, the below parameters provide a probablity of 0.02-0.33 for false
> + * positive hits respectively:
> + *
> + *     P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
> + */
> +#define ALLOC_COVERED_HNUM     2
> +#define ALLOC_COVERED_SIZE     (1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
> +#define ALLOC_COVERED_HNEXT(h) (1664525 * (h) + 1013904223)
> +#define ALLOC_COVERED_MASK     (ALLOC_COVERED_SIZE - 1)
> +static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
> +
> +/* Stack depth used to determine uniqueness of an allocation. */
> +#define UNIQUE_ALLOC_STACK_DEPTH 8UL
> +
>  /* Statistics counters for debugfs. */
>  enum kfence_counter_id {
>         KFENCE_COUNTER_ALLOCATED,
> @@ -114,6 +139,7 @@ enum kfence_counter_id {
>         KFENCE_COUNTER_BUGS,
>         KFENCE_COUNTER_SKIP_INCOMPAT,
>         KFENCE_COUNTER_SKIP_CAPACITY,
> +       KFENCE_COUNTER_SKIP_COVERED,
>         KFENCE_COUNTER_COUNT,
>  };
>  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> @@ -125,11 +151,60 @@ static const char *const counter_names[] = {
>         [KFENCE_COUNTER_BUGS]           = "total bugs",
>         [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
>         [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
> +       [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
>  };
>  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
>
>  /* === Internals ============================================================ */
>
> +static inline bool should_skip_covered(void)
> +{
> +       unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
> +
> +       return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
> +}
> +
> +static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
> +{
> +       /* Some randomness across reboots / different machines. */
> +       u32 seed = (u32)((unsigned long)__kfence_pool >> (BITS_PER_LONG - 32));
> +
> +       num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
> +       num_entries = filter_irq_stacks(stack_entries, num_entries);
> +       return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), seed);
> +}
> +
> +/*
> + * Adds (or subtracts) count @val for allocation stack trace hash
> + * @alloc_stack_hash from Counting Bloom filter.
> + */
> +static void alloc_covered_add(u32 alloc_stack_hash, int val)
> +{
> +       int i;
> +
> +       for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
> +               atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
> +               alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
> +       }
> +}
> +
> +/*
> + * Returns true if the allocation stack trace hash @alloc_stack_hash is
> + * currently contained (non-zero count) in Counting Bloom filter.
> + */
> +static bool alloc_covered_contains(u32 alloc_stack_hash)
> +{
> +       int i;
> +
> +       for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
> +               if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
> +                       return false;
> +               alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
> +       }
> +
> +       return true;
> +}
> +
>  static bool kfence_protect(unsigned long addr)
>  {
>         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
> @@ -269,7 +344,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
>  }
>
>  static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
> -                                 unsigned long *stack_entries, size_t num_stack_entries)
> +                                 unsigned long *stack_entries, size_t num_stack_entries,
> +                                 u32 alloc_stack_hash)
>  {
>         struct kfence_metadata *meta = NULL;
>         unsigned long flags;
> @@ -332,6 +408,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
>         WRITE_ONCE(meta->cache, cache);
>         meta->size = size;
> +       meta->alloc_stack_hash = alloc_stack_hash;
> +
>         for_each_canary(meta, set_canary_byte);
>
>         /* Set required struct page fields. */
> @@ -344,6 +422,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>
>         raw_spin_unlock_irqrestore(&meta->lock, flags);
>
> +       alloc_covered_add(alloc_stack_hash, 1);
> +
>         /* Memory initialization. */
>
>         /*
> @@ -412,6 +492,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
>
>         raw_spin_unlock_irqrestore(&meta->lock, flags);
>
> +       alloc_covered_add(meta->alloc_stack_hash, -1);
> +
>         /* Protect to detect use-after-frees. */
>         kfence_protect((unsigned long)addr);
>
> @@ -752,6 +834,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>  {
>         unsigned long stack_entries[KFENCE_STACK_DEPTH];
>         size_t num_stack_entries;
> +       u32 alloc_stack_hash;
>
>         /*
>          * Perform size check before switching kfence_allocation_gate, so that
> @@ -799,7 +882,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>
>         num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
>
> -       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
> +       /*
> +        * Do expensive check for coverage of allocation in slow-path after
> +        * allocation_gate has already become non-zero, even though it might
> +        * mean not making any allocation within a given sample interval.
> +        *
> +        * This ensures reasonable allocation coverage when the pool is almost
> +        * full, including avoiding long-lived allocations of the same source
> +        * filling up the pool (e.g. pagecache allocations).
> +        */
> +       alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
> +       if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
> +               return NULL;
> +       }
> +
> +       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
> +                                   alloc_stack_hash);
>  }
>
>  size_t kfence_ksize(const void *addr)
> diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
> index c1f23c61e5f9..2a2d5de9d379 100644
> --- a/mm/kfence/kfence.h
> +++ b/mm/kfence/kfence.h
> @@ -87,6 +87,8 @@ struct kfence_metadata {
>         /* Allocation and free stack information. */
>         struct kfence_track alloc_track;
>         struct kfence_track free_track;
> +       /* For updating alloc_covered on frees. */
> +       u32 alloc_stack_hash;
>  };
>
>  extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
> --
> 2.33.0.464.g1972c5931b-goog
>

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
@ 2021-09-23 11:18     ` Dmitry Vyukov
  0 siblings, 0 replies; 28+ messages in thread
From: Dmitry Vyukov @ 2021-09-23 11:18 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Alexander Potapenko, Jann Horn, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

On Thu, 23 Sept 2021 at 12:48, Marco Elver <elver@google.com> wrote:
>
> One of KFENCE's main design principles is that with increasing uptime,
> allocation coverage increases sufficiently to detect previously
> undetected bugs.
>
> We have observed that frequent long-lived allocations of the same
> source (e.g. pagecache) tend to permanently fill up the KFENCE pool
> with increasing system uptime, thus breaking the above requirement.
> The workaround thus far had been increasing the sample interval and/or
> increasing the KFENCE pool size, but is no reliable solution.
>
> To ensure diverse coverage of allocations, limit currently covered
> allocations of the same source once pool utilization reaches 75%
> (configurable via `kfence.skip_covered_thresh`) or above. The effect is
> retaining reasonable allocation coverage when the pool is close to full.
>
> A side-effect is that this also limits frequent long-lived allocations
> of the same source filling up the pool permanently.
>
> Uniqueness of an allocation for coverage purposes is based on its
> (partial) allocation stack trace (the source). A Counting Bloom filter
> is used to check if an allocation is covered; if the allocation is
> currently covered, the allocation is skipped by KFENCE.
>
> Testing was done using:
>
>         (a) a synthetic workload that performs frequent long-lived
>             allocations (default config values; sample_interval=1;
>             num_objects=63), and
>
>         (b) normal desktop workloads on an otherwise idle machine where
>             the problem was first reported after a few days of uptime
>             (default config values).
>
> In both test cases the sampled allocation rate no longer drops to zero
> at any point. In the case of (b) we observe (after 2 days uptime) 15%
> unique allocations in the pool, 77% pool utilization, with 20% "skipped
> allocations (covered)".
>
> Signed-off-by: Marco Elver <elver@google.com>

Reviewed-by: Dmitry Vyukov <dvyukov@google.com>

> ---
> v3:
> * Remove unneeded !alloc_stack_hash checks.
> * Remove unneeded meta->alloc_stack_hash=0 in kfence_guarded_free().
>
> v2:
> * Switch to counting bloom filter to guarantee currently covered
>   allocations being skipped.
> * Use a module param for skip_covered threshold.
> * Use kfence pool address as hash entropy.
> * Use filter_irq_stacks().
> ---
>  mm/kfence/core.c   | 103 ++++++++++++++++++++++++++++++++++++++++++++-
>  mm/kfence/kfence.h |   2 +
>  2 files changed, 103 insertions(+), 2 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index db01814f8ff0..58a0f6f1acc5 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -11,11 +11,13 @@
>  #include <linux/bug.h>
>  #include <linux/debugfs.h>
>  #include <linux/irq_work.h>
> +#include <linux/jhash.h>
>  #include <linux/kcsan-checks.h>
>  #include <linux/kfence.h>
>  #include <linux/kmemleak.h>
>  #include <linux/list.h>
>  #include <linux/lockdep.h>
> +#include <linux/log2.h>
>  #include <linux/memblock.h>
>  #include <linux/moduleparam.h>
>  #include <linux/random.h>
> @@ -82,6 +84,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
>  };
>  module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
>
> +/* Pool usage% threshold when currently covered allocations are skipped. */
> +static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
> +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
> +
>  /* The pool of pages used for guard pages and objects. */
>  char *__kfence_pool __ro_after_init;
>  EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
> @@ -105,6 +111,25 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
>  /* Gates the allocation, ensuring only one succeeds in a given period. */
>  atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
>
> +/*
> + * A Counting Bloom filter of allocation coverage: limits currently covered
> + * allocations of the same source filling up the pool.
> + *
> + * Assuming a range of 15%-85% unique allocations in the pool at any point in
> + * time, the below parameters provide a probablity of 0.02-0.33 for false
> + * positive hits respectively:
> + *
> + *     P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
> + */
> +#define ALLOC_COVERED_HNUM     2
> +#define ALLOC_COVERED_SIZE     (1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
> +#define ALLOC_COVERED_HNEXT(h) (1664525 * (h) + 1013904223)
> +#define ALLOC_COVERED_MASK     (ALLOC_COVERED_SIZE - 1)
> +static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
> +
> +/* Stack depth used to determine uniqueness of an allocation. */
> +#define UNIQUE_ALLOC_STACK_DEPTH 8UL
> +
>  /* Statistics counters for debugfs. */
>  enum kfence_counter_id {
>         KFENCE_COUNTER_ALLOCATED,
> @@ -114,6 +139,7 @@ enum kfence_counter_id {
>         KFENCE_COUNTER_BUGS,
>         KFENCE_COUNTER_SKIP_INCOMPAT,
>         KFENCE_COUNTER_SKIP_CAPACITY,
> +       KFENCE_COUNTER_SKIP_COVERED,
>         KFENCE_COUNTER_COUNT,
>  };
>  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> @@ -125,11 +151,60 @@ static const char *const counter_names[] = {
>         [KFENCE_COUNTER_BUGS]           = "total bugs",
>         [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
>         [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
> +       [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
>  };
>  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
>
>  /* === Internals ============================================================ */
>
> +static inline bool should_skip_covered(void)
> +{
> +       unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
> +
> +       return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
> +}
> +
> +static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
> +{
> +       /* Some randomness across reboots / different machines. */
> +       u32 seed = (u32)((unsigned long)__kfence_pool >> (BITS_PER_LONG - 32));
> +
> +       num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
> +       num_entries = filter_irq_stacks(stack_entries, num_entries);
> +       return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), seed);
> +}
> +
> +/*
> + * Adds (or subtracts) count @val for allocation stack trace hash
> + * @alloc_stack_hash from Counting Bloom filter.
> + */
> +static void alloc_covered_add(u32 alloc_stack_hash, int val)
> +{
> +       int i;
> +
> +       for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
> +               atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
> +               alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
> +       }
> +}
> +
> +/*
> + * Returns true if the allocation stack trace hash @alloc_stack_hash is
> + * currently contained (non-zero count) in Counting Bloom filter.
> + */
> +static bool alloc_covered_contains(u32 alloc_stack_hash)
> +{
> +       int i;
> +
> +       for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
> +               if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
> +                       return false;
> +               alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
> +       }
> +
> +       return true;
> +}
> +
>  static bool kfence_protect(unsigned long addr)
>  {
>         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
> @@ -269,7 +344,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
>  }
>
>  static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
> -                                 unsigned long *stack_entries, size_t num_stack_entries)
> +                                 unsigned long *stack_entries, size_t num_stack_entries,
> +                                 u32 alloc_stack_hash)
>  {
>         struct kfence_metadata *meta = NULL;
>         unsigned long flags;
> @@ -332,6 +408,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
>         WRITE_ONCE(meta->cache, cache);
>         meta->size = size;
> +       meta->alloc_stack_hash = alloc_stack_hash;
> +
>         for_each_canary(meta, set_canary_byte);
>
>         /* Set required struct page fields. */
> @@ -344,6 +422,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>
>         raw_spin_unlock_irqrestore(&meta->lock, flags);
>
> +       alloc_covered_add(alloc_stack_hash, 1);
> +
>         /* Memory initialization. */
>
>         /*
> @@ -412,6 +492,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
>
>         raw_spin_unlock_irqrestore(&meta->lock, flags);
>
> +       alloc_covered_add(meta->alloc_stack_hash, -1);
> +
>         /* Protect to detect use-after-frees. */
>         kfence_protect((unsigned long)addr);
>
> @@ -752,6 +834,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>  {
>         unsigned long stack_entries[KFENCE_STACK_DEPTH];
>         size_t num_stack_entries;
> +       u32 alloc_stack_hash;
>
>         /*
>          * Perform size check before switching kfence_allocation_gate, so that
> @@ -799,7 +882,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>
>         num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
>
> -       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
> +       /*
> +        * Do expensive check for coverage of allocation in slow-path after
> +        * allocation_gate has already become non-zero, even though it might
> +        * mean not making any allocation within a given sample interval.
> +        *
> +        * This ensures reasonable allocation coverage when the pool is almost
> +        * full, including avoiding long-lived allocations of the same source
> +        * filling up the pool (e.g. pagecache allocations).
> +        */
> +       alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
> +       if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
> +               return NULL;
> +       }
> +
> +       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
> +                                   alloc_stack_hash);
>  }
>
>  size_t kfence_ksize(const void *addr)
> diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
> index c1f23c61e5f9..2a2d5de9d379 100644
> --- a/mm/kfence/kfence.h
> +++ b/mm/kfence/kfence.h
> @@ -87,6 +87,8 @@ struct kfence_metadata {
>         /* Allocation and free stack information. */
>         struct kfence_track alloc_track;
>         struct kfence_track free_track;
> +       /* For updating alloc_covered on frees. */
> +       u32 alloc_stack_hash;
>  };
>
>  extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
> --
> 2.33.0.464.g1972c5931b-goog
>


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 3/5] kfence: move saving stack trace of allocations into __kfence_alloc()
  2021-09-23 10:48   ` Marco Elver
@ 2021-09-23 11:32     ` Alexander Potapenko
  -1 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 11:32 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 12:48 PM Marco Elver <elver@google.com> wrote:
>
> Move the saving of the stack trace of allocations into __kfence_alloc(),
> so that the stack entries array can be used outside of
> kfence_guarded_alloc() and we avoid potentially unwinding the stack
> multiple times.
>
> Signed-off-by: Marco Elver <elver@google.com>
> Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Alexander Potapenko <glider@google.com>

> ---
> v2:
> * New patch.
> ---
>  mm/kfence/core.c | 35 ++++++++++++++++++++++++-----------
>  1 file changed, 24 insertions(+), 11 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index 249d75b7e5ee..db01814f8ff0 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -187,19 +187,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m
>   * Update the object's metadata state, including updating the alloc/free stacks
>   * depending on the state transition.
>   */
> -static noinline void metadata_update_state(struct kfence_metadata *meta,
> -                                          enum kfence_object_state next)
> +static noinline void
> +metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
> +                     unsigned long *stack_entries, size_t num_stack_entries)
>  {
>         struct kfence_track *track =
>                 next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
>
>         lockdep_assert_held(&meta->lock);
>
> -       /*
> -        * Skip over 1 (this) functions; noinline ensures we do not accidentally
> -        * skip over the caller by never inlining.
> -        */
> -       track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
> +       if (stack_entries) {
> +               memcpy(track->stack_entries, stack_entries,
> +                      num_stack_entries * sizeof(stack_entries[0]));
> +       } else {
> +               /*
> +                * Skip over 1 (this) functions; noinline ensures we do not
> +                * accidentally skip over the caller by never inlining.
> +                */
> +               num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
> +       }
> +       track->num_stack_entries = num_stack_entries;
>         track->pid = task_pid_nr(current);
>         track->cpu = raw_smp_processor_id();
>         track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
> @@ -261,7 +268,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
>         }
>  }
>
> -static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
> +static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
> +                                 unsigned long *stack_entries, size_t num_stack_entries)
>  {
>         struct kfence_metadata *meta = NULL;
>         unsigned long flags;
> @@ -320,7 +328,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>         addr = (void *)meta->addr;
>
>         /* Update remaining metadata. */
> -       metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
> +       metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
>         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
>         WRITE_ONCE(meta->cache, cache);
>         meta->size = size;
> @@ -400,7 +408,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
>                 memzero_explicit(addr, meta->size);
>
>         /* Mark the object as freed. */
> -       metadata_update_state(meta, KFENCE_OBJECT_FREED);
> +       metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
>
>         raw_spin_unlock_irqrestore(&meta->lock, flags);
>
> @@ -742,6 +750,9 @@ void kfence_shutdown_cache(struct kmem_cache *s)
>
>  void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>  {
> +       unsigned long stack_entries[KFENCE_STACK_DEPTH];
> +       size_t num_stack_entries;
> +
>         /*
>          * Perform size check before switching kfence_allocation_gate, so that
>          * we don't disable KFENCE without making an allocation.
> @@ -786,7 +797,9 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>         if (!READ_ONCE(kfence_enabled))
>                 return NULL;
>
> -       return kfence_guarded_alloc(s, size, flags);
> +       num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
> +
> +       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
>  }
>
>  size_t kfence_ksize(const void *addr)
> --
> 2.33.0.464.g1972c5931b-goog
>


-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 3/5] kfence: move saving stack trace of allocations into __kfence_alloc()
@ 2021-09-23 11:32     ` Alexander Potapenko
  0 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 11:32 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 12:48 PM Marco Elver <elver@google.com> wrote:
>
> Move the saving of the stack trace of allocations into __kfence_alloc(),
> so that the stack entries array can be used outside of
> kfence_guarded_alloc() and we avoid potentially unwinding the stack
> multiple times.
>
> Signed-off-by: Marco Elver <elver@google.com>
> Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Alexander Potapenko <glider@google.com>

> ---
> v2:
> * New patch.
> ---
>  mm/kfence/core.c | 35 ++++++++++++++++++++++++-----------
>  1 file changed, 24 insertions(+), 11 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index 249d75b7e5ee..db01814f8ff0 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -187,19 +187,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m
>   * Update the object's metadata state, including updating the alloc/free stacks
>   * depending on the state transition.
>   */
> -static noinline void metadata_update_state(struct kfence_metadata *meta,
> -                                          enum kfence_object_state next)
> +static noinline void
> +metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
> +                     unsigned long *stack_entries, size_t num_stack_entries)
>  {
>         struct kfence_track *track =
>                 next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
>
>         lockdep_assert_held(&meta->lock);
>
> -       /*
> -        * Skip over 1 (this) functions; noinline ensures we do not accidentally
> -        * skip over the caller by never inlining.
> -        */
> -       track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
> +       if (stack_entries) {
> +               memcpy(track->stack_entries, stack_entries,
> +                      num_stack_entries * sizeof(stack_entries[0]));
> +       } else {
> +               /*
> +                * Skip over 1 (this) functions; noinline ensures we do not
> +                * accidentally skip over the caller by never inlining.
> +                */
> +               num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
> +       }
> +       track->num_stack_entries = num_stack_entries;
>         track->pid = task_pid_nr(current);
>         track->cpu = raw_smp_processor_id();
>         track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
> @@ -261,7 +268,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
>         }
>  }
>
> -static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
> +static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
> +                                 unsigned long *stack_entries, size_t num_stack_entries)
>  {
>         struct kfence_metadata *meta = NULL;
>         unsigned long flags;
> @@ -320,7 +328,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>         addr = (void *)meta->addr;
>
>         /* Update remaining metadata. */
> -       metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
> +       metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
>         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
>         WRITE_ONCE(meta->cache, cache);
>         meta->size = size;
> @@ -400,7 +408,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
>                 memzero_explicit(addr, meta->size);
>
>         /* Mark the object as freed. */
> -       metadata_update_state(meta, KFENCE_OBJECT_FREED);
> +       metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
>
>         raw_spin_unlock_irqrestore(&meta->lock, flags);
>
> @@ -742,6 +750,9 @@ void kfence_shutdown_cache(struct kmem_cache *s)
>
>  void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>  {
> +       unsigned long stack_entries[KFENCE_STACK_DEPTH];
> +       size_t num_stack_entries;
> +
>         /*
>          * Perform size check before switching kfence_allocation_gate, so that
>          * we don't disable KFENCE without making an allocation.
> @@ -786,7 +797,9 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>         if (!READ_ONCE(kfence_enabled))
>                 return NULL;
>
> -       return kfence_guarded_alloc(s, size, flags);
> +       num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
> +
> +       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
>  }
>
>  size_t kfence_ksize(const void *addr)
> --
> 2.33.0.464.g1972c5931b-goog
>


-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
  2021-09-23 11:18     ` Dmitry Vyukov
@ 2021-09-23 13:23       ` Alexander Potapenko
  -1 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 13:23 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Marco Elver, Andrew Morton, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 1:19 PM Dmitry Vyukov <dvyukov@google.com> wrote:
>
> On Thu, 23 Sept 2021 at 12:48, Marco Elver <elver@google.com> wrote:
> >
> > One of KFENCE's main design principles is that with increasing uptime,
> > allocation coverage increases sufficiently to detect previously
> > undetected bugs.
> >
> > We have observed that frequent long-lived allocations of the same
> > source (e.g. pagecache) tend to permanently fill up the KFENCE pool
> > with increasing system uptime, thus breaking the above requirement.
> > The workaround thus far had been increasing the sample interval and/or
> > increasing the KFENCE pool size, but is no reliable solution.
> >
> > To ensure diverse coverage of allocations, limit currently covered
> > allocations of the same source once pool utilization reaches 75%
> > (configurable via `kfence.skip_covered_thresh`) or above. The effect is
> > retaining reasonable allocation coverage when the pool is close to full.
> >
> > A side-effect is that this also limits frequent long-lived allocations
> > of the same source filling up the pool permanently.
> >
> > Uniqueness of an allocation for coverage purposes is based on its
> > (partial) allocation stack trace (the source). A Counting Bloom filter
> > is used to check if an allocation is covered; if the allocation is
> > currently covered, the allocation is skipped by KFENCE.
> >
> > Testing was done using:
> >
> >         (a) a synthetic workload that performs frequent long-lived
> >             allocations (default config values; sample_interval=1;
> >             num_objects=63), and
> >
> >         (b) normal desktop workloads on an otherwise idle machine where
> >             the problem was first reported after a few days of uptime
> >             (default config values).
> >
> > In both test cases the sampled allocation rate no longer drops to zero
> > at any point. In the case of (b) we observe (after 2 days uptime) 15%
> > unique allocations in the pool, 77% pool utilization, with 20% "skipped
> > allocations (covered)".
> >
> > Signed-off-by: Marco Elver <elver@google.com>
>
> Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Alexander Potapenko <glider@google.com>

>
> > ---
> > v3:
> > * Remove unneeded !alloc_stack_hash checks.
> > * Remove unneeded meta->alloc_stack_hash=0 in kfence_guarded_free().
> >
> > v2:
> > * Switch to counting bloom filter to guarantee currently covered
> >   allocations being skipped.
> > * Use a module param for skip_covered threshold.
> > * Use kfence pool address as hash entropy.
> > * Use filter_irq_stacks().
> > ---
> >  mm/kfence/core.c   | 103 ++++++++++++++++++++++++++++++++++++++++++++-
> >  mm/kfence/kfence.h |   2 +
> >  2 files changed, 103 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> > index db01814f8ff0..58a0f6f1acc5 100644
> > --- a/mm/kfence/core.c
> > +++ b/mm/kfence/core.c
> > @@ -11,11 +11,13 @@
> >  #include <linux/bug.h>
> >  #include <linux/debugfs.h>
> >  #include <linux/irq_work.h>
> > +#include <linux/jhash.h>
> >  #include <linux/kcsan-checks.h>
> >  #include <linux/kfence.h>
> >  #include <linux/kmemleak.h>
> >  #include <linux/list.h>
> >  #include <linux/lockdep.h>
> > +#include <linux/log2.h>
> >  #include <linux/memblock.h>
> >  #include <linux/moduleparam.h>
> >  #include <linux/random.h>
> > @@ -82,6 +84,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
> >  };
> >  module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
> >
> > +/* Pool usage% threshold when currently covered allocations are skipped. */
> > +static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
> > +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
> > +
> >  /* The pool of pages used for guard pages and objects. */
> >  char *__kfence_pool __ro_after_init;
> >  EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
> > @@ -105,6 +111,25 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
> >  /* Gates the allocation, ensuring only one succeeds in a given period. */
> >  atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
> >
> > +/*
> > + * A Counting Bloom filter of allocation coverage: limits currently covered
> > + * allocations of the same source filling up the pool.
> > + *
> > + * Assuming a range of 15%-85% unique allocations in the pool at any point in

Where do these 85% come from?

> > + * time, the below parameters provide a probablity of 0.02-0.33 for false
> > + * positive hits respectively:
> > + *
> > + *     P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
> > + */
> > +#define ALLOC_COVERED_HNUM     2
> > +#define ALLOC_COVERED_SIZE     (1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
> > +#define ALLOC_COVERED_HNEXT(h) (1664525 * (h) + 1013904223)

Unless we are planning to change these primes, can you use
next_pseudo_random32() instead?


> > +#define ALLOC_COVERED_MASK     (ALLOC_COVERED_SIZE - 1)
> > +static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
> > +
> > +/* Stack depth used to determine uniqueness of an allocation. */
> > +#define UNIQUE_ALLOC_STACK_DEPTH 8UL
> > +
> >  /* Statistics counters for debugfs. */
> >  enum kfence_counter_id {
> >         KFENCE_COUNTER_ALLOCATED,
> > @@ -114,6 +139,7 @@ enum kfence_counter_id {
> >         KFENCE_COUNTER_BUGS,
> >         KFENCE_COUNTER_SKIP_INCOMPAT,
> >         KFENCE_COUNTER_SKIP_CAPACITY,
> > +       KFENCE_COUNTER_SKIP_COVERED,
> >         KFENCE_COUNTER_COUNT,
> >  };
> >  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> > @@ -125,11 +151,60 @@ static const char *const counter_names[] = {
> >         [KFENCE_COUNTER_BUGS]           = "total bugs",
> >         [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
> >         [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
> > +       [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
> >  };
> >  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
> >
> >  /* === Internals ============================================================ */
> >
> > +static inline bool should_skip_covered(void)
> > +{
> > +       unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
> > +
> > +       return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
> > +}
> > +
> > +static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
> > +{
> > +       /* Some randomness across reboots / different machines. */
> > +       u32 seed = (u32)((unsigned long)__kfence_pool >> (BITS_PER_LONG - 32));
> > +
> > +       num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
> > +       num_entries = filter_irq_stacks(stack_entries, num_entries);
> > +       return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), seed);
> > +}
> > +
> > +/*
> > + * Adds (or subtracts) count @val for allocation stack trace hash
> > + * @alloc_stack_hash from Counting Bloom filter.
> > + */
> > +static void alloc_covered_add(u32 alloc_stack_hash, int val)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
> > +               atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
> > +               alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
> > +       }
> > +}
> > +
> > +/*
> > + * Returns true if the allocation stack trace hash @alloc_stack_hash is
> > + * currently contained (non-zero count) in Counting Bloom filter.
> > + */
> > +static bool alloc_covered_contains(u32 alloc_stack_hash)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
> > +               if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
> > +                       return false;
> > +               alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
> > +       }
> > +
> > +       return true;
> > +}
> > +
> >  static bool kfence_protect(unsigned long addr)
> >  {
> >         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
> > @@ -269,7 +344,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
> >  }
> >
> >  static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
> > -                                 unsigned long *stack_entries, size_t num_stack_entries)
> > +                                 unsigned long *stack_entries, size_t num_stack_entries,
> > +                                 u32 alloc_stack_hash)
> >  {
> >         struct kfence_metadata *meta = NULL;
> >         unsigned long flags;
> > @@ -332,6 +408,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
> >         WRITE_ONCE(meta->cache, cache);
> >         meta->size = size;
> > +       meta->alloc_stack_hash = alloc_stack_hash;
> > +
> >         for_each_canary(meta, set_canary_byte);
> >
> >         /* Set required struct page fields. */
> > @@ -344,6 +422,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >
> >         raw_spin_unlock_irqrestore(&meta->lock, flags);
> >
> > +       alloc_covered_add(alloc_stack_hash, 1);
> > +
> >         /* Memory initialization. */
> >
> >         /*
> > @@ -412,6 +492,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
> >
> >         raw_spin_unlock_irqrestore(&meta->lock, flags);
> >
> > +       alloc_covered_add(meta->alloc_stack_hash, -1);
> > +
> >         /* Protect to detect use-after-frees. */
> >         kfence_protect((unsigned long)addr);
> >
> > @@ -752,6 +834,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >  {
> >         unsigned long stack_entries[KFENCE_STACK_DEPTH];
> >         size_t num_stack_entries;
> > +       u32 alloc_stack_hash;
> >
> >         /*
> >          * Perform size check before switching kfence_allocation_gate, so that
> > @@ -799,7 +882,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >
> >         num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
> >
> > -       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
> > +       /*
> > +        * Do expensive check for coverage of allocation in slow-path after
> > +        * allocation_gate has already become non-zero, even though it might
> > +        * mean not making any allocation within a given sample interval.
> > +        *
> > +        * This ensures reasonable allocation coverage when the pool is almost
> > +        * full, including avoiding long-lived allocations of the same source
> > +        * filling up the pool (e.g. pagecache allocations).
> > +        */
> > +       alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
> > +       if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
> > +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
> > +               return NULL;
> > +       }
> > +
> > +       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
> > +                                   alloc_stack_hash);
> >  }
> >
> >  size_t kfence_ksize(const void *addr)
> > diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
> > index c1f23c61e5f9..2a2d5de9d379 100644
> > --- a/mm/kfence/kfence.h
> > +++ b/mm/kfence/kfence.h
> > @@ -87,6 +87,8 @@ struct kfence_metadata {
> >         /* Allocation and free stack information. */
> >         struct kfence_track alloc_track;
> >         struct kfence_track free_track;
> > +       /* For updating alloc_covered on frees. */
> > +       u32 alloc_stack_hash;
> >  };
> >
> >  extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
> > --
> > 2.33.0.464.g1972c5931b-goog
> >



--
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
@ 2021-09-23 13:23       ` Alexander Potapenko
  0 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 13:23 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Marco Elver, Andrew Morton, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 1:19 PM Dmitry Vyukov <dvyukov@google.com> wrote:
>
> On Thu, 23 Sept 2021 at 12:48, Marco Elver <elver@google.com> wrote:
> >
> > One of KFENCE's main design principles is that with increasing uptime,
> > allocation coverage increases sufficiently to detect previously
> > undetected bugs.
> >
> > We have observed that frequent long-lived allocations of the same
> > source (e.g. pagecache) tend to permanently fill up the KFENCE pool
> > with increasing system uptime, thus breaking the above requirement.
> > The workaround thus far had been increasing the sample interval and/or
> > increasing the KFENCE pool size, but is no reliable solution.
> >
> > To ensure diverse coverage of allocations, limit currently covered
> > allocations of the same source once pool utilization reaches 75%
> > (configurable via `kfence.skip_covered_thresh`) or above. The effect is
> > retaining reasonable allocation coverage when the pool is close to full.
> >
> > A side-effect is that this also limits frequent long-lived allocations
> > of the same source filling up the pool permanently.
> >
> > Uniqueness of an allocation for coverage purposes is based on its
> > (partial) allocation stack trace (the source). A Counting Bloom filter
> > is used to check if an allocation is covered; if the allocation is
> > currently covered, the allocation is skipped by KFENCE.
> >
> > Testing was done using:
> >
> >         (a) a synthetic workload that performs frequent long-lived
> >             allocations (default config values; sample_interval=1;
> >             num_objects=63), and
> >
> >         (b) normal desktop workloads on an otherwise idle machine where
> >             the problem was first reported after a few days of uptime
> >             (default config values).
> >
> > In both test cases the sampled allocation rate no longer drops to zero
> > at any point. In the case of (b) we observe (after 2 days uptime) 15%
> > unique allocations in the pool, 77% pool utilization, with 20% "skipped
> > allocations (covered)".
> >
> > Signed-off-by: Marco Elver <elver@google.com>
>
> Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Alexander Potapenko <glider@google.com>

>
> > ---
> > v3:
> > * Remove unneeded !alloc_stack_hash checks.
> > * Remove unneeded meta->alloc_stack_hash=0 in kfence_guarded_free().
> >
> > v2:
> > * Switch to counting bloom filter to guarantee currently covered
> >   allocations being skipped.
> > * Use a module param for skip_covered threshold.
> > * Use kfence pool address as hash entropy.
> > * Use filter_irq_stacks().
> > ---
> >  mm/kfence/core.c   | 103 ++++++++++++++++++++++++++++++++++++++++++++-
> >  mm/kfence/kfence.h |   2 +
> >  2 files changed, 103 insertions(+), 2 deletions(-)
> >
> > diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> > index db01814f8ff0..58a0f6f1acc5 100644
> > --- a/mm/kfence/core.c
> > +++ b/mm/kfence/core.c
> > @@ -11,11 +11,13 @@
> >  #include <linux/bug.h>
> >  #include <linux/debugfs.h>
> >  #include <linux/irq_work.h>
> > +#include <linux/jhash.h>
> >  #include <linux/kcsan-checks.h>
> >  #include <linux/kfence.h>
> >  #include <linux/kmemleak.h>
> >  #include <linux/list.h>
> >  #include <linux/lockdep.h>
> > +#include <linux/log2.h>
> >  #include <linux/memblock.h>
> >  #include <linux/moduleparam.h>
> >  #include <linux/random.h>
> > @@ -82,6 +84,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
> >  };
> >  module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
> >
> > +/* Pool usage% threshold when currently covered allocations are skipped. */
> > +static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
> > +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
> > +
> >  /* The pool of pages used for guard pages and objects. */
> >  char *__kfence_pool __ro_after_init;
> >  EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
> > @@ -105,6 +111,25 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
> >  /* Gates the allocation, ensuring only one succeeds in a given period. */
> >  atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
> >
> > +/*
> > + * A Counting Bloom filter of allocation coverage: limits currently covered
> > + * allocations of the same source filling up the pool.
> > + *
> > + * Assuming a range of 15%-85% unique allocations in the pool at any point in

Where do these 85% come from?

> > + * time, the below parameters provide a probablity of 0.02-0.33 for false
> > + * positive hits respectively:
> > + *
> > + *     P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
> > + */
> > +#define ALLOC_COVERED_HNUM     2
> > +#define ALLOC_COVERED_SIZE     (1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
> > +#define ALLOC_COVERED_HNEXT(h) (1664525 * (h) + 1013904223)

Unless we are planning to change these primes, can you use
next_pseudo_random32() instead?


> > +#define ALLOC_COVERED_MASK     (ALLOC_COVERED_SIZE - 1)
> > +static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
> > +
> > +/* Stack depth used to determine uniqueness of an allocation. */
> > +#define UNIQUE_ALLOC_STACK_DEPTH 8UL
> > +
> >  /* Statistics counters for debugfs. */
> >  enum kfence_counter_id {
> >         KFENCE_COUNTER_ALLOCATED,
> > @@ -114,6 +139,7 @@ enum kfence_counter_id {
> >         KFENCE_COUNTER_BUGS,
> >         KFENCE_COUNTER_SKIP_INCOMPAT,
> >         KFENCE_COUNTER_SKIP_CAPACITY,
> > +       KFENCE_COUNTER_SKIP_COVERED,
> >         KFENCE_COUNTER_COUNT,
> >  };
> >  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> > @@ -125,11 +151,60 @@ static const char *const counter_names[] = {
> >         [KFENCE_COUNTER_BUGS]           = "total bugs",
> >         [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
> >         [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
> > +       [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
> >  };
> >  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
> >
> >  /* === Internals ============================================================ */
> >
> > +static inline bool should_skip_covered(void)
> > +{
> > +       unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
> > +
> > +       return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
> > +}
> > +
> > +static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
> > +{
> > +       /* Some randomness across reboots / different machines. */
> > +       u32 seed = (u32)((unsigned long)__kfence_pool >> (BITS_PER_LONG - 32));
> > +
> > +       num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
> > +       num_entries = filter_irq_stacks(stack_entries, num_entries);
> > +       return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), seed);
> > +}
> > +
> > +/*
> > + * Adds (or subtracts) count @val for allocation stack trace hash
> > + * @alloc_stack_hash from Counting Bloom filter.
> > + */
> > +static void alloc_covered_add(u32 alloc_stack_hash, int val)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
> > +               atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
> > +               alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
> > +       }
> > +}
> > +
> > +/*
> > + * Returns true if the allocation stack trace hash @alloc_stack_hash is
> > + * currently contained (non-zero count) in Counting Bloom filter.
> > + */
> > +static bool alloc_covered_contains(u32 alloc_stack_hash)
> > +{
> > +       int i;
> > +
> > +       for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
> > +               if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
> > +                       return false;
> > +               alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
> > +       }
> > +
> > +       return true;
> > +}
> > +
> >  static bool kfence_protect(unsigned long addr)
> >  {
> >         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
> > @@ -269,7 +344,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
> >  }
> >
> >  static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
> > -                                 unsigned long *stack_entries, size_t num_stack_entries)
> > +                                 unsigned long *stack_entries, size_t num_stack_entries,
> > +                                 u32 alloc_stack_hash)
> >  {
> >         struct kfence_metadata *meta = NULL;
> >         unsigned long flags;
> > @@ -332,6 +408,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
> >         WRITE_ONCE(meta->cache, cache);
> >         meta->size = size;
> > +       meta->alloc_stack_hash = alloc_stack_hash;
> > +
> >         for_each_canary(meta, set_canary_byte);
> >
> >         /* Set required struct page fields. */
> > @@ -344,6 +422,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >
> >         raw_spin_unlock_irqrestore(&meta->lock, flags);
> >
> > +       alloc_covered_add(alloc_stack_hash, 1);
> > +
> >         /* Memory initialization. */
> >
> >         /*
> > @@ -412,6 +492,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
> >
> >         raw_spin_unlock_irqrestore(&meta->lock, flags);
> >
> > +       alloc_covered_add(meta->alloc_stack_hash, -1);
> > +
> >         /* Protect to detect use-after-frees. */
> >         kfence_protect((unsigned long)addr);
> >
> > @@ -752,6 +834,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >  {
> >         unsigned long stack_entries[KFENCE_STACK_DEPTH];
> >         size_t num_stack_entries;
> > +       u32 alloc_stack_hash;
> >
> >         /*
> >          * Perform size check before switching kfence_allocation_gate, so that
> > @@ -799,7 +882,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >
> >         num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
> >
> > -       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
> > +       /*
> > +        * Do expensive check for coverage of allocation in slow-path after
> > +        * allocation_gate has already become non-zero, even though it might
> > +        * mean not making any allocation within a given sample interval.
> > +        *
> > +        * This ensures reasonable allocation coverage when the pool is almost
> > +        * full, including avoiding long-lived allocations of the same source
> > +        * filling up the pool (e.g. pagecache allocations).
> > +        */
> > +       alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
> > +       if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
> > +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
> > +               return NULL;
> > +       }
> > +
> > +       return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
> > +                                   alloc_stack_hash);
> >  }
> >
> >  size_t kfence_ksize(const void *addr)
> > diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
> > index c1f23c61e5f9..2a2d5de9d379 100644
> > --- a/mm/kfence/kfence.h
> > +++ b/mm/kfence/kfence.h
> > @@ -87,6 +87,8 @@ struct kfence_metadata {
> >         /* Allocation and free stack information. */
> >         struct kfence_track alloc_track;
> >         struct kfence_track free_track;
> > +       /* For updating alloc_covered on frees. */
> > +       u32 alloc_stack_hash;
> >  };
> >
> >  extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
> > --
> > 2.33.0.464.g1972c5931b-goog
> >



--
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
  2021-09-23 13:23       ` Alexander Potapenko
@ 2021-09-23 13:44         ` Marco Elver
  -1 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 13:44 UTC (permalink / raw)
  To: Alexander Potapenko
  Cc: Dmitry Vyukov, Andrew Morton, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, 23 Sept 2021 at 15:24, Alexander Potapenko <glider@google.com> wrote:
>
> On Thu, Sep 23, 2021 at 1:19 PM Dmitry Vyukov <dvyukov@google.com> wrote:
> >
> > On Thu, 23 Sept 2021 at 12:48, Marco Elver <elver@google.com> wrote:
> > >
> > > One of KFENCE's main design principles is that with increasing uptime,
> > > allocation coverage increases sufficiently to detect previously
> > > undetected bugs.
> > >
> > > We have observed that frequent long-lived allocations of the same
> > > source (e.g. pagecache) tend to permanently fill up the KFENCE pool
> > > with increasing system uptime, thus breaking the above requirement.
> > > The workaround thus far had been increasing the sample interval and/or
> > > increasing the KFENCE pool size, but is no reliable solution.
> > >
> > > To ensure diverse coverage of allocations, limit currently covered
> > > allocations of the same source once pool utilization reaches 75%
> > > (configurable via `kfence.skip_covered_thresh`) or above. The effect is
> > > retaining reasonable allocation coverage when the pool is close to full.
> > >
> > > A side-effect is that this also limits frequent long-lived allocations
> > > of the same source filling up the pool permanently.
> > >
> > > Uniqueness of an allocation for coverage purposes is based on its
> > > (partial) allocation stack trace (the source). A Counting Bloom filter
> > > is used to check if an allocation is covered; if the allocation is
> > > currently covered, the allocation is skipped by KFENCE.
> > >
> > > Testing was done using:
> > >
> > >         (a) a synthetic workload that performs frequent long-lived
> > >             allocations (default config values; sample_interval=1;
> > >             num_objects=63), and
> > >
> > >         (b) normal desktop workloads on an otherwise idle machine where
> > >             the problem was first reported after a few days of uptime
> > >             (default config values).
> > >
> > > In both test cases the sampled allocation rate no longer drops to zero
> > > at any point. In the case of (b) we observe (after 2 days uptime) 15%
> > > unique allocations in the pool, 77% pool utilization, with 20% "skipped
> > > allocations (covered)".
> > >
> > > Signed-off-by: Marco Elver <elver@google.com>
> >
> > Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
> Acked-by: Alexander Potapenko <glider@google.com>

Thank you both!

> > > ---
> > > v3:
> > > * Remove unneeded !alloc_stack_hash checks.
> > > * Remove unneeded meta->alloc_stack_hash=0 in kfence_guarded_free().
> > >
> > > v2:
> > > * Switch to counting bloom filter to guarantee currently covered
> > >   allocations being skipped.
> > > * Use a module param for skip_covered threshold.
> > > * Use kfence pool address as hash entropy.
> > > * Use filter_irq_stacks().
> > > ---
> > >  mm/kfence/core.c   | 103 ++++++++++++++++++++++++++++++++++++++++++++-
> > >  mm/kfence/kfence.h |   2 +
> > >  2 files changed, 103 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> > > index db01814f8ff0..58a0f6f1acc5 100644
> > > --- a/mm/kfence/core.c
> > > +++ b/mm/kfence/core.c
> > > @@ -11,11 +11,13 @@
> > >  #include <linux/bug.h>
> > >  #include <linux/debugfs.h>
> > >  #include <linux/irq_work.h>
> > > +#include <linux/jhash.h>
> > >  #include <linux/kcsan-checks.h>
> > >  #include <linux/kfence.h>
> > >  #include <linux/kmemleak.h>
> > >  #include <linux/list.h>
> > >  #include <linux/lockdep.h>
> > > +#include <linux/log2.h>
> > >  #include <linux/memblock.h>
> > >  #include <linux/moduleparam.h>
> > >  #include <linux/random.h>
> > > @@ -82,6 +84,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
> > >  };
> > >  module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
> > >
> > > +/* Pool usage% threshold when currently covered allocations are skipped. */
> > > +static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
> > > +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
> > > +
> > >  /* The pool of pages used for guard pages and objects. */
> > >  char *__kfence_pool __ro_after_init;
> > >  EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
> > > @@ -105,6 +111,25 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
> > >  /* Gates the allocation, ensuring only one succeeds in a given period. */
> > >  atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
> > >
> > > +/*
> > > + * A Counting Bloom filter of allocation coverage: limits currently covered
> > > + * allocations of the same source filling up the pool.
> > > + *
> > > + * Assuming a range of 15%-85% unique allocations in the pool at any point in
>
> Where do these 85% come from?

An imaginary worst case, just to illustrate the range of the false
positive probabilities (in the case of 85% it'd be 0.33). I expect
unique allocations to be around 10-15% on a freshly booted system (on
my real-system-experiment it stayed below 15%), but other workloads
may produce other unique allocations%.

> > > + * time, the below parameters provide a probablity of 0.02-0.33 for false
> > > + * positive hits respectively:
> > > + *
> > > + *     P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
> > > + */
> > > +#define ALLOC_COVERED_HNUM     2
> > > +#define ALLOC_COVERED_SIZE     (1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
> > > +#define ALLOC_COVERED_HNEXT(h) (1664525 * (h) + 1013904223)
>
> Unless we are planning to change these primes, can you use
> next_pseudo_random32() instead?

I'm worried about next_pseudo_random32() changing their implementation
to longer be deterministic or change in other ways that break our
usecase. In this case we want pseudorandomness, but we're not
implementing a PRNG.

Open-coding the constants (given they are from "Numerical Recipes") is
more reliable and doesn't introduce unwanted reliance on
next_pseudo_random32()'s behaviour.

Thanks,
-- Marco

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
@ 2021-09-23 13:44         ` Marco Elver
  0 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-23 13:44 UTC (permalink / raw)
  To: Alexander Potapenko
  Cc: Dmitry Vyukov, Andrew Morton, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, 23 Sept 2021 at 15:24, Alexander Potapenko <glider@google.com> wrote:
>
> On Thu, Sep 23, 2021 at 1:19 PM Dmitry Vyukov <dvyukov@google.com> wrote:
> >
> > On Thu, 23 Sept 2021 at 12:48, Marco Elver <elver@google.com> wrote:
> > >
> > > One of KFENCE's main design principles is that with increasing uptime,
> > > allocation coverage increases sufficiently to detect previously
> > > undetected bugs.
> > >
> > > We have observed that frequent long-lived allocations of the same
> > > source (e.g. pagecache) tend to permanently fill up the KFENCE pool
> > > with increasing system uptime, thus breaking the above requirement.
> > > The workaround thus far had been increasing the sample interval and/or
> > > increasing the KFENCE pool size, but is no reliable solution.
> > >
> > > To ensure diverse coverage of allocations, limit currently covered
> > > allocations of the same source once pool utilization reaches 75%
> > > (configurable via `kfence.skip_covered_thresh`) or above. The effect is
> > > retaining reasonable allocation coverage when the pool is close to full.
> > >
> > > A side-effect is that this also limits frequent long-lived allocations
> > > of the same source filling up the pool permanently.
> > >
> > > Uniqueness of an allocation for coverage purposes is based on its
> > > (partial) allocation stack trace (the source). A Counting Bloom filter
> > > is used to check if an allocation is covered; if the allocation is
> > > currently covered, the allocation is skipped by KFENCE.
> > >
> > > Testing was done using:
> > >
> > >         (a) a synthetic workload that performs frequent long-lived
> > >             allocations (default config values; sample_interval=1;
> > >             num_objects=63), and
> > >
> > >         (b) normal desktop workloads on an otherwise idle machine where
> > >             the problem was first reported after a few days of uptime
> > >             (default config values).
> > >
> > > In both test cases the sampled allocation rate no longer drops to zero
> > > at any point. In the case of (b) we observe (after 2 days uptime) 15%
> > > unique allocations in the pool, 77% pool utilization, with 20% "skipped
> > > allocations (covered)".
> > >
> > > Signed-off-by: Marco Elver <elver@google.com>
> >
> > Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
> Acked-by: Alexander Potapenko <glider@google.com>

Thank you both!

> > > ---
> > > v3:
> > > * Remove unneeded !alloc_stack_hash checks.
> > > * Remove unneeded meta->alloc_stack_hash=0 in kfence_guarded_free().
> > >
> > > v2:
> > > * Switch to counting bloom filter to guarantee currently covered
> > >   allocations being skipped.
> > > * Use a module param for skip_covered threshold.
> > > * Use kfence pool address as hash entropy.
> > > * Use filter_irq_stacks().
> > > ---
> > >  mm/kfence/core.c   | 103 ++++++++++++++++++++++++++++++++++++++++++++-
> > >  mm/kfence/kfence.h |   2 +
> > >  2 files changed, 103 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> > > index db01814f8ff0..58a0f6f1acc5 100644
> > > --- a/mm/kfence/core.c
> > > +++ b/mm/kfence/core.c
> > > @@ -11,11 +11,13 @@
> > >  #include <linux/bug.h>
> > >  #include <linux/debugfs.h>
> > >  #include <linux/irq_work.h>
> > > +#include <linux/jhash.h>
> > >  #include <linux/kcsan-checks.h>
> > >  #include <linux/kfence.h>
> > >  #include <linux/kmemleak.h>
> > >  #include <linux/list.h>
> > >  #include <linux/lockdep.h>
> > > +#include <linux/log2.h>
> > >  #include <linux/memblock.h>
> > >  #include <linux/moduleparam.h>
> > >  #include <linux/random.h>
> > > @@ -82,6 +84,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
> > >  };
> > >  module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
> > >
> > > +/* Pool usage% threshold when currently covered allocations are skipped. */
> > > +static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
> > > +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
> > > +
> > >  /* The pool of pages used for guard pages and objects. */
> > >  char *__kfence_pool __ro_after_init;
> > >  EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
> > > @@ -105,6 +111,25 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
> > >  /* Gates the allocation, ensuring only one succeeds in a given period. */
> > >  atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
> > >
> > > +/*
> > > + * A Counting Bloom filter of allocation coverage: limits currently covered
> > > + * allocations of the same source filling up the pool.
> > > + *
> > > + * Assuming a range of 15%-85% unique allocations in the pool at any point in
>
> Where do these 85% come from?

An imaginary worst case, just to illustrate the range of the false
positive probabilities (in the case of 85% it'd be 0.33). I expect
unique allocations to be around 10-15% on a freshly booted system (on
my real-system-experiment it stayed below 15%), but other workloads
may produce other unique allocations%.

> > > + * time, the below parameters provide a probablity of 0.02-0.33 for false
> > > + * positive hits respectively:
> > > + *
> > > + *     P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
> > > + */
> > > +#define ALLOC_COVERED_HNUM     2
> > > +#define ALLOC_COVERED_SIZE     (1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
> > > +#define ALLOC_COVERED_HNEXT(h) (1664525 * (h) + 1013904223)
>
> Unless we are planning to change these primes, can you use
> next_pseudo_random32() instead?

I'm worried about next_pseudo_random32() changing their implementation
to longer be deterministic or change in other ways that break our
usecase. In this case we want pseudorandomness, but we're not
implementing a PRNG.

Open-coding the constants (given they are from "Numerical Recipes") is
more reliable and doesn't introduce unwanted reliance on
next_pseudo_random32()'s behaviour.

Thanks,
-- Marco


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
  2021-09-23 13:44         ` Marco Elver
@ 2021-09-23 13:46           ` Alexander Potapenko
  -1 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 13:46 UTC (permalink / raw)
  To: Marco Elver
  Cc: Dmitry Vyukov, Andrew Morton, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 3:44 PM Marco Elver <elver@google.com> wrote:
>
> On Thu, 23 Sept 2021 at 15:24, Alexander Potapenko <glider@google.com> wrote:
> >
> > On Thu, Sep 23, 2021 at 1:19 PM Dmitry Vyukov <dvyukov@google.com> wrote:
> > >
> > > On Thu, 23 Sept 2021 at 12:48, Marco Elver <elver@google.com> wrote:
> > > >
> > > > One of KFENCE's main design principles is that with increasing uptime,
> > > > allocation coverage increases sufficiently to detect previously
> > > > undetected bugs.
> > > >
> > > > We have observed that frequent long-lived allocations of the same
> > > > source (e.g. pagecache) tend to permanently fill up the KFENCE pool
> > > > with increasing system uptime, thus breaking the above requirement.
> > > > The workaround thus far had been increasing the sample interval and/or
> > > > increasing the KFENCE pool size, but is no reliable solution.
> > > >
> > > > To ensure diverse coverage of allocations, limit currently covered
> > > > allocations of the same source once pool utilization reaches 75%
> > > > (configurable via `kfence.skip_covered_thresh`) or above. The effect is
> > > > retaining reasonable allocation coverage when the pool is close to full.
> > > >
> > > > A side-effect is that this also limits frequent long-lived allocations
> > > > of the same source filling up the pool permanently.
> > > >
> > > > Uniqueness of an allocation for coverage purposes is based on its
> > > > (partial) allocation stack trace (the source). A Counting Bloom filter
> > > > is used to check if an allocation is covered; if the allocation is
> > > > currently covered, the allocation is skipped by KFENCE.
> > > >
> > > > Testing was done using:
> > > >
> > > >         (a) a synthetic workload that performs frequent long-lived
> > > >             allocations (default config values; sample_interval=1;
> > > >             num_objects=63), and
> > > >
> > > >         (b) normal desktop workloads on an otherwise idle machine where
> > > >             the problem was first reported after a few days of uptime
> > > >             (default config values).
> > > >
> > > > In both test cases the sampled allocation rate no longer drops to zero
> > > > at any point. In the case of (b) we observe (after 2 days uptime) 15%
> > > > unique allocations in the pool, 77% pool utilization, with 20% "skipped
> > > > allocations (covered)".
> > > >
> > > > Signed-off-by: Marco Elver <elver@google.com>
> > >
> > > Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
> > Acked-by: Alexander Potapenko <glider@google.com>
>
> Thank you both!
>
> > > > ---
> > > > v3:
> > > > * Remove unneeded !alloc_stack_hash checks.
> > > > * Remove unneeded meta->alloc_stack_hash=0 in kfence_guarded_free().
> > > >
> > > > v2:
> > > > * Switch to counting bloom filter to guarantee currently covered
> > > >   allocations being skipped.
> > > > * Use a module param for skip_covered threshold.
> > > > * Use kfence pool address as hash entropy.
> > > > * Use filter_irq_stacks().
> > > > ---
> > > >  mm/kfence/core.c   | 103 ++++++++++++++++++++++++++++++++++++++++++++-
> > > >  mm/kfence/kfence.h |   2 +
> > > >  2 files changed, 103 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> > > > index db01814f8ff0..58a0f6f1acc5 100644
> > > > --- a/mm/kfence/core.c
> > > > +++ b/mm/kfence/core.c
> > > > @@ -11,11 +11,13 @@
> > > >  #include <linux/bug.h>
> > > >  #include <linux/debugfs.h>
> > > >  #include <linux/irq_work.h>
> > > > +#include <linux/jhash.h>
> > > >  #include <linux/kcsan-checks.h>
> > > >  #include <linux/kfence.h>
> > > >  #include <linux/kmemleak.h>
> > > >  #include <linux/list.h>
> > > >  #include <linux/lockdep.h>
> > > > +#include <linux/log2.h>
> > > >  #include <linux/memblock.h>
> > > >  #include <linux/moduleparam.h>
> > > >  #include <linux/random.h>
> > > > @@ -82,6 +84,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
> > > >  };
> > > >  module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
> > > >
> > > > +/* Pool usage% threshold when currently covered allocations are skipped. */
> > > > +static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
> > > > +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
> > > > +
> > > >  /* The pool of pages used for guard pages and objects. */
> > > >  char *__kfence_pool __ro_after_init;
> > > >  EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
> > > > @@ -105,6 +111,25 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
> > > >  /* Gates the allocation, ensuring only one succeeds in a given period. */
> > > >  atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
> > > >
> > > > +/*
> > > > + * A Counting Bloom filter of allocation coverage: limits currently covered
> > > > + * allocations of the same source filling up the pool.
> > > > + *
> > > > + * Assuming a range of 15%-85% unique allocations in the pool at any point in
> >
> > Where do these 85% come from?
>
> An imaginary worst case, just to illustrate the range of the false
> positive probabilities (in the case of 85% it'd be 0.33). I expect
> unique allocations to be around 10-15% on a freshly booted system (on
> my real-system-experiment it stayed below 15%), but other workloads
> may produce other unique allocations%.
>
> > > > + * time, the below parameters provide a probablity of 0.02-0.33 for false
> > > > + * positive hits respectively:
> > > > + *
> > > > + *     P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
> > > > + */
> > > > +#define ALLOC_COVERED_HNUM     2
> > > > +#define ALLOC_COVERED_SIZE     (1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
> > > > +#define ALLOC_COVERED_HNEXT(h) (1664525 * (h) + 1013904223)
> >
> > Unless we are planning to change these primes, can you use
> > next_pseudo_random32() instead?
>
> I'm worried about next_pseudo_random32() changing their implementation
> to longer be deterministic or change in other ways that break our
> usecase. In this case we want pseudorandomness, but we're not
> implementing a PRNG.
>
> Open-coding the constants (given they are from "Numerical Recipes") is
> more reliable and doesn't introduce unwanted reliance on
> next_pseudo_random32()'s behaviour.

Okay, fair enough.

>
> Thanks,
> -- Marco



-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
@ 2021-09-23 13:46           ` Alexander Potapenko
  0 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 13:46 UTC (permalink / raw)
  To: Marco Elver
  Cc: Dmitry Vyukov, Andrew Morton, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 3:44 PM Marco Elver <elver@google.com> wrote:
>
> On Thu, 23 Sept 2021 at 15:24, Alexander Potapenko <glider@google.com> wrote:
> >
> > On Thu, Sep 23, 2021 at 1:19 PM Dmitry Vyukov <dvyukov@google.com> wrote:
> > >
> > > On Thu, 23 Sept 2021 at 12:48, Marco Elver <elver@google.com> wrote:
> > > >
> > > > One of KFENCE's main design principles is that with increasing uptime,
> > > > allocation coverage increases sufficiently to detect previously
> > > > undetected bugs.
> > > >
> > > > We have observed that frequent long-lived allocations of the same
> > > > source (e.g. pagecache) tend to permanently fill up the KFENCE pool
> > > > with increasing system uptime, thus breaking the above requirement.
> > > > The workaround thus far had been increasing the sample interval and/or
> > > > increasing the KFENCE pool size, but is no reliable solution.
> > > >
> > > > To ensure diverse coverage of allocations, limit currently covered
> > > > allocations of the same source once pool utilization reaches 75%
> > > > (configurable via `kfence.skip_covered_thresh`) or above. The effect is
> > > > retaining reasonable allocation coverage when the pool is close to full.
> > > >
> > > > A side-effect is that this also limits frequent long-lived allocations
> > > > of the same source filling up the pool permanently.
> > > >
> > > > Uniqueness of an allocation for coverage purposes is based on its
> > > > (partial) allocation stack trace (the source). A Counting Bloom filter
> > > > is used to check if an allocation is covered; if the allocation is
> > > > currently covered, the allocation is skipped by KFENCE.
> > > >
> > > > Testing was done using:
> > > >
> > > >         (a) a synthetic workload that performs frequent long-lived
> > > >             allocations (default config values; sample_interval=1;
> > > >             num_objects=63), and
> > > >
> > > >         (b) normal desktop workloads on an otherwise idle machine where
> > > >             the problem was first reported after a few days of uptime
> > > >             (default config values).
> > > >
> > > > In both test cases the sampled allocation rate no longer drops to zero
> > > > at any point. In the case of (b) we observe (after 2 days uptime) 15%
> > > > unique allocations in the pool, 77% pool utilization, with 20% "skipped
> > > > allocations (covered)".
> > > >
> > > > Signed-off-by: Marco Elver <elver@google.com>
> > >
> > > Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
> > Acked-by: Alexander Potapenko <glider@google.com>
>
> Thank you both!
>
> > > > ---
> > > > v3:
> > > > * Remove unneeded !alloc_stack_hash checks.
> > > > * Remove unneeded meta->alloc_stack_hash=0 in kfence_guarded_free().
> > > >
> > > > v2:
> > > > * Switch to counting bloom filter to guarantee currently covered
> > > >   allocations being skipped.
> > > > * Use a module param for skip_covered threshold.
> > > > * Use kfence pool address as hash entropy.
> > > > * Use filter_irq_stacks().
> > > > ---
> > > >  mm/kfence/core.c   | 103 ++++++++++++++++++++++++++++++++++++++++++++-
> > > >  mm/kfence/kfence.h |   2 +
> > > >  2 files changed, 103 insertions(+), 2 deletions(-)
> > > >
> > > > diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> > > > index db01814f8ff0..58a0f6f1acc5 100644
> > > > --- a/mm/kfence/core.c
> > > > +++ b/mm/kfence/core.c
> > > > @@ -11,11 +11,13 @@
> > > >  #include <linux/bug.h>
> > > >  #include <linux/debugfs.h>
> > > >  #include <linux/irq_work.h>
> > > > +#include <linux/jhash.h>
> > > >  #include <linux/kcsan-checks.h>
> > > >  #include <linux/kfence.h>
> > > >  #include <linux/kmemleak.h>
> > > >  #include <linux/list.h>
> > > >  #include <linux/lockdep.h>
> > > > +#include <linux/log2.h>
> > > >  #include <linux/memblock.h>
> > > >  #include <linux/moduleparam.h>
> > > >  #include <linux/random.h>
> > > > @@ -82,6 +84,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
> > > >  };
> > > >  module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
> > > >
> > > > +/* Pool usage% threshold when currently covered allocations are skipped. */
> > > > +static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
> > > > +module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
> > > > +
> > > >  /* The pool of pages used for guard pages and objects. */
> > > >  char *__kfence_pool __ro_after_init;
> > > >  EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
> > > > @@ -105,6 +111,25 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
> > > >  /* Gates the allocation, ensuring only one succeeds in a given period. */
> > > >  atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
> > > >
> > > > +/*
> > > > + * A Counting Bloom filter of allocation coverage: limits currently covered
> > > > + * allocations of the same source filling up the pool.
> > > > + *
> > > > + * Assuming a range of 15%-85% unique allocations in the pool at any point in
> >
> > Where do these 85% come from?
>
> An imaginary worst case, just to illustrate the range of the false
> positive probabilities (in the case of 85% it'd be 0.33). I expect
> unique allocations to be around 10-15% on a freshly booted system (on
> my real-system-experiment it stayed below 15%), but other workloads
> may produce other unique allocations%.
>
> > > > + * time, the below parameters provide a probablity of 0.02-0.33 for false
> > > > + * positive hits respectively:
> > > > + *
> > > > + *     P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
> > > > + */
> > > > +#define ALLOC_COVERED_HNUM     2
> > > > +#define ALLOC_COVERED_SIZE     (1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
> > > > +#define ALLOC_COVERED_HNEXT(h) (1664525 * (h) + 1013904223)
> >
> > Unless we are planning to change these primes, can you use
> > next_pseudo_random32() instead?
>
> I'm worried about next_pseudo_random32() changing their implementation
> to longer be deterministic or change in other ways that break our
> usecase. In this case we want pseudorandomness, but we're not
> implementing a PRNG.
>
> Open-coding the constants (given they are from "Numerical Recipes") is
> more reliable and doesn't introduce unwanted reliance on
> next_pseudo_random32()'s behaviour.

Okay, fair enough.

>
> Thanks,
> -- Marco



-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 5/5] kfence: add note to documentation about skipping covered allocations
  2021-09-23 10:48   ` Marco Elver
@ 2021-09-23 15:46     ` Alexander Potapenko
  -1 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 15:46 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 12:48 PM Marco Elver <elver@google.com> wrote:
>
> Add a note briefly mentioning the new policy about "skipping currently
> covered allocations if pool close to full." Since this has a notable
> impact on KFENCE's bug-detection ability on systems with large uptimes,
> it is worth pointing out the feature.
>
> Signed-off-by: Marco Elver <elver@google.com>
> Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Alexander Potapenko <glider@google.com>

> ---
> v2:
> * Rewrite.
> ---
>  Documentation/dev-tools/kfence.rst | 11 +++++++++++
>  1 file changed, 11 insertions(+)
>
> diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
> index 0fbe3308bf37..d45f952986ae 100644
> --- a/Documentation/dev-tools/kfence.rst
> +++ b/Documentation/dev-tools/kfence.rst
> @@ -269,6 +269,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
>  first, and the chances of detecting use-after-frees of recently freed objects
>  is increased.
>
> +If pool utilization reaches 75% (default) or above, to reduce the risk of the
> +pool eventually being fully occupied by allocated objects yet ensure diverse
> +coverage of allocations, KFENCE limits currently covered allocations of the
> +same source from further filling up the pool. The "source" of an allocation is
> +based on its partial allocation stack trace. A side-effect is that this also
> +limits frequent long-lived allocations (e.g. pagecache) of the same source
> +filling up the pool permanently, which is the most common risk for the pool
> +becoming full and the sampled allocation rate dropping to zero. The threshold
> +at which to start limiting currently covered allocations can be configured via
> +the boot parameter ``kfence.skip_covered_thresh`` (pool usage%).
> +
>  Interface
>  ---------
>
> --
> 2.33.0.464.g1972c5931b-goog
>


-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg


^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 5/5] kfence: add note to documentation about skipping covered allocations
@ 2021-09-23 15:46     ` Alexander Potapenko
  0 siblings, 0 replies; 28+ messages in thread
From: Alexander Potapenko @ 2021-09-23 15:46 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 12:48 PM Marco Elver <elver@google.com> wrote:
>
> Add a note briefly mentioning the new policy about "skipping currently
> covered allocations if pool close to full." Since this has a notable
> impact on KFENCE's bug-detection ability on systems with large uptimes,
> it is worth pointing out the feature.
>
> Signed-off-by: Marco Elver <elver@google.com>
> Reviewed-by: Dmitry Vyukov <dvyukov@google.com>
Acked-by: Alexander Potapenko <glider@google.com>

> ---
> v2:
> * Rewrite.
> ---
>  Documentation/dev-tools/kfence.rst | 11 +++++++++++
>  1 file changed, 11 insertions(+)
>
> diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
> index 0fbe3308bf37..d45f952986ae 100644
> --- a/Documentation/dev-tools/kfence.rst
> +++ b/Documentation/dev-tools/kfence.rst
> @@ -269,6 +269,17 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
>  first, and the chances of detecting use-after-frees of recently freed objects
>  is increased.
>
> +If pool utilization reaches 75% (default) or above, to reduce the risk of the
> +pool eventually being fully occupied by allocated objects yet ensure diverse
> +coverage of allocations, KFENCE limits currently covered allocations of the
> +same source from further filling up the pool. The "source" of an allocation is
> +based on its partial allocation stack trace. A side-effect is that this also
> +limits frequent long-lived allocations (e.g. pagecache) of the same source
> +filling up the pool permanently, which is the most common risk for the pool
> +becoming full and the sampled allocation rate dropping to zero. The threshold
> +at which to start limiting currently covered allocations can be configured via
> +the boot parameter ``kfence.skip_covered_thresh`` (pool usage%).
> +
>  Interface
>  ---------
>
> --
> 2.33.0.464.g1972c5931b-goog
>


-- 
Alexander Potapenko
Software Engineer

Google Germany GmbH
Erika-Mann-Straße, 33
80636 München

Geschäftsführer: Paul Manicle, Halimah DeLaine Prado
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
  2021-09-23 13:44         ` Marco Elver
  (?)
  (?)
@ 2021-09-23 23:28         ` Andrew Morton
  2021-09-24 13:01           ` Marco Elver
  -1 siblings, 1 reply; 28+ messages in thread
From: Andrew Morton @ 2021-09-23 23:28 UTC (permalink / raw)
  To: Marco Elver
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, 23 Sep 2021 15:44:10 +0200 Marco Elver <elver@google.com> wrote:

> > > > + * time, the below parameters provide a probablity of 0.02-0.33 for false
> > > > + * positive hits respectively:
> > > > + *
> > > > + *     P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
> > > > + */
> > > > +#define ALLOC_COVERED_HNUM     2
> > > > +#define ALLOC_COVERED_SIZE     (1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
> > > > +#define ALLOC_COVERED_HNEXT(h) (1664525 * (h) + 1013904223)
> >
> > Unless we are planning to change these primes, can you use
> > next_pseudo_random32() instead?
> 
> I'm worried about next_pseudo_random32() changing their implementation
> to longer be deterministic or change in other ways that break our
> usecase. In this case we want pseudorandomness, but we're not
> implementing a PRNG.
> 
> Open-coding the constants (given they are from "Numerical Recipes") is
> more reliable and doesn't introduce unwanted reliance on
> next_pseudo_random32()'s behaviour.

Perhaps we could summarize this in an additional comment?

Also, this:

+static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
+{
+	/* Some randomness across reboots / different machines. */
+	u32 seed = (u32)((unsigned long)__kfence_pool >> (BITS_PER_LONG - 32));

seems a bit weak.  Would it be better to seed this at boot time with
a randomish number?

^ permalink raw reply	[flat|nested] 28+ messages in thread

* Re: [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full
  2021-09-23 23:28         ` Andrew Morton
@ 2021-09-24 13:01           ` Marco Elver
  0 siblings, 0 replies; 28+ messages in thread
From: Marco Elver @ 2021-09-24 13:01 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Jann Horn, Aleksandr Nogikh,
	Taras Madan, LKML, Linux Memory Management List, kasan-dev

On Thu, Sep 23, 2021 at 04:28PM -0700, Andrew Morton wrote:
> On Thu, 23 Sep 2021 15:44:10 +0200 Marco Elver <elver@google.com> wrote:
[...]
> > I'm worried about next_pseudo_random32() changing their implementation
> > to longer be deterministic or change in other ways that break our
> > usecase. In this case we want pseudorandomness, but we're not
> > implementing a PRNG.
> > 
> > Open-coding the constants (given they are from "Numerical Recipes") is
> > more reliable and doesn't introduce unwanted reliance on
> > next_pseudo_random32()'s behaviour.
> 
> Perhaps we could summarize this in an additional comment?

Hmm, on second thought, while trying to write the comment realized it's
unnecessary altogether. I've switched to just using hash_32() which is
probably better suited.

> Also, this:
> 
> +static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
> +{
> +	/* Some randomness across reboots / different machines. */
> +	u32 seed = (u32)((unsigned long)__kfence_pool >> (BITS_PER_LONG - 32));
> 
> seems a bit weak.  Would it be better to seed this at boot time with
> a randomish number?

Sure, makes sense.

Both fixes are included in the below fixup. (Let me know if resending as
v4 is better, but I've seen the patches already appeared in -mm.)

Thank you!

-- Marco

------ >8 ------

From: Marco Elver <elver@google.com>
Date: Fri, 24 Sep 2021 14:17:38 +0200
Subject: [PATCH] fixup! kfence: limit currently covered allocations when pool
 nearly full

* Simplify and just use hash_32().
* Use more random stack_hash_seed.

Signed-off-by: Marco Elver <elver@google.com>
---
 mm/kfence/core.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 58a0f6f1acc5..545999d04af4 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -10,6 +10,7 @@
 #include <linux/atomic.h>
 #include <linux/bug.h>
 #include <linux/debugfs.h>
+#include <linux/hash.h>
 #include <linux/irq_work.h>
 #include <linux/jhash.h>
 #include <linux/kcsan-checks.h>
@@ -122,14 +123,21 @@ atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
  *	P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)) ^ HNUM
  */
 #define ALLOC_COVERED_HNUM	2
-#define ALLOC_COVERED_SIZE	(1 << (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2))
-#define ALLOC_COVERED_HNEXT(h)	(1664525 * (h) + 1013904223)
+#define ALLOC_COVERED_ORDER	(const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
+#define ALLOC_COVERED_SIZE	(1 << ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_HNEXT(h)	hash_32(h, ALLOC_COVERED_ORDER)
 #define ALLOC_COVERED_MASK	(ALLOC_COVERED_SIZE - 1)
 static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
 
 /* Stack depth used to determine uniqueness of an allocation. */
 #define UNIQUE_ALLOC_STACK_DEPTH 8UL
 
+/*
+ * Randomness for stack hashes, making the same collisions across reboots and
+ * different machines less likely.
+ */
+static u32 stack_hash_seed __ro_after_init;
+
 /* Statistics counters for debugfs. */
 enum kfence_counter_id {
 	KFENCE_COUNTER_ALLOCATED,
@@ -166,12 +174,9 @@ static inline bool should_skip_covered(void)
 
 static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
 {
-	/* Some randomness across reboots / different machines. */
-	u32 seed = (u32)((unsigned long)__kfence_pool >> (BITS_PER_LONG - 32));
-
 	num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
 	num_entries = filter_irq_stacks(stack_entries, num_entries);
-	return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), seed);
+	return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
 }
 
 /*
@@ -759,6 +764,7 @@ void __init kfence_init(void)
 	if (!kfence_sample_interval)
 		return;
 
+	stack_hash_seed = (u32)random_get_entropy();
 	if (!kfence_init_pool()) {
 		pr_err("%s failed\n", __func__);
 		return;
-- 
2.33.0.685.g46640cef36-goog


^ permalink raw reply related	[flat|nested] 28+ messages in thread

end of thread, other threads:[~2021-09-24 13:26 UTC | newest]

Thread overview: 28+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-23 10:47 [PATCH v3 1/5] stacktrace: move filter_irq_stacks() to kernel/stacktrace.c Marco Elver
2021-09-23 10:47 ` Marco Elver
2021-09-23 10:48 ` [PATCH v3 2/5] kfence: count unexpectedly skipped allocations Marco Elver
2021-09-23 10:48   ` Marco Elver
2021-09-23 11:15   ` Alexander Potapenko
2021-09-23 11:15     ` Alexander Potapenko
2021-09-23 10:48 ` [PATCH v3 3/5] kfence: move saving stack trace of allocations into __kfence_alloc() Marco Elver
2021-09-23 10:48   ` Marco Elver
2021-09-23 11:32   ` Alexander Potapenko
2021-09-23 11:32     ` Alexander Potapenko
2021-09-23 10:48 ` [PATCH v3 4/5] kfence: limit currently covered allocations when pool nearly full Marco Elver
2021-09-23 10:48   ` Marco Elver
2021-09-23 11:18   ` Dmitry Vyukov
2021-09-23 11:18     ` Dmitry Vyukov
2021-09-23 13:23     ` Alexander Potapenko
2021-09-23 13:23       ` Alexander Potapenko
2021-09-23 13:44       ` Marco Elver
2021-09-23 13:44         ` Marco Elver
2021-09-23 13:46         ` Alexander Potapenko
2021-09-23 13:46           ` Alexander Potapenko
2021-09-23 23:28         ` Andrew Morton
2021-09-24 13:01           ` Marco Elver
2021-09-23 10:48 ` [PATCH v3 5/5] kfence: add note to documentation about skipping covered allocations Marco Elver
2021-09-23 10:48   ` Marco Elver
2021-09-23 15:46   ` Alexander Potapenko
2021-09-23 15:46     ` Alexander Potapenko
2021-09-23 11:14 ` [PATCH v3 1/5] stacktrace: move filter_irq_stacks() to kernel/stacktrace.c Alexander Potapenko
2021-09-23 11:14   ` Alexander Potapenko

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.