All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] kfence: count unexpectedly skipped allocations
@ 2021-09-17 11:07 ` Marco Elver
  0 siblings, 0 replies; 14+ messages in thread
From: Marco Elver @ 2021-09-17 11:07 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

Maintain a counter to count allocations that are skipped due to being
incompatible (oversized, incompatible gfp flags) or no capacity.

This is to compute the fraction of allocations that could not be
serviced by KFENCE, which we expect to be rare.

Signed-off-by: Marco Elver <elver@google.com>
---
 mm/kfence/core.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 7a97db8bc8e7..2755800f3e2a 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -112,6 +112,8 @@ enum kfence_counter_id {
 	KFENCE_COUNTER_FREES,
 	KFENCE_COUNTER_ZOMBIES,
 	KFENCE_COUNTER_BUGS,
+	KFENCE_COUNTER_SKIP_INCOMPAT,
+	KFENCE_COUNTER_SKIP_CAPACITY,
 	KFENCE_COUNTER_COUNT,
 };
 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -121,6 +123,8 @@ static const char *const counter_names[] = {
 	[KFENCE_COUNTER_FREES]		= "total frees",
 	[KFENCE_COUNTER_ZOMBIES]	= "zombie allocations",
 	[KFENCE_COUNTER_BUGS]		= "total bugs",
+	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
+	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
 };
 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
 
@@ -272,7 +276,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	}
 	raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
 	if (!meta)
-		return NULL;
+		goto no_capacity;
 
 	if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
 		/*
@@ -289,7 +293,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 		list_add_tail(&meta->list, &kfence_freelist);
 		raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
 
-		return NULL;
+		goto no_capacity;
 	}
 
 	meta->addr = metadata_to_pageaddr(meta);
@@ -349,6 +353,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
 
 	return addr;
+
+no_capacity:
+	atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
+	return NULL;
 }
 
 static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
@@ -740,8 +748,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	 * Perform size check before switching kfence_allocation_gate, so that
 	 * we don't disable KFENCE without making an allocation.
 	 */
-	if (size > PAGE_SIZE)
+	if (size > PAGE_SIZE) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
 		return NULL;
+	}
 
 	/*
 	 * Skip allocations from non-default zones, including DMA. We cannot
@@ -749,8 +759,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	 * properties (e.g. reside in DMAable memory).
 	 */
 	if ((flags & GFP_ZONEMASK) ||
-	    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
+	    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
 		return NULL;
+	}
 
 	/*
 	 * allocation_gate only needs to become non-zero, so it doesn't make
-- 
2.33.0.464.g1972c5931b-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 1/3] kfence: count unexpectedly skipped allocations
@ 2021-09-17 11:07 ` Marco Elver
  0 siblings, 0 replies; 14+ messages in thread
From: Marco Elver @ 2021-09-17 11:07 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

Maintain a counter to count allocations that are skipped due to being
incompatible (oversized, incompatible gfp flags) or no capacity.

This is to compute the fraction of allocations that could not be
serviced by KFENCE, which we expect to be rare.

Signed-off-by: Marco Elver <elver@google.com>
---
 mm/kfence/core.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 7a97db8bc8e7..2755800f3e2a 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -112,6 +112,8 @@ enum kfence_counter_id {
 	KFENCE_COUNTER_FREES,
 	KFENCE_COUNTER_ZOMBIES,
 	KFENCE_COUNTER_BUGS,
+	KFENCE_COUNTER_SKIP_INCOMPAT,
+	KFENCE_COUNTER_SKIP_CAPACITY,
 	KFENCE_COUNTER_COUNT,
 };
 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -121,6 +123,8 @@ static const char *const counter_names[] = {
 	[KFENCE_COUNTER_FREES]		= "total frees",
 	[KFENCE_COUNTER_ZOMBIES]	= "zombie allocations",
 	[KFENCE_COUNTER_BUGS]		= "total bugs",
+	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
+	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
 };
 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
 
@@ -272,7 +276,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	}
 	raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
 	if (!meta)
-		return NULL;
+		goto no_capacity;
 
 	if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
 		/*
@@ -289,7 +293,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 		list_add_tail(&meta->list, &kfence_freelist);
 		raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
 
-		return NULL;
+		goto no_capacity;
 	}
 
 	meta->addr = metadata_to_pageaddr(meta);
@@ -349,6 +353,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
 
 	return addr;
+
+no_capacity:
+	atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
+	return NULL;
 }
 
 static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
@@ -740,8 +748,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	 * Perform size check before switching kfence_allocation_gate, so that
 	 * we don't disable KFENCE without making an allocation.
 	 */
-	if (size > PAGE_SIZE)
+	if (size > PAGE_SIZE) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
 		return NULL;
+	}
 
 	/*
 	 * Skip allocations from non-default zones, including DMA. We cannot
@@ -749,8 +759,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	 * properties (e.g. reside in DMAable memory).
 	 */
 	if ((flags & GFP_ZONEMASK) ||
-	    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
+	    (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
 		return NULL;
+	}
 
 	/*
 	 * allocation_gate only needs to become non-zero, so it doesn't make
-- 
2.33.0.464.g1972c5931b-goog



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/3] kfence: limit currently covered allocations when pool nearly full
  2021-09-17 11:07 ` Marco Elver
@ 2021-09-17 11:07   ` Marco Elver
  -1 siblings, 0 replies; 14+ messages in thread
From: Marco Elver @ 2021-09-17 11:07 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

One of KFENCE's main design principles is that with increasing uptime,
allocation coverage increases sufficiently to detect previously
undetected bugs.

We have observed that frequent long-lived allocations of the same
source (e.g. pagecache) tend to permanently fill up the KFENCE pool
with increasing system uptime, thus breaking the above requirement.
The workaround thus far had been increasing the sample interval and/or
increasing the KFENCE pool size, but is no reliable solution.

To ensure diverse coverage of allocations, limit currently covered
allocations of the same source once pool utilization reaches 75% or
above. The effect is retaining reasonable allocation coverage when the
pool is close to full.

A side-effect is that this also limits frequent long-lived allocations
of the same source filling up the pool permanently.

Uniqueness of an allocation for coverage purposes is based on its
(partial) allocation stack trace (the source). A lossy hash map is
used to check if an allocation is covered; if the allocation is
currently covered, the allocation is skipped by KFENCE.

Testing was done using:

	(a) a synthetic workload that performs frequent long-lived
	    allocations (default config values + <10ms sample intervals
	    + smaller-than-default pool sizes), and

	(b) normal desktop workloads on an otherwise idle machine where
	    the problem was first reported (default config values).

In the case of (b) the observed result confirms that sampled allocation
rate no longer drops to zero after a few days of uptime, all while
"allocations skipped (covered)" are no more than 2% of total sampled
allocations.

Signed-off-by: Marco Elver <elver@google.com>
---
 mm/kfence/core.c   | 120 ++++++++++++++++++++++++++++++++++++++++++++-
 mm/kfence/kfence.h |   2 +
 2 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 2755800f3e2a..3b78402d7a5e 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -11,11 +11,13 @@
 #include <linux/bug.h>
 #include <linux/debugfs.h>
 #include <linux/irq_work.h>
+#include <linux/jhash.h>
 #include <linux/kcsan-checks.h>
 #include <linux/kfence.h>
 #include <linux/kmemleak.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
+#include <linux/log2.h>
 #include <linux/memblock.h>
 #include <linux/moduleparam.h>
 #include <linux/random.h>
@@ -86,6 +88,28 @@ module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_inte
 char *__kfence_pool __ro_after_init;
 EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
 
+/*
+ * A lossy hash map of allocation stack trace coverage: limits currently covered
+ * allocations of the same source filling up the pool when close to full.
+ *
+ * The required data fits in 64 bits, and therefore we can avoid a per-entry (or
+ * global) lock by simply storing each entry's data in an atomic64_t.
+ */
+union alloc_covered_entry {
+	struct {
+		u32 alloc_stack_hash;	/* stack trace hash */
+		u32 covered;		/* current coverage count */
+	};
+	u64 entry;
+};
+#define ALLOC_COVERED_SIZE (1 << const_ilog2(CONFIG_KFENCE_NUM_OBJECTS | 128)) /* >= 128 */
+#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
+static atomic64_t alloc_covered[ALLOC_COVERED_SIZE];
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH 8
+/* Pool usage threshold when currently covered allocations are skipped. */
+#define SKIP_COVERED_THRESHOLD ((CONFIG_KFENCE_NUM_OBJECTS * 3) / 4) /* 75% */
+
 /*
  * Per-object metadata, with one-to-one mapping of object metadata to
  * backing pages (in __kfence_pool).
@@ -114,6 +138,7 @@ enum kfence_counter_id {
 	KFENCE_COUNTER_BUGS,
 	KFENCE_COUNTER_SKIP_INCOMPAT,
 	KFENCE_COUNTER_SKIP_CAPACITY,
+	KFENCE_COUNTER_SKIP_COVERED,
 	KFENCE_COUNTER_COUNT,
 };
 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -125,11 +150,73 @@ static const char *const counter_names[] = {
 	[KFENCE_COUNTER_BUGS]		= "total bugs",
 	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
 	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
+	[KFENCE_COUNTER_SKIP_COVERED]	= "skipped allocations (covered)",
 };
 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
 
 /* === Internals ============================================================ */
 
+static u32 get_alloc_stack_hash(void)
+{
+	unsigned long stack_entries[UNIQUE_ALLOC_STACK_DEPTH];
+	size_t num_entries;
+
+	num_entries = stack_trace_save(stack_entries, UNIQUE_ALLOC_STACK_DEPTH, 1);
+	return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), 0);
+}
+
+/*
+ * Check if the allocation stack trace hash @alloc_stack_hash is contained in
+ * @alloc_covered and currently covered.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+	union alloc_covered_entry entry;
+
+	if (!alloc_stack_hash)
+		return false;
+
+	entry.entry = (u64)atomic64_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+	return entry.alloc_stack_hash == alloc_stack_hash && entry.covered;
+}
+
+/*
+ * Adds (or subtracts) coverage count to entry corresponding to
+ * @alloc_stack_hash. If @alloc_stack_hash is not yet contained in
+ * @alloc_covered, resets (potentially evicting existing) entry.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+	union alloc_covered_entry old;
+	union alloc_covered_entry new;
+	atomic64_t *bucket;
+
+	if (!alloc_stack_hash)
+		return;
+
+	bucket = &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK];
+	old.entry = (u64)atomic64_read(bucket);
+	new.alloc_stack_hash = alloc_stack_hash;
+	do {
+		if (val > 0) {
+			new.covered = old.alloc_stack_hash == alloc_stack_hash
+					? old.covered + val	/* increment */
+					: val;			/* evict/reset */
+		} else if (old.alloc_stack_hash == alloc_stack_hash && old.covered) {
+			new.covered = old.covered + val;
+		} else {
+			/*
+			 * Hash mismatch or covered has become zero. The latter
+			 * is possible if we race with:
+			 *	reset (!= alloc_stack_hash)
+			 *	 -> reset (== alloc_stack_hash)
+			 *	  -> decrement
+			 */
+			break;
+		}
+	} while (!atomic64_try_cmpxchg_relaxed(bucket, (s64 *)&old.entry, (s64)new.entry));
+}
+
 static bool kfence_protect(unsigned long addr)
 {
 	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
@@ -261,7 +348,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
 	}
 }
 
-static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+static void *
+kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, u32 alloc_stack_hash)
 {
 	struct kfence_metadata *meta = NULL;
 	unsigned long flags;
@@ -322,6 +410,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
 	WRITE_ONCE(meta->cache, cache);
 	meta->size = size;
+	meta->alloc_stack_hash = alloc_stack_hash;
+
 	for_each_canary(meta, set_canary_byte);
 
 	/* Set required struct page fields. */
@@ -334,6 +424,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
+	alloc_covered_add(alloc_stack_hash, 1);
+
 	/* Memory initialization. */
 
 	/*
@@ -362,6 +454,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
 {
 	struct kcsan_scoped_access assert_page_exclusive;
+	u32 alloc_stack_hash;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&meta->lock, flags);
@@ -404,8 +497,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 	/* Mark the object as freed. */
 	metadata_update_state(meta, KFENCE_OBJECT_FREED);
 
+	alloc_stack_hash = meta->alloc_stack_hash;
+	meta->alloc_stack_hash = 0;
+
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
+	alloc_covered_add(alloc_stack_hash, -1);
+
 	/* Protect to detect use-after-frees. */
 	kfence_protect((unsigned long)addr);
 
@@ -744,6 +842,8 @@ void kfence_shutdown_cache(struct kmem_cache *s)
 
 void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 {
+	u32 alloc_stack_hash;
+
 	/*
 	 * Perform size check before switching kfence_allocation_gate, so that
 	 * we don't disable KFENCE without making an allocation.
@@ -788,7 +888,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	if (!READ_ONCE(kfence_enabled))
 		return NULL;
 
-	return kfence_guarded_alloc(s, size, flags);
+	/*
+	 * Do expensive check for coverage of allocation in slow-path after
+	 * allocation_gate has already become non-zero, even though it might
+	 * mean not making any allocation within a given sample interval.
+	 *
+	 * This ensures reasonable allocation coverage when the pool is almost
+	 * full, including avoiding long-lived allocations of the same source
+	 * filling up the pool (e.g. pagecache allocations).
+	 */
+	alloc_stack_hash = get_alloc_stack_hash();
+	if (atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > SKIP_COVERED_THRESHOLD &&
+	    alloc_covered_contains(alloc_stack_hash)) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+		return NULL;
+	}
+
+	return kfence_guarded_alloc(s, size, flags, alloc_stack_hash);
 }
 
 size_t kfence_ksize(const void *addr)
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index c1f23c61e5f9..2a2d5de9d379 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -87,6 +87,8 @@ struct kfence_metadata {
 	/* Allocation and free stack information. */
 	struct kfence_track alloc_track;
 	struct kfence_track free_track;
+	/* For updating alloc_covered on frees. */
+	u32 alloc_stack_hash;
 };
 
 extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
-- 
2.33.0.464.g1972c5931b-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 2/3] kfence: limit currently covered allocations when pool nearly full
@ 2021-09-17 11:07   ` Marco Elver
  0 siblings, 0 replies; 14+ messages in thread
From: Marco Elver @ 2021-09-17 11:07 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

One of KFENCE's main design principles is that with increasing uptime,
allocation coverage increases sufficiently to detect previously
undetected bugs.

We have observed that frequent long-lived allocations of the same
source (e.g. pagecache) tend to permanently fill up the KFENCE pool
with increasing system uptime, thus breaking the above requirement.
The workaround thus far had been increasing the sample interval and/or
increasing the KFENCE pool size, but is no reliable solution.

To ensure diverse coverage of allocations, limit currently covered
allocations of the same source once pool utilization reaches 75% or
above. The effect is retaining reasonable allocation coverage when the
pool is close to full.

A side-effect is that this also limits frequent long-lived allocations
of the same source filling up the pool permanently.

Uniqueness of an allocation for coverage purposes is based on its
(partial) allocation stack trace (the source). A lossy hash map is
used to check if an allocation is covered; if the allocation is
currently covered, the allocation is skipped by KFENCE.

Testing was done using:

	(a) a synthetic workload that performs frequent long-lived
	    allocations (default config values + <10ms sample intervals
	    + smaller-than-default pool sizes), and

	(b) normal desktop workloads on an otherwise idle machine where
	    the problem was first reported (default config values).

In the case of (b) the observed result confirms that sampled allocation
rate no longer drops to zero after a few days of uptime, all while
"allocations skipped (covered)" are no more than 2% of total sampled
allocations.

Signed-off-by: Marco Elver <elver@google.com>
---
 mm/kfence/core.c   | 120 ++++++++++++++++++++++++++++++++++++++++++++-
 mm/kfence/kfence.h |   2 +
 2 files changed, 120 insertions(+), 2 deletions(-)

diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 2755800f3e2a..3b78402d7a5e 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -11,11 +11,13 @@
 #include <linux/bug.h>
 #include <linux/debugfs.h>
 #include <linux/irq_work.h>
+#include <linux/jhash.h>
 #include <linux/kcsan-checks.h>
 #include <linux/kfence.h>
 #include <linux/kmemleak.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
+#include <linux/log2.h>
 #include <linux/memblock.h>
 #include <linux/moduleparam.h>
 #include <linux/random.h>
@@ -86,6 +88,28 @@ module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_inte
 char *__kfence_pool __ro_after_init;
 EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
 
+/*
+ * A lossy hash map of allocation stack trace coverage: limits currently covered
+ * allocations of the same source filling up the pool when close to full.
+ *
+ * The required data fits in 64 bits, and therefore we can avoid a per-entry (or
+ * global) lock by simply storing each entry's data in an atomic64_t.
+ */
+union alloc_covered_entry {
+	struct {
+		u32 alloc_stack_hash;	/* stack trace hash */
+		u32 covered;		/* current coverage count */
+	};
+	u64 entry;
+};
+#define ALLOC_COVERED_SIZE (1 << const_ilog2(CONFIG_KFENCE_NUM_OBJECTS | 128)) /* >= 128 */
+#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
+static atomic64_t alloc_covered[ALLOC_COVERED_SIZE];
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH 8
+/* Pool usage threshold when currently covered allocations are skipped. */
+#define SKIP_COVERED_THRESHOLD ((CONFIG_KFENCE_NUM_OBJECTS * 3) / 4) /* 75% */
+
 /*
  * Per-object metadata, with one-to-one mapping of object metadata to
  * backing pages (in __kfence_pool).
@@ -114,6 +138,7 @@ enum kfence_counter_id {
 	KFENCE_COUNTER_BUGS,
 	KFENCE_COUNTER_SKIP_INCOMPAT,
 	KFENCE_COUNTER_SKIP_CAPACITY,
+	KFENCE_COUNTER_SKIP_COVERED,
 	KFENCE_COUNTER_COUNT,
 };
 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -125,11 +150,73 @@ static const char *const counter_names[] = {
 	[KFENCE_COUNTER_BUGS]		= "total bugs",
 	[KFENCE_COUNTER_SKIP_INCOMPAT]	= "skipped allocations (incompatible)",
 	[KFENCE_COUNTER_SKIP_CAPACITY]	= "skipped allocations (capacity)",
+	[KFENCE_COUNTER_SKIP_COVERED]	= "skipped allocations (covered)",
 };
 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
 
 /* === Internals ============================================================ */
 
+static u32 get_alloc_stack_hash(void)
+{
+	unsigned long stack_entries[UNIQUE_ALLOC_STACK_DEPTH];
+	size_t num_entries;
+
+	num_entries = stack_trace_save(stack_entries, UNIQUE_ALLOC_STACK_DEPTH, 1);
+	return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), 0);
+}
+
+/*
+ * Check if the allocation stack trace hash @alloc_stack_hash is contained in
+ * @alloc_covered and currently covered.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+	union alloc_covered_entry entry;
+
+	if (!alloc_stack_hash)
+		return false;
+
+	entry.entry = (u64)atomic64_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+	return entry.alloc_stack_hash == alloc_stack_hash && entry.covered;
+}
+
+/*
+ * Adds (or subtracts) coverage count to entry corresponding to
+ * @alloc_stack_hash. If @alloc_stack_hash is not yet contained in
+ * @alloc_covered, resets (potentially evicting existing) entry.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+	union alloc_covered_entry old;
+	union alloc_covered_entry new;
+	atomic64_t *bucket;
+
+	if (!alloc_stack_hash)
+		return;
+
+	bucket = &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK];
+	old.entry = (u64)atomic64_read(bucket);
+	new.alloc_stack_hash = alloc_stack_hash;
+	do {
+		if (val > 0) {
+			new.covered = old.alloc_stack_hash == alloc_stack_hash
+					? old.covered + val	/* increment */
+					: val;			/* evict/reset */
+		} else if (old.alloc_stack_hash == alloc_stack_hash && old.covered) {
+			new.covered = old.covered + val;
+		} else {
+			/*
+			 * Hash mismatch or covered has become zero. The latter
+			 * is possible if we race with:
+			 *	reset (!= alloc_stack_hash)
+			 *	 -> reset (== alloc_stack_hash)
+			 *	  -> decrement
+			 */
+			break;
+		}
+	} while (!atomic64_try_cmpxchg_relaxed(bucket, (s64 *)&old.entry, (s64)new.entry));
+}
+
 static bool kfence_protect(unsigned long addr)
 {
 	return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
@@ -261,7 +348,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
 	}
 }
 
-static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
+static void *
+kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, u32 alloc_stack_hash)
 {
 	struct kfence_metadata *meta = NULL;
 	unsigned long flags;
@@ -322,6 +410,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 	/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
 	WRITE_ONCE(meta->cache, cache);
 	meta->size = size;
+	meta->alloc_stack_hash = alloc_stack_hash;
+
 	for_each_canary(meta, set_canary_byte);
 
 	/* Set required struct page fields. */
@@ -334,6 +424,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
+	alloc_covered_add(alloc_stack_hash, 1);
+
 	/* Memory initialization. */
 
 	/*
@@ -362,6 +454,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
 {
 	struct kcsan_scoped_access assert_page_exclusive;
+	u32 alloc_stack_hash;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&meta->lock, flags);
@@ -404,8 +497,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 	/* Mark the object as freed. */
 	metadata_update_state(meta, KFENCE_OBJECT_FREED);
 
+	alloc_stack_hash = meta->alloc_stack_hash;
+	meta->alloc_stack_hash = 0;
+
 	raw_spin_unlock_irqrestore(&meta->lock, flags);
 
+	alloc_covered_add(alloc_stack_hash, -1);
+
 	/* Protect to detect use-after-frees. */
 	kfence_protect((unsigned long)addr);
 
@@ -744,6 +842,8 @@ void kfence_shutdown_cache(struct kmem_cache *s)
 
 void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 {
+	u32 alloc_stack_hash;
+
 	/*
 	 * Perform size check before switching kfence_allocation_gate, so that
 	 * we don't disable KFENCE without making an allocation.
@@ -788,7 +888,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 	if (!READ_ONCE(kfence_enabled))
 		return NULL;
 
-	return kfence_guarded_alloc(s, size, flags);
+	/*
+	 * Do expensive check for coverage of allocation in slow-path after
+	 * allocation_gate has already become non-zero, even though it might
+	 * mean not making any allocation within a given sample interval.
+	 *
+	 * This ensures reasonable allocation coverage when the pool is almost
+	 * full, including avoiding long-lived allocations of the same source
+	 * filling up the pool (e.g. pagecache allocations).
+	 */
+	alloc_stack_hash = get_alloc_stack_hash();
+	if (atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > SKIP_COVERED_THRESHOLD &&
+	    alloc_covered_contains(alloc_stack_hash)) {
+		atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+		return NULL;
+	}
+
+	return kfence_guarded_alloc(s, size, flags, alloc_stack_hash);
 }
 
 size_t kfence_ksize(const void *addr)
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index c1f23c61e5f9..2a2d5de9d379 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -87,6 +87,8 @@ struct kfence_metadata {
 	/* Allocation and free stack information. */
 	struct kfence_track alloc_track;
 	struct kfence_track free_track;
+	/* For updating alloc_covered on frees. */
+	u32 alloc_stack_hash;
 };
 
 extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
-- 
2.33.0.464.g1972c5931b-goog



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/3] kfence: add note to documentation about skipping covered allocations
  2021-09-17 11:07 ` Marco Elver
@ 2021-09-17 11:07   ` Marco Elver
  -1 siblings, 0 replies; 14+ messages in thread
From: Marco Elver @ 2021-09-17 11:07 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

Add a note briefly mentioning the new policy about "skipping currently
covered allocations if pool close to full." Since this has a notable
impact on KFENCE's bug-detection ability on systems with large uptimes,
it is worth pointing out the feature.

Signed-off-by: Marco Elver <elver@google.com>
---
 Documentation/dev-tools/kfence.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
index 0fbe3308bf37..e698234999d6 100644
--- a/Documentation/dev-tools/kfence.rst
+++ b/Documentation/dev-tools/kfence.rst
@@ -269,6 +269,14 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
 first, and the chances of detecting use-after-frees of recently freed objects
 is increased.
 
+If pool utilization reaches 75% or above, to reduce the probability of the pool
+containing ~100% allocated objects yet ensure diverse coverage of allocations,
+KFENCE limits currently covered allocations of the same source from further
+filling up the pool. A side-effect is that this also limits frequent long-lived
+allocations of the same source filling up the pool permanently, thereby
+reducing the risk of the pool becoming full and the sampled allocation rate
+dropping to zero.
+
 Interface
 ---------
 
-- 
2.33.0.464.g1972c5931b-goog


^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH 3/3] kfence: add note to documentation about skipping covered allocations
@ 2021-09-17 11:07   ` Marco Elver
  0 siblings, 0 replies; 14+ messages in thread
From: Marco Elver @ 2021-09-17 11:07 UTC (permalink / raw)
  To: elver, Andrew Morton
  Cc: Alexander Potapenko, Dmitry Vyukov, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

Add a note briefly mentioning the new policy about "skipping currently
covered allocations if pool close to full." Since this has a notable
impact on KFENCE's bug-detection ability on systems with large uptimes,
it is worth pointing out the feature.

Signed-off-by: Marco Elver <elver@google.com>
---
 Documentation/dev-tools/kfence.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/Documentation/dev-tools/kfence.rst b/Documentation/dev-tools/kfence.rst
index 0fbe3308bf37..e698234999d6 100644
--- a/Documentation/dev-tools/kfence.rst
+++ b/Documentation/dev-tools/kfence.rst
@@ -269,6 +269,14 @@ tail of KFENCE's freelist, so that the least recently freed objects are reused
 first, and the chances of detecting use-after-frees of recently freed objects
 is increased.
 
+If pool utilization reaches 75% or above, to reduce the probability of the pool
+containing ~100% allocated objects yet ensure diverse coverage of allocations,
+KFENCE limits currently covered allocations of the same source from further
+filling up the pool. A side-effect is that this also limits frequent long-lived
+allocations of the same source filling up the pool permanently, thereby
+reducing the risk of the pool becoming full and the sampled allocation rate
+dropping to zero.
+
 Interface
 ---------
 
-- 
2.33.0.464.g1972c5931b-goog



^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] kfence: count unexpectedly skipped allocations
  2021-09-17 11:07 ` Marco Elver
@ 2021-09-17 12:58   ` Dmitry Vyukov
  -1 siblings, 0 replies; 14+ messages in thread
From: Dmitry Vyukov @ 2021-09-17 12:58 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Alexander Potapenko, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

On Fri, 17 Sept 2021 at 13:08, 'Marco Elver' via kasan-dev
<kasan-dev@googlegroups.com> wrote:
>
> Maintain a counter to count allocations that are skipped due to being
> incompatible (oversized, incompatible gfp flags) or no capacity.
>
> This is to compute the fraction of allocations that could not be
> serviced by KFENCE, which we expect to be rare.
>
> Signed-off-by: Marco Elver <elver@google.com>
> ---
>  mm/kfence/core.c | 20 ++++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index 7a97db8bc8e7..2755800f3e2a 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -112,6 +112,8 @@ enum kfence_counter_id {
>         KFENCE_COUNTER_FREES,
>         KFENCE_COUNTER_ZOMBIES,
>         KFENCE_COUNTER_BUGS,
> +       KFENCE_COUNTER_SKIP_INCOMPAT,
> +       KFENCE_COUNTER_SKIP_CAPACITY,
>         KFENCE_COUNTER_COUNT,
>  };
>  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> @@ -121,6 +123,8 @@ static const char *const counter_names[] = {
>         [KFENCE_COUNTER_FREES]          = "total frees",
>         [KFENCE_COUNTER_ZOMBIES]        = "zombie allocations",
>         [KFENCE_COUNTER_BUGS]           = "total bugs",
> +       [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
> +       [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
>  };
>  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
>
> @@ -272,7 +276,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>         }
>         raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
>         if (!meta)
> -               return NULL;
> +               goto no_capacity;
>
>         if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
>                 /*
> @@ -289,7 +293,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>                 list_add_tail(&meta->list, &kfence_freelist);
>                 raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
>
> -               return NULL;
> +               goto no_capacity;

Do we expect this case to be so rare that we don't care?
Strictly speaking it's not no_capacity. So if I see large no_capacity
numbers, the first question I will have is: is it really no_capacity,
or some other case that we mixed together?



>         }
>
>         meta->addr = metadata_to_pageaddr(meta);
> @@ -349,6 +353,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>         atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
>
>         return addr;
> +
> +no_capacity:
> +       atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
> +       return NULL;
>  }
>
>  static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
> @@ -740,8 +748,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>          * Perform size check before switching kfence_allocation_gate, so that
>          * we don't disable KFENCE without making an allocation.
>          */
> -       if (size > PAGE_SIZE)
> +       if (size > PAGE_SIZE) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
>                 return NULL;
> +       }
>
>         /*
>          * Skip allocations from non-default zones, including DMA. We cannot
> @@ -749,8 +759,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>          * properties (e.g. reside in DMAable memory).
>          */
>         if ((flags & GFP_ZONEMASK) ||
> -           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
> +           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
>                 return NULL;
> +       }
>
>         /*
>          * allocation_gate only needs to become non-zero, so it doesn't make
> --
> 2.33.0.464.g1972c5931b-goog
>
> --
> You received this message because you are subscribed to the Google Groups "kasan-dev" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to kasan-dev+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/20210917110756.1121272-1-elver%40google.com.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] kfence: count unexpectedly skipped allocations
@ 2021-09-17 12:58   ` Dmitry Vyukov
  0 siblings, 0 replies; 14+ messages in thread
From: Dmitry Vyukov @ 2021-09-17 12:58 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Alexander Potapenko, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

On Fri, 17 Sept 2021 at 13:08, 'Marco Elver' via kasan-dev
<kasan-dev@googlegroups.com> wrote:
>
> Maintain a counter to count allocations that are skipped due to being
> incompatible (oversized, incompatible gfp flags) or no capacity.
>
> This is to compute the fraction of allocations that could not be
> serviced by KFENCE, which we expect to be rare.
>
> Signed-off-by: Marco Elver <elver@google.com>
> ---
>  mm/kfence/core.c | 20 ++++++++++++++++----
>  1 file changed, 16 insertions(+), 4 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index 7a97db8bc8e7..2755800f3e2a 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -112,6 +112,8 @@ enum kfence_counter_id {
>         KFENCE_COUNTER_FREES,
>         KFENCE_COUNTER_ZOMBIES,
>         KFENCE_COUNTER_BUGS,
> +       KFENCE_COUNTER_SKIP_INCOMPAT,
> +       KFENCE_COUNTER_SKIP_CAPACITY,
>         KFENCE_COUNTER_COUNT,
>  };
>  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> @@ -121,6 +123,8 @@ static const char *const counter_names[] = {
>         [KFENCE_COUNTER_FREES]          = "total frees",
>         [KFENCE_COUNTER_ZOMBIES]        = "zombie allocations",
>         [KFENCE_COUNTER_BUGS]           = "total bugs",
> +       [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
> +       [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
>  };
>  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
>
> @@ -272,7 +276,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>         }
>         raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
>         if (!meta)
> -               return NULL;
> +               goto no_capacity;
>
>         if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
>                 /*
> @@ -289,7 +293,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>                 list_add_tail(&meta->list, &kfence_freelist);
>                 raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
>
> -               return NULL;
> +               goto no_capacity;

Do we expect this case to be so rare that we don't care?
Strictly speaking it's not no_capacity. So if I see large no_capacity
numbers, the first question I will have is: is it really no_capacity,
or some other case that we mixed together?



>         }
>
>         meta->addr = metadata_to_pageaddr(meta);
> @@ -349,6 +353,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>         atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
>
>         return addr;
> +
> +no_capacity:
> +       atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
> +       return NULL;
>  }
>
>  static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
> @@ -740,8 +748,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>          * Perform size check before switching kfence_allocation_gate, so that
>          * we don't disable KFENCE without making an allocation.
>          */
> -       if (size > PAGE_SIZE)
> +       if (size > PAGE_SIZE) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
>                 return NULL;
> +       }
>
>         /*
>          * Skip allocations from non-default zones, including DMA. We cannot
> @@ -749,8 +759,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>          * properties (e.g. reside in DMAable memory).
>          */
>         if ((flags & GFP_ZONEMASK) ||
> -           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
> +           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
>                 return NULL;
> +       }
>
>         /*
>          * allocation_gate only needs to become non-zero, so it doesn't make
> --
> 2.33.0.464.g1972c5931b-goog
>
> --
> You received this message because you are subscribed to the Google Groups "kasan-dev" group.
> To unsubscribe from this group and stop receiving emails from it, send an email to kasan-dev+unsubscribe@googlegroups.com.
> To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/20210917110756.1121272-1-elver%40google.com.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] kfence: count unexpectedly skipped allocations
  2021-09-17 12:58   ` Dmitry Vyukov
@ 2021-09-17 13:08     ` Marco Elver
  -1 siblings, 0 replies; 14+ messages in thread
From: Marco Elver @ 2021-09-17 13:08 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Andrew Morton, Alexander Potapenko, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

On Fri, 17 Sept 2021 at 14:58, Dmitry Vyukov <dvyukov@google.com> wrote:
>
> On Fri, 17 Sept 2021 at 13:08, 'Marco Elver' via kasan-dev
> <kasan-dev@googlegroups.com> wrote:
> >
> > Maintain a counter to count allocations that are skipped due to being
> > incompatible (oversized, incompatible gfp flags) or no capacity.
> >
> > This is to compute the fraction of allocations that could not be
> > serviced by KFENCE, which we expect to be rare.
> >
> > Signed-off-by: Marco Elver <elver@google.com>
> > ---
> >  mm/kfence/core.c | 20 ++++++++++++++++----
> >  1 file changed, 16 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> > index 7a97db8bc8e7..2755800f3e2a 100644
> > --- a/mm/kfence/core.c
> > +++ b/mm/kfence/core.c
> > @@ -112,6 +112,8 @@ enum kfence_counter_id {
> >         KFENCE_COUNTER_FREES,
> >         KFENCE_COUNTER_ZOMBIES,
> >         KFENCE_COUNTER_BUGS,
> > +       KFENCE_COUNTER_SKIP_INCOMPAT,
> > +       KFENCE_COUNTER_SKIP_CAPACITY,
> >         KFENCE_COUNTER_COUNT,
> >  };
> >  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> > @@ -121,6 +123,8 @@ static const char *const counter_names[] = {
> >         [KFENCE_COUNTER_FREES]          = "total frees",
> >         [KFENCE_COUNTER_ZOMBIES]        = "zombie allocations",
> >         [KFENCE_COUNTER_BUGS]           = "total bugs",
> > +       [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
> > +       [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
> >  };
> >  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
> >
> > @@ -272,7 +276,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >         }
> >         raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
> >         if (!meta)
> > -               return NULL;
> > +               goto no_capacity;
> >
> >         if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
> >                 /*
> > @@ -289,7 +293,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >                 list_add_tail(&meta->list, &kfence_freelist);
> >                 raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
> >
> > -               return NULL;
> > +               goto no_capacity;
>
> Do we expect this case to be so rare that we don't care?
> Strictly speaking it's not no_capacity. So if I see large no_capacity
> numbers, the first question I will have is: is it really no_capacity,
> or some other case that we mixed together?

Hmm, true. I think we can just ignore counting this -- I'd expect some
bug-storm for this to become likely, at which point the system is in a
pretty bad state anyway (and we see bug counts increasing).

I'll remove this one.

>
>
> >         }
> >
> >         meta->addr = metadata_to_pageaddr(meta);
> > @@ -349,6 +353,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >         atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
> >
> >         return addr;
> > +
> > +no_capacity:
> > +       atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
> > +       return NULL;
> >  }
> >
> >  static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
> > @@ -740,8 +748,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >          * Perform size check before switching kfence_allocation_gate, so that
> >          * we don't disable KFENCE without making an allocation.
> >          */
> > -       if (size > PAGE_SIZE)
> > +       if (size > PAGE_SIZE) {
> > +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
> >                 return NULL;
> > +       }
> >
> >         /*
> >          * Skip allocations from non-default zones, including DMA. We cannot
> > @@ -749,8 +759,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >          * properties (e.g. reside in DMAable memory).
> >          */
> >         if ((flags & GFP_ZONEMASK) ||
> > -           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
> > +           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
> > +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
> >                 return NULL;
> > +       }
> >
> >         /*
> >          * allocation_gate only needs to become non-zero, so it doesn't make
> > --
> > 2.33.0.464.g1972c5931b-goog
> >
> > --
> > You received this message because you are subscribed to the Google Groups "kasan-dev" group.
> > To unsubscribe from this group and stop receiving emails from it, send an email to kasan-dev+unsubscribe@googlegroups.com.
> > To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/20210917110756.1121272-1-elver%40google.com.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 1/3] kfence: count unexpectedly skipped allocations
@ 2021-09-17 13:08     ` Marco Elver
  0 siblings, 0 replies; 14+ messages in thread
From: Marco Elver @ 2021-09-17 13:08 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Andrew Morton, Alexander Potapenko, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

On Fri, 17 Sept 2021 at 14:58, Dmitry Vyukov <dvyukov@google.com> wrote:
>
> On Fri, 17 Sept 2021 at 13:08, 'Marco Elver' via kasan-dev
> <kasan-dev@googlegroups.com> wrote:
> >
> > Maintain a counter to count allocations that are skipped due to being
> > incompatible (oversized, incompatible gfp flags) or no capacity.
> >
> > This is to compute the fraction of allocations that could not be
> > serviced by KFENCE, which we expect to be rare.
> >
> > Signed-off-by: Marco Elver <elver@google.com>
> > ---
> >  mm/kfence/core.c | 20 ++++++++++++++++----
> >  1 file changed, 16 insertions(+), 4 deletions(-)
> >
> > diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> > index 7a97db8bc8e7..2755800f3e2a 100644
> > --- a/mm/kfence/core.c
> > +++ b/mm/kfence/core.c
> > @@ -112,6 +112,8 @@ enum kfence_counter_id {
> >         KFENCE_COUNTER_FREES,
> >         KFENCE_COUNTER_ZOMBIES,
> >         KFENCE_COUNTER_BUGS,
> > +       KFENCE_COUNTER_SKIP_INCOMPAT,
> > +       KFENCE_COUNTER_SKIP_CAPACITY,
> >         KFENCE_COUNTER_COUNT,
> >  };
> >  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> > @@ -121,6 +123,8 @@ static const char *const counter_names[] = {
> >         [KFENCE_COUNTER_FREES]          = "total frees",
> >         [KFENCE_COUNTER_ZOMBIES]        = "zombie allocations",
> >         [KFENCE_COUNTER_BUGS]           = "total bugs",
> > +       [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
> > +       [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
> >  };
> >  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
> >
> > @@ -272,7 +276,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >         }
> >         raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
> >         if (!meta)
> > -               return NULL;
> > +               goto no_capacity;
> >
> >         if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
> >                 /*
> > @@ -289,7 +293,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >                 list_add_tail(&meta->list, &kfence_freelist);
> >                 raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
> >
> > -               return NULL;
> > +               goto no_capacity;
>
> Do we expect this case to be so rare that we don't care?
> Strictly speaking it's not no_capacity. So if I see large no_capacity
> numbers, the first question I will have is: is it really no_capacity,
> or some other case that we mixed together?

Hmm, true. I think we can just ignore counting this -- I'd expect some
bug-storm for this to become likely, at which point the system is in a
pretty bad state anyway (and we see bug counts increasing).

I'll remove this one.

>
>
> >         }
> >
> >         meta->addr = metadata_to_pageaddr(meta);
> > @@ -349,6 +353,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >         atomic_long_inc(&counters[KFENCE_COUNTER_ALLOCS]);
> >
> >         return addr;
> > +
> > +no_capacity:
> > +       atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
> > +       return NULL;
> >  }
> >
> >  static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
> > @@ -740,8 +748,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >          * Perform size check before switching kfence_allocation_gate, so that
> >          * we don't disable KFENCE without making an allocation.
> >          */
> > -       if (size > PAGE_SIZE)
> > +       if (size > PAGE_SIZE) {
> > +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
> >                 return NULL;
> > +       }
> >
> >         /*
> >          * Skip allocations from non-default zones, including DMA. We cannot
> > @@ -749,8 +759,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >          * properties (e.g. reside in DMAable memory).
> >          */
> >         if ((flags & GFP_ZONEMASK) ||
> > -           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
> > +           (s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
> > +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
> >                 return NULL;
> > +       }
> >
> >         /*
> >          * allocation_gate only needs to become non-zero, so it doesn't make
> > --
> > 2.33.0.464.g1972c5931b-goog
> >
> > --
> > You received this message because you are subscribed to the Google Groups "kasan-dev" group.
> > To unsubscribe from this group and stop receiving emails from it, send an email to kasan-dev+unsubscribe@googlegroups.com.
> > To view this discussion on the web visit https://groups.google.com/d/msgid/kasan-dev/20210917110756.1121272-1-elver%40google.com.


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] kfence: limit currently covered allocations when pool nearly full
  2021-09-17 11:07   ` Marco Elver
@ 2021-09-17 13:52     ` Dmitry Vyukov
  -1 siblings, 0 replies; 14+ messages in thread
From: Dmitry Vyukov @ 2021-09-17 13:52 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Alexander Potapenko, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

On Fri, 17 Sept 2021 at 13:08, 'Marco Elver' via kasan-dev
<kasan-dev@googlegroups.com> wrote:
>
> One of KFENCE's main design principles is that with increasing uptime,
> allocation coverage increases sufficiently to detect previously
> undetected bugs.
>
> We have observed that frequent long-lived allocations of the same
> source (e.g. pagecache) tend to permanently fill up the KFENCE pool
> with increasing system uptime, thus breaking the above requirement.
> The workaround thus far had been increasing the sample interval and/or
> increasing the KFENCE pool size, but is no reliable solution.
>
> To ensure diverse coverage of allocations, limit currently covered
> allocations of the same source once pool utilization reaches 75% or
> above. The effect is retaining reasonable allocation coverage when the
> pool is close to full.
>
> A side-effect is that this also limits frequent long-lived allocations
> of the same source filling up the pool permanently.
>
> Uniqueness of an allocation for coverage purposes is based on its
> (partial) allocation stack trace (the source). A lossy hash map is
> used to check if an allocation is covered; if the allocation is
> currently covered, the allocation is skipped by KFENCE.
>
> Testing was done using:
>
>         (a) a synthetic workload that performs frequent long-lived
>             allocations (default config values + <10ms sample intervals
>             + smaller-than-default pool sizes), and
>
>         (b) normal desktop workloads on an otherwise idle machine where
>             the problem was first reported (default config values).
>
> In the case of (b) the observed result confirms that sampled allocation
> rate no longer drops to zero after a few days of uptime, all while
> "allocations skipped (covered)" are no more than 2% of total sampled
> allocations.
>
> Signed-off-by: Marco Elver <elver@google.com>
> ---
>  mm/kfence/core.c   | 120 ++++++++++++++++++++++++++++++++++++++++++++-
>  mm/kfence/kfence.h |   2 +
>  2 files changed, 120 insertions(+), 2 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index 2755800f3e2a..3b78402d7a5e 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -11,11 +11,13 @@
>  #include <linux/bug.h>
>  #include <linux/debugfs.h>
>  #include <linux/irq_work.h>
> +#include <linux/jhash.h>
>  #include <linux/kcsan-checks.h>
>  #include <linux/kfence.h>
>  #include <linux/kmemleak.h>
>  #include <linux/list.h>
>  #include <linux/lockdep.h>
> +#include <linux/log2.h>
>  #include <linux/memblock.h>
>  #include <linux/moduleparam.h>
>  #include <linux/random.h>
> @@ -86,6 +88,28 @@ module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_inte
>  char *__kfence_pool __ro_after_init;
>  EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
>
> +/*
> + * A lossy hash map of allocation stack trace coverage: limits currently covered
> + * allocations of the same source filling up the pool when close to full.
> + *
> + * The required data fits in 64 bits, and therefore we can avoid a per-entry (or
> + * global) lock by simply storing each entry's data in an atomic64_t.
> + */
> +union alloc_covered_entry {
> +       struct {
> +               u32 alloc_stack_hash;   /* stack trace hash */
> +               u32 covered;            /* current coverage count */
> +       };
> +       u64 entry;
> +};
> +#define ALLOC_COVERED_SIZE (1 << const_ilog2(CONFIG_KFENCE_NUM_OBJECTS | 128)) /* >= 128 */

const_ilog2 rounds down, so for 1023 objects we will have hashtable of
size 512, or am I missing something? This asking for collisions.
Hashtable size should be larger than expected population.

> +#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
> +static atomic64_t alloc_covered[ALLOC_COVERED_SIZE];
> +/* Stack depth used to determine uniqueness of an allocation. */
> +#define UNIQUE_ALLOC_STACK_DEPTH 8
> +/* Pool usage threshold when currently covered allocations are skipped. */
> +#define SKIP_COVERED_THRESHOLD ((CONFIG_KFENCE_NUM_OBJECTS * 3) / 4) /* 75% */
> +
>  /*
>   * Per-object metadata, with one-to-one mapping of object metadata to
>   * backing pages (in __kfence_pool).
> @@ -114,6 +138,7 @@ enum kfence_counter_id {
>         KFENCE_COUNTER_BUGS,
>         KFENCE_COUNTER_SKIP_INCOMPAT,
>         KFENCE_COUNTER_SKIP_CAPACITY,
> +       KFENCE_COUNTER_SKIP_COVERED,
>         KFENCE_COUNTER_COUNT,
>  };
>  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> @@ -125,11 +150,73 @@ static const char *const counter_names[] = {
>         [KFENCE_COUNTER_BUGS]           = "total bugs",
>         [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
>         [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
> +       [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
>  };
>  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
>
>  /* === Internals ============================================================ */
>
> +static u32 get_alloc_stack_hash(void)
> +{
> +       unsigned long stack_entries[UNIQUE_ALLOC_STACK_DEPTH];
> +       size_t num_entries;
> +
> +       num_entries = stack_trace_save(stack_entries, UNIQUE_ALLOC_STACK_DEPTH, 1);

Strictly speaking, if a bad persistent allocation comes from an
interrupt it may still consume whole pool. We've hit this problem with
KASAN stackdepot unbounded growth. It's better to do
filter_irq_stacks() here, see:
https://elixir.bootlin.com/linux/v5.15-rc1/source/mm/kasan/common.c#L39


> +       return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), 0);
> +}
> +
> +/*
> + * Check if the allocation stack trace hash @alloc_stack_hash is contained in
> + * @alloc_covered and currently covered.
> + */
> +static bool alloc_covered_contains(u32 alloc_stack_hash)
> +{
> +       union alloc_covered_entry entry;
> +
> +       if (!alloc_stack_hash)
> +               return false;
> +
> +       entry.entry = (u64)atomic64_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
> +       return entry.alloc_stack_hash == alloc_stack_hash && entry.covered;
> +}
> +
> +/*
> + * Adds (or subtracts) coverage count to entry corresponding to
> + * @alloc_stack_hash. If @alloc_stack_hash is not yet contained in
> + * @alloc_covered, resets (potentially evicting existing) entry.
> + */
> +static void alloc_covered_add(u32 alloc_stack_hash, int val)
> +{
> +       union alloc_covered_entry old;
> +       union alloc_covered_entry new;
> +       atomic64_t *bucket;
> +
> +       if (!alloc_stack_hash)
> +               return;
> +
> +       bucket = &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK];
> +       old.entry = (u64)atomic64_read(bucket);
> +       new.alloc_stack_hash = alloc_stack_hash;
> +       do {
> +               if (val > 0) {
> +                       new.covered = old.alloc_stack_hash == alloc_stack_hash
> +                                       ? old.covered + val     /* increment */
> +                                       : val;                  /* evict/reset */

I am trying to understand the effects of this eviction policy on the result.
It seems that it can render the pool overflow protection void.
Consider, two stacks (ABC, DEF) hash to the same bucket. One
allocation is frequent and not persistent, another is less frequent
but almost persistent. The first one will evict the second one, so we
will always save the second effectively defeating the overflow
protection.

There are also some interesting effects due to cyclic evictions
(A->B->A), where we do not count increment, but count decrement.

Have you considered not evicting, but rather simply combining
allocations with the same hash?
I.e. doing alloc_covered[hash]++/--.
It would err on the side of not sampling allocations that are unlucky
to collide with persistent allocations, but would provide more
reliable overflow guarantees (at least we continue sampling
allocations for all other buckets since we have pool capacity).
FWIW also simpler code.

I am also thinking if collisions can be resolved by adding some salt
that is generated on boot. Resolving collisions across different
machines is good enough for KFENCE. Namely, if we have stacks ABC and
DEF, we hash XABC and XDEF, where X is filled on boot. It should work
for a good hash function, right? If this works, then the simpler
alloc_covered[hash]++/-- scheme should work (?).



> +               } else if (old.alloc_stack_hash == alloc_stack_hash && old.covered) {
> +                       new.covered = old.covered + val;
> +               } else {
> +                       /*
> +                        * Hash mismatch or covered has become zero. The latter
> +                        * is possible if we race with:
> +                        *      reset (!= alloc_stack_hash)
> +                        *       -> reset (== alloc_stack_hash)
> +                        *        -> decrement
> +                        */
> +                       break;
> +               }
> +       } while (!atomic64_try_cmpxchg_relaxed(bucket, (s64 *)&old.entry, (s64)new.entry));
> +}
> +
>  static bool kfence_protect(unsigned long addr)
>  {
>         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
> @@ -261,7 +348,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
>         }
>  }
>
> -static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
> +static void *
> +kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, u32 alloc_stack_hash)
>  {
>         struct kfence_metadata *meta = NULL;
>         unsigned long flags;
> @@ -322,6 +410,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
>         WRITE_ONCE(meta->cache, cache);
>         meta->size = size;
> +       meta->alloc_stack_hash = alloc_stack_hash;
> +
>         for_each_canary(meta, set_canary_byte);
>
>         /* Set required struct page fields. */
> @@ -334,6 +424,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>
>         raw_spin_unlock_irqrestore(&meta->lock, flags);
>
> +       alloc_covered_add(alloc_stack_hash, 1);
> +
>         /* Memory initialization. */
>
>         /*
> @@ -362,6 +454,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>  static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
>  {
>         struct kcsan_scoped_access assert_page_exclusive;
> +       u32 alloc_stack_hash;
>         unsigned long flags;
>
>         raw_spin_lock_irqsave(&meta->lock, flags);
> @@ -404,8 +497,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
>         /* Mark the object as freed. */
>         metadata_update_state(meta, KFENCE_OBJECT_FREED);
>
> +       alloc_stack_hash = meta->alloc_stack_hash;
> +       meta->alloc_stack_hash = 0;
> +
>         raw_spin_unlock_irqrestore(&meta->lock, flags);
>
> +       alloc_covered_add(alloc_stack_hash, -1);
> +
>         /* Protect to detect use-after-frees. */
>         kfence_protect((unsigned long)addr);
>
> @@ -744,6 +842,8 @@ void kfence_shutdown_cache(struct kmem_cache *s)
>
>  void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>  {
> +       u32 alloc_stack_hash;
> +
>         /*
>          * Perform size check before switching kfence_allocation_gate, so that
>          * we don't disable KFENCE without making an allocation.
> @@ -788,7 +888,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>         if (!READ_ONCE(kfence_enabled))
>                 return NULL;
>
> -       return kfence_guarded_alloc(s, size, flags);
> +       /*
> +        * Do expensive check for coverage of allocation in slow-path after
> +        * allocation_gate has already become non-zero, even though it might
> +        * mean not making any allocation within a given sample interval.
> +        *
> +        * This ensures reasonable allocation coverage when the pool is almost
> +        * full, including avoiding long-lived allocations of the same source
> +        * filling up the pool (e.g. pagecache allocations).
> +        */
> +       alloc_stack_hash = get_alloc_stack_hash();

Is it possible to unwind the stack only once per allocation? I.e.
unwind here into a buffer on stack and then pass it down?

> +       if (atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > SKIP_COVERED_THRESHOLD &&
> +           alloc_covered_contains(alloc_stack_hash)) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
> +               return NULL;
> +       }
> +
> +       return kfence_guarded_alloc(s, size, flags, alloc_stack_hash);
>  }
>
>  size_t kfence_ksize(const void *addr)
> diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
> index c1f23c61e5f9..2a2d5de9d379 100644
> --- a/mm/kfence/kfence.h
> +++ b/mm/kfence/kfence.h
> @@ -87,6 +87,8 @@ struct kfence_metadata {
>         /* Allocation and free stack information. */
>         struct kfence_track alloc_track;
>         struct kfence_track free_track;
> +       /* For updating alloc_covered on frees. */
> +       u32 alloc_stack_hash;
>  };
>
>  extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
> --
> 2.33.0.464.g1972c5931b-goog

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] kfence: limit currently covered allocations when pool nearly full
@ 2021-09-17 13:52     ` Dmitry Vyukov
  0 siblings, 0 replies; 14+ messages in thread
From: Dmitry Vyukov @ 2021-09-17 13:52 UTC (permalink / raw)
  To: Marco Elver
  Cc: Andrew Morton, Alexander Potapenko, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

On Fri, 17 Sept 2021 at 13:08, 'Marco Elver' via kasan-dev
<kasan-dev@googlegroups.com> wrote:
>
> One of KFENCE's main design principles is that with increasing uptime,
> allocation coverage increases sufficiently to detect previously
> undetected bugs.
>
> We have observed that frequent long-lived allocations of the same
> source (e.g. pagecache) tend to permanently fill up the KFENCE pool
> with increasing system uptime, thus breaking the above requirement.
> The workaround thus far had been increasing the sample interval and/or
> increasing the KFENCE pool size, but is no reliable solution.
>
> To ensure diverse coverage of allocations, limit currently covered
> allocations of the same source once pool utilization reaches 75% or
> above. The effect is retaining reasonable allocation coverage when the
> pool is close to full.
>
> A side-effect is that this also limits frequent long-lived allocations
> of the same source filling up the pool permanently.
>
> Uniqueness of an allocation for coverage purposes is based on its
> (partial) allocation stack trace (the source). A lossy hash map is
> used to check if an allocation is covered; if the allocation is
> currently covered, the allocation is skipped by KFENCE.
>
> Testing was done using:
>
>         (a) a synthetic workload that performs frequent long-lived
>             allocations (default config values + <10ms sample intervals
>             + smaller-than-default pool sizes), and
>
>         (b) normal desktop workloads on an otherwise idle machine where
>             the problem was first reported (default config values).
>
> In the case of (b) the observed result confirms that sampled allocation
> rate no longer drops to zero after a few days of uptime, all while
> "allocations skipped (covered)" are no more than 2% of total sampled
> allocations.
>
> Signed-off-by: Marco Elver <elver@google.com>
> ---
>  mm/kfence/core.c   | 120 ++++++++++++++++++++++++++++++++++++++++++++-
>  mm/kfence/kfence.h |   2 +
>  2 files changed, 120 insertions(+), 2 deletions(-)
>
> diff --git a/mm/kfence/core.c b/mm/kfence/core.c
> index 2755800f3e2a..3b78402d7a5e 100644
> --- a/mm/kfence/core.c
> +++ b/mm/kfence/core.c
> @@ -11,11 +11,13 @@
>  #include <linux/bug.h>
>  #include <linux/debugfs.h>
>  #include <linux/irq_work.h>
> +#include <linux/jhash.h>
>  #include <linux/kcsan-checks.h>
>  #include <linux/kfence.h>
>  #include <linux/kmemleak.h>
>  #include <linux/list.h>
>  #include <linux/lockdep.h>
> +#include <linux/log2.h>
>  #include <linux/memblock.h>
>  #include <linux/moduleparam.h>
>  #include <linux/random.h>
> @@ -86,6 +88,28 @@ module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_inte
>  char *__kfence_pool __ro_after_init;
>  EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
>
> +/*
> + * A lossy hash map of allocation stack trace coverage: limits currently covered
> + * allocations of the same source filling up the pool when close to full.
> + *
> + * The required data fits in 64 bits, and therefore we can avoid a per-entry (or
> + * global) lock by simply storing each entry's data in an atomic64_t.
> + */
> +union alloc_covered_entry {
> +       struct {
> +               u32 alloc_stack_hash;   /* stack trace hash */
> +               u32 covered;            /* current coverage count */
> +       };
> +       u64 entry;
> +};
> +#define ALLOC_COVERED_SIZE (1 << const_ilog2(CONFIG_KFENCE_NUM_OBJECTS | 128)) /* >= 128 */

const_ilog2 rounds down, so for 1023 objects we will have hashtable of
size 512, or am I missing something? This asking for collisions.
Hashtable size should be larger than expected population.

> +#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
> +static atomic64_t alloc_covered[ALLOC_COVERED_SIZE];
> +/* Stack depth used to determine uniqueness of an allocation. */
> +#define UNIQUE_ALLOC_STACK_DEPTH 8
> +/* Pool usage threshold when currently covered allocations are skipped. */
> +#define SKIP_COVERED_THRESHOLD ((CONFIG_KFENCE_NUM_OBJECTS * 3) / 4) /* 75% */
> +
>  /*
>   * Per-object metadata, with one-to-one mapping of object metadata to
>   * backing pages (in __kfence_pool).
> @@ -114,6 +138,7 @@ enum kfence_counter_id {
>         KFENCE_COUNTER_BUGS,
>         KFENCE_COUNTER_SKIP_INCOMPAT,
>         KFENCE_COUNTER_SKIP_CAPACITY,
> +       KFENCE_COUNTER_SKIP_COVERED,
>         KFENCE_COUNTER_COUNT,
>  };
>  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> @@ -125,11 +150,73 @@ static const char *const counter_names[] = {
>         [KFENCE_COUNTER_BUGS]           = "total bugs",
>         [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
>         [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
> +       [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
>  };
>  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
>
>  /* === Internals ============================================================ */
>
> +static u32 get_alloc_stack_hash(void)
> +{
> +       unsigned long stack_entries[UNIQUE_ALLOC_STACK_DEPTH];
> +       size_t num_entries;
> +
> +       num_entries = stack_trace_save(stack_entries, UNIQUE_ALLOC_STACK_DEPTH, 1);

Strictly speaking, if a bad persistent allocation comes from an
interrupt it may still consume whole pool. We've hit this problem with
KASAN stackdepot unbounded growth. It's better to do
filter_irq_stacks() here, see:
https://elixir.bootlin.com/linux/v5.15-rc1/source/mm/kasan/common.c#L39


> +       return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), 0);
> +}
> +
> +/*
> + * Check if the allocation stack trace hash @alloc_stack_hash is contained in
> + * @alloc_covered and currently covered.
> + */
> +static bool alloc_covered_contains(u32 alloc_stack_hash)
> +{
> +       union alloc_covered_entry entry;
> +
> +       if (!alloc_stack_hash)
> +               return false;
> +
> +       entry.entry = (u64)atomic64_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
> +       return entry.alloc_stack_hash == alloc_stack_hash && entry.covered;
> +}
> +
> +/*
> + * Adds (or subtracts) coverage count to entry corresponding to
> + * @alloc_stack_hash. If @alloc_stack_hash is not yet contained in
> + * @alloc_covered, resets (potentially evicting existing) entry.
> + */
> +static void alloc_covered_add(u32 alloc_stack_hash, int val)
> +{
> +       union alloc_covered_entry old;
> +       union alloc_covered_entry new;
> +       atomic64_t *bucket;
> +
> +       if (!alloc_stack_hash)
> +               return;
> +
> +       bucket = &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK];
> +       old.entry = (u64)atomic64_read(bucket);
> +       new.alloc_stack_hash = alloc_stack_hash;
> +       do {
> +               if (val > 0) {
> +                       new.covered = old.alloc_stack_hash == alloc_stack_hash
> +                                       ? old.covered + val     /* increment */
> +                                       : val;                  /* evict/reset */

I am trying to understand the effects of this eviction policy on the result.
It seems that it can render the pool overflow protection void.
Consider, two stacks (ABC, DEF) hash to the same bucket. One
allocation is frequent and not persistent, another is less frequent
but almost persistent. The first one will evict the second one, so we
will always save the second effectively defeating the overflow
protection.

There are also some interesting effects due to cyclic evictions
(A->B->A), where we do not count increment, but count decrement.

Have you considered not evicting, but rather simply combining
allocations with the same hash?
I.e. doing alloc_covered[hash]++/--.
It would err on the side of not sampling allocations that are unlucky
to collide with persistent allocations, but would provide more
reliable overflow guarantees (at least we continue sampling
allocations for all other buckets since we have pool capacity).
FWIW also simpler code.

I am also thinking if collisions can be resolved by adding some salt
that is generated on boot. Resolving collisions across different
machines is good enough for KFENCE. Namely, if we have stacks ABC and
DEF, we hash XABC and XDEF, where X is filled on boot. It should work
for a good hash function, right? If this works, then the simpler
alloc_covered[hash]++/-- scheme should work (?).



> +               } else if (old.alloc_stack_hash == alloc_stack_hash && old.covered) {
> +                       new.covered = old.covered + val;
> +               } else {
> +                       /*
> +                        * Hash mismatch or covered has become zero. The latter
> +                        * is possible if we race with:
> +                        *      reset (!= alloc_stack_hash)
> +                        *       -> reset (== alloc_stack_hash)
> +                        *        -> decrement
> +                        */
> +                       break;
> +               }
> +       } while (!atomic64_try_cmpxchg_relaxed(bucket, (s64 *)&old.entry, (s64)new.entry));
> +}
> +
>  static bool kfence_protect(unsigned long addr)
>  {
>         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
> @@ -261,7 +348,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
>         }
>  }
>
> -static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
> +static void *
> +kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, u32 alloc_stack_hash)
>  {
>         struct kfence_metadata *meta = NULL;
>         unsigned long flags;
> @@ -322,6 +410,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
>         WRITE_ONCE(meta->cache, cache);
>         meta->size = size;
> +       meta->alloc_stack_hash = alloc_stack_hash;
> +
>         for_each_canary(meta, set_canary_byte);
>
>         /* Set required struct page fields. */
> @@ -334,6 +424,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>
>         raw_spin_unlock_irqrestore(&meta->lock, flags);
>
> +       alloc_covered_add(alloc_stack_hash, 1);
> +
>         /* Memory initialization. */
>
>         /*
> @@ -362,6 +454,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
>  static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
>  {
>         struct kcsan_scoped_access assert_page_exclusive;
> +       u32 alloc_stack_hash;
>         unsigned long flags;
>
>         raw_spin_lock_irqsave(&meta->lock, flags);
> @@ -404,8 +497,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
>         /* Mark the object as freed. */
>         metadata_update_state(meta, KFENCE_OBJECT_FREED);
>
> +       alloc_stack_hash = meta->alloc_stack_hash;
> +       meta->alloc_stack_hash = 0;
> +
>         raw_spin_unlock_irqrestore(&meta->lock, flags);
>
> +       alloc_covered_add(alloc_stack_hash, -1);
> +
>         /* Protect to detect use-after-frees. */
>         kfence_protect((unsigned long)addr);
>
> @@ -744,6 +842,8 @@ void kfence_shutdown_cache(struct kmem_cache *s)
>
>  void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>  {
> +       u32 alloc_stack_hash;
> +
>         /*
>          * Perform size check before switching kfence_allocation_gate, so that
>          * we don't disable KFENCE without making an allocation.
> @@ -788,7 +888,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
>         if (!READ_ONCE(kfence_enabled))
>                 return NULL;
>
> -       return kfence_guarded_alloc(s, size, flags);
> +       /*
> +        * Do expensive check for coverage of allocation in slow-path after
> +        * allocation_gate has already become non-zero, even though it might
> +        * mean not making any allocation within a given sample interval.
> +        *
> +        * This ensures reasonable allocation coverage when the pool is almost
> +        * full, including avoiding long-lived allocations of the same source
> +        * filling up the pool (e.g. pagecache allocations).
> +        */
> +       alloc_stack_hash = get_alloc_stack_hash();

Is it possible to unwind the stack only once per allocation? I.e.
unwind here into a buffer on stack and then pass it down?

> +       if (atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > SKIP_COVERED_THRESHOLD &&
> +           alloc_covered_contains(alloc_stack_hash)) {
> +               atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
> +               return NULL;
> +       }
> +
> +       return kfence_guarded_alloc(s, size, flags, alloc_stack_hash);
>  }
>
>  size_t kfence_ksize(const void *addr)
> diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
> index c1f23c61e5f9..2a2d5de9d379 100644
> --- a/mm/kfence/kfence.h
> +++ b/mm/kfence/kfence.h
> @@ -87,6 +87,8 @@ struct kfence_metadata {
>         /* Allocation and free stack information. */
>         struct kfence_track alloc_track;
>         struct kfence_track free_track;
> +       /* For updating alloc_covered on frees. */
> +       u32 alloc_stack_hash;
>  };
>
>  extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
> --
> 2.33.0.464.g1972c5931b-goog


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] kfence: limit currently covered allocations when pool nearly full
  2021-09-17 13:52     ` Dmitry Vyukov
@ 2021-09-17 15:04       ` Marco Elver
  -1 siblings, 0 replies; 14+ messages in thread
From: Marco Elver @ 2021-09-17 15:04 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Andrew Morton, Alexander Potapenko, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

On Fri, 17 Sept 2021 at 15:52, Dmitry Vyukov <dvyukov@google.com> wrote:
[...]
> > +/*
> > + * A lossy hash map of allocation stack trace coverage: limits currently covered
> > + * allocations of the same source filling up the pool when close to full.
> > + *
> > + * The required data fits in 64 bits, and therefore we can avoid a per-entry (or
> > + * global) lock by simply storing each entry's data in an atomic64_t.
> > + */
> > +union alloc_covered_entry {
> > +       struct {
> > +               u32 alloc_stack_hash;   /* stack trace hash */
> > +               u32 covered;            /* current coverage count */
> > +       };
> > +       u64 entry;
> > +};
> > +#define ALLOC_COVERED_SIZE (1 << const_ilog2(CONFIG_KFENCE_NUM_OBJECTS | 128)) /* >= 128 */
>
> const_ilog2 rounds down, so for 1023 objects we will have hashtable of
> size 512, or am I missing something? This asking for collisions.
> Hashtable size should be larger than expected population.

That's correct. I wanted to err on the side of allocating more and not
less, if we can afford it. Hence also the choice of lossy hash map.
However, I think if we consider the whole fleet, your proposal below
makes sense and I'll rerun experiments with that and see.

> > +#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
> > +static atomic64_t alloc_covered[ALLOC_COVERED_SIZE];
> > +/* Stack depth used to determine uniqueness of an allocation. */
> > +#define UNIQUE_ALLOC_STACK_DEPTH 8
> > +/* Pool usage threshold when currently covered allocations are skipped. */
> > +#define SKIP_COVERED_THRESHOLD ((CONFIG_KFENCE_NUM_OBJECTS * 3) / 4) /* 75% */
> > +
> >  /*
> >   * Per-object metadata, with one-to-one mapping of object metadata to
> >   * backing pages (in __kfence_pool).
> > @@ -114,6 +138,7 @@ enum kfence_counter_id {
> >         KFENCE_COUNTER_BUGS,
> >         KFENCE_COUNTER_SKIP_INCOMPAT,
> >         KFENCE_COUNTER_SKIP_CAPACITY,
> > +       KFENCE_COUNTER_SKIP_COVERED,
> >         KFENCE_COUNTER_COUNT,
> >  };
> >  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> > @@ -125,11 +150,73 @@ static const char *const counter_names[] = {
> >         [KFENCE_COUNTER_BUGS]           = "total bugs",
> >         [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
> >         [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
> > +       [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
> >  };
> >  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
> >
> >  /* === Internals ============================================================ */
> >
> > +static u32 get_alloc_stack_hash(void)
> > +{
> > +       unsigned long stack_entries[UNIQUE_ALLOC_STACK_DEPTH];
> > +       size_t num_entries;
> > +
> > +       num_entries = stack_trace_save(stack_entries, UNIQUE_ALLOC_STACK_DEPTH, 1);
>
> Strictly speaking, if a bad persistent allocation comes from an
> interrupt it may still consume whole pool. We've hit this problem with
> KASAN stackdepot unbounded growth. It's better to do
> filter_irq_stacks() here, see:
> https://elixir.bootlin.com/linux/v5.15-rc1/source/mm/kasan/common.c#L39

Time to move filter_irq_stacks() out of stackdepot, we should not
depend on stackdepot just for filter_irq_stacks(). I'll probably move
it to kernel/stacktrace.c, which seems most appropriate.

> > +       return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), 0);
> > +}
> > +
> > +/*
> > + * Check if the allocation stack trace hash @alloc_stack_hash is contained in
> > + * @alloc_covered and currently covered.
> > + */
> > +static bool alloc_covered_contains(u32 alloc_stack_hash)
> > +{
> > +       union alloc_covered_entry entry;
> > +
> > +       if (!alloc_stack_hash)
> > +               return false;
> > +
> > +       entry.entry = (u64)atomic64_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
> > +       return entry.alloc_stack_hash == alloc_stack_hash && entry.covered;
> > +}
> > +
> > +/*
> > + * Adds (or subtracts) coverage count to entry corresponding to
> > + * @alloc_stack_hash. If @alloc_stack_hash is not yet contained in
> > + * @alloc_covered, resets (potentially evicting existing) entry.
> > + */
> > +static void alloc_covered_add(u32 alloc_stack_hash, int val)
> > +{
> > +       union alloc_covered_entry old;
> > +       union alloc_covered_entry new;
> > +       atomic64_t *bucket;
> > +
> > +       if (!alloc_stack_hash)
> > +               return;
> > +
> > +       bucket = &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK];
> > +       old.entry = (u64)atomic64_read(bucket);
> > +       new.alloc_stack_hash = alloc_stack_hash;
> > +       do {
> > +               if (val > 0) {
> > +                       new.covered = old.alloc_stack_hash == alloc_stack_hash
> > +                                       ? old.covered + val     /* increment */
> > +                                       : val;                  /* evict/reset */
>
> I am trying to understand the effects of this eviction policy on the result.
> It seems that it can render the pool overflow protection void.
> Consider, two stacks (ABC, DEF) hash to the same bucket. One
> allocation is frequent and not persistent, another is less frequent
> but almost persistent. The first one will evict the second one, so we
> will always save the second effectively defeating the overflow
> protection.
>
> There are also some interesting effects due to cyclic evictions
> (A->B->A), where we do not count increment, but count decrement.
>
> Have you considered not evicting, but rather simply combining
> allocations with the same hash?

Hmm, good point. It's probably not as bad as a real bloom filter,
because we might successfully remove an entry if all the allocations
that mapped to 1 bucket are freed.

> I.e. doing alloc_covered[hash]++/--.
> It would err on the side of not sampling allocations that are unlucky
> to collide with persistent allocations, but would provide more
> reliable overflow guarantees (at least we continue sampling
> allocations for all other buckets since we have pool capacity).
> FWIW also simpler code.
>
> I am also thinking if collisions can be resolved by adding some salt
> that is generated on boot. Resolving collisions across different
> machines is good enough for KFENCE. Namely, if we have stacks ABC and
> DEF, we hash XABC and XDEF, where X is filled on boot. It should work
> for a good hash function, right? If this works, then the simpler
> alloc_covered[hash]++/-- scheme should work (?).

Good idea, I think I'll introduce a seed for the hash function.

Let me experiment with the simplified version you suggest, and see what I get.

> > +               } else if (old.alloc_stack_hash == alloc_stack_hash && old.covered) {
> > +                       new.covered = old.covered + val;
> > +               } else {
> > +                       /*
> > +                        * Hash mismatch or covered has become zero. The latter
> > +                        * is possible if we race with:
> > +                        *      reset (!= alloc_stack_hash)
> > +                        *       -> reset (== alloc_stack_hash)
> > +                        *        -> decrement
> > +                        */
> > +                       break;
> > +               }
> > +       } while (!atomic64_try_cmpxchg_relaxed(bucket, (s64 *)&old.entry, (s64)new.entry));
> > +}
> > +
> >  static bool kfence_protect(unsigned long addr)
> >  {
> >         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
> > @@ -261,7 +348,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
> >         }
> >  }
> >
> > -static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
> > +static void *
> > +kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, u32 alloc_stack_hash)
> >  {
> >         struct kfence_metadata *meta = NULL;
> >         unsigned long flags;
> > @@ -322,6 +410,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
> >         WRITE_ONCE(meta->cache, cache);
> >         meta->size = size;
> > +       meta->alloc_stack_hash = alloc_stack_hash;
> > +
> >         for_each_canary(meta, set_canary_byte);
> >
> >         /* Set required struct page fields. */
> > @@ -334,6 +424,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >
> >         raw_spin_unlock_irqrestore(&meta->lock, flags);
> >
> > +       alloc_covered_add(alloc_stack_hash, 1);
> > +
> >         /* Memory initialization. */
> >
> >         /*
> > @@ -362,6 +454,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >  static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
> >  {
> >         struct kcsan_scoped_access assert_page_exclusive;
> > +       u32 alloc_stack_hash;
> >         unsigned long flags;
> >
> >         raw_spin_lock_irqsave(&meta->lock, flags);
> > @@ -404,8 +497,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
> >         /* Mark the object as freed. */
> >         metadata_update_state(meta, KFENCE_OBJECT_FREED);
> >
> > +       alloc_stack_hash = meta->alloc_stack_hash;
> > +       meta->alloc_stack_hash = 0;
> > +
> >         raw_spin_unlock_irqrestore(&meta->lock, flags);
> >
> > +       alloc_covered_add(alloc_stack_hash, -1);
> > +
> >         /* Protect to detect use-after-frees. */
> >         kfence_protect((unsigned long)addr);
> >
> > @@ -744,6 +842,8 @@ void kfence_shutdown_cache(struct kmem_cache *s)
> >
> >  void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >  {
> > +       u32 alloc_stack_hash;
> > +
> >         /*
> >          * Perform size check before switching kfence_allocation_gate, so that
> >          * we don't disable KFENCE without making an allocation.
> > @@ -788,7 +888,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >         if (!READ_ONCE(kfence_enabled))
> >                 return NULL;
> >
> > -       return kfence_guarded_alloc(s, size, flags);
> > +       /*
> > +        * Do expensive check for coverage of allocation in slow-path after
> > +        * allocation_gate has already become non-zero, even though it might
> > +        * mean not making any allocation within a given sample interval.
> > +        *
> > +        * This ensures reasonable allocation coverage when the pool is almost
> > +        * full, including avoiding long-lived allocations of the same source
> > +        * filling up the pool (e.g. pagecache allocations).
> > +        */
> > +       alloc_stack_hash = get_alloc_stack_hash();
>
> Is it possible to unwind the stack only once per allocation? I.e.
> unwind here into a buffer on stack and then pass it down?

I'll investigate how bad it looks if we do that.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/3] kfence: limit currently covered allocations when pool nearly full
@ 2021-09-17 15:04       ` Marco Elver
  0 siblings, 0 replies; 14+ messages in thread
From: Marco Elver @ 2021-09-17 15:04 UTC (permalink / raw)
  To: Dmitry Vyukov
  Cc: Andrew Morton, Alexander Potapenko, Aleksandr Nogikh,
	Taras Madan, linux-kernel, linux-mm, kasan-dev

On Fri, 17 Sept 2021 at 15:52, Dmitry Vyukov <dvyukov@google.com> wrote:
[...]
> > +/*
> > + * A lossy hash map of allocation stack trace coverage: limits currently covered
> > + * allocations of the same source filling up the pool when close to full.
> > + *
> > + * The required data fits in 64 bits, and therefore we can avoid a per-entry (or
> > + * global) lock by simply storing each entry's data in an atomic64_t.
> > + */
> > +union alloc_covered_entry {
> > +       struct {
> > +               u32 alloc_stack_hash;   /* stack trace hash */
> > +               u32 covered;            /* current coverage count */
> > +       };
> > +       u64 entry;
> > +};
> > +#define ALLOC_COVERED_SIZE (1 << const_ilog2(CONFIG_KFENCE_NUM_OBJECTS | 128)) /* >= 128 */
>
> const_ilog2 rounds down, so for 1023 objects we will have hashtable of
> size 512, or am I missing something? This asking for collisions.
> Hashtable size should be larger than expected population.

That's correct. I wanted to err on the side of allocating more and not
less, if we can afford it. Hence also the choice of lossy hash map.
However, I think if we consider the whole fleet, your proposal below
makes sense and I'll rerun experiments with that and see.

> > +#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
> > +static atomic64_t alloc_covered[ALLOC_COVERED_SIZE];
> > +/* Stack depth used to determine uniqueness of an allocation. */
> > +#define UNIQUE_ALLOC_STACK_DEPTH 8
> > +/* Pool usage threshold when currently covered allocations are skipped. */
> > +#define SKIP_COVERED_THRESHOLD ((CONFIG_KFENCE_NUM_OBJECTS * 3) / 4) /* 75% */
> > +
> >  /*
> >   * Per-object metadata, with one-to-one mapping of object metadata to
> >   * backing pages (in __kfence_pool).
> > @@ -114,6 +138,7 @@ enum kfence_counter_id {
> >         KFENCE_COUNTER_BUGS,
> >         KFENCE_COUNTER_SKIP_INCOMPAT,
> >         KFENCE_COUNTER_SKIP_CAPACITY,
> > +       KFENCE_COUNTER_SKIP_COVERED,
> >         KFENCE_COUNTER_COUNT,
> >  };
> >  static atomic_long_t counters[KFENCE_COUNTER_COUNT];
> > @@ -125,11 +150,73 @@ static const char *const counter_names[] = {
> >         [KFENCE_COUNTER_BUGS]           = "total bugs",
> >         [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
> >         [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
> > +       [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
> >  };
> >  static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
> >
> >  /* === Internals ============================================================ */
> >
> > +static u32 get_alloc_stack_hash(void)
> > +{
> > +       unsigned long stack_entries[UNIQUE_ALLOC_STACK_DEPTH];
> > +       size_t num_entries;
> > +
> > +       num_entries = stack_trace_save(stack_entries, UNIQUE_ALLOC_STACK_DEPTH, 1);
>
> Strictly speaking, if a bad persistent allocation comes from an
> interrupt it may still consume whole pool. We've hit this problem with
> KASAN stackdepot unbounded growth. It's better to do
> filter_irq_stacks() here, see:
> https://elixir.bootlin.com/linux/v5.15-rc1/source/mm/kasan/common.c#L39

Time to move filter_irq_stacks() out of stackdepot, we should not
depend on stackdepot just for filter_irq_stacks(). I'll probably move
it to kernel/stacktrace.c, which seems most appropriate.

> > +       return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), 0);
> > +}
> > +
> > +/*
> > + * Check if the allocation stack trace hash @alloc_stack_hash is contained in
> > + * @alloc_covered and currently covered.
> > + */
> > +static bool alloc_covered_contains(u32 alloc_stack_hash)
> > +{
> > +       union alloc_covered_entry entry;
> > +
> > +       if (!alloc_stack_hash)
> > +               return false;
> > +
> > +       entry.entry = (u64)atomic64_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
> > +       return entry.alloc_stack_hash == alloc_stack_hash && entry.covered;
> > +}
> > +
> > +/*
> > + * Adds (or subtracts) coverage count to entry corresponding to
> > + * @alloc_stack_hash. If @alloc_stack_hash is not yet contained in
> > + * @alloc_covered, resets (potentially evicting existing) entry.
> > + */
> > +static void alloc_covered_add(u32 alloc_stack_hash, int val)
> > +{
> > +       union alloc_covered_entry old;
> > +       union alloc_covered_entry new;
> > +       atomic64_t *bucket;
> > +
> > +       if (!alloc_stack_hash)
> > +               return;
> > +
> > +       bucket = &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK];
> > +       old.entry = (u64)atomic64_read(bucket);
> > +       new.alloc_stack_hash = alloc_stack_hash;
> > +       do {
> > +               if (val > 0) {
> > +                       new.covered = old.alloc_stack_hash == alloc_stack_hash
> > +                                       ? old.covered + val     /* increment */
> > +                                       : val;                  /* evict/reset */
>
> I am trying to understand the effects of this eviction policy on the result.
> It seems that it can render the pool overflow protection void.
> Consider, two stacks (ABC, DEF) hash to the same bucket. One
> allocation is frequent and not persistent, another is less frequent
> but almost persistent. The first one will evict the second one, so we
> will always save the second effectively defeating the overflow
> protection.
>
> There are also some interesting effects due to cyclic evictions
> (A->B->A), where we do not count increment, but count decrement.
>
> Have you considered not evicting, but rather simply combining
> allocations with the same hash?

Hmm, good point. It's probably not as bad as a real bloom filter,
because we might successfully remove an entry if all the allocations
that mapped to 1 bucket are freed.

> I.e. doing alloc_covered[hash]++/--.
> It would err on the side of not sampling allocations that are unlucky
> to collide with persistent allocations, but would provide more
> reliable overflow guarantees (at least we continue sampling
> allocations for all other buckets since we have pool capacity).
> FWIW also simpler code.
>
> I am also thinking if collisions can be resolved by adding some salt
> that is generated on boot. Resolving collisions across different
> machines is good enough for KFENCE. Namely, if we have stacks ABC and
> DEF, we hash XABC and XDEF, where X is filled on boot. It should work
> for a good hash function, right? If this works, then the simpler
> alloc_covered[hash]++/-- scheme should work (?).

Good idea, I think I'll introduce a seed for the hash function.

Let me experiment with the simplified version you suggest, and see what I get.

> > +               } else if (old.alloc_stack_hash == alloc_stack_hash && old.covered) {
> > +                       new.covered = old.covered + val;
> > +               } else {
> > +                       /*
> > +                        * Hash mismatch or covered has become zero. The latter
> > +                        * is possible if we race with:
> > +                        *      reset (!= alloc_stack_hash)
> > +                        *       -> reset (== alloc_stack_hash)
> > +                        *        -> decrement
> > +                        */
> > +                       break;
> > +               }
> > +       } while (!atomic64_try_cmpxchg_relaxed(bucket, (s64 *)&old.entry, (s64)new.entry));
> > +}
> > +
> >  static bool kfence_protect(unsigned long addr)
> >  {
> >         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
> > @@ -261,7 +348,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
> >         }
> >  }
> >
> > -static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
> > +static void *
> > +kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp, u32 alloc_stack_hash)
> >  {
> >         struct kfence_metadata *meta = NULL;
> >         unsigned long flags;
> > @@ -322,6 +410,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
> >         WRITE_ONCE(meta->cache, cache);
> >         meta->size = size;
> > +       meta->alloc_stack_hash = alloc_stack_hash;
> > +
> >         for_each_canary(meta, set_canary_byte);
> >
> >         /* Set required struct page fields. */
> > @@ -334,6 +424,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >
> >         raw_spin_unlock_irqrestore(&meta->lock, flags);
> >
> > +       alloc_covered_add(alloc_stack_hash, 1);
> > +
> >         /* Memory initialization. */
> >
> >         /*
> > @@ -362,6 +454,7 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
> >  static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool zombie)
> >  {
> >         struct kcsan_scoped_access assert_page_exclusive;
> > +       u32 alloc_stack_hash;
> >         unsigned long flags;
> >
> >         raw_spin_lock_irqsave(&meta->lock, flags);
> > @@ -404,8 +497,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
> >         /* Mark the object as freed. */
> >         metadata_update_state(meta, KFENCE_OBJECT_FREED);
> >
> > +       alloc_stack_hash = meta->alloc_stack_hash;
> > +       meta->alloc_stack_hash = 0;
> > +
> >         raw_spin_unlock_irqrestore(&meta->lock, flags);
> >
> > +       alloc_covered_add(alloc_stack_hash, -1);
> > +
> >         /* Protect to detect use-after-frees. */
> >         kfence_protect((unsigned long)addr);
> >
> > @@ -744,6 +842,8 @@ void kfence_shutdown_cache(struct kmem_cache *s)
> >
> >  void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >  {
> > +       u32 alloc_stack_hash;
> > +
> >         /*
> >          * Perform size check before switching kfence_allocation_gate, so that
> >          * we don't disable KFENCE without making an allocation.
> > @@ -788,7 +888,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
> >         if (!READ_ONCE(kfence_enabled))
> >                 return NULL;
> >
> > -       return kfence_guarded_alloc(s, size, flags);
> > +       /*
> > +        * Do expensive check for coverage of allocation in slow-path after
> > +        * allocation_gate has already become non-zero, even though it might
> > +        * mean not making any allocation within a given sample interval.
> > +        *
> > +        * This ensures reasonable allocation coverage when the pool is almost
> > +        * full, including avoiding long-lived allocations of the same source
> > +        * filling up the pool (e.g. pagecache allocations).
> > +        */
> > +       alloc_stack_hash = get_alloc_stack_hash();
>
> Is it possible to unwind the stack only once per allocation? I.e.
> unwind here into a buffer on stack and then pass it down?

I'll investigate how bad it looks if we do that.


^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2021-09-17 15:04 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-09-17 11:07 [PATCH 1/3] kfence: count unexpectedly skipped allocations Marco Elver
2021-09-17 11:07 ` Marco Elver
2021-09-17 11:07 ` [PATCH 2/3] kfence: limit currently covered allocations when pool nearly full Marco Elver
2021-09-17 11:07   ` Marco Elver
2021-09-17 13:52   ` Dmitry Vyukov
2021-09-17 13:52     ` Dmitry Vyukov
2021-09-17 15:04     ` Marco Elver
2021-09-17 15:04       ` Marco Elver
2021-09-17 11:07 ` [PATCH 3/3] kfence: add note to documentation about skipping covered allocations Marco Elver
2021-09-17 11:07   ` Marco Elver
2021-09-17 12:58 ` [PATCH 1/3] kfence: count unexpectedly skipped allocations Dmitry Vyukov
2021-09-17 12:58   ` Dmitry Vyukov
2021-09-17 13:08   ` Marco Elver
2021-09-17 13:08     ` Marco Elver

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.