* [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
@ 2023-09-12 16:28 Robin Murphy
  2023-09-12 16:28 ` [PATCH v3 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
                   ` (3 more replies)
  0 siblings, 4 replies; 25+ messages in thread
From: Robin Murphy @ 2023-09-12 16:28 UTC (permalink / raw)
  To: joro
  Cc: will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

v2: https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/

Hi all,

I hope this is good to go now, just fixed the locking (and threw
lockdep at it to confirm, which of course I should have done to begin
with...) and picked up tags.

Cheers,
Robin.


Robin Murphy (2):
  iommu/iova: Make the rcache depot scale better
  iommu/iova: Manage the depot list size

 drivers/iommu/iova.c | 95 ++++++++++++++++++++++++++++++--------------
 1 file changed, 65 insertions(+), 30 deletions(-)

-- 
2.39.2.101.g768bb238c484.dirty



* [PATCH v3 1/2] iommu/iova: Make the rcache depot scale better
  2023-09-12 16:28 [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
@ 2023-09-12 16:28 ` Robin Murphy
  2023-09-12 16:28 ` [PATCH v3 2/2] iommu/iova: Manage the depot list size Robin Murphy
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 25+ messages in thread
From: Robin Murphy @ 2023-09-12 16:28 UTC (permalink / raw)
  To: joro
  Cc: will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

The algorithm in the original paper specifies the storage of full
magazines in the depot as an unbounded list rather than a fixed-size
array. It turns out to be pretty straightforward to do this in our
implementation with no significant loss of efficiency. This allows
the depot to scale up to the working set sizes of larger systems,
while also potentially saving some memory on smaller ones.

Since this involves touching struct iova_magazine with the requisite
care, we may as well reinforce the comment with a proper assertion too.

Reviewed-by: John Garry <john.g.garry@oracle.com>
Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v3: No change

 drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
 1 file changed, 35 insertions(+), 30 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 10b964600948..dd2309e9a6c5 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -622,15 +622,19 @@ EXPORT_SYMBOL_GPL(reserve_iova);
 /*
  * As kmalloc's buffer size is fixed to power of 2, 127 is chosen to
  * assure size of 'iova_magazine' to be 1024 bytes, so that no memory
- * will be wasted.
+ * will be wasted. Since only full magazines are inserted into the depot,
+ * we don't need to waste PFN capacity on a separate list head either.
  */
 #define IOVA_MAG_SIZE 127
-#define MAX_GLOBAL_MAGS 32	/* magazines per bin */
 
 struct iova_magazine {
-	unsigned long size;
+	union {
+		unsigned long size;
+		struct iova_magazine *next;
+	};
 	unsigned long pfns[IOVA_MAG_SIZE];
 };
+static_assert(!(sizeof(struct iova_magazine) & (sizeof(struct iova_magazine) - 1)));
 
 struct iova_cpu_rcache {
 	spinlock_t lock;
@@ -640,8 +644,7 @@ struct iova_cpu_rcache {
 
 struct iova_rcache {
 	spinlock_t lock;
-	unsigned long depot_size;
-	struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+	struct iova_magazine *depot;
 	struct iova_cpu_rcache __percpu *cpu_rcaches;
 };
 
@@ -717,6 +720,21 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
 	mag->pfns[mag->size++] = pfn;
 }
 
+static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
+{
+	struct iova_magazine *mag = rcache->depot;
+
+	rcache->depot = mag->next;
+	mag->size = IOVA_MAG_SIZE;
+	return mag;
+}
+
+static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *mag)
+{
+	mag->next = rcache->depot;
+	rcache->depot = mag;
+}
+
 int iova_domain_init_rcaches(struct iova_domain *iovad)
 {
 	unsigned int cpu;
@@ -734,7 +752,6 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
 
 		rcache = &iovad->rcaches[i];
 		spin_lock_init(&rcache->lock);
-		rcache->depot_size = 0;
 		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
 						     cache_line_size());
 		if (!rcache->cpu_rcaches) {
@@ -776,7 +793,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 				 struct iova_rcache *rcache,
 				 unsigned long iova_pfn)
 {
-	struct iova_magazine *mag_to_free = NULL;
 	struct iova_cpu_rcache *cpu_rcache;
 	bool can_insert = false;
 	unsigned long flags;
@@ -794,12 +810,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 
 		if (new_mag) {
 			spin_lock(&rcache->lock);
-			if (rcache->depot_size < MAX_GLOBAL_MAGS) {
-				rcache->depot[rcache->depot_size++] =
-						cpu_rcache->loaded;
-			} else {
-				mag_to_free = cpu_rcache->loaded;
-			}
+			iova_depot_push(rcache, cpu_rcache->loaded);
 			spin_unlock(&rcache->lock);
 
 			cpu_rcache->loaded = new_mag;
@@ -812,11 +823,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 
 	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
 
-	if (mag_to_free) {
-		iova_magazine_free_pfns(mag_to_free, iovad);
-		iova_magazine_free(mag_to_free);
-	}
-
 	return can_insert;
 }
 
@@ -854,9 +860,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
 		has_pfn = true;
 	} else {
 		spin_lock(&rcache->lock);
-		if (rcache->depot_size > 0) {
+		if (rcache->depot) {
 			iova_magazine_free(cpu_rcache->loaded);
-			cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+			cpu_rcache->loaded = iova_depot_pop(rcache);
 			has_pfn = true;
 		}
 		spin_unlock(&rcache->lock);
@@ -895,9 +901,8 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 	struct iova_rcache *rcache;
 	struct iova_cpu_rcache *cpu_rcache;
 	unsigned int cpu;
-	int i, j;
 
-	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+	for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
 		rcache = &iovad->rcaches[i];
 		if (!rcache->cpu_rcaches)
 			break;
@@ -907,8 +912,8 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 			iova_magazine_free(cpu_rcache->prev);
 		}
 		free_percpu(rcache->cpu_rcaches);
-		for (j = 0; j < rcache->depot_size; ++j)
-			iova_magazine_free(rcache->depot[j]);
+		while (rcache->depot)
+			iova_magazine_free(iova_depot_pop(rcache));
 	}
 
 	kfree(iovad->rcaches);
@@ -942,16 +947,16 @@ static void free_global_cached_iovas(struct iova_domain *iovad)
 {
 	struct iova_rcache *rcache;
 	unsigned long flags;
-	int i, j;
 
-	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+	for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
 		rcache = &iovad->rcaches[i];
 		spin_lock_irqsave(&rcache->lock, flags);
-		for (j = 0; j < rcache->depot_size; ++j) {
-			iova_magazine_free_pfns(rcache->depot[j], iovad);
-			iova_magazine_free(rcache->depot[j]);
+		while (rcache->depot) {
+			struct iova_magazine *mag = iova_depot_pop(rcache);
+
+			iova_magazine_free_pfns(mag, iovad);
+			iova_magazine_free(mag);
 		}
-		rcache->depot_size = 0;
 		spin_unlock_irqrestore(&rcache->lock, flags);
 	}
 }
-- 
2.39.2.101.g768bb238c484.dirty



* [PATCH v3 2/2] iommu/iova: Manage the depot list size
  2023-09-12 16:28 [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
  2023-09-12 16:28 ` [PATCH v3 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
@ 2023-09-12 16:28 ` Robin Murphy
  2023-09-25 10:08 ` [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible Joerg Roedel
  2023-12-28 12:23 ` Ido Schimmel
  3 siblings, 0 replies; 25+ messages in thread
From: Robin Murphy @ 2023-09-12 16:28 UTC (permalink / raw)
  To: joro
  Cc: will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

Automatically scaling the depot up to suit the peak capacity of a
workload is all well and good, but it would be nice to have a way to
scale it back down again if the workload changes. To that end, add
background reclaim that will gradually free surplus magazines if the
depot size remains above a reasonable threshold for long enough.

Reviewed-by: Jerry Snitselaar <jsnitsel@redhat.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---

v3: Make sure iova_depot_work_func() locking is IRQ-safe
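
For context, a rough worked example of the reclaim behaviour this adds
(illustrative only; the 8-CPU / 32-magazine figures below are made up,
the constants come from the diff):

/*
 * iova_depot_work_func() frees at most one surplus magazine per run and
 * only reschedules itself while the depot stays above num_online_cpus().
 * So with IOVA_DEPOT_DELAY = 100ms and, say, 8 online CPUs, a depot that
 * has grown to 32 magazines decays back to 8 in roughly
 * (32 - 8) * 100ms = 2.4s of idle time.
 */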

 drivers/iommu/iova.c | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index dd2309e9a6c5..d30e453d0fb4 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -11,6 +11,7 @@
 #include <linux/smp.h>
 #include <linux/bitops.h>
 #include <linux/cpu.h>
+#include <linux/workqueue.h>
 
 /* The anchor node sits above the top of the usable address space */
 #define IOVA_ANCHOR	~0UL
@@ -627,6 +628,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
  */
 #define IOVA_MAG_SIZE 127
 
+#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
+
 struct iova_magazine {
 	union {
 		unsigned long size;
@@ -644,8 +647,11 @@ struct iova_cpu_rcache {
 
 struct iova_rcache {
 	spinlock_t lock;
+	unsigned int depot_size;
 	struct iova_magazine *depot;
 	struct iova_cpu_rcache __percpu *cpu_rcaches;
+	struct iova_domain *iovad;
+	struct delayed_work work;
 };
 
 static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
@@ -726,6 +732,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
 
 	rcache->depot = mag->next;
 	mag->size = IOVA_MAG_SIZE;
+	rcache->depot_size--;
 	return mag;
 }
 
@@ -733,6 +740,25 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma
 {
 	mag->next = rcache->depot;
 	rcache->depot = mag;
+	rcache->depot_size++;
+}
+
+static void iova_depot_work_func(struct work_struct *work)
+{
+	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
+	struct iova_magazine *mag = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rcache->lock, flags);
+	if (rcache->depot_size > num_online_cpus())
+		mag = iova_depot_pop(rcache);
+	spin_unlock_irqrestore(&rcache->lock, flags);
+
+	if (mag) {
+		iova_magazine_free_pfns(mag, rcache->iovad);
+		iova_magazine_free(mag);
+		schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
+	}
 }
 
 int iova_domain_init_rcaches(struct iova_domain *iovad)
@@ -752,6 +778,8 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
 
 		rcache = &iovad->rcaches[i];
 		spin_lock_init(&rcache->lock);
+		rcache->iovad = iovad;
+		INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func);
 		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
 						     cache_line_size());
 		if (!rcache->cpu_rcaches) {
@@ -812,6 +840,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 			spin_lock(&rcache->lock);
 			iova_depot_push(rcache, cpu_rcache->loaded);
 			spin_unlock(&rcache->lock);
+			schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
 
 			cpu_rcache->loaded = new_mag;
 			can_insert = true;
@@ -912,6 +941,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 			iova_magazine_free(cpu_rcache->prev);
 		}
 		free_percpu(rcache->cpu_rcaches);
+		cancel_delayed_work_sync(&rcache->work);
 		while (rcache->depot)
 			iova_magazine_free(iova_depot_pop(rcache));
 	}
-- 
2.39.2.101.g768bb238c484.dirty



* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-09-12 16:28 [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
  2023-09-12 16:28 ` [PATCH v3 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
  2023-09-12 16:28 ` [PATCH v3 2/2] iommu/iova: Manage the depot list size Robin Murphy
@ 2023-09-25 10:08 ` Joerg Roedel
  2023-12-28 12:23 ` Ido Schimmel
  3 siblings, 0 replies; 25+ messages in thread
From: Joerg Roedel @ 2023-09-25 10:08 UTC (permalink / raw)
  To: Robin Murphy
  Cc: will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
> Robin Murphy (2):
>   iommu/iova: Make the rcache depot scale better
>   iommu/iova: Manage the depot list size

Applied, thanks.


* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-09-12 16:28 [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
                   ` (2 preceding siblings ...)
  2023-09-25 10:08 ` [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible Joerg Roedel
@ 2023-12-28 12:23 ` Ido Schimmel
  2024-01-02  7:24   ` Ido Schimmel
                     ` (3 more replies)
  3 siblings, 4 replies; 25+ messages in thread
From: Ido Schimmel @ 2023-12-28 12:23 UTC (permalink / raw)
  To: Robin Murphy
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
> v2: https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
> 
> Hi all,
> 
> I hope this is good to go now, just fixed the locking (and threw
> lockdep at it to confirm, which of course I should have done to begin
> with...) and picked up tags.

Hi,

After pulling the v6.7 changes we started seeing the following memory
leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
which is why I didn't perform bisection. However, looking at the
mentioned code paths, they seem to have been changed in v6.7 as part of
this patchset. I reverted both patches and didn't see any memory leaks
when running a full regression (~10 hours), but I will repeat it to be
sure.

Any idea what could be the problem?

Thanks

[1]
unreferenced object 0xffff8881a5301000 (size 1024): 
  comm "softirq", pid 0, jiffies 4306297099 (age 462.991s) 
  hex dump (first 32 bytes): 
    00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00  .........}...... 
    0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00  ................ 
  backtrace: 
    [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320 
    [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60 
    [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0 
    [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310 
    [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0 
    [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0 
    [<ffffffff813ea16b>] __run_timers+0x78b/0xb80 
    [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0 
    [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5 

unreferenced object 0xffff8881392ec000 (size 1024): 
  comm "softirq", pid 0, jiffies 4306326731 (age 433.359s) 
  hex dump (first 32 bytes): 
    00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00  ..0.....P....... 
    f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00  ................ 
  backtrace: 
    [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320 
    [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60 
    [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0 
    [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310 
    [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0 
    [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0 
    [<ffffffff813ea16b>] __run_timers+0x78b/0xb80 
    [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0 
    [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5 

unreferenced object 0xffff8881411f9000 (size 1024): 
  comm "softirq", pid 0, jiffies 4306708887 (age 51.459s) 
  hex dump (first 32 bytes): 
    00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00  ...&....,....... 
    ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00  ................ 
  backtrace: 
    [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320 
    [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60 
    [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0 
    [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310 
    [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0 
    [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0 
    [<ffffffff813ea16b>] __run_timers+0x78b/0xb80 
    [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0 
    [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5 

unreferenced object 0xffff88812be26400 (size 1024): 
  comm "softirq", pid 0, jiffies 4306710027 (age 50.319s) 
  hex dump (first 32 bytes): 
    00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00  ...9....2....... 
    e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00  ................ 
  backtrace: 
    [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320 
    [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60 
    [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0 
    [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310 
    [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0 
    [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0 
    [<ffffffff813ea16b>] __run_timers+0x78b/0xb80 
    [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0 
    [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5 

unreferenced object 0xffff8881261c1000 (size 1024): 
  comm "softirq", pid 0, jiffies 4306711547 (age 48.799s) 
  hex dump (first 32 bytes): 
    00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00  .d.+.....|...... 
    87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00  ................ 
  backtrace: 
    [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320 
    [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60 
    [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0 
    [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310 
    [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0 
    [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0 
    [<ffffffff813ea16b>] __run_timers+0x78b/0xb80 
    [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0 
    [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5 


* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-12-28 12:23 ` Ido Schimmel
@ 2024-01-02  7:24   ` Ido Schimmel
  2024-01-03  8:38     ` Joerg Roedel
  2024-01-06  4:21     ` Ethan Zhao
  2024-01-06  4:03   ` Ethan Zhao
                     ` (2 subsequent siblings)
  3 siblings, 2 replies; 25+ messages in thread
From: Ido Schimmel @ 2024-01-02  7:24 UTC (permalink / raw)
  To: Robin Murphy
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

On Thu, Dec 28, 2023 at 02:23:20PM +0200, Ido Schimmel wrote:
> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
> > v2: https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
> > 
> > Hi all,
> > 
> > I hope this is good to go now, just fixed the locking (and threw
> > lockdep at it to confirm, which of course I should have done to begin
> > with...) and picked up tags.
> 
> Hi,
> 
> After pulling the v6.7 changes we started seeing the following memory
> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
> which is why I didn't perform bisection. However, looking at the
> mentioned code paths, they seem to have been changed in v6.7 as part of
> this patchset. I reverted both patches and didn't see any memory leaks
> when running a full regression (~10 hours), but I will repeat it to be
> sure.

FYI, we didn't see the leaks since reverting these two patches whereas
before we saw them almost every day, so I'm quite sure they introduced
the leaks.

Thanks


* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-02  7:24   ` Ido Schimmel
@ 2024-01-03  8:38     ` Joerg Roedel
  2024-01-06  4:21     ` Ethan Zhao
  1 sibling, 0 replies; 25+ messages in thread
From: Joerg Roedel @ 2024-01-03  8:38 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Ido Schimmel, will, iommu, linux-kernel, zhangzekun11,
	john.g.garry, dheerajkumar.srivastava, jsnitsel

On Tue, Jan 02, 2024 at 09:24:37AM +0200, Ido Schimmel wrote:
> FYI, we didn't see the leaks since reverting these two patches whereas
> before we saw them almost everyday, so I'm quite sure they introduced
> the leaks.

Robin, can you have a look please?

Thanks,

	Joerg


* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-12-28 12:23 ` Ido Schimmel
  2024-01-02  7:24   ` Ido Schimmel
@ 2024-01-06  4:03   ` Ethan Zhao
  2024-01-08  3:13   ` Ethan Zhao
  2024-01-08 17:35   ` Robin Murphy
  3 siblings, 0 replies; 25+ messages in thread
From: Ethan Zhao @ 2024-01-06  4:03 UTC (permalink / raw)
  To: Ido Schimmel, Robin Murphy
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel


On 12/28/2023 8:23 PM, Ido Schimmel wrote:
> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>> v2: https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
>>
>> Hi all,
>>
>> I hope this is good to go now, just fixed the locking (and threw
>> lockdep at it to confirm, which of course I should have done to begin
>> with...) and picked up tags.
> Hi,
>
> After pulling the v6.7 changes we started seeing the following memory
> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
> which is why I didn't perform bisection. However, looking at the
> mentioned code paths, they seem to have been changed in v6.7 as part of
> this patchset. I reverted both patches and didn't see any memory leaks
> when running a full regression (~10 hours), but I will repeat it to be
> sure.
>
> Any idea what could be the problem?
>
> Thanks
>
> [1]
> unreferenced object 0xffff8881a5301000 (size 1024):

Size 1024, so it seems an "iova_magazine" was leaked?

No other struct happens to be 1024 bytes.
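
For reference, a quick size check (just a sketch, assuming a 64-bit
unsigned long as on x86-64; the struct definition is copied from the
patch above):

#include <assert.h>

#define IOVA_MAG_SIZE 127

struct iova_magazine {
	union {
		unsigned long size;
		struct iova_magazine *next;
	};
	unsigned long pfns[IOVA_MAG_SIZE];
};

/* (1 + 127) * sizeof(unsigned long) = 128 * 8 = 1024 bytes on 64-bit */
static_assert(sizeof(struct iova_magazine) == 1024,
	      "matches the 1024-byte objects in the kmemleak report");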


Thanks,

Ethan

>    comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
>    hex dump (first 32 bytes):
>      00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00  .........}......
>      0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881392ec000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
>    hex dump (first 32 bytes):
>      00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00  ..0.....P.......
>      f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881411f9000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
>    hex dump (first 32 bytes):
>      00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00  ...&....,.......
>      ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff88812be26400 (size 1024):
>    comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
>    hex dump (first 32 bytes):
>      00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00  ...9....2.......
>      e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881261c1000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
>    hex dump (first 32 bytes):
>      00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00  .d.+.....|......
>      87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>


* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-02  7:24   ` Ido Schimmel
  2024-01-03  8:38     ` Joerg Roedel
@ 2024-01-06  4:21     ` Ethan Zhao
  2024-01-06  7:07       ` zhangzekun (A)
  1 sibling, 1 reply; 25+ messages in thread
From: Ethan Zhao @ 2024-01-06  4:21 UTC (permalink / raw)
  To: Ido Schimmel, Robin Murphy
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel


On 1/2/2024 3:24 PM, Ido Schimmel wrote:
> On Thu, Dec 28, 2023 at 02:23:20PM +0200, Ido Schimmel wrote:
>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>> v2: https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
>>>
>>> Hi all,
>>>
>>> I hope this is good to go now, just fixed the locking (and threw
>>> lockdep at it to confirm, which of course I should have done to begin
>>> with...) and picked up tags.
>> Hi,
>>
>> After pulling the v6.7 changes we started seeing the following memory
>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>> which is why I didn't perform bisection. However, looking at the
>> mentioned code paths, they seem to have been changed in v6.7 as part of
>> this patchset. I reverted both patches and didn't see any memory leaks
>> when running a full regression (~10 hours), but I will repeat it to be
>> sure.
> FYI, we didn't see the leaks since reverting these two patches whereas
> before we saw them almost everyday, so I'm quite sure they introduced
> the leaks.

It seems some magazines were not freed when one CPU is dead (hot-unplugged)?

static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
{
	struct iova_cpu_rcache *cpu_rcache;
	struct iova_rcache *rcache;
	unsigned long flags;
	int i;

	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
		rcache = &iovad->rcaches[i];
		cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
		spin_lock_irqsave(&cpu_rcache->lock, flags);
		iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
+		iova_magazine_free(cpu_rcache->loaded);
		iova_magazine_free_pfns(cpu_rcache->prev, iovad);
+		iova_magazine_free(cpu_rcache->prev);
		spin_unlock_irqrestore(&cpu_rcache->lock, flags);
	}
}



Thanks,

Ethan

> Thanks
>


* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-06  4:21     ` Ethan Zhao
@ 2024-01-06  7:07       ` zhangzekun (A)
  2024-01-06  7:33         ` Ethan Zhao
  0 siblings, 1 reply; 25+ messages in thread
From: zhangzekun (A) @ 2024-01-06  7:07 UTC (permalink / raw)
  To: Ethan Zhao
  Cc: joro, will, iommu, linux-kernel, john.g.garry,
	dheerajkumar.srivastava, jsnitsel, Ido Schimmel, Robin Murphy



在 2024/1/6 12:21, Ethan Zhao 写道:
>
> On 1/2/2024 3:24 PM, Ido Schimmel wrote:
>> On Thu, Dec 28, 2023 at 02:23:20PM +0200, Ido Schimmel wrote:
>>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>>> v2: 
>>>> https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
>>>>
>>>> Hi all,
>>>>
>>>> I hope this is good to go now, just fixed the locking (and threw
>>>> lockdep at it to confirm, which of course I should have done to begin
>>>> with...) and picked up tags.
>>> Hi,
>>>
>>> After pulling the v6.7 changes we started seeing the following memory
>>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>>> which is why I didn't perform bisection. However, looking at the
>>> mentioned code paths, they seem to have been changed in v6.7 as part of
>>> this patchset. I reverted both patches and didn't see any memory leaks
>>> when running a full regression (~10 hours), but I will repeat it to be
>>> sure.
>> FYI, we didn't see the leaks since reverting these two patches whereas
>> before we saw them almost everyday, so I'm quite sure they introduced
>> the leaks.
>
> Seems some magazines were not freed when one CPU is dead (hot 
> unplugged) ?
>
> static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain 
> *iovad)
> {
>     struct iova_cpu_rcache *cpu_rcache;
>     struct iova_rcache *rcache;
>     unsigned long flags;
>     int i;
>
>     for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>         rcache = &iovad->rcaches[i];
>         cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
>         spin_lock_irqsave(&cpu_rcache->lock, flags);
>         iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
>
> +     iova_magazine_free(cpu_rcache->loaded);
>
>         iova_magazine_free_pfns(cpu_rcache->prev, iovad);
>
> +     iova_magazine_free(cpu_rcache->prev);
>
>         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
>     }
> }
It seems cpu_rcache->loaded and cpu_rcache->prev will be freed in
free_iova_rcaches(), and that should not cause a memory leak because
iova_magazine_free() will be called for each possible cpu.
free_cpu_cached_iovas() is used to free the cached iovas held in the
magazines.
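
For reference, the per-CPU part of free_iova_rcaches() is roughly the
following (reconstructed from the context of the hunks earlier in the
thread, so treat it as a sketch):

		for_each_possible_cpu(cpu) {
			cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
			iova_magazine_free(cpu_rcache->loaded);
			iova_magazine_free(cpu_rcache->prev);
		}

Since this iterates over every possible CPU, magazines belonging to CPUs
that went offline earlier are freed here as well.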

Thanks,
Zekun




* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-06  7:07       ` zhangzekun (A)
@ 2024-01-06  7:33         ` Ethan Zhao
  0 siblings, 0 replies; 25+ messages in thread
From: Ethan Zhao @ 2024-01-06  7:33 UTC (permalink / raw)
  To: zhangzekun (A)
  Cc: joro, will, iommu, linux-kernel, john.g.garry,
	dheerajkumar.srivastava, jsnitsel, Ido Schimmel, Robin Murphy


On 1/6/2024 3:07 PM, zhangzekun (A) wrote:
>
>
> 在 2024/1/6 12:21, Ethan Zhao 写道:
>>
>> On 1/2/2024 3:24 PM, Ido Schimmel wrote:
>>> On Thu, Dec 28, 2023 at 02:23:20PM +0200, Ido Schimmel wrote:
>>>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>>>> v2: 
>>>>> https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
>>>>>
>>>>> Hi all,
>>>>>
>>>>> I hope this is good to go now, just fixed the locking (and threw
>>>>> lockdep at it to confirm, which of course I should have done to begin
>>>>> with...) and picked up tags.
>>>> Hi,
>>>>
>>>> After pulling the v6.7 changes we started seeing the following memory
>>>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>>>> which is why I didn't perform bisection. However, looking at the
>>>> mentioned code paths, they seem to have been changed in v6.7 as 
>>>> part of
>>>> this patchset. I reverted both patches and didn't see any memory leaks
>>>> when running a full regression (~10 hours), but I will repeat it to be
>>>> sure.
>>> FYI, we didn't see the leaks since reverting these two patches whereas
>>> before we saw them almost everyday, so I'm quite sure they introduced
>>> the leaks.
>>
>> Seems some magazines were not freed when one CPU is dead (hot 
>> unplugged) ?
>>
>> static void free_cpu_cached_iovas(unsigned int cpu, struct 
>> iova_domain *iovad)
>> {
>>     struct iova_cpu_rcache *cpu_rcache;
>>     struct iova_rcache *rcache;
>>     unsigned long flags;
>>     int i;
>>
>>     for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>>         rcache = &iovad->rcaches[i];
>>         cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
>>         spin_lock_irqsave(&cpu_rcache->lock, flags);
>>         iova_magazine_free_pfns(cpu_rcache->loaded, iovad);
>>
>> +     iova_magazine_free(cpu_rcache->loaded);
>>
>>         iova_magazine_free_pfns(cpu_rcache->prev, iovad);
>>
>> +     iova_magazine_free(cpu_rcache->prev);
>>
>>         spin_unlock_irqrestore(&cpu_rcache->lock, flags);
>>     }
>> }
> It seems cpu_rcache->loaded and cpu_rcache->prev will be freed in 
> free_iova_rcaches(), and it should not cause memory leak because 
> iova_magazine_free() will be called for each possible cpu.
> free_cpu_cached_iovas() is used to free cached iovas in magazines.

Yup, looked more closely. Possible CPUs, not just online CPUs.

Thanks,

Ethan

>
> Thanks,
> Zekun
>
>
>


* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-12-28 12:23 ` Ido Schimmel
  2024-01-02  7:24   ` Ido Schimmel
  2024-01-06  4:03   ` Ethan Zhao
@ 2024-01-08  3:13   ` Ethan Zhao
  2024-01-08 17:35   ` Robin Murphy
  3 siblings, 0 replies; 25+ messages in thread
From: Ethan Zhao @ 2024-01-08  3:13 UTC (permalink / raw)
  To: Ido Schimmel, Robin Murphy
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel


On 12/28/2023 8:23 PM, Ido Schimmel wrote:
> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>> v2: https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
>>
>> Hi all,
>>
>> I hope this is good to go now, just fixed the locking (and threw
>> lockdep at it to confirm, which of course I should have done to begin
>> with...) and picked up tags.
> Hi,
>
> After pulling the v6.7 changes we started seeing the following memory
> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
> which is why I didn't perform bisection. However, looking at the
> mentioned code paths, they seem to have been changed in v6.7 as part of
> this patchset. I reverted both patches and didn't see any memory leaks
> when running a full regression (~10 hours), but I will repeat it to be
> sure.
>
> Any idea what could be the problem?
>
> Thanks
>
> [1]
> unreferenced object 0xffff8881a5301000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
>    hex dump (first 32 bytes):
>      00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00  .........}......
Here the magazine is a loaded one and empty: size is 0.
>      0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881392ec000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
>    hex dump (first 32 bytes):
>      00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00  ..0.....P.......
The above *unreferenced object 0xffff8881a5301000* is referenced here,

00 10 30 a5 81 88 ff ff -> ff ff 88 81 a5 30 10 00

>      f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881411f9000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
>    hex dump (first 32 bytes):
>      00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00  ...&....,.......
>      ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff88812be26400 (size 1024):
>    comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
>    hex dump (first 32 bytes):
>      00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00  ...9....2.......
>      e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>
> unreferenced object 0xffff8881261c1000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
>    hex dump (first 32 bytes):
>      00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00  .d.+.....|......

The value of the first 8 bytes is ff ff 88 81 2b e2 64 00 (little-endian),
which is the address of the object 0xffff88812be26400 above,

so the struct is exactly

struct iova_magazine {
	union {
		unsigned long size;
		struct iova_magazine *next;
	};
	unsigned long pfns[IOVA_MAG_SIZE];
};

When the magazine is stored in the depot, the *next member is used to
point to the next magazine, but when the magazine is loaded, the size
member is used.

So this report should be a *false positive* from kmemleak.
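
To make the union aliasing concrete, a small decoding sketch
(mag_first_word_kind() is a hypothetical helper, not kernel code):

#define IOVA_MAG_SIZE 127

/*
 * The first word of struct iova_magazine is a union: 'size' while the
 * magazine is loaded/prev on a CPU, 'next' while it sits on a depot list.
 * A value <= IOVA_MAG_SIZE can only be a size; a kernel address can only
 * be a next pointer.
 */
static const char *mag_first_word_kind(unsigned long word)
{
	return word <= IOVA_MAG_SIZE ? "size (loaded/prev magazine)"
				     : "next (magazine on a depot list)";
}

/*
 * Applied to the dumps above:
 *   0xffff8881a5301000: first word 0x0                  -> size = 0 (empty)
 *   0xffff8881261c1000: first word 0xffff88812be26400   -> next pointer
 */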

Thanks,

Ethan

>      87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>


* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-12-28 12:23 ` Ido Schimmel
                     ` (2 preceding siblings ...)
  2024-01-08  3:13   ` Ethan Zhao
@ 2024-01-08 17:35   ` Robin Murphy
  2024-01-09  5:54     ` Ethan Zhao
  2024-01-09 17:21     ` Ido Schimmel
  3 siblings, 2 replies; 25+ messages in thread
From: Robin Murphy @ 2024-01-08 17:35 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

On 2023-12-28 12:23 pm, Ido Schimmel wrote:
> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>> v2: https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
>>
>> Hi all,
>>
>> I hope this is good to go now, just fixed the locking (and threw
>> lockdep at it to confirm, which of course I should have done to begin
>> with...) and picked up tags.
> 
> Hi,
> 
> After pulling the v6.7 changes we started seeing the following memory
> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
> which is why I didn't perform bisection. However, looking at the
> mentioned code paths, they seem to have been changed in v6.7 as part of
> this patchset. I reverted both patches and didn't see any memory leaks
> when running a full regression (~10 hours), but I will repeat it to be
> sure.
> 
> Any idea what could be the problem?

Hmm, we've got what looks to be a set of magazines forming a plausible 
depot list (or at least the tail end of one):

ffff8881411f9000 -> ffff8881261c1000
ffff8881261c1000 -> ffff88812be26400
ffff88812be26400 -> ffff8881392ec000
ffff8881392ec000 -> ffff8881a5301000
ffff8881a5301000 -> NULL

which I guess has somehow become detached from its rcache->depot without 
being freed properly? However I'm struggling to see any conceivable way 
that could happen which wouldn't already be more severely broken in 
other ways as well (i.e. either general memory corruption or someone 
somehow still trying to use the IOVA domain while it's being torn down).

Out of curiosity, does reverting just patch #2 alone make a difference? 
And is your workload doing anything "interesting" in relation to IOVA 
domain lifetimes, like creating and destroying SR-IOV virtual functions, 
changing IOMMU domain types via sysfs, or using that horrible vdpa 
thing, or are you seeing this purely from regular driver DMA API usage?

Thanks,
Robin.

> 
> Thanks
> 
> [1]
> unreferenced object 0xffff8881a5301000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
>    hex dump (first 32 bytes):
>      00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00  .........}......
>      0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
> 
> unreferenced object 0xffff8881392ec000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
>    hex dump (first 32 bytes):
>      00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00  ..0.....P.......
>      f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
> 
> unreferenced object 0xffff8881411f9000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
>    hex dump (first 32 bytes):
>      00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00  ...&....,.......
>      ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
> 
> unreferenced object 0xffff88812be26400 (size 1024):
>    comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
>    hex dump (first 32 bytes):
>      00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00  ...9....2.......
>      e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
> 
> unreferenced object 0xffff8881261c1000 (size 1024):
>    comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
>    hex dump (first 32 bytes):
>      00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00  .d.+.....|......
>      87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00  ................
>    backtrace:
>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5


* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-08 17:35   ` Robin Murphy
@ 2024-01-09  5:54     ` Ethan Zhao
  2024-01-09  6:23       ` Ethan Zhao
  2024-01-09 17:21     ` Ido Schimmel
  1 sibling, 1 reply; 25+ messages in thread
From: Ethan Zhao @ 2024-01-09  5:54 UTC (permalink / raw)
  To: Robin Murphy, Ido Schimmel
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel


On 1/9/2024 1:35 AM, Robin Murphy wrote:
> On 2023-12-28 12:23 pm, Ido Schimmel wrote:
>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>> v2: 
>>> https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
>>>
>>> Hi all,
>>>
>>> I hope this is good to go now, just fixed the locking (and threw
>>> lockdep at it to confirm, which of course I should have done to begin
>>> with...) and picked up tags.
>>
>> Hi,
>>
>> After pulling the v6.7 changes we started seeing the following memory
>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>> which is why I didn't perform bisection. However, looking at the
>> mentioned code paths, they seem to have been changed in v6.7 as part of
>> this patchset. I reverted both patches and didn't see any memory leaks
>> when running a full regression (~10 hours), but I will repeat it to be
>> sure.
>>
>> Any idea what could be the problem?
>
> Hmm, we've got what looks to be a set of magazines forming a plausible 
> depot list (or at least the tail end of one):
>
> ffff8881411f9000 -> ffff8881261c1000
>
> ffff8881261c1000 -> ffff88812be26400
>
> ffff88812be26400 -> ffff8881392ec000
>
> ffff8881392ec000 -> ffff8881a5301000
>
> ffff8881a5301000 -> NULL
>
> which I guess has somehow become detached from its rcache->depot 
> without being freed properly? However I'm struggling to see any 
> conceivable way that could happen which wouldn't already be more 
> severely broken in other ways as well (i.e. either general memory 
> corruption or someone somehow still trying to use the IOVA domain 
> while it's being torn down).
>
> Out of curiosity, does reverting just patch #2 alone make a 
> difference? And is your workload doing anything "interesting" in 
> relation to IOVA domain lifetimes, like creating and destroying SR-IOV 
> virtual functions, changing IOMMU domain types via sysfs, or using 
> that horrible vdpa thing, or are you seeing this purely from regular 
> driver DMA API usage?

There is no lock held in free_iova_rcaches(); is it possible for
free_iova_rcaches() to race with the delayed iova_depot_work_func()?

I don't know why cancel_delayed_work_sync(&rcache->work) is not called
first in free_iova_rcaches() to avoid a possible race.
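
Purely as an illustration of that suggestion (a sketch, untested, based
on the hunks earlier in the thread), the teardown could cancel the work
before touching the caches:

static void free_iova_rcaches(struct iova_domain *iovad)
{
	struct iova_rcache *rcache;
	struct iova_cpu_rcache *cpu_rcache;
	unsigned int cpu;

	for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
		rcache = &iovad->rcaches[i];
		if (!rcache->cpu_rcaches)
			break;
		/* stop the delayed reclaim first, so iova_depot_work_func()
		 * cannot pop magazines concurrently with the teardown below */
		cancel_delayed_work_sync(&rcache->work);
		for_each_possible_cpu(cpu) {
			cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
			iova_magazine_free(cpu_rcache->loaded);
			iova_magazine_free(cpu_rcache->prev);
		}
		free_percpu(rcache->cpu_rcaches);
		while (rcache->depot)
			iova_magazine_free(iova_depot_pop(rcache));
	}

	kfree(iovad->rcaches);
}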


Thanks,

Ethan

>
> Thanks,
> Robin.
>
>>
>> Thanks
>>
>> [1]
>> unreferenced object 0xffff8881a5301000 (size 1024):
>>    comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
>>    hex dump (first 32 bytes):
>>      00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00 .........}......
>>      0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00 ................
>>    backtrace:
>>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>
>> unreferenced object 0xffff8881392ec000 (size 1024):
>>    comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
>>    hex dump (first 32 bytes):
>>      00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00 ..0.....P.......
>>      f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00 ................
>>    backtrace:
>>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>
>> unreferenced object 0xffff8881411f9000 (size 1024):
>>    comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
>>    hex dump (first 32 bytes):
>>      00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00 ...&....,.......
>>      ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00 ................
>>    backtrace:
>>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>
>> unreferenced object 0xffff88812be26400 (size 1024):
>>    comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
>>    hex dump (first 32 bytes):
>>      00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00 ...9....2.......
>>      e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00 ................
>>    backtrace:
>>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>
>> unreferenced object 0xffff8881261c1000 (size 1024):
>>    comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
>>    hex dump (first 32 bytes):
>>      00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00 .d.+.....|......
>>      87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00 ................
>>    backtrace:
>>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>


* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-09  5:54     ` Ethan Zhao
@ 2024-01-09  6:23       ` Ethan Zhao
  2024-01-09 11:26         ` Robin Murphy
  0 siblings, 1 reply; 25+ messages in thread
From: Ethan Zhao @ 2024-01-09  6:23 UTC (permalink / raw)
  To: Robin Murphy, Ido Schimmel
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel


On 1/9/2024 1:54 PM, Ethan Zhao wrote:
>
> On 1/9/2024 1:35 AM, Robin Murphy wrote:
>> On 2023-12-28 12:23 pm, Ido Schimmel wrote:
>>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>>> v2: 
>>>> https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
>>>>
>>>> Hi all,
>>>>
>>>> I hope this is good to go now, just fixed the locking (and threw
>>>> lockdep at it to confirm, which of course I should have done to begin
>>>> with...) and picked up tags.
>>>
>>> Hi,
>>>
>>> After pulling the v6.7 changes we started seeing the following memory
>>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>>> which is why I didn't perform bisection. However, looking at the
>>> mentioned code paths, they seem to have been changed in v6.7 as part of
>>> this patchset. I reverted both patches and didn't see any memory leaks
>>> when running a full regression (~10 hours), but I will repeat it to be
>>> sure.
>>>
>>> Any idea what could be the problem?
>>
>> Hmm, we've got what looks to be a set of magazines forming a 
>> plausible depot list (or at least the tail end of one):
>>
>> ffff8881411f9000 -> ffff8881261c1000
>>
>> ffff8881261c1000 -> ffff88812be26400
>>
>> ffff88812be26400 -> ffff8881392ec000
>>
>> ffff8881392ec000 -> ffff8881a5301000
>>
>> ffff8881a5301000 -> NULL
>>
>> which I guess has somehow become detached from its rcache->depot 
>> without being freed properly? However I'm struggling to see any 
>> conceivable way that could happen which wouldn't already be more 
>> severely broken in other ways as well (i.e. either general memory 
>> corruption or someone somehow still trying to use the IOVA domain 
>> while it's being torn down).
>>
>> Out of curiosity, does reverting just patch #2 alone make a 
>> difference? And is your workload doing anything "interesting" in 
>> relation to IOVA domain lifetimes, like creating and destroying 
>> SR-IOV virtual functions, changing IOMMU domain types via sysfs, or 
>> using that horrible vdpa thing, or are you seeing this purely from 
>> regular driver DMA API usage?
>
> There is no lock held in free_iova_rcaches(); is it possible for
> free_iova_rcaches() to race with the delayed iova_depot_work_func()?
>
> I don't know why cancel_delayed_work_sync(&rcache->work) is not called
> first in free_iova_rcaches() to avoid a possible race.
>
Between the following function pairs, is a race possible if they are
called concurrently?

1. free_iova_rcaches() with iova_depot_work_func()

   free_iova_rcaches() holds no lock, iova_depot_work_func() holds
   rcache->lock.

2. iova_cpuhp_dead() with iova_depot_work_func()

   iova_cpuhp_dead() holds the per-CPU lock cpu_rcache->lock,
   iova_depot_work_func() holds rcache->lock.

3. iova_cpuhp_dead() with free_iova_rcaches()

   iova_cpuhp_dead() holds the per-CPU lock cpu_rcache->lock,
   free_iova_rcaches() holds no lock.

4. iova_cpuhp_dead() with free_global_cached_iovas()

   iova_cpuhp_dead() holds the per-CPU lock cpu_rcache->lock and
   free_global_cached_iovas() holds rcache->lock.

......

Thanks,

Ethan


>
> Thanks,
>
> Ethan
>
>>
>> Thanks,
>> Robin.
>>
>>>
>>> Thanks
>>>
>>> [1]
>>> unreferenced object 0xffff8881a5301000 (size 1024):
>>>    comm "softirq", pid 0, jiffies 4306297099 (age 462.991s)
>>>    hex dump (first 32 bytes):
>>>      00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00 .........}......
>>>      0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00 ................
>>>    backtrace:
>>>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>>
>>> unreferenced object 0xffff8881392ec000 (size 1024):
>>>    comm "softirq", pid 0, jiffies 4306326731 (age 433.359s)
>>>    hex dump (first 32 bytes):
>>>      00 10 30 a5 81 88 ff ff 50 ff 0f 00 00 00 00 00 ..0.....P.......
>>>      f3 99 05 00 00 00 00 00 87 b7 05 00 00 00 00 00 ................
>>>    backtrace:
>>>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>>
>>> unreferenced object 0xffff8881411f9000 (size 1024):
>>>    comm "softirq", pid 0, jiffies 4306708887 (age 51.459s)
>>>    hex dump (first 32 bytes):
>>>      00 10 1c 26 81 88 ff ff 2c 96 05 00 00 00 00 00 ...&....,.......
>>>      ac fe 0f 00 00 00 00 00 a6 fe 0f 00 00 00 00 00 ................
>>>    backtrace:
>>>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>>
>>> unreferenced object 0xffff88812be26400 (size 1024):
>>>    comm "softirq", pid 0, jiffies 4306710027 (age 50.319s)
>>>    hex dump (first 32 bytes):
>>>      00 c0 2e 39 81 88 ff ff 32 ab 05 00 00 00 00 00 ...9....2.......
>>>      e3 ac 05 00 00 00 00 00 1f b6 05 00 00 00 00 00 ................
>>>    backtrace:
>>>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>>
>>> unreferenced object 0xffff8881261c1000 (size 1024):
>>>    comm "softirq", pid 0, jiffies 4306711547 (age 48.799s)
>>>    hex dump (first 32 bytes):
>>>      00 64 e2 2b 81 88 ff ff c0 7c 05 00 00 00 00 00 .d.+.....|......
>>>      87 a5 05 00 00 00 00 00 0e 9a 05 00 00 00 00 00 ................
>>>    backtrace:
>>>      [<ffffffff819f5f08>] __kmem_cache_alloc_node+0x1e8/0x320
>>>      [<ffffffff818a239a>] kmalloc_trace+0x2a/0x60
>>>      [<ffffffff8231d31e>] free_iova_fast+0x28e/0x4e0
>>>      [<ffffffff82310860>] fq_ring_free_locked+0x1b0/0x310
>>>      [<ffffffff8231225d>] fq_flush_timeout+0x19d/0x2e0
>>>      [<ffffffff813e95ba>] call_timer_fn+0x19a/0x5c0
>>>      [<ffffffff813ea16b>] __run_timers+0x78b/0xb80
>>>      [<ffffffff813ea5bd>] run_timer_softirq+0x5d/0xd0
>>>      [<ffffffff82f1d915>] __do_softirq+0x205/0x8b5
>>
>

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-09  6:23       ` Ethan Zhao
@ 2024-01-09 11:26         ` Robin Murphy
  2024-01-10  0:52           ` Ethan Zhao
  0 siblings, 1 reply; 25+ messages in thread
From: Robin Murphy @ 2024-01-09 11:26 UTC (permalink / raw)
  To: Ethan Zhao, Ido Schimmel
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

On 2024-01-09 6:23 am, Ethan Zhao wrote:
> 
> On 1/9/2024 1:54 PM, Ethan Zhao wrote:
>>
>> On 1/9/2024 1:35 AM, Robin Murphy wrote:
>>> On 2023-12-28 12:23 pm, Ido Schimmel wrote:
>>>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>>>> v2: 
>>>>> https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
>>>>>
>>>>> Hi all,
>>>>>
>>>>> I hope this is good to go now, just fixed the locking (and threw
>>>>> lockdep at it to confirm, which of course I should have done to begin
>>>>> with...) and picked up tags.
>>>>
>>>> Hi,
>>>>
>>>> After pulling the v6.7 changes we started seeing the following memory
>>>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce it,
>>>> which is why I didn't perform bisection. However, looking at the
>>>> mentioned code paths, they seem to have been changed in v6.7 as part of
>>>> this patchset. I reverted both patches and didn't see any memory leaks
>>>> when running a full regression (~10 hours), but I will repeat it to be
>>>> sure.
>>>>
>>>> Any idea what could be the problem?
>>>
>>> Hmm, we've got what looks to be a set of magazines forming a 
>>> plausible depot list (or at least the tail end of one):
>>>
>>> ffff8881411f9000 -> ffff8881261c1000
>>>
>>> ffff8881261c1000 -> ffff88812be26400
>>>
>>> ffff88812be26400 -> ffff8188392ec000
>>>
>>> ffff8188392ec000 -> ffff8881a5301000
>>>
>>> ffff8881a5301000 -> NULL
>>>
>>> which I guess has somehow become detached from its rcache->depot 
>>> without being freed properly? However I'm struggling to see any 
>>> conceivable way that could happen which wouldn't already be more 
>>> severely broken in other ways as well (i.e. either general memory 
>>> corruption or someone somehow still trying to use the IOVA domain 
>>> while it's being torn down).
>>>
>>> Out of curiosity, does reverting just patch #2 alone make a 
>>> difference? And is your workload doing anything "interesting" in 
>>> relation to IOVA domain lifetimes, like creating and destroying 
>>> SR-IOV virtual functions, changing IOMMU domain types via sysfs, or 
>>> using that horrible vdpa thing, or are you seeing this purely from 
>>> regular driver DMA API usage?
>>
>> There no lock held when free_iova_rcaches(), is it possible 
>> free_iova_rcaches() race with the delayed cancel_delayed_work_sync() ?
>>
>> I don't know why not call cancel_delayed_work_sync(&rcache->work); 
>> first in free_iova_rcaches() to avoid possible race.
>>
> Between the following function pairs, is a race possible if they are
> called concurrently?
> 
> 1. free_iova_rcaches() with iova_depot_work_func()
> 
>     free_iova_rcaches() holds no lock, iova_depot_work_func() holds 
> rcache->lock.

Unless I've completely misunderstood the workqueue API, that can't 
happen, since free_iova_rcaches() *does* synchronously cancel the work 
before it starts freeing the depot list.
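
For reference, the relevant teardown looks roughly like this - paraphrased
from memory rather than copied verbatim from iova.c, so treat it as a
sketch of the ordering only:

static void free_iova_rcaches(struct iova_domain *iovad)
{
        struct iova_rcache *rcache;
        struct iova_cpu_rcache *cpu_rcache;
        unsigned int cpu;
        int i;

        for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
                rcache = &iovad->rcaches[i];
                if (!rcache->cpu_rcaches)
                        break;
                /* Free each CPU's loaded/prev magazines... */
                for_each_possible_cpu(cpu) {
                        cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
                        iova_magazine_free(cpu_rcache->loaded);
                        iova_magazine_free(cpu_rcache->prev);
                }
                free_percpu(rcache->cpu_rcaches);
                /* ...then make sure the work can no longer run... */
                cancel_delayed_work_sync(&rcache->work);
                /* ...and only then walk and free the depot list. */
                while (rcache->depot)
                        iova_magazine_free(iova_depot_pop(rcache));
        }

        kfree(iovad->rcaches);
        iovad->rcaches = NULL;
}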

> 2. iova_cpuhp_dead() with iova_depot_work_func()
> 
>    iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock, 
> iova_depot_work_func() holds rcache->lock.

That's not a race because those are touching completely different things 
- the closest they come to interacting is where they both free IOVAs 
back to the rbtree.
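
To illustrate - again paraphrasing rather than quoting iova.c verbatim -
iova_cpuhp_dead() only drains that CPU's loaded/prev magazines via
free_cpu_cached_iovas(), and never looks at the depot:

static int iova_cpuhp_dead(unsigned int cpu, struct hlist_node *node)
{
        struct iova_domain *iovad;

        iovad = hlist_entry_safe(node, struct iova_domain, cpuhp_dead);

        /* Empties only this CPU's loaded/prev magazines, under cpu_rcache->lock */
        free_cpu_cached_iovas(cpu, iovad);
        return 0;
}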

> 3. iova_cpuhp_dead() with free_iova_rcaches()
> 
>     iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock, 
> free_iova_rcaches() holds no lock.

See iova_domain_free_rcaches() - by the time we call 
free_iova_rcaches(), the hotplug handler has already been removed (and 
either way it couldn't account for *this* issue since it doesn't touch 
the depot at all).
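
Roughly (once more a paraphrase, not a verbatim copy):

void iova_domain_free_rcaches(struct iova_domain *iovad)
{
        /* Remove the hotplug callback first, so iova_cpuhp_dead() can't run... */
        cpuhp_state_remove_instance_nocalls(CPUHP_IOMMU_IOVA_DEAD,
                                            &iovad->cpuhp_dead);
        /* ...before any of the rcache teardown starts. */
        free_iova_rcaches(iovad);
}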

> 4. iova_cpuhp_dead() with free_global_cached_iovas()
> 
>     iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock and 
> free_global_cached_iovas() holds rcache->lock.

Again, they hold different locks because they're touching unrelated things.

Thanks,
Robin.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-08 17:35   ` Robin Murphy
  2024-01-09  5:54     ` Ethan Zhao
@ 2024-01-09 17:21     ` Ido Schimmel
  2024-01-10 12:48       ` Robin Murphy
  1 sibling, 1 reply; 25+ messages in thread
From: Ido Schimmel @ 2024-01-09 17:21 UTC (permalink / raw)
  To: Robin Murphy
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel

Hi Robin,

Thanks for the reply.

On Mon, Jan 08, 2024 at 05:35:26PM +0000, Robin Murphy wrote:
> Hmm, we've got what looks to be a set of magazines forming a plausible depot
> list (or at least the tail end of one):
> 
> ffff8881411f9000 -> ffff8881261c1000
> 
> ffff8881261c1000 -> ffff88812be26400
> 
> ffff88812be26400 -> ffff8188392ec000
> 
> ffff8188392ec000 -> ffff8881a5301000
> 
> ffff8881a5301000 -> NULL
> 
> which I guess has somehow become detached from its rcache->depot without
> being freed properly? However I'm struggling to see any conceivable way that
> could happen which wouldn't already be more severely broken in other ways as
> well (i.e. either general memory corruption or someone somehow still trying
> to use the IOVA domain while it's being torn down).

The machine is running a debug kernel that among other things has KASAN
enabled, but there are no traces in the kernel log so there is no memory
corruption that I'm aware of.

> Out of curiosity, does reverting just patch #2 alone make a difference?

Will try and let you know.

> And is your workload doing anything "interesting" in relation to IOVA
> domain lifetimes, like creating and destroying SR-IOV virtual
> functions, changing IOMMU domain types via sysfs, or using that
> horrible vdpa thing, or are you seeing this purely from regular driver
> DMA API usage?

The machine is running networking related tests, but it is not using
SR-IOV, VMs or VDPA so there shouldn't be anything "interesting" as far
as IOMMU is concerned.

The two networking drivers on the machine are "igb" for the management
port and "mlxsw" for the data ports (the machine is a physical switch).
I believe the DMA API usage in the latter is quite basic and I don't
recall any DMA related problems with this driver since it was first
accepted upstream in 2015.

Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-09 11:26         ` Robin Murphy
@ 2024-01-10  0:52           ` Ethan Zhao
  0 siblings, 0 replies; 25+ messages in thread
From: Ethan Zhao @ 2024-01-10  0:52 UTC (permalink / raw)
  To: Robin Murphy, Ido Schimmel
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel


On 1/9/2024 7:26 PM, Robin Murphy wrote:
> On 2024-01-09 6:23 am, Ethan Zhao wrote:
>>
>> On 1/9/2024 1:54 PM, Ethan Zhao wrote:
>>>
>>> On 1/9/2024 1:35 AM, Robin Murphy wrote:
>>>> On 2023-12-28 12:23 pm, Ido Schimmel wrote:
>>>>> On Tue, Sep 12, 2023 at 05:28:04PM +0100, Robin Murphy wrote:
>>>>>> v2: 
>>>>>> https://lore.kernel.org/linux-iommu/cover.1692641204.git.robin.murphy@arm.com/
>>>>>>
>>>>>> Hi all,
>>>>>>
>>>>>> I hope this is good to go now, just fixed the locking (and threw
>>>>>> lockdep at it to confirm, which of course I should have done to 
>>>>>> begin
>>>>>> with...) and picked up tags.
>>>>>
>>>>> Hi,
>>>>>
>>>>> After pulling the v6.7 changes we started seeing the following memory
>>>>> leaks [1] of 'struct iova_magazine'. I'm not sure how to reproduce 
>>>>> it,
>>>>> which is why I didn't perform bisection. However, looking at the
>>>>> mentioned code paths, they seem to have been changed in v6.7 as 
>>>>> part of
>>>>> this patchset. I reverted both patches and didn't see any memory 
>>>>> leaks
>>>>> when running a full regression (~10 hours), but I will repeat it 
>>>>> to be
>>>>> sure.
>>>>>
>>>>> Any idea what could be the problem?
>>>>
>>>> Hmm, we've got what looks to be a set of magazines forming a 
>>>> plausible depot list (or at least the tail end of one):
>>>>
>>>> ffff8881411f9000 -> ffff8881261c1000
>>>>
>>>> ffff8881261c1000 -> ffff88812be26400
>>>>
>>>> ffff88812be26400 -> ffff8188392ec000
>>>>
>>>> ffff8188392ec000 -> ffff8881a5301000
>>>>
>>>> ffff8881a5301000 -> NULL
>>>>
>>>> which I guess has somehow become detached from its rcache->depot 
>>>> without being freed properly? However I'm struggling to see any 
>>>> conceivable way that could happen which wouldn't already be more 
>>>> severely broken in other ways as well (i.e. either general memory 
>>>> corruption or someone somehow still trying to use the IOVA domain 
>>>> while it's being torn down).
>>>>
>>>> Out of curiosity, does reverting just patch #2 alone make a 
>>>> difference? And is your workload doing anything "interesting" in 
>>>> relation to IOVA domain lifetimes, like creating and destroying 
>>>> SR-IOV virtual functions, changing IOMMU domain types via sysfs, or 
>>>> using that horrible vdpa thing, or are you seeing this purely from 
>>>> regular driver DMA API usage?
>>>
>>> There no lock held when free_iova_rcaches(), is it possible 
>>> free_iova_rcaches() race with the delayed cancel_delayed_work_sync() ?
>>>
>>> I don't know why not call cancel_delayed_work_sync(&rcache->work); 
>>> first in free_iova_rcaches() to avoid possible race.
>>>
>> Between the following function pairs, is a race possible if they are
>> called concurrently?
>>
>> 1. free_iova_rcaches() with iova_depot_work_func()
>>
>>     free_iova_rcaches() holds no lock, iova_depot_work_func() holds 
>> rcache->lock.
>
> Unless I've completely misunderstood the workqueue API, that can't 
> happen, since free_iova_rcaches() *does* synchronously cancel the work 
> before it starts freeing the depot list.

iova_depot_work_func() pops and frees magazines from the depot, while
free_iova_rcaches() frees the loaded and previous magazines before the
work is synchronously cancelled.

Different things, so okay here.

>
>> 2. iova_cpuhp_dead() with iova_depot_work_func()
>>
>>    iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock, 
>> iova_depot_work_func() holds rcache->lock.
>
> That's not a race because those are touching completely different 
> things - the closest they come to interacting is where they both free 
> IOVAs back to the rbtree.

iova_cpuhp_dead() frees cached PFNs with
iova_magazine_free_pfns(cpu_rcache->loaded, iovad);

iova_depot_work_func() frees magazines from the depot, and
iova_magazine_free_pfns() holds the rbtree lock.

Okay, different things.

>
>> 3. iova_cpuhp_dead() with free_iova_rcaches()
>>
>>     iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock, 
>> free_iova_rcaches() holds no lock.
>
> See iova_domain_free_rcaches() - by the time we call 
> free_iova_rcaches(), the hotplug handler has already been removed (and 
> either way it couldn't account for *this* issue since it doesn't touch 
> the depot at all).

Yes, the iova_cpuhp_dead() handler is removed before free_iova_rcaches()
is called.

>
>> 4. iova_cpuhp_dead() with free_global_cached_iovas()
>>
>>     iova_cpuhp_dead() holds per cpu lock cpu_rcache->lock and 
>> free_global_cached_iovas() holds rcache->lock.
>
> Again, they hold different locks because they're touching unrelated 
> things.

iova_cpuhp_dead() frees the PFNs in the loaded and previous magazines;
free_global_cached_iovas() frees the magazines in the depot.

Okay too.

What about free_global_cached_iovas() with iova_depot_work_func()? They
both hold rcache->lock.

So there is no race at all, perfect. That memory leak report is beyond
imagination, then.

kmemleak is not always right.


Thanks,

Ethan

>
> Thanks,
> Robin.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-09 17:21     ` Ido Schimmel
@ 2024-01-10 12:48       ` Robin Murphy
  2024-01-10 14:00         ` Ido Schimmel
  2024-01-10 17:58         ` Catalin Marinas
  0 siblings, 2 replies; 25+ messages in thread
From: Robin Murphy @ 2024-01-10 12:48 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel, Catalin Marinas

On 2024-01-09 5:21 pm, Ido Schimmel wrote:
> Hi Robin,
> 
> Thanks for the reply.
> 
> On Mon, Jan 08, 2024 at 05:35:26PM +0000, Robin Murphy wrote:
>> Hmm, we've got what looks to be a set of magazines forming a plausible depot
>> list (or at least the tail end of one):
>>
>> ffff8881411f9000 -> ffff8881261c1000
>>
>> ffff8881261c1000 -> ffff88812be26400
>>
>> ffff88812be26400 -> ffff8188392ec000
>>
>> ffff8188392ec000 -> ffff8881a5301000
>>
>> ffff8881a5301000 -> NULL
>>
>> which I guess has somehow become detached from its rcache->depot without
>> being freed properly? However I'm struggling to see any conceivable way that
>> could happen which wouldn't already be more severely broken in other ways as
>> well (i.e. either general memory corruption or someone somehow still trying
>> to use the IOVA domain while it's being torn down).
> 
> The machine is running a debug kernel that among other things has KASAN
> enabled, but there are no traces in the kernel log so there is no memory
> corruption that I'm aware of.
> 
>> Out of curiosity, does reverting just patch #2 alone make a difference?
> 
> Will try and let you know.
> 
>> And is your workload doing anything "interesting" in relation to IOVA
>> domain lifetimes, like creating and destroying SR-IOV virtual
>> functions, changing IOMMU domain types via sysfs, or using that
>> horrible vdpa thing, or are you seeing this purely from regular driver
>> DMA API usage?
> 
> The machine is running networking related tests, but it is not using
> SR-IOV, VMs or VDPA so there shouldn't be anything "interesting" as far
> as IOMMU is concerned.
> 
> The two networking drivers on the machine are "igb" for the management
> port and "mlxsw" for the data ports (the machine is a physical switch).
> I believe the DMA API usage in the latter is quite basic and I don't
> recall any DMA related problems with this driver since it was first
> accepted upstream in 2015.

Thanks for the clarifications, that seems to rule out all the most 
confusingly impossible scenarios, at least.

The best explanation I've managed to come up with is a false-positive 
race dependent on the order in which kmemleak scans the relevant 
objects. Say we have the list as depot -> A -> B -> C; the rcache object 
is scanned and sees the pointer to magazine A, but then A is popped 
*before* kmemleak scans it, such that when it is then scanned, its 
"next" pointer has already been wiped, thus kmemleak never observes any 
reference to B, so it appears that B and (transitively) C are "leaked". 
If that is the case, then I'd expect it should be reproducible with 
patch #1 alone (although patch #2 might make it slightly more likely if 
the work ever does result in additional pops happening), but I'd expect 
the leaked objects to be transient and not persist forever through 
repeated scans (what I don't know is whether kmemleak automatically 
un-leaks an object if it subsequently finds a new reference, or if it 
needs manually clearing in between scans). I'm not sure if there's a 
nice way to make that any better... unless maybe it might make sense to 
call kmemleak_not_leak(mag->next) in iova_depot_pop() before that 
reference disappears?
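
Something like the below, say - an untested sketch only, and iova.c would
also need to gain an include of <linux/kmemleak.h>:

static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
{
        struct iova_magazine *mag = rcache->depot;

        /*
         * The only reference to mag->next moves into rcache->depot below,
         * and mag's own copy is then clobbered by the mag->size assignment
         * (they share a union), so tell kmemleak not to report it.
         */
        kmemleak_not_leak(mag->next);
        rcache->depot = mag->next;
        mag->size = IOVA_MAG_SIZE;
        rcache->depot_size--;

        return mag;
}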

Thanks,
Robin.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-10 12:48       ` Robin Murphy
@ 2024-01-10 14:00         ` Ido Schimmel
  2024-01-10 17:58         ` Catalin Marinas
  1 sibling, 0 replies; 25+ messages in thread
From: Ido Schimmel @ 2024-01-10 14:00 UTC (permalink / raw)
  To: Robin Murphy
  Cc: joro, will, iommu, linux-kernel, zhangzekun11, john.g.garry,
	dheerajkumar.srivastava, jsnitsel, Catalin Marinas

On Wed, Jan 10, 2024 at 12:48:06PM +0000, Robin Murphy wrote:
> On 2024-01-09 5:21 pm, Ido Schimmel wrote:
> > Hi Robin,
> > 
> > Thanks for the reply.
> > 
> > On Mon, Jan 08, 2024 at 05:35:26PM +0000, Robin Murphy wrote:
> > > Hmm, we've got what looks to be a set of magazines forming a plausible depot
> > > list (or at least the tail end of one):
> > > 
> > > ffff8881411f9000 -> ffff8881261c1000
> > > 
> > > ffff8881261c1000 -> ffff88812be26400
> > > 
> > > ffff88812be26400 -> ffff8188392ec000
> > > 
> > > ffff8188392ec000 -> ffff8881a5301000
> > > 
> > > ffff8881a5301000 -> NULL
> > > 
> > > which I guess has somehow become detached from its rcache->depot without
> > > being freed properly? However I'm struggling to see any conceivable way that
> > > could happen which wouldn't already be more severely broken in other ways as
> > > well (i.e. either general memory corruption or someone somehow still trying
> > > to use the IOVA domain while it's being torn down).
> > 
> > The machine is running a debug kernel that among other things has KASAN
> > enabled, but there are no traces in the kernel log so there is no memory
> > corruption that I'm aware of.
> > 
> > > Out of curiosity, does reverting just patch #2 alone make a difference?
> > 
> > Will try and let you know.

I can confirm that the issue reproduces when only patch #2 is reverted.
IOW, patch #1 seems to be the problem:

unreferenced object 0xffff8881a1ff3400 (size 1024):
  comm "softirq", pid 0, jiffies 4296362635 (age 3540.420s)
  hex dump (first 32 bytes):
    00 00 00 00 00 00 00 00 67 b7 05 00 00 00 00 00  ........g.......
    3f a6 05 00 00 00 00 00 93 99 05 00 00 00 00 00  ?...............
  backtrace:
    [<ffffffff819f7a68>] __kmem_cache_alloc_node+0x1e8/0x320
    [<ffffffff818a3efa>] kmalloc_trace+0x2a/0x60
    [<ffffffff8231f8f3>] free_iova_fast+0x293/0x460
    [<ffffffff823132f0>] fq_ring_free_locked+0x1b0/0x310
    [<ffffffff82314ced>] fq_flush_timeout+0x19d/0x2e0
    [<ffffffff813e97da>] call_timer_fn+0x19a/0x5c0
    [<ffffffff813ea38b>] __run_timers+0x78b/0xb80
    [<ffffffff813ea7dd>] run_timer_softirq+0x5d/0xd0
    [<ffffffff82f21605>] __do_softirq+0x205/0x8b5
unreferenced object 0xffff888165b9a800 (size 1024):
  comm "softirq", pid 0, jiffies 4299383627 (age 519.460s)
  hex dump (first 32 bytes):
    00 34 ff a1 81 88 ff ff bd 9d 05 00 00 00 00 00  .4..............
    f3 ab 05 00 00 00 00 00 37 b5 05 00 00 00 00 00  ........7.......
  backtrace:
    [<ffffffff819f7a68>] __kmem_cache_alloc_node+0x1e8/0x320
    [<ffffffff818a3efa>] kmalloc_trace+0x2a/0x60
    [<ffffffff8231f8f3>] free_iova_fast+0x293/0x460
    [<ffffffff823132f0>] fq_ring_free_locked+0x1b0/0x310
    [<ffffffff82314ced>] fq_flush_timeout+0x19d/0x2e0
    [<ffffffff813e97da>] call_timer_fn+0x19a/0x5c0
    [<ffffffff813ea38b>] __run_timers+0x78b/0xb80
    [<ffffffff813ea7dd>] run_timer_softirq+0x5d/0xd0
    [<ffffffff82f21605>] __do_softirq+0x205/0x8b5

> > 
> > > And is your workload doing anything "interesting" in relation to IOVA
> > > domain lifetimes, like creating and destroying SR-IOV virtual
> > > functions, changing IOMMU domain types via sysfs, or using that
> > > horrible vdpa thing, or are you seeing this purely from regular driver
> > > DMA API usage?
> > 
> > The machine is running networking related tests, but it is not using
> > SR-IOV, VMs or VDPA so there shouldn't be anything "interesting" as far
> > as IOMMU is concerned.
> > 
> > The two networking drivers on the machine are "igb" for the management
> > port and "mlxsw" for the data ports (the machine is a physical switch).
> > I believe the DMA API usage in the latter is quite basic and I don't
> > recall any DMA related problems with this driver since it was first
> > accepted upstream in 2015.
> 
> Thanks for the clarifications, that seems to rule out all the most
> confusingly impossible scenarios, at least.
> 
> The best explanation I've managed to come up with is a false-positive race
> dependent on the order in which kmemleak scans the relevant objects. Say we
> have the list as depot -> A -> B -> C; the rcache object is scanned and sees
> the pointer to magazine A, but then A is popped *before* kmemleak scans it,
> such that when it is then scanned, its "next" pointer has already been
> wiped, thus kmemleak never observes any reference to B, so it appears that B
> and (transitively) C are "leaked". If that is the case, then I'd expect it
> should be reproducible with patch #1 alone (although patch #2 might make it
> slightly more likely if the work ever does result in additional pops
> happening), but I'd expect the leaked objects to be transient and not
> persist forever through repeated scans (what I don't know is whether
> kmemleak automatically un-leaks an object if it subsequently finds a new
> reference, or if it needs manually clearing in between scans). I'm not sure
> if there's a nice way to make that any better... unless maybe it might make
> sense to call kmemleak_not_leak(mag->next) in iova_depot_pop() before that
> reference disappears?

I'm not familiar with the code so I can't comment if that's the best
solution, but I will say that we've been running kmemleak as part of our
regression for years and every time we got a report it was an actual
memory leak. Therefore, in order to keep the tool reliable, I think it's
better to annotate the code to suppress false-positives rather than
ignoring it.

Please let me know if you want me to test a fix.

Thanks for looking into this!

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-10 12:48       ` Robin Murphy
  2024-01-10 14:00         ` Ido Schimmel
@ 2024-01-10 17:58         ` Catalin Marinas
  2024-01-11  8:20           ` Ido Schimmel
  1 sibling, 1 reply; 25+ messages in thread
From: Catalin Marinas @ 2024-01-10 17:58 UTC (permalink / raw)
  To: Robin Murphy
  Cc: Ido Schimmel, joro, will, iommu, linux-kernel, zhangzekun11,
	john.g.garry, dheerajkumar.srivastava, jsnitsel

On Wed, Jan 10, 2024 at 12:48:06PM +0000, Robin Murphy wrote:
> On 2024-01-09 5:21 pm, Ido Schimmel wrote:
> > On Mon, Jan 08, 2024 at 05:35:26PM +0000, Robin Murphy wrote:
> > > Hmm, we've got what looks to be a set of magazines forming a plausible depot
> > > list (or at least the tail end of one):
> > > 
> > > ffff8881411f9000 -> ffff8881261c1000
> > > 
> > > ffff8881261c1000 -> ffff88812be26400
> > > 
> > > ffff88812be26400 -> ffff8188392ec000
> > > 
> > > ffff8188392ec000 -> ffff8881a5301000
> > > 
> > > ffff8881a5301000 -> NULL
> > > 
> > > which I guess has somehow become detached from its rcache->depot without
> > > being freed properly? However I'm struggling to see any conceivable way that
> > > could happen which wouldn't already be more severely broken in other ways as
> > > well (i.e. either general memory corruption or someone somehow still trying
> > > to use the IOVA domain while it's being torn down).
> > 
> > The machine is running a debug kernel that among other things has KASAN
> > enabled, but there are no traces in the kernel log so there is no memory
> > corruption that I'm aware of.
> > 
> > > Out of curiosity, does reverting just patch #2 alone make a difference?
> > 
> > Will try and let you know.
> > 
> > > And is your workload doing anything "interesting" in relation to IOVA
> > > domain lifetimes, like creating and destroying SR-IOV virtual
> > > functions, changing IOMMU domain types via sysfs, or using that
> > > horrible vdpa thing, or are you seeing this purely from regular driver
> > > DMA API usage?
> > 
> > The machine is running networking related tests, but it is not using
> > SR-IOV, VMs or VDPA so there shouldn't be anything "interesting" as far
> > as IOMMU is concerned.
> > 
> > The two networking drivers on the machine are "igb" for the management
> > port and "mlxsw" for the data ports (the machine is a physical switch).
> > I believe the DMA API usage in the latter is quite basic and I don't
> > recall any DMA related problems with this driver since it was first
> > accepted upstream in 2015.
> 
> Thanks for the clarifications, that seems to rule out all the most
> confusingly impossible scenarios, at least.
> 
> The best explanation I've managed to come up with is a false-positive race
> dependent on the order in which kmemleak scans the relevant objects. Say we
> have the list as depot -> A -> B -> C; the rcache object is scanned and sees
> the pointer to magazine A, but then A is popped *before* kmemleak scans it,
> such that when it is then scanned, its "next" pointer has already been
> wiped, thus kmemleak never observes any reference to B, so it appears that B
> and (transitively) C are "leaked".

Transient false positives are possible, especially as the code doesn't
use a double-linked list (for the latter, kmemleak does checksumming and
detects the prev/next change, defers the reporting until the object
becomes stable). That said, if a new scan is forced (echo scan >
/sys/kernel/debug/kmemleak), are the same objects still listed as leaks?
If yes, they may not be transient.

If it is indeed transient, I think a better fix than kmemleak_not_leak()
is to add a new API, something like kmemleak_mark_transient(), which
resets the checksum and skips reporting the object for one scan.

-- 
Catalin

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-10 17:58         ` Catalin Marinas
@ 2024-01-11  8:20           ` Ido Schimmel
  2024-01-11 10:13             ` Catalin Marinas
  0 siblings, 1 reply; 25+ messages in thread
From: Ido Schimmel @ 2024-01-11  8:20 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Robin Murphy, joro, will, iommu, linux-kernel, zhangzekun11,
	john.g.garry, dheerajkumar.srivastava, jsnitsel

On Wed, Jan 10, 2024 at 05:58:15PM +0000, Catalin Marinas wrote:
> Transient false positives are possible, especially as the code doesn't
> use a double-linked list (for the latter, kmemleak does checksumming and
> detects the prev/next change, defers the reporting until the object
> becomes stable). That said, if a new scan is forced (echo scan >
> /sys/kernel/debug/kmemleak), are the same objects still listed as leaks?
> If yes, they may not be transient.

We are doing "scan" and "clear" after each test. I will disable the
"clear" and see if the leaks persist.

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-11  8:20           ` Ido Schimmel
@ 2024-01-11 10:13             ` Catalin Marinas
  2024-01-12 15:31               ` Ido Schimmel
  0 siblings, 1 reply; 25+ messages in thread
From: Catalin Marinas @ 2024-01-11 10:13 UTC (permalink / raw)
  To: Ido Schimmel
  Cc: Robin Murphy, joro, will, iommu, linux-kernel, zhangzekun11,
	john.g.garry, dheerajkumar.srivastava, jsnitsel

On Thu, Jan 11, 2024 at 10:20:27AM +0200, Ido Schimmel wrote:
> On Wed, Jan 10, 2024 at 05:58:15PM +0000, Catalin Marinas wrote:
> > Transient false positives are possible, especially as the code doesn't
> > use a double-linked list (for the latter, kmemleak does checksumming and
> > detects the prev/next change, defers the reporting until the object
> > becomes stable). That said, if a new scan is forced (echo scan >
> > /sys/kernel/debug/kmemleak), are the same objects still listed as leaks?
> > If yes, they may not be transient.
> 
> We are doing "scan" and "clear" after each test. I will disable the
> "clear" and see if the leaks persist.

If it is indeed a false positive, you can try the patch below (I haven't
given it any run-time test, only compiled):

diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst
index 2cb00b53339f..7d784e03f3f9 100644
--- a/Documentation/dev-tools/kmemleak.rst
+++ b/Documentation/dev-tools/kmemleak.rst
@@ -161,6 +161,7 @@ See the include/linux/kmemleak.h header for the functions prototype.
 - ``kmemleak_free_percpu``	 - notify of a percpu memory block freeing
 - ``kmemleak_update_trace``	 - update object allocation stack trace
 - ``kmemleak_not_leak``	 - mark an object as not a leak
+- ``kmemleak_transient_leak``	 - mark an object as a transient leak
 - ``kmemleak_ignore``		 - do not scan or report an object as leak
 - ``kmemleak_scan_area``	 - add scan areas inside a memory block
 - ``kmemleak_no_scan``	 - do not scan a memory block
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d30e453d0fb4..c1d0775080ff 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -6,6 +6,7 @@
  */
 
 #include <linux/iova.h>
+#include <linux/kmemleak.h>
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
@@ -730,6 +731,11 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
 {
 	struct iova_magazine *mag = rcache->depot;
 
+	/*
+	 * As the mag->next pointer is moved to rcache->depot and reset via
+	 * the mag->size assignment, mark the transient false positive.
+	 */
+	kmemleak_transient_leak(mag->next);
 	rcache->depot = mag->next;
 	mag->size = IOVA_MAG_SIZE;
 	rcache->depot_size--;
diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
index 6a3cd1bf4680..93a73c076d16 100644
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -26,6 +26,7 @@ extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
 extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
 extern void kmemleak_update_trace(const void *ptr) __ref;
 extern void kmemleak_not_leak(const void *ptr) __ref;
+extern void kmemleak_transient_leak(const void *ptr) __ref;
 extern void kmemleak_ignore(const void *ptr) __ref;
 extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref;
 extern void kmemleak_no_scan(const void *ptr) __ref;
@@ -93,6 +94,9 @@ static inline void kmemleak_update_trace(const void *ptr)
 static inline void kmemleak_not_leak(const void *ptr)
 {
 }
+static inline void kmemleak_transient_leak(const void *ptr)
+{
+}
 static inline void kmemleak_ignore(const void *ptr)
 {
 }
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 5501363d6b31..9fd338063cea 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -915,6 +915,28 @@ static void make_black_object(unsigned long ptr, bool is_phys)
 	paint_ptr(ptr, KMEMLEAK_BLACK, is_phys);
 }
 
+/*
+ * Reset the checksum of an object. The immediate effect is that it will not
+ * be reported as a leak during the next scan until its checksum is updated.
+ */
+static void reset_checksum(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("Not resetting the checksum of an unknown object at 0x%08lx\n",
+			      ptr);
+		return;
+	}
+
+	raw_spin_lock_irqsave(&object->lock, flags);
+	object->checksum = 0;
+	raw_spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
 /*
  * Add a scanning area to the object. If at least one such area is added,
  * kmemleak will only scan these ranges rather than the whole memory block.
@@ -1194,6 +1216,23 @@ void __ref kmemleak_not_leak(const void *ptr)
 }
 EXPORT_SYMBOL(kmemleak_not_leak);
 
+/**
+ * kmemleak_transient_leak - mark an allocated object as transient false positive
+ * @ptr:	pointer to beginning of the object
+ *
+ * Calling this function on an object will cause the memory block to not be
+ * reported as a leak temporarily. This may happen, for example, if the object
+ * is part of a singly linked list and the ->next reference to it is changed.
+ */
+void __ref kmemleak_transient_leak(const void *ptr)
+{
+	pr_debug("%s(0x%px)\n", __func__, ptr);
+
+	if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+		reset_checksum((unsigned long)ptr);
+}
+EXPORT_SYMBOL(kmemleak_transient_leak);
+
 /**
  * kmemleak_ignore - ignore an allocated object
  * @ptr:	pointer to beginning of the object

-- 
Catalin

^ permalink raw reply related	[flat|nested] 25+ messages in thread

* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-11 10:13             ` Catalin Marinas
@ 2024-01-12 15:31               ` Ido Schimmel
  2024-01-15  7:17                 ` Ido Schimmel
  0 siblings, 1 reply; 25+ messages in thread
From: Ido Schimmel @ 2024-01-12 15:31 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Robin Murphy, joro, will, iommu, linux-kernel, zhangzekun11,
	john.g.garry, dheerajkumar.srivastava, jsnitsel

On Thu, Jan 11, 2024 at 10:13:01AM +0000, Catalin Marinas wrote:
> On Thu, Jan 11, 2024 at 10:20:27AM +0200, Ido Schimmel wrote:
> > On Wed, Jan 10, 2024 at 05:58:15PM +0000, Catalin Marinas wrote:
> > > Transient false positives are possible, especially as the code doesn't
> > > use a double-linked list (for the latter, kmemleak does checksumming and
> > > detects the prev/next change, defers the reporting until the object
> > > becomes stable). That said, if a new scan is forced (echo scan >
> > > /sys/kernel/debug/kmemleak), are the same objects still listed as leaks?
> > > If yes, they may not be transient.
> > 
> > We are doing "scan" and "clear" after each test. I will disable the
> > "clear" and see if the leaks persist.
> 
> If it is indeed a false positive

Looks like the leaks are transient. After removing the "clear" step the
leaks do not seem to persist.

> you can try the patch below (I haven't given it any run-time test,
> only compiled):

Will try and let you know next week.

Thanks

^ permalink raw reply	[flat|nested] 25+ messages in thread

* Re: [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible
  2024-01-12 15:31               ` Ido Schimmel
@ 2024-01-15  7:17                 ` Ido Schimmel
  0 siblings, 0 replies; 25+ messages in thread
From: Ido Schimmel @ 2024-01-15  7:17 UTC (permalink / raw)
  To: Catalin Marinas
  Cc: Robin Murphy, joro, will, iommu, linux-kernel, zhangzekun11,
	john.g.garry, dheerajkumar.srivastava, jsnitsel

On Fri, Jan 12, 2024 at 05:31:15PM +0200, Ido Schimmel wrote:
> On Thu, Jan 11, 2024 at 10:13:01AM +0000, Catalin Marinas wrote:
> > On Thu, Jan 11, 2024 at 10:20:27AM +0200, Ido Schimmel wrote:
> > > On Wed, Jan 10, 2024 at 05:58:15PM +0000, Catalin Marinas wrote:
> > > > Transient false positives are possible, especially as the code doesn't
> > > > use a double-linked list (for the latter, kmemleak does checksumming and
> > > > detects the prev/next change, defers the reporting until the object
> > > > becomes stable). That said, if a new scan is forced (echo scan >
> > > > /sys/kernel/debug/kmemleak), are the same objects still listed as leaks?
> > > > If yes, they may not be transient.
> > > 
> > > We are doing "scan" and "clear" after each test. I will disable the
> > > "clear" and see if the leaks persist.
> > 
> > If it is indeed a false positive
> 
> Looks like the leaks are transient. After removing the "clear" step the
> leaks do not seem to persist.
> 
> > you can try the patch below (I haven't given it any run-time test,
> > only compiled):
> 
> Will try and let you know next week.

Looks good. Feel free to add:

Tested-by: Ido Schimmel <idosch@nvidia.com>

Thanks!

^ permalink raw reply	[flat|nested] 25+ messages in thread

end of thread, other threads:[~2024-01-15  7:17 UTC | newest]

Thread overview: 25+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-09-12 16:28 [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
2023-09-12 16:28 ` [PATCH v3 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
2023-09-12 16:28 ` [PATCH v3 2/2] iommu/iova: Manage the depot list size Robin Murphy
2023-09-25 10:08 ` [PATCH v3 0/2] iommu/iova: Make the rcache depot properly flexible Joerg Roedel
2023-12-28 12:23 ` Ido Schimmel
2024-01-02  7:24   ` Ido Schimmel
2024-01-03  8:38     ` Joerg Roedel
2024-01-06  4:21     ` Ethan Zhao
2024-01-06  7:07       ` zhangzekun (A)
2024-01-06  7:33         ` Ethan Zhao
2024-01-06  4:03   ` Ethan Zhao
2024-01-08  3:13   ` Ethan Zhao
2024-01-08 17:35   ` Robin Murphy
2024-01-09  5:54     ` Ethan Zhao
2024-01-09  6:23       ` Ethan Zhao
2024-01-09 11:26         ` Robin Murphy
2024-01-10  0:52           ` Ethan Zhao
2024-01-09 17:21     ` Ido Schimmel
2024-01-10 12:48       ` Robin Murphy
2024-01-10 14:00         ` Ido Schimmel
2024-01-10 17:58         ` Catalin Marinas
2024-01-11  8:20           ` Ido Schimmel
2024-01-11 10:13             ` Catalin Marinas
2024-01-12 15:31               ` Ido Schimmel
2024-01-15  7:17                 ` Ido Schimmel
