* [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible
@ 2023-08-14 17:53 Robin Murphy
  2023-08-14 17:53 ` [PATCH 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
                   ` (3 more replies)
  0 siblings, 4 replies; 17+ messages in thread
From: Robin Murphy @ 2023-08-14 17:53 UTC (permalink / raw)
  To: joro; +Cc: will, iommu, linux-kernel, john.g.garry, zhangzekun11

Hi all,

Prompted by [1], which reminded me I started this a while ago, I've now
finished off my own attempt at sorting out the horrid lack of rcache
scalability. It's become quite clear that given the vast range of system
sizes and workloads there is no right size for a fixed depot array, so I
reckon we're better off not having one at all.

Note that the reclaim threshold and rate are chosen fairly arbitrarily -
it's enough of a challenge to get my 4-core dev board with spinning disk
and gigabit ethernet to push anything into a depot at all :)

Thanks,
Robin.

[1] https://lore.kernel.org/linux-iommu/20230811130246.42719-1-zhangzekun11@huawei.com


Robin Murphy (2):
  iommu/iova: Make the rcache depot scale better
  iommu/iova: Manage the depot list size

 drivers/iommu/iova.c | 94 ++++++++++++++++++++++++++++++--------------
 1 file changed, 65 insertions(+), 29 deletions(-)

-- 
2.39.2.101.g768bb238c484.dirty


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH 1/2] iommu/iova: Make the rcache depot scale better
  2023-08-14 17:53 [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
@ 2023-08-14 17:53 ` Robin Murphy
  2023-08-21  8:11   ` Srivastava, Dheeraj Kumar
  2023-08-21 12:02   ` John Garry
  2023-08-14 17:53 ` [PATCH 2/2] iommu/iova: Manage the depot list size Robin Murphy
                   ` (2 subsequent siblings)
  3 siblings, 2 replies; 17+ messages in thread
From: Robin Murphy @ 2023-08-14 17:53 UTC (permalink / raw)
  To: joro; +Cc: will, iommu, linux-kernel, john.g.garry, zhangzekun11

The algorithm in the original paper specifies the storage of full
magazines in the depot as an unbounded list rather than a fixed-size
array. It turns out to be pretty straightforward to do this in our
implementation with no significant loss of efficiency. This allows
the depot to scale up to the working set sizes of larger systems,
while also potentially saving some memory on smaller ones too.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
 1 file changed, 36 insertions(+), 29 deletions(-)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 10b964600948..d2de6fb0e9f4 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -625,10 +625,16 @@ EXPORT_SYMBOL_GPL(reserve_iova);
  * will be wasted.
  */
 #define IOVA_MAG_SIZE 127
-#define MAX_GLOBAL_MAGS 32	/* magazines per bin */
 
 struct iova_magazine {
-	unsigned long size;
+	/*
+	 * Only full magazines are inserted into the depot, so we can avoid
+	 * a separate list head and preserve maximum space-efficiency.
+	 */
+	union {
+		unsigned long size;
+		struct iova_magazine *next;
+	};
 	unsigned long pfns[IOVA_MAG_SIZE];
 };
 
@@ -640,8 +646,7 @@ struct iova_cpu_rcache {
 
 struct iova_rcache {
 	spinlock_t lock;
-	unsigned long depot_size;
-	struct iova_magazine *depot[MAX_GLOBAL_MAGS];
+	struct iova_magazine *depot;
 	struct iova_cpu_rcache __percpu *cpu_rcaches;
 };
 
@@ -717,6 +722,21 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
 	mag->pfns[mag->size++] = pfn;
 }
 
+static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
+{
+	struct iova_magazine *mag = rcache->depot;
+
+	rcache->depot = mag->next;
+	mag->size = IOVA_MAG_SIZE;
+	return mag;
+}
+
+static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *mag)
+{
+	mag->next = rcache->depot;
+	rcache->depot = mag;
+}
+
 int iova_domain_init_rcaches(struct iova_domain *iovad)
 {
 	unsigned int cpu;
@@ -734,7 +754,6 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
 
 		rcache = &iovad->rcaches[i];
 		spin_lock_init(&rcache->lock);
-		rcache->depot_size = 0;
 		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
 						     cache_line_size());
 		if (!rcache->cpu_rcaches) {
@@ -776,7 +795,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 				 struct iova_rcache *rcache,
 				 unsigned long iova_pfn)
 {
-	struct iova_magazine *mag_to_free = NULL;
 	struct iova_cpu_rcache *cpu_rcache;
 	bool can_insert = false;
 	unsigned long flags;
@@ -794,12 +812,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 
 		if (new_mag) {
 			spin_lock(&rcache->lock);
-			if (rcache->depot_size < MAX_GLOBAL_MAGS) {
-				rcache->depot[rcache->depot_size++] =
-						cpu_rcache->loaded;
-			} else {
-				mag_to_free = cpu_rcache->loaded;
-			}
+			iova_depot_push(rcache, cpu_rcache->loaded);
 			spin_unlock(&rcache->lock);
 
 			cpu_rcache->loaded = new_mag;
@@ -812,11 +825,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 
 	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
 
-	if (mag_to_free) {
-		iova_magazine_free_pfns(mag_to_free, iovad);
-		iova_magazine_free(mag_to_free);
-	}
-
 	return can_insert;
 }
 
@@ -854,9 +862,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
 		has_pfn = true;
 	} else {
 		spin_lock(&rcache->lock);
-		if (rcache->depot_size > 0) {
+		if (rcache->depot) {
 			iova_magazine_free(cpu_rcache->loaded);
-			cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
+			cpu_rcache->loaded = iova_depot_pop(rcache);
 			has_pfn = true;
 		}
 		spin_unlock(&rcache->lock);
@@ -894,10 +902,10 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 {
 	struct iova_rcache *rcache;
 	struct iova_cpu_rcache *cpu_rcache;
+	struct iova_magazine *mag;
 	unsigned int cpu;
-	int i, j;
 
-	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+	for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
 		rcache = &iovad->rcaches[i];
 		if (!rcache->cpu_rcaches)
 			break;
@@ -907,8 +915,8 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 			iova_magazine_free(cpu_rcache->prev);
 		}
 		free_percpu(rcache->cpu_rcaches);
-		for (j = 0; j < rcache->depot_size; ++j)
-			iova_magazine_free(rcache->depot[j]);
+		while ((mag = iova_depot_pop(rcache)))
+			iova_magazine_free(mag);
 	}
 
 	kfree(iovad->rcaches);
@@ -941,17 +949,16 @@ static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
 static void free_global_cached_iovas(struct iova_domain *iovad)
 {
 	struct iova_rcache *rcache;
+	struct iova_magazine *mag;
 	unsigned long flags;
-	int i, j;
 
-	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
+	for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
 		rcache = &iovad->rcaches[i];
 		spin_lock_irqsave(&rcache->lock, flags);
-		for (j = 0; j < rcache->depot_size; ++j) {
-			iova_magazine_free_pfns(rcache->depot[j], iovad);
-			iova_magazine_free(rcache->depot[j]);
+		while ((mag = iova_depot_pop(rcache))) {
+			iova_magazine_free_pfns(mag, iovad);
+			iova_magazine_free(mag);
 		}
-		rcache->depot_size = 0;
 		spin_unlock_irqrestore(&rcache->lock, flags);
 	}
 }
-- 
2.39.2.101.g768bb238c484.dirty


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* [PATCH 2/2] iommu/iova: Manage the depot list size
  2023-08-14 17:53 [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
  2023-08-14 17:53 ` [PATCH 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
@ 2023-08-14 17:53 ` Robin Murphy
  2023-08-15 14:11   ` zhangzekun (A)
  2023-08-15 10:24 ` [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible John Garry
  2023-08-17 16:39 ` Jerry Snitselaar
  3 siblings, 1 reply; 17+ messages in thread
From: Robin Murphy @ 2023-08-14 17:53 UTC (permalink / raw)
  To: joro; +Cc: will, iommu, linux-kernel, john.g.garry, zhangzekun11

Automatically scaling the depot up to suit the peak capacity of a
workload is all well and good, but it would be nice to have a way to
scale it back down again if the workload changes. To that end, add
automatic reclaim that will gradually free unused magazines if the
depot size remains above a reasonable threshold for long enough.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
 drivers/iommu/iova.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d2de6fb0e9f4..76a7d694708e 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -11,6 +11,7 @@
 #include <linux/smp.h>
 #include <linux/bitops.h>
 #include <linux/cpu.h>
+#include <linux/workqueue.h>
 
 /* The anchor node sits above the top of the usable address space */
 #define IOVA_ANCHOR	~0UL
@@ -626,6 +627,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
  */
 #define IOVA_MAG_SIZE 127
 
+#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
+
 struct iova_magazine {
 	/*
 	 * Only full magazines are inserted into the depot, so we can avoid
@@ -646,8 +649,11 @@ struct iova_cpu_rcache {
 
 struct iova_rcache {
 	spinlock_t lock;
+	unsigned int depot_size;
 	struct iova_magazine *depot;
 	struct iova_cpu_rcache __percpu *cpu_rcaches;
+	struct iova_domain *iovad;
+	struct delayed_work work;
 };
 
 static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
@@ -728,6 +734,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
 
 	rcache->depot = mag->next;
 	mag->size = IOVA_MAG_SIZE;
+	rcache->depot_size--;
 	return mag;
 }
 
@@ -735,6 +742,24 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma
 {
 	mag->next = rcache->depot;
 	rcache->depot = mag;
+	rcache->depot_size++;
+}
+
+static void iova_depot_work_func(struct work_struct *work)
+{
+	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
+	struct iova_magazine *mag = NULL;
+
+	spin_lock(&rcache->lock);
+	if (rcache->depot_size > num_online_cpus())
+		mag = iova_depot_pop(rcache);
+	spin_unlock(&rcache->lock);
+
+	if (mag) {
+		iova_magazine_free_pfns(mag, rcache->iovad);
+		iova_magazine_free(mag);
+		schedule_delayed_work(&rcache->work, msecs_to_jiffies(IOVA_DEPOT_DELAY));
+	}
 }
 
 int iova_domain_init_rcaches(struct iova_domain *iovad)
@@ -754,6 +779,8 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
 
 		rcache = &iovad->rcaches[i];
 		spin_lock_init(&rcache->lock);
+		rcache->iovad = iovad;
+		INIT_DELAYED_WORK(&rcache->work, iova_depot_work_func);
 		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
 						     cache_line_size());
 		if (!rcache->cpu_rcaches) {
@@ -814,6 +841,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
 			spin_lock(&rcache->lock);
 			iova_depot_push(rcache, cpu_rcache->loaded);
 			spin_unlock(&rcache->lock);
+			schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);
 
 			cpu_rcache->loaded = new_mag;
 			can_insert = true;
@@ -915,6 +943,7 @@ static void free_iova_rcaches(struct iova_domain *iovad)
 			iova_magazine_free(cpu_rcache->prev);
 		}
 		free_percpu(rcache->cpu_rcaches);
+		cancel_delayed_work_sync(&rcache->work);
 		while ((mag = iova_depot_pop(rcache)))
 			iova_magazine_free(mag);
 	}
-- 
2.39.2.101.g768bb238c484.dirty


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-08-14 17:53 [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
  2023-08-14 17:53 ` [PATCH 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
  2023-08-14 17:53 ` [PATCH 2/2] iommu/iova: Manage the depot list size Robin Murphy
@ 2023-08-15 10:24 ` John Garry
  2023-08-15 11:11   ` Robin Murphy
  2023-08-17 16:39 ` Jerry Snitselaar
  3 siblings, 1 reply; 17+ messages in thread
From: John Garry @ 2023-08-15 10:24 UTC (permalink / raw)
  To: Robin Murphy, joro; +Cc: will, iommu, linux-kernel, zhangzekun11

On 14/08/2023 18:53, Robin Murphy wrote:
> Hi all,
> 

Hi Robin,

> Prompted by [1], which reminded me I started this a while ago, I've now
> finished off my own attempt at sorting out the horrid lack of rcache
> scalability. It's become quite clear that given the vast range of system
> sizes and workloads there is no right size for a fixed depot array, so I
> reckon we're better off not having one at all.
> 
> Note that the reclaim threshold and rate are chosen fairly arbitrarily -

This threshold is the number of online CPUs, right?

> it's enough of a challenge to get my 4-core dev board with spinning disk
> and gigabit ethernet to push anything into a depot at all :)
> 

I have to admit that I was hoping to also see a more aggressive reclaim 
strategy, where we also trim the per-CPU rcaches when not in use. 
Leizhen proposed something like this a long time ago.

Thanks,
John

> Thanks,
> Robin.
> 
> [1] https://lore.kernel.org/linux-iommu/20230811130246.42719-1-zhangzekun11@huawei.com
> 
> 
> Robin Murphy (2):
>    iommu/iova: Make the rcache depot scale better
>    iommu/iova: Manage the depot list size
> 
>   drivers/iommu/iova.c | 94 ++++++++++++++++++++++++++++++--------------
>   1 file changed, 65 insertions(+), 29 deletions(-)
> 


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-08-15 10:24 ` [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible John Garry
@ 2023-08-15 11:11   ` Robin Murphy
  2023-08-15 13:35     ` John Garry
  0 siblings, 1 reply; 17+ messages in thread
From: Robin Murphy @ 2023-08-15 11:11 UTC (permalink / raw)
  To: John Garry, joro; +Cc: will, iommu, linux-kernel, zhangzekun11

On 15/08/2023 11:24 am, John Garry wrote:
> On 14/08/2023 18:53, Robin Murphy wrote:
>> Hi all,
>>
> 
> Hi Robin,
> 
>> Prompted by [1], which reminded me I started this a while ago, I've now
>> finished off my own attempt at sorting out the horrid lack of rcache
>> scalability. It's become quite clear that given the vast range of system
>> sizes and workloads there is no right size for a fixed depot array, so I
>> reckon we're better off not having one at all.
>>
>> Note that the reclaim threshold and rate are chosen fairly arbitrarily -
> 
> This threshold is the number of online CPUs, right?

Yes, that's nominally half of the current fixed size (based on all the 
performance figures from the original series seemingly coming from a 
16-thread machine), but seemed like a fair compromise. I am of course 
keen to see how real-world testing actually pans out.
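
(Concretely, assuming num_online_cpus() == 16 on such a machine, the 
reclaim threshold works out to 16 magazines per rcache, i.e. half of 
the old fixed MAX_GLOBAL_MAGS of 32.)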

>> it's enough of a challenge to get my 4-core dev board with spinning disk
>> and gigabit ethernet to push anything into a depot at all :)
>>
> 
> I have to admit that I was hoping to also see a more aggressive reclaim 
> strategy, where we also trim the per-CPU rcaches when not in use. 
> Leizhen proposed something like this a long time ago.

Don't think I haven't been having various elaborate ideas for making it 
cleverer with multiple thresholds and self-tuning, however I have 
managed to restrain myself ;)

At this point I'm just looking to confirm whether the fundamental 
concepts are sound, and at least no worse than the current behaviour 
(hence keeping it split into 2 distinct patches for the sake of review 
and debugging). If it proves solid then we can absolutely come back and 
go to town on enhancements later.

Cheers,
Robin.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-08-15 11:11   ` Robin Murphy
@ 2023-08-15 13:35     ` John Garry
  2023-08-16 15:10       ` Robin Murphy
  0 siblings, 1 reply; 17+ messages in thread
From: John Garry @ 2023-08-15 13:35 UTC (permalink / raw)
  To: Robin Murphy, joro; +Cc: will, iommu, linux-kernel, zhangzekun11

On 15/08/2023 12:11, Robin Murphy wrote:
>>
>> This threshold is the number of online CPUs, right?
> 
> Yes, that's nominally half of the current fixed size (based on all the 
> performance figures from the original series seemingly coming from a 
> 16-thread machine),

If you are talking about 
https://lore.kernel.org/linux-iommu/20230811130246.42719-1-zhangzekun11@huawei.com/, 
then I think it's a 256-CPU system and the DMA controller has 16 HW 
queues. The 16 HW queues are relevant as the per-completion queue 
interrupt handler runs on a fixed CPU from the set of 16 CPUs in the HW 
queue interrupt handler affinity mask. And what this means is that while 
any CPU may alloc an IOVA, only those 16 CPUs handling each HW queue's 
interrupt will be freeing IOVAs.

> but seemed like a fair compromise. I am of course 
> keen to see how real-world testing actually pans out.
> 
>>> it's enough of a challenge to get my 4-core dev board with spinning disk
>>> and gigabit ethernet to push anything into a depot at all 😄
>>>
>>
>> I have to admit that I was hoping to also see a more aggressive 
>> reclaim strategy, where we also trim the per-CPU rcaches when not in 
>> use. Leizhen proposed something like this a long time ago.
> 
> Don't think I haven't been having various elaborate ideas for making it 
> cleverer with multiple thresholds and self-tuning, however I have 
> managed to restrain myself 😉
> 

OK, understood. My main issue WRT scalability is that the total number 
of cacheable IOVAs (CPU and depot rcache) scales up with the number of 
CPUs, but many DMA controllers have a fixed number of max in-flight 
requests.

Consider a SCSI storage controller on a 256-CPU system. The in-flight 
limit for this example controller is 4096, which would typically never 
even be used up, or may not even be usable.

For this device, we need 4096 * 6 [IOVA rcache range] = ~24K cached 
IOVAs if we were to pre-allocate them all - obviously I am ignoring that 
we have the per-CPU rcache for speed and it would not make sense to 
share one set. However, according to the current IOVA driver, we can in 
theory cache up to ((256 [CPUs] * 2 [loaded + prev]) + 32 [depot size]) * 
6 [rcache range] * 128 (IOVA per mag) = ~420K IOVAs. That's ~17x what we 
would ever need.
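
For anyone wanting to sanity-check that arithmetic, here is a quick 
standalone sketch (plain userspace C, using the example figures above; 
128 is the round per-magazine figure, the real IOVA_MAG_SIZE being 127):

#include <stdio.h>

int main(void)
{
	unsigned long inflight = 4096;     /* example controller queue depth */
	unsigned long ranges = 6;          /* IOVA rcache range */
	unsigned long cpus = 256;          /* example CPU count */
	unsigned long mags_per_cpu = 2;    /* loaded + prev */
	unsigned long depot = 32;          /* old fixed MAX_GLOBAL_MAGS */
	unsigned long per_mag = 128;       /* IOVAs per magazine */
	unsigned long needed = inflight * ranges;
	unsigned long cacheable = (cpus * mags_per_cpu + depot) * ranges * per_mag;

	printf("needed:    %lu (~24K)\n", needed);        /* 24576 */
	printf("cacheable: %lu (~420K)\n", cacheable);    /* 417792 */
	printf("ratio:     ~%lux\n", cacheable / needed); /* 17 */
	return 0;
}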

Something like NVMe is different, as its total requests can scale up 
with the CPU count, but only to a limit. I am not sure about network 
controllers.

Anyway, this is just something which I think should be considered - 
which I guess already has been.

> At this point I'm just looking to confirm whether the fundamental 
> concepts are sound, and at least no worse than the current behaviour 
> (hence keeping it split into 2 distinct patches for the sake of review 
> and debugging). If it proves solid then we can absolutely come back and 
> go to town on enhancements later.

Thanks,
John

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] iommu/iova: Manage the depot list size
  2023-08-14 17:53 ` [PATCH 2/2] iommu/iova: Manage the depot list size Robin Murphy
@ 2023-08-15 14:11   ` zhangzekun (A)
  2023-08-16  4:25     ` Jerry Snitselaar
  2023-08-16 16:52     ` Robin Murphy
  0 siblings, 2 replies; 17+ messages in thread
From: zhangzekun (A) @ 2023-08-15 14:11 UTC (permalink / raw)
  To: Robin Murphy; +Cc: will, iommu, linux-kernel, john.g.garry, joro



On 2023/8/15 1:53, Robin Murphy wrote:
> Automatically scaling the depot up to suit the peak capacity of a
> workload is all well and good, but it would be nice to have a way to
> scale it back down again if the workload changes. To that end, add
> automatic reclaim that will gradually free unused magazines if the
> depot size remains above a reasonable threshold for long enough.
>
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>   drivers/iommu/iova.c | 29 +++++++++++++++++++++++++++++
>   1 file changed, 29 insertions(+)
>
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index d2de6fb0e9f4..76a7d694708e 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -11,6 +11,7 @@
>   #include <linux/smp.h>
>   #include <linux/bitops.h>
>   #include <linux/cpu.h>
> +#include <linux/workqueue.h>
>   
>   /* The anchor node sits above the top of the usable address space */
>   #define IOVA_ANCHOR	~0UL
> @@ -626,6 +627,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>    */
>   #define IOVA_MAG_SIZE 127
>   
> +#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
> +
>   struct iova_magazine {
>   	/*
>   	 * Only full magazines are inserted into the depot, so we can avoid
> @@ -646,8 +649,11 @@ struct iova_cpu_rcache {
>   
>   struct iova_rcache {
>   	spinlock_t lock;
> +	unsigned int depot_size;
>   	struct iova_magazine *depot;
>   	struct iova_cpu_rcache __percpu *cpu_rcaches;
> +	struct iova_domain *iovad;
> +	struct delayed_work work;
>   };
>   
>   static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
> @@ -728,6 +734,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
>   
>   	rcache->depot = mag->next;
>   	mag->size = IOVA_MAG_SIZE;
> +	rcache->depot_size--;
>   	return mag;
>   }
>   
> @@ -735,6 +742,24 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma
>   {
>   	mag->next = rcache->depot;
>   	rcache->depot = mag;
> +	rcache->depot_size++;
> +}
> +
> +static void iova_depot_work_func(struct work_struct *work)
> +{
> +	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
> +	struct iova_magazine *mag = NULL;
> +
> +	spin_lock(&rcache->lock);
> +	if (rcache->depot_size > num_online_cpus())
> +		mag = iova_depot_pop(rcache);
> +	spin_unlock(&rcache->lock);
> +
> +	if (mag) {
> +		iova_magazine_free_pfns(mag, rcache->iovad);
> +		iova_magazine_free(mag);
> +		schedule_delayed_work(&rcache->work, msecs_to_jiffies(IOVA_DEPOT_DELAY));
Hi, Robin,

I am a little confused why IOVA_DEPOT_DELAY needs to be calculated twice 
in iova_depot_work_func(), as it already equals 
"msecs_to_jiffies(100)". Besides, do we really need to invoke a 
delayed_work in iova_depot_work_func()? Each time we put an IOVA 
magazine into the depot, a delayed_work will be invoked which is 
responsible for freeing an IOVA magazine from the depot if the depot 
size is greater than num_online_cpus().

Thanks,
Zekun

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] iommu/iova: Manage the depot list size
  2023-08-15 14:11   ` zhangzekun (A)
@ 2023-08-16  4:25     ` Jerry Snitselaar
  2023-08-16 16:52     ` Robin Murphy
  1 sibling, 0 replies; 17+ messages in thread
From: Jerry Snitselaar @ 2023-08-16  4:25 UTC (permalink / raw)
  To: zhangzekun (A)
  Cc: Robin Murphy, will, iommu, linux-kernel, john.g.garry, joro

On Tue, Aug 15, 2023 at 10:11:51PM +0800, zhangzekun (A) wrote:
> 
> 
> On 2023/8/15 1:53, Robin Murphy wrote:
> > Automatically scaling the depot up to suit the peak capacity of a
> > workload is all well and good, but it would be nice to have a way to
> > scale it back down again if the workload changes. To that end, add
> > automatic reclaim that will gradually free unused magazines if the
> > depot size remains above a reasonable threshold for long enough.
> > 
> > Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> > ---
> >   drivers/iommu/iova.c | 29 +++++++++++++++++++++++++++++
> >   1 file changed, 29 insertions(+)
> > 
> > diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> > index d2de6fb0e9f4..76a7d694708e 100644
> > --- a/drivers/iommu/iova.c
> > +++ b/drivers/iommu/iova.c
> > @@ -11,6 +11,7 @@
> >   #include <linux/smp.h>
> >   #include <linux/bitops.h>
> >   #include <linux/cpu.h>
> > +#include <linux/workqueue.h>
> >   /* The anchor node sits above the top of the usable address space */
> >   #define IOVA_ANCHOR	~0UL
> > @@ -626,6 +627,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
> >    */
> >   #define IOVA_MAG_SIZE 127
> > +#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
> > +
> >   struct iova_magazine {
> >   	/*
> >   	 * Only full magazines are inserted into the depot, so we can avoid
> > @@ -646,8 +649,11 @@ struct iova_cpu_rcache {
> >   struct iova_rcache {
> >   	spinlock_t lock;
> > +	unsigned int depot_size;
> >   	struct iova_magazine *depot;
> >   	struct iova_cpu_rcache __percpu *cpu_rcaches;
> > +	struct iova_domain *iovad;
> > +	struct delayed_work work;
> >   };
> >   static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
> > @@ -728,6 +734,7 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
> >   	rcache->depot = mag->next;
> >   	mag->size = IOVA_MAG_SIZE;
> > +	rcache->depot_size--;
> >   	return mag;
> >   }
> > @@ -735,6 +742,24 @@ static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *ma
> >   {
> >   	mag->next = rcache->depot;
> >   	rcache->depot = mag;
> > +	rcache->depot_size++;
> > +}
> > +
> > +static void iova_depot_work_func(struct work_struct *work)
> > +{
> > +	struct iova_rcache *rcache = container_of(work, typeof(*rcache), work.work);
> > +	struct iova_magazine *mag = NULL;
> > +
> > +	spin_lock(&rcache->lock);
> > +	if (rcache->depot_size > num_online_cpus())
> > +		mag = iova_depot_pop(rcache);
> > +	spin_unlock(&rcache->lock);
> > +
> > +	if (mag) {
> > +		iova_magazine_free_pfns(mag, rcache->iovad);
> > +		iova_magazine_free(mag);
> > +		schedule_delayed_work(&rcache->work, msecs_to_jiffies(IOVA_DEPOT_DELAY));
> Hi, Robin,
> 
> I am a little confused why IOVA_DEPOT_DELAY needs to be calculated twice in
> iova_depot_work_func(), as it already equals "msecs_to_jiffies(100)".

I think it was a typo, and is meant to be IOVA_DEPOT_DELAY like it is in
__iova_rcache_insert.
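
That is, the intended call here (matching the earlier one in
__iova_rcache_insert()) would presumably just be:

	schedule_delayed_work(&rcache->work, IOVA_DEPOT_DELAY);

since IOVA_DEPOT_DELAY is already expressed in jiffies, and running it
through msecs_to_jiffies() a second time would misinterpret that jiffies
value as a number of milliseconds.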

Regards,
Jerry

> Besides, do we really need to invoke a delayed_work in
> iova_depot_work_func()? Each time we put an IOVA magazine into the depot, a
> delayed_work will be invoked which is responsible for freeing an IOVA
> magazine from the depot if the depot size is greater than num_online_cpus().
> 
> Thanks,
> Zekun


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-08-15 13:35     ` John Garry
@ 2023-08-16 15:10       ` Robin Murphy
  2023-08-21 11:35         ` John Garry
  0 siblings, 1 reply; 17+ messages in thread
From: Robin Murphy @ 2023-08-16 15:10 UTC (permalink / raw)
  To: John Garry, joro; +Cc: will, iommu, linux-kernel, zhangzekun11

On 15/08/2023 2:35 pm, John Garry wrote:
> On 15/08/2023 12:11, Robin Murphy wrote:
>>>
>>> This threshold is the number of online CPUs, right?
>>
>> Yes, that's nominally half of the current fixed size (based on all the 
>> performance figures from the original series seemingly coming from a 
>> 16-thread machine, 
> 
> If you are talking about 
> https://lore.kernel.org/linux-iommu/20230811130246.42719-1-zhangzekun11@huawei.com/,

No, I mean the *original* rcache patch submission, and its associated paper:

https://lore.kernel.org/linux-iommu/cover.1461135861.git.mad@cs.technion.ac.il/

> then I think it's a 256-CPU system and the DMA controller has 16 HW
> queues. The 16 HW queues are relevant as the per-completion queue
> interrupt handler runs on a fixed CPU from the set of 16 CPUs in the HW
> queue interrupt handler affinity mask. And what this means is that while
> any CPU may alloc an IOVA, only those 16 CPUs handling each HW queue's
> interrupt will be freeing IOVAs.
> 
>> but seemed like a fair compromise. I am of course keen to see how 
>> real-world testing actually pans out.
>>
>>>> it's enough of a challenge to get my 4-core dev board with spinning 
>>>> disk
>>>> and gigabit ethernet to push anything into a depot at all 😄
>>>>
>>>
>>> I have to admit that I was hoping to also see a more aggressive 
>>> reclaim strategy, where we also trim the per-CPU rcaches when not in 
>>> use. Leizhen proposed something like this a long time ago.
>>
>> Don't think I haven't been having various elaborate ideas for making 
>> it cleverer with multiple thresholds and self-tuning, however I have 
>> managed to restrain myself 😉
>>
> 
> OK, understood. My main issue WRT scalability is that the total 
> cacheable IOVAs (CPU and depot rcache) scales up with the number of 
> CPUs, but many DMA controllers have a fixed number of max in-flight 
> requests.
> 
> Consider a SCSI storage controller on a 256-CPU system. The in-flight 
> limit for this example controller is 4096, which would typically never 
> be even used up or may not be even usable.
> 
> For this device, we need 4096 * 6 [IOVA rcache range] = ~24K cached 
> IOVAs if we were to pre-allocate them all - obviously I am ignoring that 
> we have the per-CPU rcache for speed and it would not make sense to 
> share one set. However, according to current IOVA driver, we can in 
> theory cache upto ((256 [CPUs] * 2 [loaded + prev]) + 32 [depot size]) * 
> 6 [rcache range] * 128 (IOVA per mag) = ~420K IOVAs. That's ~17x what we 
> would ever need.
> 
> Something like NVMe is different, as its total requests can scale up 
> with the CPU count, but only to a limit. I am not sure about network 
> controllers.

Remember that this threshold only represents a point at which we 
consider the cache to have grown "big enough" to start background 
reclaim - over the short term it is neither an upper nor a lower limit 
on the cache capacity itself. Indeed it will be larger than the working 
set of some workloads, but then it still wants to be enough of a buffer 
to be useful for others which do make big bursts of allocations only 
periodically.

> Anyway, this is just something which I think should be considered - 
> which I guess already has been.

Indeed, I would tend to assume that machines with hundreds of CPUs are 
less likely to be constrained on overall memory and/or IOVA space, so 
tuning for a more responsive cache should be more beneficial than any 
potential wastage is detrimental.

Cheers,
Robin.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 2/2] iommu/iova: Manage the depot list size
  2023-08-15 14:11   ` zhangzekun (A)
  2023-08-16  4:25     ` Jerry Snitselaar
@ 2023-08-16 16:52     ` Robin Murphy
  1 sibling, 0 replies; 17+ messages in thread
From: Robin Murphy @ 2023-08-16 16:52 UTC (permalink / raw)
  To: zhangzekun (A); +Cc: will, iommu, linux-kernel, john.g.garry, joro

On 15/08/2023 3:11 pm, zhangzekun (A) wrote:
> 
> 
> On 2023/8/15 1:53, Robin Murphy wrote:
>> Automatically scaling the depot up to suit the peak capacity of a
>> workload is all well and good, but it would be nice to have a way to
>> scale it back down again if the workload changes. To that end, add
>> automatic reclaim that will gradually free unused magazines if the
>> depot size remains above a reasonable threshold for long enough.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>> ---
>>   drivers/iommu/iova.c | 29 +++++++++++++++++++++++++++++
>>   1 file changed, 29 insertions(+)
>>
>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>> index d2de6fb0e9f4..76a7d694708e 100644
>> --- a/drivers/iommu/iova.c
>> +++ b/drivers/iommu/iova.c
>> @@ -11,6 +11,7 @@
>>   #include <linux/smp.h>
>>   #include <linux/bitops.h>
>>   #include <linux/cpu.h>
>> +#include <linux/workqueue.h>
>>   /* The anchor node sits above the top of the usable address space */
>>   #define IOVA_ANCHOR    ~0UL
>> @@ -626,6 +627,8 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>>    */
>>   #define IOVA_MAG_SIZE 127
>> +#define IOVA_DEPOT_DELAY msecs_to_jiffies(100)
>> +
>>   struct iova_magazine {
>>       /*
>>        * Only full magazines are inserted into the depot, so we can avoid
>> @@ -646,8 +649,11 @@ struct iova_cpu_rcache {
>>   struct iova_rcache {
>>       spinlock_t lock;
>> +    unsigned int depot_size;
>>       struct iova_magazine *depot;
>>       struct iova_cpu_rcache __percpu *cpu_rcaches;
>> +    struct iova_domain *iovad;
>> +    struct delayed_work work;
>>   };
>>   static struct iova_magazine *iova_magazine_alloc(gfp_t flags)
>> @@ -728,6 +734,7 @@ static struct iova_magazine *iova_depot_pop(struct 
>> iova_rcache *rcache)
>>       rcache->depot = mag->next;
>>       mag->size = IOVA_MAG_SIZE;
>> +    rcache->depot_size--;
>>       return mag;
>>   }
>> @@ -735,6 +742,24 @@ static void iova_depot_push(struct iova_rcache 
>> *rcache, struct iova_magazine *ma
>>   {
>>       mag->next = rcache->depot;
>>       rcache->depot = mag;
>> +    rcache->depot_size++;
>> +}
>> +
>> +static void iova_depot_work_func(struct work_struct *work)
>> +{
>> +    struct iova_rcache *rcache = container_of(work, typeof(*rcache), 
>> work.work);
>> +    struct iova_magazine *mag = NULL;
>> +
>> +    spin_lock(&rcache->lock);
>> +    if (rcache->depot_size > num_online_cpus())
>> +        mag = iova_depot_pop(rcache);
>> +    spin_unlock(&rcache->lock);
>> +
>> +    if (mag) {
>> +        iova_magazine_free_pfns(mag, rcache->iovad);
>> +        iova_magazine_free(mag);
>> +        schedule_delayed_work(&rcache->work, 
>> msecs_to_jiffies(IOVA_DEPOT_DELAY));
> Hi, Robin,
> 
> I am a little confused why IOVA_DEPOT_DELAY needs to be calculated twice 
> in iova_depot_work_func(), as it already equals 
> "msecs_to_jiffies(100)".

Oof, not sure how I managed to leave a mere 3-line refactoring 
half-finished... yeah, this msecs_to_jiffies() just shouldn't be here :)

> Besides, do we really need to invoke a 
> delayed_work in iova_depot_work_func()? Each time we put an IOVA 
> magazine into the depot, a delayed_work will be invoked which is 
> responsible for freeing an IOVA magazine from the depot if the depot 
> size is greater than num_online_cpus().

The idea is to free excess magazines one at a time at a relatively low 
rate, so as not to interfere too much with "bursty" workloads which 
might release a large number of IOVAs at once, but then want to 
reallocate them again relatively soon. I'm hoping that the overhead of 
scheduling the reclaim work unconditionally whenever the depot grows is 
sufficiently negligible to avoid having to check the threshold in 
multiple places, as that's the part which I anticipate might grow more 
complex in future. As far as I could see it should be pretty minimal if 
the work is already scheduled, which I'd expect to be the case most of 
the time while the depot is busy. The reason the work also reschedules 
itself is to handle the opposite situation, and make sure it can run to 
completion after the depot goes idle.
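
As a rough worked example of that rate, with the 100ms IOVA_DEPOT_DELAY 
from the patch: if a burst leaves a given depot, say, 32 magazines above 
the num_online_cpus() threshold, the work frees one magazine per run and 
reschedules itself, so the excess drains over roughly 32 * 100ms = ~3.2 
seconds of idle time rather than all at once.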

Thanks,
Robin.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-08-14 17:53 [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
                   ` (2 preceding siblings ...)
  2023-08-15 10:24 ` [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible John Garry
@ 2023-08-17 16:39 ` Jerry Snitselaar
  3 siblings, 0 replies; 17+ messages in thread
From: Jerry Snitselaar @ 2023-08-17 16:39 UTC (permalink / raw)
  To: Robin Murphy; +Cc: joro, will, iommu, linux-kernel, john.g.garry, zhangzekun11

On Mon, Aug 14, 2023 at 06:53:32PM +0100, Robin Murphy wrote:
> Hi all,
> 
> Prompted by [1], which reminded me I started this a while ago, I've now
> finished off my own attempt at sorting out the horrid lack of rcache
> scalability. It's become quite clear that given the vast range of system
> sizes and workloads there is no right size for a fixed depot array, so I
> reckon we're better off not having one at all.
> 
> Note that the reclaim threshold and rate are chosen fairly arbitrarily -
> it's enough of a challenge to get my 4-core dev board with spinning disk
> and gigabit ethernet to push anything into a depot at all :)
> 
> Thanks,
> Robin.
> 
> [1] https://lore.kernel.org/linux-iommu/20230811130246.42719-1-zhangzekun11@huawei.com
> 
> 
> Robin Murphy (2):
>   iommu/iova: Make the rcache depot scale better
>   iommu/iova: Manage the depot list size
> 
>  drivers/iommu/iova.c | 94 ++++++++++++++++++++++++++++++--------------
>  1 file changed, 65 insertions(+), 29 deletions(-)
> 
> -- 
> 2.39.2.101.g768bb238c484.dirty
> 

I'm trying to hunt down a system where we've seen some issues before,
but most of them have involved systems with nvme drives. Commit
3710e2b056cb ("nvme-pci: clamp max_hw_sectors based on DMA optimized
limitation") has helped those cases. I ran the patches overnight with
IOVA_DEPOT_DELAY fixed up on a couple of Genoa based systems (384
cores) without issue.

Regards,
Jerry


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/2] iommu/iova: Make the rcache depot scale better
  2023-08-14 17:53 ` [PATCH 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
@ 2023-08-21  8:11   ` Srivastava, Dheeraj Kumar
  2023-08-21  8:55     ` Robin Murphy
  2023-08-21 12:02   ` John Garry
  1 sibling, 1 reply; 17+ messages in thread
From: Srivastava, Dheeraj Kumar @ 2023-08-21  8:11 UTC (permalink / raw)
  To: Robin Murphy, joro; +Cc: will, iommu, linux-kernel, john.g.garry, zhangzekun11

Hello Robin,

On 8/14/2023 11:23 PM, Robin Murphy wrote:
> The algorithm in the original paper specifies the storage of full
> magazines in the depot as an unbounded list rather than a fixed-size
> array. It turns out to be pretty straightforward to do this in our
> implementation with no significant loss of efficiency. This allows
> the depot to scale up to the working set sizes of larger systems,
> while also potentially saving some memory on smaller ones too.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> ---
>   drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
>   1 file changed, 36 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index 10b964600948..d2de6fb0e9f4 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -625,10 +625,16 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>    * will be wasted.
>    */
>   #define IOVA_MAG_SIZE 127
> -#define MAX_GLOBAL_MAGS 32	/* magazines per bin */
>   
>   struct iova_magazine {
> -	unsigned long size;
> +	/*
> +	 * Only full magazines are inserted into the depot, so we can avoid
> +	 * a separate list head and preserve maximum space-efficiency.
> +	 */
> +	union {
> +		unsigned long size;
> +		struct iova_magazine *next;
> +	};
>   	unsigned long pfns[IOVA_MAG_SIZE];
>   };
>   
> @@ -640,8 +646,7 @@ struct iova_cpu_rcache {
>   
>   struct iova_rcache {
>   	spinlock_t lock;
> -	unsigned long depot_size;
> -	struct iova_magazine *depot[MAX_GLOBAL_MAGS];
> +	struct iova_magazine *depot;
>   	struct iova_cpu_rcache __percpu *cpu_rcaches;
>   };
>   
> @@ -717,6 +722,21 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
>   	mag->pfns[mag->size++] = pfn;
>   }
>   
> +static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
> +{
> +	struct iova_magazine *mag = rcache->depot;
> +
> +	rcache->depot = mag->next;

While doing a routine domain change test for a device ("unbind device from 
driver -> change domain of the device -> bind device back to the 
driver"), I ran into the following NULL pointer dereference.

[  599.020261] BUG: kernel NULL pointer dereference, address: 
0000000000000000
[  599.020986] #PF: supervisor read access in kernel mode
[  599.021588] #PF: error_code(0x0000) - not-present page
[  599.022180] PGD 0 P4D 0
[  599.022770] Oops: 0000 [#1] PREEMPT SMP NOPTI
[  599.023365] CPU: 68 PID: 3122 Comm: avocado Not tainted 
6.5.0-rc6-ChngDomainIssue #16
[  599.023970] Hardware name: Dell Inc. PowerEdge R6515/07PXPY, BIOS 
2.3.6 07/06/2021
[  599.024571] RIP: 0010:free_iova_rcaches+0x9c/0x110
[  599.025170] Code: d1 ff 39 05 36 d2 bc 01 48 89 c3 77 b4 49 8b 7f 10 
e8 b8 69 93 ff 49 8d 7f 20 e8 6f e4 6b ff eb 05 e8 48 ba 93 ff 49 8b 7f 
08 <48> 8b 07 49 89 47 08 48 c7 07 7f 00 00 00 41 83 6f 04 01 48 85 ff
[  599.026436] RSP: 0018:ffffb78b4c9f7c68 EFLAGS: 00010296
[  599.027075] RAX: ffffffff9fa8c100 RBX: 0000000000000080 RCX: 
0000000000000005
[  599.027719] RDX: 0000000000000000 RSI: 000000007fffffff RDI: 
0000000000000000
[  599.028359] RBP: ffffb78b4c9f7c98 R08: 0000000000000000 R09: 
0000000000000006
[  599.028995] R10: 0000000000000000 R11: 0000000000000000 R12: 
0000000000000000
[  599.029636] R13: ffff93910d9c6008 R14: ffffd78b3bde1000 R15: 
ffff9391144ebc00
[  599.030283] FS:  00007fa5c9e5c000(0000) GS:ffff93cf72700000(0000) 
knlGS:0000000000000000
[  599.030941] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  599.031588] CR2: 0000000000000000 CR3: 000000013f526006 CR4: 
0000000000770ee0
[  599.032237] PKRU: 55555554
[  599.032878] Call Trace:
[  599.033512]  <TASK>
[  599.034140]  ? show_regs+0x6e/0x80
[  599.034769]  ? __die+0x29/0x70
[  599.035393]  ? page_fault_oops+0x154/0x4a0
[  599.036021]  ? __x86_return_thunk+0x9/0x10
[  599.036647]  ? do_user_addr_fault+0x318/0x6b0
[  599.037258]  ? __x86_return_thunk+0x9/0x10
[  599.037866]  ? __slab_free+0xc7/0x320
[  599.038472]  ? exc_page_fault+0x7d/0x190
[  599.039074]  ? asm_exc_page_fault+0x2b/0x30
[  599.039683]  ? free_iova_rcaches+0x9c/0x110
[  599.040286]  ? free_iova_rcaches+0x91/0x110
[  599.040875]  ? __x86_return_thunk+0x9/0x10
[  599.041460]  put_iova_domain+0x32/0xa0
[  599.042041]  iommu_put_dma_cookie+0x177/0x1b0
[  599.042620]  iommu_domain_free+0x1f/0x50
[  599.043194]  iommu_setup_default_domain+0x2fb/0x420
[  599.043774]  iommu_group_store_type+0xb6/0x210
[  599.044362]  iommu_group_attr_store+0x21/0x40
[  599.044938]  sysfs_kf_write+0x42/0x50
[  599.045511]  kernfs_fop_write_iter+0x143/0x1d0
[  599.046084]  vfs_write+0x2c2/0x3f0
[  599.046653]  ksys_write+0x6b/0xf0
[  599.047219]  __x64_sys_write+0x1d/0x30
[  599.047782]  do_syscall_64+0x60/0x90
[  599.048346]  ? syscall_exit_to_user_mode+0x2a/0x50
[  599.048911]  ? __x64_sys_lseek+0x1c/0x30
[  599.049465]  ? __x86_return_thunk+0x9/0x10
[  599.050013]  ? do_syscall_64+0x6d/0x90
[  599.050562]  ? do_syscall_64+0x6d/0x90
[  599.051098]  ? do_syscall_64+0x6d/0x90
[  599.051625]  ? __x86_return_thunk+0x9/0x10
[  599.052149]  ? exc_page_fault+0x8e/0x190
[  599.052665]  entry_SYSCALL_64_after_hwframe+0x73/0xdd
[  599.053180] RIP: 0033:0x7fa5c9d14a6f
[  599.053670] Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 19 c0 f7 
ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 
05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 5c c0 f7 ff 48
[  599.054672] RSP: 002b:00007ffdeb05f540 EFLAGS: 00000293 ORIG_RAX: 
0000000000000001
[  599.055182] RAX: ffffffffffffffda RBX: 0000557de3275a80 RCX: 
00007fa5c9d14a6f
[  599.055692] RDX: 0000000000000003 RSI: 0000557de418ff60 RDI: 
0000000000000011
[  599.056203] RBP: 0000557de39a25e0 R08: 0000000000000000 R09: 
0000000000000000
[  599.056718] R10: 0000000000000000 R11: 0000000000000293 R12: 
0000000000000003
[  599.057227] R13: 00007fa5c9e5bf80 R14: 0000000000000011 R15: 
0000557de418ff60
[  599.057738]  </TASK>
[  599.058227] Modules linked in: xt_CHECKSUM xt_MASQUERADE xt_conntrack 
ipt_REJECT nf_reject_ipv4 xt_tcpudp nft_compat nft_chain_nat nf_nat 
nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables nfnetlink bridge 
stp llc ipmi_ssif binfmt_misc nls_iso8859_1 intel_rapl_msr 
intel_rapl_common amd64_edac edac_mce_amd kvm_amd kvm dell_smbios dcdbas 
rapl dell_wmi_descriptor wmi_bmof joydev input_leds ccp ptdma k10temp 
acpi_ipmi ipmi_si acpi_power_meter mac_hid sch_fq_codel dm_multipath 
scsi_dh_rdac ipmi_devintf scsi_dh_emc ipmi_msghandler scsi_dh_alua msr 
ramoops reed_solomon pstore_blk pstore_zone efi_pstore ip_tables 
x_tables autofs4 hid_generic usbhid hid btrfs blake2b_generic raid10 
raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor 
raid6_pq libcrc32c raid1 raid0 multipath linear mgag200 i2c_algo_bit 
drm_shmem_helper drm_kms_helper crct10dif_pclmul crc32_pclmul 
ghash_clmulni_intel sha512_ssse3 aesni_intel crypto_simd cryptd nvme drm 
mpt3sas tg3 ahci nvme_core libahci xhci_pci raid_class i2c_piix4
[  599.058423]  xhci_pci_renesas scsi_transport_sas wmi
[  599.064210] CR2: 0000000000000000
[  599.064841] ---[ end trace 0000000000000000 ]---
--

Looking at the RIP (free_iova_rcaches+0x9c/0x110) pointed me to the above 
line, leading me to believe we are popping an element from an empty stack. 
The following hunk fixed the issue for me:

diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index 76a7d694708e..899f1c2ba62a 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -732,6 +732,9 @@ static struct iova_magazine *iova_depot_pop(struct 
iova_rcache *rcache)
  {
  	struct iova_magazine *mag = rcache->depot;

+	if (!mag)
+		return NULL;
+
  	rcache->depot = mag->next;
  	mag->size = IOVA_MAG_SIZE;
  	rcache->depot_size--;
--

> +	mag->size = IOVA_MAG_SIZE;
> +	return mag;
> +}
> +

--
Thanks and Regards
Dheeraj Kumar Srivastava


^ permalink raw reply related	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/2] iommu/iova: Make the rcache depot scale better
  2023-08-21  8:11   ` Srivastava, Dheeraj Kumar
@ 2023-08-21  8:55     ` Robin Murphy
  2023-08-21  9:03       ` Srivastava, Dheeraj Kumar
  0 siblings, 1 reply; 17+ messages in thread
From: Robin Murphy @ 2023-08-21  8:55 UTC (permalink / raw)
  To: Srivastava, Dheeraj Kumar, joro
  Cc: will, iommu, linux-kernel, john.g.garry, zhangzekun11

On 2023-08-21 09:11, Srivastava, Dheeraj Kumar wrote:
> Hello Robin,
> 
> On 8/14/2023 11:23 PM, Robin Murphy wrote:
>> The algorithm in the original paper specifies the storage of full
>> magazines in the depot as an unbounded list rather than a fixed-size
>> array. It turns out to be pretty straightforward to do this in our
>> implementation with no significant loss of efficiency. This allows
>> the depot to scale up to the working set sizes of larger systems,
>> while also potentially saving some memory on smaller ones too.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
>> ---
>>   drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
>>   1 file changed, 36 insertions(+), 29 deletions(-)
>>
>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>> index 10b964600948..d2de6fb0e9f4 100644
>> --- a/drivers/iommu/iova.c
>> +++ b/drivers/iommu/iova.c
>> @@ -625,10 +625,16 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>>    * will be wasted.
>>    */
>>   #define IOVA_MAG_SIZE 127
>> -#define MAX_GLOBAL_MAGS 32    /* magazines per bin */
>>   struct iova_magazine {
>> -    unsigned long size;
>> +    /*
>> +     * Only full magazines are inserted into the depot, so we can avoid
>> +     * a separate list head and preserve maximum space-efficiency.
>> +     */
>> +    union {
>> +        unsigned long size;
>> +        struct iova_magazine *next;
>> +    };
>>       unsigned long pfns[IOVA_MAG_SIZE];
>>   };
>> @@ -640,8 +646,7 @@ struct iova_cpu_rcache {
>>   struct iova_rcache {
>>       spinlock_t lock;
>> -    unsigned long depot_size;
>> -    struct iova_magazine *depot[MAX_GLOBAL_MAGS];
>> +    struct iova_magazine *depot;
>>       struct iova_cpu_rcache __percpu *cpu_rcaches;
>>   };
>> @@ -717,6 +722,21 @@ static void iova_magazine_push(struct 
>> iova_magazine *mag, unsigned long pfn)
>>       mag->pfns[mag->size++] = pfn;
>>   }
>> +static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
>> +{
>> +    struct iova_magazine *mag = rcache->depot;
>> +
>> +    rcache->depot = mag->next;
> 
> While doing a routine domain change test for a device ("unbind device from 
> driver -> change domain of the device -> bind device back to the 
> driver"), I ran into the following NULL pointer dereference.
> 
> [  599.020261] BUG: kernel NULL pointer dereference, address: 
> 0000000000000000
> [  599.020986] #PF: supervisor read access in kernel mode
> [  599.021588] #PF: error_code(0x0000) - not-present page
> [  599.022180] PGD 0 P4D 0
> [  599.022770] Oops: 0000 [#1] PREEMPT SMP NOPTI
> [  599.023365] CPU: 68 PID: 3122 Comm: avocado Not tainted 
> 6.5.0-rc6-ChngDomainIssue #16
> [  599.023970] Hardware name: Dell Inc. PowerEdge R6515/07PXPY, BIOS 
> 2.3.6 07/06/2021
> [  599.024571] RIP: 0010:free_iova_rcaches+0x9c/0x110
> [  599.025170] Code: d1 ff 39 05 36 d2 bc 01 48 89 c3 77 b4 49 8b 7f 10 
> e8 b8 69 93 ff 49 8d 7f 20 e8 6f e4 6b ff eb 05 e8 48 ba 93 ff 49 8b 7f 
> 08 <48> 8b 07 49 89 47 08 48 c7 07 7f 00 00 00 41 83 6f 04 01 48 85 ff
> [  599.026436] RSP: 0018:ffffb78b4c9f7c68 EFLAGS: 00010296
> [  599.027075] RAX: ffffffff9fa8c100 RBX: 0000000000000080 RCX: 
> 0000000000000005
> [  599.027719] RDX: 0000000000000000 RSI: 000000007fffffff RDI: 
> 0000000000000000
> [  599.028359] RBP: ffffb78b4c9f7c98 R08: 0000000000000000 R09: 
> 0000000000000006
> [  599.028995] R10: 0000000000000000 R11: 0000000000000000 R12: 
> 0000000000000000
> [  599.029636] R13: ffff93910d9c6008 R14: ffffd78b3bde1000 R15: 
> ffff9391144ebc00
> [  599.030283] FS:  00007fa5c9e5c000(0000) GS:ffff93cf72700000(0000) 
> knlGS:0000000000000000
> [  599.030941] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [  599.031588] CR2: 0000000000000000 CR3: 000000013f526006 CR4: 
> 0000000000770ee0
> [  599.032237] PKRU: 55555554
> [  599.032878] Call Trace:
> [  599.033512]  <TASK>
> [  599.034140]  ? show_regs+0x6e/0x80
> [  599.034769]  ? __die+0x29/0x70
> [  599.035393]  ? page_fault_oops+0x154/0x4a0
> [  599.036021]  ? __x86_return_thunk+0x9/0x10
> [  599.036647]  ? do_user_addr_fault+0x318/0x6b0
> [  599.037258]  ? __x86_return_thunk+0x9/0x10
> [  599.037866]  ? __slab_free+0xc7/0x320
> [  599.038472]  ? exc_page_fault+0x7d/0x190
> [  599.039074]  ? asm_exc_page_fault+0x2b/0x30
> [  599.039683]  ? free_iova_rcaches+0x9c/0x110
> [  599.040286]  ? free_iova_rcaches+0x91/0x110
> [  599.040875]  ? __x86_return_thunk+0x9/0x10
> [  599.041460]  put_iova_domain+0x32/0xa0
> [  599.042041]  iommu_put_dma_cookie+0x177/0x1b0
> [  599.042620]  iommu_domain_free+0x1f/0x50
> [  599.043194]  iommu_setup_default_domain+0x2fb/0x420
> [  599.043774]  iommu_group_store_type+0xb6/0x210
> [  599.044362]  iommu_group_attr_store+0x21/0x40
> [  599.044938]  sysfs_kf_write+0x42/0x50
> [  599.045511]  kernfs_fop_write_iter+0x143/0x1d0
> [  599.046084]  vfs_write+0x2c2/0x3f0
> [  599.046653]  ksys_write+0x6b/0xf0
> [  599.047219]  __x64_sys_write+0x1d/0x30
> [  599.047782]  do_syscall_64+0x60/0x90
> [  599.048346]  ? syscall_exit_to_user_mode+0x2a/0x50
> [  599.048911]  ? __x64_sys_lseek+0x1c/0x30
> [  599.049465]  ? __x86_return_thunk+0x9/0x10
> [  599.050013]  ? do_syscall_64+0x6d/0x90
> [  599.050562]  ? do_syscall_64+0x6d/0x90
> [  599.051098]  ? do_syscall_64+0x6d/0x90
> [  599.051625]  ? __x86_return_thunk+0x9/0x10
> [  599.052149]  ? exc_page_fault+0x8e/0x190
> [  599.052665]  entry_SYSCALL_64_after_hwframe+0x73/0xdd
> [  599.053180] RIP: 0033:0x7fa5c9d14a6f
> [  599.053670] Code: 89 54 24 18 48 89 74 24 10 89 7c 24 08 e8 19 c0 f7 
> ff 48 8b 54 24 18 48 8b 74 24 10 41 89 c0 8b 7c 24 08 b8 01 00 00 00 0f 
> 05 <48> 3d 00 f0 ff ff 77 31 44 89 c7 48 89 44 24 08 e8 5c c0 f7 ff 48
> [  599.054672] RSP: 002b:00007ffdeb05f540 EFLAGS: 00000293 ORIG_RAX: 
> 0000000000000001
> [  599.055182] RAX: ffffffffffffffda RBX: 0000557de3275a80 RCX: 
> 00007fa5c9d14a6f
> [  599.055692] RDX: 0000000000000003 RSI: 0000557de418ff60 RDI: 
> 0000000000000011
> [  599.056203] RBP: 0000557de39a25e0 R08: 0000000000000000 R09: 
> 0000000000000000
> [  599.056718] R10: 0000000000000000 R11: 0000000000000293 R12: 
> 0000000000000003
> [  599.057227] R13: 00007fa5c9e5bf80 R14: 0000000000000011 R15: 
> 0000557de418ff60
> [  599.057738]  </TASK>
> [  599.058227] Modules linked in: xt_CHECKSUM xt_MASQUERADE xt_conntrack 
> ipt_REJECT nf_reject_ipv4 xt_tcpudp nft_compat nft_chain_nat nf_nat 
> nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables nfnetlink bridge 
> stp llc ipmi_ssif binfmt_misc nls_iso8859_1 intel_rapl_msr 
> intel_rapl_common amd64_edac edac_mce_amd kvm_amd kvm dell_smbios dcdbas 
> rapl dell_wmi_descriptor wmi_bmof joydev input_leds ccp ptdma k10temp 
> acpi_ipmi ipmi_si acpi_power_meter mac_hid sch_fq_codel dm_multipath 
> scsi_dh_rdac ipmi_devintf scsi_dh_emc ipmi_msghandler scsi_dh_alua msr 
> ramoops reed_solomon pstore_blk pstore_zone efi_pstore ip_tables 
> x_tables autofs4 hid_generic usbhid hid btrfs blake2b_generic raid10 
> raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor 
> raid6_pq libcrc32c raid1 raid0 multipath linear mgag200 i2c_algo_bit 
> drm_shmem_helper drm_kms_helper crct10dif_pclmul crc32_pclmul 
> ghash_clmulni_intel sha512_ssse3 aesni_intel crypto_simd cryptd nvme drm 
> mpt3sas tg3 ahci nvme_core libahci xhci_pci raid_class i2c_piix4
> [  599.058423]  xhci_pci_renesas scsi_transport_sas wmi
> [  599.064210] CR2: 0000000000000000
> [  599.064841] ---[ end trace 0000000000000000 ]---
> -- 
> 
> Looking at the RIP (free_iova_rcaches+0x9c/0x110) pointed me to the above 
> line, leading me to believe we are popping an element from an empty stack. 
> The following hunk fixed the issue for me:

Oh dear... looks like this was a brainfart when I factored out the 
push/pop helpers to replace the original list_head-based prototype. 
Thanks for the catch!

This fix is functionally fine, but I think what I'll do for v2 is 
change those "while ((mag = iova_depot_pop(rcache)))" loops, since 
assignment-in-the-loop-condition logic tends to look a bit suspect 
anyway. Other than that, though, were you able to notice any difference 
(good or bad) in CPU load or memory consumption overall?
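
For illustration, one possible shape for the free_iova_rcaches() loop (a 
sketch only, not necessarily what v2 will end up doing) would be to test 
the list head directly, so the pop helper is never called on an empty 
depot:

	while (rcache->depot)
		iova_magazine_free(iova_depot_pop(rcache));

with the equivalent change (plus the iova_magazine_free_pfns() call) in 
free_global_cached_iovas().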

Thanks,
Robin.

> 
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index 76a7d694708e..899f1c2ba62a 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -732,6 +732,9 @@ static struct iova_magazine *iova_depot_pop(struct 
> iova_rcache *rcache)
>   {
>       struct iova_magazine *mag = rcache->depot;
> 
> +    if (!mag)
> +        return NULL;
> +
>       rcache->depot = mag->next;
>       mag->size = IOVA_MAG_SIZE;
>       rcache->depot_size--;
> -- 
> 
>> +    mag->size = IOVA_MAG_SIZE;
>> +    return mag;
>> +}
>> +
> 
> -- 
> Thanks and Regards
> Dheeraj Kumar Srivastava
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/2] iommu/iova: Make the rcache depot scale better
  2023-08-21  8:55     ` Robin Murphy
@ 2023-08-21  9:03       ` Srivastava, Dheeraj Kumar
  0 siblings, 0 replies; 17+ messages in thread
From: Srivastava, Dheeraj Kumar @ 2023-08-21  9:03 UTC (permalink / raw)
  To: Robin Murphy, joro; +Cc: will, iommu, linux-kernel, john.g.garry, zhangzekun11

Hello Robin,

Thanks for the quick reply.

On 8/21/2023 2:25 PM, Robin Murphy wrote:
> Other than that, though, were you able to notice any difference (good or 
> bad) in CPU load or memory consumption overall?

I am still in the process of testing the patches. I will let you know if 
I observe any improvement in CPU load or memory consumption.

Thanks and Regards,
Dheeraj Kumar Srivastava

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible
  2023-08-16 15:10       ` Robin Murphy
@ 2023-08-21 11:35         ` John Garry
  0 siblings, 0 replies; 17+ messages in thread
From: John Garry @ 2023-08-21 11:35 UTC (permalink / raw)
  To: Robin Murphy, joro; +Cc: will, iommu, linux-kernel, zhangzekun11

On 16/08/2023 16:10, Robin Murphy wrote:
> On 15/08/2023 2:35 pm, John Garry wrote:
>> On 15/08/2023 12:11, Robin Murphy wrote:
>>>>
>>>> This threshold is the number of online CPUs, right?
>>>
>>> Yes, that's nominally half of the current fixed size (based on all 
>>> the performance figures from the original series seemingly coming 
>>> from a 16-thread machine),
>>
>> If you are talking about 
>> https://lore.kernel.org/linux-iommu/20230811130246.42719-1-zhangzekun11@huawei.com/,
> 
> No, I mean the *original* rcache patch submission, and its associated 
> paper:
> 
> https://lore.kernel.org/linux-iommu/cover.1461135861.git.mad@cs.technion.ac.il/

oh, that one :)

>> then I think it's a 256-CPU system and the DMA controller has 16 HW 
>> queues. The 16 HW queues are relevant as the per-completion queue 
>> interrupt handler runs on a fixed CPU from the set of 16 CPUs in the 
>> HW queue interrupt handler affinity mask. And what this means is that 
>> while any CPU may alloc an IOVA, only those 16 CPUs handling each HW 
>> queue's interrupt will be freeing IOVAs.
>>
>>> but seemed like a fair compromise. I am of course keen to see how 
>>> real-world testing actually pans out.
>>>
>>>>> it's enough of a challenge to get my 4-core dev board with spinning 
>>>>> disk
>>>>> and gigabit ethernet to push anything into a depot at all 😄
>>>>>
>>>>
>>>> I have to admit that I was hoping to also see a more aggressive 
>>>> reclaim strategy, where we also trim the per-CPU rcaches when not in 
>>>> use. Leizhen proposed something like this a long time ago.
>>>
>>> Don't think I haven't been having various elaborate ideas for making 
>>> it cleverer with multiple thresholds and self-tuning, however I have 
>>> managed to restrain myself 😉
>>>
>>
>> OK, understood. My main issue WRT scalability is that the total 
>> number of cacheable IOVAs (CPU and depot rcaches) scales up with the 
>> number of CPUs, but many DMA controllers have a fixed limit on max 
>> in-flight requests.
>>
>> Consider a SCSI storage controller on a 256-CPU system. The in-flight 
>> limit for this example controller is 4096, which would typically never 
>> be fully used up, and may not even all be usable.
>>
>> For this device, we need 4096 * 6 [IOVA rcache ranges] = ~24K cached 
>> IOVAs if we were to pre-allocate them all - obviously I am ignoring 
>> that we have the per-CPU rcaches for speed and it would not make sense 
>> to share one set. However, according to the current IOVA driver, we 
>> can in theory cache up to ((256 [CPUs] * 2 [loaded + prev]) + 32 
>> [depot size]) * 6 [rcache ranges] * 127 [IOVAs per mag] = ~415K IOVAs. 
>> That's ~17x what we would ever need.
>>
>> Something like NVMe is different, as its total requests can scale up 
>> with the CPU count, but only to a limit. I am not sure about network 
>> controllers.
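
(Spelling out my arithmetic above: (256 * 2) + 32 = 544 magazines per 
rcache range, and 544 * 6 [ranges] * 127 [IOVAs per mag] = ~415K 
cacheable IOVAs, versus 4096 * 6 = ~24K which this controller could 
ever actually have in flight - hence the ~17x.)
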
> 
> Remember that this threshold only represents a point at which we 
> consider the cache to have grown "big enough" to start background 
> reclaim - over the short term it is neither an upper nor a lower limit 
> on the cache capacity itself. Indeed it will be larger than the working 
> set of some workloads, but then it still wants to be enough of a buffer 
> to be useful for others which do make big bursts of allocations only 
> periodically.
> 
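
My own rough sketch of how I picture that scheme working (purely 
illustrative - the helper and field names here are invented, and it 
assumes iova_rcache grows a depot_size count, a delayed_work, and an 
iovad back-pointer in patch #2):

	static void iova_depot_work_func(struct work_struct *work)
	{
		struct iova_rcache *rcache = container_of(work, struct iova_rcache, work.work);
		struct iova_magazine *mag = NULL;
		unsigned long flags;

		spin_lock_irqsave(&rcache->lock, flags);
		/* Only trim while the depot is above the "big enough" threshold */
		if (rcache->depot_size > num_online_cpus())
			mag = iova_depot_pop(rcache);
		spin_unlock_irqrestore(&rcache->lock, flags);

		if (mag) {
			/* Return the IOVAs and the magazine itself to the system */
			iova_magazine_free_pfns(mag, rcache->iovad);
			iova_magazine_free(mag);
		}
	}
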

It would be interesting to see what zhangzekun finds for this series. He 
was testing on a 5.10-based kernel - things have changed a lot since 
then and I am not really sure what the problem could have been there.

>> Anyway, this is just something which I think should be considered - 
>> which I guess already has been.
> 
> Indeed, I would tend to assume that machines with hundreds of CPUs are 
> less likely to be constrained on overall memory and/or IOVA space, 

Cheers,
John


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/2] iommu/iova: Make the rcache depot scale better
  2023-08-14 17:53 ` [PATCH 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
  2023-08-21  8:11   ` Srivastava, Dheeraj Kumar
@ 2023-08-21 12:02   ` John Garry
  2023-08-21 12:28     ` Robin Murphy
  1 sibling, 1 reply; 17+ messages in thread
From: John Garry @ 2023-08-21 12:02 UTC (permalink / raw)
  To: Robin Murphy, joro; +Cc: will, iommu, linux-kernel, zhangzekun11

On 14/08/2023 18:53, Robin Murphy wrote:
> The algorithm in the original paper specifies the storage of full
> magazines in the depot as an unbounded list rather than a fixed-size
> array. It turns out to be pretty straightforward to do this in our
> implementation with no significant loss of efficiency. This allows
> the depot to scale up to the working set sizes of larger systems,
> while also potentially saving some memory on smaller ones too.
> 
> Signed-off-by: Robin Murphy <robin.murphy@arm.com>

This looks ok (ignoring the crash reported), so feel free to add:

Reviewed-by: John Garry <john.g.garry@oracle.com>

A small comment and question below.

> ---
>   drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
>   1 file changed, 36 insertions(+), 29 deletions(-)
> 
> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
> index 10b964600948..d2de6fb0e9f4 100644
> --- a/drivers/iommu/iova.c
> +++ b/drivers/iommu/iova.c
> @@ -625,10 +625,16 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>    * will be wasted.
>    */
>   #define IOVA_MAG_SIZE 127
> -#define MAX_GLOBAL_MAGS 32	/* magazines per bin */
>   
>   struct iova_magazine {
> -	unsigned long size;
> +	/*
> +	 * Only full magazines are inserted into the depot, so we can avoid
> +	 * a separate list head and preserve maximum space-efficiency.

It might be worth explicitly mentioning that we try to keep the total 
mag size as a power of 2.

> +	 */
> +	union {
> +		unsigned long size;
> +		struct iova_magazine *next;
> +	};
>   	unsigned long pfns[IOVA_MAG_SIZE];
>   };
>   
> @@ -640,8 +646,7 @@ struct iova_cpu_rcache {
>   
>   struct iova_rcache {
>   	spinlock_t lock;
> -	unsigned long depot_size;
> -	struct iova_magazine *depot[MAX_GLOBAL_MAGS];
> +	struct iova_magazine *depot;
>   	struct iova_cpu_rcache __percpu *cpu_rcaches;
>   };
>   
> @@ -717,6 +722,21 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
>   	mag->pfns[mag->size++] = pfn;
>   }
>   
> +static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
> +{
> +	struct iova_magazine *mag = rcache->depot;
> +
> +	rcache->depot = mag->next;
> +	mag->size = IOVA_MAG_SIZE;
> +	return mag;
> +}
> +
> +static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *mag)
> +{
> +	mag->next = rcache->depot;
> +	rcache->depot = mag;
> +}
> +
>   int iova_domain_init_rcaches(struct iova_domain *iovad)
>   {
>   	unsigned int cpu;
> @@ -734,7 +754,6 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
>   
>   		rcache = &iovad->rcaches[i];
>   		spin_lock_init(&rcache->lock);
> -		rcache->depot_size = 0;
>   		rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
>   						     cache_line_size());
>   		if (!rcache->cpu_rcaches) {
> @@ -776,7 +795,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
>   				 struct iova_rcache *rcache,
>   				 unsigned long iova_pfn)
>   {
> -	struct iova_magazine *mag_to_free = NULL;
>   	struct iova_cpu_rcache *cpu_rcache;
>   	bool can_insert = false;
>   	unsigned long flags;
> @@ -794,12 +812,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
>   
>   		if (new_mag) {
>   			spin_lock(&rcache->lock);
> -			if (rcache->depot_size < MAX_GLOBAL_MAGS) {
> -				rcache->depot[rcache->depot_size++] =
> -						cpu_rcache->loaded;
> -			} else {
> -				mag_to_free = cpu_rcache->loaded;
> -			}
> +			iova_depot_push(rcache, cpu_rcache->loaded);
>   			spin_unlock(&rcache->lock);

Out of curiosity, do you know why we take the approach (prior to this 
change) of freeing the loaded mag and allocating a new empty mag? Why 
not instead just say that we can't insert and bail out?

>   
>   			cpu_rcache->loaded = new_mag;
> @@ -812,11 +825,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
>   
>   	spin_unlock_irqrestore(&cpu_rcache->lock, flags);
>   
> -	if (mag_to_free) {
> -		iova_magazine_free_pfns(mag_to_free, iovad);
> -		iova_magazine_free(mag_to_free);
> -	}
> -
>   	return can_insert;
>   }
>   
> @@ -854,9 +862,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
>   		has_pfn = true;
>   	} else {
>   		spin_lock(&rcache->lock);
> -		if (rcache->depot_size > 0) {
> +		if (rcache->depot) {
>   			iova_magazine_free(cpu_rcache->loaded);
> -			cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
> +			cpu_rcache->loaded = iova_depot_pop(rcache);
>   			has_pfn = true;
>   		}
>   		spin_unlock(&rcache->lock);
> @@ -894,10 +902,10 @@ static void free_iova_rcaches(struct iova_domain *iovad)
>   {
>   	struct iova_rcache *rcache;
>   	struct iova_cpu_rcache *cpu_rcache;
> +	struct iova_magazine *mag;
>   	unsigned int cpu;
> -	int i, j;
>   
> -	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
> +	for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>   		rcache = &iovad->rcaches[i];
>   		if (!rcache->cpu_rcaches)
>   			break;
> @@ -907,8 +915,8 @@ static void free_iova_rcaches(struct iova_domain *iovad)
>   			iova_magazine_free(cpu_rcache->prev);
>   		}
>   		free_percpu(rcache->cpu_rcaches);
> -		for (j = 0; j < rcache->depot_size; ++j)
> -			iova_magazine_free(rcache->depot[j]);
> +		while ((mag = iova_depot_pop(rcache)))
> +			iova_magazine_free(mag);
>   	}
>   
>   	kfree(iovad->rcaches);
> @@ -941,17 +949,16 @@ static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
>   static void free_global_cached_iovas(struct iova_domain *iovad)
>   {
>   	struct iova_rcache *rcache;
> +	struct iova_magazine *mag;
>   	unsigned long flags;
> -	int i, j;
>   
> -	for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
> +	for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>   		rcache = &iovad->rcaches[i];
>   		spin_lock_irqsave(&rcache->lock, flags);
> -		for (j = 0; j < rcache->depot_size; ++j) {
> -			iova_magazine_free_pfns(rcache->depot[j], iovad);
> -			iova_magazine_free(rcache->depot[j]);
> +		while ((mag = iova_depot_pop(rcache))) {
> +			iova_magazine_free_pfns(mag, iovad);
> +			iova_magazine_free(mag);
>   		}
> -		rcache->depot_size = 0;
>   		spin_unlock_irqrestore(&rcache->lock, flags);
>   	}
>   }


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH 1/2] iommu/iova: Make the rcache depot scale better
  2023-08-21 12:02   ` John Garry
@ 2023-08-21 12:28     ` Robin Murphy
  0 siblings, 0 replies; 17+ messages in thread
From: Robin Murphy @ 2023-08-21 12:28 UTC (permalink / raw)
  To: John Garry, joro; +Cc: will, iommu, linux-kernel, zhangzekun11

On 2023-08-21 13:02, John Garry wrote:
> On 14/08/2023 18:53, Robin Murphy wrote:
>> The algorithm in the original paper specifies the storage of full
>> magazines in the depot as an unbounded list rather than a fixed-size
>> array. It turns out to be pretty straightforward to do this in our
>> implementation with no significant loss of efficiency. This allows
>> the depot to scale up to the working set sizes of larger systems,
>> while also potentially saving some memory on smaller ones too.
>>
>> Signed-off-by: Robin Murphy <robin.murphy@arm.com>
> 
> This looks ok (ignoring the crash reported), so feel free to add:
> 
> Reviewed-by: John Garry <john.g.garry@oracle.com>

Thanks!

> A small comment and question below.
> 
>> ---
>>   drivers/iommu/iova.c | 65 ++++++++++++++++++++++++--------------------
>>   1 file changed, 36 insertions(+), 29 deletions(-)
>>
>> diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
>> index 10b964600948..d2de6fb0e9f4 100644
>> --- a/drivers/iommu/iova.c
>> +++ b/drivers/iommu/iova.c
>> @@ -625,10 +625,16 @@ EXPORT_SYMBOL_GPL(reserve_iova);
>>    * will be wasted.
>>    */
>>   #define IOVA_MAG_SIZE 127
>> -#define MAX_GLOBAL_MAGS 32    /* magazines per bin */
>>   struct iova_magazine {
>> -    unsigned long size;
>> +    /*
>> +     * Only full magazines are inserted into the depot, so we can avoid
>> +     * a separate list head and preserve maximum space-efficiency.
> 
> It might be worth explicitly mentioning that we try to keep the total 
> mag size as a power of 2.

Sure, I can tie it in with the existing comment above, which might 
actually end up more readable anyway.
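
Something along these lines, perhaps (rough wording, not the final 
comment):

	/*
	 * We want the whole struct to fill a power-of-2 kmalloc bucket with
	 * nothing wasted, hence 127 pfn entries plus one more word. Only
	 * full magazines are inserted into the depot, so that word can do
	 * double duty as either the current size or the depot list link.
	 */
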

>> +     */
>> +    union {
>> +        unsigned long size;
>> +        struct iova_magazine *next;
>> +    };
>>       unsigned long pfns[IOVA_MAG_SIZE];
>>   };
>> @@ -640,8 +646,7 @@ struct iova_cpu_rcache {
>>   struct iova_rcache {
>>       spinlock_t lock;
>> -    unsigned long depot_size;
>> -    struct iova_magazine *depot[MAX_GLOBAL_MAGS];
>> +    struct iova_magazine *depot;
>>       struct iova_cpu_rcache __percpu *cpu_rcaches;
>>   };
>> @@ -717,6 +722,21 @@ static void iova_magazine_push(struct iova_magazine *mag, unsigned long pfn)
>>       mag->pfns[mag->size++] = pfn;
>>   }
>> +static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache)
>> +{
>> +    struct iova_magazine *mag = rcache->depot;
>> +
>> +    rcache->depot = mag->next;
>> +    mag->size = IOVA_MAG_SIZE;
>> +    return mag;
>> +}
>> +
>> +static void iova_depot_push(struct iova_rcache *rcache, struct iova_magazine *mag)
>> +{
>> +    mag->next = rcache->depot;
>> +    rcache->depot = mag;
>> +}
>> +
>>   int iova_domain_init_rcaches(struct iova_domain *iovad)
>>   {
>>       unsigned int cpu;
>> @@ -734,7 +754,6 @@ int iova_domain_init_rcaches(struct iova_domain *iovad)
>>           rcache = &iovad->rcaches[i];
>>           spin_lock_init(&rcache->lock);
>> -        rcache->depot_size = 0;
>>           rcache->cpu_rcaches = __alloc_percpu(sizeof(*cpu_rcache),
>>                                cache_line_size());
>>           if (!rcache->cpu_rcaches) {
>> @@ -776,7 +795,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
>>                    struct iova_rcache *rcache,
>>                    unsigned long iova_pfn)
>>   {
>> -    struct iova_magazine *mag_to_free = NULL;
>>       struct iova_cpu_rcache *cpu_rcache;
>>       bool can_insert = false;
>>       unsigned long flags;
>> @@ -794,12 +812,7 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
>>           if (new_mag) {
>>               spin_lock(&rcache->lock);
>> -            if (rcache->depot_size < MAX_GLOBAL_MAGS) {
>> -                rcache->depot[rcache->depot_size++] =
>> -                        cpu_rcache->loaded;
>> -            } else {
>> -                mag_to_free = cpu_rcache->loaded;
>> -            }
>> +            iova_depot_push(rcache, cpu_rcache->loaded);
>>               spin_unlock(&rcache->lock);
> 
> Out of curiosity, do you know why we take the approach (prior to this 
> change) of freeing the loaded mag and allocating a new empty mag? Why 
> not instead just say that we can't insert and bail out?

I have a feeling it must have been mentioned at some point, since my 
memory says there was a deliberate intent to keep the flow through the 
critical section simple and consistent, and to minimise time spent 
holding the rcache lock - and I'm 99% sure that isn't my own inferred 
reasoning...
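
i.e. the slow path is deliberately structured so that the only work 
done under the rcache lock is a couple of pointer updates, and 'loaded' 
is always left with room for the push that follows - annotating the 
hunk above:

	if (new_mag) {
		spin_lock(&rcache->lock);
		/* O(1): two pointer writes, nothing freed under the lock */
		iova_depot_push(rcache, cpu_rcache->loaded);
		spin_unlock(&rcache->lock);

		/* always leave a guaranteed-empty magazine loaded */
		cpu_rcache->loaded = new_mag;
		can_insert = true;
	}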

Cheers,
Robin.

>>               cpu_rcache->loaded = new_mag;
>> @@ -812,11 +825,6 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
>>       spin_unlock_irqrestore(&cpu_rcache->lock, flags);
>> -    if (mag_to_free) {
>> -        iova_magazine_free_pfns(mag_to_free, iovad);
>> -        iova_magazine_free(mag_to_free);
>> -    }
>> -
>>       return can_insert;
>>   }
>> @@ -854,9 +862,9 @@ static unsigned long __iova_rcache_get(struct iova_rcache *rcache,
>>           has_pfn = true;
>>       } else {
>>           spin_lock(&rcache->lock);
>> -        if (rcache->depot_size > 0) {
>> +        if (rcache->depot) {
>>               iova_magazine_free(cpu_rcache->loaded);
>> -            cpu_rcache->loaded = rcache->depot[--rcache->depot_size];
>> +            cpu_rcache->loaded = iova_depot_pop(rcache);
>>               has_pfn = true;
>>           }
>>           spin_unlock(&rcache->lock);
>> @@ -894,10 +902,10 @@ static void free_iova_rcaches(struct iova_domain *iovad)
>>   {
>>       struct iova_rcache *rcache;
>>       struct iova_cpu_rcache *cpu_rcache;
>> +    struct iova_magazine *mag;
>>       unsigned int cpu;
>> -    int i, j;
>> -    for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>> +    for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>>           rcache = &iovad->rcaches[i];
>>           if (!rcache->cpu_rcaches)
>>               break;
>> @@ -907,8 +915,8 @@ static void free_iova_rcaches(struct iova_domain *iovad)
>>               iova_magazine_free(cpu_rcache->prev);
>>           }
>>           free_percpu(rcache->cpu_rcaches);
>> -        for (j = 0; j < rcache->depot_size; ++j)
>> -            iova_magazine_free(rcache->depot[j]);
>> +        while ((mag = iova_depot_pop(rcache)))
>> +            iova_magazine_free(mag);
>>       }
>>       kfree(iovad->rcaches);
>> @@ -941,17 +949,16 @@ static void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad)
>>   static void free_global_cached_iovas(struct iova_domain *iovad)
>>   {
>>       struct iova_rcache *rcache;
>> +    struct iova_magazine *mag;
>>       unsigned long flags;
>> -    int i, j;
>> -    for (i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>> +    for (int i = 0; i < IOVA_RANGE_CACHE_MAX_SIZE; ++i) {
>>           rcache = &iovad->rcaches[i];
>>           spin_lock_irqsave(&rcache->lock, flags);
>> -        for (j = 0; j < rcache->depot_size; ++j) {
>> -            iova_magazine_free_pfns(rcache->depot[j], iovad);
>> -            iova_magazine_free(rcache->depot[j]);
>> +        while ((mag = iova_depot_pop(rcache))) {
>> +            iova_magazine_free_pfns(mag, iovad);
>> +            iova_magazine_free(mag);
>>           }
>> -        rcache->depot_size = 0;
>>           spin_unlock_irqrestore(&rcache->lock, flags);
>>       }
>>   }
> 

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2023-08-21 12:28 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-08-14 17:53 [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible Robin Murphy
2023-08-14 17:53 ` [PATCH 1/2] iommu/iova: Make the rcache depot scale better Robin Murphy
2023-08-21  8:11   ` Srivastava, Dheeraj Kumar
2023-08-21  8:55     ` Robin Murphy
2023-08-21  9:03       ` Srivastava, Dheeraj Kumar
2023-08-21 12:02   ` John Garry
2023-08-21 12:28     ` Robin Murphy
2023-08-14 17:53 ` [PATCH 2/2] iommu/iova: Manage the depot list size Robin Murphy
2023-08-15 14:11   ` zhangzekun (A)
2023-08-16  4:25     ` Jerry Snitselaar
2023-08-16 16:52     ` Robin Murphy
2023-08-15 10:24 ` [PATCH 0/2] iommu/iova: Make the rcache depot properly flexible John Garry
2023-08-15 11:11   ` Robin Murphy
2023-08-15 13:35     ` John Garry
2023-08-16 15:10       ` Robin Murphy
2023-08-21 11:35         ` John Garry
2023-08-17 16:39 ` Jerry Snitselaar

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).