of: cache phandle nodes to decrease cost of of_find_node_by_phandle()
diff mbox series

Message ID 1517429142-25727-1-git-send-email-frowand.list@gmail.com
State New, archived
Headers show
Series
  • of: cache phandle nodes to decrease cost of of_find_node_by_phandle()
Related show

Commit Message

Frank Rowand Jan. 31, 2018, 8:05 p.m. UTC
From: Frank Rowand <frank.rowand@sony.com>

Create a cache of the nodes that contain a phandle property.  Use this
cache to find the node for a given phandle value instead of scanning
the devicetree to find the node.  If the phandle value is not found
in the cache, of_find_node_by_phandle() will fall back to the tree
scan algorithm.

The cache is initialized in of_core_init().

The cache is freed via a late_initcall_sync().

Signed-off-by: Frank Rowand <frank.rowand@sony.com>
---

Some of_find_node_by_phandle() calls may occur before the cache is
initialized or after it is freed.  For example, for the qualcomm
qcom-apq8074-dragonboard, 11 calls occur before the initialization
and 80 occur after the cache is freed (out of 516 total calls.)


 drivers/of/base.c       | 85 ++++++++++++++++++++++++++++++++++++++++++++++---
 drivers/of/of_private.h |  5 +++
 drivers/of/resolver.c   | 21 ------------
 3 files changed, 86 insertions(+), 25 deletions(-)

Comments

Frank Rowand Jan. 31, 2018, 8:10 p.m. UTC | #1
Hi Rob, Chintan,

This patch resolves one of the issues that Rob had raised.  The space for
the phandle cache is freed after boot, instead of existing until shutdown.

Chintan, can you please apply this patch and see how it impacts your
boot time?  The portion of the patch that removes code from resolver.c
will fail on 4.9 because that code does not exist yet -- you can just
remove that portion of the patch.

-Frank


On 01/31/18 12:05, frowand.list@gmail.com wrote:
> From: Frank Rowand <frank.rowand@sony.com>
> 
> Create a cache of the nodes that contain a phandle property.  Use this
> cache to find the node for a given phandle value instead of scanning
> the devicetree to find the node.  If the phandle value is not found
> in the cache, of_find_node_by_phandle() will fall back to the tree
> scan algorithm.
> 
> The cache is initialized in of_core_init().
> 
> The cache is freed via a late_initcall_sync().
> 
> Signed-off-by: Frank Rowand <frank.rowand@sony.com>
> ---
> 
> Some of_find_by_phandle() calls may occur before the cache is
> initialized or after it is freed.  For example, for the qualcomm
> qcom-apq8074-dragonboard, 11 calls occur before the initialization
> and 80 occur after the cache is freed (out of 516 total calls.)
> 
> 
>  drivers/of/base.c       | 85 ++++++++++++++++++++++++++++++++++++++++++++++---
>  drivers/of/of_private.h |  5 +++
>  drivers/of/resolver.c   | 21 ------------
>  3 files changed, 86 insertions(+), 25 deletions(-)
> 
> diff --git a/drivers/of/base.c b/drivers/of/base.c
> index 26618ba8f92a..c3091d0e391f 100644
> --- a/drivers/of/base.c
> +++ b/drivers/of/base.c
> @@ -95,10 +95,14 @@ int __weak of_node_to_nid(struct device_node *np)
>  }
>  #endif
>  
> +static void of_populate_phandle_cache(void);
> +
>  void __init of_core_init(void)
>  {
>  	struct device_node *np;
>  
> +	of_populate_phandle_cache();
> +
>  	/* Create the kset, and register existing nodes */
>  	mutex_lock(&of_mutex);
>  	of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj);
> @@ -990,6 +994,72 @@ int of_modalias_node(struct device_node *node, char *modalias, int len)
>  }
>  EXPORT_SYMBOL_GPL(of_modalias_node);
>  
> +phandle live_tree_max_phandle(void)
> +{
> +	struct device_node *node;
> +	phandle max_phandle;
> +	unsigned long flags;
> +
> +	raw_spin_lock_irqsave(&devtree_lock, flags);
> +	max_phandle = 0;
> +	for_each_of_allnodes(node) {
> +		if (node->phandle != OF_PHANDLE_ILLEGAL &&
> +		    node->phandle > max_phandle)
> +			max_phandle = node->phandle;
> +	}
> +	raw_spin_unlock_irqrestore(&devtree_lock, flags);
> +
> +	return max_phandle;
> +}
> +
> +static struct device_node **phandle_cache;
> +static u32 max_phandle_cache;
> +
> +static int __init of_free_phandle_cache(void)
> +{
> +	max_phandle_cache = 0;
> +	kfree(phandle_cache);
> +	phandle_cache = NULL;
> +
> +	return 0;
> +}
> +late_initcall_sync(of_free_phandle_cache);
> +
> +static void of_populate_phandle_cache(void)
> +{
> +	unsigned long flags;
> +	phandle max_phandle;
> +	u32 nodes = 0;
> +	struct device_node *np;
> +
> +	if (phandle_cache)
> +		return;
> +
> +	max_phandle = live_tree_max_phandle();
> +
> +	raw_spin_lock_irqsave(&devtree_lock, flags);
> +
> +	for_each_of_allnodes(np)
> +		nodes++;
> +
> +	/* sanity cap for malformed tree */
> +	if (max_phandle > nodes)
> +		max_phandle = nodes;
> +
> +	phandle_cache = kzalloc((max_phandle + 1) * sizeof(*phandle_cache),
> +				GFP_KERNEL);
> +
> +	for_each_of_allnodes(np)
> +		if (np->phandle != OF_PHANDLE_ILLEGAL  &&
> +		    np->phandle <= max_phandle &&
> +		    np->phandle)
> +			phandle_cache[np->phandle] = np;
> +
> +	max_phandle_cache = max_phandle;
> +
> +	raw_spin_unlock_irqrestore(&devtree_lock, flags);
> +}
> +
>  /**
>   * of_find_node_by_phandle - Find a node given a phandle
>   * @handle:	phandle of the node to find
> @@ -999,16 +1069,23 @@ int of_modalias_node(struct device_node *node, char *modalias, int len)
>   */
>  struct device_node *of_find_node_by_phandle(phandle handle)
>  {
> -	struct device_node *np;
> +	struct device_node *np = NULL;
>  	unsigned long flags;
>  
>  	if (!handle)
>  		return NULL;
>  
>  	raw_spin_lock_irqsave(&devtree_lock, flags);
> -	for_each_of_allnodes(np)
> -		if (np->phandle == handle)
> -			break;
> +
> +	if (handle <= max_phandle_cache)
> +		np = phandle_cache[handle];
> +
> +	if (!np || np->phandle != handle) {
> +		for_each_of_allnodes(np)
> +			if (np->phandle == handle)
> +				break;
> +	}
> +
>  	of_node_get(np);
>  	raw_spin_unlock_irqrestore(&devtree_lock, flags);
>  	return np;
> diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
> index 92a9a3687446..77005978d60a 100644
> --- a/drivers/of/of_private.h
> +++ b/drivers/of/of_private.h
> @@ -135,6 +135,11 @@ extern void __of_update_property_sysfs(struct device_node *np,
>  extern void __of_sysfs_remove_bin_file(struct device_node *np,
>  				       struct property *prop);
>  
> +/* illegal phandle value (set when unresolved) */
> +#define OF_PHANDLE_ILLEGAL	0xdeadbeef
> +
> +extern phandle live_tree_max_phandle(void);
> +
>  /* iterators for transactions, used for overlays */
>  /* forward iterator */
>  #define for_each_transaction_entry(_oft, _te) \
> diff --git a/drivers/of/resolver.c b/drivers/of/resolver.c
> index cfaeef5f6cb1..0384ce8fdc3b 100644
> --- a/drivers/of/resolver.c
> +++ b/drivers/of/resolver.c
> @@ -22,27 +22,6 @@
>  
>  #include "of_private.h"
>  
> -/* illegal phandle value (set when unresolved) */
> -#define OF_PHANDLE_ILLEGAL	0xdeadbeef
> -
> -static phandle live_tree_max_phandle(void)
> -{
> -	struct device_node *node;
> -	phandle phandle;
> -	unsigned long flags;
> -
> -	raw_spin_lock_irqsave(&devtree_lock, flags);
> -	phandle = 0;
> -	for_each_of_allnodes(node) {
> -		if (node->phandle != OF_PHANDLE_ILLEGAL &&
> -				node->phandle > phandle)
> -			phandle = node->phandle;
> -	}
> -	raw_spin_unlock_irqrestore(&devtree_lock, flags);
> -
> -	return phandle;
> -}
> -
>  static void adjust_overlay_phandles(struct device_node *overlay,
>  		int phandle_delta)
>  {
>
Frank Rowand Jan. 31, 2018, 9:43 p.m. UTC | #2
On 01/31/18 12:05, frowand.list@gmail.com wrote:
> From: Frank Rowand <frank.rowand@sony.com>
> 
> Create a cache of the nodes that contain a phandle property.  Use this
> cache to find the node for a given phandle value instead of scanning
> the devicetree to find the node.  If the phandle value is not found
> in the cache, of_find_node_by_phandle() will fall back to the tree
> scan algorithm.
> 
> The cache is initialized in of_core_init().
> 
> The cache is freed via a late_initcall_sync().
> 
> Signed-off-by: Frank Rowand <frank.rowand@sony.com>
> ---
> 
> Some of_find_by_phandle() calls may occur before the cache is
> initialized or after it is freed.  For example, for the qualcomm
> qcom-apq8074-dragonboard, 11 calls occur before the initialization
> and 80 occur after the cache is freed (out of 516 total calls.)
> 
> 
>  drivers/of/base.c       | 85 ++++++++++++++++++++++++++++++++++++++++++++++---
>  drivers/of/of_private.h |  5 +++
>  drivers/of/resolver.c   | 21 ------------
>  3 files changed, 86 insertions(+), 25 deletions(-)

Some observations....

The size of the cache for a normal device tree would be a couple of
words of overhead for the cache, plus one pointer per devicetree node
that contains a phandle property.  This will be less space than
would be used by adding a hash field to each device node.  It is
also less space than was used by the older algorithm (long gone)
that added a linked list through the nodes that contained a
phandle property.

This is assuming that the values of the phandle properties are
the default ones created by the dtc compiler.  In the case
where a very large phandle property value is hand-coded in
a devicetree source, the size of the cache is capped at one
entry per node.  In this case, a little bit of space will be
wasted -- but this is just a sanity fallback, it should not
be encountered, and can be fixed by fixing the devicetree
source.
Chintan Pandya Feb. 1, 2018, 6:45 a.m. UTC | #3
On 2/1/2018 1:35 AM, frowand.list@gmail.com wrote:
> From: Frank Rowand <frank.rowand@sony.com>

> +
> +static void of_populate_phandle_cache(void)
> +{
> +	unsigned long flags;
> +	phandle max_phandle;
> +	u32 nodes = 0;
> +	struct device_node *np;
> +
> +	if (phandle_cache)
> +		return;
> +
> +	max_phandle = live_tree_max_phandle();
> +
> +	raw_spin_lock_irqsave(&devtree_lock, flags);
> +
> +	for_each_of_allnodes(np)
> +		nodes++;
> +
> +	/* sanity cap for malformed tree */
> +	if (max_phandle > nodes)
> +		max_phandle = nodes;
Shouldn't we speak up about this in the kernel log? Maybe WARN_ON()?

> +
> +	phandle_cache = kzalloc((max_phandle + 1) * sizeof(*phandle_cache),
> +				GFP_KERNEL);
kzalloc (might_sleep) in critical context will break.

Anyways, will fix this locally and share test results.

Thanks,
Chintan
Frank Rowand Feb. 1, 2018, 8:59 a.m. UTC | #4
On 01/31/18 22:45, Chintan Pandya wrote:
> 
> 
> On 2/1/2018 1:35 AM, frowand.list@gmail.com wrote:
>> From: Frank Rowand <frank.rowand@sony.com>
> 
>> +
>> +static void of_populate_phandle_cache(void)
>> +{
>> +    unsigned long flags;
>> +    phandle max_phandle;
>> +    u32 nodes = 0;
>> +    struct device_node *np;
>> +
>> +    if (phandle_cache)
>> +        return;
>> +
>> +    max_phandle = live_tree_max_phandle();
>> +
>> +    raw_spin_lock_irqsave(&devtree_lock, flags);
>> +
>> +    for_each_of_allnodes(np)
>> +        nodes++;
>> +
>> +    /* sanity cap for malformed tree */
>> +    if (max_phandle > nodes)
>> +        max_phandle = nodes;
> Shouldn't we speak up about this in kernel log ? May be WARN_ON() ?

Probably not.  If we care enough about a hand coded phandle property
value we should add a check to checkpatch and/or dtc instead of adding
the warning here.


>> +
>> +    phandle_cache = kzalloc((max_phandle + 1) * sizeof(*phandle_cache),
>> +                GFP_KERNEL);
> kzalloc (might_sleep) in critical context will break.

Yes, thanks.

I also need to ensure memory ordering in of_free_phandle_cache()
to ensure that max_phandle_cache is zero before the cache memory
is freed.


> Anyways, will fix this locally and share test results.

Thanks, I look forward to the results.


> Thanks,
> Chintan
>
Chintan Pandya Feb. 1, 2018, 10:31 a.m. UTC | #5
>> Anyways, will fix this locally and share test results.
> 
> Thanks, I look forward to the results.
> 

The setup this time was slightly different, so I have taken all the numbers
again.

Boot to shell time (in seconds): Experiment 2
[1] Base			: 14.843805 14.784842 14.842338
[2] 64 sized fixed cache	: 14.189292 14.200003 14.266711
[3] Dynamic freeable cache	: 14.112412 14.064772 14.036052

So, [3] (this patch) looks to be improving 750ms (on avg from base build).
Rob Herring Feb. 1, 2018, 2:24 p.m. UTC | #6
On Wed, Jan 31, 2018 at 3:43 PM, Frank Rowand <frowand.list@gmail.com> wrote:
> On 01/31/18 12:05, frowand.list@gmail.com wrote:
>> From: Frank Rowand <frank.rowand@sony.com>
>>
>> Create a cache of the nodes that contain a phandle property.  Use this
>> cache to find the node for a given phandle value instead of scanning
>> the devicetree to find the node.  If the phandle value is not found
>> in the cache, of_find_node_by_phandle() will fall back to the tree
>> scan algorithm.
>>
>> The cache is initialized in of_core_init().
>>
>> The cache is freed via a late_initcall_sync().
>>
>> Signed-off-by: Frank Rowand <frank.rowand@sony.com>
>> ---
>>
>> Some of_find_by_phandle() calls may occur before the cache is
>> initialized or after it is freed.  For example, for the qualcomm
>> qcom-apq8074-dragonboard, 11 calls occur before the initialization
>> and 80 occur after the cache is freed (out of 516 total calls.)
>>
>>
>>  drivers/of/base.c       | 85 ++++++++++++++++++++++++++++++++++++++++++++++---
>>  drivers/of/of_private.h |  5 +++
>>  drivers/of/resolver.c   | 21 ------------
>>  3 files changed, 86 insertions(+), 25 deletions(-)
>
> Some observations....
>
> The size of the cache for a normal device tree would be a couple of
> words of overhead for the cache, plus one pointer per devicetree node
> that contains a phandle property.  This will be less space than
> would be used by adding a hash field to each device node.  It is
> also less space than was used by the older algorithm (long gone)
> that added a linked list through the nodes that contained a
> phandle property.
>
> This is assuming that the values of the phandle properties are
> the default ones created by the dtc compiler.  In the case
> where a very large phandle property value is hand-coded in
> a devicetree source, the size of the cache is capped at one
> entry per node.  In this case, a little bit of space will be
> wasted -- but this is just a sanity fallback, it should not
> be encountered, and can be fixed by fixing the devicetree
> source.

I don't think we should rely on how dtc allocates phandles. dtc is not
the only source of DeviceTrees. If we could do that, then lets make
them have some known flag in the upper byte so we have some hint for
phandle values. 2^24 phandles should be enough for anyone.TM

Your cache size is also going to balloon if the dtb was built with
'-@'. Since you walk the tree for every phandle, it is conceivable
that you could make things slower.

Freeing after boot is nice, but if someone has lots of modules or
large overlays, this doesn't help them at all.


There's still more tweaks we could do with a cache based (i.e. can
miss) approach. We could have an access count or some most recently
used list to avoid evicting frequently accessed phandles (your data
tends to indicate that would help). We could have cache sets. And so
far, no one has explained why a bigger cache got slower.

Or we could do something decentralized and make the frequent callers
cache their phandles.

Rob
Rob Herring Feb. 1, 2018, 2:26 p.m. UTC | #7
On Thu, Feb 1, 2018 at 8:24 AM, Rob Herring <robh+dt@kernel.org> wrote:
> On Wed, Jan 31, 2018 at 3:43 PM, Frank Rowand <frowand.list@gmail.com> wrote:
>> On 01/31/18 12:05, frowand.list@gmail.com wrote:
>>> From: Frank Rowand <frank.rowand@sony.com>
>>>
>>> Create a cache of the nodes that contain a phandle property.  Use this
>>> cache to find the node for a given phandle value instead of scanning
>>> the devicetree to find the node.  If the phandle value is not found
>>> in the cache, of_find_node_by_phandle() will fall back to the tree
>>> scan algorithm.
>>>
>>> The cache is initialized in of_core_init().
>>>
>>> The cache is freed via a late_initcall_sync().
>>>
>>> Signed-off-by: Frank Rowand <frank.rowand@sony.com>
>>> ---
>>>
>>> Some of_find_by_phandle() calls may occur before the cache is
>>> initialized or after it is freed.  For example, for the qualcomm
>>> qcom-apq8074-dragonboard, 11 calls occur before the initialization
>>> and 80 occur after the cache is freed (out of 516 total calls.)
>>>
>>>
>>>  drivers/of/base.c       | 85 ++++++++++++++++++++++++++++++++++++++++++++++---
>>>  drivers/of/of_private.h |  5 +++
>>>  drivers/of/resolver.c   | 21 ------------
>>>  3 files changed, 86 insertions(+), 25 deletions(-)
>>
>> Some observations....
>>
>> The size of the cache for a normal device tree would be a couple of
>> words of overhead for the cache, plus one pointer per devicetree node
>> that contains a phandle property.  This will be less space than
>> would be used by adding a hash field to each device node.  It is
>> also less space than was used by the older algorithm (long gone)
>> that added a linked list through the nodes that contained a
>> phandle property.
>>
>> This is assuming that the values of the phandle properties are
>> the default ones created by the dtc compiler.  In the case
>> where a very large phandle property value is hand-coded in
>> a devicetree source, the size of the cache is capped at one
>> entry per node.  In this case, a little bit of space will be
>> wasted -- but this is just a sanity fallback, it should not
>> be encountered, and can be fixed by fixing the devicetree
>> source.
>
> I don't think we should rely on how dtc allocates phandles. dtc is not
> the only source of DeviceTrees. If we could do that, then lets make
> them have some known flag in the upper byte so we have some hint for
> phandle values. 2^24 phandles should be enough for anyone.TM
>
> Your cache size is also going to balloon if the dtb was built with
> '-@'. Since you walk the tree for every phandle, it is conceivable
> that you could make things slower.

Disregard the 2nd statement. Only 1 walk is not going to register.
Rob Herring Feb. 1, 2018, 2:34 p.m. UTC | #8
On Wed, Jan 31, 2018 at 2:05 PM,  <frowand.list@gmail.com> wrote:
> From: Frank Rowand <frank.rowand@sony.com>
>
> Create a cache of the nodes that contain a phandle property.  Use this
> cache to find the node for a given phandle value instead of scanning
> the devicetree to find the node.  If the phandle value is not found
> in the cache, of_find_node_by_phandle() will fall back to the tree
> scan algorithm.
>
> The cache is initialized in of_core_init().
>
> The cache is freed via a late_initcall_sync().
>
> Signed-off-by: Frank Rowand <frank.rowand@sony.com>
> ---
>
> Some of_find_by_phandle() calls may occur before the cache is
> initialized or after it is freed.  For example, for the qualcomm
> qcom-apq8074-dragonboard, 11 calls occur before the initialization
> and 80 occur after the cache is freed (out of 516 total calls.)

We should be able to do this earlier. We already walk the tree twice
in unflattening. We can get the max phandle (or number of phandles
IMO) on the first pass, allocate with the early allocator and then
populate the cache in the 2nd pass. AIUI, you can alloc with memblock
and then free with kfree as the memblock allocations get transferred
to the slab.

Rob
Frank Rowand Feb. 1, 2018, 7:10 p.m. UTC | #9
On 02/01/18 02:31, Chintan Pandya wrote:
> 
>>> Anyways, will fix this locally and share test results.
>>
>> Thanks, I look forward to the results.
>>
> 
> Set up for this time was slightly different. So, taken all the numbers again.
> 
> Boot to shell time (in ms): Experiment 2
> [1] Base            : 14.843805 14.784842 14.842338
> [2] 64 sized fixed cache    : 14.189292 14.200003 14.266711
> [3] Dynamic freeable cache    : 14.112412 14.064772 14.036052
> 
> So, [3] (this patch) looks to be improving 750ms (on avg from base build).
> 

Is this with the many debug options enabled?  If so, can you repeat with
a normal configuration?

Thanks,

Frank
Frank Rowand Feb. 1, 2018, 9:09 p.m. UTC | #10
On 02/01/18 06:24, Rob Herring wrote:
> On Wed, Jan 31, 2018 at 3:43 PM, Frank Rowand <frowand.list@gmail.com> wrote:
>> On 01/31/18 12:05, frowand.list@gmail.com wrote:
>>> From: Frank Rowand <frank.rowand@sony.com>
>>>
>>> Create a cache of the nodes that contain a phandle property.  Use this
>>> cache to find the node for a given phandle value instead of scanning
>>> the devicetree to find the node.  If the phandle value is not found
>>> in the cache, of_find_node_by_phandle() will fall back to the tree
>>> scan algorithm.
>>>
>>> The cache is initialized in of_core_init().
>>>
>>> The cache is freed via a late_initcall_sync().
>>>
>>> Signed-off-by: Frank Rowand <frank.rowand@sony.com>
>>> ---
>>>
>>> Some of_find_by_phandle() calls may occur before the cache is
>>> initialized or after it is freed.  For example, for the qualcomm
>>> qcom-apq8074-dragonboard, 11 calls occur before the initialization
>>> and 80 occur after the cache is freed (out of 516 total calls.)
>>>
>>>
>>>  drivers/of/base.c       | 85 ++++++++++++++++++++++++++++++++++++++++++++++---
>>>  drivers/of/of_private.h |  5 +++
>>>  drivers/of/resolver.c   | 21 ------------
>>>  3 files changed, 86 insertions(+), 25 deletions(-)
>>
>> Some observations....
>>
>> The size of the cache for a normal device tree would be a couple of
>> words of overhead for the cache, plus one pointer per devicetree node
>> that contains a phandle property.  This will be less space than
>> would be used by adding a hash field to each device node.  It is
>> also less space than was used by the older algorithm (long gone)
>> that added a linked list through the nodes that contained a
>> phandle property.
>>
>> This is assuming that the values of the phandle properties are
>> the default ones created by the dtc compiler.  In the case
>> where a very large phandle property value is hand-coded in
>> a devicetree source, the size of the cache is capped at one
>> entry per node.  In this case, a little bit of space will be
>> wasted -- but this is just a sanity fallback, it should not
>> be encountered, and can be fixed by fixing the devicetree
>> source.
> 
> I don't think we should rely on how dtc allocates phandles. dtc is not
> the only source of DeviceTrees. If we could do that, then lets make

It seems like a reasonable thing to rely on.  dtc is the in-tree
compiler to create an FDT.

Are you thinking about the IBM PPC devicetrees as devicetrees created
in some manner other than dtc?  Are there other examples you are
aware of?

If non-dtc tools create phandle property values that are not a
contiguous range of values starting with one, then the devicetrees
they create may not benefit from this performance optimization.
But no user of such a devicetree is complaining about performance
issues with of_find_node_by_phandle() against their tree.  So until
there is an issue, no big deal.

In my effort to create a new version of the FDT, I would like to
include a rule to the effect of "phandle property values created
by the compiler _should_ be in the range of 1..n, where n is the
number of phandle properties in the tree".  That would provide
some assurance of future trees being able to benefit from this
specific optimization.

Also, this specific implementation to decrease the cost of
of_find_node_by_phandle() is just an implementation, not an
architecture.  Other implementations to achieve the same goal
have existed in the past, and yet other methods could replace
this one in the future if needed.


> them have some known flag in the upper byte so we have some hint for
> phandle values. 2^24 phandles should be enough for anyone.TM

I don't understand.  What is the definition of the flag?  A flag
that says the phandle property values are in the range of 1..n,
where n is the number of phandle properties in the tree?


> Your cache size is also going to balloon if the dtb was built with
> '-@'.

"Balloon" is a bit strong.  Worst case is one entry per node,
which is comparable to the old method of a linked list of nodes
with phandle properties but with lower of_find_node_by_phandle()
cost than the linked list implementation.  And this assumes that
every node has a label on it.

< snip (retracted) >


> Freeing after boot is nice, but if someone has lots of modules or
> large overlays, this doesn't help them at all.

The cache has to be regenerated anyway after applying an overlay
that adds phandle properties to the live tree.  Modules is something
I thought about, but did not want to complicate the patch until we
decided if this was a good direction to follow.  Some ways to deal
with overlays could be: don't auto-free the cache if modules are
configured in the kernel, repopulate the cache any time a module
is loaded, add a boot command line option to specify "do not free
the cache" (or alternatively, do not automatically free the cache
but provide an option of "do free the cache").

For now this seems like a lot of complexity for a non-problem.
And we don't even have the performance numbers yet to see if
this solves the reported problem.  I'd prefer to start simple,
and add complexity if needed.


> There's still more tweaks we could do with a cache based (i.e. can
> miss) approach. We could have an access count or some most recently
> used list to avoid evicting frequently accessed phandles (your data
> tends to indicate that would help). We could have cache sets. 

That seems like a lot of complexity for little or no gain.

I actually like the elegance of the patch you created, thinking that
the cache population and freeing code in my patch added a level of
complexity.  In the end, I think the reduced overhead of my
approach supports the slight increase in complexity.


> And so
> far, no one has explained why a bigger cache got slower.

Yes, I still find that surprising.



> Or we could do something decentralized and make the frequent callers
> cache their phandles.

That could be a good solution if we have just one or two uses of
of_find_node_by_phandle() that are the cause of the long boot time.

That is why I was trying to get Chintan to examine and understand the
use(s) of of_find_node_by_phandle() that were causing the slow boot, or if
the problem is dispersed to a wide variety of callers.

> 
> Rob
>
Frank Rowand Feb. 1, 2018, 9:13 p.m. UTC | #11
On 02/01/18 06:34, Rob Herring wrote:
> On Wed, Jan 31, 2018 at 2:05 PM,  <frowand.list@gmail.com> wrote:
>> From: Frank Rowand <frank.rowand@sony.com>
>>
>> Create a cache of the nodes that contain a phandle property.  Use this
>> cache to find the node for a given phandle value instead of scanning
>> the devicetree to find the node.  If the phandle value is not found
>> in the cache, of_find_node_by_phandle() will fall back to the tree
>> scan algorithm.
>>
>> The cache is initialized in of_core_init().
>>
>> The cache is freed via a late_initcall_sync().
>>
>> Signed-off-by: Frank Rowand <frank.rowand@sony.com>
>> ---
>>
>> Some of_find_by_phandle() calls may occur before the cache is
>> initialized or after it is freed.  For example, for the qualcomm
>> qcom-apq8074-dragonboard, 11 calls occur before the initialization
>> and 80 occur after the cache is freed (out of 516 total calls.)
> 
> We should be able to do this earlier. We already walk the tree twice
> in unflattening. We can get the max phandle (or number of phandles
> IMO) on the first pass, allocate with the early allocator and then
> populate the cache in the 2nd pass. AIUI, you can alloc with memblock
> and then free with kfree as the memblock allocations get transferred
> to the slab.

Thanks for pointing out that kfree() can be used for memory alloced
with memblock.  I'll change to populate the cache earlier.
Rob Herring Feb. 2, 2018, 3:45 a.m. UTC | #12
On Thu, Feb 1, 2018 at 3:09 PM, Frank Rowand <frowand.list@gmail.com> wrote:
> On 02/01/18 06:24, Rob Herring wrote:
>> On Wed, Jan 31, 2018 at 3:43 PM, Frank Rowand <frowand.list@gmail.com> wrote:
>>> On 01/31/18 12:05, frowand.list@gmail.com wrote:
>>>> From: Frank Rowand <frank.rowand@sony.com>
>>>>
>>>> Create a cache of the nodes that contain a phandle property.  Use this
>>>> cache to find the node for a given phandle value instead of scanning
>>>> the devicetree to find the node.  If the phandle value is not found
>>>> in the cache, of_find_node_by_phandle() will fall back to the tree
>>>> scan algorithm.
>>>>
>>>> The cache is initialized in of_core_init().
>>>>
>>>> The cache is freed via a late_initcall_sync().
>>>>
>>>> Signed-off-by: Frank Rowand <frank.rowand@sony.com>
>>>> ---
>>>>
>>>> Some of_find_by_phandle() calls may occur before the cache is
>>>> initialized or after it is freed.  For example, for the qualcomm
>>>> qcom-apq8074-dragonboard, 11 calls occur before the initialization
>>>> and 80 occur after the cache is freed (out of 516 total calls.)
>>>>
>>>>
>>>>  drivers/of/base.c       | 85 ++++++++++++++++++++++++++++++++++++++++++++++---
>>>>  drivers/of/of_private.h |  5 +++
>>>>  drivers/of/resolver.c   | 21 ------------
>>>>  3 files changed, 86 insertions(+), 25 deletions(-)
>>>
>>> Some observations....
>>>
>>> The size of the cache for a normal device tree would be a couple of
>>> words of overhead for the cache, plus one pointer per devicetree node
>>> that contains a phandle property.  This will be less space than
>>> would be used by adding a hash field to each device node.  It is
>>> also less space than was used by the older algorithm (long gone)
>>> that added a linked list through the nodes that contained a
>>> phandle property.
>>>
>>> This is assuming that the values of the phandle properties are
>>> the default ones created by the dtc compiler.  In the case
>>> where a very large phandle property value is hand-coded in
>>> a devicetree source, the size of the cache is capped at one
>>> entry per node.  In this case, a little bit of space will be
>>> wasted -- but this is just a sanity fallback, it should not
>>> be encountered, and can be fixed by fixing the devicetree
>>> source.
>>
>> I don't think we should rely on how dtc allocates phandles. dtc is not
>> the only source of DeviceTrees. If we could do that, then lets make
>
> It seems like a reasonable thing to rely on.  dtc is the in-tree
> compiler to create an FDT.
>
> Are you thinking about the IBM PPC devicetrees as devicetrees created
> in some manner other than dtc?  Are there other examples you are
> aware of?

There's that and any other platform with real OF. There's also the BSD
implementation of dtc.

> If non-dtc tools create phandle property values that are not a
> contiguous range of values starting with one, then the devicetrees
> they create may not benefit from this performance optimization.
> But no user of such a devicetree is complaining about performance
> issues with of_find_node_by_phandle() against their tree.  So until
> there is an issue, no big deal.

All I'm really saying is mask the low bits like I did. Then it works
equally well for any continuous range. Yes, someone could allocate in
multiples of 1024 or something and it wouldn't work well (still works,
but misses). Then we're really only debating dynamically sizing it and
whether to free it.

> If my effort to create a new version of the FDT, I would like to
> include a rule to the effect of "phandle property values created
> by the compiler _should_ be in the range of 1..n, where n is the
> number of phandle properties in the tree".  That would provide
> some assurance of future trees being able to benefit from this
> specific optimization.

Did you think of that before this issue? :)

> Also, this specific implementation to decrease the cost of
> of_find_node_by_phandle() is just an implementation, not an
> architecture.  Other implementations to achieve the same goal
> have existed in the past, and yet other methods could replace
> this one in the future if needed.
>
>
>> them have some known flag in the upper byte so we have some hint for
>> phandle values. 2^24 phandles should be enough for anyone.TM
>
> I don't understand.  What is the definition of the flag?  A flag
> that says the phandle property values are in the range of 1..n,
> where n is the number of phandle properties in the tree?

If we defined that phandles have values of say "0xABxxxxxx", then we
could use that for parsing properties without looking up #*-cells.
Maybe you encode the cell count too. Yes, you'd have to handle
possible collisions, but it would be better than nothing. My point is
that we don't do this because then we'd be making assumptions on
phandle values. We can't make assumptions because the dtbs already
exist and dtc is only one of the things generating phandles I can
change.

>> Your cache size is also going to balloon if the dtb was built with
>> '-@'.
>
> "Balloon" is a bit strong.  Worst case is one entry per node,
> which is comparable to the old method of a linked list of nodes
> with phandle properties but with lower of_find_node_by_phandle()
> cost than the linked list implementation.  And this assumes that
> every node has a label on it.
>
> < snip (retracted) >
>
>
>> Freeing after boot is nice, but if someone has lots of modules or
>> large overlays, this doesn't help them at all.
>
> The cache has to be regenerated anyway after applying an overlay
> that adds phandle properties to the live tree. Modules is something
> I thought about, but did not want to complicate the patch until we
> decided if this was a good direction to follow.  Some ways to deal
> with overlays could be: don't auto-free the cache if modules are
> configured in the kernel, repopulate the cache any time a module
> is loaded, add a boot command line option to specify "do not free
> the cache" (or alternatively, do not automatically free the cache
> but provide an option of "do free the cache").

Or just make the cache small enough to not matter.

> For now this seems like a lot of complexity for a non-problem.
> And we don't even have the performance numbers yet to see if
> this solves the reported problem.  I'd prefer to start simple,
> and add complexity if needed.

I agree. I'm just saying there's a path towards further improvements if need be.

>> There's still more tweaks we could do with a cache based (i.e. can
>> miss) approach. We could have an access count or some most recently
>> used list to avoid evicting frequently accessed phandles (your data
>> tends to indicate that would help). We could have cache sets.
>
> That seems like a lot of complexity for little or no gain.
>
> I actually like the elegance of the patch you created, thinking that
> the cache population and freeing code in my patch added a level of
> complexity.  In the end, I think the reduced overhead of my
> approach supports the slight increase in complexity.

How is your approach less overhead? The fastest so far is an 850ms
improvement with a 1024 entry, pre-populated cache. Next is yours (an
array of all phandles) at 750ms. Then a 64-entry cache is 650ms
improvement.

I'm still of the opinion that the 64 entry cache is good enough in the
trade off of speed and size.

Rob
Chintan Pandya Feb. 2, 2018, 5:53 a.m. UTC | #13
On 2/2/2018 2:39 AM, Frank Rowand wrote:
> On 02/01/18 06:24, Rob Herring wrote:
>> And so
>> far, no one has explained why a bigger cache got slower.
> 
> Yes, I still find that surprising.

I thought a bit about this. And realized that increasing the cache size 
should help improve the performance only if there are too many misses 
with the smaller cache. So, from my experiments some time back, I looked 
up the logs and saw the access pattern. Seems like, there is 
*not_too_much* juggling during look up by phandles.

See the access pattern here: 
https://drive.google.com/file/d/1qfAD8OsswNJABgAwjJf6Gr_JZMeK7rLV/view?usp=sharing

Sample log is pasted below where number in the last is phandle value.
	Line 8853: [   37.425405] OF: want to search this 262
	Line 8854: [   37.425453] OF: want to search this 262
	Line 8855: [   37.425499] OF: want to search this 262
	Line 8856: [   37.425549] OF: want to search this 15
	Line 8857: [   37.425599] OF: want to search this 5
	Line 8858: [   37.429989] OF: want to search this 253
	Line 8859: [   37.430058] OF: want to search this 253
	Line 8860: [   37.430217] OF: want to search this 253
	Line 8861: [   37.430278] OF: want to search this 253
	Line 8862: [   37.430337] OF: want to search this 253
	Line 8863: [   37.430399] OF: want to search this 254
	Line 8864: [   37.430597] OF: want to search this 254
	Line 8865: [   37.430656] OF: want to search this 254


Above explains why results with cache size 64 and 128 have almost 
similar results. Now, for cache size 256 we have degrading performance. 
I don't have a good theory here but I'm assuming that by making large SW 
cache, we miss the benefits of real HW cache which is typically smaller 
than our array size. Also, in my set up, I've set max_cpu=1 to reduce 
the variance. That again, should affect the cache holding pattern in HW 
and affect the perf numbers.


Chintan
Chintan Pandya Feb. 2, 2018, 5:59 a.m. UTC | #14
On 2/2/2018 12:40 AM, Frank Rowand wrote:
> On 02/01/18 02:31, Chintan Pandya wrote:
>>
>>>> Anyways, will fix this locally and share test results.
>>>
>>> Thanks, I look forward to the results.
>>>
>>
>> Set up for this time was slightly different. So, taken all the numbers again.
>>
>> Boot to shell time (in seconds): Experiment 2
>> [1] Base            : 14.843805 14.784842 14.842338
>> [2] 64 sized fixed cache    : 14.189292 14.200003 14.266711
>> [3] Dynamic freeable cache    : 14.112412 14.064772 14.036052
>>
>> So, [3] (this patch) looks to be improving 750ms (on avg from base build).
>>
> 
> Is this with the many debug options enabled?  If so, can you repeat with
> a normal configuration?

Could you share me the point of doing this experiment in perf mode ? I 
don't have a set up for taking these numbers in perf mode. For that, I 
need to ask some other team and round trip follow ups. In my set up, I 
rely on serial console logging which gets disabled in perf mode.

> 
> Thanks,
> 
> Frank
> 

Chintan
Frank Rowand Feb. 2, 2018, 9:26 p.m. UTC | #15
On 02/01/18 21:59, Chintan Pandya wrote:
> 
> 
> On 2/2/2018 12:40 AM, Frank Rowand wrote:
>> On 02/01/18 02:31, Chintan Pandya wrote:
>>>
>>>>> Anyways, will fix this locally and share test results.
>>>>
>>>> Thanks, I look forward to the results.
>>>>
>>>
>>> Set up for this time was slightly different. So, taken all the numbers again.
>>>
>>> Boot to shell time (in seconds): Experiment 2
>>> [1] Base            : 14.843805 14.784842 14.842338
>>> [2] 64 sized fixed cache    : 14.189292 14.200003 14.266711
>>> [3] Dynamic freeable cache    : 14.112412 14.064772 14.036052
>>>
>>> So, [3] (this patch) looks to be improving 750ms (on avg from base build).
>>>
>>
>> Is this with the many debug options enabled?  If so, can you repeat with
>> a normal configuration?
> 
> Could you share me the point of doing this experiment in perf mode ?


You had mentioned earlier in another thread:

   My recent results were taken on debug_defconfig which has many performance
   slowing code. So, gap between base-build and w/ the test patches would be
   more than the actual production build.

If you measure a large performance gain with a debug configuration, that
may not represent the actual gain you will get with a production
configuration, as you noted.

My question was trying to determine whether the numbers reported above
are for a debug configuration or a production configuration.  And if
not a production configuration, I was requesting the numbers for a
production configuration.  If the production configuration does not
show a significant boot time reduction from the patch then there is
less justification for adding complexity to the existing code.  I
prefer to use simpler data structures and algorithms __if__ extra
complexity does not provide any advantage.  The balance between
complexity and benefits is a core software engineering issue.


> I don't have a set up for taking these numbers in perf mode. For
> that, I need to ask some other team and round trip follow ups. In my
> set up, I rely on serial console logging which gets disabled in perf
> mode.
> 
>>
>> Thanks,
>>
>> Frank
>>
> 
> Chintan
Frank Rowand Feb. 2, 2018, 10:34 p.m. UTC | #16
On 02/01/18 19:45, Rob Herring wrote:
> On Thu, Feb 1, 2018 at 3:09 PM, Frank Rowand <frowand.list@gmail.com> wrote:
>> On 02/01/18 06:24, Rob Herring wrote:
>>> On Wed, Jan 31, 2018 at 3:43 PM, Frank Rowand <frowand.list@gmail.com> wrote:
>>>> On 01/31/18 12:05, frowand.list@gmail.com wrote:
>>>>> From: Frank Rowand <frank.rowand@sony.com>
>>>>>
>>>>> Create a cache of the nodes that contain a phandle property.  Use this
>>>>> cache to find the node for a given phandle value instead of scanning
>>>>> the devicetree to find the node.  If the phandle value is not found
>>>>> in the cache, of_find_node_by_phandle() will fall back to the tree
>>>>> scan algorithm.
>>>>>
>>>>> The cache is initialized in of_core_init().
>>>>>
>>>>> The cache is freed via a late_initcall_sync().
>>>>>
>>>>> Signed-off-by: Frank Rowand <frank.rowand@sony.com>
>>>>> ---
>>>>>
>>>>> Some of_find_node_by_phandle() calls may occur before the cache is
>>>>> initialized or after it is freed.  For example, for the qualcomm
>>>>> qcom-apq8074-dragonboard, 11 calls occur before the initialization
>>>>> and 80 occur after the cache is freed (out of 516 total calls.)
>>>>>
>>>>>
>>>>>  drivers/of/base.c       | 85 ++++++++++++++++++++++++++++++++++++++++++++++---
>>>>>  drivers/of/of_private.h |  5 +++
>>>>>  drivers/of/resolver.c   | 21 ------------
>>>>>  3 files changed, 86 insertions(+), 25 deletions(-)
>>>>
>>>> Some observations....
>>>>
>>>> The size of the cache for a normal device tree would be a couple of
>>>> words of overhead for the cache, plus one pointer per devicetree node
>>>> that contains a phandle property.  This will be less space than
>>>> would be used by adding a hash field to each device node.  It is
>>>> also less space than was used by the older algorithm (long gone)
>>>> that added a linked list through the nodes that contained a
>>>> phandle property.
>>>>
>>>> This is assuming that the values of the phandle properties are
>>>> the default ones created by the dtc compiler.  In the case
>>>> where a very large phandle property value is hand-coded in
>>>> a devicetree source, the size of the cache is capped at one
>>>> entry per node.  In this case, a little bit of space will be
>>>> wasted -- but this is just a sanity fallback, it should not
>>>> be encountered, and can be fixed by fixing the devicetree
>>>> source.
>>>
>>> I don't think we should rely on how dtc allocates phandles. dtc is not
>>> the only source of DeviceTrees. If we could do that, then lets make
>>
>> It seems like a reasonable thing to rely on.  dtc is the in-tree
>> compiler to create an FDT.
>>
>> Are you thinking about the IBM PPC devicetrees as devicetrees created
>> in some manner other than dtc?  Are there other examples you are
>> aware of?
> 
> There's that and any other platform with real OF. There's also the BSD
> implementation of dtc.
> 
>> If non-dtc tools create phandle property values that are not a
>> contiguous range of values starting with one, then the devicetrees
>> they create may not benefit from this performance optimization.
>> But no user of such a devicetree is complaining about performance
>> issues with of_find_node_by_phandle() against their tree.  So until
>> there is an issue, no big deal.
> 
> All I'm really saying is mask the low bits like I did. Then it works
> equally well for any continuous range. Yes, someone could allocate in
> multiples of 1024 or something and it wouldn't work well (still works,
> but misses). Then we're really only debating dynamically sizing it and
> whether to free it.

I see what you are saying.

Are you aware of any compiler that creates phandle property values
in this manner?  If so, then there is a complexity vs benefit
analysis to be done.  If not, there is no need to code for a
what-if scenario that would not have any negative consequences
(other than not being able to take advantage of a performance
enhancement).


>> In my effort to create a new version of the FDT, I would like to
>> include a rule to the effect of "phandle property values created
>> by the compiler _should_ be in the range of 1..n, where n is the
>> number of phandle properties in the tree".  That would provide
>> some assurance of future trees being able to benefit from this
>> specific optimization.
> 
> Did you think of that before this issue? :)

I don't remember, but I doubt it.  Not that it has any bearing
on this.  :-)


>> Also, this specific implementation to decrease the cost of
>> of_find_node_by_phandle() is just an implementation, not an
>> architecture.  Other implementations to achieve the same goal
>> have existed in the past, and yet other methods could replace
>> this one in the future if needed.
>>
>>
>>> them have some known flag in the upper byte so we have some hint for
>>> phandle values. 2^24 phandles should be enough for anyone.TM
>>
>> I don't understand.  What is the definition of the flag?  A flag
>> that says the phandle property values are in the range of 1..n,
>> where n is the number of phandle properties in the tree?
> 
> If we defined that phandles have values of say "0xABxxxxxx", then we
> could use that for parsing properties without looking up #*-cells.

For those at home reading along, this would reduce the number of
of_find_node_by_phandle() calls.


> Maybe you encode the cell count too. Yes, you'd have to handle
> possible collisions, but it would be better than nothing. My point is
> that we don't do this because then we'd be making assumptions on

Yes, that would be really fragile.


> phandle values. We can't make assumptions because the dtbs already
> exist and dtc is only one of the things generating phandles I can
> change.

That is a different argument.  It is arguing that making assumptions
about phandle values could lead to incorrect results.  My argument
is that making assumptions about phandle values means that a dtb
that does not meet these assumptions will still be interpretted
correctly, but will not be able to benefit from a performance
enhancement.

The performance enhancement becomes available to that devicetree
simply by recompiling with a compiler that matches the phandle
property value assumption.


>>> Your cache size is also going to balloon if the dtb was built with
>>> '-@'.
>>
>> "Balloon" is a bit strong.  Worst case is one entry per node,
>> which is comparable to the old method of a linked list of nodes
>> with phandle properties but with lower of_find_node_by_phandle()
>> cost than the linked list implementation.  And this assumes that
>> every node has a label on it.
>>
>> < snip (retracted) >
>>
>>
>>> Freeing after boot is nice, but if someone has lots of modules or
>>> large overlays, this doesn't help them at all.
>>
>> The cache has to be regenerated anyway after applying an overlay
>> that adds phandle properties to the live tree. Modules is something
>> I thought about, but did not want to complicate the patch until we
>> decided if this was a good direction to follow.  Some ways to deal
>> with overlays could be: don't auto-free the cache if modules are
>> configured in the kernel, repopulate the cache any time a module
>> is loaded, add a boot command line option to specify "do not free
>> the cache" (or alternatively, do not automatically free the cache
>> but provide an option of "do free the cache").
> 
> Or just make the cache small enough to not matter.

I would argue that the cache is small enough to not matter because
it is dynamically sized, based on the complexity of the devicetree.
A devicetree with few phandle properties is unlikely to have a
noticable boot time reduction, but also has a very small cache.
A devicetree with a large number of phandle properties is likely
to have a noticable boot time reduction (if the current test
reports bear out), but such a system is also likely to have more
memory and not be sensitive to the memory used by the cache (which
is still quite small).

For the system which started this discussion, the memory size is 6 GB.

The alternate approaches for choosing whether to free the cache
or not are available _if_ we decide the memory usage is an
issue.


>> For now this seems like a lot of complexity for a non-problem.
>> And we don't even have the performance numbers yet to see if
>> this solves the reported problem.  I'd prefer to start simple,
>> and add complexity if needed.
> 
> I agree. I'm just saying there's a path towards further improvements if need be.

Yes.  And I expect some to be pursued in the future, when it makes
sense to (eg small additions to make this more useful for overlays).

And it is an interesting discussion looking at the alternatives.


>>> There's still more tweaks we could do with a cache based (i.e. can
>>> miss) approach. We could have an access count or some most recently
>>> used list to avoid evicting frequently accessed phandles (your data
>>> tends to indicate that would help). We could have cache sets.
>>
>> That seems like a lot of complexity for little or no gain.
>>
>> I actually like the elegance of the patch you created, thinking that
>> the cache population and freeing code in my patch added a level of
>> complexity.  In the end, I think the reduced overhead of my
>> approach supports the slight increase in complexity.
> 
> How is your approach less overhead? The fastest so far is an 850ms
> improvement with a 1024 entry, pre-populated cache. Next is yours (an
> array of all phandles) at 750ms. Then a 64-entry cache is 650ms
> improvement.

Those values are not comparable.  The 850ms improvement was against a
base boot time of 15.04, 15.24, and 15.26 seconds.  The 750ms was against a
base boot time of 14.78, 14.84, and 14.84 seconds.  Different base boot
times says something was different that _could_ affect the measured
improvement for the different approaches.

Being really simplistic (eg ignoring L1 cache size, which is reported to
be 32KB per CPU, and L2 cache size, reported to be 128KB per cpu, and thus
_could_ be a factor given my approach has a phandle cache size of
814 * 4 = 3256 bytes if 32 bit pointers or 6512 bytes if 64 bit pointers)...

My algorithm...

   - two scans of the devicetree nodes to populate the cache
     O(2 * number of nodes)

   - an O(1) cache access time.

   - memory overhead: one pointer per node that contains a phandle
     property, plus a couple overhead head words: ~3256 bytes
     or ~6512 bytes for this specific devicetree

Your algorithm, if the cache is large enough to hold all phandle
values without any collision...

   - one scan of the devicetree per phandle property accessed,
     each scan is of half of the devicetree nodes (assumes nodes with
     a phandle property are evenly distributed throughout the devicetree)
     to populate the cache
     O(number of nodes with phandle properties) * (1/2 * number of nodes)

   - each time a phandle cache entry has a collision and a new value has
     to be determined repeats an O(1/2 * number of nodes) scan

   - an O(1) cache access time

   - memory overhead: depends on the implementation chosen, but could
     be as small as 64 * 4 = 256 bytes (or 64 * 8 = 512 bytes)

My algorithm: fewer memory accesses, more memory used

Your algorithm: more memory accesses, less memory used

But for small devicetrees (less than 64 nodes with phandle properties)
my algorithm uses less memory for the phandle cache.


> I'm still of the opinion that the 64 entry cache is good enough in the
> trade off of speed and size.

That is an arbitrary size that may or may not scale with the number of
phandle properties in a devicetree.  That is most likely dependent upon
the access pattern of each phandle value.  Chintan sent some data that
allows us to look at that for his specific system.  There are some time
ranges in the boot that show a lot of temporal locality and there are
some ranges that do not.  I'll reply to that email separately.


> 
> Rob
>
Frank Rowand Feb. 3, 2018, 3:55 a.m. UTC | #17
On 02/01/18 21:53, Chintan Pandya wrote:
> 
> 
> On 2/2/2018 2:39 AM, Frank Rowand wrote:
>> On 02/01/18 06:24, Rob Herring wrote:
>>> And so
>>> far, no one has explained why a bigger cache got slower.
>>
>> Yes, I still find that surprising.
> 
> I thought a bit about this. And realized that increasing the cache size should help improve the performance only if there are too many misses with the smaller cache. So, from my experiments some time back, I looked up the logs and saw the access pattern. Seems like, there is *not_too_much* juggling during look up by phandles.
> 
> See the access pattern here: https://drive.google.com/file/d/1qfAD8OsswNJABgAwjJf6Gr_JZMeK7rLV/view?usp=sharing

Thanks!  Very interesting.

I was somewhat limited at playing detective with this, because the phandle
values are not consistent with the dts file you are currently working
with (arch/arm64/boot/dts/qcom/sda670-mtp.dts).  For example, I could
not determine what the target nodes for the hot phandle values.  That
information _could_ possibly point at algorithms within the devicetree
core code that could be improved.  Or maybe not.  Hard to tell until
actually looking at the data.

Anyway, some observations were possible.

There are 485 unique phandle values searched for.

The ten phandle values most frequently referenced account for
3932 / 6745 (or 58%) of all references.

Without the corresponding devicetree I can not tell how many nodes
need to be scanned to locate each of these ten values (using the
existing algorithm).  Thus I can not determine how much scanning
would be eliminated by caching just the nodes corresponding to
these ten phandle values.

There are 89 phandle values that were searched for 10 times
or more, accounting for 86% of the searches.

Only 164 phandle values were searched for just one time.

303 phandle values were searched for just one or two times.

Here is a more complete picture:

  10 values each used 100 or more times; searches:  3932   58%
  11 values each used  90 or more times; searches:  3994   59%
  12 values each used  80 or more times; searches:  4045   60%
  13 values each used  70 or more times; searches:  4093   61%
  14 values each used  60 or more times; searches:  4136   61%
  15 values each used  50 or more times; searches:  4178   62%
  18 values each used  40 or more times; searches:  4300   64%
  32 values each used  30 or more times; searches:  4774   71%
  54 values each used  20 or more times; searches:  5293   78%
  89 values each used  10 or more times; searches:  5791   86%
  93 values each used   9 or more times; searches:  5827   86%
 117 values each used   8 or more times; searches:  6019   89%
 122 values each used   7 or more times; searches:  6054   90%
 132 values each used   6 or more times; searches:  6114   91%
 144 values each used   5 or more times; searches:  6174   92%
 162 values each used   4 or more times; searches:  6246   93%
 181 values each used   3 or more times; searches:  6303   93%
 320 values each used   2 or more times; searches:  6581   98%
 484 values each used   1 or more times; searches:  6746  100%

A single system does not prove anything.  It is possible that
other devicetrees would exhibit similarly long tailed behavior,
but that is just wild speculation on my part.

_If_ the long tail is representative of other systems, then
identifying a few hot spots could be useful, but fixing them
is not likely to significantly reduce the overhead of calls
to of_find_node_by_phandle().  Some method of reducing the
overhead of each call would be the answer for a system of
this class.


> Sample log is pasted below where number in the last is phandle value.
>     Line 8853: [   37.425405] OF: want to search this 262
>     Line 8854: [   37.425453] OF: want to search this 262
>     Line 8855: [   37.425499] OF: want to search this 262
>     Line 8856: [   37.425549] OF: want to search this 15
>     Line 8857: [   37.425599] OF: want to search this 5
>     Line 8858: [   37.429989] OF: want to search this 253
>     Line 8859: [   37.430058] OF: want to search this 253
>     Line 8860: [   37.430217] OF: want to search this 253
>     Line 8861: [   37.430278] OF: want to search this 253
>     Line 8862: [   37.430337] OF: want to search this 253
>     Line 8863: [   37.430399] OF: want to search this 254
>     Line 8864: [   37.430597] OF: want to search this 254
>     Line 8865: [   37.430656] OF: want to search this 254
> 
> 
> Above explains why results with cache size 64 and 128 have almost similar results. Now, for cache size 256 we have degrading performance. I don't have a good theory here but I'm assuming that by making large SW cache, we miss the benefits of real HW cache which is typically smaller than our array size. Also, in my set up, I've set max_cpu=1 to reduce the variance. That again, should affect the cache holding pattern in HW and affect the perf numbers.
> 
> 
> Chintan
Chintan Pandya Feb. 5, 2018, 12:23 p.m. UTC | #18
> 
> My question was trying to determine whether the numbers reported above
> are for a debug configuration or a production configuration.
My reported numbers are from debug configuration.

> not a production configuration, I was requesting the numbers for a
> production configuration.
I'm working on it. But please expect some delay in my response for this. 
As I mentioned earlier, I need to work with few teams to get these numbers.


> show a significant boot time reduction from the patch then there is
> less justification for adding complexity to the existing code.  I
> prefer to use simpler data structures and algorithms __if__ extra
> complexity does not provide any advantage.  The balance between
> complexity and benefits is a core software engineering issue.
> 
Ok
> 
>> I don't have a set up for taking these numbers in perf mode. For
>> that, I need to ask some other team and round trip follow ups. In my
>> set up, I rely on serial console logging which gets disabled in perf
>> mode.
>>
>>>
>>> Thanks,
>>>
>>> Frank
>>>
>>
>> Chintan
> 

Chintan
Chintan Pandya Feb. 7, 2018, 12:44 p.m. UTC | #19
On 2/5/2018 5:53 PM, Chintan Pandya wrote:
> 
>>
>> My question was trying to determine whether the numbers reported above
>> are for a debug configuration or a production configuration.
> My reported numbers are from debug configuration.
> 
>> not a production configuration, I was requesting the numbers for a
>> production configuration.
> I'm working on it. But please expect some delay in my response for this. 
> As I mentioned earlier, I need to work with few teams to get these numbers.
> 
> 
>> show a significant boot time reduction from the patch then there is
>> less justification for adding complexity to the existing code.  I
>> prefer to use simpler data structures and algorithms __if__ extra
>> complexity does not provide any advantage.  The balance between
>> complexity and benefits is a core software engineering issue.
>>
> Ok

Avg Kernel boot time comparison in production set up:

[0] Base: 4519ms
[1] 4115ms (~400ms improvement)
[2] 4115ms (~400ms improvement)
[3] 4177ms (~340ms improvement)

Full data:
[1] 1024 sized pre-populated cache
ITR-1	ITR-2	ITR-3	ITR-4	Avg
4115	4123	4124	4107	4115

[2] Dynamic sized cache allocation/free
ITR-1	ITR-2	ITR-3	ITR-4	Avg
4122	4131	4106	4118	4115

[3] Fixed 64 sized cache
ITR-1	ITR-2	ITR-3	ITR-4	Avg
4153	4186	4198	4181	4177


[1] is my experimental patch and dirty enough to not get merged 
anywhere. So, I will not push it.



Chintan
Frank Rowand Feb. 7, 2018, 8:09 p.m. UTC | #20
On 02/07/18 04:44, Chintan Pandya wrote:
> 
> 
> On 2/5/2018 5:53 PM, Chintan Pandya wrote:
>>
>>>
>>> My question was trying to determine whether the numbers reported above
>>> are for a debug configuration or a production configuration.
>> My reported numbers are from debug configuration.
>>
>>> not a production configuration, I was requesting the numbers for a
>>> production configuration.
>> I'm working on it. But please expect some delay in my response for this. As I mentioned earlier, I need to work with few teams to get these numbers.
>>
>>
>>> show a significant boot time reduction from the patch then there is
>>> less justification for adding complexity to the existing code.  I
>>> prefer to use simpler data structures and algorithms __if__ extra
>>> complexity does not provide any advantage.  The balance between
>>> complexity and benefits is a core software engineering issue.
>>>
>> Ok
> 
> Avg Kernel boot time comparison in production set up:
> 
> [0] Base: 4519ms
> [1] 4115ms (~400ms improvement)
> [2] 4115ms (~400ms improvement)
> [3] 4177ms (~340ms improvement)
> 
> Full data:
> [1] 1024 sized pre-populated cache
> ITR-1    ITR-2    ITR-3    ITR-4    Avg
> 4115    4123    4124    4107    4115
> 
> [2] Dynamic sized cache allocation/free
> ITR-1    ITR-2    ITR-3    ITR-4    Avg
> 4122    4131    4106    4118    4115
> 
> [3] Fixed 64 sized cache
> ITR-1    ITR-2    ITR-3    ITR-4    Avg
> 4153    4186    4198    4181    4177
> 
> 
> [1] is my experimental patch and dirty enough to not get merged anywhere. So, I will not push it.
> 
> 
> 
> Chintan

Thank you very much.  This looks like a real improvement to me.

I'll rebase my patches, updated to address the comments, on 4.16-rc1.

-Frank

Patch
diff mbox series

diff --git a/drivers/of/base.c b/drivers/of/base.c
index 26618ba8f92a..c3091d0e391f 100644
--- a/drivers/of/base.c
+++ b/drivers/of/base.c
@@ -95,10 +95,14 @@  int __weak of_node_to_nid(struct device_node *np)
 }
 #endif
 
+static void of_populate_phandle_cache(void);
+
 void __init of_core_init(void)
 {
 	struct device_node *np;
 
+	of_populate_phandle_cache();
+
 	/* Create the kset, and register existing nodes */
 	mutex_lock(&of_mutex);
 	of_kset = kset_create_and_add("devicetree", NULL, firmware_kobj);
@@ -990,6 +994,72 @@  int of_modalias_node(struct device_node *node, char *modalias, int len)
 }
 EXPORT_SYMBOL_GPL(of_modalias_node);
 
+phandle live_tree_max_phandle(void)
+{
+	struct device_node *node;
+	phandle max_phandle;
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&devtree_lock, flags);
+	max_phandle = 0;
+	for_each_of_allnodes(node) {
+		if (node->phandle != OF_PHANDLE_ILLEGAL &&
+		    node->phandle > max_phandle)
+			max_phandle = node->phandle;
+	}
+	raw_spin_unlock_irqrestore(&devtree_lock, flags);
+
+	return max_phandle;
+}
+
+static struct device_node **phandle_cache;
+static u32 max_phandle_cache;
+
+static int __init of_free_phandle_cache(void)
+{
+	max_phandle_cache = 0;
+	kfree(phandle_cache);
+	phandle_cache = NULL;
+
+	return 0;
+}
+late_initcall_sync(of_free_phandle_cache);
+
+static void of_populate_phandle_cache(void)
+{
+	unsigned long flags;
+	phandle max_phandle;
+	u32 nodes = 0;
+	struct device_node *np;
+
+	if (phandle_cache)
+		return;
+
+	max_phandle = live_tree_max_phandle();
+
+	raw_spin_lock_irqsave(&devtree_lock, flags);
+
+	for_each_of_allnodes(np)
+		nodes++;
+
+	/* sanity cap for malformed tree */
+	if (max_phandle > nodes)
+		max_phandle = nodes;
+
+	phandle_cache = kzalloc((max_phandle + 1) * sizeof(*phandle_cache),
+				GFP_KERNEL);
+
+	for_each_of_allnodes(np)
+		if (np->phandle != OF_PHANDLE_ILLEGAL  &&
+		    np->phandle <= max_phandle &&
+		    np->phandle)
+			phandle_cache[np->phandle] = np;
+
+	max_phandle_cache = max_phandle;
+
+	raw_spin_unlock_irqrestore(&devtree_lock, flags);
+}
+
 /**
  * of_find_node_by_phandle - Find a node given a phandle
  * @handle:	phandle of the node to find
@@ -999,16 +1069,25 @@  int of_modalias_node(struct device_node *node, char *modalias, int len)
  */
 struct device_node *of_find_node_by_phandle(phandle handle)
 {
-	struct device_node *np;
+	struct device_node *np = NULL;
 	unsigned long flags;
 
 	if (!handle)
 		return NULL;
 
	raw_spin_lock_irqsave(&devtree_lock, flags);
-	for_each_of_allnodes(np)
-		if (np->phandle == handle)
-			break;
+
+	/* NULL check protects against a failed or freed cache allocation */
+	if (phandle_cache && handle <= max_phandle_cache)
+		np = phandle_cache[handle];
+
+	/* cache miss or stale entry: fall back to the full tree scan */
+	if (!np || np->phandle != handle) {
+		for_each_of_allnodes(np)
+			if (np->phandle == handle)
+				break;
+	}
+
 	of_node_get(np);
 	raw_spin_unlock_irqrestore(&devtree_lock, flags);
 	return np;
diff --git a/drivers/of/of_private.h b/drivers/of/of_private.h
index 92a9a3687446..77005978d60a 100644
--- a/drivers/of/of_private.h
+++ b/drivers/of/of_private.h
@@ -135,6 +135,11 @@  extern void __of_update_property_sysfs(struct device_node *np,
 extern void __of_sysfs_remove_bin_file(struct device_node *np,
 				       struct property *prop);
 
+/* illegal phandle value (set when unresolved) */
+#define OF_PHANDLE_ILLEGAL	0xdeadbeef
+
+extern phandle live_tree_max_phandle(void);
+
 /* iterators for transactions, used for overlays */
 /* forward iterator */
 #define for_each_transaction_entry(_oft, _te) \
diff --git a/drivers/of/resolver.c b/drivers/of/resolver.c
index cfaeef5f6cb1..0384ce8fdc3b 100644
--- a/drivers/of/resolver.c
+++ b/drivers/of/resolver.c
@@ -22,27 +22,6 @@ 
 
 #include "of_private.h"
 
-/* illegal phandle value (set when unresolved) */
-#define OF_PHANDLE_ILLEGAL	0xdeadbeef
-
-static phandle live_tree_max_phandle(void)
-{
-	struct device_node *node;
-	phandle phandle;
-	unsigned long flags;
-
-	raw_spin_lock_irqsave(&devtree_lock, flags);
-	phandle = 0;
-	for_each_of_allnodes(node) {
-		if (node->phandle != OF_PHANDLE_ILLEGAL &&
-				node->phandle > phandle)
-			phandle = node->phandle;
-	}
-	raw_spin_unlock_irqrestore(&devtree_lock, flags);
-
-	return phandle;
-}
-
 static void adjust_overlay_phandles(struct device_node *overlay,
 		int phandle_delta)
 {