* [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted
@ 2020-10-15  9:46 Eelco Chaudron
  2020-10-15 10:27 ` [ovs-dev] " Ilya Maximets
  2020-10-15 12:34 ` Sebastian Andrzej Siewior
  0 siblings, 2 replies; 8+ messages in thread
From: Eelco Chaudron @ 2020-10-15  9:46 UTC (permalink / raw)
  To: netdev; +Cc: davem, dev, kuba, pabeni, pshelar, jlelli, bigeasy

The flow_lookup() function uses per CPU variables, which must not be
preempted. However, this is fine in the general napi use case where
the local BH is disabled. But, it's also called in the netlink
context, which is preemptible. The below patch makes sure that even
in the netlink path, preemption is disabled.

In addition, the u64_stats_update_begin() sync point was not protected,
making the sync point part of the per CPU variable fixed this.

Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage")
Reported-by: Juri Lelli <jlelli@redhat.com>
Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
v2: - Add u64_stats_update_begin() sync point protection
    - Moved patch to net from net-next branch

 net/openvswitch/flow_table.c |   56 +++++++++++++++++++++++++-----------------
 net/openvswitch/flow_table.h |    8 +++++-
 2 files changed, 39 insertions(+), 25 deletions(-)

diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index e2235849a57e..d90b4af6f539 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -172,7 +172,7 @@ static struct table_instance *table_instance_alloc(int new_size)
 
 static void __mask_array_destroy(struct mask_array *ma)
 {
-	free_percpu(ma->masks_usage_cntr);
+	free_percpu(ma->masks_usage_stats);
 	kfree(ma);
 }
 
@@ -196,15 +196,15 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
 		ma->masks_usage_zero_cntr[i] = 0;
 
 		for_each_possible_cpu(cpu) {
-			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
-							  cpu);
+			struct mask_array_stats *stats;
 			unsigned int start;
 			u64 counter;
 
+			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
 			do {
-				start = u64_stats_fetch_begin_irq(&ma->syncp);
-				counter = usage_counters[i];
-			} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
+				start = u64_stats_fetch_begin_irq(&stats->syncp);
+				counter = stats->usage_cntrs[i];
+			} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
 
 			ma->masks_usage_zero_cntr[i] += counter;
 		}
@@ -227,9 +227,10 @@ static struct mask_array *tbl_mask_array_alloc(int size)
 					     sizeof(struct sw_flow_mask *) *
 					     size);
 
-	new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
-					       __alignof__(u64));
-	if (!new->masks_usage_cntr) {
+	new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
+						sizeof(u64) * size,
+						__alignof__(u64));
+	if (!new->masks_usage_stats) {
 		kfree(new);
 		return NULL;
 	}
@@ -732,7 +733,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
 				   u32 *n_cache_hit,
 				   u32 *index)
 {
-	u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
+	struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
 	struct sw_flow *flow;
 	struct sw_flow_mask *mask;
 	int i;
@@ -742,9 +743,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
 		if (mask) {
 			flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
 			if (flow) {
-				u64_stats_update_begin(&ma->syncp);
-				usage_counters[*index]++;
-				u64_stats_update_end(&ma->syncp);
+				u64_stats_update_begin(&stats->syncp);
+				stats->usage_cntrs[*index]++;
+				u64_stats_update_end(&stats->syncp);
 				(*n_cache_hit)++;
 				return flow;
 			}
@@ -763,9 +764,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
 		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
 		if (flow) { /* Found */
 			*index = i;
-			u64_stats_update_begin(&ma->syncp);
-			usage_counters[*index]++;
-			u64_stats_update_end(&ma->syncp);
+			u64_stats_update_begin(&stats->syncp);
+			stats->usage_cntrs[*index]++;
+			u64_stats_update_end(&stats->syncp);
 			return flow;
 		}
 	}
@@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
 	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
 	u32 __always_unused n_mask_hit;
 	u32 __always_unused n_cache_hit;
+	struct sw_flow *flow;
 	u32 index = 0;
 
-	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
+	/* This function gets called through the netlink interface and therefore
+	 * is preemptible. However, flow_lookup() function needs to be called
+	 * with preemption disabled due to CPU specific variables.
+	 */
+	local_bh_disable();
+	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
+	local_bh_enable();
+	return flow;
 }
 
 struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
@@ -1109,7 +1118,6 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
 
 	for (i = 0; i < ma->max; i++)  {
 		struct sw_flow_mask *mask;
-		unsigned int start;
 		int cpu;
 
 		mask = rcu_dereference_ovsl(ma->masks[i]);
@@ -1120,14 +1128,16 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
 		masks_and_count[i].counter = 0;
 
 		for_each_possible_cpu(cpu) {
-			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
-							  cpu);
+			struct mask_array_stats *stats;
+			unsigned int start;
 			u64 counter;
 
+			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
 			do {
-				start = u64_stats_fetch_begin_irq(&ma->syncp);
-				counter = usage_counters[i];
-			} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
+				start = u64_stats_fetch_begin_irq(&stats->syncp);
+				counter = stats->usage_cntrs[i];
+			} while (u64_stats_fetch_retry_irq(&stats->syncp,
+							   start));
 
 			masks_and_count[i].counter += counter;
 		}
diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
index 6e7d4ac59353..43144396e192 100644
--- a/net/openvswitch/flow_table.h
+++ b/net/openvswitch/flow_table.h
@@ -38,12 +38,16 @@ struct mask_count {
 	u64 counter;
 };
 
+struct mask_array_stats {
+	struct u64_stats_sync syncp;
+	u64 usage_cntrs[];
+};
+
 struct mask_array {
 	struct rcu_head rcu;
 	int count, max;
-	u64 __percpu *masks_usage_cntr;
+	struct mask_array_stats __percpu *masks_usage_stats;
 	u64 *masks_usage_zero_cntr;
-	struct u64_stats_sync syncp;
 	struct sw_flow_mask __rcu *masks[];
 };
 



* Re: [ovs-dev] [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted
  2020-10-15  9:46 [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted Eelco Chaudron
@ 2020-10-15 10:27 ` Ilya Maximets
  2020-10-15 10:54   ` Eelco Chaudron
  2020-10-15 12:34 ` Sebastian Andrzej Siewior
  1 sibling, 1 reply; 8+ messages in thread
From: Ilya Maximets @ 2020-10-15 10:27 UTC (permalink / raw)
  To: Eelco Chaudron, netdev
  Cc: i.maximets, dev, bigeasy, jlelli, kuba, pabeni, davem

On 10/15/20 11:46 AM, Eelco Chaudron wrote:
> The flow_lookup() function uses per CPU variables, which must not be
> preempted. However, this is fine in the general napi use case where
> the local BH is disabled. But, it's also called in the netlink
> context, which is preemptible. The below patch makes sure that even
> in the netlink path, preemption is disabled.
> 
> In addition, the u64_stats_update_begin() sync point was not protected,
> making the sync point part of the per CPU variable fixed this.
> 
> Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage")
> Reported-by: Juri Lelli <jlelli@redhat.com>
> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
> ---
> v2: - Add u64_stats_update_begin() sync point protection
>     - Moved patch to net from net-next branch
> 
>  net/openvswitch/flow_table.c |   56 +++++++++++++++++++++++++-----------------
>  net/openvswitch/flow_table.h |    8 +++++-
>  2 files changed, 39 insertions(+), 25 deletions(-)
> 
> diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
> index e2235849a57e..d90b4af6f539 100644
> --- a/net/openvswitch/flow_table.c
> +++ b/net/openvswitch/flow_table.c
> @@ -172,7 +172,7 @@ static struct table_instance *table_instance_alloc(int new_size)
>  
>  static void __mask_array_destroy(struct mask_array *ma)
>  {
> -	free_percpu(ma->masks_usage_cntr);
> +	free_percpu(ma->masks_usage_stats);
>  	kfree(ma);
>  }
>  
> @@ -196,15 +196,15 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
>  		ma->masks_usage_zero_cntr[i] = 0;
>  
>  		for_each_possible_cpu(cpu) {
> -			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
> -							  cpu);
> +			struct mask_array_stats *stats;
>  			unsigned int start;
>  			u64 counter;
>  
> +			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>  			do {
> -				start = u64_stats_fetch_begin_irq(&ma->syncp);
> -				counter = usage_counters[i];
> -			} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
> +				start = u64_stats_fetch_begin_irq(&stats->syncp);
> +				counter = stats->usage_cntrs[i];
> +			} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
>  
>  			ma->masks_usage_zero_cntr[i] += counter;
>  		}
> @@ -227,9 +227,10 @@ static struct mask_array *tbl_mask_array_alloc(int size)
>  					     sizeof(struct sw_flow_mask *) *
>  					     size);
>  
> -	new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
> -					       __alignof__(u64));
> -	if (!new->masks_usage_cntr) {
> +	new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
> +						sizeof(u64) * size,
> +						__alignof__(u64));
> +	if (!new->masks_usage_stats) {
>  		kfree(new);
>  		return NULL;
>  	}
> @@ -732,7 +733,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>  				   u32 *n_cache_hit,
>  				   u32 *index)
>  {
> -	u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
> +	struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
>  	struct sw_flow *flow;
>  	struct sw_flow_mask *mask;
>  	int i;
> @@ -742,9 +743,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>  		if (mask) {
>  			flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>  			if (flow) {
> -				u64_stats_update_begin(&ma->syncp);
> -				usage_counters[*index]++;
> -				u64_stats_update_end(&ma->syncp);
> +				u64_stats_update_begin(&stats->syncp);
> +				stats->usage_cntrs[*index]++;
> +				u64_stats_update_end(&stats->syncp);
>  				(*n_cache_hit)++;
>  				return flow;
>  			}
> @@ -763,9 +764,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>  		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>  		if (flow) { /* Found */
>  			*index = i;
> -			u64_stats_update_begin(&ma->syncp);
> -			usage_counters[*index]++;
> -			u64_stats_update_end(&ma->syncp);
> +			u64_stats_update_begin(&stats->syncp);
> +			stats->usage_cntrs[*index]++;
> +			u64_stats_update_end(&stats->syncp);
>  			return flow;
>  		}
>  	}
> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
>  	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>  	u32 __always_unused n_mask_hit;
>  	u32 __always_unused n_cache_hit;
> +	struct sw_flow *flow;
>  	u32 index = 0;
>  
> -	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
> +	/* This function gets called through the netlink interface and therefore
> +	 * is preemptible. However, flow_lookup() function needs to be called
> +	 * with preemption disabled due to CPU specific variables.

Is it possible to add some kind of assertion inside flow_lookup() to avoid
this kind of issue in the future?

It might be also good to update the comment for flow_lookup() function itself.

> +	 */
> +	local_bh_disable();
> +	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
> +	local_bh_enable();
> +	return flow;
>  }
>  
>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
> @@ -1109,7 +1118,6 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>  
>  	for (i = 0; i < ma->max; i++)  {
>  		struct sw_flow_mask *mask;
> -		unsigned int start;
>  		int cpu;
>  
>  		mask = rcu_dereference_ovsl(ma->masks[i]);
> @@ -1120,14 +1128,16 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>  		masks_and_count[i].counter = 0;
>  
>  		for_each_possible_cpu(cpu) {
> -			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
> -							  cpu);
> +			struct mask_array_stats *stats;
> +			unsigned int start;
>  			u64 counter;
>  
> +			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>  			do {
> -				start = u64_stats_fetch_begin_irq(&ma->syncp);
> -				counter = usage_counters[i];
> -			} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
> +				start = u64_stats_fetch_begin_irq(&stats->syncp);
> +				counter = stats->usage_cntrs[i];
> +			} while (u64_stats_fetch_retry_irq(&stats->syncp,
> +							   start));
>  
>  			masks_and_count[i].counter += counter;
>  		}
> diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
> index 6e7d4ac59353..43144396e192 100644
> --- a/net/openvswitch/flow_table.h
> +++ b/net/openvswitch/flow_table.h
> @@ -38,12 +38,16 @@ struct mask_count {
>  	u64 counter;
>  };
>  
> +struct mask_array_stats {
> +	struct u64_stats_sync syncp;
> +	u64 usage_cntrs[];
> +};
> +
>  struct mask_array {
>  	struct rcu_head rcu;
>  	int count, max;
> -	u64 __percpu *masks_usage_cntr;
> +	struct mask_array_stats __percpu *masks_usage_stats;
>  	u64 *masks_usage_zero_cntr;
> -	struct u64_stats_sync syncp;
>  	struct sw_flow_mask __rcu *masks[];
>  };
>  
> 
> 



* Re: [ovs-dev] [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted
  2020-10-15 10:27 ` [ovs-dev] " Ilya Maximets
@ 2020-10-15 10:54   ` Eelco Chaudron
  2020-10-15 11:04     ` Ilya Maximets
  0 siblings, 1 reply; 8+ messages in thread
From: Eelco Chaudron @ 2020-10-15 10:54 UTC (permalink / raw)
  To: Ilya Maximets; +Cc: netdev, dev, bigeasy, jlelli, kuba, pabeni, davem



On 15 Oct 2020, at 12:27, Ilya Maximets wrote:

> On 10/15/20 11:46 AM, Eelco Chaudron wrote:
>> The flow_lookup() function uses per CPU variables, which must not be
>> preempted. However, this is fine in the general napi use case where
>> the local BH is disabled. But, it's also called in the netlink
>> context, which is preemptible. The below patch makes sure that even
>> in the netlink path, preemption is disabled.
>>
>> In addition, the u64_stats_update_begin() sync point was not 
>> protected,
>> making the sync point part of the per CPU variable fixed this.
>>
>> Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on 
>> usage")
>> Reported-by: Juri Lelli <jlelli@redhat.com>
>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>> ---
>> v2: - Add u64_stats_update_begin() sync point protection
>>     - Moved patch to net from net-next branch
>>
>>  net/openvswitch/flow_table.c |   56 
>> +++++++++++++++++++++++++-----------------
>>  net/openvswitch/flow_table.h |    8 +++++-
>>  2 files changed, 39 insertions(+), 25 deletions(-)
>>
>> diff --git a/net/openvswitch/flow_table.c 
>> b/net/openvswitch/flow_table.c
>> index e2235849a57e..d90b4af6f539 100644
>> --- a/net/openvswitch/flow_table.c
>> +++ b/net/openvswitch/flow_table.c
>> @@ -172,7 +172,7 @@ static struct table_instance 
>> *table_instance_alloc(int new_size)
>>
>>  static void __mask_array_destroy(struct mask_array *ma)
>>  {
>> -	free_percpu(ma->masks_usage_cntr);
>> +	free_percpu(ma->masks_usage_stats);
>>  	kfree(ma);
>>  }
>>
>> @@ -196,15 +196,15 @@ static void 
>> tbl_mask_array_reset_counters(struct mask_array *ma)
>>  		ma->masks_usage_zero_cntr[i] = 0;
>>
>>  		for_each_possible_cpu(cpu) {
>> -			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>> -							  cpu);
>> +			struct mask_array_stats *stats;
>>  			unsigned int start;
>>  			u64 counter;
>>
>> +			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>  			do {
>> -				start = u64_stats_fetch_begin_irq(&ma->syncp);
>> -				counter = usage_counters[i];
>> -			} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>> +				start = u64_stats_fetch_begin_irq(&stats->syncp);
>> +				counter = stats->usage_cntrs[i];
>> +			} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
>>
>>  			ma->masks_usage_zero_cntr[i] += counter;
>>  		}
>> @@ -227,9 +227,10 @@ static struct mask_array 
>> *tbl_mask_array_alloc(int size)
>>  					     sizeof(struct sw_flow_mask *) *
>>  					     size);
>>
>> -	new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
>> -					       __alignof__(u64));
>> -	if (!new->masks_usage_cntr) {
>> +	new->masks_usage_stats = __alloc_percpu(sizeof(struct 
>> mask_array_stats) +
>> +						sizeof(u64) * size,
>> +						__alignof__(u64));
>> +	if (!new->masks_usage_stats) {
>>  		kfree(new);
>>  		return NULL;
>>  	}
>> @@ -732,7 +733,7 @@ static struct sw_flow *flow_lookup(struct 
>> flow_table *tbl,
>>  				   u32 *n_cache_hit,
>>  				   u32 *index)
>>  {
>> -	u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
>> +	struct mask_array_stats *stats = 
>> this_cpu_ptr(ma->masks_usage_stats);
>>  	struct sw_flow *flow;
>>  	struct sw_flow_mask *mask;
>>  	int i;
>> @@ -742,9 +743,9 @@ static struct sw_flow *flow_lookup(struct 
>> flow_table *tbl,
>>  		if (mask) {
>>  			flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>>  			if (flow) {
>> -				u64_stats_update_begin(&ma->syncp);
>> -				usage_counters[*index]++;
>> -				u64_stats_update_end(&ma->syncp);
>> +				u64_stats_update_begin(&stats->syncp);
>> +				stats->usage_cntrs[*index]++;
>> +				u64_stats_update_end(&stats->syncp);
>>  				(*n_cache_hit)++;
>>  				return flow;
>>  			}
>> @@ -763,9 +764,9 @@ static struct sw_flow *flow_lookup(struct 
>> flow_table *tbl,
>>  		flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>>  		if (flow) { /* Found */
>>  			*index = i;
>> -			u64_stats_update_begin(&ma->syncp);
>> -			usage_counters[*index]++;
>> -			u64_stats_update_end(&ma->syncp);
>> +			u64_stats_update_begin(&stats->syncp);
>> +			stats->usage_cntrs[*index]++;
>> +			u64_stats_update_end(&stats->syncp);
>>  			return flow;
>>  		}
>>  	}
>> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct 
>> flow_table *tbl,
>>  	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>>  	u32 __always_unused n_mask_hit;
>>  	u32 __always_unused n_cache_hit;
>> +	struct sw_flow *flow;
>>  	u32 index = 0;
>>
>> -	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, 
>> &index);
>> +	/* This function gets called through the netlink interface and 
>> therefore
>> +	 * is preemptible. However, flow_lookup() function needs to be 
>> called
>> +	 * with preemption disabled due to CPU specific variables.
>
> Is it possible to add some kind of assertion inside flow_lookup() to 
> avoid
> this kind of issue in the future?

We could do something like WARN_ON_ONCE(preemptible()), but I do not think
such a check should be added to the fast path.
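
For illustration, such a check would be a single statement at the top of
flow_lookup() (just a sketch of the idea, not something I intend to add
given the fast path concern above):

    static struct sw_flow *flow_lookup(struct flow_table *tbl,
                                       struct table_instance *ti,
                                       struct mask_array *ma,
                                       const struct sw_flow_key *key,
                                       u32 *n_mask_hit, u32 *n_cache_hit,
                                       u32 *index)
    {
            struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);

            /* Sketch only: complain once if we got here in a preemptible
             * context, i.e. without NAPI/BH (or other) protection around
             * the per-CPU usage counters.
             */
            WARN_ON_ONCE(preemptible());
            ...
    }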

> It might be also good to update the comment for flow_lookup() function 
> itself.

Good idea, will do this in a v3

>> +	 */
>> +	local_bh_disable();
>> +	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, 
>> &index);
>> +	local_bh_enable();
>> +	return flow;
>>  }
>>
>>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
>> @@ -1109,7 +1118,6 @@ void ovs_flow_masks_rebalance(struct flow_table 
>> *table)
>>
>>  	for (i = 0; i < ma->max; i++)  {
>>  		struct sw_flow_mask *mask;
>> -		unsigned int start;
>>  		int cpu;
>>
>>  		mask = rcu_dereference_ovsl(ma->masks[i]);
>> @@ -1120,14 +1128,16 @@ void ovs_flow_masks_rebalance(struct 
>> flow_table *table)
>>  		masks_and_count[i].counter = 0;
>>
>>  		for_each_possible_cpu(cpu) {
>> -			u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>> -							  cpu);
>> +			struct mask_array_stats *stats;
>> +			unsigned int start;
>>  			u64 counter;
>>
>> +			stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>  			do {
>> -				start = u64_stats_fetch_begin_irq(&ma->syncp);
>> -				counter = usage_counters[i];
>> -			} while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>> +				start = u64_stats_fetch_begin_irq(&stats->syncp);
>> +				counter = stats->usage_cntrs[i];
>> +			} while (u64_stats_fetch_retry_irq(&stats->syncp,
>> +							   start));
>>
>>  			masks_and_count[i].counter += counter;
>>  		}
>> diff --git a/net/openvswitch/flow_table.h 
>> b/net/openvswitch/flow_table.h
>> index 6e7d4ac59353..43144396e192 100644
>> --- a/net/openvswitch/flow_table.h
>> +++ b/net/openvswitch/flow_table.h
>> @@ -38,12 +38,16 @@ struct mask_count {
>>  	u64 counter;
>>  };
>>
>> +struct mask_array_stats {
>> +	struct u64_stats_sync syncp;
>> +	u64 usage_cntrs[];
>> +};
>> +
>>  struct mask_array {
>>  	struct rcu_head rcu;
>>  	int count, max;
>> -	u64 __percpu *masks_usage_cntr;
>> +	struct mask_array_stats __percpu *masks_usage_stats;
>>  	u64 *masks_usage_zero_cntr;
>> -	struct u64_stats_sync syncp;
>>  	struct sw_flow_mask __rcu *masks[];
>>  };
>>
>>
>>



* Re: [ovs-dev] [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted
  2020-10-15 10:54   ` Eelco Chaudron
@ 2020-10-15 11:04     ` Ilya Maximets
  2020-10-15 11:22       ` Ilya Maximets
  0 siblings, 1 reply; 8+ messages in thread
From: Ilya Maximets @ 2020-10-15 11:04 UTC (permalink / raw)
  To: Eelco Chaudron, Ilya Maximets
  Cc: netdev, dev, bigeasy, jlelli, kuba, pabeni, davem

On 10/15/20 12:54 PM, Eelco Chaudron wrote:
> 
> 
> On 15 Oct 2020, at 12:27, Ilya Maximets wrote:
> 
>> On 10/15/20 11:46 AM, Eelco Chaudron wrote:
>>> The flow_lookup() function uses per CPU variables, which must not be
>>> preempted. However, this is fine in the general napi use case where
>>> the local BH is disabled. But, it's also called in the netlink
>>> context, which is preemptible. The below patch makes sure that even
>>> in the netlink path, preemption is disabled.
>>>
>>> In addition, the u64_stats_update_begin() sync point was not protected,
>>> making the sync point part of the per CPU variable fixed this.
>>>
>>> Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage")
>>> Reported-by: Juri Lelli <jlelli@redhat.com>
>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>>> ---
>>> v2: - Add u64_stats_update_begin() sync point protection
>>>     - Moved patch to net from net-next branch
>>>
>>>  net/openvswitch/flow_table.c |   56 +++++++++++++++++++++++++-----------------
>>>  net/openvswitch/flow_table.h |    8 +++++-
>>>  2 files changed, 39 insertions(+), 25 deletions(-)
>>>
>>> diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
>>> index e2235849a57e..d90b4af6f539 100644
>>> --- a/net/openvswitch/flow_table.c
>>> +++ b/net/openvswitch/flow_table.c
>>> @@ -172,7 +172,7 @@ static struct table_instance *table_instance_alloc(int new_size)
>>>
>>>  static void __mask_array_destroy(struct mask_array *ma)
>>>  {
>>> -    free_percpu(ma->masks_usage_cntr);
>>> +    free_percpu(ma->masks_usage_stats);
>>>      kfree(ma);
>>>  }
>>>
>>> @@ -196,15 +196,15 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
>>>          ma->masks_usage_zero_cntr[i] = 0;
>>>
>>>          for_each_possible_cpu(cpu) {
>>> -            u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>>> -                              cpu);
>>> +            struct mask_array_stats *stats;
>>>              unsigned int start;
>>>              u64 counter;
>>>
>>> +            stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>>              do {
>>> -                start = u64_stats_fetch_begin_irq(&ma->syncp);
>>> -                counter = usage_counters[i];
>>> -            } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>>> +                start = u64_stats_fetch_begin_irq(&stats->syncp);
>>> +                counter = stats->usage_cntrs[i];
>>> +            } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
>>>
>>>              ma->masks_usage_zero_cntr[i] += counter;
>>>          }
>>> @@ -227,9 +227,10 @@ static struct mask_array *tbl_mask_array_alloc(int size)
>>>                           sizeof(struct sw_flow_mask *) *
>>>                           size);
>>>
>>> -    new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
>>> -                           __alignof__(u64));
>>> -    if (!new->masks_usage_cntr) {
>>> +    new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
>>> +                        sizeof(u64) * size,
>>> +                        __alignof__(u64));
>>> +    if (!new->masks_usage_stats) {
>>>          kfree(new);
>>>          return NULL;
>>>      }
>>> @@ -732,7 +733,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>                     u32 *n_cache_hit,
>>>                     u32 *index)
>>>  {
>>> -    u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
>>> +    struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
>>>      struct sw_flow *flow;
>>>      struct sw_flow_mask *mask;
>>>      int i;
>>> @@ -742,9 +743,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>          if (mask) {
>>>              flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>>>              if (flow) {
>>> -                u64_stats_update_begin(&ma->syncp);
>>> -                usage_counters[*index]++;
>>> -                u64_stats_update_end(&ma->syncp);
>>> +                u64_stats_update_begin(&stats->syncp);
>>> +                stats->usage_cntrs[*index]++;
>>> +                u64_stats_update_end(&stats->syncp);
>>>                  (*n_cache_hit)++;
>>>                  return flow;
>>>              }
>>> @@ -763,9 +764,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>          flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>>>          if (flow) { /* Found */
>>>              *index = i;
>>> -            u64_stats_update_begin(&ma->syncp);
>>> -            usage_counters[*index]++;
>>> -            u64_stats_update_end(&ma->syncp);
>>> +            u64_stats_update_begin(&stats->syncp);
>>> +            stats->usage_cntrs[*index]++;
>>> +            u64_stats_update_end(&stats->syncp);
>>>              return flow;
>>>          }
>>>      }
>>> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
>>>      struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>>>      u32 __always_unused n_mask_hit;
>>>      u32 __always_unused n_cache_hit;
>>> +    struct sw_flow *flow;
>>>      u32 index = 0;
>>>
>>> -    return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
>>> +    /* This function gets called through the netlink interface and therefore
>>> +     * is preemptible. However, flow_lookup() function needs to be called
>>> +     * with preemption disabled due to CPU specific variables.
>>
>> Is it possible to add some kind of assertion inside flow_lookup() to avoid
>> this kind of issue in the future?
> 
> We could do something like WARN_ON_ONCE(preemptible()), but I do not think such a check should be added to the fast path.

I meant something like lockdep_assert_preemption_disabled().  This will not
impact the fast path if CONFIG_PROVE_LOCKING is disabled, but will allow us to
catch issues during development.
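
Something like this as the first statement of flow_lookup(), before the
this_cpu_ptr() access (only a rough sketch of what I mean):

    /* Compiles away unless CONFIG_PROVE_LOCKING is enabled, so the
     * fast path of production builds is not affected.
     */
    lockdep_assert_preemption_disabled();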

> 
>> It might be also good to update the comment for flow_lookup() function itself.
> 
> Good idea, will do this in a v3
> 
>>> +     */
>>> +    local_bh_disable();
>>> +    flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
>>> +    local_bh_enable();
>>> +    return flow;
>>>  }
>>>
>>>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
>>> @@ -1109,7 +1118,6 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>>>
>>>      for (i = 0; i < ma->max; i++)  {
>>>          struct sw_flow_mask *mask;
>>> -        unsigned int start;
>>>          int cpu;
>>>
>>>          mask = rcu_dereference_ovsl(ma->masks[i]);
>>> @@ -1120,14 +1128,16 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>>>          masks_and_count[i].counter = 0;
>>>
>>>          for_each_possible_cpu(cpu) {
>>> -            u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>>> -                              cpu);
>>> +            struct mask_array_stats *stats;
>>> +            unsigned int start;
>>>              u64 counter;
>>>
>>> +            stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>>              do {
>>> -                start = u64_stats_fetch_begin_irq(&ma->syncp);
>>> -                counter = usage_counters[i];
>>> -            } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>>> +                start = u64_stats_fetch_begin_irq(&stats->syncp);
>>> +                counter = stats->usage_cntrs[i];
>>> +            } while (u64_stats_fetch_retry_irq(&stats->syncp,
>>> +                               start));
>>>
>>>              masks_and_count[i].counter += counter;
>>>          }
>>> diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
>>> index 6e7d4ac59353..43144396e192 100644
>>> --- a/net/openvswitch/flow_table.h
>>> +++ b/net/openvswitch/flow_table.h
>>> @@ -38,12 +38,16 @@ struct mask_count {
>>>      u64 counter;
>>>  };
>>>
>>> +struct mask_array_stats {
>>> +    struct u64_stats_sync syncp;
>>> +    u64 usage_cntrs[];
>>> +};
>>> +
>>>  struct mask_array {
>>>      struct rcu_head rcu;
>>>      int count, max;
>>> -    u64 __percpu *masks_usage_cntr;
>>> +    struct mask_array_stats __percpu *masks_usage_stats;
>>>      u64 *masks_usage_zero_cntr;
>>> -    struct u64_stats_sync syncp;
>>>      struct sw_flow_mask __rcu *masks[];
>>>  };
>>>
>>>
>>>
> 



* Re: [ovs-dev] [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted
  2020-10-15 11:04     ` Ilya Maximets
@ 2020-10-15 11:22       ` Ilya Maximets
  2020-10-15 12:42         ` Eelco Chaudron
  0 siblings, 1 reply; 8+ messages in thread
From: Ilya Maximets @ 2020-10-15 11:22 UTC (permalink / raw)
  To: Ilya Maximets, Eelco Chaudron
  Cc: netdev, dev, bigeasy, jlelli, kuba, pabeni, davem

On 10/15/20 1:04 PM, Ilya Maximets wrote:
> On 10/15/20 12:54 PM, Eelco Chaudron wrote:
>>
>>
>> On 15 Oct 2020, at 12:27, Ilya Maximets wrote:
>>
>>> On 10/15/20 11:46 AM, Eelco Chaudron wrote:
>>>> The flow_lookup() function uses per CPU variables, which must not be
>>>> preempted. However, this is fine in the general napi use case where
>>>> the local BH is disabled. But, it's also called in the netlink
>>>> context, which is preemptible. The below patch makes sure that even
>>>> in the netlink path, preemption is disabled.
>>>>
>>>> In addition, the u64_stats_update_begin() sync point was not protected,
>>>> making the sync point part of the per CPU variable fixed this.
>>>>
>>>> Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based on usage")
>>>> Reported-by: Juri Lelli <jlelli@redhat.com>
>>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>>>> ---
>>>> v2: - Add u64_stats_update_begin() sync point protection
>>>>     - Moved patch to net from net-next branch
>>>>
>>>>  net/openvswitch/flow_table.c |   56 +++++++++++++++++++++++++-----------------
>>>>  net/openvswitch/flow_table.h |    8 +++++-
>>>>  2 files changed, 39 insertions(+), 25 deletions(-)
>>>>
>>>> diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
>>>> index e2235849a57e..d90b4af6f539 100644
>>>> --- a/net/openvswitch/flow_table.c
>>>> +++ b/net/openvswitch/flow_table.c
>>>> @@ -172,7 +172,7 @@ static struct table_instance *table_instance_alloc(int new_size)
>>>>
>>>>  static void __mask_array_destroy(struct mask_array *ma)
>>>>  {
>>>> -    free_percpu(ma->masks_usage_cntr);
>>>> +    free_percpu(ma->masks_usage_stats);
>>>>      kfree(ma);
>>>>  }
>>>>
>>>> @@ -196,15 +196,15 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma)
>>>>          ma->masks_usage_zero_cntr[i] = 0;
>>>>
>>>>          for_each_possible_cpu(cpu) {
>>>> -            u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>>>> -                              cpu);
>>>> +            struct mask_array_stats *stats;
>>>>              unsigned int start;
>>>>              u64 counter;
>>>>
>>>> +            stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>>>              do {
>>>> -                start = u64_stats_fetch_begin_irq(&ma->syncp);
>>>> -                counter = usage_counters[i];
>>>> -            } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>>>> +                start = u64_stats_fetch_begin_irq(&stats->syncp);
>>>> +                counter = stats->usage_cntrs[i];
>>>> +            } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
>>>>
>>>>              ma->masks_usage_zero_cntr[i] += counter;
>>>>          }
>>>> @@ -227,9 +227,10 @@ static struct mask_array *tbl_mask_array_alloc(int size)
>>>>                           sizeof(struct sw_flow_mask *) *
>>>>                           size);
>>>>
>>>> -    new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
>>>> -                           __alignof__(u64));
>>>> -    if (!new->masks_usage_cntr) {
>>>> +    new->masks_usage_stats = __alloc_percpu(sizeof(struct mask_array_stats) +
>>>> +                        sizeof(u64) * size,
>>>> +                        __alignof__(u64));
>>>> +    if (!new->masks_usage_stats) {
>>>>          kfree(new);
>>>>          return NULL;
>>>>      }
>>>> @@ -732,7 +733,7 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>>                     u32 *n_cache_hit,
>>>>                     u32 *index)
>>>>  {
>>>> -    u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
>>>> +    struct mask_array_stats *stats = this_cpu_ptr(ma->masks_usage_stats);
>>>>      struct sw_flow *flow;
>>>>      struct sw_flow_mask *mask;
>>>>      int i;
>>>> @@ -742,9 +743,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>>          if (mask) {
>>>>              flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>>>>              if (flow) {
>>>> -                u64_stats_update_begin(&ma->syncp);
>>>> -                usage_counters[*index]++;
>>>> -                u64_stats_update_end(&ma->syncp);
>>>> +                u64_stats_update_begin(&stats->syncp);
>>>> +                stats->usage_cntrs[*index]++;
>>>> +                u64_stats_update_end(&stats->syncp);
>>>>                  (*n_cache_hit)++;
>>>>                  return flow;
>>>>              }
>>>> @@ -763,9 +764,9 @@ static struct sw_flow *flow_lookup(struct flow_table *tbl,
>>>>          flow = masked_flow_lookup(ti, key, mask, n_mask_hit);
>>>>          if (flow) { /* Found */
>>>>              *index = i;
>>>> -            u64_stats_update_begin(&ma->syncp);
>>>> -            usage_counters[*index]++;
>>>> -            u64_stats_update_end(&ma->syncp);
>>>> +            u64_stats_update_begin(&stats->syncp);
>>>> +            stats->usage_cntrs[*index]++;
>>>> +            u64_stats_update_end(&stats->syncp);
>>>>              return flow;
>>>>          }
>>>>      }
>>>> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
>>>>      struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>>>>      u32 __always_unused n_mask_hit;
>>>>      u32 __always_unused n_cache_hit;
>>>> +    struct sw_flow *flow;
>>>>      u32 index = 0;
>>>>
>>>> -    return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
>>>> +    /* This function gets called through the netlink interface and therefore
>>>> +     * is preemptible. However, flow_lookup() function needs to be called
>>>> +     * with preemption disabled due to CPU specific variables.
>>>
>>> Is it possible to add some kind of assertion inside flow_lookup() to avoid
>>> this kind of issue in the future?
>>
>> We could do something like WARN_ON_ONCE(preemptible()), but I do not think such a check should be added to the fast path.
> 
> I meant something like lockdep_assert_preemption_disabled().  This will not
> impact the fast path if CONFIG_PROVE_LOCKING is disabled, but will allow us
> to catch issues during development.

To be clear, I just mean that this could be compiled conditionally under some
debugging config.  Do not know which of existing configs might be used, though. 

> 
>>
>>> It might be also good to update the comment for flow_lookup() function itself.
>>
>> Good idea, will do this in a v3
>>
>>>> +     */
>>>> +    local_bh_disable();
>>>> +    flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
>>>> +    local_bh_enable();
>>>> +    return flow;
>>>>  }
>>>>
>>>>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
>>>> @@ -1109,7 +1118,6 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>>>>
>>>>      for (i = 0; i < ma->max; i++)  {
>>>>          struct sw_flow_mask *mask;
>>>> -        unsigned int start;
>>>>          int cpu;
>>>>
>>>>          mask = rcu_dereference_ovsl(ma->masks[i]);
>>>> @@ -1120,14 +1128,16 @@ void ovs_flow_masks_rebalance(struct flow_table *table)
>>>>          masks_and_count[i].counter = 0;
>>>>
>>>>          for_each_possible_cpu(cpu) {
>>>> -            u64 *usage_counters = per_cpu_ptr(ma->masks_usage_cntr,
>>>> -                              cpu);
>>>> +            struct mask_array_stats *stats;
>>>> +            unsigned int start;
>>>>              u64 counter;
>>>>
>>>> +            stats = per_cpu_ptr(ma->masks_usage_stats, cpu);
>>>>              do {
>>>> -                start = u64_stats_fetch_begin_irq(&ma->syncp);
>>>> -                counter = usage_counters[i];
>>>> -            } while (u64_stats_fetch_retry_irq(&ma->syncp, start));
>>>> +                start = u64_stats_fetch_begin_irq(&stats->syncp);
>>>> +                counter = stats->usage_cntrs[i];
>>>> +            } while (u64_stats_fetch_retry_irq(&stats->syncp,
>>>> +                               start));
>>>>
>>>>              masks_and_count[i].counter += counter;
>>>>          }
>>>> diff --git a/net/openvswitch/flow_table.h b/net/openvswitch/flow_table.h
>>>> index 6e7d4ac59353..43144396e192 100644
>>>> --- a/net/openvswitch/flow_table.h
>>>> +++ b/net/openvswitch/flow_table.h
>>>> @@ -38,12 +38,16 @@ struct mask_count {
>>>>      u64 counter;
>>>>  };
>>>>
>>>> +struct mask_array_stats {
>>>> +    struct u64_stats_sync syncp;
>>>> +    u64 usage_cntrs[];
>>>> +};
>>>> +
>>>>  struct mask_array {
>>>>      struct rcu_head rcu;
>>>>      int count, max;
>>>> -    u64 __percpu *masks_usage_cntr;
>>>> +    struct mask_array_stats __percpu *masks_usage_stats;
>>>>      u64 *masks_usage_zero_cntr;
>>>> -    struct u64_stats_sync syncp;
>>>>      struct sw_flow_mask __rcu *masks[];
>>>>  };
>>>>
>>>>
>>>>
>>
> 



* Re: [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted
  2020-10-15  9:46 [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted Eelco Chaudron
  2020-10-15 10:27 ` [ovs-dev] " Ilya Maximets
@ 2020-10-15 12:34 ` Sebastian Andrzej Siewior
  2020-10-15 17:06   ` Eelco Chaudron
  1 sibling, 1 reply; 8+ messages in thread
From: Sebastian Andrzej Siewior @ 2020-10-15 12:34 UTC (permalink / raw)
  To: Eelco Chaudron
  Cc: netdev, davem, dev, kuba, pabeni, pshelar, jlelli, Peter Zijlstra, tglx

On 2020-10-15 11:46:53 [+0200], Eelco Chaudron wrote:
> The flow_lookup() function uses per CPU variables, which must not be
> preempted. However, this is fine in the general napi use case where
> the local BH is disabled. But, it's also called in the netlink
> context, which is preemptible. The below patch makes sure that even
> in the netlink path, preemption is disabled.

I would suggest to rephrase it: the term preemption usually means
preempt_disable(). A preempt disabled section can be preempted /
interrupted by hardirq and softirq. The later is mentioned and I think
is confusing.

> In addition, the u64_stats_update_begin() sync point was not protected,
> making the sync point part of the per CPU variable fixed this.

I would rephrase it and mention the key details:
u64_stats_update_begin() requires a lock to ensure one writer which is
not ensured here. Making it per-CPU and disabling NAPI (softirq) ensures
that there is always only one writer.
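
To spell out the pattern (a condensed sketch of what the patch already
does, nothing new):

    /* Writer: per-CPU stats with BH (softirq) disabled, so there is
     * never more than one writer per syncp.
     */
    local_bh_disable();
    stats = this_cpu_ptr(ma->masks_usage_stats);
    u64_stats_update_begin(&stats->syncp);
    stats->usage_cntrs[index]++;
    u64_stats_update_end(&stats->syncp);
    local_bh_enable();

    /* Reader: may run on another CPU and simply retries if it races
     * with a writer.
     */
    do {
            start = u64_stats_fetch_begin_irq(&stats->syncp);
            counter = stats->usage_cntrs[i];
    } while (u64_stats_fetch_retry_irq(&stats->syncp, start));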

Regarding the annotations that were mentioned here in the thread:
basically, the this_cpu_ptr() warning worked as expected and got us here.
I don't think it is wise to add an annotation detached from the actual
problem, like assert_the_softirq_is_switched_off() in flow_lookup(). The
assert may become obsolete once the reason for it is removed, yet get
overlooked and remain in the code. The commits

	c60c32a577561 ("posix-cpu-timers: Remove lockdep_assert_irqs_disabled()")
	f9dae5554aed4 ("dpaa2-eth: Remove preempt_disable() from seed_pool()")

are just two examples which came to mind while writing this.

Instead I would prefer lockdep annotation in u64_stats_update_begin()
which is around also in 64bit kernels and complains if it is seen
without disabled BH if observed in-serving-softirq.
PeterZ, wasn't this mentioned before?
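
Roughly what I have in mind, written out as plain code (purely
hypothetical, this is not what u64_stats_update_begin() does today):

    /* Hypothetical check at the top of u64_stats_update_begin(): warn
     * when an update is started while neither BH nor interrupts are
     * disabled, i.e. in a context that a softirq writer could preempt.
     */
    WARN_ON_ONCE(!softirq_count() && !irqs_disabled());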

> --- a/net/openvswitch/flow_table.c
> +++ b/net/openvswitch/flow_table.c
> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *tbl,
>  	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>  	u32 __always_unused n_mask_hit;
>  	u32 __always_unused n_cache_hit;
> +	struct sw_flow *flow;
>  	u32 index = 0;
>  
> -	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
> +	/* This function gets called through the netlink interface and therefore
> +	 * is preemptible. However, flow_lookup() function needs to be called
> +	 * with preemption disabled due to CPU specific variables.

preemption vs BH.

> +	 */
> +	local_bh_disable();
> +	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, &index);
> +	local_bh_enable();
> +	return flow;
>  }
>  
>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,

Otherwise it looks good.

Sebastian


* Re: [ovs-dev] [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted
  2020-10-15 11:22       ` Ilya Maximets
@ 2020-10-15 12:42         ` Eelco Chaudron
  0 siblings, 0 replies; 8+ messages in thread
From: Eelco Chaudron @ 2020-10-15 12:42 UTC (permalink / raw)
  To: Ilya Maximets; +Cc: netdev, dev, bigeasy, jlelli, kuba, pabeni, davem



On 15 Oct 2020, at 13:22, Ilya Maximets wrote:

> On 10/15/20 1:04 PM, Ilya Maximets wrote:
>> On 10/15/20 12:54 PM, Eelco Chaudron wrote:
>>>
>>>
>>> On 15 Oct 2020, at 12:27, Ilya Maximets wrote:
>>>
>>>> On 10/15/20 11:46 AM, Eelco Chaudron wrote:
>>>>> The flow_lookup() function uses per CPU variables, which must not 
>>>>> be
>>>>> preempted. However, this is fine in the general napi use case 
>>>>> where
>>>>> the local BH is disabled. But, it's also called in the netlink
>>>>> context, which is preemptible. The below patch makes sure that 
>>>>> even
>>>>> in the netlink path, preemption is disabled.
>>>>>
>>>>> In addition, the u64_stats_update_begin() sync point was not 
>>>>> protected,
>>>>> making the sync point part of the per CPU variable fixed this.
>>>>>
>>>>> Fixes: eac87c413bf9 ("net: openvswitch: reorder masks array based 
>>>>> on usage")
>>>>> Reported-by: Juri Lelli <jlelli@redhat.com>
>>>>> Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
>>>>> ---
>>>>> v2: - Add u64_stats_update_begin() sync point protection
>>>>>     - Moved patch to net from net-next branch
>>>>>
>>>>>  net/openvswitch/flow_table.c |   56 
>>>>> +++++++++++++++++++++++++-----------------
>>>>>  net/openvswitch/flow_table.h |    8 +++++-
>>>>>  2 files changed, 39 insertions(+), 25 deletions(-)
>>>>>
>>>>> diff --git a/net/openvswitch/flow_table.c 
>>>>> b/net/openvswitch/flow_table.c
>>>>> index e2235849a57e..d90b4af6f539 100644
>>>>> --- a/net/openvswitch/flow_table.c
>>>>> +++ b/net/openvswitch/flow_table.c
>>>>> @@ -172,7 +172,7 @@ static struct table_instance 
>>>>> *table_instance_alloc(int new_size)
>>>>>
>>>>>  static void __mask_array_destroy(struct mask_array *ma)
>>>>>  {
>>>>> -    free_percpu(ma->masks_usage_cntr);
>>>>> +    free_percpu(ma->masks_usage_stats);
>>>>>      kfree(ma);
>>>>>  }
>>>>>
>>>>> @@ -196,15 +196,15 @@ static void 
>>>>> tbl_mask_array_reset_counters(struct mask_array *ma)
>>>>>          ma->masks_usage_zero_cntr[i] = 0;
>>>>>
>>>>>          for_each_possible_cpu(cpu) {
>>>>> -            u64 *usage_counters = 
>>>>> per_cpu_ptr(ma->masks_usage_cntr,
>>>>> -                              cpu);
>>>>> +            struct mask_array_stats *stats;
>>>>>              unsigned int start;
>>>>>              u64 counter;
>>>>>
>>>>> +            stats = per_cpu_ptr(ma->masks_usage_stats, 
>>>>> cpu);
>>>>>              do {
>>>>> -                start = 
>>>>> u64_stats_fetch_begin_irq(&ma->syncp);
>>>>> -                counter = usage_counters[i];
>>>>> -            } while 
>>>>> (u64_stats_fetch_retry_irq(&ma->syncp, start));
>>>>> +                start = 
>>>>> u64_stats_fetch_begin_irq(&stats->syncp);
>>>>> +                counter = stats->usage_cntrs[i];
>>>>> +            } while 
>>>>> (u64_stats_fetch_retry_irq(&stats->syncp, start));
>>>>>
>>>>>              ma->masks_usage_zero_cntr[i] += counter;
>>>>>          }
>>>>> @@ -227,9 +227,10 @@ static struct mask_array 
>>>>> *tbl_mask_array_alloc(int size)
>>>>>                           sizeof(struct 
>>>>> sw_flow_mask *) *
>>>>>                           size);
>>>>>
>>>>> -    new->masks_usage_cntr = __alloc_percpu(sizeof(u64) * size,
>>>>> -                           
>>>>> __alignof__(u64));
>>>>> -    if (!new->masks_usage_cntr) {
>>>>> +    new->masks_usage_stats = __alloc_percpu(sizeof(struct 
>>>>> mask_array_stats) +
>>>>> +                        sizeof(u64) * 
>>>>> size,
>>>>> +                        __alignof__(u64));
>>>>> +    if (!new->masks_usage_stats) {
>>>>>          kfree(new);
>>>>>          return NULL;
>>>>>      }
>>>>> @@ -732,7 +733,7 @@ static struct sw_flow *flow_lookup(struct 
>>>>> flow_table *tbl,
>>>>>                     u32 *n_cache_hit,
>>>>>                     u32 *index)
>>>>>  {
>>>>> -    u64 *usage_counters = this_cpu_ptr(ma->masks_usage_cntr);
>>>>> +    struct mask_array_stats *stats = 
>>>>> this_cpu_ptr(ma->masks_usage_stats);
>>>>>      struct sw_flow *flow;
>>>>>      struct sw_flow_mask *mask;
>>>>>      int i;
>>>>> @@ -742,9 +743,9 @@ static struct sw_flow *flow_lookup(struct 
>>>>> flow_table *tbl,
>>>>>          if (mask) {
>>>>>              flow = masked_flow_lookup(ti, key, mask, 
>>>>> n_mask_hit);
>>>>>              if (flow) {
>>>>> -                
>>>>> u64_stats_update_begin(&ma->syncp);
>>>>> -                usage_counters[*index]++;
>>>>> -                u64_stats_update_end(&ma->syncp);
>>>>> +                
>>>>> u64_stats_update_begin(&stats->syncp);
>>>>> +                stats->usage_cntrs[*index]++;
>>>>> +                
>>>>> u64_stats_update_end(&stats->syncp);
>>>>>                  (*n_cache_hit)++;
>>>>>                  return flow;
>>>>>              }
>>>>> @@ -763,9 +764,9 @@ static struct sw_flow *flow_lookup(struct 
>>>>> flow_table *tbl,
>>>>>          flow = masked_flow_lookup(ti, key, mask, 
>>>>> n_mask_hit);
>>>>>          if (flow) { /* Found */
>>>>>              *index = i;
>>>>> -            u64_stats_update_begin(&ma->syncp);
>>>>> -            usage_counters[*index]++;
>>>>> -            u64_stats_update_end(&ma->syncp);
>>>>> +            u64_stats_update_begin(&stats->syncp);
>>>>> +            stats->usage_cntrs[*index]++;
>>>>> +            u64_stats_update_end(&stats->syncp);
>>>>>              return flow;
>>>>>          }
>>>>>      }
>>>>> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct 
>>>>> flow_table *tbl,
>>>>>      struct mask_array *ma = 
>>>>> rcu_dereference_ovsl(tbl->mask_array);
>>>>>      u32 __always_unused n_mask_hit;
>>>>>      u32 __always_unused n_cache_hit;
>>>>> +    struct sw_flow *flow;
>>>>>      u32 index = 0;
>>>>>
>>>>> -    return flow_lookup(tbl, ti, ma, key, &n_mask_hit, 
>>>>> &n_cache_hit, &index);
>>>>> +    /* This function gets called through the netlink interface 
>>>>> and therefore
>>>>> +     * is preemptible. However, flow_lookup() function needs 
>>>>> to be called
>>>>> +     * with preemption disabled due to CPU specific 
>>>>> variables.
>>>>
>>>> Is it possible to add some kind of assertion inside flow_lookup() 
>>>> to avoid
>>>> this kind of issue in the future?
>>>
>>> We could do something like WARN_ON_ONCE(preemptible()), but I do not 
>>> think such a check should be added to the fast path.
>>
>> I meant something like lockdep_assert_preemption_disabled().  This
>> will not impact the fast path if CONFIG_PROVE_LOCKING is disabled,
>> but will allow us to catch issues during development.
>
> To be clear, I just mean that this could be compiled conditionally 
> under some
> debugging config.  Do not know which of existing configs might be 
> used, though.

Looked at the debug flags, but there is not really one that makes sense 
to use. Also, using a general one like DEBUG_MISC will probably not help, 
as hardly anyone would run the OVS tests with this flag enabled. So for now 
I would leave it as is unless someone has a better idea…
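
Just to show the kind of wiring I looked at and decided against (a sketch
only; the helper name is made up and DEBUG_MISC is merely the config
mentioned above):

    #ifdef CONFIG_DEBUG_MISC
    #define ovs_assert_no_preempt()  WARN_ON_ONCE(preemptible())
    #else
    #define ovs_assert_no_preempt()  do { } while (0)
    #endif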

>>>
>>>> It might be also good to update the comment for flow_lookup() 
>>>> function itself.
>>>
>>> Good idea, will do this in a v3
>>>
>>>>> +     */
>>>>> +    local_bh_disable();
>>>>> +    flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, 
>>>>> &n_cache_hit, &index);
>>>>> +    local_bh_enable();
>>>>> +    return flow;
>>>>>  }
>>>>>
>>>>>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table 
>>>>> *tbl,
>>>>> @@ -1109,7 +1118,6 @@ void ovs_flow_masks_rebalance(struct 
>>>>> flow_table *table)
>>>>>
>>>>>      for (i = 0; i < ma->max; i++)  {
>>>>>          struct sw_flow_mask *mask;
>>>>> -        unsigned int start;
>>>>>          int cpu;
>>>>>
>>>>>          mask = rcu_dereference_ovsl(ma->masks[i]);
>>>>> @@ -1120,14 +1128,16 @@ void ovs_flow_masks_rebalance(struct 
>>>>> flow_table *table)
>>>>>          masks_and_count[i].counter = 0;
>>>>>
>>>>>          for_each_possible_cpu(cpu) {
>>>>> -            u64 *usage_counters = 
>>>>> per_cpu_ptr(ma->masks_usage_cntr,
>>>>> -                              cpu);
>>>>> +            struct mask_array_stats *stats;
>>>>> +            unsigned int start;
>>>>>              u64 counter;
>>>>>
>>>>> +            stats = per_cpu_ptr(ma->masks_usage_stats, 
>>>>> cpu);
>>>>>              do {
>>>>> -                start = 
>>>>> u64_stats_fetch_begin_irq(&ma->syncp);
>>>>> -                counter = usage_counters[i];
>>>>> -            } while 
>>>>> (u64_stats_fetch_retry_irq(&ma->syncp, start));
>>>>> +                start = 
>>>>> u64_stats_fetch_begin_irq(&stats->syncp);
>>>>> +                counter = stats->usage_cntrs[i];
>>>>> +            } while 
>>>>> (u64_stats_fetch_retry_irq(&stats->syncp,
>>>>> +                               
>>>>> start));
>>>>>
>>>>>              masks_and_count[i].counter += counter;
>>>>>          }
>>>>> diff --git a/net/openvswitch/flow_table.h 
>>>>> b/net/openvswitch/flow_table.h
>>>>> index 6e7d4ac59353..43144396e192 100644
>>>>> --- a/net/openvswitch/flow_table.h
>>>>> +++ b/net/openvswitch/flow_table.h
>>>>> @@ -38,12 +38,16 @@ struct mask_count {
>>>>>      u64 counter;
>>>>>  };
>>>>>
>>>>> +struct mask_array_stats {
>>>>> +    struct u64_stats_sync syncp;
>>>>> +    u64 usage_cntrs[];
>>>>> +};
>>>>> +
>>>>>  struct mask_array {
>>>>>      struct rcu_head rcu;
>>>>>      int count, max;
>>>>> -    u64 __percpu *masks_usage_cntr;
>>>>> +    struct mask_array_stats __percpu *masks_usage_stats;
>>>>>      u64 *masks_usage_zero_cntr;
>>>>> -    struct u64_stats_sync syncp;
>>>>>      struct sw_flow_mask __rcu *masks[];
>>>>>  };
>>>>>
>>>>>
>>>>>
>>>
>>



* Re: [PATCH net v2] net: openvswitch: fix to make sure flow_lookup() is not preempted
  2020-10-15 12:34 ` Sebastian Andrzej Siewior
@ 2020-10-15 17:06   ` Eelco Chaudron
  0 siblings, 0 replies; 8+ messages in thread
From: Eelco Chaudron @ 2020-10-15 17:06 UTC (permalink / raw)
  To: Sebastian Andrzej Siewior
  Cc: netdev, davem, dev, kuba, pabeni, pshelar, jlelli, Peter Zijlstra, tglx



On 15 Oct 2020, at 14:34, Sebastian Andrzej Siewior wrote:

> On 2020-10-15 11:46:53 [+0200], Eelco Chaudron wrote:
>> The flow_lookup() function uses per CPU variables, which must not be
>> preempted. However, this is fine in the general napi use case where
>> the local BH is disabled. But, it's also called in the netlink
>> context, which is preemptible. The below patch makes sure that even
>> in the netlink path, preemption is disabled.
>
> I would suggest to rephrase it: the term preemption usually means
> preempt_disable(). A preempt disabled section can be preempted /
> interrupted by hardirq and softirq. The latter is mentioned and I think
> is confusing.
>
>> In addition, the u64_stats_update_begin() sync point was not 
>> protected,
>> making the sync point part of the per CPU variable fixed this.
>
> I would rephrase it and mention the key details:
> u64_stats_update_begin() requires a lock to ensure one writer which is
> not ensured here. Making it per-CPU and disabling NAPI (softirq) 
> ensures
> that there is always only one writer.
>
> Regarding the annotations that were mentioned here in the thread:
> basically, the this_cpu_ptr() warning worked as expected and got us
> here.
> I don't think it is wise to add an annotation detached from the
> actual problem, like assert_the_softirq_is_switched_off() in
> flow_lookup(). The assert may become obsolete once the reason for it
> is removed, yet get overlooked and remain in the code. The commits
>
> 	c60c32a577561 ("posix-cpu-timers: Remove 
> lockdep_assert_irqs_disabled()")
> 	f9dae5554aed4 ("dpaa2-eth: Remove preempt_disable() from 
> seed_pool()")
>
> are just two examples which came to mind while writing this.
>
> Instead I would prefer lockdep annotation in u64_stats_update_begin()
> which is around also in 64bit kernels and complains if it is seen
> without disabled BH if observed in-serving-softirq.
> PeterZ, wasn't this mentioned before?
>
>> --- a/net/openvswitch/flow_table.c
>> +++ b/net/openvswitch/flow_table.c
>> @@ -851,9 +852,17 @@ struct sw_flow *ovs_flow_tbl_lookup(struct 
>> flow_table *tbl,
>>  	struct mask_array *ma = rcu_dereference_ovsl(tbl->mask_array);
>>  	u32 __always_unused n_mask_hit;
>>  	u32 __always_unused n_cache_hit;
>> +	struct sw_flow *flow;
>>  	u32 index = 0;
>>
>> -	return flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, 
>> &index);
>> +	/* This function gets called through the netlink interface and 
>> therefore
>> +	 * is preemptible. However, flow_lookup() function needs to be 
>> called
>> +	 * with preemption disabled due to CPU specific variables.
>
> preemption vs BH.
>
>> +	 */
>> +	local_bh_disable();
>> +	flow = flow_lookup(tbl, ti, ma, key, &n_mask_hit, &n_cache_hit, 
>> &index);
>> +	local_bh_enable();
>> +	return flow;
>>  }
>>
>>  struct sw_flow *ovs_flow_tbl_lookup_exact(struct flow_table *tbl,
>
> Otherwise it looks good.
>

Thanks for your review! Made the modifications you suggested and will 
send out a v3 soon.

//Eelco


