All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next 0/2] net/sched: act_police: lockless data path
@ 2018-09-13 17:29 Davide Caratti
  2018-09-13 17:29 ` [PATCH net-next 1/2] net/sched: act_police: use per-cpu counters Davide Caratti
                   ` (2 more replies)
  0 siblings, 3 replies; 12+ messages in thread
From: Davide Caratti @ 2018-09-13 17:29 UTC (permalink / raw)
  To: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller; +Cc: netdev

the data path of 'police' action can be faster if we avoid using spinlocks:
 - patch 1 converts act_police to use per-cpu counters
 - patch 2 lets act_police use RCU to access its configuration data.

test procedure (using pktgen from https://github.com/netoptimizer):
 # ip link add name eth1 type dummy
 # ip link set dev eth1 up
 # tc qdisc add dev eth1 clsact
 # tc filter add dev eth1 egress matchall action police \
 > rate 2gbit burst 100k conform-exceed pass/pass index 100
 # for c in 1 2 4; do
 > ./pktgen_bench_xmit_mode_queue_xmit.sh -v -s 64 -t $c -n 5000000 -i eth1
 > done

test results (avg. pps/thread):

  $c | before patch |  after patch | improvement
 ----+--------------+--------------+-------------
   1 |      3518448 |      3591240 |  irrelevant
   2 |      3070065 |      3383393 |         10%
   4 |      1540969 |      3238385 |        110%


Davide Caratti (2):
  net/sched: act_police: use per-cpu counters
  net/sched: act_police: don't use spinlock in the data path

 net/sched/act_police.c | 186 +++++++++++++++++++++++------------------
 1 file changed, 106 insertions(+), 80 deletions(-)

-- 
2.17.1

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH net-next 1/2] net/sched: act_police: use per-cpu counters
  2018-09-13 17:29 [PATCH net-next 0/2] net/sched: act_police: lockless data path Davide Caratti
@ 2018-09-13 17:29 ` Davide Caratti
  2018-09-13 17:29 ` [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path Davide Caratti
  2018-09-16 22:32 ` [PATCH net-next 0/2] net/sched: act_police: lockless " David Miller
  2 siblings, 0 replies; 12+ messages in thread
From: Davide Caratti @ 2018-09-13 17:29 UTC (permalink / raw)
  To: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller; +Cc: netdev

use per-CPU counters, instead of sharing a single set of stats with all
cores. This removes the need of using spinlock when statistics are read
or updated.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/sched/act_police.c | 46 ++++++++++++++++++++----------------------
 1 file changed, 22 insertions(+), 24 deletions(-)

diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 393c7a670300..965a48d3ec35 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -110,7 +110,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 
 	if (!exists) {
 		ret = tcf_idr_create(tn, parm->index, NULL, a,
-				     &act_police_ops, bind, false);
+				     &act_police_ops, bind, true);
 		if (ret) {
 			tcf_idr_cleanup(tn, parm->index);
 			return ret;
@@ -137,7 +137,8 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 	}
 
 	if (est) {
-		err = gen_replace_estimator(&police->tcf_bstats, NULL,
+		err = gen_replace_estimator(&police->tcf_bstats,
+					    police->common.cpu_bstats,
 					    &police->tcf_rate_est,
 					    &police->tcf_lock,
 					    NULL, est);
@@ -207,32 +208,27 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
 	struct tcf_police *police = to_police(a);
-	s64 now;
-	s64 toks;
-	s64 ptoks = 0;
+	s64 now, toks, ptoks = 0;
+	int ret;
 
-	spin_lock(&police->tcf_lock);
-
-	bstats_update(&police->tcf_bstats, skb);
 	tcf_lastuse_update(&police->tcf_tm);
+	bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb);
 
+	spin_lock(&police->tcf_lock);
 	if (police->tcfp_ewma_rate) {
 		struct gnet_stats_rate_est64 sample;
 
 		if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
 		    sample.bps >= police->tcfp_ewma_rate) {
-			police->tcf_qstats.overlimits++;
-			if (police->tcf_action == TC_ACT_SHOT)
-				police->tcf_qstats.drops++;
-			spin_unlock(&police->tcf_lock);
-			return police->tcf_action;
+			ret = police->tcf_action;
+			goto inc_overlimits;
 		}
 	}
 
 	if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
 		if (!police->rate_present) {
-			spin_unlock(&police->tcf_lock);
-			return police->tcfp_result;
+			ret = police->tcfp_result;
+			goto unlock;
 		}
 
 		now = ktime_get_ns();
@@ -253,18 +249,20 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
 			police->tcfp_t_c = now;
 			police->tcfp_toks = toks;
 			police->tcfp_ptoks = ptoks;
-			if (police->tcfp_result == TC_ACT_SHOT)
-				police->tcf_qstats.drops++;
-			spin_unlock(&police->tcf_lock);
-			return police->tcfp_result;
+			ret = police->tcfp_result;
+			goto inc_drops;
 		}
 	}
-
-	police->tcf_qstats.overlimits++;
-	if (police->tcf_action == TC_ACT_SHOT)
-		police->tcf_qstats.drops++;
+	ret = police->tcf_action;
+
+inc_overlimits:
+	qstats_overlimit_inc(this_cpu_ptr(police->common.cpu_qstats));
+inc_drops:
+	if (ret == TC_ACT_SHOT)
+		qstats_drop_inc(this_cpu_ptr(police->common.cpu_qstats));
+unlock:
 	spin_unlock(&police->tcf_lock);
-	return police->tcf_action;
+	return ret;
 }
 
 static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path
  2018-09-13 17:29 [PATCH net-next 0/2] net/sched: act_police: lockless data path Davide Caratti
  2018-09-13 17:29 ` [PATCH net-next 1/2] net/sched: act_police: use per-cpu counters Davide Caratti
@ 2018-09-13 17:29 ` Davide Caratti
  2018-11-15  6:46   ` Eric Dumazet
  2018-09-16 22:32 ` [PATCH net-next 0/2] net/sched: act_police: lockless " David Miller
  2 siblings, 1 reply; 12+ messages in thread
From: Davide Caratti @ 2018-09-13 17:29 UTC (permalink / raw)
  To: Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller; +Cc: netdev

use RCU instead of spinlocks, to protect concurrent read/write on
act_police configuration. This reduces the effects of contention in the
data path, in case multiple readers are present.

Signed-off-by: Davide Caratti <dcaratti@redhat.com>
---
 net/sched/act_police.c | 156 ++++++++++++++++++++++++-----------------
 1 file changed, 92 insertions(+), 64 deletions(-)

diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 965a48d3ec35..92649d2667ed 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -22,8 +22,7 @@
 #include <net/act_api.h>
 #include <net/netlink.h>
 
-struct tcf_police {
-	struct tc_action	common;
+struct tcf_police_params {
 	int			tcfp_result;
 	u32			tcfp_ewma_rate;
 	s64			tcfp_burst;
@@ -36,6 +35,12 @@ struct tcf_police {
 	bool			rate_present;
 	struct psched_ratecfg	peak;
 	bool			peak_present;
+	struct rcu_head rcu;
+};
+
+struct tcf_police {
+	struct tc_action	common;
+	struct tcf_police_params __rcu *params;
 };
 
 #define to_police(pc) ((struct tcf_police *)pc)
@@ -84,6 +89,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 	struct tcf_police *police;
 	struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
 	struct tc_action_net *tn = net_generic(net, police_net_id);
+	struct tcf_police_params *new;
 	bool exists = false;
 	int size;
 
@@ -151,50 +157,60 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 		goto failure;
 	}
 
-	spin_lock_bh(&police->tcf_lock);
+	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	if (unlikely(!new)) {
+		err = -ENOMEM;
+		goto failure;
+	}
+
 	/* No failure allowed after this point */
-	police->tcfp_mtu = parm->mtu;
-	if (police->tcfp_mtu == 0) {
-		police->tcfp_mtu = ~0;
+	new->tcfp_mtu = parm->mtu;
+	if (!new->tcfp_mtu) {
+		new->tcfp_mtu = ~0;
 		if (R_tab)
-			police->tcfp_mtu = 255 << R_tab->rate.cell_log;
+			new->tcfp_mtu = 255 << R_tab->rate.cell_log;
 	}
 	if (R_tab) {
-		police->rate_present = true;
-		psched_ratecfg_precompute(&police->rate, &R_tab->rate, 0);
+		new->rate_present = true;
+		psched_ratecfg_precompute(&new->rate, &R_tab->rate, 0);
 		qdisc_put_rtab(R_tab);
 	} else {
-		police->rate_present = false;
+		new->rate_present = false;
 	}
 	if (P_tab) {
-		police->peak_present = true;
-		psched_ratecfg_precompute(&police->peak, &P_tab->rate, 0);
+		new->peak_present = true;
+		psched_ratecfg_precompute(&new->peak, &P_tab->rate, 0);
 		qdisc_put_rtab(P_tab);
 	} else {
-		police->peak_present = false;
+		new->peak_present = false;
 	}
 
 	if (tb[TCA_POLICE_RESULT])
-		police->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
-	police->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
-	police->tcfp_toks = police->tcfp_burst;
-	if (police->peak_present) {
-		police->tcfp_mtu_ptoks = (s64) psched_l2t_ns(&police->peak,
-							     police->tcfp_mtu);
-		police->tcfp_ptoks = police->tcfp_mtu_ptoks;
+		new->tcfp_result = nla_get_u32(tb[TCA_POLICE_RESULT]);
+	new->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
+	new->tcfp_toks = new->tcfp_burst;
+	if (new->peak_present) {
+		new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak,
+							 new->tcfp_mtu);
+		new->tcfp_ptoks = new->tcfp_mtu_ptoks;
 	}
-	police->tcf_action = parm->action;
 
 	if (tb[TCA_POLICE_AVRATE])
-		police->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
+		new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
 
+	spin_lock_bh(&police->tcf_lock);
+	new->tcfp_t_c = ktime_get_ns();
+	police->tcf_action = parm->action;
+	rcu_swap_protected(police->params,
+			   new,
+			   lockdep_is_held(&police->tcf_lock));
 	spin_unlock_bh(&police->tcf_lock);
-	if (ret != ACT_P_CREATED)
-		return ret;
 
-	police->tcfp_t_c = ktime_get_ns();
-	tcf_idr_insert(tn, *a);
+	if (new)
+		kfree_rcu(new, rcu);
 
+	if (ret == ACT_P_CREATED)
+		tcf_idr_insert(tn, *a);
 	return ret;
 
 failure:
@@ -208,68 +224,77 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
 			  struct tcf_result *res)
 {
 	struct tcf_police *police = to_police(a);
+	struct tcf_police_params *p;
 	s64 now, toks, ptoks = 0;
 	int ret;
 
 	tcf_lastuse_update(&police->tcf_tm);
 	bstats_cpu_update(this_cpu_ptr(police->common.cpu_bstats), skb);
 
-	spin_lock(&police->tcf_lock);
-	if (police->tcfp_ewma_rate) {
+	ret = READ_ONCE(police->tcf_action);
+	p = rcu_dereference_bh(police->params);
+
+	if (p->tcfp_ewma_rate) {
 		struct gnet_stats_rate_est64 sample;
 
 		if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
-		    sample.bps >= police->tcfp_ewma_rate) {
-			ret = police->tcf_action;
+		    sample.bps >= p->tcfp_ewma_rate)
 			goto inc_overlimits;
-		}
 	}
 
-	if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
-		if (!police->rate_present) {
-			ret = police->tcfp_result;
-			goto unlock;
+	if (qdisc_pkt_len(skb) <= p->tcfp_mtu) {
+		if (!p->rate_present) {
+			ret = p->tcfp_result;
+			goto end;
 		}
 
 		now = ktime_get_ns();
-		toks = min_t(s64, now - police->tcfp_t_c,
-			     police->tcfp_burst);
-		if (police->peak_present) {
-			ptoks = toks + police->tcfp_ptoks;
-			if (ptoks > police->tcfp_mtu_ptoks)
-				ptoks = police->tcfp_mtu_ptoks;
-			ptoks -= (s64) psched_l2t_ns(&police->peak,
-						     qdisc_pkt_len(skb));
+		toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst);
+		if (p->peak_present) {
+			ptoks = toks + p->tcfp_ptoks;
+			if (ptoks > p->tcfp_mtu_ptoks)
+				ptoks = p->tcfp_mtu_ptoks;
+			ptoks -= (s64)psched_l2t_ns(&p->peak,
+						    qdisc_pkt_len(skb));
 		}
-		toks += police->tcfp_toks;
-		if (toks > police->tcfp_burst)
-			toks = police->tcfp_burst;
-		toks -= (s64) psched_l2t_ns(&police->rate, qdisc_pkt_len(skb));
+		toks += p->tcfp_toks;
+		if (toks > p->tcfp_burst)
+			toks = p->tcfp_burst;
+		toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
 		if ((toks|ptoks) >= 0) {
-			police->tcfp_t_c = now;
-			police->tcfp_toks = toks;
-			police->tcfp_ptoks = ptoks;
-			ret = police->tcfp_result;
+			p->tcfp_t_c = now;
+			p->tcfp_toks = toks;
+			p->tcfp_ptoks = ptoks;
+			ret = p->tcfp_result;
 			goto inc_drops;
 		}
 	}
-	ret = police->tcf_action;
 
 inc_overlimits:
 	qstats_overlimit_inc(this_cpu_ptr(police->common.cpu_qstats));
 inc_drops:
 	if (ret == TC_ACT_SHOT)
 		qstats_drop_inc(this_cpu_ptr(police->common.cpu_qstats));
-unlock:
-	spin_unlock(&police->tcf_lock);
+end:
 	return ret;
 }
 
+static void tcf_police_cleanup(struct tc_action *a)
+{
+	struct tcf_police *police = to_police(a);
+	struct tcf_police_params *p;
+
+	p = rcu_dereference_protected(police->params, 1);
+	if (p)
+		kfree_rcu(p, rcu);
+}
+
 static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
 			       int bind, int ref)
 {
 	unsigned char *b = skb_tail_pointer(skb);
 	struct tcf_police *police = to_police(a);
+	struct tcf_police_params *p;
 	struct tc_police opt = {
 		.index = police->tcf_index,
 		.refcnt = refcount_read(&police->tcf_refcnt) - ref,
@@ -279,19 +304,21 @@ static int tcf_police_dump(struct sk_buff *skb, struct tc_action *a,
 
 	spin_lock_bh(&police->tcf_lock);
 	opt.action = police->tcf_action;
-	opt.mtu = police->tcfp_mtu;
-	opt.burst = PSCHED_NS2TICKS(police->tcfp_burst);
-	if (police->rate_present)
-		psched_ratecfg_getrate(&opt.rate, &police->rate);
-	if (police->peak_present)
-		psched_ratecfg_getrate(&opt.peakrate, &police->peak);
+	p = rcu_dereference_protected(police->params,
+				      lockdep_is_held(&police->tcf_lock));
+	opt.mtu = p->tcfp_mtu;
+	opt.burst = PSCHED_NS2TICKS(p->tcfp_burst);
+	if (p->rate_present)
+		psched_ratecfg_getrate(&opt.rate, &p->rate);
+	if (p->peak_present)
+		psched_ratecfg_getrate(&opt.peakrate, &p->peak);
 	if (nla_put(skb, TCA_POLICE_TBF, sizeof(opt), &opt))
 		goto nla_put_failure;
-	if (police->tcfp_result &&
-	    nla_put_u32(skb, TCA_POLICE_RESULT, police->tcfp_result))
+	if (p->tcfp_result &&
+	    nla_put_u32(skb, TCA_POLICE_RESULT, p->tcfp_result))
 		goto nla_put_failure;
-	if (police->tcfp_ewma_rate &&
-	    nla_put_u32(skb, TCA_POLICE_AVRATE, police->tcfp_ewma_rate))
+	if (p->tcfp_ewma_rate &&
+	    nla_put_u32(skb, TCA_POLICE_AVRATE, p->tcfp_ewma_rate))
 		goto nla_put_failure;
 
 	t.install = jiffies_to_clock_t(jiffies - police->tcf_tm.install);
@@ -330,6 +357,7 @@ static struct tc_action_ops act_police_ops = {
 	.init		=	tcf_police_init,
 	.walk		=	tcf_police_walker,
 	.lookup		=	tcf_police_search,
+	.cleanup	=	tcf_police_cleanup,
 	.size		=	sizeof(struct tcf_police),
 };
 
-- 
2.17.1

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH net-next 0/2] net/sched: act_police: lockless data path
  2018-09-13 17:29 [PATCH net-next 0/2] net/sched: act_police: lockless data path Davide Caratti
  2018-09-13 17:29 ` [PATCH net-next 1/2] net/sched: act_police: use per-cpu counters Davide Caratti
  2018-09-13 17:29 ` [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path Davide Caratti
@ 2018-09-16 22:32 ` David Miller
  2 siblings, 0 replies; 12+ messages in thread
From: David Miller @ 2018-09-16 22:32 UTC (permalink / raw)
  To: dcaratti; +Cc: jhs, xiyou.wangcong, jiri, netdev

From: Davide Caratti <dcaratti@redhat.com>
Date: Thu, 13 Sep 2018 19:29:11 +0200

> the data path of 'police' action can be faster if we avoid using spinlocks:
>  - patch 1 converts act_police to use per-cpu counters
>  - patch 2 lets act_police use RCU to access its configuration data.
> 
> test procedure (using pktgen from https://github.com/netoptimizer):
>  # ip link add name eth1 type dummy
>  # ip link set dev eth1 up
>  # tc qdisc add dev eth1 clsact
>  # tc filter add dev eth1 egress matchall action police \
>  > rate 2gbit burst 100k conform-exceed pass/pass index 100
>  # for c in 1 2 4; do
>  > ./pktgen_bench_xmit_mode_queue_xmit.sh -v -s 64 -t $c -n 5000000 -i eth1
>  > done
> 
> test results (avg. pps/thread):
> 
>   $c | before patch |  after patch | improvement
>  ----+--------------+--------------+-------------
>    1 |      3518448 |      3591240 |  irrelevant
>    2 |      3070065 |      3383393 |         10%
>    4 |      1540969 |      3238385 |        110%

Series applied.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path
  2018-09-13 17:29 ` [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path Davide Caratti
@ 2018-11-15  6:46   ` Eric Dumazet
  2018-11-15 11:43     ` Davide Caratti
  0 siblings, 1 reply; 12+ messages in thread
From: Eric Dumazet @ 2018-11-15  6:46 UTC (permalink / raw)
  To: Davide Caratti, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller
  Cc: netdev



On 09/13/2018 10:29 AM, Davide Caratti wrote:
> use RCU instead of spinlocks, to protect concurrent read/write on
> act_police configuration. This reduces the effects of contention in the
> data path, in case multiple readers are present.
> 
> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> ---
>  net/sched/act_police.c | 156 ++++++++++++++++++++++++-----------------
>  1 file changed, 92 insertions(+), 64 deletions(-)
> 

I must be missing something obvious with this patch.

How can the following piece of code in tcf_police_act() can possibly be run
without a spinlock or something preventing multiple cpus messing badly with the state variables  ?


		now = ktime_get_ns();
		toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst);
		if (p->peak_present) {
			ptoks = toks + p->tcfp_ptoks;
			if (ptoks > p->tcfp_mtu_ptoks)
				ptoks = p->tcfp_mtu_ptoks;
			ptoks -= (s64)psched_l2t_ns(&p->peak,
						    qdisc_pkt_len(skb));
		}
		toks += p->tcfp_toks;
		if (toks > p->tcfp_burst)
			toks = p->tcfp_burst;
		toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
		if ((toks|ptoks) >= 0) {
			p->tcfp_t_c = now;
			p->tcfp_toks = toks;
			p->tcfp_ptoks = ptoks;
			ret = p->tcfp_result;
			goto inc_drops;
		}

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path
  2018-11-15  6:46   ` Eric Dumazet
@ 2018-11-15 11:43     ` Davide Caratti
  2018-11-15 13:53       ` Eric Dumazet
  0 siblings, 1 reply; 12+ messages in thread
From: Davide Caratti @ 2018-11-15 11:43 UTC (permalink / raw)
  To: Eric Dumazet, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller
  Cc: netdev

On Wed, 2018-11-14 at 22:46 -0800, Eric Dumazet wrote:
> 
> On 09/13/2018 10:29 AM, Davide Caratti wrote:
> > use RCU instead of spinlocks, to protect concurrent read/write on
> > act_police configuration. This reduces the effects of contention in the
> > data path, in case multiple readers are present.
> > 
> > Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> > ---
> >  net/sched/act_police.c | 156 ++++++++++++++++++++++++-----------------
> >  1 file changed, 92 insertions(+), 64 deletions(-)
> > 
> 
> I must be missing something obvious with this patch.

hello Eric,

On the opposite, I missed something obvious when I wrote that patch: there
is a race condition on tcfp_toks, tcfp_ptoks and tcfp_t_c: thank you for
noticing it.

These variables still need to be protected with a spinlock. I will do a
patch and evaluate if 'act_police' is still faster than a version where   
2d550dbad83c ("net/sched: .... ") is reverted, and share results in the
next hours.

Ok?

-- 
davide

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path
  2018-11-15 11:43     ` Davide Caratti
@ 2018-11-15 13:53       ` Eric Dumazet
  2018-11-16 11:28         ` Davide Caratti
  0 siblings, 1 reply; 12+ messages in thread
From: Eric Dumazet @ 2018-11-15 13:53 UTC (permalink / raw)
  To: Davide Caratti, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller
  Cc: netdev



On 11/15/2018 03:43 AM, Davide Caratti wrote:
> On Wed, 2018-11-14 at 22:46 -0800, Eric Dumazet wrote:
>>
>> On 09/13/2018 10:29 AM, Davide Caratti wrote:
>>> use RCU instead of spinlocks, to protect concurrent read/write on
>>> act_police configuration. This reduces the effects of contention in the
>>> data path, in case multiple readers are present.
>>>
>>> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
>>> ---
>>>  net/sched/act_police.c | 156 ++++++++++++++++++++++++-----------------
>>>  1 file changed, 92 insertions(+), 64 deletions(-)
>>>
>>
>> I must be missing something obvious with this patch.
> 
> hello Eric,
> 
> On the opposite, I missed something obvious when I wrote that patch: there
> is a race condition on tcfp_toks, tcfp_ptoks and tcfp_t_c: thank you for
> noticing it.
> 
> These variables still need to be protected with a spinlock. I will do a
> patch and evaluate if 'act_police' is still faster than a version where   
> 2d550dbad83c ("net/sched: .... ") is reverted, and share results in the
> next hours.
> 
> Ok?
> 

SGTM, thanks.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path
  2018-11-15 13:53       ` Eric Dumazet
@ 2018-11-16 11:28         ` Davide Caratti
  2018-11-16 14:34           ` Eric Dumazet
  0 siblings, 1 reply; 12+ messages in thread
From: Davide Caratti @ 2018-11-16 11:28 UTC (permalink / raw)
  To: Eric Dumazet, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller
  Cc: netdev

On Thu, 2018-11-15 at 05:53 -0800, Eric Dumazet wrote:
> 
> On 11/15/2018 03:43 AM, Davide Caratti wrote:
> > On Wed, 2018-11-14 at 22:46 -0800, Eric Dumazet wrote:
> > > On 09/13/2018 10:29 AM, Davide Caratti wrote:
> > > > use RCU instead of spinlocks, to protect concurrent read/write on
> > > > act_police configuration. This reduces the effects of contention in the
> > > > data path, in case multiple readers are present.
> > > > 
> > > > Signed-off-by: Davide Caratti <dcaratti@redhat.com>
> > > > ---
> > > >  net/sched/act_police.c | 156 ++++++++++++++++++++++++-----------------
> > > >  1 file changed, 92 insertions(+), 64 deletions(-)
> > > > 
> > > 
> > > I must be missing something obvious with this patch.
> > 
> > hello Eric,
> > 
> > On the opposite, I missed something obvious when I wrote that patch: there
> > is a race condition on tcfp_toks, tcfp_ptoks and tcfp_t_c: thank you for
> > noticing it.
> > 
> > These variables still need to be protected with a spinlock. I will do a
> > patch and evaluate if 'act_police' is still faster than a version where   
> > 2d550dbad83c ("net/sched: .... ") is reverted, and share results in the
> > next hours.
> > 
> > Ok?
> > 
> 
> SGTM, thanks.

hello,
I just finished the comparison of act_police, in the following cases:

a) revert the RCU-ification (i.e. commit 2d550dbad83c ("net/sched:
act_police: don't use spinlock in the data path"), and leave per-cpu
counters used by the rate estimator

b) keep RCU-ified configuration parameters, and protect read/update of
tcfp_toks, tcfp_ptoks and tcfp_t_c with a spinlock (code at the bottom of
this message).

## Test setup:

$DEV is a 'dummy' with clsact qdisc; the following two commands,

# test police with 'rate'
$TC filter add dev $DEV egress matchall \
 action police rate 2gbit burst 100k conform-exceed pass/pass index 100

# test police with 'avrate'
$TC filter add dev $DEV egress estimator 1s 8s matchall \
        action police avrate 2gbit conform-exceed pass/pass index 100

are tested with the following loop:

for c in 1 2 4 8 16; do
./pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -v -s 64  -t $c -n 5000000 -i $DEV
done


## Test results:

using  rate  | reverted       | patched
        $c   | act_police (a) | act_police (b)
-------------+----------------+---------------
         1   |       3364442  |      3345580  
         2   |       2703030  |      2721919  
         4   |       1130146  |      1253555
         8   |        664238  |       658777
        16   |        154026  |       155259


using avrate | reverted       | patched
        $c   | act_police (a) | act_police (b)
-------------+----------------+---------------
         1   |       3621796  |      3658472 
         2   |       3075589  |      3548135  
         4   |       2313314  |      3343717
         8   |        768458  |      3260480
        16   |        177776  |      3254128


so, 'avrate' still gets a significant improvement because the 'conform/exceed'
decision doesn't need the spinlock in this case. The estimation is probably
less accurate, because it use per-CPU variables: if this is not acceptable,
then we need to revert also 93be42f9173b ("net/sched: act_police: use per-cpu
counters").


## patch code:

-- >8 --
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 052855d..42db852 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -27,10 +27,7 @@ struct tcf_police_params {
 	u32			tcfp_ewma_rate;
 	s64			tcfp_burst;
 	u32			tcfp_mtu;
-	s64			tcfp_toks;
-	s64			tcfp_ptoks;
 	s64			tcfp_mtu_ptoks;
-	s64			tcfp_t_c;
 	struct psched_ratecfg	rate;
 	bool			rate_present;
 	struct psched_ratecfg	peak;
@@ -40,6 +37,9 @@ struct tcf_police_params {
 
 struct tcf_police {
 	struct tc_action	common;
+	s64			tcfp_toks;
+	s64			tcfp_ptoks;
+	s64			tcfp_t_c;
 	struct tcf_police_params __rcu *params;
 };
 
@@ -186,12 +186,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 	}
 
 	new->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
-	new->tcfp_toks = new->tcfp_burst;
-	if (new->peak_present) {
-		new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak,
-							 new->tcfp_mtu);
-		new->tcfp_ptoks = new->tcfp_mtu_ptoks;
-	}
 
 	if (tb[TCA_POLICE_AVRATE])
 		new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
@@ -207,7 +201,14 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
 	}
 
 	spin_lock_bh(&police->tcf_lock);
-	new->tcfp_t_c = ktime_get_ns();
+	police->tcfp_t_c = ktime_get_ns();
+	police->tcfp_toks = new->tcfp_burst;
+	if (new->peak_present) {
+		new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak,
+							 new->tcfp_mtu);
+		police->tcfp_ptoks = new->tcfp_mtu_ptoks;
+	}
+
 	police->tcf_action = parm->action;
 	rcu_swap_protected(police->params,
 			   new,
@@ -255,27 +256,29 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
 			ret = p->tcfp_result;
 			goto end;
 		}
-
+		spin_lock_bh(&police->tcf_lock);
 		now = ktime_get_ns();
-		toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst);
+		toks = min_t(s64, now - police->tcfp_t_c, p->tcfp_burst);
 		if (p->peak_present) {
-			ptoks = toks + p->tcfp_ptoks;
+			ptoks = toks + police->tcfp_ptoks;
 			if (ptoks > p->tcfp_mtu_ptoks)
 				ptoks = p->tcfp_mtu_ptoks;
 			ptoks -= (s64)psched_l2t_ns(&p->peak,
 						    qdisc_pkt_len(skb));
 		}
-		toks += p->tcfp_toks;
+		toks += police->tcfp_toks;
 		if (toks > p->tcfp_burst)
 			toks = p->tcfp_burst;
 		toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
 		if ((toks|ptoks) >= 0) {
-			p->tcfp_t_c = now;
-			p->tcfp_toks = toks;
-			p->tcfp_ptoks = ptoks;
+			police->tcfp_t_c = now;
+			police->tcfp_toks = toks;
+			police->tcfp_ptoks = ptoks;
+			spin_unlock_bh(&police->tcf_lock);
 			ret = p->tcfp_result;
 			goto inc_drops;
 		}
+		spin_unlock_bh(&police->tcf_lock);
 	}
 
 inc_overlimits:
-- >8 --

any feedback is appreciated.
thanks!
-- 
davide

^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path
  2018-11-16 11:28         ` Davide Caratti
@ 2018-11-16 14:34           ` Eric Dumazet
  2018-11-16 14:39             ` Eric Dumazet
  2018-11-16 14:41             ` David Laight
  0 siblings, 2 replies; 12+ messages in thread
From: Eric Dumazet @ 2018-11-16 14:34 UTC (permalink / raw)
  To: Davide Caratti, Eric Dumazet, Jamal Hadi Salim, Cong Wang,
	Jiri Pirko, David S. Miller
  Cc: netdev



On 11/16/2018 03:28 AM, Davide Caratti wrote:
> On Thu, 2018-11-15 at 05:53 -0800, Eric Dumazet wrote:
>>
>> On 11/15/2018 03:43 AM, Davide Caratti wrote:
>>> On Wed, 2018-11-14 at 22:46 -0800, Eric Dumazet wrote:
>>>> On 09/13/2018 10:29 AM, Davide Caratti wrote:
>>>>> use RCU instead of spinlocks, to protect concurrent read/write on
>>>>> act_police configuration. This reduces the effects of contention in the
>>>>> data path, in case multiple readers are present.
>>>>>
>>>>> Signed-off-by: Davide Caratti <dcaratti@redhat.com>
>>>>> ---
>>>>>  net/sched/act_police.c | 156 ++++++++++++++++++++++++-----------------
>>>>>  1 file changed, 92 insertions(+), 64 deletions(-)
>>>>>
>>>>
>>>> I must be missing something obvious with this patch.
>>>
>>> hello Eric,
>>>
>>> On the opposite, I missed something obvious when I wrote that patch: there
>>> is a race condition on tcfp_toks, tcfp_ptoks and tcfp_t_c: thank you for
>>> noticing it.
>>>
>>> These variables still need to be protected with a spinlock. I will do a
>>> patch and evaluate if 'act_police' is still faster than a version where   
>>> 2d550dbad83c ("net/sched: .... ") is reverted, and share results in the
>>> next hours.
>>>
>>> Ok?
>>>
>>
>> SGTM, thanks.
> 
> hello,
> I just finished the comparison of act_police, in the following cases:
> 
> a) revert the RCU-ification (i.e. commit 2d550dbad83c ("net/sched:
> act_police: don't use spinlock in the data path"), and leave per-cpu
> counters used by the rate estimator
> 
> b) keep RCU-ified configuration parameters, and protect read/update of
> tcfp_toks, tcfp_ptoks and tcfp_t with a spinlock (code at the bottom  of
> this message).
> 
> ## Test setup:
> 
> $DEV is a 'dummy' with clsact qdisc; the following two commands,
> 
> # test police with 'rate'
> $TC filter add dev $DEV egress matchall \
>  action police rate 2gbit burst 100k conform-exceed pass/pass index 100
> 
> # test police with 'avrate'
> $TC filter add dev prova egress estimator 1s 8s matchall \
>         action police avrate 2gbit conform-exceed pass/pass index 100
> 
> are tested with the following loop:
> 
> for c in 1 2 4 8 16; do
> ./pktgen/pktgen_bench_xmit_mode_queue_xmit.sh -v -s 64  -t $c -n 5000000 -i $DEV
> done
> 
> 
> ## Test results:
> 
> using  rate  | reverted       | patched
>         $c   | act_police (a) | act_police (b)
> -------------+----------------+---------------
>          1   |       3364442  |      3345580  
>          2   |       2703030  |      2721919  
>          4   |       1130146  |      1253555
>          8   |        664238  |       658777
>         16   |        154026  |       155259
> 
> 
> using avrate | reverted       | patched
>         $c   | act_police (a) | act_police (b)
> -------------+----------------+---------------
>          1   |       3621796  |      3658472 
>          2   |       3075589  |      3548135  
>          4   |       2313314  |      3343717
>          8   |        768458  |      3260480
>         16   |        177776  |      3254128
> 
> 
> so, 'avrate' still gets a significant improvement because the 'conform/exceed'
> decision doesn't need the spinlock in this case. The estimation is probably
> less accurate, because it use per-CPU variables: if this is not acceptable,
> then we need to revert also 93be42f9173b ("net/sched: act_police: use per-cpu
> counters").
> 
> 
> ## patch code:
> 
> -- >8 --
> diff --git a/net/sched/act_police.c b/net/sched/act_police.c
> index 052855d..42db852 100644
> --- a/net/sched/act_police.c
> +++ b/net/sched/act_police.c
> @@ -27,10 +27,7 @@ struct tcf_police_params {
>  	u32			tcfp_ewma_rate;
>  	s64			tcfp_burst;
>  	u32			tcfp_mtu;
> -	s64			tcfp_toks;
> -	s64			tcfp_ptoks;
>  	s64			tcfp_mtu_ptoks;
> -	s64			tcfp_t_c;
>  	struct psched_ratecfg	rate;
>  	bool			rate_present;
>  	struct psched_ratecfg	peak;
> @@ -40,6 +37,9 @@ struct tcf_police_params {
>  
>  struct tcf_police {
>  	struct tc_action	common;


> +	s64			tcfp_toks;
> +	s64			tcfp_ptoks;
> +	s64			tcfp_t_c;

I suggest to use a single cache line with a dedicated spinlock and these three s64

	spinlock_t  tcfp_lock ____cacheline_aligned_in_smp;
	s64 			...
	s64			...
	s64			...


>  	struct tcf_police_params __rcu *params;

Make sure to use a different cache line for *params 

	struct tcf_police_params __rcu *params ____cacheline_aligned_in_smp;

>  };
>  
> @@ -186,12 +186,6 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
>  	}
>  
>  	new->tcfp_burst = PSCHED_TICKS2NS(parm->burst);
> -	new->tcfp_toks = new->tcfp_burst;
> -	if (new->peak_present) {
> -		new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak,
> -							 new->tcfp_mtu);
> -		new->tcfp_ptoks = new->tcfp_mtu_ptoks;
> -	}
>  
>  	if (tb[TCA_POLICE_AVRATE])
>  		new->tcfp_ewma_rate = nla_get_u32(tb[TCA_POLICE_AVRATE]);
> @@ -207,7 +201,14 @@ static int tcf_police_init(struct net *net, struct nlattr *nla,
>  	}
>  
>  	spin_lock_bh(&police->tcf_lock);
> -	new->tcfp_t_c = ktime_get_ns();
> +	police->tcfp_t_c = ktime_get_ns();
> +	police->tcfp_toks = new->tcfp_burst;
> +	if (new->peak_present) {
> +		new->tcfp_mtu_ptoks = (s64)psched_l2t_ns(&new->peak,
> +							 new->tcfp_mtu);
> +		police->tcfp_ptoks = new->tcfp_mtu_ptoks;
> +	}
> +
>  	police->tcf_action = parm->action;
>  	rcu_swap_protected(police->params,
>  			   new,
> @@ -255,27 +256,29 @@ static int tcf_police_act(struct sk_buff *skb, const struct tc_action *a,
>  			ret = p->tcfp_result;
>  			goto end;
>  		}
> -
> +		spin_lock_bh(&police->tcf_lock);
>  		now = ktime_get_ns();

The ktime_get_ns() call can be done before acquiring the spinlock

> -		toks = min_t(s64, now - p->tcfp_t_c, p->tcfp_burst);
> +		toks = min_t(s64, now - police->tcfp_t_c, p->tcfp_burst);
>  		if (p->peak_present) {
> -			ptoks = toks + p->tcfp_ptoks;
> +			ptoks = toks + police->tcfp_ptoks;
>  			if (ptoks > p->tcfp_mtu_ptoks)
>  				ptoks = p->tcfp_mtu_ptoks;
>  			ptoks -= (s64)psched_l2t_ns(&p->peak,
>  						    qdisc_pkt_len(skb));
>  		}
> -		toks += p->tcfp_toks;
> +		toks += police->tcfp_toks;
>  		if (toks > p->tcfp_burst)
>  			toks = p->tcfp_burst;
>  		toks -= (s64)psched_l2t_ns(&p->rate, qdisc_pkt_len(skb));
>  		if ((toks|ptoks) >= 0) {
> -			p->tcfp_t_c = now;
> -			p->tcfp_toks = toks;
> -			p->tcfp_ptoks = ptoks;
> +			police->tcfp_t_c = now;
> +			police->tcfp_toks = toks;
> +			police->tcfp_ptoks = ptoks;
> +			spin_unlock_bh(&police->tcf_lock);
>  			ret = p->tcfp_result;
>  			goto inc_drops;
>  		}
> +		spin_unlock_bh(&police->tcf_lock);
>  	}
>  
>  inc_overlimits:
> -- >8 --
> 
> any feedback is appreciated.
> thanks!
> 

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path
  2018-11-16 14:34           ` Eric Dumazet
@ 2018-11-16 14:39             ` Eric Dumazet
  2018-11-16 14:41             ` David Laight
  1 sibling, 0 replies; 12+ messages in thread
From: Eric Dumazet @ 2018-11-16 14:39 UTC (permalink / raw)
  To: Davide Caratti, Jamal Hadi Salim, Cong Wang, Jiri Pirko, David S. Miller
  Cc: netdev



On 11/16/2018 06:34 AM, Eric Dumazet wrote:

> 
>> +	s64			tcfp_toks;
>> +	s64			tcfp_ptoks;
>> +	s64			tcfp_t_c;
> 
> I suggest to use a single cache line with a dedicated spinlock and these three s64
> 
> 	spinlock_t  tcfp_lock ____cacheline_aligned_in_smp;
> 	s64 			...
> 	s64			...
> 	s64			...
> 
> 
>>  	struct tcf_police_params __rcu *params;
> 
> Make sure to use a different cache line for *params 
> 
> 	struct tcf_police_params __rcu *params ____cacheline_aligned_in_smp;


Or move it before the cacheline used by the lock and three s64,
since 'common' should be read-mostly. No need for a separate cache line.

^ permalink raw reply	[flat|nested] 12+ messages in thread

* RE: [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path
  2018-11-16 14:34           ` Eric Dumazet
  2018-11-16 14:39             ` Eric Dumazet
@ 2018-11-16 14:41             ` David Laight
  2018-11-16 14:55               ` Eric Dumazet
  1 sibling, 1 reply; 12+ messages in thread
From: David Laight @ 2018-11-16 14:41 UTC (permalink / raw)
  To: 'Eric Dumazet',
	Davide Caratti, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S. Miller
  Cc: netdev

From: Eric Dumazet
> Sent: 16 November 2018 14:35
...
> I suggest to use a single cache line with a dedicated spinlock and these three s64
> 
> 	spinlock_t  tcfp_lock ____cacheline_aligned_in_smp;
> 	s64 			...
> 	s64			...
> 	s64			...

Doesn't this do something really stupid when cache lines are big?
If the spinlock is 8 bytes you never want more than 32-byte alignment.
If cache lines are 256 bytes you don't even need that.

Also ISTR that the kmalloc() only guarantees 8 byte alignment on x86_64.
So aligning structure members to larger offsets is rather pointless.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path
  2018-11-16 14:41             ` David Laight
@ 2018-11-16 14:55               ` Eric Dumazet
  0 siblings, 0 replies; 12+ messages in thread
From: Eric Dumazet @ 2018-11-16 14:55 UTC (permalink / raw)
  To: David Laight, 'Eric Dumazet',
	Davide Caratti, Jamal Hadi Salim, Cong Wang, Jiri Pirko,
	David S. Miller
  Cc: netdev



On 11/16/2018 06:41 AM, David Laight wrote:
> From: Eric Dumazet
>> Sent: 16 November 2018 14:35
> ...
>> I suggest to use a single cache line with a dedicated spinlock and these three s64
>>
>> 	spinlock_t  tcfp_lock ____cacheline_aligned_in_smp;
>> 	s64 			...
>> 	s64			...
>> 	s64			...
> 
> Doesn't this do something really stupid when cache lines are big.
> If the spinlock is 8 bytes you never want more than 32 byte alignment.
> If cache lines are 256 bytes you don't even need that.

We do want that, even if cache lines are 256 bytes, thank you.

> 
> Also ISTR that the kmalloc() only guarantees 8 byte alignment on x86_64.
> So aligning structure members to larger offsets is rather pointless.


No it is not, we use these hints all the time.

Just double check and report a bug to mm teams if you disagree.

Please do not send feedback if you are not sure.

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2018-11-17  1:08 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-09-13 17:29 [PATCH net-next 0/2] net/sched: act_police: lockless data path Davide Caratti
2018-09-13 17:29 ` [PATCH net-next 1/2] net/sched: act_police: use per-cpu counters Davide Caratti
2018-09-13 17:29 ` [PATCH net-next 2/2] net/sched: act_police: don't use spinlock in the data path Davide Caratti
2018-11-15  6:46   ` Eric Dumazet
2018-11-15 11:43     ` Davide Caratti
2018-11-15 13:53       ` Eric Dumazet
2018-11-16 11:28         ` Davide Caratti
2018-11-16 14:34           ` Eric Dumazet
2018-11-16 14:39             ` Eric Dumazet
2018-11-16 14:41             ` David Laight
2018-11-16 14:55               ` Eric Dumazet
2018-09-16 22:32 ` [PATCH net-next 0/2] net/sched: act_police: lockless " David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.