All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
@ 2018-08-09 15:01 Eelco Chaudron
  2018-08-09 15:01 ` [PATCH 1/2] net/core: Add new basic hardware counter Eelco Chaudron
                   ` (2 more replies)
  0 siblings, 3 replies; 16+ messages in thread
From: Eelco Chaudron @ 2018-08-09 15:01 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, xiyou.wangcong, jiri

Add hardware specific counters to TC actions which will be exported
through the netlink API. This makes troubleshooting TC flower offload
easier, as it makes it possible to differentiate the packets being offloaded.

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>

Eelco Chaudron (2):
      net/core: Add new basic hardware counter
      net/sched: Add hardware specific counters to TC actions


 include/net/act_api.h          |    8 +++-
 include/net/gen_stats.h        |    4 ++
 include/net/pkt_cls.h          |    2 +
 include/uapi/linux/gen_stats.h |    1 +
 net/core/gen_stats.c           |   73 ++++++++++++++++++++++++++++++----------
 net/sched/act_api.c            |   14 ++++++--
 net/sched/act_gact.c           |    6 +++
 net/sched/act_mirred.c         |    5 ++-
 8 files changed, 85 insertions(+), 28 deletions(-)

^ permalink raw reply	[flat|nested] 16+ messages in thread

* [PATCH 1/2] net/core: Add new basic hardware counter
  2018-08-09 15:01 [PATCH 0/2] net/sched: Add hardware specific counters to TC actions Eelco Chaudron
@ 2018-08-09 15:01 ` Eelco Chaudron
  2018-08-09 15:01 ` [PATCH 2/2] net/sched: Add hardware specific counters to TC actions Eelco Chaudron
  2018-08-10  3:26 ` [PATCH 0/2] " Jakub Kicinski
  2 siblings, 0 replies; 16+ messages in thread
From: Eelco Chaudron @ 2018-08-09 15:01 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, xiyou.wangcong, jiri

Add a new hardware specific basic counter, TCA_STATS_BASIC_HW. This can
be used to count packets/bytes processed by hardware offload.


Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
 include/net/gen_stats.h        |    4 ++
 include/uapi/linux/gen_stats.h |    1 +
 net/core/gen_stats.c           |   73 ++++++++++++++++++++++++++++++----------
 3 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h
index 0304ba2..7e54a9a 100644
--- a/include/net/gen_stats.h
+++ b/include/net/gen_stats.h
@@ -44,6 +44,10 @@ void __gnet_stats_copy_basic(const seqcount_t *running,
 			     struct gnet_stats_basic_packed *bstats,
 			     struct gnet_stats_basic_cpu __percpu *cpu,
 			     struct gnet_stats_basic_packed *b);
+int gnet_stats_copy_basic_hw(const seqcount_t *running,
+			     struct gnet_dump *d,
+			     struct gnet_stats_basic_cpu __percpu *cpu,
+			     struct gnet_stats_basic_packed *b);
 int gnet_stats_copy_rate_est(struct gnet_dump *d,
 			     struct net_rate_estimator __rcu **ptr);
 int gnet_stats_copy_queue(struct gnet_dump *d,
diff --git a/include/uapi/linux/gen_stats.h b/include/uapi/linux/gen_stats.h
index 24a861c..065408e 100644
--- a/include/uapi/linux/gen_stats.h
+++ b/include/uapi/linux/gen_stats.h
@@ -12,6 +12,7 @@ enum {
 	TCA_STATS_APP,
 	TCA_STATS_RATE_EST64,
 	TCA_STATS_PAD,
+	TCA_STATS_BASIC_HW,
 	__TCA_STATS_MAX,
 };
 #define TCA_STATS_MAX (__TCA_STATS_MAX - 1)
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 188d693..65a2e82 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -162,30 +162,18 @@
 }
 EXPORT_SYMBOL(__gnet_stats_copy_basic);
 
-/**
- * gnet_stats_copy_basic - copy basic statistics into statistic TLV
- * @running: seqcount_t pointer
- * @d: dumping handle
- * @cpu: copy statistic per cpu
- * @b: basic statistics
- *
- * Appends the basic statistics to the top level TLV created by
- * gnet_stats_start_copy().
- *
- * Returns 0 on success or -1 with the statistic lock released
- * if the room in the socket buffer was not sufficient.
- */
 int
-gnet_stats_copy_basic(const seqcount_t *running,
-		      struct gnet_dump *d,
-		      struct gnet_stats_basic_cpu __percpu *cpu,
-		      struct gnet_stats_basic_packed *b)
+___gnet_stats_copy_basic(const seqcount_t *running,
+			 struct gnet_dump *d,
+			 struct gnet_stats_basic_cpu __percpu *cpu,
+			 struct gnet_stats_basic_packed *b,
+			 int type)
 {
 	struct gnet_stats_basic_packed bstats = {0};
 
 	__gnet_stats_copy_basic(running, &bstats, cpu, b);
 
-	if (d->compat_tc_stats) {
+	if (d->compat_tc_stats && type == TCA_STATS_BASIC) {
 		d->tc_stats.bytes = bstats.bytes;
 		d->tc_stats.packets = bstats.packets;
 	}
@@ -196,14 +184,61 @@
 		memset(&sb, 0, sizeof(sb));
 		sb.bytes = bstats.bytes;
 		sb.packets = bstats.packets;
-		return gnet_stats_copy(d, TCA_STATS_BASIC, &sb, sizeof(sb),
+		return gnet_stats_copy(d, type, &sb, sizeof(sb),
 				       TCA_STATS_PAD);
 	}
 	return 0;
 }
+
+/**
+ * gnet_stats_copy_basic - copy basic statistics into statistic TLV
+ * @running: seqcount_t pointer
+ * @d: dumping handle
+ * @cpu: copy statistic per cpu
+ * @b: basic statistics
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic(const seqcount_t *running,
+		      struct gnet_dump *d,
+		      struct gnet_stats_basic_cpu __percpu *cpu,
+		      struct gnet_stats_basic_packed *b)
+{
+	return ___gnet_stats_copy_basic(running, d, cpu, b,
+					TCA_STATS_BASIC);
+}
 EXPORT_SYMBOL(gnet_stats_copy_basic);
 
 /**
+ * gnet_stats_copy_basic_hw - copy basic hw statistics into statistic TLV
+ * @running: seqcount_t pointer
+ * @d: dumping handle
+ * @cpu: copy statistic per cpu
+ * @b: basic statistics
+ *
+ * Appends the basic statistics to the top level TLV created by
+ * gnet_stats_start_copy().
+ *
+ * Returns 0 on success or -1 with the statistic lock released
+ * if the room in the socket buffer was not sufficient.
+ */
+int
+gnet_stats_copy_basic_hw(const seqcount_t *running,
+			 struct gnet_dump *d,
+			 struct gnet_stats_basic_cpu __percpu *cpu,
+			 struct gnet_stats_basic_packed *b)
+{
+	return ___gnet_stats_copy_basic(running, d, cpu, b,
+					TCA_STATS_BASIC_HW);
+}
+EXPORT_SYMBOL(gnet_stats_copy_basic_hw);
+
+/**
  * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
  * @d: dumping handle
  * @rate_est: rate estimator

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* [PATCH 2/2] net/sched: Add hardware specific counters to TC actions
  2018-08-09 15:01 [PATCH 0/2] net/sched: Add hardware specific counters to TC actions Eelco Chaudron
  2018-08-09 15:01 ` [PATCH 1/2] net/core: Add new basic hardware counter Eelco Chaudron
@ 2018-08-09 15:01 ` Eelco Chaudron
  2018-08-10  3:26 ` [PATCH 0/2] " Jakub Kicinski
  2 siblings, 0 replies; 16+ messages in thread
From: Eelco Chaudron @ 2018-08-09 15:01 UTC (permalink / raw)
  To: netdev; +Cc: davem, jhs, xiyou.wangcong, jiri

Add additional counters that will store the bytes/packets processed by
hardware. These will be exported through the netlink interface for
display by the iproute2 tc tool.

Signed-off-by: Eelco Chaudron <echaudro@redhat.com>
---
 include/net/act_api.h  |    8 +++++---
 include/net/pkt_cls.h  |    2 +-
 net/sched/act_api.c    |   14 +++++++++++---
 net/sched/act_gact.c   |    6 +++++-
 net/sched/act_mirred.c |    5 ++++-
 5 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index 8c9bc02..9931d8a 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -33,10 +33,12 @@ struct tc_action {
 	int				tcfa_action;
 	struct tcf_t			tcfa_tm;
 	struct gnet_stats_basic_packed	tcfa_bstats;
+	struct gnet_stats_basic_packed	tcfa_bstats_hw;
 	struct gnet_stats_queue		tcfa_qstats;
 	struct net_rate_estimator __rcu *tcfa_rate_est;
 	spinlock_t			tcfa_lock;
 	struct gnet_stats_basic_cpu __percpu *cpu_bstats;
+	struct gnet_stats_basic_cpu __percpu *cpu_bstats_hw;
 	struct gnet_stats_queue __percpu *cpu_qstats;
 	struct tc_cookie	__rcu *act_cookie;
 	struct tcf_chain	*goto_chain;
@@ -98,7 +100,7 @@ struct tc_action_ops {
 			struct netlink_callback *, int,
 			const struct tc_action_ops *,
 			struct netlink_ext_ack *);
-	void	(*stats_update)(struct tc_action *, u64, u32, u64);
+	void	(*stats_update)(struct tc_action *, u64, u32, u64, bool);
 	size_t  (*get_fill_size)(const struct tc_action *act);
 	struct net_device *(*get_dev)(const struct tc_action *a);
 	int     (*delete)(struct net *net, u32 index);
@@ -189,13 +191,13 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind,
 #endif /* CONFIG_NET_CLS_ACT */
 
 static inline void tcf_action_stats_update(struct tc_action *a, u64 bytes,
-					   u64 packets, u64 lastuse)
+					   u64 packets, u64 lastuse, bool hw)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	if (!a->ops->stats_update)
 		return;
 
-	a->ops->stats_update(a, bytes, packets, lastuse);
+	a->ops->stats_update(a, bytes, packets, lastuse, hw);
 #endif
 }
 
diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index ef727f7..de1f06a 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -324,7 +324,7 @@ static inline void tcf_exts_to_list(const struct tcf_exts *exts,
 	for (i = 0; i < exts->nr_actions; i++) {
 		struct tc_action *a = exts->actions[i];
 
-		tcf_action_stats_update(a, bytes, packets, lastuse);
+		tcf_action_stats_update(a, bytes, packets, lastuse, true);
 	}
 
 	preempt_enable();
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 229d63c..9ab3061 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -81,6 +81,7 @@ static void tcf_set_action_cookie(struct tc_cookie __rcu **old_cookie,
 static void free_tcf(struct tc_action *p)
 {
 	free_percpu(p->cpu_bstats);
+	free_percpu(p->cpu_bstats_hw);
 	free_percpu(p->cpu_qstats);
 
 	tcf_set_action_cookie(&p->act_cookie, NULL);
@@ -390,9 +391,12 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 		p->cpu_bstats = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
 		if (!p->cpu_bstats)
 			goto err1;
+		p->cpu_bstats_hw = netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
+		if (!p->cpu_bstats_hw)
+			goto err2;
 		p->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
 		if (!p->cpu_qstats)
-			goto err2;
+			goto err3;
 	}
 	spin_lock_init(&p->tcfa_lock);
 	p->tcfa_index = index;
@@ -404,7 +408,7 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 					&p->tcfa_rate_est,
 					&p->tcfa_lock, NULL, est);
 		if (err)
-			goto err3;
+			goto err4;
 	}
 
 	p->idrinfo = idrinfo;
@@ -412,8 +416,10 @@ int tcf_idr_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
 	INIT_LIST_HEAD(&p->list);
 	*a = p;
 	return 0;
-err3:
+err4:
 	free_percpu(p->cpu_qstats);
+err3:
+	free_percpu(p->cpu_bstats_hw);
 err2:
 	free_percpu(p->cpu_bstats);
 err1:
@@ -988,6 +994,8 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
 		goto errout;
 
 	if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
+	    gnet_stats_copy_basic_hw(NULL, &d, p->cpu_bstats_hw,
+				     &p->tcfa_bstats_hw) < 0 ||
 	    gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
 	    gnet_stats_copy_queue(&d, p->cpu_qstats,
 				  &p->tcfa_qstats,
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 661b72b..1fe6825 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -155,7 +155,7 @@ static int tcf_gact(struct sk_buff *skb, const struct tc_action *a,
 }
 
 static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
-				  u64 lastuse)
+				  u64 lastuse, bool hw)
 {
 	struct tcf_gact *gact = to_gact(a);
 	int action = READ_ONCE(gact->tcf_action);
@@ -166,6 +166,10 @@ static void tcf_gact_stats_update(struct tc_action *a, u64 bytes, u32 packets,
 	if (action == TC_ACT_SHOT)
 		this_cpu_ptr(gact->common.cpu_qstats)->drops += packets;
 
+	if (hw)
+		_bstats_cpu_update(this_cpu_ptr(gact->common.cpu_bstats_hw),
+				   bytes, packets);
+
 	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index b26d060..c193583 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -273,12 +273,15 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
 }
 
 static void tcf_stats_update(struct tc_action *a, u64 bytes, u32 packets,
-			     u64 lastuse)
+			     u64 lastuse, bool hw)
 {
 	struct tcf_mirred *m = to_mirred(a);
 	struct tcf_t *tm = &m->tcf_tm;
 
 	_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats), bytes, packets);
+	if (hw)
+		_bstats_cpu_update(this_cpu_ptr(a->cpu_bstats_hw),
+				   bytes, packets);
 	tm->lastuse = max_t(u64, tm->lastuse, lastuse);
 }
 

^ permalink raw reply related	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-09 15:01 [PATCH 0/2] net/sched: Add hardware specific counters to TC actions Eelco Chaudron
  2018-08-09 15:01 ` [PATCH 1/2] net/core: Add new basic hardware counter Eelco Chaudron
  2018-08-09 15:01 ` [PATCH 2/2] net/sched: Add hardware specific counters to TC actions Eelco Chaudron
@ 2018-08-10  3:26 ` Jakub Kicinski
  2018-08-11 19:06   ` David Miller
  2 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2018-08-10  3:26 UTC (permalink / raw)
  To: Eelco Chaudron; +Cc: netdev, davem, jhs, xiyou.wangcong, jiri, Simon Horman

On Thu,  9 Aug 2018 11:01:18 -0400, Eelco Chaudron wrote:
> Add hardware specific counters to TC actions which will be exported
> through the netlink API. This makes troubleshooting TC flower offload
> easier, as it possible to differentiate the packets being offloaded.

It is not immediately clear why this is needed.  The memory and
updating two sets of counters won't come for free, so perhaps a
stronger justification than troubleshooting is due? :S

Netdev has counters for fallback vs forwarded traffic, so you'd know
that traffic hits the SW datapath, plus the rules which are in_hw will
most likely not match as of today for flower (assuming correctness).

I'm slightly concerned about potential performance impact, would you
be able to share some numbers for non-trivial number of flows (100k
active?)?

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-10  3:26 ` [PATCH 0/2] " Jakub Kicinski
@ 2018-08-11 19:06   ` David Miller
  2018-08-16 12:02     ` Eelco Chaudron
  0 siblings, 1 reply; 16+ messages in thread
From: David Miller @ 2018-08-11 19:06 UTC (permalink / raw)
  To: jakub.kicinski; +Cc: echaudro, netdev, jhs, xiyou.wangcong, jiri, simon.horman

From: Jakub Kicinski <jakub.kicinski@netronome.com>
Date: Thu, 9 Aug 2018 20:26:08 -0700

> It is not immediately clear why this is needed.  The memory and
> updating two sets of counters won't come for free, so perhaps a
> stronger justification than troubleshooting is due? :S
> 
> Netdev has counters for fallback vs forwarded traffic, so you'd know
> that traffic hits the SW datapath, plus the rules which are in_hw will
> most likely not match as of today for flower (assuming correctness).
> 
> I'm slightly concerned about potential performance impact, would you
> be able to share some numbers for non-trivial number of flows (100k
> active?)?

Agreed, features used for diagnostics cannot have a harmful penalty for
fast path performance.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-11 19:06   ` David Miller
@ 2018-08-16 12:02     ` Eelco Chaudron
  2018-08-17 11:27       ` Jakub Kicinski
  0 siblings, 1 reply; 16+ messages in thread
From: Eelco Chaudron @ 2018-08-16 12:02 UTC (permalink / raw)
  To: David Miller, jakub.kicinski
  Cc: netdev, jhs, xiyou.wangcong, jiri, simon.horman, Marcelo Ricardo Leitner

On 11 Aug 2018, at 21:06, David Miller wrote:

> From: Jakub Kicinski <jakub.kicinski@netronome.com>
> Date: Thu, 9 Aug 2018 20:26:08 -0700
>
>> It is not immediately clear why this is needed.  The memory and
>> updating two sets of counters won't come for free, so perhaps a
>> stronger justification than troubleshooting is due? :S
>>
>> Netdev has counters for fallback vs forwarded traffic, so you'd know
>> that traffic hits the SW datapath, plus the rules which are in_hw 
>> will
>> most likely not match as of today for flower (assuming correctness).

I strongly believe that these counters are a requirement for a mixed 
software/hardware (flow) based forwarding environment. The global 
counters will not help much here as you might have chosen to have 
certain traffic forwarded by software.

These counters are probably the only option you have to figure out why 
forwarding is not as fast as expected, and you want to blame the TC 
offload NIC.

>> I'm slightly concerned about potential performance impact, would you
>> be able to share some numbers for non-trivial number of flows (100k
>> active?)?
>
> Agreed, features used for diagnostics cannot have a harmful penalty 
> for
> fast path performance.

Fast path performance is not affected as these counters are not 
incremented there. They are only incremented by the nic driver when they 
gather their statistics from hardware.

However, the flow creation is affected, as this is where the extra 
memory gets allocated. I had done some 40K flow tests before and did not 
see any noticeable change in flow insertion performance. As requested by 
Jakub I did it again for 100K (and threw a Netronome blade in the mix 
;). I used Marcelo’s test tool, 
https://github.com/marceloleitner/perf-flower.git.

Here are the numbers (time in seconds) for 10 runs in sorted order:

+-------------+----------------+
| Base_kernel | Change_applied |
+-------------+----------------+
|    5.684019 |       5.656388 |
|    5.699658 |       5.674974 |
|    5.725220 |       5.722107 |
|    5.739285 |       5.839855 |
|    5.748088 |       5.865238 |
|    5.766231 |       5.873913 |
|    5.842264 |       5.909259 |
|    5.902202 |       5.912685 |
|    5.905391 |       5.947138 |
|    6.032997 |       5.997779 |
+-------------+----------------+

I guess the deviation is in the userspace part, which is where in real 
life flows get added anyway.


Let me know if more is unclear.


//Eelco

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-16 12:02     ` Eelco Chaudron
@ 2018-08-17 11:27       ` Jakub Kicinski
  2018-08-20 14:03         ` Eelco Chaudron
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2018-08-17 11:27 UTC (permalink / raw)
  To: Eelco Chaudron
  Cc: David Miller, netdev, jhs, xiyou.wangcong, jiri, simon.horman,
	Marcelo Ricardo Leitner

On Thu, 16 Aug 2018 14:02:44 +0200, Eelco Chaudron wrote:
> On 11 Aug 2018, at 21:06, David Miller wrote:
> 
> > From: Jakub Kicinski <jakub.kicinski@netronome.com>
> > Date: Thu, 9 Aug 2018 20:26:08 -0700
> >  
> >> It is not immediately clear why this is needed.  The memory and
> >> updating two sets of counters won't come for free, so perhaps a
> >> stronger justification than troubleshooting is due? :S
> >>
> >> Netdev has counters for fallback vs forwarded traffic, so you'd know
> >> that traffic hits the SW datapath, plus the rules which are in_hw 
> >> will
> >> most likely not match as of today for flower (assuming correctness).  
> 
> I strongly believe that these counters are a requirement for a mixed 
> software/hardware (flow) based forwarding environment. The global 
> counters will not help much here as you might have chosen to have 
> certain traffic forwarded by software.
> 
> These counters are probably the only option you have to figure out why 
> forwarding is not as fast as expected, and you want to blame the TC 
> offload NIC.

The suggested debugging flow would be:
 (1) check the global counter for fallback are incrementing;
 (2) find a flow with high stats but no in_hw flag set.

The in_hw indication should be sufficient in most cases (unless there
are shared blocks between netdevs of different ASICs...).

> >> I'm slightly concerned about potential performance impact, would you
> >> be able to share some numbers for non-trivial number of flows (100k
> >> active?)?  
> >
> > Agreed, features used for diagnostics cannot have a harmful penalty 
> > for fast path performance.  
> 
> Fast path performance is not affected as these counters are not 
> incremented there. They are only incremented by the nic driver when they 
> gather their statistics from hardware.

Not by much, you are adding state to performance-critical structures,
though, for what is effectively debugging purposes.  

I was mostly talking about the HW offload stat updates (sorry for not
being clear).

We can have some hundreds of thousands active offloaded flows, each of
them can have multiple actions, and stats have to be updated multiple
times per second and dumped probably around once a second, too.  On a
busy system the stats will get evicted from cache between each round.  

But I'm speculating; let's see if I can get some numbers on it (if you
could get some too, that would be great!).

> However, the flow creation is effected, as this is where the extra 
> memory gets allocated. I had done some 40K flow tests before and did not 
> see any noticeable change in flow insertion performance. As requested by 
> Jakub I did it again for 100K (and threw a Netronome blade in the mix 
> ;). I used Marcelo’s test tool, 
> https://github.com/marceloleitner/perf-flower.git.
> 
> Here are the numbers (time in seconds) for 10 runs in sorted order:
> 
> +-------------+----------------+
> | Base_kernel | Change_applied |
> +-------------+----------------+
> |    5.684019 |       5.656388 |
> |    5.699658 |       5.674974 |
> |    5.725220 |       5.722107 |
> |    5.739285 |       5.839855 |
> |    5.748088 |       5.865238 |
> |    5.766231 |       5.873913 |
> |    5.842264 |       5.909259 |
> |    5.902202 |       5.912685 |
> |    5.905391 |       5.947138 |
> |    6.032997 |       5.997779 |
> +-------------+----------------+
> 
> I guess the deviation is in the userspace part, which is where in real 
> life flows get added anyway.
> 
> Let me know if more is unclear.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-17 11:27       ` Jakub Kicinski
@ 2018-08-20 14:03         ` Eelco Chaudron
  2018-08-23 18:14           ` Jakub Kicinski
  0 siblings, 1 reply; 16+ messages in thread
From: Eelco Chaudron @ 2018-08-20 14:03 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, jhs, xiyou.wangcong, jiri, simon.horman,
	Marcelo Ricardo Leitner



On 17 Aug 2018, at 13:27, Jakub Kicinski wrote:

> On Thu, 16 Aug 2018 14:02:44 +0200, Eelco Chaudron wrote:
>> On 11 Aug 2018, at 21:06, David Miller wrote:
>>
>>> From: Jakub Kicinski <jakub.kicinski@netronome.com>
>>> Date: Thu, 9 Aug 2018 20:26:08 -0700
>>>
>>>> It is not immediately clear why this is needed.  The memory and
>>>> updating two sets of counters won't come for free, so perhaps a
>>>> stronger justification than troubleshooting is due? :S
>>>>
>>>> Netdev has counters for fallback vs forwarded traffic, so you'd 
>>>> know
>>>> that traffic hits the SW datapath, plus the rules which are in_hw
>>>> will
>>>> most likely not match as of today for flower (assuming 
>>>> correctness).
>>
>> I strongly believe that these counters are a requirement for a mixed
>> software/hardware (flow) based forwarding environment. The global
>> counters will not help much here as you might have chosen to have
>> certain traffic forwarded by software.
>>
>> These counters are probably the only option you have to figure out 
>> why
>> forwarding is not as fast as expected, and you want to blame the TC
>> offload NIC.
>
> The suggested debugging flow would be:
>  (1) check the global counter for fallback are incrementing;
>  (2) find a flow with high stats but no in_hw flag set.
>
> The in_hw indication should be sufficient in most cases (unless there
> are shared blocks between netdevs of different ASICs...).

I guess the aim is to find misbehaving hardware, i.e. having the in_hw 
flag set, but flows still coming to the kernel.

>>>> I'm slightly concerned about potential performance impact, would 
>>>> you
>>>> be able to share some numbers for non-trivial number of flows (100k
>>>> active?)?
>>>
>>> Agreed, features used for diagnostics cannot have a harmful penalty
>>> for fast path performance.
>>
>> Fast path performance is not affected as these counters are not
>> incremented there. They are only incremented by the nic driver when 
>> they
>> gather their statistics from hardware.
>
> Not by much, you are adding state to performance-critical structures,
> though, for what is effectively debugging purposes.
>
> I was mostly talking about the HW offload stat updates (sorry for not
> being clear).
>
> We can have some hundreds of thousands active offloaded flows, each of
> them can have multiple actions, and stats have to be updated multiple
> times per second and dumped probably around once a second, too.  On a
> busy system the stats will get evicted from cache between each round.
>
> But I'm speculating let's see if I can get some numbers on it (if you
> could get some too, that would be great!).

I’ll try to measure some of this later this week/early next week.

>> However, the flow creation is effected, as this is where the extra
>> memory gets allocated. I had done some 40K flow tests before and did 
>> not
>> see any noticeable change in flow insertion performance. As requested 
>> by
>> Jakub I did it again for 100K (and threw a Netronome blade in the mix
>> ;). I used Marcelo’s test tool,
>> https://github.com/marceloleitner/perf-flower.git.
>>
>> Here are the numbers (time in seconds) for 10 runs in sorted order:
>>
>> +-------------+----------------+
>> | Base_kernel | Change_applied |
>> +-------------+----------------+
>> |    5.684019 |       5.656388 |
>> |    5.699658 |       5.674974 |
>> |    5.725220 |       5.722107 |
>> |    5.739285 |       5.839855 |
>> |    5.748088 |       5.865238 |
>> |    5.766231 |       5.873913 |
>> |    5.842264 |       5.909259 |
>> |    5.902202 |       5.912685 |
>> |    5.905391 |       5.947138 |
>> |    6.032997 |       5.997779 |
>> +-------------+----------------+
>>
>> I guess the deviation is in the userspace part, which is where in 
>> real
>> life flows get added anyway.
>>
>> Let me know if more is unclear.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-20 14:03         ` Eelco Chaudron
@ 2018-08-23 18:14           ` Jakub Kicinski
  2018-08-29  9:43             ` Eelco Chaudron
  2018-08-29 10:23             ` Paolo Abeni
  0 siblings, 2 replies; 16+ messages in thread
From: Jakub Kicinski @ 2018-08-23 18:14 UTC (permalink / raw)
  To: Eelco Chaudron
  Cc: David Miller, netdev, jhs, xiyou.wangcong, jiri, simon.horman,
	Marcelo Ricardo Leitner, louis.peens

On Mon, 20 Aug 2018 16:03:40 +0200, Eelco Chaudron wrote:
> On 17 Aug 2018, at 13:27, Jakub Kicinski wrote:
> > On Thu, 16 Aug 2018 14:02:44 +0200, Eelco Chaudron wrote:  
> >> On 11 Aug 2018, at 21:06, David Miller wrote:
> >>  
> >>> From: Jakub Kicinski <jakub.kicinski@netronome.com>
> >>> Date: Thu, 9 Aug 2018 20:26:08 -0700
> >>>  
> >>>> It is not immediately clear why this is needed.  The memory and
> >>>> updating two sets of counters won't come for free, so perhaps a
> >>>> stronger justification than troubleshooting is due? :S
> >>>>
> >>>> Netdev has counters for fallback vs forwarded traffic, so you'd 
> >>>> know
> >>>> that traffic hits the SW datapath, plus the rules which are in_hw
> >>>> will
> >>>> most likely not match as of today for flower (assuming 
> >>>> correctness).  
> >>
> >> I strongly believe that these counters are a requirement for a mixed
> >> software/hardware (flow) based forwarding environment. The global
> >> counters will not help much here as you might have chosen to have
> >> certain traffic forwarded by software.
> >>
> >> These counters are probably the only option you have to figure out 
> >> why
> >> forwarding is not as fast as expected, and you want to blame the TC
> >> offload NIC.  
> >
> > The suggested debugging flow would be:
> >  (1) check the global counter for fallback are incrementing;
> >  (2) find a flow with high stats but no in_hw flag set.
> >
> > The in_hw indication should be sufficient in most cases (unless there
> > are shared blocks between netdevs of different ASICs...).  
> 
> I guess the aim is to find miss behaving hardware, i.e. having the in_hw 
> flag set, but flows still coming to the kernel.

For misbehaving hardware in_hw will not work indeed.  Whether we need
these extra always-on stats for such use case could be debated :)
 
> >>>> I'm slightly concerned about potential performance impact, would 
> >>>> you
> >>>> be able to share some numbers for non-trivial number of flows (100k
> >>>> active?)?  
> >>>
> >>> Agreed, features used for diagnostics cannot have a harmful penalty
> >>> for fast path performance.  
> >>
> >> Fast path performance is not affected as these counters are not
> >> incremented there. They are only incremented by the nic driver when 
> >> they
> >> gather their statistics from hardware.  
> >
> > Not by much, you are adding state to performance-critical structures,
> > though, for what is effectively debugging purposes.
> >
> > I was mostly talking about the HW offload stat updates (sorry for not
> > being clear).
> >
> > We can have some hundreds of thousands active offloaded flows, each of
> > them can have multiple actions, and stats have to be updated multiple
> > times per second and dumped probably around once a second, too.  On a
> > busy system the stats will get evicted from cache between each round.
> >
> > But I'm speculating let's see if I can get some numbers on it (if you
> > could get some too, that would be great!).  
> 
> I’ll try to measure some of this later this week/early next week.

I asked Louis to run some tests while I'm travelling, and he reports
that my worry about reporting the extra stats was unfounded.  Update
function does not show up in traces at all.  It seems under stress
(generated with stress-ng) the thread dumping the stats in userspace
(in OvS it would be the revalidator) actually consumes less CPU in
__gnet_stats_copy_basic (0.4% less for ~2.0% total).

Would this match with your results?  I'm not sure why dumping would be
faster with your change..

> >> However, the flow creation is effected, as this is where the extra
> >> memory gets allocated. I had done some 40K flow tests before and did 
> >> not
> >> see any noticeable change in flow insertion performance. As requested 
> >> by
> >> Jakub I did it again for 100K (and threw a Netronome blade in the mix
> >> ;). I used Marcelo’s test tool,
> >> https://github.com/marceloleitner/perf-flower.git.
> >>
> >> Here are the numbers (time in seconds) for 10 runs in sorted order:
> >>
> >> +-------------+----------------+
> >> | Base_kernel | Change_applied |
> >> +-------------+----------------+
> >> |    5.684019 |       5.656388 |
> >> |    5.699658 |       5.674974 |
> >> |    5.725220 |       5.722107 |
> >> |    5.739285 |       5.839855 |
> >> |    5.748088 |       5.865238 |
> >> |    5.766231 |       5.873913 |
> >> |    5.842264 |       5.909259 |
> >> |    5.902202 |       5.912685 |
> >> |    5.905391 |       5.947138 |
> >> |    6.032997 |       5.997779 |
> >> +-------------+----------------+
> >>
> >> I guess the deviation is in the userspace part, which is where in 
> >> real
> >> life flows get added anyway.
> >>
> >> Let me know if more is unclear.  

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-23 18:14           ` Jakub Kicinski
@ 2018-08-29  9:43             ` Eelco Chaudron
  2018-08-29 18:12               ` Jakub Kicinski
  2018-08-29 10:23             ` Paolo Abeni
  1 sibling, 1 reply; 16+ messages in thread
From: Eelco Chaudron @ 2018-08-29  9:43 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, jhs, xiyou.wangcong, jiri, simon.horman,
	Marcelo Ricardo Leitner, louis.peens



On 23 Aug 2018, at 20:14, Jakub Kicinski wrote:

> On Mon, 20 Aug 2018 16:03:40 +0200, Eelco Chaudron wrote:
>> On 17 Aug 2018, at 13:27, Jakub Kicinski wrote:
>>> On Thu, 16 Aug 2018 14:02:44 +0200, Eelco Chaudron wrote:
>>>> On 11 Aug 2018, at 21:06, David Miller wrote:
>>>>
>>>>> From: Jakub Kicinski <jakub.kicinski@netronome.com>
>>>>> Date: Thu, 9 Aug 2018 20:26:08 -0700
>>>>>
>>>>>> It is not immediately clear why this is needed.  The memory and
>>>>>> updating two sets of counters won't come for free, so perhaps a
>>>>>> stronger justification than troubleshooting is due? :S
>>>>>>
>>>>>> Netdev has counters for fallback vs forwarded traffic, so you'd
>>>>>> know
>>>>>> that traffic hits the SW datapath, plus the rules which are in_hw
>>>>>> will
>>>>>> most likely not match as of today for flower (assuming
>>>>>> correctness).
>>>>
>>>> I strongly believe that these counters are a requirement for a 
>>>> mixed
>>>> software/hardware (flow) based forwarding environment. The global
>>>> counters will not help much here as you might have chosen to have
>>>> certain traffic forwarded by software.
>>>>
>>>> These counters are probably the only option you have to figure out
>>>> why
>>>> forwarding is not as fast as expected, and you want to blame the TC
>>>> offload NIC.
>>>
>>> The suggested debugging flow would be:
>>>  (1) check the global counter for fallback are incrementing;
>>>  (2) find a flow with high stats but no in_hw flag set.
>>>
>>> The in_hw indication should be sufficient in most cases (unless 
>>> there
>>> are shared blocks between netdevs of different ASICs...).
>>
>> I guess the aim is to find misbehaving hardware, i.e. having the 
>> in_hw
>> flag set, but flows still coming to the kernel.
>
> For misbehaving hardware in_hw will not work indeed.  Whether we need
> these extra always-on stats for such use case could be debated :)
>
>>>>>> I'm slightly concerned about potential performance impact, would
>>>>>> you
>>>>>> be able to share some numbers for non-trivial number of flows 
>>>>>> (100k
>>>>>> active?)?
>>>>>
>>>>> Agreed, features used for diagnostics cannot have a harmful 
>>>>> penalty
>>>>> for fast path performance.
>>>>
>>>> Fast path performance is not affected as these counters are not
>>>> incremented there. They are only incremented by the nic driver when
>>>> they
>>>> gather their statistics from hardware.
>>>
>>> Not by much, you are adding state to performance-critical 
>>> structures,
>>> though, for what is effectively debugging purposes.
>>>
>>> I was mostly talking about the HW offload stat updates (sorry for 
>>> not
>>> being clear).
>>>
>>> We can have some hundreds of thousands active offloaded flows, each 
>>> of
>>> them can have multiple actions, and stats have to be updated 
>>> multiple
>>> times per second and dumped probably around once a second, too.  On 
>>> a
>>> busy system the stats will get evicted from cache between each 
>>> round.
>>>
>>> But I'm speculating let's see if I can get some numbers on it (if 
>>> you
>>> could get some too, that would be great!).
>>
>> I’ll try to measure some of this later this week/early next week.
>
> I asked Louis to run some tests while I'm travelling, and he reports
> that my worry about reporting the extra stats was unfounded.  Update
> function does not show up in traces at all.  It seems under stress
> (generated with stress-ng) the thread dumping the stats in userspace
> (in OvS it would be the revalidator) actually consumes less CPU in
> __gnet_stats_copy_basic (0.4% less for ~2.0% total).
>
> Would this match with your results?  I'm not sure why dumping would be
> faster with your change..

Tested with OVS and https://github.com/chaudron/ovs_perf using 300K TC 
rules installed in HW.

For __gnet_stats_copy_basic() being faster I have (had) a theory. Now 
this function is called twice, and I assumed the first call would cache 
memory and the second call would be faster.

Sampling a lot of perf data, I get an average of 1115ns with the base 
kernel and 954ns with the fix applied, so about ~14%.

Thought I would perf tcf_action_copy_stats() as it is the place updating 
the additional counter. But even in this case, I see a better 
performance with the patch applied.

On average 1358ns with the fix, vs base kernel at 1391ns, so about 
2.3%.

I guess the changes to the tc_action structure got better cache 
alignment.


> >>>> However, the flow creation is affected, as this is where the extra
>>>> memory gets allocated. I had done some 40K flow tests before and 
>>>> did
>>>> not
>>>> see any noticeable change in flow insertion performance. As 
>>>> requested
>>>> by
>>>> Jakub I did it again for 100K (and threw a Netronome blade in the 
>>>> mix
>>>> ;). I used Marcelo’s test tool,
>>>> https://github.com/marceloleitner/perf-flower.git.
>>>>
>>>> Here are the numbers (time in seconds) for 10 runs in sorted order:
>>>>
>>>> +-------------+----------------+
>>>> | Base_kernel | Change_applied |
>>>> +-------------+----------------+
>>>> |    5.684019 |       5.656388 |
>>>> |    5.699658 |       5.674974 |
>>>> |    5.725220 |       5.722107 |
>>>> |    5.739285 |       5.839855 |
>>>> |    5.748088 |       5.865238 |
>>>> |    5.766231 |       5.873913 |
>>>> |    5.842264 |       5.909259 |
>>>> |    5.902202 |       5.912685 |
>>>> |    5.905391 |       5.947138 |
>>>> |    6.032997 |       5.997779 |
>>>> +-------------+----------------+
>>>>
>>>> I guess the deviation is in the userspace part, which is where in
>>>> real
>>>> life flows get added anyway.
>>>>
>>>> Let me know if more is unclear.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-23 18:14           ` Jakub Kicinski
  2018-08-29  9:43             ` Eelco Chaudron
@ 2018-08-29 10:23             ` Paolo Abeni
  2018-08-29 18:06               ` Jakub Kicinski
  1 sibling, 1 reply; 16+ messages in thread
From: Paolo Abeni @ 2018-08-29 10:23 UTC (permalink / raw)
  To: Jakub Kicinski, Eelco Chaudron
  Cc: David Miller, netdev, jhs, xiyou.wangcong, jiri, simon.horman,
	Marcelo Ricardo Leitner, louis.peens

On Thu, 2018-08-23 at 20:14 +0200, Jakub Kicinski wrote:
> I asked Louis to run some tests while I'm travelling, and he reports
> that my worry about reporting the extra stats was unfounded.  Update
> function does not show up in traces at all.  It seems under stress
> (generated with stress-ng) the thread dumping the stats in userspace
> (in OvS it would be the revalidator) actually consumes less CPU in
> __gnet_stats_copy_basic (0.4% less for ~2.0% total).
> 
> Would this match with your results?  I'm not sure why dumping would be
> faster with your change..

Wild guess on my side: the relevant patch changes a bit the binary
layout of the 'tc_action' struct, possibly (I still need to check with
pahole) moving the tcf_lock and the stats field on different
cachelines, reducing false sharing that could affect badly such test.

Cheers,

Paolo

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-29 10:23             ` Paolo Abeni
@ 2018-08-29 18:06               ` Jakub Kicinski
  0 siblings, 0 replies; 16+ messages in thread
From: Jakub Kicinski @ 2018-08-29 18:06 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: Eelco Chaudron, David Miller, netdev, jhs, xiyou.wangcong, jiri,
	simon.horman, Marcelo Ricardo Leitner, louis.peens

On Wed, 29 Aug 2018 12:23:15 +0200, Paolo Abeni wrote:
> On Thu, 2018-08-23 at 20:14 +0200, Jakub Kicinski wrote:
> > I asked Louis to run some tests while I'm travelling, and he reports
> > that my worry about reporting the extra stats was unfounded.  Update
> > function does not show up in traces at all.  It seems under stress
> > (generated with stress-ng) the thread dumping the stats in userspace
> > (in OvS it would be the revalidator) actually consumes less CPU in
> > __gnet_stats_copy_basic (0.4% less for ~2.0% total).
> > 
> > Would this match with your results?  I'm not sure why dumping would be
> > faster with your change..  
> 
> Wild guess on my side: the relevant patch changes a bit the binary
> layout of the 'tc_action' struct, possibly (I still need to check with
> pahole) moving the tcf_lock and the stats field on different
> cachelines, reducing false sharing that could affect badly such test.

I think in our tests we tried with and without pinning relevant
processing to one core, and both results shown improvement.  I don't
have the actual samples any more, just perf script dump without CPU
IDs to confirm things were pinned correctly.. :(

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-29  9:43             ` Eelco Chaudron
@ 2018-08-29 18:12               ` Jakub Kicinski
  2018-09-20  7:14                 ` Eelco Chaudron
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2018-08-29 18:12 UTC (permalink / raw)
  To: Eelco Chaudron
  Cc: David Miller, netdev, jhs, xiyou.wangcong, jiri, simon.horman,
	Marcelo Ricardo Leitner, louis.peens

On Wed, 29 Aug 2018 11:43:47 +0200, Eelco Chaudron wrote:
> On 23 Aug 2018, at 20:14, Jakub Kicinski wrote:
> 
> > On Mon, 20 Aug 2018 16:03:40 +0200, Eelco Chaudron wrote:  
> >> On 17 Aug 2018, at 13:27, Jakub Kicinski wrote:  
> >>> On Thu, 16 Aug 2018 14:02:44 +0200, Eelco Chaudron wrote:  
> >>>> On 11 Aug 2018, at 21:06, David Miller wrote:
> >>>>  
> >>>>> From: Jakub Kicinski <jakub.kicinski@netronome.com>
> >>>>> Date: Thu, 9 Aug 2018 20:26:08 -0700
> >>>>>  
> >>>>>> It is not immediately clear why this is needed.  The memory and
> >>>>>> updating two sets of counters won't come for free, so perhaps a
> >>>>>> stronger justification than troubleshooting is due? :S
> >>>>>>
> >>>>>> Netdev has counters for fallback vs forwarded traffic, so you'd
> >>>>>> know
> >>>>>> that traffic hits the SW datapath, plus the rules which are in_hw
> >>>>>> will
> >>>>>> most likely not match as of today for flower (assuming
> >>>>>> correctness).  
> >>>>
> >>>> I strongly believe that these counters are a requirement for a 
> >>>> mixed
> >>>> software/hardware (flow) based forwarding environment. The global
> >>>> counters will not help much here as you might have chosen to have
> >>>> certain traffic forwarded by software.
> >>>>
> >>>> These counters are probably the only option you have to figure out
> >>>> why
> >>>> forwarding is not as fast as expected, and you want to blame the TC
> >>>> offload NIC.  
> >>>
> >>> The suggested debugging flow would be:
> >>>  (1) check the global counter for fallback are incrementing;
> >>>  (2) find a flow with high stats but no in_hw flag set.
> >>>
> >>> The in_hw indication should be sufficient in most cases (unless 
> >>> there
> >>> are shared blocks between netdevs of different ASICs...).  
> >>
> >> I guess the aim is to find misbehaving hardware, i.e. having the 
> >> in_hw
> >> flag set, but flows still coming to the kernel.  
> >
> > For misbehaving hardware in_hw will not work indeed.  Whether we need
> > these extra always-on stats for such use case could be debated :)
> >  
> >>>>>> I'm slightly concerned about potential performance impact, would
> >>>>>> you
> >>>>>> be able to share some numbers for non-trivial number of flows 
> >>>>>> (100k
> >>>>>> active?)?  
> >>>>>
> >>>>> Agreed, features used for diagnostics cannot have a harmful 
> >>>>> penalty
> >>>>> for fast path performance.  
> >>>>
> >>>> Fast path performance is not affected as these counters are not
> >>>> incremented there. They are only incremented by the nic driver when
> >>>> they
> >>>> gather their statistics from hardware.  
> >>>
> >>> Not by much, you are adding state to performance-critical 
> >>> structures,
> >>> though, for what is effectively debugging purposes.
> >>>
> >>> I was mostly talking about the HW offload stat updates (sorry for 
> >>> not
> >>> being clear).
> >>>
> >>> We can have some hundreds of thousands active offloaded flows, each 
> >>> of
> >>> them can have multiple actions, and stats have to be updated 
> >>> multiple
> >>> times per second and dumped probably around once a second, too.  On 
> >>> a
> >>> busy system the stats will get evicted from cache between each 
> >>> round.
> >>>
> >>> But I'm speculating let's see if I can get some numbers on it (if 
> >>> you
> >>> could get some too, that would be great!).  
> >>
> >> I’ll try to measure some of this later this week/early next week.  
> >
> > I asked Louis to run some tests while I'm travelling, and he reports
> > that my worry about reporting the extra stats was unfounded.  Update
> > function does not show up in traces at all.  It seems under stress
> > (generated with stress-ng) the thread dumping the stats in userspace
> > (in OvS it would be the revalidator) actually consumes less CPU in
> > __gnet_stats_copy_basic (0.4% less for ~2.0% total).
> >
> > Would this match with your results?  I'm not sure why dumping would be
> > faster with your change..  
> 
> Tested with OVS and https://github.com/chaudron/ovs_perf using 300K TC 
> rules installed in HW.
> 
> For __gnet_stats_copy_basic() being faster I have (had) a theory. Now 
> this function is called twice, and I assumed the first call would cache 
> memory and the second call would be faster.
> 
> Sampling a lot of perf data, I get an average of 1115ns with the base 
> kernel and 954ns with the fix applied, so about ~14%.
> 
> Thought I would perf tcf_action_copy_stats() as it is the place updating 
> the additional counter. But even in this case, I see a better 
> performance with the patch applied.
> 
> On average 1358ns with the fix, vs base kernel at 1391ns, so about 
> 2.3%.
> 
> I guess the changes to the tc_action structure got better cache 
> alignment.

Interesting you could reproduce the speed up too!  +1 for the guess.
Seems like my caution about slowing down SW paths to support HW offload
landed on a very unfortunate patch :)

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-08-29 18:12               ` Jakub Kicinski
@ 2018-09-20  7:14                 ` Eelco Chaudron
  2018-09-20 14:14                   ` Jakub Kicinski
  0 siblings, 1 reply; 16+ messages in thread
From: Eelco Chaudron @ 2018-09-20  7:14 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, jhs, xiyou.wangcong, jiri, simon.horman,
	Marcelo Ricardo Leitner, louis.peens, Paolo, Davide Caratti



On 29 Aug 2018, at 20:12, Jakub Kicinski wrote:

> On Wed, 29 Aug 2018 11:43:47 +0200, Eelco Chaudron wrote:
>> On 23 Aug 2018, at 20:14, Jakub Kicinski wrote:
>>
>>> On Mon, 20 Aug 2018 16:03:40 +0200, Eelco Chaudron wrote:
>>>> On 17 Aug 2018, at 13:27, Jakub Kicinski wrote:
>>>>> On Thu, 16 Aug 2018 14:02:44 +0200, Eelco Chaudron wrote:
>>>>>> On 11 Aug 2018, at 21:06, David Miller wrote:
>>>>>>
>>>>>>> From: Jakub Kicinski <jakub.kicinski@netronome.com>
>>>>>>> Date: Thu, 9 Aug 2018 20:26:08 -0700
>>>>>>>
>>>>>>>> It is not immediately clear why this is needed.  The memory and
>>>>>>>> updating two sets of counters won't come for free, so perhaps a
>>>>>>>> stronger justification than troubleshooting is due? :S
>>>>>>>>
>>>>>>>> Netdev has counters for fallback vs forwarded traffic, so you'd
>>>>>>>> know
>>>>>>>> that traffic hits the SW datapath, plus the rules which are 
>>>>>>>> in_hw
>>>>>>>> will
>>>>>>>> most likely not match as of today for flower (assuming
>>>>>>>> correctness).
>>>>>>
>>>>>> I strongly believe that these counters are a requirement for a
>>>>>> mixed
>>>>>> software/hardware (flow) based forwarding environment. The global
>>>>>> counters will not help much here as you might have chosen to have
>>>>>> certain traffic forwarded by software.
>>>>>>
>>>>>> These counters are probably the only option you have to figure 
>>>>>> out
>>>>>> why
>>>>>> forwarding is not as fast as expected, and you want to blame the 
>>>>>> TC
>>>>>> offload NIC.
>>>>>
>>>>> The suggested debugging flow would be:
>>>>>  (1) check the global counter for fallback are incrementing;
>>>>>  (2) find a flow with high stats but no in_hw flag set.
>>>>>
>>>>> The in_hw indication should be sufficient in most cases (unless
>>>>> there
>>>>> are shared blocks between netdevs of different ASICs...).
>>>>
>>>> I guess the aim is to find misbehaving hardware, i.e. having the
>>>> in_hw
>>>> flag set, but flows still coming to the kernel.
>>>
>>> For misbehaving hardware in_hw will not work indeed.  Whether we 
>>> need
>>> these extra always-on stats for such use case could be debated :)
>>>
>>>>>>>> I'm slightly concerned about potential performance impact, 
>>>>>>>> would
>>>>>>>> you
>>>>>>>> be able to share some numbers for non-trivial number of flows
>>>>>>>> (100k
>>>>>>>> active?)?
>>>>>>>
>>>>>>> Agreed, features used for diagnostics cannot have a harmful
>>>>>>> penalty
>>>>>>> for fast path performance.
>>>>>>
>>>>>> Fast path performance is not affected as these counters are not
>>>>>> incremented there. They are only incremented by the nic driver 
>>>>>> when
>>>>>> they
>>>>>> gather their statistics from hardware.
>>>>>
>>>>> Not by much, you are adding state to performance-critical
>>>>> structures,
>>>>> though, for what is effectively debugging purposes.
>>>>>
>>>>> I was mostly talking about the HW offload stat updates (sorry for
>>>>> not
>>>>> being clear).
>>>>>
>>>>> We can have some hundreds of thousands active offloaded flows, 
>>>>> each
>>>>> of
>>>>> them can have multiple actions, and stats have to be updated
>>>>> multiple
>>>>> times per second and dumped probably around once a second, too.  
>>>>> On
>>>>> a
>>>>> busy system the stats will get evicted from cache between each
>>>>> round.
>>>>>
>>>>> But I'm speculating let's see if I can get some numbers on it (if
>>>>> you
>>>>> could get some too, that would be great!).
>>>>
>>>> I’ll try to measure some of this later this week/early next week.
>>>
>>> I asked Louis to run some tests while I'm travelling, and he reports
>>> that my worry about reporting the extra stats was unfounded.  Update
>>> function does not show up in traces at all.  It seems under stress
>>> (generated with stress-ng) the thread dumping the stats in userspace
>>> (in OvS it would be the revalidator) actually consumes less CPU in
>>> __gnet_stats_copy_basic (0.4% less for ~2.0% total).
>>>
>>> Would this match with your results?  I'm not sure why dumping would 
>>> be
>>> faster with your change..
>>
>> Tested with OVS and https://github.com/chaudron/ovs_perf using 300K 
>> TC
>> rules installed in HW.
>>
>> For __gnet_stats_copy_basic() being faster I have (had) a theory. Now
>> this function is called twice, and I assumed the first call would 
>> cache
>> memory and the second call would be faster.
>>
>> Sampling a lot of perf data, I get an average of 1115ns with the base
>> kernel and 954ns with the fix applied, so about ~14%.
>>
>> Thought I would perf tcf_action_copy_stats() as it is the place 
>> updating
>> the additional counter. But even in this case, I see a better
>> performance with the patch applied.
>>
>> On average 1358ns with the fix, vs base kernel at 1391ns, so about
>> 2.3%.
>>
>> I guess the changes to the tc_action structure got better cache
>> alignment.
>
> Interesting you could reproduce the speed up too!  +1 for the guess.
> Seems like my caution about slowing down SW paths to support HW 
> offload
> landed on a very unfortunate patch :)

Is there anything else blocking from getting this into net-next?

I still think this patch is beneficial for the full user experience, and 
I’ve got requests from QA and others for this.

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-09-20  7:14                 ` Eelco Chaudron
@ 2018-09-20 14:14                   ` Jakub Kicinski
  2018-09-21 11:14                     ` Eelco Chaudron
  0 siblings, 1 reply; 16+ messages in thread
From: Jakub Kicinski @ 2018-09-20 14:14 UTC (permalink / raw)
  To: Eelco Chaudron
  Cc: David Miller, netdev, jhs, xiyou.wangcong, jiri, simon.horman,
	Marcelo Ricardo Leitner, louis.peens, Paolo, Davide Caratti

On Thu, 20 Sep 2018 09:14:08 +0200, Eelco Chaudron wrote:
> Is there anything else blocking from getting this into net-next?
> 
> I still think this patch is beneficial for the full user experience, and 
> I’ve got requests from QA and others for this.

Not from my perspective, the numbers look okay, feel free to repost!

^ permalink raw reply	[flat|nested] 16+ messages in thread

* Re: [PATCH 0/2] net/sched: Add hardware specific counters to TC actions
  2018-09-20 14:14                   ` Jakub Kicinski
@ 2018-09-21 11:14                     ` Eelco Chaudron
  0 siblings, 0 replies; 16+ messages in thread
From: Eelco Chaudron @ 2018-09-21 11:14 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: David Miller, netdev, jhs, xiyou.wangcong, jiri, simon.horman,
	Marcelo Ricardo Leitner, louis.peens, Paolo, Davide Caratti



On 20 Sep 2018, at 16:14, Jakub Kicinski wrote:

> On Thu, 20 Sep 2018 09:14:08 +0200, Eelco Chaudron wrote:
>> Is there anything else blocking from getting this into net-next?
>>
>> I still think this patch is beneficial for the full user experience, and
>> I’ve got requests from QA and others for this.
>
> Not from my perspective, the numbers look okay, feel free to repost!

Thanks, I sent out a v2 re-based on the latest net-next.

^ permalink raw reply	[flat|nested] 16+ messages in thread

end of thread, other threads:[~2018-09-21 17:03 UTC | newest]

Thread overview: 16+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-08-09 15:01 [PATCH 0/2] net/sched: Add hardware specific counters to TC actions Eelco Chaudron
2018-08-09 15:01 ` [PATCH 1/2] net/core: Add new basic hardware counter Eelco Chaudron
2018-08-09 15:01 ` [PATCH 2/2] net/sched: Add hardware specific counters to TC actions Eelco Chaudron
2018-08-10  3:26 ` [PATCH 0/2] " Jakub Kicinski
2018-08-11 19:06   ` David Miller
2018-08-16 12:02     ` Eelco Chaudron
2018-08-17 11:27       ` Jakub Kicinski
2018-08-20 14:03         ` Eelco Chaudron
2018-08-23 18:14           ` Jakub Kicinski
2018-08-29  9:43             ` Eelco Chaudron
2018-08-29 18:12               ` Jakub Kicinski
2018-09-20  7:14                 ` Eelco Chaudron
2018-09-20 14:14                   ` Jakub Kicinski
2018-09-21 11:14                     ` Eelco Chaudron
2018-08-29 10:23             ` Paolo Abeni
2018-08-29 18:06               ` Jakub Kicinski

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.