* [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route()
@ 2014-10-20 20:42 Martin KaFai Lau
2014-10-20 20:42 ` [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro Martin KaFai Lau
` (5 more replies)
0 siblings, 6 replies; 11+ messages in thread
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
To: netdev
Hi,
This patch set is trying to reduce the number of fib6_lookup()
calls from ip6_pol_route().
I have adapted davem's udpflood and kbench_mod tests
(https://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git) to
support IPv6 and here is the result:
Before:
[root]# for i in $(seq 1 3); do time ./udpflood -l 20000000 -c 250 2401:face:face:face::2; done
real 0m34.190s
user 0m3.047s
sys 0m31.108s
real 0m34.635s
user 0m3.125s
sys 0m31.475s
real 0m34.517s
user 0m3.034s
sys 0m31.449s
[root]# insmod ip6_route_kbench.ko oif=2 src=2401:face:face:face::1 dst=2401:face:face:face::2
[ 660.160976] ip6_route_kbench: ip6_route_output tdiff: 933
[ 660.207261] ip6_route_kbench: ip6_route_output tdiff: 988
[ 660.253492] ip6_route_kbench: ip6_route_output tdiff: 896
[ 660.298862] ip6_route_kbench: ip6_route_output tdiff: 898
After:
[root]# for i in $(seq 1 3); do time ./udpflood -l 20000000 -c 250 2401:face:face:face::2; done
real 0m32.695s
user 0m2.925s
sys 0m29.737s
real 0m32.636s
user 0m3.007s
sys 0m29.596s
real 0m32.797s
user 0m2.866s
sys 0m29.898s
[root]# insmod ip6_route_kbench.ko oif=2 src=2401:face:face:face::1 dst=2401:face:face:face::2
[ 881.220793] ip6_route_kbench: ip6_route_output tdiff: 684
[ 881.253477] ip6_route_kbench: ip6_route_output tdiff: 640
[ 881.286867] ip6_route_kbench: ip6_route_output tdiff: 630
[ 881.320749] ip6_route_kbench: ip6_route_output tdiff: 653
/****************************** udpflood.c ******************************/
/* An adaptation of Eric Dumazet's and David Miller's udpflood tool
 * that adds IPv6 support.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <malloc.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <stdint.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#define _GNU_SOURCE
#include <getopt.h>
typedef uint32_t u32;

/* Set by the -d option; when non-zero, print_saddr() prints its argument. */
static int debug = 0;

/* Allow -fstrict-aliasing: one object viewed as generic/IPv4/IPv6 address. */
typedef union sa_u {
	struct sockaddr_storage a46;
	struct sockaddr_in a4;
	struct sockaddr_in6 a6;
} sa_u;
/*
 * Print the command-line synopsis to stdout.
 *
 * The option letters listed here match the getopt() string used by main()
 * ("dl:s:p:c:").
 *
 * Returns -1 so callers can simply write "return usage();".
 */
static int usage(void)
{
	printf("usage: udpflood [ -d ] [ -l count ] [ -s message_size ] [ -p port ] [ -c num_ip_addrs ] IP_ADDRESS\n");
	return -1;
}
/*
 * Return, in host byte order, the 32-bit quantity that send_packets()
 * increments: the whole IPv4 address when the family is PF_INET,
 * otherwise the last 32-bit word of the IPv6 address.
 */
static u32 get_last32h(const sa_u *sa)
{
	if (sa->a46.ss_family == PF_INET)
		return ntohl(sa->a4.sin_addr.s_addr);

	return ntohl(sa->a6.sin6_addr.s6_addr32[3]);
}
/*
 * Store last32h (host byte order) back into the address in network byte
 * order: into the whole IPv4 address when the family is PF_INET, otherwise
 * into the last 32-bit word of the IPv6 address.
 */
static void set_last32h(sa_u *sa, u32 last32h)
{
	if (sa->a46.ss_family == PF_INET)
		sa->a4.sin_addr.s_addr = htonl(last32h);
	else
		sa->a6.sin6_addr.s6_addr32[3] = htonl(last32h);
}
/*
 * Debug helper: print "<msg>: <address>" to stdout.
 *
 * Does nothing unless the global debug flag is set.  If the address family
 * is neither PF_INET nor PF_INET6, or inet_ntop() fails, a placeholder is
 * printed instead of reading an uninitialised buffer.
 */
static void print_saddr(const sa_u *sa, const char *msg)
{
	char buf[64];
	const char *str = NULL;

	if (!debug)
		return;

	switch (sa->a46.ss_family) {
	case PF_INET:
		str = inet_ntop(PF_INET, &sa->a4.sin_addr, buf, sizeof(buf));
		break;
	case PF_INET6:
		str = inet_ntop(PF_INET6, &sa->a6.sin6_addr, buf, sizeof(buf));
		break;
	default:
		break;
	}

	printf("%s: %s\n", msg, str ? str : "<unprintable address>");
}
static int send_packets(const sa_u *sa, size_t num_addrs, int count, int ms=
g_sz)
{
char *msg =3D malloc(msg_sz);
sa_u saddr;
u32 start_addr32h, end_addr32h, cur_addr32h;
int fd, i, err;
if (!msg)
return -ENOMEM;
memset(msg, 0, msg_sz);
memcpy(&saddr, sa, sizeof(saddr));
cur_addr32h =3D start_addr32h =3D get_last32h(&saddr);
end_addr32h =3D start_addr32h + num_addrs;
fd =3D socket(saddr.a46.ss_family, SOCK_DGRAM, 0);
if (fd < 0) {
perror("socket");
err =3D fd;
goto out_nofd;
}
/* connect to avoid the kernel spending time in figuring
* out the source address (i.e pin the src address)
*/
err =3D connect(fd, (struct sockaddr *) &saddr, sizeof(saddr));
if (err < 0) {
perror("connect");
goto out;
}
print_saddr(&saddr, "start_addr");
for (i =3D 0; i < count; i++) {
print_saddr(&saddr, "sendto");
err =3D sendto(fd, msg, msg_sz, 0, (struct sockaddr *)&saddr,
sizeof(saddr));
if (err < 0) {
perror("sendto");
goto out;
}
if (++cur_addr32h >=3D end_addr32h)
cur_addr32h =3D start_addr32h;
set_last32h(&saddr, cur_addr32h);
}
err =3D 0;
out:
close(fd);
out_nofd:
free(msg);
return err;
}
/*
 * Parse a decimal integer option argument into *out.
 * Returns 0 on success, -1 if arg does not start with a number.
 */
static int parse_int_opt(const char *arg, int *out)
{
	return sscanf(arg, "%d", out) == 1 ? 0 : -1;
}

/*
 * Entry point: parse options, build the starting destination address
 * (IPv4 is tried first, then IPv6) and hand over to send_packets().
 *
 * Options: -d debug output, -l packet count, -s message size,
 *          -p destination port, -c number of consecutive addresses.
 *
 * Returns the result of send_packets(), or -1 (via usage()) on any
 * command-line error.
 */
int main(int argc, char **argv, char **envp)
{
	int port, msg_sz, count, num_addrs, ret;
	sa_u start_addr;

	(void)envp;

	port = 6000;
	msg_sz = 32;
	count = 10000000;
	num_addrs = 1;

	while ((ret = getopt(argc, argv, "dl:s:p:c:")) >= 0) {
		switch (ret) {
		case 'l':
			if (parse_int_opt(optarg, &count))
				return usage();
			break;
		case 's':
			if (parse_int_opt(optarg, &msg_sz))
				return usage();
			break;
		case 'p':
			if (parse_int_opt(optarg, &port))
				return usage();
			break;
		case 'c':
			if (parse_int_opt(optarg, &num_addrs))
				return usage();
			break;
		case 'd':
			debug = 1;
			break;
		case '?':
		default:
			return usage();
		}
	}

	if (num_addrs < 1)
		return usage();

	if (!argv[optind])
		return usage();

	/*
	 * Zero the whole union first so that the fields never assigned below
	 * (e.g. sin6_flowinfo, sin6_scope_id, padding) are not stack garbage
	 * when the address is passed to connect()/sendto().
	 */
	memset(&start_addr, 0, sizeof(start_addr));
	start_addr.a4.sin_port = htons(port);

	/* inet_pton() returns 1 only on success; 0 and -1 are both failures. */
	if (inet_pton(PF_INET, argv[optind], &start_addr.a4.sin_addr) == 1)
		start_addr.a46.ss_family = PF_INET;
	else if (inet_pton(PF_INET6, argv[optind],
			   &start_addr.a6.sin6_addr) == 1)
		start_addr.a46.ss_family = PF_INET6;
	else
		return usage();

	return send_packets(&start_addr, num_addrs, count, msg_sz);
}
/****************** ip6_route_kbench_mod.c ******************/
#define pr_fmt(fmt) "ip6_route_kbench: " fmt
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/inet.h>
#include <linux/in6.h>
#include <net/route.h>
#include <net/ip6_route.h>
#include <linux/timex.h>
#include <uapi/linux/icmpv6.h>
/* We can't just use "get_cycles()" as on some platforms, such
* as sparc64, that gives system cycles rather than cpu clock
* cycles.
*/
/* get_tick(): return a raw counter value used to time a single lookup.
 * One implementation is selected per architecture at build time.
 */
#ifdef CONFIG_SPARC64
/* sparc64: read the %tick register directly with inline asm. */
static inline unsigned long long get_tick(void)
{
unsigned long long t;
__asm__ __volatile__("rd %%tick, %0" : "=r" (t));
return t;
}
#elif defined(CONFIG_X86)
/* x86: read the counter through rdtscll(); kbench_init() refuses to run
 * when cpu_has_tsc is false.
 */
static inline unsigned long long get_tick(void)
{
unsigned long long t;
rdtscll(t);
return t;
}
#elif defined(CONFIG_POWERPC)
/* powerpc: get_cycles() is used as-is. */
static inline unsigned long long get_tick(void)
{
return get_cycles();
}
#else
/* No generic fallback: the build fails on any other architecture. */
#error Unsupported architecture, please implement get_tick()
#endif
/* Default number of untimed warm-up lookups run before the timed one. */
#define DEFAULT_WARMUP_COUNT 100000
/* NOTE(review): DEFAULT_DST_IP_ADDR and DEFAULT_SRC_IP_ADDR are not
 * referenced anywhere in the visible source; the addresses actually used
 * are the in6_addr variables below -- confirm whether these can go.
 */
#define DEFAULT_DST_IP_ADDR 0x4a800001
#define DEFAULT_SRC_IP_ADDR 0x00000000
#define DEFAULT_OIF 0
#define DEFAULT_IIF 0
#define DEFAULT_MARK 0x00000000
#define DEFAULT_TOS 0x00
/* Flow description used for every lookup; copied into the flowi6 by
 * flow_init() and into the fake skb by fake_skb_get().
 */
static int flow_oif = DEFAULT_OIF;
static int flow_iif = DEFAULT_IIF;
static u32 flow_mark = DEFAULT_MARK;
static struct in6_addr flow_dst_ip_addr;
static struct in6_addr flow_src_ip_addr;
static int flow_tos = DEFAULT_TOS;
/* Raw text of the "dst" and "src" module parameters; converted to
 * in6_addr form by flow_setup().
 */
static char dst_string[64];
static char src_string[64];
module_param_string(dst, dst_string, sizeof(dst_string), 0);
module_param_string(src, src_string, sizeof(src_string), 0);
/*
 * Convert the optional "dst" and "src" module parameter strings into
 * flow_dst_ip_addr and flow_src_ip_addr.
 *
 * An empty string is skipped, leaving the corresponding address untouched.
 *
 * Returns 0 on success, or -1 after logging the offending string when
 * in6_pton() rejects a non-empty parameter.
 */
static int __init flow_setup(void)
{
	if (dst_string[0] &&
	    !in6_pton(dst_string, -1, &flow_dst_ip_addr.s6_addr[0], -1, NULL)) {
		pr_info("cannot parse \"%s\"\n", dst_string);
		return -1;
	}

	if (src_string[0] &&
	    !in6_pton(src_string, -1, &flow_src_ip_addr.s6_addr[0], -1, NULL)) {
		/* Log the source string, i.e. the one that failed to parse. */
		pr_info("cannot parse \"%s\"\n", src_string);
		return -1;
	}

	return 0;
}
/* Remaining flow fields, exposed as the module parameters oif, iif, mark
 * and tos.
 */
module_param_named(oif, flow_oif, int, 0);
module_param_named(iif, flow_iif, int, 0);
module_param_named(mark, flow_mark, uint, 0);
module_param_named(tos, flow_tos, int, 0);
/* Number of warm-up lookups; exposed as the "count" module parameter. */
static int warmup_count = DEFAULT_WARMUP_COUNT;
module_param_named(count, warmup_count, int, 0);
/*
 * Reset *fl6 to all-zero and fill it from the module-level flow settings.
 * The protocol is always ICMPv6.
 */
static void flow_init(struct flowi6 *fl6)
{
	memset(fl6, 0, sizeof(struct flowi6));

	/* Endpoints of the flow. */
	fl6->saddr = flow_src_ip_addr;
	fl6->daddr = flow_dst_ip_addr;

	/* Interfaces. */
	fl6->flowi6_iif = flow_iif;
	fl6->flowi6_oif = flow_oif;

	/* Remaining selectors. */
	fl6->flowi6_tos = flow_tos;
	fl6->flowi6_mark = flow_mark;
	fl6->flowi6_proto = IPPROTO_ICMPV6;
}
/*
 * Allocate an skb that carries just enough IPv6 header state for
 * ip6_route_input(): input device, addresses, next header and mark, all
 * taken from the module-level flow settings.
 *
 * Returns the skb, or NULL if the allocation fails or the device with
 * index flow_iif does not exist in init_net.
 */
static struct sk_buff * fake_skb_get(void)
{
struct ipv6hdr *hdr;
struct sk_buff *skb;
skb = alloc_skb(4096, GFP_KERNEL);
if (!skb) {
pr_info("Cannot alloc SKB for test\n");
return NULL;
}
skb->dev = __dev_get_by_index(&init_net, flow_iif);
if (skb->dev == NULL) {
pr_info("Input device (%d) does not exist\n", flow_iif);
goto err;
}
/* NOTE(review): the mac/network header offsets are recorded before
 * skb_reserve() is called, so ipv6_hdr(skb) below refers to the offset
 * recorded before the reserve -- confirm this ordering is intended.
 */
skb_reset_mac_header(skb);
skb_reset_network_header(skb);
skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
hdr = ipv6_hdr(skb);
hdr->priority = 0;
hdr->version = 6;
memset(hdr->flow_lbl, 0, sizeof(hdr->flow_lbl));
hdr->payload_len = htons(sizeof(struct icmp6hdr));
hdr->nexthdr = IPPROTO_ICMPV6;
hdr->saddr = flow_src_ip_addr;
hdr->daddr = flow_dst_ip_addr;
skb->protocol = htons(ETH_P_IPV6);
skb->mark = flow_mark;
return skb;
err:
/* Device lookup failed: release the skb allocated above. */
kfree_skb(skb);
return NULL;
}
/*
 * Time one ip6_route_output() call.
 *
 * Runs warmup_count untimed lookups first, releasing each result, then
 * measures a single lookup with get_tick() and logs the tick difference.
 * The flowi6 is rebuilt with flow_init() before every lookup.
 */
static void do_full_output_lookup_bench(void)
{
	unsigned long long t1, t2, tdiff;
	struct rt6_info *rt;
	struct flowi6 fl6;
	int i;

	rt = NULL;
	for (i = 0; i < warmup_count; i++) {
		flow_init(&fl6);
		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl6);
		if (IS_ERR(rt))
			break;
		ip6_rt_put(rt);
	}

	/*
	 * NOTE(review): this only catches ERR_PTR-style results; whether
	 * ip6_route_output() can return one, rather than a route whose
	 * dst.error is set, should be confirmed against its definition.
	 */
	if (IS_ERR(rt)) {
		/* Name the function that was actually called. */
		pr_info("ip6_route_output: err=%ld\n", PTR_ERR(rt));
		return;
	}

	flow_init(&fl6);

	/* Timed region: exactly one lookup between the two counter reads. */
	t1 = get_tick();
	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl6);
	t2 = get_tick();

	if (!IS_ERR(rt))
		ip6_rt_put(rt);

	tdiff = t2 - t1;
	pr_info("ip6_route_output tdiff: %llu\n", tdiff);
}
/*
 * Time one ip6_route_input() call on a fake skb built by fake_skb_get().
 *
 * Runs up to warmup_count untimed lookups first, then measures a single
 * lookup with get_tick() and logs the tick difference.  A lookup counts as
 * failed when the skb ends up with no dst or with init_net's
 * ip6_null_entry.
 */
static void do_full_input_lookup_bench(void)
{
unsigned long long t1, t2, tdiff;
struct sk_buff *skb;
struct rt6_info *rt;
int err, i;
skb = fake_skb_get();
if (skb == NULL)
goto out_free;
err = 0;
/* Warm-up: every lookup runs with bottom halves disabled, and the dst
 * is dropped after each iteration so the skb can be reused.
 */
local_bh_disable();
for (i = 0; i < warmup_count; i++) {
ip6_route_input(skb);
rt = (struct rt6_info *)skb_dst(skb);
err = (!rt || rt == init_net.ipv6.ip6_null_entry);
skb_dst_drop(skb);
if (err)
break;
}
local_bh_enable();
if (err) {
pr_info("Input route lookup fails\n");
goto out_free;
}
/* Timed region: exactly one lookup between the two counter reads. */
local_bh_disable();
t1 = get_tick();
ip6_route_input(skb);
t2 = get_tick();
local_bh_enable();
/* The result is validated only after timing has stopped. */
rt = (struct rt6_info *)skb_dst(skb);
err = (!rt || rt == init_net.ipv6.ip6_null_entry);
skb_dst_drop(skb);
if (err) {
pr_info("Input route lookup fails\n");
goto out_free;
}
tdiff = t2 - t1;
pr_info("ip6_route_input tdiff: %llu\n", tdiff);
out_free:
/* Also reached with skb == NULL when fake_skb_get() failed. */
kfree_skb(skb);
}
/*
 * Run one benchmark pass: the input-path benchmark when an input
 * interface index (flow_iif) was given, the output-path one otherwise.
 */
static void do_full_lookup_bench(void)
{
	if (flow_iif) {
		do_full_input_lookup_bench();
		return;
	}

	do_full_output_lookup_bench();
}
/* Run the lookup benchmark four times in a row. */
static void do_bench(void)
{
	int pass;

	for (pass = 0; pass < 4; pass++)
		do_full_lookup_bench();
}
/*
 * Module entry point: parse the address parameters, log the flow being
 * measured, run the benchmark, then return an error.
 *
 * Returns -EINVAL if flow_setup() fails or (on x86) cpu_has_tsc is false;
 * otherwise -ENODEV once do_bench() has finished.
 */
static int __init kbench_init(void)
{
if (flow_setup())
return -EINVAL;
/* NOTE(review): IP6_FMT and IP6_PRT are not defined in the visible
 * source -- confirm where they come from.
 */
pr_info("flow [IIF(%d),OIF(%d),MARK(0x%08x),D("IP6_FMT"),"
"S("IP6_FMT"),TOS(0x%02x)]\n",
flow_iif, flow_oif, flow_mark,
IP6_PRT(flow_dst_ip_addr),
IP6_PRT(flow_src_ip_addr),
flow_tos);
#if defined(CONFIG_X86)
if (!cpu_has_tsc) {
pr_err("X86 TSC is required, but is unavailable.\n");
return -EINVAL;
}
#endif
pr_info("sizeof(struct rt6_info)==%zu\n", sizeof(struct rt6_info));
do_bench();
/* Always fails, even after a successful run; presumably so that the
 * module does not stay loaded once the results are logged -- confirm.
 */
return -ENODEV;
}
/* Module exit hook: intentionally empty, there is nothing to undo here. */
static void __exit kbench_exit(void)
{
}
module_init(kbench_init);
module_exit(kbench_exit);
MODULE_LICENSE("GPL");
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro
2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
@ 2014-10-20 20:42 ` Martin KaFai Lau
2014-10-20 20:42 ` [PATCH RFC v5 net 2/3] ipv6: Avoid redoing fib6_lookup() for RTF_CACHE hit case Martin KaFai Lau
` (4 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
To: netdev; +Cc: David Miller, Hannes Frederic Sowa
It is the prep work to reduce the number of calls to fib6_lookup().
The BACKTRACK macro could be hard-to-read and error-prone due to
its side effects (mainly goto).
This patch is to:
1. Replace BACKTRACK macro with a function (fib6_backtrack) with the following
return values:
* If it is backtrack-able, returns next fn for retry.
* If it reaches the root, returns NULL.
2. The caller needs to decide if a backtrack is needed (by testing
rt == net->ipv6.ip6_null_entry).
3. Rename the goto labels in ip6_pol_route() to make the next few
patches easier to read.
Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
net/ipv6/route.c | 70 ++++++++++++++++++++++++++++++++------------------------
1 file changed, 40 insertions(+), 30 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a318dd89..f1ab2f4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -772,23 +772,22 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
}
#endif
-#define BACKTRACK(__net, saddr) \
-do { \
- if (rt == __net->ipv6.ip6_null_entry) { \
- struct fib6_node *pn; \
- while (1) { \
- if (fn->fn_flags & RTN_TL_ROOT) \
- goto out; \
- pn = fn->parent; \
- if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
- fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
- else \
- fn = pn; \
- if (fn->fn_flags & RTN_RTINFO) \
- goto restart; \
- } \
- } \
-} while (0)
+static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
+ struct in6_addr *saddr)
+{
+ struct fib6_node *pn;
+ while (1) {
+ if (fn->fn_flags & RTN_TL_ROOT)
+ return NULL;
+ pn = fn->parent;
+ if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
+ fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
+ else
+ fn = pn;
+ if (fn->fn_flags & RTN_RTINFO)
+ return fn;
+ }
+}
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
struct fib6_table *table,
@@ -804,8 +803,11 @@ restart:
rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
- BACKTRACK(net, &fl6->saddr);
-out:
+ if (rt == net->ipv6.ip6_null_entry) {
+ fn = fib6_backtrack(fn, &fl6->saddr);
+ if (fn)
+ goto restart;
+ }
dst_use(&rt->dst, jiffies);
read_unlock_bh(&table->tb6_lock);
return rt;
@@ -924,19 +926,25 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
strict |= flags & RT6_LOOKUP_F_IFACE;
-relookup:
+redo_fib6_lookup_lock:
read_lock_bh(&table->tb6_lock);
-restart_2:
+redo_fib6_lookup:
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
-restart:
+redo_rt6_select:
rt = rt6_select(fn, oif, strict | reachable);
if (rt->rt6i_nsiblings)
rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
- BACKTRACK(net, &fl6->saddr);
- if (rt == net->ipv6.ip6_null_entry ||
- rt->rt6i_flags & RTF_CACHE)
+ if (rt == net->ipv6.ip6_null_entry) {
+ fn = fib6_backtrack(fn, &fl6->saddr);
+ if (fn)
+ goto redo_rt6_select;
+ else
+ goto out;
+ }
+
+ if (rt->rt6i_flags & RTF_CACHE)
goto out;
dst_hold(&rt->dst);
@@ -967,12 +975,12 @@ restart:
* released someone could insert this route. Relookup.
*/
ip6_rt_put(rt);
- goto relookup;
+ goto redo_fib6_lookup_lock;
out:
if (reachable) {
reachable = 0;
- goto restart_2;
+ goto redo_fib6_lookup;
}
dst_hold(&rt->dst);
read_unlock_bh(&table->tb6_lock);
@@ -1235,10 +1243,12 @@ restart:
rt = net->ipv6.ip6_null_entry;
else if (rt->dst.error) {
rt = net->ipv6.ip6_null_entry;
- goto out;
+ } else if (rt == net->ipv6.ip6_null_entry) {
+ fn = fib6_backtrack(fn, &fl6->saddr);
+ if (fn)
+ goto restart;
}
- BACKTRACK(net, &fl6->saddr);
-out:
+
dst_hold(&rt->dst);
read_unlock_bh(&table->tb6_lock);
--
1.8.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH RFC v5 net 2/3] ipv6: Avoid redoing fib6_lookup() for RTF_CACHE hit case
2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
2014-10-20 20:42 ` [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro Martin KaFai Lau
@ 2014-10-20 20:42 ` Martin KaFai Lau
2014-10-20 20:42 ` [PATCH RFC v5 net 3/3] ipv6: Avoid redoing fib6_lookup() with reachable = 0 by saving fn Martin KaFai Lau
` (3 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
To: netdev; +Cc: David Miller, Hannes Frederic Sowa
When there is a RTF_CACHE hit, no need to redo fib6_lookup()
with reachable=0.
Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
net/ipv6/route.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f1ab2f4..98c523f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -944,12 +944,12 @@ redo_rt6_select:
goto out;
}
- if (rt->rt6i_flags & RTF_CACHE)
- goto out;
-
dst_hold(&rt->dst);
read_unlock_bh(&table->tb6_lock);
+ if (rt->rt6i_flags & RTF_CACHE)
+ goto out2;
+
if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
else if (!(rt->dst.flags & DST_HOST))
--
1.8.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH RFC v5 net 3/3] ipv6: Avoid redoing fib6_lookup() with reachable = 0 by saving fn
2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
2014-10-20 20:42 ` [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro Martin KaFai Lau
2014-10-20 20:42 ` [PATCH RFC v5 net 2/3] ipv6: Avoid redoing fib6_lookup() for RTF_CACHE hit case Martin KaFai Lau
@ 2014-10-20 20:42 ` Martin KaFai Lau
2014-10-24 4:15 ` [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() David Miller
` (2 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
To: netdev; +Cc: David Miller, Hannes Frederic Sowa
This patch saves the fn before calling fib6_backtrack().
Hence, without redo-ing the fib6_lookup(), saved_fn can be used
to redo rt6_select() with RT6_LOOKUP_F_REACHABLE off.
Some minor changes I think make sense to review as a single patch:
* Remove the 'out:' goto label.
* Remove the 'reachable' variable. Only use the 'strict' variable instead.
After this patch, "failing ip6_ins_rt()" should be the only case that
requires a redo of fib6_lookup().
Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
net/ipv6/route.c | 30 ++++++++++++++++--------------
1 file changed, 16 insertions(+), 14 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 98c523f..c910831 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -917,31 +917,40 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
struct flowi6 *fl6, int flags)
{
- struct fib6_node *fn;
+ struct fib6_node *fn, *saved_fn;
struct rt6_info *rt, *nrt;
int strict = 0;
int attempts = 3;
int err;
- int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
strict |= flags & RT6_LOOKUP_F_IFACE;
+ if (net->ipv6.devconf_all->forwarding == 0)
+ strict |= RT6_LOOKUP_F_REACHABLE;
redo_fib6_lookup_lock:
read_lock_bh(&table->tb6_lock);
-redo_fib6_lookup:
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+ saved_fn = fn;
redo_rt6_select:
- rt = rt6_select(fn, oif, strict | reachable);
+ rt = rt6_select(fn, oif, strict);
if (rt->rt6i_nsiblings)
- rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
+ rt = rt6_multipath_select(rt, fl6, oif, strict);
if (rt == net->ipv6.ip6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto redo_rt6_select;
- else
- goto out;
+ else if (strict & RT6_LOOKUP_F_REACHABLE) {
+ /* also consider unreachable route */
+ strict &= ~RT6_LOOKUP_F_REACHABLE;
+ fn = saved_fn;
+ goto redo_rt6_select;
+ } else {
+ dst_hold(&rt->dst);
+ read_unlock_bh(&table->tb6_lock);
+ goto out2;
+ }
}
dst_hold(&rt->dst);
@@ -977,13 +986,6 @@ redo_rt6_select:
ip6_rt_put(rt);
goto redo_fib6_lookup_lock;
-out:
- if (reachable) {
- reachable = 0;
- goto redo_fib6_lookup;
- }
- dst_hold(&rt->dst);
- read_unlock_bh(&table->tb6_lock);
out2:
rt->dst.lastuse = jiffies;
rt->dst.__use++;
--
1.8.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route()
2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
` (2 preceding siblings ...)
2014-10-20 20:42 ` [PATCH RFC v5 net 3/3] ipv6: Avoid redoing fib6_lookup() with reachable = 0 by saving fn Martin KaFai Lau
@ 2014-10-24 4:15 ` David Miller
2014-10-24 17:28 ` Martin Lau
2014-12-02 18:41 ` [net_test_tools] udpflood: Add IPv6 support Martin KaFai Lau
2014-12-02 20:37 ` [PATCH net-next] tcp: Add TCP tracer Martin KaFai Lau
5 siblings, 1 reply; 11+ messages in thread
From: David Miller @ 2014-10-24 4:15 UTC (permalink / raw)
To: kafai; +Cc: netdev
From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 20 Oct 2014 13:42:42 -0700
> This patch set is trying to reduce the number of fib6_lookup()
> calls from ip6_pol_route().
>
> I have adapted davem's udpflooda and kbench_mod test
> (https://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git) to
> support IPv6 and here is the result:
Series applied, thanks.
Can you cook up some clean patches against the net_test_tools repo so
that people can use it for both ipv4 and ipv6 route lookup measurements?
Thanks.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route()
2014-10-24 4:15 ` [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() David Miller
@ 2014-10-24 17:28 ` Martin Lau
0 siblings, 0 replies; 11+ messages in thread
From: Martin Lau @ 2014-10-24 17:28 UTC (permalink / raw)
To: David Miller; +Cc: netdev
Hi,
> Can you cook up some clean patches against the net_test_tools repo so
> that people can use it for both ipv4 and ipv6 route lookup measurements?
Yes, will do.
Thanks,
--Martin
^ permalink raw reply [flat|nested] 11+ messages in thread
* [net_test_tools] udpflood: Add IPv6 support
2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
` (3 preceding siblings ...)
2014-10-24 4:15 ` [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() David Miller
@ 2014-12-02 18:41 ` Martin KaFai Lau
2014-12-09 18:05 ` David Miller
2014-12-02 20:37 ` [PATCH net-next] tcp: Add TCP tracer Martin KaFai Lau
5 siblings, 1 reply; 11+ messages in thread
From: Martin KaFai Lau @ 2014-12-02 18:41 UTC (permalink / raw)
To: davem; +Cc: netdev
This patch:
1. Adds IPv6 support.
2. Prints timing for every 65536 fib insert operations to observe
the gc effect (mostly for IPv6 fib).
---
udpflood.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 101 insertions(+), 24 deletions(-)
diff --git a/udpflood.c b/udpflood.c
index 6e658f7..5855012 100644
--- a/udpflood.c
+++ b/udpflood.c
@@ -6,7 +6,9 @@
#include <string.h>
#include <errno.h>
#include <unistd.h>
+#include <stdint.h>
+#include <sys/time.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
@@ -15,57 +17,121 @@
#define _GNU_SOURCE
#include <getopt.h>
+static int debug = 0;
+
+typedef union sa_u {
+ struct sockaddr_in a4;
+ struct sockaddr_in6 a6;
+} sa_u;
+
static int usage(void)
{
printf("usage: udpflood [ -l count ] [ -m message_size ] [ -c num_ip_addrs ] IP_ADDRESS\n");
return -1;
}
-static int send_packets(in_addr_t start_addr, in_addr_t end_addr,
- int port, int count, int msg_sz)
+static uint32_t get_last32h(const sa_u *sa)
+{
+ if (sa->a4.sin_family == PF_INET)
+ return ntohl(sa->a4.sin_addr.s_addr);
+ else
+ return ntohl(sa->a6.sin6_addr.s6_addr32[3]);
+}
+
+static void set_last32h(sa_u *sa, uint32_t last32h)
+{
+ if (sa->a4.sin_family == PF_INET)
+ sa->a4.sin_addr.s_addr = htonl(last32h);
+ else
+ sa->a6.sin6_addr.s6_addr32[3] = htonl(last32h);
+}
+
+static void print_sa(const sa_u *sa, const char *msg)
+{
+ char buf[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx")];
+
+ if (!debug)
+ return;
+
+ switch (sa->a4.sin_family) {
+ case PF_INET:
+ inet_ntop(PF_INET, &(sa->a4.sin_addr.s_addr), buf,
+ sizeof(buf));
+ break;
+ case PF_INET6:
+ inet_ntop(PF_INET6, sa->a6.sin6_addr.s6_addr, buf, sizeof(buf));
+ break;
+ }
+
+ printf("%s: %s\n", msg, buf);
+}
+
+static long get_diff_ms(const struct timeval *now,
+ const struct timeval *start)
+{
+ long start_ms, now_ms;
+ start_ms = start->tv_sec * 1000 + (start->tv_usec / 1000);
+ now_ms = now->tv_sec * 1000 + (now->tv_usec / 1000);
+ return now_ms - start_ms;
+}
+
+static int send_packets(const sa_u *start_sa, size_t num_addrs, int count,
+ int msg_sz)
{
char *msg = malloc(msg_sz);
- struct sockaddr_in saddr;
- in_addr_t addr;
+ sa_u cur_sa;
+ uint32_t start_addr32h, end_addr32h, cur_addr32h;
int fd, i, err;
+ struct timeval last, now;
if (!msg)
return -ENOMEM;
memset(msg, 0, msg_sz);
- addr = start_addr;
-
- memset(&saddr, 0, sizeof(saddr));
- saddr.sin_family = AF_INET;
- saddr.sin_port = port;
- saddr.sin_addr.s_addr = addr;
+ memcpy(&cur_sa, start_sa, sizeof(cur_sa));
+ cur_addr32h = start_addr32h = get_last32h(&cur_sa);
+ end_addr32h = start_addr32h + num_addrs;
- fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+ fd = socket(cur_sa.a4.sin_family, SOCK_DGRAM, IPPROTO_IP);
if (fd < 0) {
perror("socket");
err = fd;
goto out_nofd;
}
- err = connect(fd, (struct sockaddr *) &saddr, sizeof(saddr));
+ err = connect(fd, (struct sockaddr *) &cur_sa, sizeof(cur_sa));
if (err < 0) {
perror("connect");
- close(fd);
goto out;
}
+ print_sa(start_sa, "start_addr");
+ gettimeofday(&last, NULL);
for (i = 0; i < count; i++) {
- saddr.sin_addr.s_addr = addr;
-
+ print_sa(&cur_sa, "sendto");
err = sendto(fd, msg, msg_sz, 0,
- (struct sockaddr *) &saddr, sizeof(saddr));
+ (struct sockaddr *) &cur_sa, sizeof(cur_sa));
if (err < 0) {
perror("sendto");
goto out;
}
- if (++addr >= end_addr)
- addr = start_addr;
+ if (++cur_addr32h >= end_addr32h)
+ cur_addr32h = start_addr32h;
+ set_last32h(&cur_sa, cur_addr32h);
+
+ /*
+ * print timing info for every 65536 fib inserts to
+ * observe the gc effect (mostly for IPv6 fib).
+ */
+ if (i && (i & 0xFFFF) == 0) {
+ long diff_ms;
+ gettimeofday(&now, NULL);
+ diff_ms = get_diff_ms(&now, &last);
+ printf("%d %ld.%ld\n", i >> 16,
+ diff_ms / 1000, diff_ms % 1000);
+ memcpy(&last, &now, sizeof(last));
+ }
}
err = 0;
@@ -79,14 +145,14 @@ out_nofd:
int main(int argc, char **argv, char **envp)
{
int port, msg_sz, count, num_addrs, ret;
- in_addr_t start_addr, end_addr;
+ sa_u start_sa;
port = 6000;
msg_sz = 32;
count = 10000000;
num_addrs = 1;
- while ((ret = getopt(argc, argv, "l:s:p:c:")) >= 0) {
+ while ((ret = getopt(argc, argv, "dl:s:p:c:")) >= 0) {
switch (ret) {
case 'l':
sscanf(optarg, "%d", &count);
@@ -100,18 +166,29 @@ int main(int argc, char **argv, char **envp)
case 'c':
sscanf(optarg, "%d", &num_addrs);
break;
+ case 'd':
+ debug = 1;
+ break;
case '?':
return usage();
}
}
+ if (num_addrs < 1 || count < 1)
+ return usage();
+
if (!argv[optind])
return usage();
- start_addr = inet_addr(argv[optind]);
- if (start_addr == INADDR_NONE)
+ memset(&start_sa, 0, sizeof(start_sa));
+ start_sa.a4.sin_port = htons(port);
+ if (inet_pton(PF_INET, argv[optind], &start_sa.a4.sin_addr))
+ start_sa.a4.sin_family = PF_INET;
+ else if (inet_pton(PF_INET6, argv[optind],
+ start_sa.a6.sin6_addr.s6_addr))
+ start_sa.a6.sin6_family = PF_INET6;
+ else
return usage();
- end_addr = start_addr + num_addrs;
- return send_packets(start_addr, end_addr, port, count, msg_sz);
+ return send_packets(&start_sa, num_addrs, count, msg_sz);
}
--
1.8.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH net-next] tcp: Add TCP tracer
2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
` (4 preceding siblings ...)
2014-12-02 18:41 ` [net_test_tools] udpflood: Add IPv6 support Martin KaFai Lau
@ 2014-12-02 20:37 ` Martin KaFai Lau
2014-12-02 20:40 ` Martin Lau
2014-12-03 1:51 ` Stephen Hemminger
5 siblings, 2 replies; 11+ messages in thread
From: Martin KaFai Lau @ 2014-12-02 20:37 UTC (permalink / raw)
To: davem; +Cc: netdev
Define probes and register them to the TCP tracepoints. The probes
collect the data defined in struct tcp_sk_trace and record them to
the tracing's ring_buffer.
---
include/uapi/linux/tcp_trace.h | 9 +-
kernel/trace/tcp_trace.c | 448 +++++++++++++++++++++++++++++++++++++++++
kernel/trace/trace.h | 1 +
3 files changed, 451 insertions(+), 7 deletions(-)
diff --git a/include/uapi/linux/tcp_trace.h b/include/uapi/linux/tcp_trace.h
index 2644f7f..d913a3c 100644
--- a/include/uapi/linux/tcp_trace.h
+++ b/include/uapi/linux/tcp_trace.h
@@ -22,11 +22,11 @@ struct tcp_stats {
__u32 other_segs_retrans;
__u32 other_octets_retrans;
__u32 loss_segs_retrans;
- __u32 loss_octects_retrans;
+ __u32 loss_octets_retrans;
__u32 segs_in;
__u32 data_segs_in;
- __u64 rtt_sample_us;
__u64 data_octets_in;
+ __u64 rtt_sample_us;
__u64 max_rtt_us;
__u64 min_rtt_us;
__u64 sum_rtt_us;
@@ -64,9 +64,4 @@ struct tcp_trace_stats {
struct tcp_stats stats;
} __packed;
-typedef struct tcp_trace_basic tcp_trace_establish;
-typedef struct tcp_trace_basic tcp_trace_retrans;
-typedef struct tcp_trace_stats tcp_trace_periodic;
-typedef struct tcp_trace_stats tcp_trace_close;
-
#endif /* UAPI_TCP_TRACE_H */
diff --git a/kernel/trace/tcp_trace.c b/kernel/trace/tcp_trace.c
index 9d09fd0..376580b 100644
--- a/kernel/trace/tcp_trace.c
+++ b/kernel/trace/tcp_trace.c
@@ -1,9 +1,27 @@
#include <net/tcp_trace.h>
+#include <net/tcp.h>
+#include <trace/events/tcp.h>
#include <linux/tcp.h>
+#include <linux/ipv6.h>
+#include <linux/ftrace_event.h>
+#include <linux/jiffies.h>
#include <uapi/linux/tcp_trace.h>
+#include "trace_output.h"
+
+#define REPORT_INTERVAL_MS 2000
+
+static struct trace_array *tcp_tr;
static bool tcp_trace_enabled __read_mostly;
+static struct trace_print_flags tcp_trace_event_names[] = {
+ { TCP_TRACE_EVENT_ESTABLISHED, "established" },
+ { TCP_TRACE_EVENT_PERIODIC, "periodic" },
+ { TCP_TRACE_EVENT_RETRANS, "retrans" },
+ { TCP_TRACE_EVENT_RETRANS_LOSS, "retrans_loss" },
+ { TCP_TRACE_EVENT_CLOSE, "close" }
+};
+
struct tcp_sk_trace {
struct tcp_stats stats;
unsigned long start_ts;
@@ -35,3 +53,433 @@ void tcp_sk_trace_destruct(struct sock *sk)
{
kfree(tcp_sk(sk)->trace);
}
+
+/*
+ * Fill the common part of a trace record from @sk: the event id, the
+ * address-family flag, the local/remote addresses and the two ports.
+ * A socket that is neither AF_INET nor AF_INET6 triggers BUG_ON().
+ */
+static void tcp_trace_init(struct tcp_trace *tr,
+			   enum tcp_trace_events trev,
+			   struct sock *sk)
+{
+	struct inet_sock *inet = inet_sk(sk);
+
+	tr->event = trev;
+
+	if (sk->sk_family != AF_INET) {
+		/* anything that is not IPv4 has to be IPv6 */
+		BUG_ON(sk->sk_family != AF_INET6);
+		tr->ipv6 = 1;
+		memcpy(tr->local_addr, inet6_sk(sk)->saddr.s6_addr32,
+		       sizeof(tr->local_addr));
+		memcpy(tr->remote_addr, sk->sk_v6_daddr.s6_addr32,
+		       sizeof(tr->remote_addr));
+	} else {
+		/* IPv4: only element 0 of each address array is written */
+		tr->ipv6 = 0;
+		tr->local_addr[0] = inet->inet_saddr;
+		tr->remote_addr[0] = inet->inet_daddr;
+	}
+
+	tr->local_port = inet->inet_sport;
+	tr->remote_port = inet->inet_dport;
+}
+
+/*
+ * Fill a "basic" record: the common header (via tcp_trace_init()) plus a
+ * snapshot of the connection state.  snd_cwnd is stored as
+ * snd_cwnd * mss_cache, srtt_us as srtt_us >> 3, and rto/life are
+ * converted from jiffies to milliseconds.
+ *
+ * The caller must have checked that tcp_sk(sk)->trace is not NULL; it is
+ * dereferenced here for start_ts.
+ */
+static void tcp_trace_basic_init(struct tcp_trace_basic *trb,
+				 enum tcp_trace_events trev,
+				 struct sock *sk)
+{
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct tcp_sk_trace *sktr = tp->trace;
+
+	tcp_trace_init((struct tcp_trace *)trb, trev, sk);
+
+	trb->mss = tp->mss_cache;
+	trb->snd_cwnd = tp->snd_cwnd * tp->mss_cache;
+	trb->ssthresh = tcp_current_ssthresh(sk);
+	trb->srtt_us = tp->srtt_us >> 3;
+	trb->rto_ms = jiffies_to_msecs(inet_csk(sk)->icsk_rto);
+	trb->life_ms = jiffies_to_msecs(jiffies - sktr->start_ts);
+}
+
+/*
+ * Reserve a TRACE_TCP entry sized for struct tcp_trace_basic in the tcp
+ * tracer's ring buffer, fill it for event @trev and commit it.
+ * Does nothing when the socket has no trace state or the reservation
+ * fails.
+ */
+static void tcp_trace_basic_add(enum tcp_trace_events trev, struct sock *sk)
+{
+	struct ring_buffer_event *ev;
+	struct ring_buffer *rb;
+	int pc;
+
+	if (!tcp_sk(sk)->trace)
+		return;
+
+	tracing_record_cmdline(current);
+	rb = tcp_tr->trace_buffer.buffer;
+	pc = preempt_count();
+
+	ev = trace_buffer_lock_reserve(rb, TRACE_TCP,
+				       sizeof(struct tcp_trace_basic), 0, pc);
+	if (!ev)
+		return;
+
+	tcp_trace_basic_init(ring_buffer_event_data(ev), trev, sk);
+	trace_buffer_unlock_commit(rb, ev, 0, pc);
+}
+
+/*
+ * Fill a "stats" record: the basic part first, then a copy of the
+ * per-socket counters kept in tcp_sk(sk)->trace->stats.
+ * The caller must guarantee that tcp_sk(sk)->trace is not NULL.
+ */
+static void tcp_trace_stats_init(struct tcp_trace_stats *trs,
+				 enum tcp_trace_events trev,
+				 struct sock *sk)
+{
+	tcp_trace_basic_init((struct tcp_trace_basic *)trs, trev, sk);
+	memcpy(&trs->stats, &tcp_sk(sk)->trace->stats,
+	       sizeof(tcp_sk(sk)->trace->stats));
+}
+
+/*
+ * Same as tcp_trace_basic_add(), but the reserved entry is sized for
+ * struct tcp_trace_stats and also carries the socket's counters.
+ * Does nothing when the socket has no trace state or the reservation
+ * fails.
+ */
+static void tcp_trace_stats_add(enum tcp_trace_events trev, struct sock *sk)
+{
+	struct ring_buffer_event *ev;
+	struct ring_buffer *rb;
+	int pc;
+
+	if (!tcp_sk(sk)->trace)
+		return;
+
+	tracing_record_cmdline(current);
+	rb = tcp_tr->trace_buffer.buffer;
+	pc = preempt_count();
+
+	ev = trace_buffer_lock_reserve(rb, TRACE_TCP,
+				       sizeof(struct tcp_trace_stats), 0, pc);
+	if (!ev)
+		return;
+
+	tcp_trace_stats_init(ring_buffer_event_data(ev), trev, sk);
+	trace_buffer_unlock_commit(rb, ev, 0, pc);
+}
+
+/*
+ * Probe registered on the tcp_established tracepoint: logs one basic
+ * record tagged TCP_TRACE_EVENT_ESTABLISHED.  @ignore is the probe's
+ * private data pointer and is not used.
+ */
+static void tcp_trace_established(void *ignore, struct sock *sk)
+{
+ tcp_trace_basic_add(TCP_TRACE_EVENT_ESTABLISHED, sk);
+}
+
+/*
+ * Probe registered on the tcp_transmit_skb tracepoint: account one
+ * outgoing skb in the socket's counters.
+ *
+ * - segs_out always grows by the skb's packet count.
+ * - When the skb covers sequence space (end_seq != seq), data_segs_out and
+ *   data_octets_out grow as well.
+ * - When such an skb starts before snd_nxt it is counted as a
+ *   retransmission, in the "loss" counters if the congestion state is
+ *   TCP_CA_Loss and in the "other" counters otherwise, and a stats record
+ *   is logged immediately.  This path returns without the periodic check.
+ * - Otherwise a TCP_TRACE_EVENT_PERIODIC stats record is logged once
+ *   REPORT_INTERVAL_MS has elapsed since last_ts.
+ *
+ * @ignore is the probe's private data pointer and is not used.
+ */
+static void tcp_trace_transmit_skb(void *ignore, struct sock *sk,
+				   struct sk_buff *skb)
+{
+	int pcount;
+	struct tcp_sk_trace *sktr;
+	struct tcp_skb_cb *tcb;
+	unsigned int data_len;
+
+	sktr = tcp_sk(sk)->trace;
+	if (!sktr)
+		return;
+
+	tcb = TCP_SKB_CB(skb);
+	pcount = tcp_skb_pcount(skb);
+	data_len = tcb->end_seq - tcb->seq;
+
+	sktr->stats.segs_out += pcount;
+
+	if (!data_len)
+		goto out;
+
+	sktr->stats.data_segs_out += pcount;
+	sktr->stats.data_octets_out += data_len;
+
+	if (before(tcb->seq, tcp_sk(sk)->snd_nxt)) {
+		enum tcp_trace_events trev;
+
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
+			sktr->stats.loss_segs_retrans += pcount;
+			sktr->stats.loss_octets_retrans += data_len;
+			trev = TCP_TRACE_EVENT_RETRANS_LOSS;
+		} else {
+			sktr->stats.other_segs_retrans += pcount;
+			sktr->stats.other_octets_retrans += data_len;
+			trev = TCP_TRACE_EVENT_RETRANS;
+		}
+		tcp_trace_stats_add(trev, sk);
+		return;
+	}
+
+out:
+	if (jiffies_to_msecs(jiffies - sktr->last_ts) >=
+	    REPORT_INTERVAL_MS) {
+		sktr->last_ts = jiffies;
+		tcp_trace_stats_add(TCP_TRACE_EVENT_PERIODIC, sk);
+	}
+}
+
+/*
+ * Probe registered on the tcp_rcv_established tracepoint: account one
+ * incoming segment in the socket's counters.
+ *
+ * The payload length is skb->len minus the TCP header length (doff * 4).
+ * - A segment without payload whose ack_seq equals snd_una is counted as
+ *   a duplicate ACK.
+ * - A segment with payload bumps data_segs_in by one and data_octets_in
+ *   by the payload length.
+ *
+ * A TCP_TRACE_EVENT_PERIODIC stats record is logged once
+ * REPORT_INTERVAL_MS has elapsed since last_ts.
+ *
+ * @ignore is the probe's private data pointer and is not used.
+ */
+static void tcp_trace_rcv_established(void *ignore, struct sock *sk,
+				      struct sk_buff *skb)
+{
+	struct tcp_sk_trace *sktr;
+	unsigned int data_len;
+	struct tcphdr *th;
+
+	sktr = tcp_sk(sk)->trace;
+	if (!sktr)
+		return;
+
+	th = tcp_hdr(skb);
+	WARN_ON_ONCE(skb->len < th->doff << 2);
+
+	sktr->stats.segs_in++;
+	data_len = skb->len - (th->doff << 2);
+	if (!data_len) {
+		/* pure ACK that does not move snd_una: duplicate ACK */
+		if (TCP_SKB_CB(skb)->ack_seq == tcp_sk(sk)->snd_una)
+			sktr->stats.dup_acks_in++;
+	} else {
+		sktr->stats.data_segs_in++;
+		sktr->stats.data_octets_in += data_len;
+	}
+
+	if (jiffies_to_msecs(jiffies - sktr->last_ts) >=
+	    REPORT_INTERVAL_MS) {
+		sktr->last_ts = jiffies;
+		tcp_trace_stats_add(TCP_TRACE_EVENT_PERIODIC, sk);
+	}
+}
+
+/*
+ * Probe registered on the tcp_close tracepoint: logs a final stats
+ * record tagged TCP_TRACE_EVENT_CLOSE for sockets that carry trace
+ * state.  @ignore is not used.
+ */
+static void tcp_trace_close(void *ignore, struct sock *sk)
+{
+	if (tcp_sk(sk)->trace)
+		tcp_trace_stats_add(TCP_TRACE_EVENT_CLOSE, sk);
+}
+
+/*
+ * Probe registered on the tcp_ooo_rcv tracepoint: bumps the socket's
+ * ooo_in counter.  Sockets without trace state are ignored; @ignore is
+ * not used.
+ */
+static void tcp_trace_ooo_rcv(void *ignore, struct sock *sk)
+{
+	struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
+
+	if (sktr)
+		sktr->stats.ooo_in++;
+}
+
+/*
+ * Probe registered on the tcp_sacks_rcv tracepoint: counts one more
+ * SACK-carrying segment and adds @num_sacks to the SACK block total.
+ * Sockets without trace state are ignored; @ignore is not used.
+ */
+static void tcp_trace_sacks_rcv(void *ignore, struct sock *sk, int num_sacks)
+{
+	struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
+
+	if (sktr) {
+		sktr->stats.sacks_in++;
+		sktr->stats.sack_blks_in += num_sacks;
+	}
+}
+
+/*
+ * Probe registered on the tcp_rtt_sample tracepoint: records the latest
+ * RTT sample and folds it into the running max/min/sum/count.  The
+ * socket's current RTO (converted to milliseconds) is folded into the
+ * RTO max/min at the same time.
+ * Sockets without trace state are ignored; @ignore is not used.
+ */
+void tcp_trace_rtt_sample(void *ignore, struct sock *sk,
+			  long rtt_sample_us)
+{
+	struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
+	struct tcp_stats *st;
+	u32 rto_ms;
+
+	if (!sktr)
+		return;
+
+	st = &sktr->stats;
+	rto_ms = jiffies_to_msecs(inet_csk(sk)->icsk_rto);
+
+	/* RTT bookkeeping */
+	st->rtt_sample_us = rtt_sample_us;
+	st->max_rtt_us = max_t(u64, st->max_rtt_us, rtt_sample_us);
+	st->min_rtt_us = min_t(u64, st->min_rtt_us, rtt_sample_us);
+	st->count_rtt++;
+	st->sum_rtt_us += rtt_sample_us;
+
+	/* RTO bookkeeping */
+	st->max_rto_ms = max_t(u32, st->max_rto_ms, rto_ms);
+	st->min_rto_ms = min_t(u32, st->min_rto_ms, rto_ms);
+}
+
+/*
+ * Render one TCP trace record as a text line:
+ *   <event name> <local addr:port> <remote addr:port> <basic fields>
+ * followed, for every event other than "established", by the counters of
+ * struct tcp_stats.
+ *
+ * Returns TRACE_TYPE_HANDLED when the whole line, including the trailing
+ * newline, was written and TRACE_TYPE_PARTIAL_LINE otherwise.
+ *
+ * NOTE(review): ftrace_print_symbols_seq() is normally used on a scratch
+ * trace_seq and is believed to terminate its output with a NUL byte;
+ * confirm that calling it on the output seq directly does not embed a
+ * NUL in the line.
+ */
+static enum print_line_t
+tcp_trace_print(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct tcp_trace *tr = (struct tcp_trace *)iter->ent;
+	struct tcp_trace_basic *trb;
+	struct tcp_stats *stats;
+	const char *last_seq_bptr, *cur_seq_bptr;
+	int ret = 0;
+
+	union {
+		struct sockaddr_in v4;
+		struct sockaddr_in6 v6;
+	} local_sa, remote_sa;
+
+	/* the port is written through the v4 view for both families */
+	local_sa.v4.sin_port = tr->local_port;
+	remote_sa.v4.sin_port = tr->remote_port;
+	if (tr->ipv6) {
+		local_sa.v6.sin6_family = AF_INET6;
+		remote_sa.v6.sin6_family = AF_INET6;
+		/*
+		 * Copy the complete IPv6 address; copying only the first
+		 * 4 bytes would leave the rest of sin6_addr uninitialised.
+		 */
+		memcpy(local_sa.v6.sin6_addr.s6_addr, tr->local_addr,
+		       sizeof(local_sa.v6.sin6_addr));
+		memcpy(remote_sa.v6.sin6_addr.s6_addr, tr->remote_addr,
+		       sizeof(remote_sa.v6.sin6_addr));
+	} else {
+		local_sa.v4.sin_family = AF_INET;
+		remote_sa.v4.sin_family = AF_INET;
+		local_sa.v4.sin_addr.s_addr = tr->local_addr[0];
+		remote_sa.v4.sin_addr.s_addr = tr->remote_addr[0];
+	}
+
+	last_seq_bptr = ftrace_print_symbols_seq(s, tr->event,
+						 tcp_trace_event_names);
+	cur_seq_bptr = trace_seq_buffer_ptr(s);
+	/* nothing was appended: report a partial line (ret is still 0) */
+	if (last_seq_bptr == cur_seq_bptr)
+		goto out;
+
+	trb = (struct tcp_trace_basic *)tr;
+	ret = trace_seq_printf(s,
+			       " %pISpc %pISpc snd_cwnd=%u mss=%u ssthresh=%u"
+			       " srtt_us=%llu rto_ms=%u life_ms=%u",
+			       &local_sa, &remote_sa,
+			       trb->snd_cwnd, trb->mss, trb->ssthresh,
+			       trb->srtt_us, trb->rto_ms, trb->life_ms);
+
+	/* "established" records are basic records and carry no counters */
+	if (tr->event == TCP_TRACE_EVENT_ESTABLISHED || ret == 0)
+		goto out;
+
+	stats = &(((struct tcp_trace_stats *)tr)->stats);
+	ret = trace_seq_printf(s,
+			       " segs_out=%u data_segs_out=%u data_octets_out=%llu"
+			       " other_segs_retrans=%u other_octets_retrans=%u"
+			       " loss_segs_retrans=%u loss_octets_retrans=%u"
+			       " segs_in=%u data_segs_in=%u data_octets_in=%llu"
+			       " max_rtt_us=%llu min_rtt_us=%llu"
+			       " count_rtt=%u sum_rtt_us=%llu"
+			       " rtt_sample_us=%llu"
+			       " max_rto_ms=%u min_rto_ms=%u"
+			       " dup_acks_in=%u sacks_in=%u"
+			       " sack_blks_in=%u ooo_in=%u",
+			       stats->segs_out, stats->data_segs_out, stats->data_octets_out,
+			       stats->other_segs_retrans, stats->other_octets_retrans,
+			       stats->loss_segs_retrans, stats->loss_octets_retrans,
+			       stats->segs_in, stats->data_segs_in, stats->data_octets_in,
+			       stats->max_rtt_us, stats->min_rtt_us,
+			       stats->count_rtt, stats->sum_rtt_us,
+			       stats->rtt_sample_us,
+			       stats->max_rto_ms, stats->min_rto_ms,
+			       stats->dup_acks_in, stats->sacks_in,
+			       stats->sack_blks_in, stats->ooo_in);
+
+out:
+	if (ret)
+		ret = trace_seq_putc(s, '\n');
+
+	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+/*
+ * Emit one TCP trace record in binary form: a u32
+ * TCP_TRACE_MAGIC_VERSION marker followed by the record, sized as
+ * struct tcp_trace_basic for "established" events and as
+ * struct tcp_trace_stats for every other event.
+ *
+ * Returns TRACE_TYPE_HANDLED when both writes succeeded and
+ * TRACE_TYPE_PARTIAL_LINE otherwise.
+ *
+ * NOTE(review): tr is a struct tcp_trace pointer, so "tr + sizeof(magic)"
+ * advances by sizeof(magic) whole structures, not by sizeof(magic) bytes,
+ * and the data written starts far beyond the record.  Presumably either
+ * plain "tr" or a byte offset was intended -- confirm against the layout
+ * of struct tcp_trace before fixing.
+ */
+static enum print_line_t
+tcp_trace_print_binary(struct trace_iterator *iter)
+{
+ int ret;
+ struct trace_seq *s = &iter->seq;
+ struct tcp_trace *tr = (struct tcp_trace *)iter->ent;
+ u32 magic = TCP_TRACE_MAGIC_VERSION;
+
+ ret = trace_seq_putmem(s, &magic, sizeof(magic));
+ if (!ret)
+ goto out;
+
+ if (tr->event == TCP_TRACE_EVENT_ESTABLISHED)
+ ret = trace_seq_putmem(s, tr + sizeof(magic),
+ sizeof(struct tcp_trace_basic));
+ else
+ ret = trace_seq_putmem(s, tr + sizeof(magic),
+ sizeof(struct tcp_trace_stats));
+
+out:
+ return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+/*
+ * print_line callback of the tcp tracer: picks the binary renderer when
+ * TRACE_ITER_BIN is set in trace_flags and the text renderer otherwise.
+ */
+static enum print_line_t
+tcp_tracer_print_line(struct trace_iterator *iter)
+{
+	if (trace_flags & TRACE_ITER_BIN)
+		return tcp_trace_print_binary(iter);
+
+	return tcp_trace_print(iter);
+}
+
+/*
+ * Attach every probe of this file to its TCP tracepoint.  A failed
+ * registration only raises a WARN; the remaining probes are still
+ * attached.
+ */
+static void tcp_register_tracepoints(void)
+{
+	WARN_ON(register_trace_tcp_established(tcp_trace_established, NULL));
+	WARN_ON(register_trace_tcp_close(tcp_trace_close, NULL));
+	WARN_ON(register_trace_tcp_rcv_established(tcp_trace_rcv_established,
+						   NULL));
+	WARN_ON(register_trace_tcp_transmit_skb(tcp_trace_transmit_skb, NULL));
+	WARN_ON(register_trace_tcp_ooo_rcv(tcp_trace_ooo_rcv, NULL));
+	WARN_ON(register_trace_tcp_sacks_rcv(tcp_trace_sacks_rcv, NULL));
+	WARN_ON(register_trace_tcp_rtt_sample(tcp_trace_rtt_sample, NULL));
+}
+
+/*
+ * Detach every probe attached by tcp_register_tracepoints(), in the same
+ * order, then wait until no probe can still be running.
+ *
+ * The tcp_close probe must be detached here as well; otherwise it stays
+ * attached after the tracer is stopped or reset.
+ */
+static void tcp_unregister_tracepoints(void)
+{
+	unregister_trace_tcp_established(tcp_trace_established, NULL);
+	unregister_trace_tcp_close(tcp_trace_close, NULL);
+	unregister_trace_tcp_rcv_established(tcp_trace_rcv_established, NULL);
+	unregister_trace_tcp_transmit_skb(tcp_trace_transmit_skb, NULL);
+	unregister_trace_tcp_ooo_rcv(tcp_trace_ooo_rcv, NULL);
+	unregister_trace_tcp_sacks_rcv(tcp_trace_sacks_rcv, NULL);
+	unregister_trace_tcp_rtt_sample(tcp_trace_rtt_sample, NULL);
+
+	tracepoint_synchronize_unregister();
+}
+
+/*
+ * start callback of the tcp tracer (also called from tcp_tracer_init()):
+ * attaches the probes and then sets tcp_trace_enabled.  @tr is not used.
+ */
+static void tcp_tracer_start(struct trace_array *tr)
+{
+ tcp_register_tracepoints();
+ tcp_trace_enabled = true;
+}
+
+/*
+ * stop callback of the tcp tracer (also called from tcp_tracer_reset()):
+ * detaches the probes and then clears tcp_trace_enabled.  @tr is not
+ * used.
+ */
+static void tcp_tracer_stop(struct trace_array *tr)
+{
+ tcp_unregister_tracepoints();
+ tcp_trace_enabled = false;
+}
+
+/* reset callback of the tcp tracer: identical to stopping it. */
+static void tcp_tracer_reset(struct trace_array *tr)
+{
+ tcp_tracer_stop(tr);
+}
+
+/*
+ * init callback of the tcp tracer: remembers @tr in tcp_tr, whose ring
+ * buffer the *_add() helpers write to, and starts tracing.
+ * Always returns 0.
+ */
+static int tcp_tracer_init(struct trace_array *tr)
+{
+ tcp_tr = tr;
+ tcp_tracer_start(tr);
+ return 0;
+}
+
+/* The "tcp" tracer: wires the callbacks defined above into struct tracer. */
+static struct tracer tcp_tracer __read_mostly = {
+ .name = "tcp",
+ .init = tcp_tracer_init,
+ .reset = tcp_tracer_reset,
+ .start = tcp_tracer_start,
+ .stop = tcp_tracer_stop,
+ .print_line = tcp_tracer_print_line,
+};
+
+/*
+ * Event descriptor registered for TRACE_TCP entries in init_tcp_tracer().
+ * tcp_trace_event_funcs has no initialiser, so none of its callbacks is
+ * set here.
+ * NOTE(review): presumably all output goes through tcp_tracer.print_line
+ * instead -- confirm.
+ */
+static struct trace_event_functions tcp_trace_event_funcs;
+
+static struct trace_event tcp_trace_event = {
+ .type = TRACE_TCP,
+ .funcs = &tcp_trace_event_funcs,
+};
+
+/*
+ * Boot-time setup: register the TRACE_TCP event first, then the "tcp"
+ * tracer.  If the tracer cannot be registered the event registration is
+ * undone.  Returns 0 on success and 1 on either failure.
+ */
+static int __init init_tcp_tracer(void)
+{
+	int err;
+
+	/* a zero return from register_ftrace_event() is treated as failure */
+	if (register_ftrace_event(&tcp_trace_event) == 0) {
+		pr_warning("Cannot register TCP trace event\n");
+		return 1;
+	}
+
+	err = register_tracer(&tcp_tracer);
+	if (!err)
+		return 0;
+
+	pr_warning("Cannot register TCP tracer\n");
+	unregister_ftrace_event(&tcp_trace_event);
+	return 1;
+}
+
+device_initcall(init_tcp_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 385391f..5dc5962 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -37,6 +37,7 @@ enum trace_type {
TRACE_USER_STACK,
TRACE_BLK,
TRACE_BPUTS,
+ TRACE_TCP,
__TRACE_LAST_TYPE,
};
--
1.8.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH net-next] tcp: Add TCP tracer
2014-12-02 20:37 ` [PATCH net-next] tcp: Add TCP tracer Martin KaFai Lau
@ 2014-12-02 20:40 ` Martin Lau
2014-12-03 1:51 ` Stephen Hemminger
1 sibling, 0 replies; 11+ messages in thread
From: Martin Lau @ 2014-12-02 20:40 UTC (permalink / raw)
To: davem; +Cc: netdev
Please ignore this patch, which is not completely ready. It was sent out by
mistake.
On Tue, Dec 02, 2014 at 12:37:42PM -0800, Martin KaFai Lau wrote:
> Define probes and register them to the TCP tracepoints. The probes
> collect the data defined in struct tcp_sk_trace and record them to
> the tracing's ring_buffer.
> ---
> include/uapi/linux/tcp_trace.h | 9 +-
> kernel/trace/tcp_trace.c | 448 +++++++++++++++++++++++++++++++++++++++++
> kernel/trace/trace.h | 1 +
> 3 files changed, 451 insertions(+), 7 deletions(-)
>
> diff --git a/include/uapi/linux/tcp_trace.h b/include/uapi/linux/tcp_trace.h
> index 2644f7f..d913a3c 100644
> --- a/include/uapi/linux/tcp_trace.h
> +++ b/include/uapi/linux/tcp_trace.h
> @@ -22,11 +22,11 @@ struct tcp_stats {
> __u32 other_segs_retrans;
> __u32 other_octets_retrans;
> __u32 loss_segs_retrans;
> - __u32 loss_octects_retrans;
> + __u32 loss_octets_retrans;
> __u32 segs_in;
> __u32 data_segs_in;
> - __u64 rtt_sample_us;
> __u64 data_octets_in;
> + __u64 rtt_sample_us;
> __u64 max_rtt_us;
> __u64 min_rtt_us;
> __u64 sum_rtt_us;
> @@ -64,9 +64,4 @@ struct tcp_trace_stats {
> struct tcp_stats stats;
> } __packed;
>
> -typedef struct tcp_trace_basic tcp_trace_establish;
> -typedef struct tcp_trace_basic tcp_trace_retrans;
> -typedef struct tcp_trace_stats tcp_trace_periodic;
> -typedef struct tcp_trace_stats tcp_trace_close;
> -
> #endif /* UAPI_TCP_TRACE_H */
> diff --git a/kernel/trace/tcp_trace.c b/kernel/trace/tcp_trace.c
> index 9d09fd0..376580b 100644
> --- a/kernel/trace/tcp_trace.c
> +++ b/kernel/trace/tcp_trace.c
> @@ -1,9 +1,27 @@
> #include <net/tcp_trace.h>
> +#include <net/tcp.h>
> +#include <trace/events/tcp.h>
> #include <linux/tcp.h>
> +#include <linux/ipv6.h>
> +#include <linux/ftrace_event.h>
> +#include <linux/jiffies.h>
> #include <uapi/linux/tcp_trace.h>
>
> +#include "trace_output.h"
> +
> +#define REPORT_INTERVAL_MS 2000
> +
> +static struct trace_array *tcp_tr;
> static bool tcp_trace_enabled __read_mostly;
>
> +static struct trace_print_flags tcp_trace_event_names[] = {
> + { TCP_TRACE_EVENT_ESTABLISHED, "established" },
> + { TCP_TRACE_EVENT_PERIODIC, "periodic" },
> + { TCP_TRACE_EVENT_RETRANS, "retrans" },
> + { TCP_TRACE_EVENT_RETRANS_LOSS, "retrans_loss" },
> + { TCP_TRACE_EVENT_CLOSE, "close" }
> +};
> +
> struct tcp_sk_trace {
> struct tcp_stats stats;
> unsigned long start_ts;
> @@ -35,3 +53,433 @@ void tcp_sk_trace_destruct(struct sock *sk)
> {
> kfree(tcp_sk(sk)->trace);
> }
> +
> +static void tcp_trace_init(struct tcp_trace *tr,
> + enum tcp_trace_events trev,
> + struct sock *sk)
> +{
> + tr->event = trev;
> + if (sk->sk_family == AF_INET) {
> + tr->ipv6 = 0;
> + tr->local_addr[0] = inet_sk(sk)->inet_saddr;
> + tr->remote_addr[0] = inet_sk(sk)->inet_daddr;
> + } else {
> + BUG_ON(sk->sk_family != AF_INET6);
> + tr->ipv6 = 1;
> + memcpy(tr->local_addr, inet6_sk(sk)->saddr.s6_addr32,
> + sizeof(tr->local_addr));
> + memcpy(tr->remote_addr, sk->sk_v6_daddr.s6_addr32,
> + sizeof(tr->remote_addr));
> + }
> + tr->local_port = inet_sk(sk)->inet_sport;
> + tr->remote_port = inet_sk(sk)->inet_dport;
> +}
> +
> +static void tcp_trace_basic_init(struct tcp_trace_basic *trb,
> + enum tcp_trace_events trev,
> + struct sock *sk)
> +{
> + struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
> + tcp_trace_init((struct tcp_trace *)trb, trev, sk);
> + trb->snd_cwnd = tcp_sk(sk)->snd_cwnd * tcp_sk(sk)->mss_cache;
> + trb->mss = tcp_sk(sk)->mss_cache;
> + trb->ssthresh = tcp_current_ssthresh(sk);
> + trb->srtt_us = tcp_sk(sk)->srtt_us >> 3;
> + trb->rto_ms = jiffies_to_msecs(inet_csk(sk)->icsk_rto);
> + trb->life_ms = jiffies_to_msecs(jiffies - sktr->start_ts);
> +}
> +
> +static void tcp_trace_basic_add(enum tcp_trace_events trev, struct sock *sk)
> +{
> + struct ring_buffer *buffer;
> + int pc;
> + struct ring_buffer_event *event;
> + struct tcp_trace_basic *trb;
> + struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
> +
> + if (!sktr)
> + return;
> +
> + tracing_record_cmdline(current);
> + buffer = tcp_tr->trace_buffer.buffer;
> + pc = preempt_count();
> + event = trace_buffer_lock_reserve(buffer, TRACE_TCP,
> + sizeof(*trb), 0, pc);
> + if (!event)
> + return;
> + trb = ring_buffer_event_data(event);
> + tcp_trace_basic_init(trb, trev, sk);
> + trace_buffer_unlock_commit(buffer, event, 0, pc);
> +}
> +
> +static void tcp_trace_stats_init(struct tcp_trace_stats *trs,
> + enum tcp_trace_events trev,
> + struct sock *sk)
> +{
> + struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
> +
> + tcp_trace_basic_init((struct tcp_trace_basic *)trs, trev, sk);
> + memcpy(&trs->stats, &sktr->stats, sizeof(sktr->stats));
> +}
> +
> +static void tcp_trace_stats_add(enum tcp_trace_events trev, struct sock *sk)
> +{
> + struct ring_buffer *buffer;
> + int pc;
> + struct ring_buffer_event *event;
> + struct tcp_trace_stats *trs;
> + struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
> +
> + if (!sktr)
> + return;
> +
> + tracing_record_cmdline(current);
> + buffer = tcp_tr->trace_buffer.buffer;
> + pc = preempt_count();
> + event = trace_buffer_lock_reserve(buffer, TRACE_TCP,
> + sizeof(*trs), 0, pc);
> + if (!event)
> + return;
> + trs = ring_buffer_event_data(event);
> +
> + tcp_trace_stats_init(trs, trev, sk);
> +
> + trace_buffer_unlock_commit(buffer, event, 0, pc);
> +}
> +
> +static void tcp_trace_established(void *ignore, struct sock *sk)
> +{
> + tcp_trace_basic_add(TCP_TRACE_EVENT_ESTABLISHED, sk);
> +}
> +
> +static void tcp_trace_transmit_skb(void *ignore, struct sock *sk,
> + struct sk_buff *skb)
> +{
> + int pcount;
> + struct tcp_sk_trace *sktr;
> + struct tcp_skb_cb *tcb;
> + unsigned int data_len;
> + bool retrans = false;
> +
> + sktr = tcp_sk(sk)->trace;
> + if (!sktr)
> + return;
> +
> + tcb = TCP_SKB_CB(skb);
> + pcount = tcp_skb_pcount(skb);
> + data_len = tcb->end_seq - tcb->seq;
> +
> + sktr->stats.segs_out += pcount;
> +
> + if (!data_len)
> + goto out;
> +
> + sktr->stats.data_segs_out += pcount;
> + sktr->stats.data_octets_out += data_len;
> +
> + if (before(tcb->seq, tcp_sk(sk)->snd_nxt)) {
> + enum tcp_trace_events trev;
> + retrans = true;
> + if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
> + sktr->stats.loss_segs_retrans += pcount;
> + sktr->stats.loss_octets_retrans += data_len;
> + trev = TCP_TRACE_EVENT_RETRANS_LOSS;
> + } else {
> + sktr->stats.other_segs_retrans += pcount;
> + sktr->stats.other_octets_retrans += data_len;
> + trev = TCP_TRACE_EVENT_RETRANS;
> + }
> + tcp_trace_stats_add(trev, sk);
> + return;
> + }
> +
> +out:
> + if (jiffies_to_msecs(jiffies - sktr->last_ts) >=
> + REPORT_INTERVAL_MS) {
> + sktr->last_ts = jiffies;
> + tcp_trace_stats_add(TCP_TRACE_EVENT_PERIODIC, sk);
> + }
> +}
> +
> +static void tcp_trace_rcv_established(void *ignore, struct sock *sk,
> + struct sk_buff *skb)
> +{
> + struct tcp_sk_trace *sktr;
> + unsigned int data_len;
> + struct tcphdr *th;
> +
> + sktr = tcp_sk(sk)->trace;
> + if (!sktr)
> + return;
> +
> + th = tcp_hdr(skb);
> + WARN_ON_ONCE(skb->len < th->doff << 2);
> +
> + sktr->stats.segs_in++;
> + data_len = skb->len - (th->doff << 2);
> + if (data_len) {
> + if (TCP_SKB_CB(skb)->ack_seq == tcp_sk(sk)->snd_una)
> + sktr->stats.dup_acks_in++;
> + } else {
> + sktr->stats.data_segs_in++;
> + sktr->stats.data_segs_in += data_len;
> + }
> +
> + if (jiffies_to_msecs(jiffies - sktr->last_ts) >=
> + REPORT_INTERVAL_MS) {
> + sktr->last_ts = jiffies;
> + tcp_trace_stats_add(TCP_TRACE_EVENT_PERIODIC, sk);
> + }
> +}
> +
> +static void tcp_trace_close(void *ignore, struct sock *sk)
> +{
> + struct tcp_sk_trace *sktr;
> + sktr = tcp_sk(sk)->trace;
> + if (!sktr)
> + return;
> +
> + tcp_trace_stats_add(TCP_TRACE_EVENT_CLOSE, sk);
> +}
> +
> +static void tcp_trace_ooo_rcv(void *ignore, struct sock *sk)
> +{
> + struct tcp_sk_trace *sktr;
> +
> + sktr = tcp_sk(sk)->trace;
> + if (!sktr)
> + return;
> +
> + sktr->stats.ooo_in++;
> +}
> +
> +static void tcp_trace_sacks_rcv(void *ignore, struct sock *sk, int num_sacks)
> +{
> + struct tcp_sk_trace *sktr;
> +
> + sktr = tcp_sk(sk)->trace;
> + if (!sktr)
> + return;
> +
> + sktr->stats.sacks_in++;
> + sktr->stats.sack_blks_in += num_sacks;
> +}
> +
> +void tcp_trace_rtt_sample(void *ignore, struct sock *sk,
> + long rtt_sample_us)
> +{
> + struct tcp_sk_trace *sktr;
> + u32 rto_ms;
> +
> + sktr = tcp_sk(sk)->trace;
> + if (!sktr)
> + return;
> +
> + rto_ms = jiffies_to_msecs(inet_csk(sk)->icsk_rto);
> +
> + sktr->stats.rtt_sample_us = rtt_sample_us;
> + sktr->stats.max_rtt_us = max_t(u64, sktr->stats.max_rtt_us, rtt_sample_us);
> + sktr->stats.min_rtt_us = min_t(u64, sktr->stats.min_rtt_us, rtt_sample_us);
> +
> + sktr->stats.count_rtt++;
> + sktr->stats.sum_rtt_us += rtt_sample_us;
> +
> + sktr->stats.max_rto_ms = max_t(u32, sktr->stats.max_rto_ms, rto_ms);
> + sktr->stats.min_rto_ms = min_t(u32, sktr->stats.min_rto_ms, rto_ms);
> +}
> +
> +static enum print_line_t
> +tcp_trace_print(struct trace_iterator *iter)
> +{
> + struct trace_seq *s = &iter->seq;
> + struct tcp_trace *tr = (struct tcp_trace *)iter->ent;
> + struct tcp_trace_basic *trb;
> + struct tcp_stats *stats;
> + const char *last_seq_bptr, *cur_seq_bptr;
> + int ret = 0;
> +
> + union {
> + struct sockaddr_in v4;
> + struct sockaddr_in6 v6;
> + } local_sa, remote_sa;
> +
> + local_sa.v4.sin_port = tr->local_port;
> + remote_sa.v4.sin_port = tr->remote_port;
> + if (tr->ipv6) {
> + local_sa.v6.sin6_family = AF_INET6;
> + remote_sa.v6.sin6_family = AF_INET6;
> + memcpy(local_sa.v6.sin6_addr.s6_addr, tr->local_addr, 4);
> + memcpy(remote_sa.v6.sin6_addr.s6_addr, tr->remote_addr, 4);
> + } else {
> + local_sa.v4.sin_family = AF_INET;
> + remote_sa.v4.sin_family =AF_INET;
> + local_sa.v4.sin_addr.s_addr = tr->local_addr[0];
> + remote_sa.v4.sin_addr.s_addr = tr->remote_addr[0];
> + }
> +
> + last_seq_bptr = ftrace_print_symbols_seq(s, tr->event,
> + tcp_trace_event_names);
> + cur_seq_bptr = trace_seq_buffer_ptr(s);
> + if (last_seq_bptr == cur_seq_bptr)
> + goto out;
> +
> + trb = (struct tcp_trace_basic *)tr;
> + ret = trace_seq_printf(s,
> + " %pISpc %pISpc snd_cwnd=%u mss=%u ssthresh=%u"
> + " srtt_us=%llu rto_ms=%u life_ms=%u",
> + &local_sa, &remote_sa,
> + trb->snd_cwnd, trb->mss, trb->ssthresh,
> + trb->srtt_us, trb->rto_ms, trb->life_ms);
> +
> + if (tr->event == TCP_TRACE_EVENT_ESTABLISHED || ret == 0)
> + goto out;
> +
> + stats = &(((struct tcp_trace_stats *)tr)->stats);
> + ret = trace_seq_printf(s,
> + " segs_out=%u data_segs_out=%u data_octets_out=%llu"
> + " other_segs_retrans=%u other_octets_retrans=%u"
> + " loss_segs_retrans=%u loss_octets_retrans=%u"
> + " segs_in=%u data_segs_in=%u data_octets_in=%llu"
> + " max_rtt_us=%llu min_rtt_us=%llu"
> + " count_rtt=%u sum_rtt_us=%llu"
> + " rtt_sample_us=%llu"
> + " max_rto_ms=%u min_rto_ms=%u"
> + " dup_acks_in=%u sacks_in=%u"
> + " sack_blks_in=%u ooo_in=%u",
> + stats->segs_out, stats->data_segs_out, stats->data_octets_out,
> + stats->other_segs_retrans, stats->other_octets_retrans,
> + stats->loss_segs_retrans, stats->loss_octets_retrans,
> + stats->segs_in, stats->data_segs_in, stats->data_octets_in,
> + stats->max_rtt_us, stats->min_rtt_us,
> + stats->count_rtt, stats->sum_rtt_us,
> + stats->rtt_sample_us,
> + stats->max_rto_ms, stats->min_rto_ms,
> + stats->dup_acks_in, stats->sacks_in,
> + stats->sack_blks_in, stats->ooo_in);
> +
> +out:
> + if (ret)
> + ret = trace_seq_putc(s, '\n');
> +
> + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
> +}
> +
> +static enum print_line_t
> +tcp_trace_print_binary(struct trace_iterator *iter)
> +{
> + int ret;
> + struct trace_seq *s = &iter->seq;
> + struct tcp_trace *tr = (struct tcp_trace *)iter->ent;
> + u32 magic = TCP_TRACE_MAGIC_VERSION;
> +
> + ret = trace_seq_putmem(s, &magic, sizeof(magic));
> + if (!ret)
> + goto out;
> +
> + if (tr->event == TCP_TRACE_EVENT_ESTABLISHED)
> + ret = trace_seq_putmem(s, tr + sizeof(magic),
> + sizeof(struct tcp_trace_basic));
> + else
> + ret = trace_seq_putmem(s, tr + sizeof(magic),
> + sizeof(struct tcp_trace_stats));
> +
> +out:
> + return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
> +}
> +
> +static enum print_line_t
> +tcp_tracer_print_line(struct trace_iterator *iter)
> +{
> + return (trace_flags & TRACE_ITER_BIN) ?
> + tcp_trace_print_binary(iter) :
> + tcp_trace_print(iter);
> +}
> +
> +static void tcp_register_tracepoints(void)
> +{
> + int ret;
> +
> + ret = register_trace_tcp_established(tcp_trace_established, NULL);
> + WARN_ON(ret);
> + ret = register_trace_tcp_close(tcp_trace_close, NULL);
> + WARN_ON(ret);
> + ret = register_trace_tcp_rcv_established(tcp_trace_rcv_established, NULL);
> + WARN_ON(ret);
> + ret = register_trace_tcp_transmit_skb(tcp_trace_transmit_skb, NULL);
> + WARN_ON(ret);
> + ret = register_trace_tcp_ooo_rcv(tcp_trace_ooo_rcv, NULL);
> + WARN_ON(ret);
> + ret = register_trace_tcp_sacks_rcv(tcp_trace_sacks_rcv, NULL);
> + WARN_ON(ret);
> + ret = register_trace_tcp_rtt_sample(tcp_trace_rtt_sample, NULL);
> + WARN_ON(ret);
> +}
> +
> +static void tcp_unregister_tracepoints(void)
> +{
> + unregister_trace_tcp_established(tcp_trace_established, NULL);
> + unregister_trace_tcp_rcv_established(tcp_trace_rcv_established, NULL);
> + unregister_trace_tcp_transmit_skb(tcp_trace_transmit_skb, NULL);
> + unregister_trace_tcp_ooo_rcv(tcp_trace_ooo_rcv, NULL);
> + unregister_trace_tcp_sacks_rcv(tcp_trace_sacks_rcv, NULL);
> + unregister_trace_tcp_rtt_sample(tcp_trace_rtt_sample, NULL);
> +
> + tracepoint_synchronize_unregister();
> +}
> +
> +static void tcp_tracer_start(struct trace_array *tr)
> +{
> + tcp_register_tracepoints();
> + tcp_trace_enabled = true;
> +}
> +
> +static void tcp_tracer_stop(struct trace_array *tr)
> +{
> + tcp_unregister_tracepoints();
> + tcp_trace_enabled = false;
> +}
> +
> +static void tcp_tracer_reset(struct trace_array *tr)
> +{
> + tcp_tracer_stop(tr);
> +}
> +
> +static int tcp_tracer_init(struct trace_array *tr)
> +{
> + tcp_tr = tr;
> + tcp_tracer_start(tr);
> + return 0;
> +}
> +
> +static struct tracer tcp_tracer __read_mostly = {
> + .name = "tcp",
> + .init = tcp_tracer_init,
> + .reset = tcp_tracer_reset,
> + .start = tcp_tracer_start,
> + .stop = tcp_tracer_stop,
> + .print_line = tcp_tracer_print_line,
> +};
> +
> +static struct trace_event_functions tcp_trace_event_funcs;
> +
> +static struct trace_event tcp_trace_event = {
> + .type = TRACE_TCP,
> + .funcs = &tcp_trace_event_funcs,
> +};
> +
> +static int __init init_tcp_tracer(void)
> +{
> + if (!register_ftrace_event(&tcp_trace_event)) {
> + pr_warning("Cannot register TCP trace event\n");
> + return 1;
> + }
> +
> + if (register_tracer(&tcp_tracer) != 0) {
> + pr_warning("Cannot register TCP tracer\n");
> + unregister_ftrace_event(&tcp_trace_event);
> + return 1;
> + }
> + return 0;
> +}
> +
> +device_initcall(init_tcp_tracer);
> diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
> index 385391f..5dc5962 100644
> --- a/kernel/trace/trace.h
> +++ b/kernel/trace/trace.h
> @@ -37,6 +37,7 @@ enum trace_type {
> TRACE_USER_STACK,
> TRACE_BLK,
> TRACE_BPUTS,
> + TRACE_TCP,
>
> __TRACE_LAST_TYPE,
> };
> --
> 1.8.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at https://urldefense.proofpoint.com/v1/url?u=http://vger.kernel.org/majordomo-info.html&k=ZVNjlDMF0FElm4dQtryO4A%3D%3D%0A&r=%2Faj1ZOQObwbmtLwlDw3XzQ%3D%3D%0A&m=CW4scPRBfOgsdn0GCbMgedOQVytKe3ZEBV2fC4xJFOA%3D%0A&s=d8b63403525c4df85b423582337b753283978aef9d9be19238adeb1042270caf
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH net-next] tcp: Add TCP tracer
2014-12-02 20:37 ` [PATCH net-next] tcp: Add TCP tracer Martin KaFai Lau
2014-12-02 20:40 ` Martin Lau
@ 2014-12-03 1:51 ` Stephen Hemminger
1 sibling, 0 replies; 11+ messages in thread
From: Stephen Hemminger @ 2014-12-03 1:51 UTC (permalink / raw)
To: Martin KaFai Lau; +Cc: davem, netdev
On Tue, 2 Dec 2014 12:37:42 -0800
Martin KaFai Lau <kafai@fb.com> wrote:
> diff --git a/include/uapi/linux/tcp_trace.h b/include/uapi/linux/tcp_trace.h
> index 2644f7f..d913a3c 100644
> --- a/include/uapi/linux/tcp_trace.h
> +++ b/include/uapi/linux/tcp_trace.h
> @@ -22,11 +22,11 @@ struct tcp_stats {
> __u32 other_segs_retrans;
> __u32 other_octets_retrans;
> __u32 loss_segs_retrans;
> - __u32 loss_octects_retrans;
> + __u32 loss_octets_retrans;
> __u32 segs_in;
> __u32 data_segs_in;
> - __u64 rtt_sample_us;
> __u64 data_octets_in;
> + __u64 rtt_sample_us;
> __u64 max_rtt_us;
> __u64 min_rtt_us;
> __u64 sum_rtt_us;
> @@ -64,9 +64,4 @@ struct tcp_trace_stats {
> struct tcp_stats stats;
> } __packed;
You can't change exposed kernel API like that.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [net_test_tools] udpflood: Add IPv6 support
2014-12-02 18:41 ` [net_test_tools] udpflood: Add IPv6 support Martin KaFai Lau
@ 2014-12-09 18:05 ` David Miller
0 siblings, 0 replies; 11+ messages in thread
From: David Miller @ 2014-12-09 18:05 UTC (permalink / raw)
To: kafai; +Cc: netdev
From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 2 Dec 2014 10:41:46 -0800
> This patch:
> 1. Add IPv6 support
> 2. Print timing for every 65536 fib insert operations to observe
> the gc effect (mostly for IPv6 fib).
Applied, thanks.
^ permalink raw reply [flat|nested] 11+ messages in thread
end of thread, other threads:[~2014-12-09 18:05 UTC | newest]
Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
2014-10-20 20:42 ` [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro Martin KaFai Lau
2014-10-20 20:42 ` [PATCH RFC v5 net 2/3] ipv6: Avoid redoing fib6_lookup() for RTF_CACHE hit case Martin KaFai Lau
2014-10-20 20:42 ` [PATCH RFC v5 net 3/3] ipv6: Avoid redoing fib6_lookup() with reachable = 0 by saving fn Martin KaFai Lau
2014-10-24 4:15 ` [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() David Miller
2014-10-24 17:28 ` Martin Lau
2014-12-02 18:41 ` [net_test_tools] udpflood: Add IPv6 support Martin KaFai Lau
2014-12-09 18:05 ` David Miller
2014-12-02 20:37 ` [PATCH net-next] tcp: Add TCP tracer Martin KaFai Lau
2014-12-02 20:40 ` Martin Lau
2014-12-03 1:51 ` Stephen Hemminger
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).