netdev.vger.kernel.org archive mirror
* [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route()
@ 2014-10-20 20:42 Martin KaFai Lau
  2014-10-20 20:42 ` [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro Martin KaFai Lau
                   ` (5 more replies)
  0 siblings, 6 replies; 11+ messages in thread
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev

Hi,

This patch set is trying to reduce the number of fib6_lookup()
calls from ip6_pol_route().

I have adapted davem's udpflood and kbench_mod tests
(https://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git) to
support IPv6 and here is the result:


Before:
[root]# for i in $(seq 1 3); do time ./udpflood -l 20000000 -c 250 2401:face:face:face::2; done

real    0m34.190s
user    0m3.047s
sys     0m31.108s

real    0m34.635s
user    0m3.125s
sys     0m31.475s

real    0m34.517s
user    0m3.034s
sys     0m31.449s

[root]# insmod ip6_route_kbench.ko oif=2 src=2401:face:face:face::1 dst=2401:face:face:face::2
[  660.160976] ip6_route_kbench: ip6_route_output tdiff: 933
[  660.207261] ip6_route_kbench: ip6_route_output tdiff: 988
[  660.253492] ip6_route_kbench: ip6_route_output tdiff: 896
[  660.298862] ip6_route_kbench: ip6_route_output tdiff: 898

After:
[root]# for i in $(seq 1 3); do time ./udpflood -l 20000000 -c 250 2401:face:face:face::2; done

real    0m32.695s
user    0m2.925s
sys     0m29.737s

real    0m32.636s
user    0m3.007s
sys     0m29.596s

real    0m32.797s
user    0m2.866s
sys     0m29.898s

[root]# insmod ip6_route_kbench.ko oif=2 src=2401:face:face:face::1 dst=2401:face:face:face::2
[  881.220793] ip6_route_kbench: ip6_route_output tdiff: 684
[  881.253477] ip6_route_kbench: ip6_route_output tdiff: 640
[  881.286867] ip6_route_kbench: ip6_route_output tdiff: 630
[  881.320749] ip6_route_kbench: ip6_route_output tdiff: 653
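
A rough calculation from the numbers above: the ip6_route_output cost
measured by kbench drops from ~930 to ~650 cycles (about 30% fewer
cycles per lookup), and the udpflood sys time drops from ~31.3s to
~29.7s (about 5%).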


/****************************** udpflood.c ******************************/
/* An adaptation of Eric Dumazet's and David Miller's udpflood tool,
 * with IPv6 support added.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>
#include <malloc.h>
#include <string.h>
#include <errno.h>
#include <unistd.h>
#include <stdint.h>
#include <assert.h>

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define _GNU_SOURCE
#include <getopt.h>

typedef uint32_t u32;

static int debug = 0;

/* Allow -fstrict-aliasing */
typedef union sa_u {
	struct sockaddr_storage a46;
	struct sockaddr_in a4;
	struct sockaddr_in6 a6;
} sa_u;

static int usage(void)
{
	printf("usage: udpflood [ -l count ] [ -m message_size ] [ -c num_ip_addrs=
 ] IP_ADDRESS\n");
	return -1;
}

static u32 get_last32h(const sa_u *sa)
{
	if (sa->a46.ss_family == PF_INET)
		return ntohl(sa->a4.sin_addr.s_addr);
	else
		return ntohl(sa->a6.sin6_addr.s6_addr32[3]);
}

static void set_last32h(sa_u *sa, u32 last32h)
{
	if (sa->a46.ss_family == PF_INET)
		sa->a4.sin_addr.s_addr = htonl(last32h);
	else
		sa->a6.sin6_addr.s6_addr32[3] = htonl(last32h);
}

static void print_saddr(const sa_u *sa, const char *msg)
{
	char buf[64];

	if (!debug)
		return;

	switch (sa->a46.ss_family) {
	case PF_INET:
		inet_ntop(PF_INET, &(sa->a4.sin_addr.s_addr), buf,
			  sizeof(buf));
		break;
	case PF_INET6:
		inet_ntop(PF_INET6, &(sa->a6.sin6_addr), buf, sizeof(buf));
		break;
	}

	printf("%s: %s\n", msg, buf);
}

static int send_packets(const sa_u *sa, size_t num_addrs, int count, int msg_sz)
{
	char *msg = malloc(msg_sz);
	sa_u saddr;
	u32 start_addr32h, end_addr32h, cur_addr32h;
	int fd, i, err;

	if (!msg)
		return -ENOMEM;

	memset(msg, 0, msg_sz);

	memcpy(&saddr, sa, sizeof(saddr));
	cur_addr32h = start_addr32h = get_last32h(&saddr);
	end_addr32h = start_addr32h + num_addrs;

	fd = socket(saddr.a46.ss_family, SOCK_DGRAM, 0);
	if (fd < 0) {
		perror("socket");
		err = fd;
		goto out_nofd;
	}

	/* connect to avoid the kernel spending time figuring out the
	 * source address (i.e. pin the src address)
	 */
	err = connect(fd, (struct sockaddr *) &saddr, sizeof(saddr));
	if (err < 0) {
		perror("connect");
		goto out;
	}

	print_saddr(&saddr, "start_addr");
	for (i = 0; i < count; i++) {
		print_saddr(&saddr, "sendto");
		err = sendto(fd, msg, msg_sz, 0, (struct sockaddr *)&saddr,
			     sizeof(saddr));
		if (err < 0) {
			perror("sendto");
			goto out;
		}

		if (++cur_addr32h >= end_addr32h)
			cur_addr32h = start_addr32h;
		set_last32h(&saddr, cur_addr32h);
	}

	err = 0;
out:
	close(fd);
out_nofd:
	free(msg);
	return err;
}

int main(int argc, char **argv, char **envp)
{
	int port, msg_sz, count, num_addrs, ret;

	sa_u start_addr;

	port = 6000;
	msg_sz = 32;
	count = 10000000;
	num_addrs = 1;

	while ((ret = getopt(argc, argv, "dl:s:p:c:")) >= 0) {
		switch (ret) {
		case 'l':
			sscanf(optarg, "%d", &count);
			break;
		case 's':
			sscanf(optarg, "%d", &msg_sz);
			break;
		case 'p':
			sscanf(optarg, "%d", &port);
			break;
		case 'c':
			sscanf(optarg, "%d", &num_addrs);
			break;
		case 'd':
			debug = 1;
			break;
		case '?':
			return usage();
		}
	}

	if (num_addrs < 1)
		return usage();

	if (!argv[optind])
		return usage();

	/* zero the union so e.g. sin6_flowinfo/sin6_scope_id start out clean */
	memset(&start_addr, 0, sizeof(start_addr));
	start_addr.a4.sin_port = htons(port);
	if (inet_pton(PF_INET, argv[optind], &start_addr.a4.sin_addr))
		start_addr.a46.ss_family = PF_INET;
	else if (inet_pton(PF_INET6, argv[optind], &start_addr.a6.sin6_addr.s6_addr))
		start_addr.a46.ss_family = PF_INET6;
	else
		return usage();

	return send_packets(&start_addr, num_addrs, count, msg_sz);
}

/****************** ip6_route_kbench_mod.c ******************/
#define pr_fmt(fmt) "ip6_route_kbench: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/inet.h>
#include <linux/in6.h>

#include <net/route.h>
#include <net/ip6_route.h>

#include <linux/timex.h>
#include <uapi/linux/icmpv6.h>

/* We can't just use "get_cycles()" as on some platforms, such
 * as sparc64, that gives system cycles rather than cpu clock
 * cycles.
 */

#ifdef CONFIG_SPARC64
static inline unsigned long long get_tick(void)
{
	unsigned long long t;

	__asm__ __volatile__("rd %%tick, %0" : "=r" (t));
	return t;
}
#elif defined(CONFIG_X86)
static inline unsigned long long get_tick(void)
{
	unsigned long long t;

	rdtscll(t);

	return t;
}
#elif defined(CONFIG_POWERPC)
static inline unsigned long long get_tick(void)
{
	return get_cycles();
}
#else
#error Unsupported architecture, please implement get_tick()
#endif

#define DEFAULT_WARMUP_COUNT 100000

#define DEFAULT_DST_IP_ADDR	0x4a800001
#define DEFAULT_SRC_IP_ADDR	0x00000000
#define DEFAULT_OIF		0
#define DEFAULT_IIF		0
#define DEFAULT_MARK		0x00000000
#define DEFAULT_TOS		0x00

/* Format helpers used by the pr_info() in kbench_init().  They were not
 * included in the original posting, so the definitions below are assumed.
 */
#define IP6_FMT			"%pI6c"
#define IP6_PRT(a)		(&(a))

static int flow_oif = DEFAULT_OIF;
static int flow_iif = DEFAULT_IIF;
static u32 flow_mark = DEFAULT_MARK;
static struct in6_addr flow_dst_ip_addr;
static struct in6_addr flow_src_ip_addr;
static int flow_tos = DEFAULT_TOS;

static char dst_string[64];
static char src_string[64];

module_param_string(dst, dst_string, sizeof(dst_string), 0);
module_param_string(src, src_string, sizeof(src_string), 0);

static int __init flow_setup(void)
{
	if (dst_string[0] &&
	    !in6_pton(dst_string, -1, &flow_dst_ip_addr.s6_addr[0], -1, NULL)) {
		pr_info("cannot parse \"%s\"\n", dst_string);
		return -1;
	}

	if (src_string[0] &&
	    !in6_pton(src_string, -1, &flow_src_ip_addr.s6_addr[0], -1, NULL)) {
		pr_info("cannot parse \"%s\"\n", dst_string);
		return -1;
	}

	return 0;
}

module_param_named(oif, flow_oif, int, 0);
module_param_named(iif, flow_iif, int, 0);
module_param_named(mark, flow_mark, uint, 0);
module_param_named(tos, flow_tos, int, 0);

static int warmup_count = DEFAULT_WARMUP_COUNT;
module_param_named(count, warmup_count, int, 0);

static void flow_init(struct flowi6 *fl6)
{
	memset(fl6, 0, sizeof(*fl6));
	fl6->flowi6_proto = IPPROTO_ICMPV6;
	fl6->flowi6_oif = flow_oif;
	fl6->flowi6_iif = flow_iif;
	fl6->flowi6_mark = flow_mark;
	fl6->flowi6_tos = flow_tos;
	fl6->daddr = flow_dst_ip_addr;
	fl6->saddr = flow_src_ip_addr;
}

static struct sk_buff * fake_skb_get(void)
{
	struct ipv6hdr *hdr;
	struct sk_buff *skb;

	skb = alloc_skb(4096, GFP_KERNEL);
	if (!skb) {
		pr_info("Cannot alloc SKB for test\n");
		return NULL;
	}
	skb->dev = __dev_get_by_index(&init_net, flow_iif);
	if (skb->dev == NULL) {
		pr_info("Input device (%d) does not exist\n", flow_iif);
		goto err;
	}

	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	hdr->priority = 0;
	hdr->version = 6;
	memset(hdr->flow_lbl, 0, sizeof(hdr->flow_lbl));
	hdr->payload_len = htons(sizeof(struct icmp6hdr));
	hdr->nexthdr = IPPROTO_ICMPV6;
	hdr->saddr = flow_src_ip_addr;
	hdr->daddr = flow_dst_ip_addr;
	skb->protocol = htons(ETH_P_IPV6);
	skb->mark = flow_mark;

	return skb;
err:
	kfree_skb(skb);
	return NULL;
}

static void do_full_output_lookup_bench(void)
{
	unsigned long long t1, t2, tdiff;
	struct rt6_info *rt;
	struct flowi6 fl6;
	int i;

	rt = NULL;

	for (i = 0; i < warmup_count; i++) {
		flow_init(&fl6);

		rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl6);
		if (IS_ERR(rt))
			break;
		ip6_rt_put(rt);
	}
	if (IS_ERR(rt)) {
		pr_info("ip_route_output_key: err=%ld\n", PTR_ERR(rt));
		return;
	}

	flow_init(&fl6);

	t1 = get_tick();
	rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl6);
	t2 = get_tick();
	if (!IS_ERR(rt))
		ip6_rt_put(rt);

	tdiff = t2 - t1;
	pr_info("ip6_route_output tdiff: %llu\n", tdiff);
}

static void do_full_input_lookup_bench(void)
{
	unsigned long long t1, t2, tdiff;
	struct sk_buff *skb;
	struct rt6_info *rt;
	int err, i;

	skb = fake_skb_get();
	if (skb == NULL)
		goto out_free;

	err = 0;
	local_bh_disable();
	for (i = 0; i < warmup_count; i++) {
		ip6_route_input(skb);
		rt = (struct rt6_info *)skb_dst(skb);
		err = (!rt || rt == init_net.ipv6.ip6_null_entry);
		skb_dst_drop(skb);
		if (err)
			break;
	}
	local_bh_enable();

	if (err) {
		pr_info("Input route lookup fails\n");
		goto out_free;
	}

	local_bh_disable();
	t1 = get_tick();
	ip6_route_input(skb);
	t2 = get_tick();
	local_bh_enable();

	rt = (struct rt6_info *)skb_dst(skb);
	err = (!rt || rt == init_net.ipv6.ip6_null_entry);
	skb_dst_drop(skb);
	if (err) {
		pr_info("Input route lookup fails\n");
		goto out_free;
	}

	tdiff = t2 - t1;
	pr_info("ip6_route_input tdiff: %llu\n", tdiff);

out_free:
	kfree_skb(skb);
}

static void do_full_lookup_bench(void)
{
	if (!flow_iif)
		do_full_output_lookup_bench();
	else
		do_full_input_lookup_bench();
}

static void do_bench(void)
{
	do_full_lookup_bench();
	do_full_lookup_bench();
	do_full_lookup_bench();
	do_full_lookup_bench();
}

static int __init kbench_init(void)
{
	if (flow_setup())
		return -EINVAL;

	pr_info("flow [IIF(%d),OIF(%d),MARK(0x%08x),D("IP6_FMT"),"
		"S("IP6_FMT"),TOS(0x%02x)]\n",
		flow_iif, flow_oif, flow_mark,
		IP6_PRT(flow_dst_ip_addr),
		IP6_PRT(flow_src_ip_addr),
		flow_tos);

#if defined(CONFIG_X86)
	if (!cpu_has_tsc) {
		pr_err("X86 TSC is required, but is unavailable.\n");
		return -EINVAL;
	}
#endif

	pr_info("sizeof(struct rt6_info)==%zu\n", sizeof(struct rt6_info));

	do_bench();

	return -ENODEV;
}

static void __exit kbench_exit(void)
{
}

module_init(kbench_init);
module_exit(kbench_exit);
MODULE_LICENSE("GPL");


* [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro
  2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
@ 2014-10-20 20:42 ` Martin KaFai Lau
  2014-10-20 20:42 ` [PATCH RFC v5 net 2/3] ipv6: Avoid redoing fib6_lookup() for RTF_CACHE hit case Martin KaFai Lau
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hannes Frederic Sowa

This is prep work for reducing the number of calls to fib6_lookup().

The BACKTRACK macro is hard to read and error-prone due to its
side effects (mainly the hidden goto).

This patch:
1. Replaces the BACKTRACK macro with a function (fib6_backtrack) with the
   following return values:
   * If a backtrack is possible, it returns the next fn to retry.
   * If it reaches the root, it returns NULL.
2. Makes the caller decide whether a backtrack is needed (by testing
   rt == net->ipv6.ip6_null_entry); the resulting caller pattern is
   sketched below.
3. Renames the goto labels in ip6_pol_route() to make the next few
   patches easier to read.
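
A condensed sketch of the new caller pattern (taken from the
ip6_pol_route() hunk in the diff below; multipath selection and
locking trimmed):

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);

redo_rt6_select:
	rt = rt6_select(fn, oif, strict | reachable);
	if (rt == net->ipv6.ip6_null_entry) {
		/* nothing usable in this node: walk back up the tree
		 * (and across source subtrees) and try again
		 */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		else
			goto out;	/* reached the root: give up */
	}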

Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv6/route.c | 70 ++++++++++++++++++++++++++++++++------------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a318dd89..f1ab2f4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -772,23 +772,22 @@ int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
 }
 #endif
 
-#define BACKTRACK(__net, saddr)			\
-do { \
-	if (rt == __net->ipv6.ip6_null_entry) {	\
-		struct fib6_node *pn; \
-		while (1) { \
-			if (fn->fn_flags & RTN_TL_ROOT) \
-				goto out; \
-			pn = fn->parent; \
-			if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
-				fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
-			else \
-				fn = pn; \
-			if (fn->fn_flags & RTN_RTINFO) \
-				goto restart; \
-		} \
-	} \
-} while (0)
+static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
+					struct in6_addr *saddr)
+{
+	struct fib6_node *pn;
+	while (1) {
+		if (fn->fn_flags & RTN_TL_ROOT)
+			return NULL;
+		pn = fn->parent;
+		if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
+			fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
+		else
+			fn = pn;
+		if (fn->fn_flags & RTN_RTINFO)
+			return fn;
+	}
+}
 
 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
 					     struct fib6_table *table,
@@ -804,8 +803,11 @@ restart:
 	rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
 	if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
 		rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
-	BACKTRACK(net, &fl6->saddr);
-out:
+	if (rt == net->ipv6.ip6_null_entry) {
+		fn = fib6_backtrack(fn, &fl6->saddr);
+		if (fn)
+			goto restart;
+	}
 	dst_use(&rt->dst, jiffies);
 	read_unlock_bh(&table->tb6_lock);
 	return rt;
@@ -924,19 +926,25 @@ static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
 
-relookup:
+redo_fib6_lookup_lock:
 	read_lock_bh(&table->tb6_lock);
 
-restart_2:
+redo_fib6_lookup:
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
 
-restart:
+redo_rt6_select:
 	rt = rt6_select(fn, oif, strict | reachable);
 	if (rt->rt6i_nsiblings)
 		rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
-	BACKTRACK(net, &fl6->saddr);
-	if (rt == net->ipv6.ip6_null_entry ||
-	    rt->rt6i_flags & RTF_CACHE)
+	if (rt == net->ipv6.ip6_null_entry) {
+		fn = fib6_backtrack(fn, &fl6->saddr);
+		if (fn)
+			goto redo_rt6_select;
+		else
+			goto out;
+	}
+
+	if (rt->rt6i_flags & RTF_CACHE)
 		goto out;
 
 	dst_hold(&rt->dst);
@@ -967,12 +975,12 @@ restart:
 	 * released someone could insert this route.  Relookup.
 	 */
 	ip6_rt_put(rt);
-	goto relookup;
+	goto redo_fib6_lookup_lock;
 
 out:
 	if (reachable) {
 		reachable = 0;
-		goto restart_2;
+		goto redo_fib6_lookup;
 	}
 	dst_hold(&rt->dst);
 	read_unlock_bh(&table->tb6_lock);
@@ -1235,10 +1243,12 @@ restart:
 		rt = net->ipv6.ip6_null_entry;
 	else if (rt->dst.error) {
 		rt = net->ipv6.ip6_null_entry;
-		goto out;
+	} else if (rt == net->ipv6.ip6_null_entry) {
+		fn = fib6_backtrack(fn, &fl6->saddr);
+		if (fn)
+			goto restart;
 	}
-	BACKTRACK(net, &fl6->saddr);
-out:
+
 	dst_hold(&rt->dst);
 
 	read_unlock_bh(&table->tb6_lock);
-- 
1.8.1


* [PATCH RFC v5 net 2/3] ipv6: Avoid redoing fib6_lookup() for RTF_CACHE hit case
  2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
  2014-10-20 20:42 ` [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro Martin KaFai Lau
@ 2014-10-20 20:42 ` Martin KaFai Lau
  2014-10-20 20:42 ` [PATCH RFC v5 net 3/3] ipv6: Avoid redoing fib6_lookup() with reachable = 0 by saving fn Martin KaFai Lau
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hannes Frederic Sowa

When there is an RTF_CACHE hit, there is no need to redo fib6_lookup()
with reachable=0; the cached clone can be returned directly (see the
sketch below).
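
Condensed, the reordered path looks like this (names from the diff
below):

	dst_hold(&rt->dst);			/* take a reference first... */
	read_unlock_bh(&table->tb6_lock);	/* ...so the table lock can be dropped */

	if (rt->rt6i_flags & RTF_CACHE)
		goto out2;	/* cached clone: return it directly, skipping
				 * the reachable=0 relookup at the old 'out:'
				 */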

Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv6/route.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index f1ab2f4..98c523f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -944,12 +944,12 @@ redo_rt6_select:
 			goto out;
 	}
 
-	if (rt->rt6i_flags & RTF_CACHE)
-		goto out;
-
 	dst_hold(&rt->dst);
 	read_unlock_bh(&table->tb6_lock);
 
+	if (rt->rt6i_flags & RTF_CACHE)
+		goto out2;
+
 	if (!(rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)))
 		nrt = rt6_alloc_cow(rt, &fl6->daddr, &fl6->saddr);
 	else if (!(rt->dst.flags & DST_HOST))
-- 
1.8.1


* [PATCH RFC v5 net 3/3] ipv6: Avoid redoing fib6_lookup() with reachable = 0 by saving fn
  2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
  2014-10-20 20:42 ` [PATCH RFC v5 net 1/3] ipv6: Remove BACKTRACK macro Martin KaFai Lau
  2014-10-20 20:42 ` [PATCH RFC v5 net 2/3] ipv6: Avoid redoing fib6_lookup() for RTF_CACHE hit case Martin KaFai Lau
@ 2014-10-20 20:42 ` Martin KaFai Lau
  2014-10-24  4:15 ` [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() David Miller
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 11+ messages in thread
From: Martin KaFai Lau @ 2014-10-20 20:42 UTC (permalink / raw)
  To: netdev; +Cc: David Miller, Hannes Frederic Sowa

This patch saves the fn before doing fib6_backtrack().
Hence, without redoing the fib6_lookup(), saved_fn can be used
to redo rt6_select() with RT6_LOOKUP_F_REACHABLE off.

Some minor changes that make sense to review as a single patch:
* Remove the 'out:' goto label.
* Remove the 'reachable' variable and only use the 'strict' variable.

After this patch, a failing ip6_ins_rt() should be the only case that
requires a redo of fib6_lookup().  The resulting retry logic is
sketched below.
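
Condensed from the diff below (multipath handling and the fast path
trimmed):

	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	saved_fn = fn;		/* remember the node fib6_lookup() returned */

redo_rt6_select:
	rt = rt6_select(fn, oif, strict);
	if (rt == net->ipv6.ip6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
		if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* retry from the saved node with the REACHABLE
			 * requirement relaxed -- no second fib6_lookup()
			 */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
		/* really no route: return the null entry */
	}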

Cc: David Miller <davem@davemloft.net>
Cc: Hannes Frederic Sowa <hannes@stressinduktion.org>
Signed-off-by: Martin KaFai Lau <kafai@fb.com>
---
 net/ipv6/route.c | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 98c523f..c910831 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -917,31 +917,40 @@ static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort,
 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
 				      struct flowi6 *fl6, int flags)
 {
-	struct fib6_node *fn;
+	struct fib6_node *fn, *saved_fn;
 	struct rt6_info *rt, *nrt;
 	int strict = 0;
 	int attempts = 3;
 	int err;
-	int reachable = net->ipv6.devconf_all->forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
 
 	strict |= flags & RT6_LOOKUP_F_IFACE;
+	if (net->ipv6.devconf_all->forwarding == 0)
+		strict |= RT6_LOOKUP_F_REACHABLE;
 
 redo_fib6_lookup_lock:
 	read_lock_bh(&table->tb6_lock);
 
-redo_fib6_lookup:
 	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
+	saved_fn = fn;
 
 redo_rt6_select:
-	rt = rt6_select(fn, oif, strict | reachable);
+	rt = rt6_select(fn, oif, strict);
 	if (rt->rt6i_nsiblings)
-		rt = rt6_multipath_select(rt, fl6, oif, strict | reachable);
+		rt = rt6_multipath_select(rt, fl6, oif, strict);
 	if (rt == net->ipv6.ip6_null_entry) {
 		fn = fib6_backtrack(fn, &fl6->saddr);
 		if (fn)
 			goto redo_rt6_select;
-		else
-			goto out;
+		else if (strict & RT6_LOOKUP_F_REACHABLE) {
+			/* also consider unreachable route */
+			strict &= ~RT6_LOOKUP_F_REACHABLE;
+			fn = saved_fn;
+			goto redo_rt6_select;
+		} else {
+			dst_hold(&rt->dst);
+			read_unlock_bh(&table->tb6_lock);
+			goto out2;
+		}
 	}
 
 	dst_hold(&rt->dst);
@@ -977,13 +986,6 @@ redo_rt6_select:
 	ip6_rt_put(rt);
 	goto redo_fib6_lookup_lock;
 
-out:
-	if (reachable) {
-		reachable = 0;
-		goto redo_fib6_lookup;
-	}
-	dst_hold(&rt->dst);
-	read_unlock_bh(&table->tb6_lock);
 out2:
 	rt->dst.lastuse = jiffies;
 	rt->dst.__use++;
-- 
1.8.1


* Re: [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route()
  2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
                   ` (2 preceding siblings ...)
  2014-10-20 20:42 ` [PATCH RFC v5 net 3/3] ipv6: Avoid redoing fib6_lookup() with reachable = 0 by saving fn Martin KaFai Lau
@ 2014-10-24  4:15 ` David Miller
  2014-10-24 17:28   ` Martin Lau
  2014-12-02 18:41 ` [net_test_tools] udpflood: Add IPv6 support Martin KaFai Lau
  2014-12-02 20:37 ` [PATCH net-next] tcp: Add TCP tracer Martin KaFai Lau
  5 siblings, 1 reply; 11+ messages in thread
From: David Miller @ 2014-10-24  4:15 UTC (permalink / raw)
  To: kafai; +Cc: netdev

From: Martin KaFai Lau <kafai@fb.com>
Date: Mon, 20 Oct 2014 13:42:42 -0700

> This patch set is trying to reduce the number of fib6_lookup()
> calls from ip6_pol_route().
> 
> I have adapted davem's udpflood and kbench_mod tests
> (https://git.kernel.org/pub/scm/linux/kernel/git/davem/net_test_tools.git) to
> support IPv6 and here is the result:

Series applied, thanks.

Can you cook up some clean patches against the net_test_tools repo so
that people can use it for both ipv4 and ipv6 route lookup measurements?

Thanks.


* Re: [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route()
  2014-10-24  4:15 ` [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() David Miller
@ 2014-10-24 17:28   ` Martin Lau
  0 siblings, 0 replies; 11+ messages in thread
From: Martin Lau @ 2014-10-24 17:28 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Hi,

> Can you cook up some clean patches against the net_test_tools repo so
> that people can use it for both ipv4 and ipv6 route lookup measurements?
Yes, will do.

Thanks,
--Martin


* [net_test_tools] udpflood: Add IPv6 support
  2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
                   ` (3 preceding siblings ...)
  2014-10-24  4:15 ` [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() David Miller
@ 2014-12-02 18:41 ` Martin KaFai Lau
  2014-12-09 18:05   ` David Miller
  2014-12-02 20:37 ` [PATCH net-next] tcp: Add TCP tracer Martin KaFai Lau
  5 siblings, 1 reply; 11+ messages in thread
From: Martin KaFai Lau @ 2014-12-02 18:41 UTC (permalink / raw)
  To: davem; +Cc: netdev

This patch:
1. Add IPv6 support
2. Print timing for every 65536 fib insert operations to observe
   the gc effect (mostly for IPv6 fib).
---
 udpflood.c | 125 +++++++++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 101 insertions(+), 24 deletions(-)

diff --git a/udpflood.c b/udpflood.c
index 6e658f7..5855012 100644
--- a/udpflood.c
+++ b/udpflood.c
@@ -6,7 +6,9 @@
 #include <string.h>
 #include <errno.h>
 #include <unistd.h>
+#include <stdint.h>
 
+#include <sys/time.h>
 #include <sys/types.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
@@ -15,57 +17,121 @@
 #define _GNU_SOURCE
 #include <getopt.h>
 
+static int debug = 0;
+
+typedef union sa_u {
+	struct sockaddr_in a4;
+	struct sockaddr_in6 a6;
+} sa_u;
+
 static int usage(void)
 {
 	printf("usage: udpflood [ -l count ] [ -m message_size ] [ -c num_ip_addrs ] IP_ADDRESS\n");
 	return -1;
 }
 
-static int send_packets(in_addr_t start_addr, in_addr_t end_addr,
-			int port, int count, int msg_sz)
+static uint32_t get_last32h(const sa_u *sa)
+{
+	if (sa->a4.sin_family == PF_INET)
+		return ntohl(sa->a4.sin_addr.s_addr);
+	else
+		return ntohl(sa->a6.sin6_addr.s6_addr32[3]);
+}
+
+static void set_last32h(sa_u *sa, uint32_t last32h)
+{
+	if (sa->a4.sin_family == PF_INET)
+		sa->a4.sin_addr.s_addr = htonl(last32h);
+	else
+		sa->a6.sin6_addr.s6_addr32[3] = htonl(last32h);
+}
+
+static void print_sa(const sa_u *sa, const char *msg)
+{
+	char buf[sizeof("xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx:xxxx")];
+
+	if (!debug)
+		return;
+
+	switch (sa->a4.sin_family) {
+	case PF_INET:
+		inet_ntop(PF_INET, &(sa->a4.sin_addr.s_addr), buf,
+			  sizeof(buf));
+		break;
+	case PF_INET6:
+		inet_ntop(PF_INET6, sa->a6.sin6_addr.s6_addr, buf, sizeof(buf));
+		break;
+	}
+
+	printf("%s: %s\n", msg, buf);
+}
+
+static long get_diff_ms(const struct timeval *now,
+			     const struct timeval *start)
+{
+	long start_ms, now_ms;
+	start_ms = start->tv_sec * 1000 + (start->tv_usec / 1000);
+	now_ms = now->tv_sec * 1000 + (now->tv_usec  / 1000);
+	return now_ms - start_ms;
+}
+
+static int send_packets(const sa_u *start_sa, size_t num_addrs, int count,
+			int msg_sz)
 {
 	char *msg = malloc(msg_sz);
-	struct sockaddr_in saddr;
-	in_addr_t addr;
+	sa_u cur_sa;
+	uint32_t start_addr32h, end_addr32h, cur_addr32h;
 	int fd, i, err;
+	struct timeval last, now;
 
 	if (!msg)
 		return -ENOMEM;
 
 	memset(msg, 0, msg_sz);
 
-	addr = start_addr;
-
-	memset(&saddr, 0, sizeof(saddr));
-	saddr.sin_family = AF_INET;
-	saddr.sin_port = port;
-	saddr.sin_addr.s_addr = addr;
+	memcpy(&cur_sa, start_sa, sizeof(cur_sa));
+	cur_addr32h = start_addr32h = get_last32h(&cur_sa);
+	end_addr32h = start_addr32h + num_addrs;
 
-	fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+	fd = socket(cur_sa.a4.sin_family, SOCK_DGRAM, IPPROTO_IP);
 	if (fd < 0) {
 		perror("socket");
 		err = fd;
 		goto out_nofd;
 	}
-	err = connect(fd, (struct sockaddr *) &saddr, sizeof(saddr));
+	err = connect(fd, (struct sockaddr *) &cur_sa, sizeof(cur_sa));
 	if (err < 0) {
 		perror("connect");
-		close(fd);
 		goto out;
 	}
 
+	print_sa(start_sa, "start_addr");
+	gettimeofday(&last, NULL);
 	for (i = 0; i < count; i++) {
-		saddr.sin_addr.s_addr = addr;
-
+		print_sa(&cur_sa, "sendto");
 		err = sendto(fd, msg, msg_sz, 0,
-			     (struct sockaddr *) &saddr, sizeof(saddr));
+			     (struct sockaddr *) &cur_sa, sizeof(cur_sa));
 		if (err < 0) {
 			perror("sendto");
 			goto out;
 		}
 
-		if (++addr >= end_addr)
-			addr = start_addr;
+		if (++cur_addr32h >= end_addr32h)
+			cur_addr32h = start_addr32h;
+		set_last32h(&cur_sa, cur_addr32h);
+
+		/*
+		 * print timing info for every 65536 fib inserts to
+		 * observe the gc effect (mostly for IPv6 fib).
+		 */
+		if (i && (i & 0xFFFF) == 0) {
+		    long diff_ms;
+		    gettimeofday(&now, NULL);
+		    diff_ms = get_diff_ms(&now, &last);
+		    printf("%d %ld.%ld\n", i >> 16,
+			   diff_ms / 1000, diff_ms % 1000);
+		    memcpy(&last, &now, sizeof(last));
+		}
 	}
 
 	err = 0;
@@ -79,14 +145,14 @@ out_nofd:
 int main(int argc, char **argv, char **envp)
 {
 	int port, msg_sz, count, num_addrs, ret;
-	in_addr_t start_addr, end_addr;
+	sa_u start_sa;
 
 	port = 6000;
 	msg_sz = 32;
 	count = 10000000;
 	num_addrs = 1;
 
-	while ((ret = getopt(argc, argv, "l:s:p:c:")) >= 0) {
+	while ((ret = getopt(argc, argv, "dl:s:p:c:")) >= 0) {
 		switch (ret) {
 		case 'l':
 			sscanf(optarg, "%d", &count);
@@ -100,18 +166,29 @@ int main(int argc, char **argv, char **envp)
 		case 'c':
 			sscanf(optarg, "%d", &num_addrs);
 			break;
+		case 'd':
+			debug = 1;
+			break;
 		case '?':
 			return usage();
 		}
 	}
 
+	if (num_addrs < 1 || count < 1)
+		return usage();
+
 	if (!argv[optind])
 		return usage();
 
-	start_addr = inet_addr(argv[optind]);
-	if (start_addr == INADDR_NONE)
+	memset(&start_sa, 0, sizeof(start_sa));
+	start_sa.a4.sin_port = htons(port);
+	if (inet_pton(PF_INET, argv[optind], &start_sa.a4.sin_addr))
+		start_sa.a4.sin_family = PF_INET;
+	else if (inet_pton(PF_INET6, argv[optind],
+			   start_sa.a6.sin6_addr.s6_addr))
+		start_sa.a6.sin6_family = PF_INET6;
+	else
 		return usage();
-	end_addr = start_addr + num_addrs;
 
-	return send_packets(start_addr, end_addr, port, count, msg_sz);
+	return send_packets(&start_sa, num_addrs, count, msg_sz);
 }
-- 
1.8.1


* [PATCH net-next] tcp: Add TCP tracer
  2014-10-20 20:42 [PATCH RFC v5 net 0/3] ipv6: Reduce the number of fib6_lookup() calls from ip6_pol_route() Martin KaFai Lau
                   ` (4 preceding siblings ...)
  2014-12-02 18:41 ` [net_test_tools] udpflood: Add IPv6 support Martin KaFai Lau
@ 2014-12-02 20:37 ` Martin KaFai Lau
  2014-12-02 20:40   ` Martin Lau
  2014-12-03  1:51   ` Stephen Hemminger
  5 siblings, 2 replies; 11+ messages in thread
From: Martin KaFai Lau @ 2014-12-02 20:37 UTC (permalink / raw)
  To: davem; +Cc: netdev

Define probes and register them with the TCP tracepoints.  The probes
collect the data defined in struct tcp_sk_trace and record it in the
tracing ring_buffer.
---
 include/uapi/linux/tcp_trace.h |   9 +-
 kernel/trace/tcp_trace.c       | 448 +++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h           |   1 +
 3 files changed, 451 insertions(+), 7 deletions(-)

diff --git a/include/uapi/linux/tcp_trace.h b/include/uapi/linux/tcp_trace.h
index 2644f7f..d913a3c 100644
--- a/include/uapi/linux/tcp_trace.h
+++ b/include/uapi/linux/tcp_trace.h
@@ -22,11 +22,11 @@ struct tcp_stats {
 	__u32	other_segs_retrans;
 	__u32	other_octets_retrans;
 	__u32	loss_segs_retrans;
-	__u32	loss_octects_retrans;
+	__u32	loss_octets_retrans;
 	__u32	segs_in;
 	__u32	data_segs_in;
-	__u64	rtt_sample_us;
 	__u64	data_octets_in;
+	__u64	rtt_sample_us;
 	__u64	max_rtt_us;
 	__u64	min_rtt_us;
 	__u64   sum_rtt_us;
@@ -64,9 +64,4 @@ struct tcp_trace_stats {
         struct tcp_stats stats;
 } __packed;
 
-typedef struct tcp_trace_basic tcp_trace_establish;
-typedef struct tcp_trace_basic tcp_trace_retrans;
-typedef struct tcp_trace_stats tcp_trace_periodic;
-typedef struct tcp_trace_stats tcp_trace_close;
-
 #endif /* UAPI_TCP_TRACE_H */
diff --git a/kernel/trace/tcp_trace.c b/kernel/trace/tcp_trace.c
index 9d09fd0..376580b 100644
--- a/kernel/trace/tcp_trace.c
+++ b/kernel/trace/tcp_trace.c
@@ -1,9 +1,27 @@
 #include <net/tcp_trace.h>
+#include <net/tcp.h>
+#include <trace/events/tcp.h>
 #include <linux/tcp.h>
+#include <linux/ipv6.h>
+#include <linux/ftrace_event.h>
+#include <linux/jiffies.h>
 #include <uapi/linux/tcp_trace.h>
 
+#include "trace_output.h"
+
+#define REPORT_INTERVAL_MS 2000
+
+static struct trace_array *tcp_tr;
 static bool tcp_trace_enabled __read_mostly;
 
+static struct trace_print_flags tcp_trace_event_names[] = {
+	{ TCP_TRACE_EVENT_ESTABLISHED, "established" },
+	{ TCP_TRACE_EVENT_PERIODIC, "periodic" },
+	{ TCP_TRACE_EVENT_RETRANS, "retrans" },
+	{ TCP_TRACE_EVENT_RETRANS_LOSS, "retrans_loss" },
+	{ TCP_TRACE_EVENT_CLOSE, "close" }
+};
+
 struct tcp_sk_trace {
 	struct tcp_stats stats;
 	unsigned long start_ts;
@@ -35,3 +53,433 @@ void tcp_sk_trace_destruct(struct sock *sk)
 {
 	kfree(tcp_sk(sk)->trace);
 }
+
+static void tcp_trace_init(struct tcp_trace *tr,
+			   enum tcp_trace_events trev,
+			   struct sock *sk)
+{
+	tr->event = trev;
+	if (sk->sk_family == AF_INET) {
+		tr->ipv6 = 0;
+		tr->local_addr[0] = inet_sk(sk)->inet_saddr;
+		tr->remote_addr[0] = inet_sk(sk)->inet_daddr;
+	} else {
+		BUG_ON(sk->sk_family != AF_INET6);
+		tr->ipv6 = 1;
+		memcpy(tr->local_addr, inet6_sk(sk)->saddr.s6_addr32,
+		       sizeof(tr->local_addr));
+		memcpy(tr->remote_addr, sk->sk_v6_daddr.s6_addr32,
+		       sizeof(tr->remote_addr));
+	}
+	tr->local_port = inet_sk(sk)->inet_sport;
+	tr->remote_port = inet_sk(sk)->inet_dport;
+}
+
+static void tcp_trace_basic_init(struct tcp_trace_basic *trb,
+				 enum tcp_trace_events trev,
+				 struct sock *sk)
+{
+	struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
+	tcp_trace_init((struct tcp_trace *)trb, trev, sk);
+	trb->snd_cwnd = tcp_sk(sk)->snd_cwnd * tcp_sk(sk)->mss_cache;
+	trb->mss = tcp_sk(sk)->mss_cache;
+	trb->ssthresh = tcp_current_ssthresh(sk);
+	trb->srtt_us = tcp_sk(sk)->srtt_us >> 3;
+	trb->rto_ms = jiffies_to_msecs(inet_csk(sk)->icsk_rto);
+	trb->life_ms = jiffies_to_msecs(jiffies - sktr->start_ts);
+}
+
+static void tcp_trace_basic_add(enum tcp_trace_events trev, struct sock *sk)
+{
+	struct ring_buffer *buffer;
+	int pc;
+	struct ring_buffer_event *event;
+	struct tcp_trace_basic *trb;
+	struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
+
+	if (!sktr)
+		return;
+
+	tracing_record_cmdline(current);
+	buffer = tcp_tr->trace_buffer.buffer;
+	pc = preempt_count();
+	event = trace_buffer_lock_reserve(buffer, TRACE_TCP,
+					  sizeof(*trb), 0, pc);
+	if (!event)
+		return;
+	trb = ring_buffer_event_data(event);
+	tcp_trace_basic_init(trb, trev, sk);
+	trace_buffer_unlock_commit(buffer, event, 0, pc);
+}
+
+static void tcp_trace_stats_init(struct tcp_trace_stats *trs,
+				 enum tcp_trace_events trev,
+				 struct sock *sk)
+{
+	struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
+
+	tcp_trace_basic_init((struct tcp_trace_basic *)trs, trev, sk);
+	memcpy(&trs->stats, &sktr->stats, sizeof(sktr->stats));
+}
+
+static void tcp_trace_stats_add(enum tcp_trace_events trev, struct sock *sk)
+{
+	struct ring_buffer *buffer;
+	int pc;
+	struct ring_buffer_event *event;
+	struct tcp_trace_stats *trs;
+	struct tcp_sk_trace *sktr = tcp_sk(sk)->trace;
+
+	if (!sktr)
+		return;
+
+	tracing_record_cmdline(current);
+	buffer = tcp_tr->trace_buffer.buffer;
+	pc = preempt_count();
+	event = trace_buffer_lock_reserve(buffer, TRACE_TCP,
+					  sizeof(*trs), 0, pc);
+	if (!event)
+		return;
+	trs = ring_buffer_event_data(event);
+
+	tcp_trace_stats_init(trs, trev, sk);
+
+	trace_buffer_unlock_commit(buffer, event, 0, pc);
+}
+
+static void tcp_trace_established(void *ignore, struct sock *sk)
+{
+	tcp_trace_basic_add(TCP_TRACE_EVENT_ESTABLISHED, sk);
+}
+
+static void tcp_trace_transmit_skb(void *ignore, struct sock *sk,
+				   struct sk_buff *skb)
+{
+	int pcount;
+	struct tcp_sk_trace *sktr;
+	struct tcp_skb_cb *tcb;
+	unsigned int data_len;
+	bool retrans = false;
+
+	sktr = tcp_sk(sk)->trace;
+	if (!sktr)
+		return;
+
+	tcb = TCP_SKB_CB(skb);
+	pcount = tcp_skb_pcount(skb);
+	data_len = tcb->end_seq - tcb->seq;
+
+	sktr->stats.segs_out += pcount;
+
+	if (!data_len)
+		goto out;
+
+	sktr->stats.data_segs_out += pcount;
+	sktr->stats.data_octets_out += data_len;
+
+	if (before(tcb->seq, tcp_sk(sk)->snd_nxt)) {
+		enum tcp_trace_events trev;
+		retrans = true;
+		if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
+			sktr->stats.loss_segs_retrans += pcount;
+			sktr->stats.loss_octets_retrans += data_len;
+			trev = TCP_TRACE_EVENT_RETRANS_LOSS;
+		} else {
+			sktr->stats.other_segs_retrans += pcount;
+			sktr->stats.other_octets_retrans += data_len;
+			trev = TCP_TRACE_EVENT_RETRANS;
+		}
+		tcp_trace_stats_add(trev, sk);
+		return;
+	}
+
+out:
+	if (jiffies_to_msecs(jiffies - sktr->last_ts) >=
+	    REPORT_INTERVAL_MS) {
+		sktr->last_ts = jiffies;
+		tcp_trace_stats_add(TCP_TRACE_EVENT_PERIODIC, sk);
+	}
+}
+
+static void tcp_trace_rcv_established(void *ignore, struct sock *sk,
+				      struct sk_buff *skb)
+{
+	struct tcp_sk_trace *sktr;
+	unsigned int data_len;
+	struct tcphdr *th;
+
+	sktr = tcp_sk(sk)->trace;
+	if (!sktr)
+		return;
+
+	th = tcp_hdr(skb);
+	WARN_ON_ONCE(skb->len < th->doff << 2);
+
+	sktr->stats.segs_in++;
+	data_len = skb->len - (th->doff << 2);
+	if (data_len) {
+		if (TCP_SKB_CB(skb)->ack_seq == tcp_sk(sk)->snd_una)
+			sktr->stats.dup_acks_in++;
+	} else {
+		sktr->stats.data_segs_in++;
+		sktr->stats.data_segs_in += data_len;
+	}
+
+	if (jiffies_to_msecs(jiffies - sktr->last_ts) >=
+	    REPORT_INTERVAL_MS) {
+		sktr->last_ts = jiffies;
+		tcp_trace_stats_add(TCP_TRACE_EVENT_PERIODIC, sk);
+	}
+}
+
+static void tcp_trace_close(void *ignore, struct sock *sk)
+{
+	struct tcp_sk_trace *sktr;
+	sktr = tcp_sk(sk)->trace;
+	if (!sktr)
+		return;
+
+	tcp_trace_stats_add(TCP_TRACE_EVENT_CLOSE, sk);
+}
+
+static void tcp_trace_ooo_rcv(void *ignore, struct sock *sk)
+{
+	struct tcp_sk_trace *sktr;
+
+	sktr = tcp_sk(sk)->trace;
+	if (!sktr)
+		return;
+
+	sktr->stats.ooo_in++;
+}
+
+static void tcp_trace_sacks_rcv(void *ignore, struct sock *sk, int num_sacks)
+{
+	struct tcp_sk_trace *sktr;
+
+	sktr = tcp_sk(sk)->trace;
+	if (!sktr)
+		return;
+
+	sktr->stats.sacks_in++;
+	sktr->stats.sack_blks_in += num_sacks;
+}
+
+void tcp_trace_rtt_sample(void *ignore, struct sock *sk,
+			  long rtt_sample_us)
+{
+	struct tcp_sk_trace *sktr;
+        u32 rto_ms;
+
+	sktr = tcp_sk(sk)->trace;
+	if (!sktr)
+		return;
+
+	rto_ms = jiffies_to_msecs(inet_csk(sk)->icsk_rto);
+
+	sktr->stats.rtt_sample_us = rtt_sample_us;
+	sktr->stats.max_rtt_us = max_t(u64, sktr->stats.max_rtt_us, rtt_sample_us);
+	sktr->stats.min_rtt_us = min_t(u64, sktr->stats.min_rtt_us, rtt_sample_us);
+
+	sktr->stats.count_rtt++;
+	sktr->stats.sum_rtt_us += rtt_sample_us;
+
+	sktr->stats.max_rto_ms = max_t(u32, sktr->stats.max_rto_ms, rto_ms);
+	sktr->stats.min_rto_ms = min_t(u32, sktr->stats.min_rto_ms, rto_ms);
+}
+
+static enum print_line_t
+tcp_trace_print(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct tcp_trace *tr = (struct tcp_trace *)iter->ent;
+	struct tcp_trace_basic *trb;
+	struct tcp_stats *stats;
+	const char *last_seq_bptr, *cur_seq_bptr;
+	int ret = 0;
+
+	union {
+		struct sockaddr_in v4;
+		struct sockaddr_in6 v6;
+	} local_sa, remote_sa;
+
+	local_sa.v4.sin_port = tr->local_port;
+	remote_sa.v4.sin_port = tr->remote_port;
+	if (tr->ipv6) {
+		local_sa.v6.sin6_family = AF_INET6;
+		remote_sa.v6.sin6_family = AF_INET6;
+		memcpy(local_sa.v6.sin6_addr.s6_addr, tr->local_addr, 4);
+		memcpy(remote_sa.v6.sin6_addr.s6_addr, tr->remote_addr, 4);
+	} else {
+		local_sa.v4.sin_family = AF_INET;
+		remote_sa.v4.sin_family =AF_INET;
+		local_sa.v4.sin_addr.s_addr =  tr->local_addr[0];
+		remote_sa.v4.sin_addr.s_addr = tr->remote_addr[0];
+	}
+
+	last_seq_bptr = ftrace_print_symbols_seq(s, tr->event,
+						tcp_trace_event_names);
+	cur_seq_bptr = trace_seq_buffer_ptr(s);
+	if (last_seq_bptr == cur_seq_bptr)
+		goto out;
+
+	trb = (struct tcp_trace_basic *)tr;
+	ret = trace_seq_printf(s,
+			       " %pISpc %pISpc snd_cwnd=%u mss=%u ssthresh=%u"
+			       " srtt_us=%llu rto_ms=%u life_ms=%u",
+			       &local_sa, &remote_sa,
+			       trb->snd_cwnd, trb->mss, trb->ssthresh,
+			       trb->srtt_us, trb->rto_ms, trb->life_ms);
+
+	if (tr->event == TCP_TRACE_EVENT_ESTABLISHED || ret == 0)
+		goto out;
+
+	stats = &(((struct tcp_trace_stats *)tr)->stats);
+	ret = trace_seq_printf(s,
+		" segs_out=%u data_segs_out=%u data_octets_out=%llu"
+		" other_segs_retrans=%u other_octets_retrans=%u"
+		" loss_segs_retrans=%u loss_octets_retrans=%u"
+		" segs_in=%u data_segs_in=%u data_octets_in=%llu"
+		" max_rtt_us=%llu min_rtt_us=%llu"
+		" count_rtt=%u sum_rtt_us=%llu"
+		" rtt_sample_us=%llu"
+		" max_rto_ms=%u min_rto_ms=%u"
+		" dup_acks_in=%u sacks_in=%u"
+		" sack_blks_in=%u ooo_in=%u",
+		stats->segs_out, stats->data_segs_out, stats->data_octets_out,
+		stats->other_segs_retrans, stats->other_octets_retrans,
+		stats->loss_segs_retrans, stats->loss_octets_retrans,
+		stats->segs_in, stats->data_segs_in, stats->data_octets_in,
+		stats->max_rtt_us, stats->min_rtt_us,
+		stats->count_rtt, stats->sum_rtt_us,
+		stats->rtt_sample_us,
+		stats->max_rto_ms, stats->min_rto_ms,
+		stats->dup_acks_in, stats->sacks_in,
+		stats->sack_blks_in, stats->ooo_in);
+
+out:
+	if (ret)
+		ret = trace_seq_putc(s, '\n');
+
+	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t
+tcp_trace_print_binary(struct trace_iterator *iter)
+{
+	int ret;
+	struct trace_seq *s = &iter->seq;
+	struct tcp_trace *tr = (struct tcp_trace *)iter->ent;
+	u32 magic = TCP_TRACE_MAGIC_VERSION;
+
+	ret = trace_seq_putmem(s, &magic, sizeof(magic));
+	if (!ret)
+		goto out;
+
+	if (tr->event == TCP_TRACE_EVENT_ESTABLISHED)
+		ret = trace_seq_putmem(s, tr + sizeof(magic),
+					sizeof(struct tcp_trace_basic));
+	else
+		ret = trace_seq_putmem(s, tr + sizeof(magic),
+				       sizeof(struct tcp_trace_stats));
+
+out:
+	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t
+tcp_tracer_print_line(struct trace_iterator *iter)
+{
+	return (trace_flags & TRACE_ITER_BIN) ?
+		tcp_trace_print_binary(iter) :
+		tcp_trace_print(iter);
+}
+
+static void tcp_register_tracepoints(void)
+{
+	int ret;
+
+	ret = register_trace_tcp_established(tcp_trace_established, NULL);
+	WARN_ON(ret);
+	ret = register_trace_tcp_close(tcp_trace_close, NULL);
+	WARN_ON(ret);
+	ret = register_trace_tcp_rcv_established(tcp_trace_rcv_established, NULL);
+	WARN_ON(ret);
+	ret = register_trace_tcp_transmit_skb(tcp_trace_transmit_skb, NULL);
+	WARN_ON(ret);
+	ret = register_trace_tcp_ooo_rcv(tcp_trace_ooo_rcv, NULL);
+	WARN_ON(ret);
+	ret = register_trace_tcp_sacks_rcv(tcp_trace_sacks_rcv, NULL);
+	WARN_ON(ret);
+	ret = register_trace_tcp_rtt_sample(tcp_trace_rtt_sample, NULL);
+	WARN_ON(ret);
+}
+
+static void tcp_unregister_tracepoints(void)
+{
+	unregister_trace_tcp_established(tcp_trace_established, NULL);
+	unregister_trace_tcp_rcv_established(tcp_trace_rcv_established, NULL);
+	unregister_trace_tcp_transmit_skb(tcp_trace_transmit_skb, NULL);
+	unregister_trace_tcp_ooo_rcv(tcp_trace_ooo_rcv, NULL);
+	unregister_trace_tcp_sacks_rcv(tcp_trace_sacks_rcv, NULL);
+	unregister_trace_tcp_rtt_sample(tcp_trace_rtt_sample, NULL);
+
+	tracepoint_synchronize_unregister();
+}
+
+static void tcp_tracer_start(struct trace_array *tr)
+{
+	tcp_register_tracepoints();
+	tcp_trace_enabled = true;
+}
+
+static void tcp_tracer_stop(struct trace_array *tr)
+{
+	tcp_unregister_tracepoints();
+	tcp_trace_enabled = false;
+}
+
+static void tcp_tracer_reset(struct trace_array *tr)
+{
+	tcp_tracer_stop(tr);
+}
+
+static int tcp_tracer_init(struct trace_array *tr)
+{
+	tcp_tr = tr;
+	tcp_tracer_start(tr);
+	return 0;
+}
+
+static struct tracer tcp_tracer __read_mostly = {
+        .name           = "tcp",
+        .init           = tcp_tracer_init,
+        .reset          = tcp_tracer_reset,
+        .start          = tcp_tracer_start,
+        .stop           = tcp_tracer_stop,
+	.print_line	= tcp_tracer_print_line,
+};
+
+static struct trace_event_functions tcp_trace_event_funcs;
+
+static struct trace_event tcp_trace_event = {
+        .type           = TRACE_TCP,
+        .funcs          = &tcp_trace_event_funcs,
+};
+
+static int __init init_tcp_tracer(void)
+{
+        if (!register_ftrace_event(&tcp_trace_event)) {
+                pr_warning("Cannot register TCP trace event\n");
+                return 1;
+        }
+
+        if (register_tracer(&tcp_tracer) != 0) {
+                pr_warning("Cannot register TCP tracer\n");
+                unregister_ftrace_event(&tcp_trace_event);
+                return 1;
+        }
+        return 0;
+}
+
+device_initcall(init_tcp_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 385391f..5dc5962 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -37,6 +37,7 @@ enum trace_type {
 	TRACE_USER_STACK,
 	TRACE_BLK,
 	TRACE_BPUTS,
+	TRACE_TCP,
 
 	__TRACE_LAST_TYPE,
 };
-- 
1.8.1


* Re: [PATCH net-next] tcp: Add TCP tracer
  2014-12-02 20:37 ` [PATCH net-next] tcp: Add TCP tracer Martin KaFai Lau
@ 2014-12-02 20:40   ` Martin Lau
  2014-12-03  1:51   ` Stephen Hemminger
  1 sibling, 0 replies; 11+ messages in thread
From: Martin Lau @ 2014-12-02 20:40 UTC (permalink / raw)
  To: davem; +Cc: netdev

Please ignore this patch; it is not completely ready and was sent out
by mistake.

On Tue, Dec 02, 2014 at 12:37:42PM -0800, Martin KaFai Lau wrote:
> Define probes and register them with the TCP tracepoints.  The probes
> collect the data defined in struct tcp_sk_trace and record it in the
> tracing ring_buffer.
> [...]


* Re: [PATCH net-next] tcp: Add TCP tracer
  2014-12-02 20:37 ` [PATCH net-next] tcp: Add TCP tracer Martin KaFai Lau
  2014-12-02 20:40   ` Martin Lau
@ 2014-12-03  1:51   ` Stephen Hemminger
  1 sibling, 0 replies; 11+ messages in thread
From: Stephen Hemminger @ 2014-12-03  1:51 UTC (permalink / raw)
  To: Martin KaFai Lau; +Cc: davem, netdev

On Tue, 2 Dec 2014 12:37:42 -0800
Martin KaFai Lau <kafai@fb.com> wrote:

> diff --git a/include/uapi/linux/tcp_trace.h b/include/uapi/linux/tcp_trace.h
> index 2644f7f..d913a3c 100644
> --- a/include/uapi/linux/tcp_trace.h
> +++ b/include/uapi/linux/tcp_trace.h
> @@ -22,11 +22,11 @@ struct tcp_stats {
>  	__u32	other_segs_retrans;
>  	__u32	other_octets_retrans;
>  	__u32	loss_segs_retrans;
> -	__u32	loss_octects_retrans;
> +	__u32	loss_octets_retrans;
>  	__u32	segs_in;
>  	__u32	data_segs_in;
> -	__u64	rtt_sample_us;
>  	__u64	data_octets_in;
> +	__u64	rtt_sample_us;
>  	__u64	max_rtt_us;
>  	__u64	min_rtt_us;
>  	__u64   sum_rtt_us;
> @@ -64,9 +64,4 @@ struct tcp_trace_stats {
>          struct tcp_stats stats;
>  } __packed;

You can't change exposed kernel API like that.


* Re: [net_test_tools] udpflood: Add IPv6 support
  2014-12-02 18:41 ` [net_test_tools] udpflood: Add IPv6 support Martin KaFai Lau
@ 2014-12-09 18:05   ` David Miller
  0 siblings, 0 replies; 11+ messages in thread
From: David Miller @ 2014-12-09 18:05 UTC (permalink / raw)
  To: kafai; +Cc: netdev

From: Martin KaFai Lau <kafai@fb.com>
Date: Tue, 2 Dec 2014 10:41:46 -0800

> This patch:
> 1. Add IPv6 support
> 2. Print timing for every 65536 fib insert operations to observe
>    the gc effect (mostly for IPv6 fib).

Applied, thanks.

