All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next v3] packet: packet fanout rollover during socket overload
@ 2013-03-19 20:18 Willem de Bruijn
  2013-03-19 20:37 ` Eric Dumazet
  0 siblings, 1 reply; 20+ messages in thread
From: Willem de Bruijn @ 2013-03-19 20:18 UTC (permalink / raw)
  To: eric.dumazet, davem, netdev; +Cc: Willem de Bruijn

Changes:
  v3->v2: rebase (no other changes)
          passes selftest
  v2->v1: read f->num_members only once
          fix bug: test rollover mode + flag

Minimize packet drop in a fanout group. If one socket is full,
roll over packets to another from the group. Maintain flow
affinity during normal load using an rxhash fanout policy, while
dispersing unexpected traffic storms that hit a single cpu, such
as spoofed-source DoS flows. Rollover breaks affinity for flows
arriving at saturated sockets during those conditions.

The patch adds a fanout policy ROLLOVER that rotates between sockets,
filling each socket before moving to the next. It also adds a fanout
flag ROLLOVER. If passed along with any other fanout policy, the
primary policy is applied until the chosen socket is full. Then,
rollover selects another socket, to delay packet drop until the
entire system is saturated.

Probing sockets is not free. Selecting the last used socket, as
rollover does, is a greedy approach that maximizes chance of
success, at the cost of extreme load imbalance. In practice, with
sufficiently long queues to absorb bursts, sockets are drained in
parallel and load balance looks uniform in `top`.

To avoid contention, scales counters with number of sockets and
accesses them lockfree. Values are bounds checked to ensure
correctness.

Tested using an application with 9 threads pinned to CPUs, one socket
per thread and sufficient busywork per packet operation to limits each
thread to handling 32 Kpps. When sent 500 Kpps single UDP stream
packets, a FANOUT_CPU setup processes 32 Kpps in total without this
patch, 270 Kpps with the patch. Tested with read() and with a packet
ring (V1). Also, passes unit test (perhaps for selftests) at
http://kernel.googlecode.com/files/psock_fanout.c

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 include/uapi/linux/if_packet.h |   2 +
 net/packet/af_packet.c         | 109 ++++++++++++++++++++++++++++++++---------
 net/packet/internal.h          |   3 +-
 3 files changed, 90 insertions(+), 24 deletions(-)

diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h
index f9a6037..8136658 100644
--- a/include/uapi/linux/if_packet.h
+++ b/include/uapi/linux/if_packet.h
@@ -55,6 +55,8 @@ struct sockaddr_ll {
 #define PACKET_FANOUT_HASH		0
 #define PACKET_FANOUT_LB		1
 #define PACKET_FANOUT_CPU		2
+#define PACKET_FANOUT_ROLLOVER		3
+#define PACKET_FANOUT_FLAG_ROLLOVER	0x1000
 #define PACKET_FANOUT_FLAG_DEFRAG	0x8000
 
 struct tpacket_stats {
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 1d6793d..0304c6b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -181,6 +181,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 
 struct packet_sock;
 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
+static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
+		       struct packet_type *pt, struct net_device *orig_dev);
 
 static void *packet_previous_frame(struct packet_sock *po,
 		struct packet_ring_buffer *rb,
@@ -973,11 +975,11 @@ static void *packet_current_rx_frame(struct packet_sock *po,
 
 static void *prb_lookup_block(struct packet_sock *po,
 				     struct packet_ring_buffer *rb,
-				     unsigned int previous,
+				     unsigned int idx,
 				     int status)
 {
 	struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
-	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, previous);
+	struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
 
 	if (status != BLOCK_STATUS(pbd))
 		return NULL;
@@ -1041,6 +1043,29 @@ static void packet_increment_head(struct packet_ring_buffer *buff)
 	buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
 }
 
+static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+{
+	struct sock *sk = &po->sk;
+	bool has_room;
+
+	if (po->prot_hook.func != tpacket_rcv)
+		return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
+		       <= sk->sk_rcvbuf;
+
+	spin_lock(&sk->sk_receive_queue.lock);
+	if (po->tp_version == TPACKET_V3)
+		has_room = prb_lookup_block(po, &po->rx_ring,
+					po->rx_ring.prb_bdqc.kactive_blk_num,
+					TP_STATUS_KERNEL);
+	else
+		has_room = packet_lookup_frame(po, &po->rx_ring,
+					       po->rx_ring.head,
+					       TP_STATUS_KERNEL);
+	spin_unlock(&sk->sk_receive_queue.lock);
+
+	return has_room;
+}
+
 static void packet_sock_destruct(struct sock *sk)
 {
 	skb_queue_purge(&sk->sk_error_queue);
@@ -1066,16 +1091,16 @@ static int fanout_rr_next(struct packet_fanout *f, unsigned int num)
 	return x;
 }
 
-static struct sock *fanout_demux_hash(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+static unsigned int fanout_demux_hash(struct packet_fanout *f,
+				      struct sk_buff *skb,
+				      unsigned int num)
 {
-	u32 idx, hash = skb->rxhash;
-
-	idx = ((u64)hash * num) >> 32;
-
-	return f->arr[idx];
+	return (((u64)skb->rxhash) * num) >> 32;
 }
 
-static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+static unsigned int fanout_demux_lb(struct packet_fanout *f,
+				    struct sk_buff *skb,
+				    unsigned int num)
 {
 	int cur, old;
 
@@ -1083,14 +1108,40 @@ static struct sock *fanout_demux_lb(struct packet_fanout *f, struct sk_buff *skb
 	while ((old = atomic_cmpxchg(&f->rr_cur, cur,
 				     fanout_rr_next(f, num))) != cur)
 		cur = old;
-	return f->arr[cur];
+	return cur;
+}
+
+static unsigned int fanout_demux_cpu(struct packet_fanout *f,
+				     struct sk_buff *skb,
+				     unsigned int num)
+{
+	return smp_processor_id() % num;
 }
 
-static struct sock *fanout_demux_cpu(struct packet_fanout *f, struct sk_buff *skb, unsigned int num)
+static unsigned int fanout_demux_rollover(struct packet_fanout *f,
+					  struct sk_buff *skb,
+					  unsigned int idx, unsigned int skip,
+					  unsigned int num)
 {
-	unsigned int cpu = smp_processor_id();
+	unsigned int i, j;
 
-	return f->arr[cpu % num];
+	i = j = min_t(int, f->next[idx], num - 1);
+	do {
+		if (i != skip && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
+			if (i != j)
+				f->next[idx] = i;
+			return i;
+		}
+		if (++i == num)
+			i = 0;
+	} while (i != j);
+
+	return idx;
+}
+
+static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
+{
+	return f->flags & (flag >> 8);
 }
 
 static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
@@ -1099,7 +1150,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
 	struct packet_fanout *f = pt->af_packet_priv;
 	unsigned int num = f->num_members;
 	struct packet_sock *po;
-	struct sock *sk;
+	unsigned int idx;
 
 	if (!net_eq(dev_net(dev), read_pnet(&f->net)) ||
 	    !num) {
@@ -1110,23 +1161,31 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
 	switch (f->type) {
 	case PACKET_FANOUT_HASH:
 	default:
-		if (f->defrag) {
+		if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
 			skb = ip_check_defrag(skb, IP_DEFRAG_AF_PACKET);
 			if (!skb)
 				return 0;
 		}
 		skb_get_rxhash(skb);
-		sk = fanout_demux_hash(f, skb, num);
+		idx = fanout_demux_hash(f, skb, num);
 		break;
 	case PACKET_FANOUT_LB:
-		sk = fanout_demux_lb(f, skb, num);
+		idx = fanout_demux_lb(f, skb, num);
 		break;
 	case PACKET_FANOUT_CPU:
-		sk = fanout_demux_cpu(f, skb, num);
+		idx = fanout_demux_cpu(f, skb, num);
+		break;
+	case PACKET_FANOUT_ROLLOVER:
+		idx = fanout_demux_rollover(f, skb, 0, (unsigned int) -1, num);
 		break;
 	}
 
-	po = pkt_sk(sk);
+	po = pkt_sk(f->arr[idx]);
+	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER) &&
+	    unlikely(!packet_rcv_has_room(po, skb))) {
+		idx = fanout_demux_rollover(f, skb, idx, idx, num);
+		po = pkt_sk(f->arr[idx]);
+	}
 
 	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
 }
@@ -1175,10 +1234,13 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 	struct packet_sock *po = pkt_sk(sk);
 	struct packet_fanout *f, *match;
 	u8 type = type_flags & 0xff;
-	u8 defrag = (type_flags & PACKET_FANOUT_FLAG_DEFRAG) ? 1 : 0;
+	u8 flags = type_flags >> 8;
 	int err;
 
 	switch (type) {
+	case PACKET_FANOUT_ROLLOVER:
+		if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
+			return -EINVAL;
 	case PACKET_FANOUT_HASH:
 	case PACKET_FANOUT_LB:
 	case PACKET_FANOUT_CPU:
@@ -1203,7 +1265,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 		}
 	}
 	err = -EINVAL;
-	if (match && match->defrag != defrag)
+	if (match && match->flags != flags)
 		goto out;
 	if (!match) {
 		err = -ENOMEM;
@@ -1213,7 +1275,7 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags)
 		write_pnet(&match->net, sock_net(sk));
 		match->id = id;
 		match->type = type;
-		match->defrag = defrag;
+		match->flags = flags;
 		atomic_set(&match->rr_cur, 0);
 		INIT_LIST_HEAD(&match->list);
 		spin_lock_init(&match->lock);
@@ -3240,7 +3302,8 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 	case PACKET_FANOUT:
 		val = (po->fanout ?
 		       ((u32)po->fanout->id |
-			((u32)po->fanout->type << 16)) :
+			((u32)po->fanout->type << 16) |
+			((u32)po->fanout->flags << 24)) :
 		       0);
 		break;
 	case PACKET_TX_HAS_OFF:
diff --git a/net/packet/internal.h b/net/packet/internal.h
index e84cab8..e891f02 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -77,10 +77,11 @@ struct packet_fanout {
 	unsigned int		num_members;
 	u16			id;
 	u8			type;
-	u8			defrag;
+	u8			flags;
 	atomic_t		rr_cur;
 	struct list_head	list;
 	struct sock		*arr[PACKET_FANOUT_MAX];
+	int			next[PACKET_FANOUT_MAX];
 	spinlock_t		lock;
 	atomic_t		sk_ref;
 	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
-- 
1.8.1.3

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next v3] packet: packet fanout rollover during socket overload
  2013-03-19 20:18 [PATCH net-next v3] packet: packet fanout rollover during socket overload Willem de Bruijn
@ 2013-03-19 20:37 ` Eric Dumazet
  2013-03-19 21:16   ` David Miller
  0 siblings, 1 reply; 20+ messages in thread
From: Eric Dumazet @ 2013-03-19 20:37 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: davem, netdev

On Tue, 2013-03-19 at 16:18 -0400, Willem de Bruijn wrote:
> Changes:
>   v3->v2: rebase (no other changes)
>           passes selftest
>   v2->v1: read f->num_members only once
>           fix bug: test rollover mode + flag
> 
> Minimize packet drop in a fanout group. If one socket is full,
> roll over packets to another from the group. Maintain flow
> affinity during normal load using an rxhash fanout policy, while
> dispersing unexpected traffic storms that hit a single cpu, such
> as spoofed-source DoS flows. Rollover breaks affinity for flows
> arriving at saturated sockets during those conditions.
> 
> The patch adds a fanout policy ROLLOVER that rotates between sockets,
> filling each socket before moving to the next. It also adds a fanout
> flag ROLLOVER. If passed along with any other fanout policy, the
> primary policy is applied until the chosen socket is full. Then,
> rollover selects another socket, to delay packet drop until the
> entire system is saturated.
> 
> Probing sockets is not free. Selecting the last used socket, as
> rollover does, is a greedy approach that maximizes chance of
> success, at the cost of extreme load imbalance. In practice, with
> sufficiently long queues to absorb bursts, sockets are drained in
> parallel and load balance looks uniform in `top`.
> 
> To avoid contention, scales counters with number of sockets and
> accesses them lockfree. Values are bounds checked to ensure
> correctness.
> 
> Tested using an application with 9 threads pinned to CPUs, one socket
> per thread and sufficient busywork per packet operation to limits each
> thread to handling 32 Kpps. When sent 500 Kpps single UDP stream
> packets, a FANOUT_CPU setup processes 32 Kpps in total without this
> patch, 270 Kpps with the patch. Tested with read() and with a packet
> ring (V1). Also, passes unit test (perhaps for selftests) at
> http://kernel.googlecode.com/files/psock_fanout.c
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>
> ---
>  include/uapi/linux/if_packet.h |   2 +
>  net/packet/af_packet.c         | 109 ++++++++++++++++++++++++++++++++---------
>  net/packet/internal.h          |   3 +-
>  3 files changed, 90 insertions(+), 24 deletions(-)

Reviewed-by: Eric Dumazet <edumazet@google.com>

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next v3] packet: packet fanout rollover during socket overload
  2013-03-19 20:37 ` Eric Dumazet
@ 2013-03-19 21:16   ` David Miller
  2013-03-19 21:34     ` Willem de Bruijn
  0 siblings, 1 reply; 20+ messages in thread
From: David Miller @ 2013-03-19 21:16 UTC (permalink / raw)
  To: eric.dumazet; +Cc: willemb, netdev

From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Tue, 19 Mar 2013 13:37:17 -0700

> On Tue, 2013-03-19 at 16:18 -0400, Willem de Bruijn wrote:
>> Changes:
>>   v3->v2: rebase (no other changes)
>>           passes selftest
>>   v2->v1: read f->num_members only once
>>           fix bug: test rollover mode + flag
 ...
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
 ...
> Reviewed-by: Eric Dumazet <edumazet@google.com>

Applied, and I added the selftest too, but it fails for me on my
128-cpu sparc64 machine:

make[1]: Entering directory `/home/davem/src/GIT/net-next/tools/testing/selftests/net-afpacket'
--------------------
running psock_fanout test
--------------------
test: control single socket
test: control multiple sockets
test: datapath 0x0
info: count=0,0, expect=0,0
info: count=0,20, expect=15,5
ERROR: incorrect queue lengths
[FAIL]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next v3] packet: packet fanout rollover during socket overload
  2013-03-19 21:16   ` David Miller
@ 2013-03-19 21:34     ` Willem de Bruijn
  2013-03-20  6:37       ` Willem de Bruijn
  0 siblings, 1 reply; 20+ messages in thread
From: Willem de Bruijn @ 2013-03-19 21:34 UTC (permalink / raw)
  To: David Miller; +Cc: Eric Dumazet, netdev

On Tue, Mar 19, 2013 at 5:16 PM, David Miller <davem@davemloft.net> wrote:
> From: Eric Dumazet <eric.dumazet@gmail.com>
> Date: Tue, 19 Mar 2013 13:37:17 -0700
>
>> On Tue, 2013-03-19 at 16:18 -0400, Willem de Bruijn wrote:
>>> Changes:
>>>   v3->v2: rebase (no other changes)
>>>           passes selftest
>>>   v2->v1: read f->num_members only once
>>>           fix bug: test rollover mode + flag
>  ...
>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>  ...
>> Reviewed-by: Eric Dumazet <edumazet@google.com>
>
> Applied, and I added the selftest too

Thanks!

>, but it fails for me on my
> 128-cpu sparc64 machine:
>
> make[1]: Entering directory `/home/davem/src/GIT/net-next/tools/testing/selftests/net-afpacket'
> --------------------
> running psock_fanout test
> --------------------
> test: control single socket
> test: control multiple sockets
> test: datapath 0x0
> info: count=0,0, expect=0,0
> info: count=0,20, expect=15,5
> ERROR: incorrect queue lengths

I'm looking into it. The test needs the packet sockets to be able to
hold an exact number of test packets and then overflow. Queue
length configured in bytes (SO_RCVBUF) was arrived at by
experimentation. My best hunch so far is that this differs between
platforms/nic, if skb->truesize does. If queue length is the problem,
I'll switch to a packet ring. Booting another machine right now.

> [FAIL]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next v3] packet: packet fanout rollover during socket overload
  2013-03-19 21:34     ` Willem de Bruijn
@ 2013-03-20  6:37       ` Willem de Bruijn
  2013-03-20  6:42         ` [PATCH net-next] net: fix psock_fanout selftest hash collision Willem de Bruijn
  0 siblings, 1 reply; 20+ messages in thread
From: Willem de Bruijn @ 2013-03-20  6:37 UTC (permalink / raw)
  To: David Miller; +Cc: Eric Dumazet, netdev

On Tue, Mar 19, 2013 at 5:34 PM, Willem de Bruijn <willemb@google.com> wrote:
> On Tue, Mar 19, 2013 at 5:16 PM, David Miller <davem@davemloft.net> wrote:
>> From: Eric Dumazet <eric.dumazet@gmail.com>
>> Date: Tue, 19 Mar 2013 13:37:17 -0700
>>
>>> On Tue, 2013-03-19 at 16:18 -0400, Willem de Bruijn wrote:
>>>> Changes:
>>>>   v3->v2: rebase (no other changes)
>>>>           passes selftest
>>>>   v2->v1: read f->num_members only once
>>>>           fix bug: test rollover mode + flag
>>  ...
>>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>  ...
>>> Reviewed-by: Eric Dumazet <edumazet@google.com>
>>
>> Applied, and I added the selftest too
>
> Thanks!
>
>>, but it fails for me on my
>> 128-cpu sparc64 machine:
>>
>> make[1]: Entering directory `/home/davem/src/GIT/net-next/tools/testing/selftests/net-afpacket'
>> --------------------
>> running psock_fanout test
>> --------------------
>> test: control single socket
>> test: control multiple sockets
>> test: datapath 0x0
>> info: count=0,0, expect=0,0
>> info: count=0,20, expect=15,5
>> ERROR: incorrect queue lengths
>
> I'm looking into it. The test needs the packet sockets to be able to
> hold an exact number of test packets and then overflow. Queue
> length configured in bytes (SO_RCVBUF) was arrived at by
> experimentation. My best hunch so far is that this differs between
> platforms/nic, if skb->truesize does. If queue length is the problem,
> I'll switch to a packet ring. Booting another machine right now.

I was able to reproduce this. The issue is not queue length, but an
rxhash collision. The test either passes or fails consistently between
reboots. Rebooting to get a different hashrnd seed may change the test
outcome.

Before finding this, I had already updated the patch to use a packet
ring (still better than the estimated queue length) and added
testcases for the LB and CPU modes. I may need to clean it up, but I
will send the current patch to see if it resolves the bug.


>> [FAIL]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-20  6:37       ` Willem de Bruijn
@ 2013-03-20  6:42         ` Willem de Bruijn
  2013-03-20 16:33           ` David Miller
  0 siblings, 1 reply; 20+ messages in thread
From: Willem de Bruijn @ 2013-03-20  6:42 UTC (permalink / raw)
  To: davem, netdev; +Cc: Willem de Bruijn

Fix flaky results with PACKET_FANOUT_HASH depending on whether the
two flows hash into the same packet socket or not.

Also adds tests for PACKET_FANOUT_LB and PACKET_FANOUT_CPU and
replaces the counting method with a packet ring.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 .../testing/selftests/net-afpacket/psock_fanout.c  | 160 +++++++++++++++------
 1 file changed, 120 insertions(+), 40 deletions(-)

diff --git a/tools/testing/selftests/net-afpacket/psock_fanout.c b/tools/testing/selftests/net-afpacket/psock_fanout.c
index af9b019..f765f09 100644
--- a/tools/testing/selftests/net-afpacket/psock_fanout.c
+++ b/tools/testing/selftests/net-afpacket/psock_fanout.c
@@ -16,11 +16,11 @@
  *   The test currently runs for
  *   - PACKET_FANOUT_HASH
  *   - PACKET_FANOUT_HASH with PACKET_FANOUT_FLAG_ROLLOVER
+ *   - PACKET_FANOUT_LB
+ *   - PACKET_FANOUT_CPU
  *   - PACKET_FANOUT_ROLLOVER
  *
  * Todo:
- * - datapath: PACKET_FANOUT_LB
- * - datapath: PACKET_FANOUT_CPU
  * - functionality: PACKET_FANOUT_FLAG_DEFRAG
  *
  * License (GPLv2):
@@ -39,18 +39,23 @@
  * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
  */
 
+#define _GNU_SOURCE		/* for sched_setaffinity */
+
 #include <arpa/inet.h>
 #include <errno.h>
+#include <fcntl.h>
 #include <linux/filter.h>
 #include <linux/if_packet.h>
 #include <net/ethernet.h>
 #include <netinet/ip.h>
 #include <netinet/udp.h>
-#include <fcntl.h>
+#include <poll.h>
+#include <sched.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <sys/mman.h>
 #include <sys/socket.h>
 #include <sys/stat.h>
 #include <sys/types.h>
@@ -58,6 +63,8 @@
 
 #define DATA_LEN			100
 #define DATA_CHAR			'a'
+#define RING_NUM_FRAMES			20
+#define PORT_BASE			8000
 
 static void pair_udp_open(int fds[], uint16_t port)
 {
@@ -162,37 +169,55 @@ static int sock_fanout_open(uint16_t typeflags, int num_packets)
 		return -1;
 	}
 
-	val = sizeof(struct iphdr) + sizeof(struct udphdr) + DATA_LEN;
-	val *= num_packets;
-	/* hack: apparently, the above calculation is too small (TODO: fix) */
-	val *= 3;
-	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val))) {
-		perror("setsockopt SO_RCVBUF");
-		exit(1);
-	}
-
 	sock_fanout_setfilter(fd);
 	return fd;
 }
 
-static void sock_fanout_read(int fds[], const int expect[])
+static char *sock_fanout_open_ring(int fd)
 {
-	struct tpacket_stats stats;
-	socklen_t ssize;
-	int ret[2];
+	struct tpacket_req req = {
+		.tp_block_size = getpagesize(),
+		.tp_frame_size = getpagesize(),
+		.tp_block_nr   = RING_NUM_FRAMES,
+		.tp_frame_nr   = RING_NUM_FRAMES,
+	};
+	char *ring;
 
-	ssize = sizeof(stats);
-	if (getsockopt(fds[0], SOL_PACKET, PACKET_STATISTICS, &stats, &ssize)) {
-		perror("getsockopt statistics 0");
+	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req,
+		       sizeof(req))) {
+		perror("packetsock ring setsockopt");
 		exit(1);
 	}
-	ret[0] = stats.tp_packets - stats.tp_drops;
-	ssize = sizeof(stats);
-	if (getsockopt(fds[1], SOL_PACKET, PACKET_STATISTICS, &stats, &ssize)) {
-		perror("getsockopt statistics 1");
+
+	ring = mmap(0, req.tp_block_size * req.tp_block_nr,
+		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if (!ring) {
+		fprintf(stderr, "packetsock ring mmap\n");
 		exit(1);
 	}
-	ret[1] = stats.tp_packets - stats.tp_drops;
+
+	return ring;
+}
+
+static int sock_fanout_read_ring(int fd, void *ring)
+{
+	struct tpacket_hdr *header = ring;
+	int count = 0;
+
+	while (header->tp_status & TP_STATUS_USER && count < RING_NUM_FRAMES) {
+		count++;
+		header = ring + (count * getpagesize());
+	}
+
+	return count;
+}
+
+static int sock_fanout_read(int fds[], char *rings[], const int expect[])
+{
+	int ret[2];
+
+	ret[0] = sock_fanout_read_ring(fds[0], rings[0]);
+	ret[1] = sock_fanout_read_ring(fds[1], rings[1]);
 
 	fprintf(stderr, "info: count=%d,%d, expect=%d,%d\n",
 			ret[0], ret[1], expect[0], expect[1]);
@@ -200,8 +225,10 @@ static void sock_fanout_read(int fds[], const int expect[])
 	if ((!(ret[0] == expect[0] && ret[1] == expect[1])) &&
 	    (!(ret[0] == expect[1] && ret[1] == expect[0]))) {
 		fprintf(stderr, "ERROR: incorrect queue lengths\n");
-		exit(1);
+		return 1;
 	}
+
+	return 0;
 }
 
 /* Test illegal mode + flag combination */
@@ -253,11 +280,12 @@ static void test_control_group(void)
 	}
 }
 
-static void test_datapath(uint16_t typeflags,
-			  const int expect1[], const int expect2[])
+static int test_datapath(uint16_t typeflags, int port_off,
+			 const int expect1[], const int expect2[])
 {
 	const int expect0[] = { 0, 0 };
-	int fds[2], fds_udp[2][2];
+	char *rings[2];
+	int fds[2], fds_udp[2][2], ret;
 
 	fprintf(stderr, "test: datapath 0x%hx\n", typeflags);
 
@@ -267,41 +295,93 @@ static void test_datapath(uint16_t typeflags,
 		fprintf(stderr, "ERROR: failed open\n");
 		exit(1);
 	}
-	pair_udp_open(fds_udp[0], 8000);
-	pair_udp_open(fds_udp[1], 8002);
-	sock_fanout_read(fds, expect0);
+	rings[0] = sock_fanout_open_ring(fds[0]);
+	rings[1] = sock_fanout_open_ring(fds[1]);
+	pair_udp_open(fds_udp[0], PORT_BASE);
+	pair_udp_open(fds_udp[1], PORT_BASE + port_off);
+	sock_fanout_read(fds, rings, expect0);
 
 	/* Send data, but not enough to overflow a queue */
 	pair_udp_send(fds_udp[0], 15);
 	pair_udp_send(fds_udp[1], 5);
-	sock_fanout_read(fds, expect1);
+	ret = sock_fanout_read(fds, rings, expect1);
 
 	/* Send more data, overflow the queue */
 	pair_udp_send(fds_udp[0], 15);
 	/* TODO: ensure consistent order between expect1 and expect2 */
-	sock_fanout_read(fds, expect2);
+	ret |= sock_fanout_read(fds, rings, expect2);
 
+	if (munmap(rings[1], RING_NUM_FRAMES * getpagesize()) ||
+	    munmap(rings[0], RING_NUM_FRAMES * getpagesize())) {
+		fprintf(stderr, "close rings\n");
+		exit(1);
+	}
 	if (close(fds_udp[1][1]) || close(fds_udp[1][0]) ||
 	    close(fds_udp[0][1]) || close(fds_udp[0][0]) ||
 	    close(fds[1]) || close(fds[0])) {
 		fprintf(stderr, "close datapath\n");
 		exit(1);
 	}
+
+	return ret;
+}
+
+static int set_cpuaffinity(int cpuid)
+{
+	cpu_set_t mask;
+
+	CPU_ZERO(&mask);
+	CPU_SET(cpuid, &mask);
+	if (sched_setaffinity(0, sizeof(mask), &mask)) {
+		if (errno != EINVAL) {
+			fprintf(stderr, "setaffinity %d\n", cpuid);
+			exit(1);
+		}
+		return 1;
+	}
+
+	return 0;
 }
 
 int main(int argc, char **argv)
 {
-	const int expect_hash[2][2]	= { { 15, 5 }, { 5, 0 } };
-	const int expect_hash_rb[2][2]	= { { 15, 5 }, { 5, 10 } };
-	const int expect_rb[2][2]	= { { 20, 0 }, { 0, 15 } };
+	const int expect_hash[2][2]	= { { 15, 5 },  { 20, 5 } };
+	const int expect_hash_rb[2][2]	= { { 15, 5 },  { 20, 15 } };
+	const int expect_lb[2][2]	= { { 10, 10 }, { 18, 17 } };
+	const int expect_rb[2][2]	= { { 20, 0 },  { 20, 15 } };
+	const int expect_cpu0[2][2]	= { { 20, 0 },  { 20, 0 } };
+	const int expect_cpu1[2][2]	= { { 0, 20 },  { 0, 20 } };
+	int port_off = 2, tries = 5, ret;
 
 	test_control_single();
 	test_control_group();
 
-	test_datapath(PACKET_FANOUT_HASH, expect_hash[0], expect_hash[1]);
-	test_datapath(PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_ROLLOVER,
-		      expect_hash_rb[0], expect_hash_rb[1]);
-	test_datapath(PACKET_FANOUT_ROLLOVER, expect_rb[0], expect_rb[1]);
+	/* find a set of ports that do not collide onto the same socket */
+	ret = test_datapath(PACKET_FANOUT_HASH, port_off,
+			    expect_hash[0], expect_hash[1]);
+	while (ret && tries--) {
+		fprintf(stderr, "info: trying alternate ports (%d)\n", tries);
+		ret = test_datapath(PACKET_FANOUT_HASH, ++port_off,
+				    expect_hash[0], expect_hash[1]);
+	}
+
+	ret |= test_datapath(PACKET_FANOUT_HASH | PACKET_FANOUT_FLAG_ROLLOVER,
+			     port_off, expect_hash_rb[0], expect_hash_rb[1]);
+	ret |= test_datapath(PACKET_FANOUT_LB,
+			     port_off, expect_lb[0], expect_lb[1]);
+	ret |= test_datapath(PACKET_FANOUT_ROLLOVER,
+			     port_off, expect_rb[0], expect_rb[1]);
+
+	set_cpuaffinity(0);
+	ret |= test_datapath(PACKET_FANOUT_CPU, port_off,
+			     expect_cpu0[0], expect_cpu0[1]);
+	if (!set_cpuaffinity(1))
+		/* TODO: test that choice alternates with previous */
+		ret |= test_datapath(PACKET_FANOUT_CPU, port_off,
+				     expect_cpu1[0], expect_cpu1[1]);
+
+	if (ret)
+		return 1;
 
 	printf("OK. All tests passed\n");
 	return 0;
-- 
1.8.1.3

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-20  6:42         ` [PATCH net-next] net: fix psock_fanout selftest hash collision Willem de Bruijn
@ 2013-03-20 16:33           ` David Miller
  2013-03-20 17:59             ` David Miller
  0 siblings, 1 reply; 20+ messages in thread
From: David Miller @ 2013-03-20 16:33 UTC (permalink / raw)
  To: willemb; +Cc: netdev

From: Willem de Bruijn <willemb@google.com>
Date: Wed, 20 Mar 2013 02:42:44 -0400

> Fix flaky results with PACKET_FANOUT_HASH depending on whether the
> two flows hash into the same packet socket or not.
> 
> Also adds tests for PACKET_FANOUT_LB and PACKET_FANOUT_CPU and
> replaces the counting method with a packet ring.
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>

Applied, thanks.  I'll retest on my sparc64 box later today.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-20 16:33           ` David Miller
@ 2013-03-20 17:59             ` David Miller
  2013-03-21  0:07               ` Willem de Bruijn
  0 siblings, 1 reply; 20+ messages in thread
From: David Miller @ 2013-03-20 17:59 UTC (permalink / raw)
  To: willemb; +Cc: netdev

From: David Miller <davem@davemloft.net>
Date: Wed, 20 Mar 2013 12:33:44 -0400 (EDT)

> From: Willem de Bruijn <willemb@google.com>
> Date: Wed, 20 Mar 2013 02:42:44 -0400
> 
>> Fix flaky results with PACKET_FANOUT_HASH depending on whether the
>> two flows hash into the same packet socket or not.
>> 
>> Also adds tests for PACKET_FANOUT_LB and PACKET_FANOUT_CPU and
>> replaces the counting method with a packet ring.
>> 
>> Signed-off-by: Willem de Bruijn <willemb@google.com>
> 
> Applied, thanks.  I'll retest on my sparc64 box later today.

Unfortunately, it's still broken there:

--------------------
running psock_fanout test
--------------------
test: control single socket
test: control multiple sockets
test: datapath 0x0
info: count=0,0, expect=0,0
info: count=0,0, expect=15,5
ERROR: incorrect queue lengths
info: count=0,0, expect=20,5
ERROR: incorrect queue lengths
info: trying alternate ports (4)
test: datapath 0x0
info: count=0,0, expect=0,0
info: count=0,0, expect=15,5
ERROR: incorrect queue lengths
info: count=0,0, expect=20,5
ERROR: incorrect queue lengths
info: trying alternate ports (3)
test: datapath 0x0
info: count=0,0, expect=0,0
info: count=0,0, expect=15,5
ERROR: incorrect queue lengths
info: count=0,0, expect=20,5
ERROR: incorrect queue lengths
info: trying alternate ports (2)
test: datapath 0x0
info: count=0,0, expect=0,0
info: count=0,0, expect=15,5
ERROR: incorrect queue lengths
info: count=0,0, expect=20,5
ERROR: incorrect queue lengths
info: trying alternate ports (1)
test: datapath 0x0
info: count=0,0, expect=0,0
info: count=0,0, expect=15,5
ERROR: incorrect queue lengths
info: count=0,0, expect=20,5
ERROR: incorrect queue lengths
info: trying alternate ports (0)
test: datapath 0x0
info: count=0,0, expect=0,0
info: count=0,0, expect=15,5
ERROR: incorrect queue lengths
info: count=0,0, expect=20,5
ERROR: incorrect queue lengths
test: datapath 0x1000
info: count=0,0, expect=0,0
info: count=0,0, expect=15,5
ERROR: incorrect queue lengths
info: count=0,0, expect=20,15
ERROR: incorrect queue lengths
test: datapath 0x1
info: count=0,0, expect=0,0
info: count=0,0, expect=10,10
ERROR: incorrect queue lengths
info: count=0,0, expect=18,17
ERROR: incorrect queue lengths
test: datapath 0x3
info: count=0,0, expect=0,0
info: count=0,0, expect=20,0
ERROR: incorrect queue lengths
info: count=0,0, expect=20,15
ERROR: incorrect queue lengths
test: datapath 0x2
info: count=0,0, expect=0,0
info: count=0,0, expect=20,0
ERROR: incorrect queue lengths
info: count=0,0, expect=20,0
ERROR: incorrect queue lengths
test: datapath 0x2
info: count=0,0, expect=0,0
info: count=0,0, expect=0,20
ERROR: incorrect queue lengths
info: count=0,0, expect=0,20
ERROR: incorrect queue lengths
[FAIL]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-20 17:59             ` David Miller
@ 2013-03-21  0:07               ` Willem de Bruijn
  2013-03-21  1:16                 ` David Miller
  2013-03-21  6:31                 ` Daniel Borkmann
  0 siblings, 2 replies; 20+ messages in thread
From: Willem de Bruijn @ 2013-03-21  0:07 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

On Wed, Mar 20, 2013 at 1:59 PM, David Miller <davem@davemloft.net> wrote:
> From: David Miller <davem@davemloft.net>
> Date: Wed, 20 Mar 2013 12:33:44 -0400 (EDT)
>
>> From: Willem de Bruijn <willemb@google.com>
>> Date: Wed, 20 Mar 2013 02:42:44 -0400
>>
>>> Fix flaky results with PACKET_FANOUT_HASH depending on whether the
>>> two flows hash into the same packet socket or not.
>>>
>>> Also adds tests for PACKET_FANOUT_LB and PACKET_FANOUT_CPU and
>>> replaces the counting method with a packet ring.
>>>
>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>
>> Applied, thanks.  I'll retest on my sparc64 box later today.
>
> Unfortunately, it's still broken there:

This looks like a new problem. Now the counters all stay zero.

I am looking into it. I have not been able to reproduce this on my
x86_64 so far, so just brought a sparc32 up in qemu. Had less luck
with sparc64, but impressive that it works at all. Come to think of
it, is this a 64-bit kernel with 32-bit userland? Perhaps that
affects packet ring memory layout.


> --------------------
> running psock_fanout test
> --------------------
> test: control single socket
> test: control multiple sockets
> test: datapath 0x0
> info: count=0,0, expect=0,0
> info: count=0,0, expect=15,5
> ERROR: incorrect queue lengths
> info: count=0,0, expect=20,5
> ERROR: incorrect queue lengths
> info: trying alternate ports (4)
> test: datapath 0x0
> info: count=0,0, expect=0,0
> info: count=0,0, expect=15,5
> ERROR: incorrect queue lengths
> info: count=0,0, expect=20,5
> ERROR: incorrect queue lengths
> info: trying alternate ports (3)
> test: datapath 0x0
> info: count=0,0, expect=0,0
> info: count=0,0, expect=15,5
> ERROR: incorrect queue lengths
> info: count=0,0, expect=20,5
> ERROR: incorrect queue lengths
> info: trying alternate ports (2)
> test: datapath 0x0
> info: count=0,0, expect=0,0
> info: count=0,0, expect=15,5
> ERROR: incorrect queue lengths
> info: count=0,0, expect=20,5
> ERROR: incorrect queue lengths
> info: trying alternate ports (1)
> test: datapath 0x0
> info: count=0,0, expect=0,0
> info: count=0,0, expect=15,5
> ERROR: incorrect queue lengths
> info: count=0,0, expect=20,5
> ERROR: incorrect queue lengths
> info: trying alternate ports (0)
> test: datapath 0x0
> info: count=0,0, expect=0,0
> info: count=0,0, expect=15,5
> ERROR: incorrect queue lengths
> info: count=0,0, expect=20,5
> ERROR: incorrect queue lengths
> test: datapath 0x1000
> info: count=0,0, expect=0,0
> info: count=0,0, expect=15,5
> ERROR: incorrect queue lengths
> info: count=0,0, expect=20,15
> ERROR: incorrect queue lengths
> test: datapath 0x1
> info: count=0,0, expect=0,0
> info: count=0,0, expect=10,10
> ERROR: incorrect queue lengths
> info: count=0,0, expect=18,17
> ERROR: incorrect queue lengths
> test: datapath 0x3
> info: count=0,0, expect=0,0
> info: count=0,0, expect=20,0
> ERROR: incorrect queue lengths
> info: count=0,0, expect=20,15
> ERROR: incorrect queue lengths
> test: datapath 0x2
> info: count=0,0, expect=0,0
> info: count=0,0, expect=20,0
> ERROR: incorrect queue lengths
> info: count=0,0, expect=20,0
> ERROR: incorrect queue lengths
> test: datapath 0x2
> info: count=0,0, expect=0,0
> info: count=0,0, expect=0,20
> ERROR: incorrect queue lengths
> info: count=0,0, expect=0,20
> ERROR: incorrect queue lengths
> [FAIL]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-21  0:07               ` Willem de Bruijn
@ 2013-03-21  1:16                 ` David Miller
  2013-03-21  6:31                 ` Daniel Borkmann
  1 sibling, 0 replies; 20+ messages in thread
From: David Miller @ 2013-03-21  1:16 UTC (permalink / raw)
  To: willemb; +Cc: netdev

From: Willem de Bruijn <willemb@google.com>
Date: Wed, 20 Mar 2013 20:07:21 -0400

> Come to think of it, is this a 64-bit kernel with 32-bit userland?
> Perhaps that affects packet ring memory layout.

Yes, it is.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-21  0:07               ` Willem de Bruijn
  2013-03-21  1:16                 ` David Miller
@ 2013-03-21  6:31                 ` Daniel Borkmann
  2013-03-21 17:27                   ` Willem de Bruijn
  1 sibling, 1 reply; 20+ messages in thread
From: Daniel Borkmann @ 2013-03-21  6:31 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: David Miller, netdev

On 03/21/2013 01:07 AM, Willem de Bruijn wrote:
> On Wed, Mar 20, 2013 at 1:59 PM, David Miller <davem@davemloft.net> wrote:
>> From: David Miller <davem@davemloft.net>
>> Date: Wed, 20 Mar 2013 12:33:44 -0400 (EDT)
>>
>>> From: Willem de Bruijn <willemb@google.com>
>>> Date: Wed, 20 Mar 2013 02:42:44 -0400
>>>
>>>> Fix flaky results with PACKET_FANOUT_HASH depending on whether the
>>>> two flows hash into the same packet socket or not.
>>>>
>>>> Also adds tests for PACKET_FANOUT_LB and PACKET_FANOUT_CPU and
>>>> replaces the counting method with a packet ring.
>>>>
>>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>>
>>> Applied, thanks.  I'll retest on my sparc64 box later today.
>>
>> Unfortunately, it's still broken there:
>
> This looks like a new problem. Now the counters all stay zero.
>
> I am looking into it. I have not been able to reproduce this on my
> x86_64 so far, so just brought a sparc32 up in qemu. Had less luck
> with sparc64, but impressive that it works at all. Come to think of
> it, is this a 64-bit kernel with 32-bit userland? Perhaps that
> affects packet ring memory layout.

That can affect the ring buffer in case of TPACKET_V1, which is default
if not specified otherwise. See Documentation/networking/packet_mmap.txt +514

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-21  6:31                 ` Daniel Borkmann
@ 2013-03-21 17:27                   ` Willem de Bruijn
  2013-03-21 17:46                     ` David Miller
                                       ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Willem de Bruijn @ 2013-03-21 17:27 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: David Miller, netdev

On Thu, Mar 21, 2013 at 2:31 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 03/21/2013 01:07 AM, Willem de Bruijn wrote:
>>
>> On Wed, Mar 20, 2013 at 1:59 PM, David Miller <davem@davemloft.net> wrote:
>>>
>>> From: David Miller <davem@davemloft.net>
>>> Date: Wed, 20 Mar 2013 12:33:44 -0400 (EDT)
>>>
>>>> From: Willem de Bruijn <willemb@google.com>
>>>> Date: Wed, 20 Mar 2013 02:42:44 -0400
>>>>
>>>>> Fix flaky results with PACKET_FANOUT_HASH depending on whether the
>>>>> two flows hash into the same packet socket or not.
>>>>>
>>>>> Also adds tests for PACKET_FANOUT_LB and PACKET_FANOUT_CPU and
>>>>> replaces the counting method with a packet ring.
>>>>>
>>>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>>>
>>>>
>>>> Applied, thanks.  I'll retest on my sparc64 box later today.
>>>
>>>
>>> Unfortunately, it's still broken there:
>>
>>
>> This looks like a new problem. Now the counters all stay zero.
>>
>> I am looking into it. I have not been able to reproduce this on my
>> x86_64 so far, so just brought a sparc32 up in qemu. Had less luck
>> with sparc64, but impressive that it works at all. Come to think of
>> it, is this a 64-bit kernel with 32-bit userland? Perhaps that
>> affects packet ring memory layout.
>
>
> That can affect the ring buffer in case of TPACKET_V1, which is default
> if not specified otherwise. See Documentation/networking/packet_mmap.txt
> +514

Thanks, Daniel. In that case, the following should fix it.
Unfortunately, I don't have the hardware to verify, but it still
passes on my platforms. Let me know if you prefer it as a regular
patch instead of inline.

diff --git a/tools/testing/selftests/net/psock_fanout.c
b/tools/testing/selftests/net/psock_fanout.c
index 226e5e3..59bd636 100644
--- a/tools/testing/selftests/net/psock_fanout.c
+++ b/tools/testing/selftests/net/psock_fanout.c
@@ -182,7 +182,13 @@ static char *sock_fanout_open_ring(int fd)
                .tp_frame_nr   = RING_NUM_FRAMES,
        };
        char *ring;
+       int val = TPACKET_V2;

+       if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, (void *) &val,
+                      sizeof(val))) {
+               perror("packetsock ring setsockopt version");
+               exit(1);
+       }
        if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req,
                       sizeof(req))) {
                perror("packetsock ring setsockopt");
@@ -201,7 +207,7 @@ static char *sock_fanout_open_ring(int fd)

 static int sock_fanout_read_ring(int fd, void *ring)
 {
-       struct tpacket_hdr *header = ring;
+       struct tpacket2_hdr *header = ring;
        int count = 0;

        while (header->tp_status & TP_STATUS_USER && count < RING_NUM_FRAMES) {

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-21 17:27                   ` Willem de Bruijn
@ 2013-03-21 17:46                     ` David Miller
  2013-03-21 17:47                     ` Daniel Borkmann
  2013-03-21 17:49                     ` David Miller
  2 siblings, 0 replies; 20+ messages in thread
From: David Miller @ 2013-03-21 17:46 UTC (permalink / raw)
  To: willemb; +Cc: dborkman, netdev

From: Willem de Bruijn <willemb@google.com>
Date: Thu, 21 Mar 2013 13:27:51 -0400

> Unfortunately, I don't have the hardware to verify, but it still
> passes on my platforms.

I think you do, simply build your test program with "gcc -m32"
and run it on your x86_64 machine.

But in any event I'll test your change too.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-21 17:27                   ` Willem de Bruijn
  2013-03-21 17:46                     ` David Miller
@ 2013-03-21 17:47                     ` Daniel Borkmann
  2013-03-21 18:00                       ` Willem de Bruijn
  2013-03-21 17:49                     ` David Miller
  2 siblings, 1 reply; 20+ messages in thread
From: Daniel Borkmann @ 2013-03-21 17:47 UTC (permalink / raw)
  To: Willem de Bruijn; +Cc: David Miller, netdev

On 03/21/2013 06:27 PM, Willem de Bruijn wrote:
> On Thu, Mar 21, 2013 at 2:31 AM, Daniel Borkmann <dborkman@redhat.com> wrote:
>> On 03/21/2013 01:07 AM, Willem de Bruijn wrote:
>>> On Wed, Mar 20, 2013 at 1:59 PM, David Miller <davem@davemloft.net> wrote:
>>>>
>>>> From: David Miller <davem@davemloft.net>
>>>> Date: Wed, 20 Mar 2013 12:33:44 -0400 (EDT)
>>>>
>>>>> From: Willem de Bruijn <willemb@google.com>
>>>>> Date: Wed, 20 Mar 2013 02:42:44 -0400
>>>>>
>>>>>> Fix flaky results with PACKET_FANOUT_HASH depending on whether the
>>>>>> two flows hash into the same packet socket or not.
>>>>>>
>>>>>> Also adds tests for PACKET_FANOUT_LB and PACKET_FANOUT_CPU and
>>>>>> replaces the counting method with a packet ring.
>>>>>>
>>>>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>>>>
>>>>> Applied, thanks.  I'll retest on my sparc64 box later today.
>>>>
>>>> Unfortunately, it's still broken there:
>>>
>>> This looks like a new problem. Now the counters all stay zero.
>>>
>>> I am looking into it. I have not been able to reproduce this on my
>>> x86_64 so far, so just brought a sparc32 up in qemu. Had less luck
>>> with sparc64, but impressive that it works at all. Come to think of
>>> it, is this a 64-bit kernel with 32-bit userland? Perhaps that
>>> affects packet ring memory layout.
>>
>>
>> That can affect the ring buffer in case of TPACKET_V1, which is default
>> if not specified otherwise. See Documentation/networking/packet_mmap.txt
>> +514
>
> Thanks, Daniel. In that case, the following should fix it.
> Unfortunately, I don't have the hardware to verify, but it still
> passes on my platforms. Let me know if you prefer it as a regular
> patch instead of inline.

I can only tell you about x86_64: [PASS], although two ERRORs:

running psock_fanout test
--------------------
test: control single socket
test: control multiple sockets
test: datapath 0x0
info: count=0,0, expect=0,0
info: count=20,0, expect=15,5
ERROR: incorrect queue lengths
info: count=20,0, expect=20,5
ERROR: incorrect queue lengths
info: trying alternate ports (4)
test: datapath 0x0
[...]
OK. All tests passed
[PASS]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-21 17:27                   ` Willem de Bruijn
  2013-03-21 17:46                     ` David Miller
  2013-03-21 17:47                     ` Daniel Borkmann
@ 2013-03-21 17:49                     ` David Miller
  2013-03-21 17:56                       ` David Miller
  2 siblings, 1 reply; 20+ messages in thread
From: David Miller @ 2013-03-21 17:49 UTC (permalink / raw)
  To: willemb; +Cc: dborkman, netdev

From: Willem de Bruijn <willemb@google.com>
Date: Thu, 21 Mar 2013 13:27:51 -0400

> @@ -182,7 +182,13 @@ static char *sock_fanout_open_ring(int fd)
>                 .tp_frame_nr   = RING_NUM_FRAMES,
>         };
>         char *ring;
> +       int val = TPACKET_V2;
> 
> +       if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, (void *) &val,

This whole patch is whitespace damaged.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-21 17:49                     ` David Miller
@ 2013-03-21 17:56                       ` David Miller
  2013-03-21 18:01                         ` Willem de Bruijn
  0 siblings, 1 reply; 20+ messages in thread
From: David Miller @ 2013-03-21 17:56 UTC (permalink / raw)
  To: willemb; +Cc: dborkman, netdev

From: David Miller <davem@davemloft.net>
Date: Thu, 21 Mar 2013 13:49:14 -0400 (EDT)

> From: Willem de Bruijn <willemb@google.com>
> Date: Thu, 21 Mar 2013 13:27:51 -0400
> 
>> @@ -182,7 +182,13 @@ static char *sock_fanout_open_ring(int fd)
>>                 .tp_frame_nr   = RING_NUM_FRAMES,
>>         };
>>         char *ring;
>> +       int val = TPACKET_V2;
>> 
>> +       if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, (void *) &val,
> 
> This whole patch is whitespace damaged.

I hand applied this patch and it makes the test pass on sparc64.

Please submit this formally, thanks.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-21 17:47                     ` Daniel Borkmann
@ 2013-03-21 18:00                       ` Willem de Bruijn
  0 siblings, 0 replies; 20+ messages in thread
From: Willem de Bruijn @ 2013-03-21 18:00 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: David Miller, netdev

On Thu, Mar 21, 2013 at 1:47 PM, Daniel Borkmann <dborkman@redhat.com> wrote:
> On 03/21/2013 06:27 PM, Willem de Bruijn wrote:
>>
>> On Thu, Mar 21, 2013 at 2:31 AM, Daniel Borkmann <dborkman@redhat.com>
>> wrote:
>>>
>>> On 03/21/2013 01:07 AM, Willem de Bruijn wrote:
>>>>
>>>> On Wed, Mar 20, 2013 at 1:59 PM, David Miller <davem@davemloft.net>
>>>> wrote:
>>>>>
>>>>>
>>>>> From: David Miller <davem@davemloft.net>
>>>>> Date: Wed, 20 Mar 2013 12:33:44 -0400 (EDT)
>>>>>
>>>>>> From: Willem de Bruijn <willemb@google.com>
>>>>>> Date: Wed, 20 Mar 2013 02:42:44 -0400
>>>>>>
>>>>>>> Fix flaky results with PACKET_FANOUT_HASH depending on whether the
>>>>>>> two flows hash into the same packet socket or not.
>>>>>>>
>>>>>>> Also adds tests for PACKET_FANOUT_LB and PACKET_FANOUT_CPU and
>>>>>>> replaces the counting method with a packet ring.
>>>>>>>
>>>>>>> Signed-off-by: Willem de Bruijn <willemb@google.com>
>>>>>>
>>>>>>
>>>>>> Applied, thanks.  I'll retest on my sparc64 box later today.
>>>>>
>>>>>
>>>>> Unfortunately, it's still broken there:
>>>>
>>>>
>>>> This looks like a new problem. Now the counters all stay zero.
>>>>
>>>> I am looking into it. I have not been able to reproduce this on my
>>>> x86_64 so far, so just brought a sparc32 up in qemu. Had less luck
>>>> with sparc64, but impressive that it works at all. Come to think of
>>>> it, is this a 64-bit kernel with 32-bit userland? Perhaps that
>>>> affects packet ring memory layout.
>>>
>>>
>>>
>>> That can affect the ring buffer in case of TPACKET_V1, which is default
>>> if not specified otherwise. See Documentation/networking/packet_mmap.txt
>>> +514
>>
>>
>> Thanks, Daniel. In that case, the following should fix it.
>> Unfortunately, I don't have the hardware to verify, but it still
>> passes on my platforms. Let me know if you prefer it as a regular
>> patch instead of inline.
>
>
> I can only tell you about x86_64: [PASS],

Thanks for testing.

>  although two ERRORs:

That is due to a workaround for probing sockets whose rxhashes map
onto different packet sockets in the test. The current approach tries
up to N (is 5) times until we find a pair of connections that map onto
the different queues. These ERRORS are from a previous try that
failed. Warning would be a better name.

But the whole workaround is not very good, as it only reduces flaky
failures, not fix them. Worst case, if the hash is uniformly random,
then for two packet sockets there's a 1/2 chance on every try that the
two connections hit the same socket, so a 1/(2^N) chance that the test
gives up and fails. I haven't found a better solution yet to generate
a pair of sockets that always hash onto the different sockets.

>
> running psock_fanout test
> --------------------
> test: control single socket
> test: control multiple sockets
> test: datapath 0x0
> info: count=0,0, expect=0,0
> info: count=20,0, expect=15,5
> ERROR: incorrect queue lengths
> info: count=20,0, expect=20,5
>
> ERROR: incorrect queue lengths
> info: trying alternate ports (4)
> test: datapath 0x0
> [...]
> OK. All tests passed
> [PASS]

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout selftest hash collision
  2013-03-21 17:56                       ` David Miller
@ 2013-03-21 18:01                         ` Willem de Bruijn
  2013-03-21 18:10                           ` [PATCH net-next] net: fix psock_fanout on sparc64 Willem de Bruijn
  0 siblings, 1 reply; 20+ messages in thread
From: Willem de Bruijn @ 2013-03-21 18:01 UTC (permalink / raw)
  To: David Miller; +Cc: Daniel Borkmann, netdev

On Thu, Mar 21, 2013 at 1:56 PM, David Miller <davem@davemloft.net> wrote:
> From: David Miller <davem@davemloft.net>
> Date: Thu, 21 Mar 2013 13:49:14 -0400 (EDT)
>
>> From: Willem de Bruijn <willemb@google.com>
>> Date: Thu, 21 Mar 2013 13:27:51 -0400
>>
>>> @@ -182,7 +182,13 @@ static char *sock_fanout_open_ring(int fd)
>>>                 .tp_frame_nr   = RING_NUM_FRAMES,
>>>         };
>>>         char *ring;
>>> +       int val = TPACKET_V2;
>>>
>>> +       if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, (void *) &val,
>>
>> This whole patch is whitespace damaged.

Sorry. I did not know that that happened with cut and paste into gmail.

> I hand applied this patch and it makes the test pass on sparc64.

Great. Thanks for testing.

> Please submit this formally, thanks.

Will follow up right away.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH net-next] net: fix psock_fanout on sparc64
  2013-03-21 18:01                         ` Willem de Bruijn
@ 2013-03-21 18:10                           ` Willem de Bruijn
  2013-03-21 18:31                             ` David Miller
  0 siblings, 1 reply; 20+ messages in thread
From: Willem de Bruijn @ 2013-03-21 18:10 UTC (permalink / raw)
  To: davem, netdev; +Cc: Willem de Bruijn

The packetsocket fanout test uses a packet ring. Use TPACKET_V2
instead of TPACKET_V1 to work around a known 32/64 bit issue in
the older ring that manifests on sparc64.

Signed-off-by: Willem de Bruijn <willemb@google.com>
---
 tools/testing/selftests/net/psock_fanout.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/psock_fanout.c b/tools/testing/selftests/net/psock_fanout.c
index 226e5e3..59bd636 100644
--- a/tools/testing/selftests/net/psock_fanout.c
+++ b/tools/testing/selftests/net/psock_fanout.c
@@ -182,7 +182,13 @@ static char *sock_fanout_open_ring(int fd)
 		.tp_frame_nr   = RING_NUM_FRAMES,
 	};
 	char *ring;
+	int val = TPACKET_V2;
 
+	if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, (void *) &val,
+		       sizeof(val))) {
+		perror("packetsock ring setsockopt version");
+		exit(1);
+	}
 	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, (void *) &req,
 		       sizeof(req))) {
 		perror("packetsock ring setsockopt");
@@ -201,7 +207,7 @@ static char *sock_fanout_open_ring(int fd)
 
 static int sock_fanout_read_ring(int fd, void *ring)
 {
-	struct tpacket_hdr *header = ring;
+	struct tpacket2_hdr *header = ring;
 	int count = 0;
 
 	while (header->tp_status & TP_STATUS_USER && count < RING_NUM_FRAMES) {
-- 
1.8.1.3

^ permalink raw reply related	[flat|nested] 20+ messages in thread

* Re: [PATCH net-next] net: fix psock_fanout on sparc64
  2013-03-21 18:10                           ` [PATCH net-next] net: fix psock_fanout on sparc64 Willem de Bruijn
@ 2013-03-21 18:31                             ` David Miller
  0 siblings, 0 replies; 20+ messages in thread
From: David Miller @ 2013-03-21 18:31 UTC (permalink / raw)
  To: willemb; +Cc: netdev

From: Willem de Bruijn <willemb@google.com>
Date: Thu, 21 Mar 2013 14:10:03 -0400

> The packetsocket fanout test uses a packet ring. Use TPACKET_V2
> instead of TPACKET_V1 to work around a known 32/64 bit issue in
> the older ring that manifests on sparc64.
> 
> Signed-off-by: Willem de Bruijn <willemb@google.com>

Applied, thanks.

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2013-03-21 18:31 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-03-19 20:18 [PATCH net-next v3] packet: packet fanout rollover during socket overload Willem de Bruijn
2013-03-19 20:37 ` Eric Dumazet
2013-03-19 21:16   ` David Miller
2013-03-19 21:34     ` Willem de Bruijn
2013-03-20  6:37       ` Willem de Bruijn
2013-03-20  6:42         ` [PATCH net-next] net: fix psock_fanout selftest hash collision Willem de Bruijn
2013-03-20 16:33           ` David Miller
2013-03-20 17:59             ` David Miller
2013-03-21  0:07               ` Willem de Bruijn
2013-03-21  1:16                 ` David Miller
2013-03-21  6:31                 ` Daniel Borkmann
2013-03-21 17:27                   ` Willem de Bruijn
2013-03-21 17:46                     ` David Miller
2013-03-21 17:47                     ` Daniel Borkmann
2013-03-21 18:00                       ` Willem de Bruijn
2013-03-21 17:49                     ` David Miller
2013-03-21 17:56                       ` David Miller
2013-03-21 18:01                         ` Willem de Bruijn
2013-03-21 18:10                           ` [PATCH net-next] net: fix psock_fanout on sparc64 Willem de Bruijn
2013-03-21 18:31                             ` David Miller

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.