From: Willem de Bruijn <willemb@google.com>
To: netdev@vger.kernel.org
Cc: davem@davemloft.net, eric.dumazet@gmail.com,
david.laight@aculab.com, Willem de Bruijn <willemb@google.com>
Subject: [PATCH net-next v2 3/6] packet: rollover only to socket with headroom
Date: Sat, 9 May 2015 17:24:58 -0400 [thread overview]
Message-ID: <1431206701-5019-4-git-send-email-willemb@google.com> (raw)
In-Reply-To: <1431206701-5019-1-git-send-email-willemb@google.com>
From: Willem de Bruijn <willemb@google.com>
Only migrate flows to sockets that have sufficient headroom, where
sufficient is defined as having at least 25% empty space.
The kernel has three different buffer types: a regular socket, a ring
with frames (TPACKET_V[12]) or a ring with blocks (TPACKET_V3). The
latter two do not expose a read pointer to the kernel, so headroom is
not computed easily. All three need a different implementation to
estimate free space.
Tested:
Ran bench_rollover for 10 sec with 1.5 Mpps of single flow input.
bench_rollover has as many sockets as there are NIC receive queues
in the system. Each socket is owned by a process that is pinned to
one of the receive cpus. RFS is disabled. RPS is enabled with an
identity mapping (cpu x -> cpu x), to count drops with softnettop.
lpbb5:/export/hda3/willemb# ./bench_rollover -r -l 1000 -s
Press [Enter] to exit
cpu rx rx.k drop.k rollover r.huge r.failed
0 16 16 0 0 0 0
1 21 21 0 0 0 0
2 5227502 5227502 0 0 0 0
3 18 18 0 0 0 0
4 6083289 6083289 0 5227496 0 0
5 22 22 0 0 0 0
6 21 21 0 0 0 0
7 9 9 0 0 0 0
Signed-off-by: Willem de Bruijn <willemb@google.com>
---
net/packet/af_packet.c | 74 +++++++++++++++++++++++++++++++++++++++-----------
1 file changed, 58 insertions(+), 16 deletions(-)
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index f8ec909..fb421a8 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -1234,27 +1234,68 @@ static void packet_free_pending(struct packet_sock *po)
free_percpu(po->tx_ring.pending_refcnt);
}
-static bool packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+#define ROOM_POW_OFF 2
+#define ROOM_NONE 0x0
+#define ROOM_LOW 0x1
+#define ROOM_NORMAL 0x2
+
+static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
+{
+ int idx, len;
+
+ len = po->rx_ring.frame_max + 1;
+ idx = po->rx_ring.head;
+ if (pow_off)
+ idx += len >> pow_off;
+ if (idx >= len)
+ idx -= len;
+ return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+{
+ int idx, len;
+
+ len = po->rx_ring.prb_bdqc.knum_blocks;
+ idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+ if (pow_off)
+ idx += len >> pow_off;
+ if (idx >= len)
+ idx -= len;
+ return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
+}
+
+static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
{
struct sock *sk = &po->sk;
- bool has_room;
+ int ret = ROOM_NONE;
- if (po->prot_hook.func != tpacket_rcv)
- return (atomic_read(&sk->sk_rmem_alloc) + skb->truesize)
- <= sk->sk_rcvbuf;
+ if (po->prot_hook.func != tpacket_rcv) {
+ int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+ - skb->truesize;
+ if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+ return ROOM_NORMAL;
+ else if (avail > 0)
+ return ROOM_LOW;
+ else
+ return ROOM_NONE;
+ }
spin_lock(&sk->sk_receive_queue.lock);
- if (po->tp_version == TPACKET_V3)
- has_room = prb_lookup_block(po, &po->rx_ring,
- po->rx_ring.prb_bdqc.kactive_blk_num,
- TP_STATUS_KERNEL);
- else
- has_room = packet_lookup_frame(po, &po->rx_ring,
- po->rx_ring.head,
- TP_STATUS_KERNEL);
+ if (po->tp_version == TPACKET_V3) {
+ if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
+ ret = ROOM_NORMAL;
+ else if (__tpacket_v3_has_room(po, 0))
+ ret = ROOM_LOW;
+ } else {
+ if (__tpacket_has_room(po, ROOM_POW_OFF))
+ ret = ROOM_NORMAL;
+ else if (__tpacket_has_room(po, 0))
+ ret = ROOM_LOW;
+ }
spin_unlock(&sk->sk_receive_queue.lock);
- return has_room;
+ return ret;
}
static void packet_sock_destruct(struct sock *sk)
@@ -1325,12 +1366,13 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
unsigned int i, j;
po = pkt_sk(f->arr[idx]);
- if (try_self && packet_rcv_has_room(po, skb))
+ if (try_self && packet_rcv_has_room(po, skb) != ROOM_NONE)
return idx;
i = j = min_t(int, po->rollover->sock, num - 1);
do {
- if (i != idx && packet_rcv_has_room(pkt_sk(f->arr[i]), skb)) {
+ if (i != idx &&
+ packet_rcv_has_room(pkt_sk(f->arr[i]), skb) == ROOM_NORMAL) {
if (i != j)
po->rollover->sock = i;
return i;
--
2.2.0.rc0.207.ga3a616c
next prev parent reply other threads:[~2015-05-09 21:25 UTC|newest]
Thread overview: 9+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-05-09 21:24 [PATCH net-next v2 0/6] packet: refine rollover Willem de Bruijn
2015-05-09 21:24 ` [PATCH net-next v2 1/6] packet: rollover prepare: move code out of callsites Willem de Bruijn
2015-05-09 21:24 ` [PATCH net-next v2 2/6] packet: rollover prepare: per-socket state Willem de Bruijn
2015-05-09 21:24 ` Willem de Bruijn [this message]
2015-05-09 21:24 ` [PATCH net-next v2 4/6] packet: rollover lock contention avoidance Willem de Bruijn
2015-05-09 21:25 ` [PATCH net-next v2 5/6] packet: rollover huge flows before small flows Willem de Bruijn
2015-05-09 21:25 ` [PATCH net-next v2 6/6] packet: rollover statistics Willem de Bruijn
2015-05-11 17:09 ` David Miller
2015-05-12 15:04 ` Willem de Bruijn
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1431206701-5019-4-git-send-email-willemb@google.com \
--to=willemb@google.com \
--cc=davem@davemloft.net \
--cc=david.laight@aculab.com \
--cc=eric.dumazet@gmail.com \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).