netfilter-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next v19 4/8] netfilter: Add nf_ct_get_tuple_skb global lookup function
  2018-07-06 15:37 [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc Toke Høiland-Jørgensen
  2018-07-06 15:37 ` [PATCH net-next v19 5/8] sch_cake: Add NAT awareness to packet classifier Toke Høiland-Jørgensen
@ 2018-07-06 15:37 ` Toke Høiland-Jørgensen
  2018-07-11  5:56 ` [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc David Miller
  2 siblings, 0 replies; 5+ messages in thread
From: Toke Høiland-Jørgensen @ 2018-07-06 15:37 UTC (permalink / raw)
  To: netdev; +Cc: netfilter-devel, cake

This adds a global netfilter function to extract a conntrack tuple from an
skb. The function uses a new function added to nf_ct_hook, which will try
to get the tuple from skb->_nfct, and do a full lookup if that fails. This
makes it possible to use the lookup function before the skb has passed
through the conntrack init hooks (e.g., in an ingress qdisc). The tuple is
copied to the caller to avoid issues with reference counting.

The function returns false if conntrack is not loaded, allowing it to be
used without incurring a module dependency on conntrack. This is used by
the NAT mode in sch_cake.

Cc: netfilter-devel@vger.kernel.org
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 include/linux/netfilter.h         |   11 +++++++++++
 net/netfilter/core.c              |   15 +++++++++++++++
 net/netfilter/nf_conntrack_core.c |   36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 5a5e0a2ab2a3..bf43e24b9dd0 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -410,8 +410,17 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
 
 extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu;
 void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
+struct nf_conntrack_tuple;
+bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+			 const struct sk_buff *skb);
 #else
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
+struct nf_conntrack_tuple;
+static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+				       const struct sk_buff *skb)
+{
+	return false;
+}
 #endif
 
 struct nf_conn;
@@ -420,6 +429,8 @@ enum ip_conntrack_info;
 struct nf_ct_hook {
 	int (*update)(struct net *net, struct sk_buff *skb);
 	void (*destroy)(struct nf_conntrack *);
+	bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
+			      const struct sk_buff *);
 };
 extern struct nf_ct_hook __rcu *nf_ct_hook;
 
diff --git a/net/netfilter/core.c b/net/netfilter/core.c
index 168af54db975..dc240cb47ddf 100644
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -603,6 +603,21 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
 }
 EXPORT_SYMBOL(nf_conntrack_destroy);
 
+bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+			 const struct sk_buff *skb)
+{
+	struct nf_ct_hook *ct_hook;
+	bool ret = false;
+
+	rcu_read_lock();
+	ct_hook = rcu_dereference(nf_ct_hook);
+	if (ct_hook)
+		ret = ct_hook->get_tuple_skb(dst_tuple, skb);
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL(nf_ct_get_tuple_skb);
+
 /* Built-in default zone used e.g. by modules. */
 const struct nf_conntrack_zone nf_ct_zone_dflt = {
 	.id	= NF_CT_DEFAULT_ZONE_ID,
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3465da2a98bd..85ab2fd6a665 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1683,6 +1683,41 @@ static int nf_conntrack_update(struct net *net, struct sk_buff *skb)
 	return 0;
 }
 
+static bool nf_conntrack_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+				       const struct sk_buff *skb)
+{
+	const struct nf_conntrack_tuple *src_tuple;
+	const struct nf_conntrack_tuple_hash *hash;
+	struct nf_conntrack_tuple srctuple;
+	enum ip_conntrack_info ctinfo;
+	struct nf_conn *ct;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct) {
+		src_tuple = nf_ct_tuple(ct, CTINFO2DIR(ctinfo));
+		memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
+		return true;
+	}
+
+	if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb),
+			       NFPROTO_IPV4, dev_net(skb->dev),
+			       &srctuple))
+		return false;
+
+	hash = nf_conntrack_find_get(dev_net(skb->dev),
+				     &nf_ct_zone_dflt,
+				     &srctuple);
+	if (!hash)
+		return false;
+
+	ct = nf_ct_tuplehash_to_ctrack(hash);
+	src_tuple = nf_ct_tuple(ct, !hash->tuple.dst.dir);
+	memcpy(dst_tuple, src_tuple, sizeof(*dst_tuple));
+	nf_ct_put(ct);
+
+	return true;
+}
+
 /* Bring out ya dead! */
 static struct nf_conn *
 get_next_corpse(int (*iter)(struct nf_conn *i, void *data),
@@ -2204,6 +2239,7 @@ int nf_conntrack_init_start(void)
 static struct nf_ct_hook nf_conntrack_hook = {
 	.update		= nf_conntrack_update,
 	.destroy	= destroy_conntrack,
+	.get_tuple_skb  = nf_conntrack_get_tuple_skb,
 };
 
 void nf_conntrack_init_end(void)

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc
@ 2018-07-06 15:37 Toke Høiland-Jørgensen
  2018-07-06 15:37 ` [PATCH net-next v19 5/8] sch_cake: Add NAT awareness to packet classifier Toke Høiland-Jørgensen
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: Toke Høiland-Jørgensen @ 2018-07-06 15:37 UTC (permalink / raw)
  To: netdev
  Cc: Georgios Amanakis, Pete Heist, Yuchung Cheng, Neal Cardwell,
	Dave Taht, netfilter-devel, cake

This patch series adds the CAKE qdisc, and has been split up to ease
review.

I have attempted to split out each configurable feature into its own patch.
The first commit adds the base shaper and packet scheduler, while
subsequent commits add the optional features. The full userspace API and
most data structures are included in the first commit, but options not
understood in the base version will be ignored.

The result of applying the entire series is identical to the out of tree
version that has seen extensive testing in previous deployments, most
notably as an out of tree patch to OpenWrt. However, note that I have only
compile tested the individual patches; so the whole series should be
considered as a unit.

---
Changelog

v19:
  - Rebase to current net-next.
  - Don't rely on the value of sch->q.qlen to break loops; fixes possible
    infinite loop on multi-queue devices.
  - Don't overwrite NAT flag when setting flow mode.

v18:
  - Rework classification logic in the diffserv case to always hash if
    filter doesn't select a queue, and to run TC filters before
    selecting the diffserv tin (allowing filter to influence this).
  - Make sure we always call qdisc_watchdog_init() in cake_init(), so we
    don't crash in cake_destroy().

v17:
  - Rebase to newest net-next and move the conntrack callback to
    nf_ct_hook
  - Fix a compile error when NF_CONNTRACK is unset.

v16:
  - Move conntrack lookup function into conntrack core and read it via
    RCU so it is only active when the nf_conntrack module is loaded.
    This avoids the module dependency on conntrack for NAT mode. Thanks
    to Pablo for the idea.

v15:
  - Handle ECN flags in ACK filter

v14:
  - Handle seqno wraps and DSACKs in ACK filter

v13:
  - Avoid ktime_t to scalar compares
  - Add class dumping and basic stats
  - Fail with ENOTSUPP when requesting NAT mode and conntrack is not
    available.
  - Parse all TCP options in ACK filter and make sure to only drop safe
    ones. Also handle SACK ranges properly.

v12:
  - Get rid of custom time typedefs. Use ktime_t for time and u64 for
    duration instead.

v11:
  - Fix overhead compensation calculation for GSO packets
  - Change configured rate to be u64 (I ran out of bits before I ran out
    of CPU when testing the effects of the above)

v10:
  - Christmas tree gardening (fix variable declarations to be in reverse
    line length order)

v9:
  - Remove duplicated checks around kvfree() and just call it
    unconditionally.
  - Don't pass __GFP_NOWARN when allocating memory
  - Move options in cake_dump() that are related to optional features to
    later patches implementing the features.
  - Support attaching filters to the qdisc and use the classification
    result to select flow queue.
  - Support overriding diffserv priority tin from skb->priority

v8:
  - Remove inline keyword from function definitions
  - Simplify ACK filter; remove the complex state handling to make the
    logic easier to follow. This will potentially be a bit less efficient,
    but I have not been able to measure a difference.

v7:
  - Split up patch into a series to ease review.
  - Constify the ACK filter.

v6:
  - Fix 6in4 encapsulation checks in ACK filter code
  - Checkpatch fixes

v5:
  - Refactor ACK filter code and hopefully fix the safety issues
    properly this time.

v4:
  - Only split GSO packets if shaping at speeds <= 1Gbps
  - Fix overhead calculation code to also work for GSO packets
  - Don't re-implement kvzalloc()
  - Remove local header include from out-of-tree build (fixes kbuild-bot
    complaint).
  - Several fixes to the ACK filter:
    - Check pskb_may_pull() before deref of transport headers.
    - Don't run ACK filter logic on split GSO packets
    - Fix TCP sequence number compare to deal with wraparounds

v3:
  - Use IS_REACHABLE() macro to fix compilation when sch_cake is
    built-in and conntrack is a module.
  - Switch the stats output to use nested netlink attributes instead
    of a versioned struct.
  - Remove GPL boilerplate.
  - Fix array initialisation style.

v2:
  - Fix kbuild test bot complaint
  - Clean up the netlink ABI
  - Fix checkpatch complaints
  - A few tweaks to the behaviour of cake based on testing carried out
    while writing the paper.

---

Toke Høiland-Jørgensen (8):
      sched: Add Common Applications Kept Enhanced (cake) qdisc
      sch_cake: Add ingress mode
      sch_cake: Add optional ACK filter
      netfilter: Add nf_ct_get_tuple_skb global lookup function
      sch_cake: Add NAT awareness to packet classifier
      sch_cake: Add DiffServ handling
      sch_cake: Add overhead compensation support to the rate shaper
      sch_cake: Conditionally split GSO segments


 include/linux/netfilter.h         |   11 
 include/uapi/linux/pkt_sched.h    |  114 +
 net/netfilter/core.c              |   15 
 net/netfilter/nf_conntrack_core.c |   36 
 net/sched/Kconfig                 |   11 
 net/sched/Makefile                |    1 
 net/sched/sch_cake.c              | 3019 +++++++++++++++++++++++++++++++++++++
 7 files changed, 3207 insertions(+)
 create mode 100644 net/sched/sch_cake.c

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH net-next v19 5/8] sch_cake: Add NAT awareness to packet classifier
  2018-07-06 15:37 [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc Toke Høiland-Jørgensen
@ 2018-07-06 15:37 ` Toke Høiland-Jørgensen
  2018-07-06 15:37 ` [PATCH net-next v19 4/8] netfilter: Add nf_ct_get_tuple_skb global lookup function Toke Høiland-Jørgensen
  2018-07-11  5:56 ` [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc David Miller
  2 siblings, 0 replies; 5+ messages in thread
From: Toke Høiland-Jørgensen @ 2018-07-06 15:37 UTC (permalink / raw)
  To: netdev; +Cc: netfilter-devel, cake

When CAKE is deployed on a gateway that also performs NAT (which is a
common deployment mode), the host fairness mechanism cannot distinguish
internal hosts from each other, and so fails to work correctly.

To fix this, we add an optional NAT awareness mode, which will query the
kernel conntrack mechanism to obtain the pre-NAT addresses for each packet
and use that in the flow and host hashing.

When the shaper is enabled and the host is already performing NAT, the cost
of this lookup is negligible. However, in unlimited mode with no NAT being
performed, there is a significant CPU cost at higher bandwidths. For this
reason, the feature is turned off by default.

Cc: netfilter-devel@vger.kernel.org
Signed-off-by: Toke Høiland-Jørgensen <toke@toke.dk>
---
 net/sched/sch_cake.c |   51 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c
index 930096d46c4f..633ca1578114 100644
--- a/net/sched/sch_cake.c
+++ b/net/sched/sch_cake.c
@@ -71,6 +71,10 @@
 #include <net/tcp.h>
 #include <net/flow_dissector.h>
 
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#endif
+
 #define CAKE_SET_WAYS (8)
 #define CAKE_MAX_TINS (8)
 #define CAKE_QUEUES (1024)
@@ -516,6 +520,29 @@ static bool cobalt_should_drop(struct cobalt_vars *vars,
 	return drop;
 }
 
+static void cake_update_flowkeys(struct flow_keys *keys,
+				 const struct sk_buff *skb)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	struct nf_conntrack_tuple tuple = {};
+	bool rev = !skb->_nfct;
+
+	if (tc_skb_protocol(skb) != htons(ETH_P_IP))
+		return;
+
+	if (!nf_ct_get_tuple_skb(&tuple, skb))
+		return;
+
+	keys->addrs.v4addrs.src = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
+	keys->addrs.v4addrs.dst = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
+
+	if (keys->ports.ports) {
+		keys->ports.src = rev ? tuple.dst.u.all : tuple.src.u.all;
+		keys->ports.dst = rev ? tuple.src.u.all : tuple.dst.u.all;
+	}
+#endif
+}
+
 /* Cake has several subtle multiple bit settings. In these cases you
  *  would be matching triple isolate mode as well.
  */
@@ -543,6 +570,9 @@ static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
 	skb_flow_dissect_flow_keys(skb, &keys,
 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
 
+	if (flow_mode & CAKE_FLOW_NAT_FLAG)
+		cake_update_flowkeys(&keys, skb);
+
 	/* flow_hash_from_keys() sorts the addresses by value, so we have
 	 * to preserve their order in a separate data structure to treat
 	 * src and dst host addresses as independently selectable.
@@ -1939,12 +1969,25 @@ static int cake_change(struct Qdisc *sch, struct nlattr *opt,
 	if (err < 0)
 		return err;
 
+	if (tb[TCA_CAKE_NAT]) {
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
+		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
+			!!nla_get_u32(tb[TCA_CAKE_NAT]);
+#else
+		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
+				    "No conntrack support in kernel");
+		return -EOPNOTSUPP;
+#endif
+	}
+
 	if (tb[TCA_CAKE_BASE_RATE64])
 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
 
 	if (tb[TCA_CAKE_FLOW_MODE])
-		q->flow_mode = (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
-				CAKE_FLOW_MASK);
+		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
+				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
+					CAKE_FLOW_MASK));
 
 	if (tb[TCA_CAKE_RTT]) {
 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
@@ -2111,6 +2154,10 @@ static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
 		goto nla_put_failure;
 
+	if (nla_put_u32(skb, TCA_CAKE_NAT,
+			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
+		goto nla_put_failure;
+
 	return nla_nest_end(skb, opts);
 
 nla_put_failure:

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc
  2018-07-06 15:37 [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc Toke Høiland-Jørgensen
  2018-07-06 15:37 ` [PATCH net-next v19 5/8] sch_cake: Add NAT awareness to packet classifier Toke Høiland-Jørgensen
  2018-07-06 15:37 ` [PATCH net-next v19 4/8] netfilter: Add nf_ct_get_tuple_skb global lookup function Toke Høiland-Jørgensen
@ 2018-07-11  5:56 ` David Miller
  2018-07-11 20:40   ` Toke Høiland-Jørgensen
  2 siblings, 1 reply; 5+ messages in thread
From: David Miller @ 2018-07-11  5:56 UTC (permalink / raw)
  To: toke
  Cc: netdev, gamanakis, peteheist, ycheng, ncardwell, dave.taht,
	netfilter-devel, cake

From: Toke Høiland-Jørgensen <toke@toke.dk>
Date: Fri, 06 Jul 2018 17:37:19 +0200

> This patch series adds the CAKE qdisc, and has been split up to ease
> review.
> 
> I have attempted to split out each configurable feature into its own patch.
> The first commit adds the base shaper and packet scheduler, while
> subsequent commits add the optional features. The full userspace API and
> most data structures are included in this commit, but options not
> understood in the base version will be ignored.
> 
> The result of applying the entire series is identical to the out of tree
> version that has seen extensive testing in previous deployments, most
> notably as an out of tree patch to OpenWrt. However, note that I have only
> compile tested the individual patches; so the whole series should be
> considered as a unit.

Ok, I decided to apply this even though there are still bits I'm not
100% happy with.

I don't like the netfilter dependency at all.

You can get the NAT addresses in other ways as I've tried to suggest
in the past.  Your scheme absolutely does not work with act_nat
in the packet scheduler, nor any NAT done by XDP/eBPF programs.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc
  2018-07-11  5:56 ` [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc David Miller
@ 2018-07-11 20:40   ` Toke Høiland-Jørgensen
  0 siblings, 0 replies; 5+ messages in thread
From: Toke Høiland-Jørgensen @ 2018-07-11 20:40 UTC (permalink / raw)
  To: David Miller
  Cc: netdev, gamanakis, peteheist, ycheng, ncardwell, dave.taht,
	netfilter-devel, cake

David Miller <davem@davemloft.net> writes:

> From: Toke Høiland-Jørgensen <toke@toke.dk>
> Date: Fri, 06 Jul 2018 17:37:19 +0200
>
>> This patch series adds the CAKE qdisc, and has been split up to ease
>> review.
>> 
>> I have attempted to split out each configurable feature into its own patch.
>> The first commit adds the base shaper and packet scheduler, while
>> subsequent commits add the optional features. The full userspace API and
>> most data structures are included in this commit, but options not
>> understood in the base version will be ignored.
>> 
>> The result of applying the entire series is identical to the out of tree
>> version that has seen extensive testing in previous deployments, most
>> notably as an out of tree patch to OpenWrt. However, note that I have only
>> compile tested the individual patches; so the whole series should be
>> considered as a unit.
>
> Ok, I decided to apply this even though there are still bits I'm not
> 100% happy with.

Yay, awesome, thanks! :)

> I don't like the netfilter dependency at all.
>
> You can get the NAT addresses in other ways as I've tried to suggest
> in the past. Your scheme absolutely does not work with act_nat in the
> packet scheduler, nor any NAT done by XDP/eBPF programs.

Just to reiterate why we didn't go with your suggestion of recording the
pre-NAT IP in the flow dissector as the packet comes in:

- It only works on egress; on ingress (with an ifb), packets hit the
  qdisc before NAT, so we need the stateful lookup in CAKE for this
  case, which is a common deployment scenario.

- It's not needed for act_nat (for 1-to-1 NAT, hashing on the post-NAT
  IP is fine), and it won't work for XDP (which would change the packets
  before the flow dissector sees them). This means that custom NAT
  solutions in TC BPF hooks are the only ones that would benefit; and
  they can just set the classifier to achieve the same thing.

Now, I'm absolutely not opposed to having this as a fallback egress-only
mechanism. I might even be convinced to write it myself if someone
demonstrates that they really need it :)

-Toke

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2018-07-11 20:40 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-07-06 15:37 [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc Toke Høiland-Jørgensen
2018-07-06 15:37 ` [PATCH net-next v19 5/8] sch_cake: Add NAT awareness to packet classifier Toke Høiland-Jørgensen
2018-07-06 15:37 ` [PATCH net-next v19 4/8] netfilter: Add nf_ct_get_tuple_skb global lookup function Toke Høiland-Jørgensen
2018-07-11  5:56 ` [PATCH net-next v19 0/8] sched: Add Common Applications Kept Enhanced (cake) qdisc David Miller
2018-07-11 20:40   ` Toke Høiland-Jørgensen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).