netfilter-devel.vger.kernel.org archive mirror
* [PATCH net-next 0/3] make nf_ct_frag/6_gather elide the skb CB clear
@ 2020-07-05 14:28 wenxu
  2020-07-05 14:28 ` [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support wenxu
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: wenxu @ 2020-07-05 14:28 UTC (permalink / raw)
  To: davem, pablo; +Cc: netdev, netfilter-devel

From: wenxu <wenxu@ucloud.cn>

Add nf_ct_frag_gather and make nf_ct_frag6_gather elide the skb CB
clear when packets are defragmented by connection tracking. This lets
each subsystem such as br_netfilter, openvswitch and act_ct do defrag
without having to save and restore the CB. It also avoids serious
crashes and problems in the ct subsystem, because some packet
schedulers store pointers in the qdisc CB private area and access the
skb in parallel.
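
A rough caller-side sketch of the new convention (illustration only,
not part of the series; the exact hunks are in the patches below):

	/* before: callers cleared/saved the skb CB around defrag */
	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
	err = nf_ct_frag6_gather(net, skb, user);

	/* after: the CB is left untouched; frag_max_size, the one CB
	 * field callers read back, is returned through a pointer
	 * (pass NULL when it is not needed)
	 */
	err = nf_ct_frag6_gather(net, skb, user,
				 &IP6CB(skb)->frag_max_size);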

This series follows up on
http://patchwork.ozlabs.org/project/netdev/patch/1593422178-26949-1-git-send-email-wenxu@ucloud.cn/

patch1: add nf_ct_frag_gather, which elides the CB clear
patch2: make nf_ct_frag6_gather elide the CB clear
patch3: fix clobbering of qdisc_skb_cb in act_ct defrag

wenxu (3):
  netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support
  netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB
    clear
  net/sched: act_ct: fix clobber qdisc_skb_cb in defrag

 include/linux/netfilter_ipv6.h              |   9 +-
 include/net/netfilter/ipv4/nf_defrag_ipv4.h |   2 +
 include/net/netfilter/ipv6/nf_defrag_ipv6.h |   3 +-
 net/bridge/netfilter/nf_conntrack_bridge.c  |   7 +-
 net/ipv4/netfilter/nf_defrag_ipv4.c         | 314 ++++++++++++++++++++++++++++
 net/ipv6/netfilter/nf_conntrack_reasm.c     |  19 +-
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c   |   3 +-
 net/openvswitch/conntrack.c                 |   8 +-
 net/sched/act_ct.c                          |  12 +-
 9 files changed, 350 insertions(+), 27 deletions(-)

-- 
1.8.3.1


^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support
  2020-07-05 14:28 [PATCH net-next 0/3] make nf_ct_frag/6_gather elide the skb CB clear wenxu
@ 2020-07-05 14:28 ` wenxu
  2020-07-06  5:40   ` Cong Wang
  2020-07-06 14:38   ` Florian Westphal
  2020-07-05 14:28 ` [PATCH net-next 2/3] netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB clear wenxu
  2020-07-05 14:28 ` [PATCH net-next 3/3] net/sched: act_ct: fix clobber qdisc_skb_cb in defrag wenxu
  2 siblings, 2 replies; 9+ messages in thread
From: wenxu @ 2020-07-05 14:28 UTC (permalink / raw)
  To: davem, pablo; +Cc: netdev, netfilter-devel

From: wenxu <wenxu@ucloud.cn>

Add nf_ct_frag_gather for conntrack defrag; it elides the skb CB
clear when packets are defragmented by connection tracking.

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 include/net/netfilter/ipv4/nf_defrag_ipv4.h |   2 +
 net/ipv4/netfilter/nf_defrag_ipv4.c         | 314 ++++++++++++++++++++++++++++
 2 files changed, 316 insertions(+)

diff --git a/include/net/netfilter/ipv4/nf_defrag_ipv4.h b/include/net/netfilter/ipv4/nf_defrag_ipv4.h
index bcbd724..b3ac92b 100644
--- a/include/net/netfilter/ipv4/nf_defrag_ipv4.h
+++ b/include/net/netfilter/ipv4/nf_defrag_ipv4.h
@@ -4,5 +4,7 @@
 
 struct net;
 int nf_defrag_ipv4_enable(struct net *);
+int nf_ct_frag_gather(struct net *net, struct sk_buff *skb,
+		      u32 user, u16 *frag_max_size);
 
 #endif /* _NF_DEFRAG_IPV4_H */
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 8115611..a4ac207 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -11,6 +11,7 @@
 #include <net/netns/generic.h>
 #include <net/route.h>
 #include <net/ip.h>
+#include <net/inet_frag.h>
 
 #include <linux/netfilter_bridge.h>
 #include <linux/netfilter_ipv4.h>
@@ -22,6 +23,319 @@
 
 static DEFINE_MUTEX(defrag4_mutex);
 
+/* Describe an entry in the "incomplete datagrams" queue. */
+struct ipq {
+	struct inet_frag_queue q;
+
+	u8		ecn; /* RFC3168 support */
+	u16		max_df_size; /* largest frag with DF set seen */
+	int             iif;
+	unsigned int    rid;
+	struct inet_peer *peer;
+};
+
+static void ipq_kill(struct ipq *ipq)
+{
+	inet_frag_kill(&ipq->q);
+}
+
+static void ipq_put(struct ipq *ipq)
+{
+	inet_frag_put(&ipq->q);
+}
+
+static bool nf_ct_frag_coalesce_ok(const struct ipq *qp)
+{
+	return qp->q.key.v4.user == IP_DEFRAG_LOCAL_DELIVER;
+}
+
+static u8 nf_ct_frag_ecn(u8 tos)
+{
+	return 1 << (tos & INET_ECN_MASK);
+}
+
+static int nf_ct_frag_too_far(struct ipq *qp)
+{
+	struct inet_peer *peer = qp->peer;
+	unsigned int max = qp->q.fqdir->max_dist;
+	unsigned int start, end;
+
+	int rc;
+
+	if (!peer || !max)
+		return 0;
+
+	start = qp->rid;
+	end = atomic_inc_return(&peer->rid);
+	qp->rid = end;
+
+	rc = qp->q.fragments_tail && (end - start) > max;
+
+	return rc;
+}
+
+static int nf_ct_frag_reinit(struct ipq *qp)
+{
+	unsigned int sum_truesize = 0;
+
+	if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
+		refcount_inc(&qp->q.refcnt);
+		return -ETIMEDOUT;
+	}
+
+	sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
+	sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
+
+	qp->q.flags = 0;
+	qp->q.len = 0;
+	qp->q.meat = 0;
+	qp->q.rb_fragments = RB_ROOT;
+	qp->q.fragments_tail = NULL;
+	qp->q.last_run_head = NULL;
+	qp->iif = 0;
+	qp->ecn = 0;
+
+	return 0;
+}
+
+static struct ipq *ip_find(struct net *net, struct iphdr *iph,
+			   u32 user, int vif)
+{
+	struct frag_v4_compare_key key = {
+		.saddr = iph->saddr,
+		.daddr = iph->daddr,
+		.user = user,
+		.vif = vif,
+		.id = iph->id,
+		.protocol = iph->protocol,
+	};
+	struct inet_frag_queue *q;
+
+	q = inet_frag_find(net->ipv4.fqdir, &key);
+	if (!q)
+		return NULL;
+
+	return container_of(q, struct ipq, q);
+}
+
+static int nf_ct_frag_reasm(struct ipq *qp, struct sk_buff *skb,
+			    struct sk_buff *prev_tail, struct net_device *dev,
+			    u16 *frag_max_size)
+{
+	struct iphdr *iph;
+	void *reasm_data;
+	int len, err;
+	u8 ecn;
+
+	ipq_kill(qp);
+
+	ecn = ip_frag_ecn_table[qp->ecn];
+	if (unlikely(ecn == 0xff)) {
+		err = -EINVAL;
+		goto out_fail;
+	}
+
+	/* Make the one we just received the head. */
+	reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
+	if (!reasm_data)
+		goto out_nomem;
+
+	len = ip_hdrlen(skb) + qp->q.len;
+	err = -E2BIG;
+	if (len > 65535)
+		goto out_oversize;
+
+	inet_frag_reasm_finish(&qp->q, skb, reasm_data,
+			       nf_ct_frag_coalesce_ok(qp));
+
+	skb->dev = dev;
+	if (frag_max_size)
+		*frag_max_size = max(qp->max_df_size, qp->q.max_size);
+
+	iph = ip_hdr(skb);
+	iph->tot_len = htons(len);
+	iph->tos |= ecn;
+
+	/* When we set IP_DF on a refragmented skb we must also force a
+	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
+	 * original sender only sent fragments of size f (where f < s).
+	 *
+	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
+	 * frag seen to avoid sending tiny DF-fragments in case skb was built
+	 * from one very small df-fragment and one large non-df frag.
+	 */
+	if (qp->max_df_size == qp->q.max_size)
+		iph->frag_off = htons(IP_DF);
+	else
+		iph->frag_off = 0;
+
+	ip_send_check(iph);
+
+	qp->q.rb_fragments = RB_ROOT;
+	qp->q.fragments_tail = NULL;
+	qp->q.last_run_head = NULL;
+	return 0;
+
+out_nomem:
+	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
+	err = -ENOMEM;
+	goto out_fail;
+out_oversize:
+	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
+out_fail:
+	return err;
+}
+
+static int nf_ct_frag_queue(struct ipq *qp, struct sk_buff *skb, u16 *frag_max_size)
+{
+	int ihl, end, flags, offset;
+	struct sk_buff *prev_tail;
+	struct net_device *dev;
+	unsigned int fragsize;
+	int err = -ENOENT;
+	u8 ecn;
+
+	if (qp->q.flags & INET_FRAG_COMPLETE)
+		goto err;
+
+	if (unlikely(nf_ct_frag_too_far(qp))) {
+		err = nf_ct_frag_reinit(qp);
+		if (unlikely(err)) {
+			ipq_kill(qp);
+			goto err;
+		}
+	}
+
+	ecn = nf_ct_frag_ecn(ip_hdr(skb)->tos);
+	offset = ntohs(ip_hdr(skb)->frag_off);
+	flags = offset & ~IP_OFFSET;
+	offset &= IP_OFFSET;
+	offset <<= 3;		/* offset is in 8-byte chunks */
+	ihl = ip_hdrlen(skb);
+
+	/* Determine the position of this fragment. */
+	end = offset + skb->len - skb_network_offset(skb) - ihl;
+	err = -EINVAL;
+
+	/* Is this the final fragment? */
+	if ((flags & IP_MF) == 0) {
+		/* If we already have some bits beyond end
+		 * or have different end, the segment is corrupted.
+		 */
+		if (end < qp->q.len ||
+		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
+			goto discard_qp;
+		qp->q.flags |= INET_FRAG_LAST_IN;
+		qp->q.len = end;
+	} else {
+		if (end & 7) {
+			end &= ~7;
+			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
+				skb->ip_summed = CHECKSUM_NONE;
+		}
+		if (end > qp->q.len) {
+			/* Some bits beyond end -> corruption. */
+			if (qp->q.flags & INET_FRAG_LAST_IN)
+				goto discard_qp;
+			qp->q.len = end;
+		}
+	}
+	if (end == offset)
+		goto discard_qp;
+
+	err = -ENOMEM;
+	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
+		goto discard_qp;
+
+	err = pskb_trim_rcsum(skb, end - offset);
+	if (err)
+		goto discard_qp;
+
+	/* Note : skb->rbnode and skb->dev share the same location. */
+	dev = skb->dev;
+	/* Makes sure compiler wont do silly aliasing games */
+	barrier();
+
+	prev_tail = qp->q.fragments_tail;
+	err = inet_frag_queue_insert(&qp->q, skb, offset, end);
+	if (err)
+		goto insert_error;
+
+	if (dev)
+		qp->iif = dev->ifindex;
+
+	qp->q.stamp = skb->tstamp;
+	qp->q.meat += skb->len;
+	qp->ecn |= ecn;
+	add_frag_mem_limit(qp->q.fqdir, skb->truesize);
+	if (offset == 0)
+		qp->q.flags |= INET_FRAG_FIRST_IN;
+
+	fragsize = skb->len + ihl;
+
+	if (fragsize > qp->q.max_size)
+		qp->q.max_size = fragsize;
+
+	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
+	    fragsize > qp->max_df_size)
+		qp->max_df_size = fragsize;
+
+	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
+	    qp->q.meat == qp->q.len) {
+		unsigned long orefdst = skb->_skb_refdst;
+
+		skb->_skb_refdst = 0UL;
+		err = nf_ct_frag_reasm(qp, skb, prev_tail, dev, frag_max_size);
+		skb->_skb_refdst = orefdst;
+		if (err)
+			inet_frag_kill(&qp->q);
+		return err;
+	}
+
+	skb_dst_drop(skb);
+	return -EINPROGRESS;
+
+insert_error:
+	if (err == IPFRAG_DUP) {
+		kfree_skb(skb);
+		return -EINVAL;
+	}
+	err = -EINVAL;
+discard_qp:
+	inet_frag_kill(&qp->q);
+err:
+	kfree_skb(skb);
+	return err;
+}
+
+int nf_ct_frag_gather(struct net *net, struct sk_buff *skb,
+		      u32 user, u16 *frag_max_size)
+{
+	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
+	int vif = l3mdev_master_ifindex_rcu(dev);
+	struct ipq *qp;
+
+	skb_orphan(skb);
+
+	/* Lookup (or create) queue header */
+	qp = ip_find(net, ip_hdr(skb), user, vif);
+	if (qp) {
+		int ret;
+
+		spin_lock(&qp->q.lock);
+
+		ret = nf_ct_frag_queue(qp, skb, frag_max_size);
+
+		spin_unlock(&qp->q.lock);
+		ipq_put(qp);
+		return ret;
+	}
+
+	kfree_skb(skb);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(nf_ct_frag_gather);
+
 static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
 				   u_int32_t user)
 {
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH net-next 2/3] netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB clear
  2020-07-05 14:28 [PATCH net-next 0/3] make nf_ct_frag/6_gather elide the skb CB clear wenxu
  2020-07-05 14:28 ` [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support wenxu
@ 2020-07-05 14:28 ` wenxu
  2020-07-05 14:28 ` [PATCH net-next 3/3] net/sched: act_ct: fix clobber qdisc_skb_cb in defrag wenxu
  2 siblings, 0 replies; 9+ messages in thread
From: wenxu @ 2020-07-05 14:28 UTC (permalink / raw)
  To: davem, pablo; +Cc: netdev, netfilter-devel

From: wenxu <wenxu@ucloud.cn>

Make nf_ct_frag6_gather elide the skb CB clear when packets are
defragmented by connection tracking. This lets each subsystem such as
br_netfilter, openvswitch and act_ct do defrag without having to
restore the CB, and avoids serious crashes and problems in the ct
subsystem, because some packet schedulers store pointers in the qdisc
CB private area and access the skb in parallel.

Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 include/linux/netfilter_ipv6.h              |  9 +++++----
 include/net/netfilter/ipv6/nf_defrag_ipv6.h |  3 ++-
 net/bridge/netfilter/nf_conntrack_bridge.c  |  7 ++-----
 net/ipv6/netfilter/nf_conntrack_reasm.c     | 19 ++++++++++++-------
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c   |  3 ++-
 net/openvswitch/conntrack.c                 |  8 +++-----
 net/sched/act_ct.c                          |  3 +--
 7 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index aac42c2..ef9fa28 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -58,7 +58,8 @@ struct nf_ipv6_ops {
 			int (*output)(struct net *, struct sock *, struct sk_buff *));
 	int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry);
 #if IS_MODULE(CONFIG_IPV6)
-	int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user);
+	int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user,
+			 u16 *frag_max_size);
 	int (*br_fragment)(struct net *net, struct sock *sk,
 			   struct sk_buff *skb,
 			   struct nf_bridge_frag_data *data,
@@ -118,7 +119,7 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst,
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
 static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
-				    u32 user)
+				    u32 user, u16 *frag_max_size)
 {
 #if IS_MODULE(CONFIG_IPV6)
 	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
@@ -126,9 +127,9 @@ static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
 	if (!v6_ops)
 		return 1;
 
-	return v6_ops->br_defrag(net, skb, user);
+	return v6_ops->br_defrag(net, skb, user, frag_max_size);
 #elif IS_BUILTIN(CONFIG_IPV6)
-	return nf_ct_frag6_gather(net, skb, user);
+	return nf_ct_frag6_gather(net, skb, user, frag_max_size);
 #else
 	return 1;
 #endif
diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
index 6d31cd0..d83add2 100644
--- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
@@ -9,7 +9,8 @@
 
 int nf_ct_frag6_init(void);
 void nf_ct_frag6_cleanup(void);
-int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user);
+int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb,
+		       u32 user, u16 *frag_max_size);
 
 struct inet_frags_ctl;
 
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 8096732..a11927b 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -170,7 +170,6 @@ static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
 {
 	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 	enum ip_conntrack_info ctinfo;
-	struct br_input_skb_cb cb;
 	const struct nf_conn *ct;
 	int err;
 
@@ -178,15 +177,13 @@ static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
 	if (ct)
 		zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
 
-	br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm));
-
 	err = nf_ipv6_br_defrag(state->net, skb,
-				IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+				IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id,
+				&BR_INPUT_SKB_CB(skb)->frag_max_size);
 	/* queued */
 	if (err == -EINPROGRESS)
 		return NF_STOLEN;
 
-	br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size);
 	return err == 0 ? NF_ACCEPT : NF_DROP;
 }
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index fed9666..ab28390 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -128,7 +128,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
 #endif
 
 static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
-			     struct sk_buff *prev_tail, struct net_device *dev);
+			     struct sk_buff *prev_tail, struct net_device *dev,
+			     u16 *frag_max_size);
 
 static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
 {
@@ -167,7 +168,8 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
 
 
 static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
-			     const struct frag_hdr *fhdr, int nhoff)
+			     const struct frag_hdr *fhdr, int nhoff,
+			     u16 *frag_max_size)
 {
 	unsigned int payload_len;
 	struct net_device *dev;
@@ -286,7 +288,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = nf_ct_frag6_reasm(fq, skb, prev, dev);
+		err = nf_ct_frag6_reasm(fq, skb, prev, dev, frag_max_size);
 		skb->_skb_refdst = orefdst;
 
 		/* After queue has assumed skb ownership, only 0 or
@@ -313,7 +315,8 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
  *	the last and the first frames arrived and all the bits are here.
  */
 static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
-			     struct sk_buff *prev_tail, struct net_device *dev)
+			     struct sk_buff *prev_tail, struct net_device *dev,
+			     u16 *frag_max_size)
 {
 	void *reasm_data;
 	int payload_len;
@@ -354,7 +357,8 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
 	skb->dev = dev;
 	ipv6_hdr(skb)->payload_len = htons(payload_len);
 	ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
-	IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
+	if (frag_max_size)
+		*frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
 
 	/* Yes, and fold redundant checksum back. 8) */
 	if (skb->ip_summed == CHECKSUM_COMPLETE)
@@ -436,7 +440,8 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
 	return 0;
 }
 
-int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
+int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb,
+		       u32 user, u16 *frag_max_size)
 {
 	u16 savethdr = skb->transport_header;
 	int fhoff, nhoff, ret;
@@ -471,7 +476,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 
 	spin_lock_bh(&fq->q.lock);
 
-	ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff);
+	ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, frag_max_size);
 	if (ret == -EPROTO) {
 		skb->transport_header = savethdr;
 		ret = 0;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 6646a87..7d532ed 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -64,7 +64,8 @@ static unsigned int ipv6_defrag(void *priv,
 #endif
 
 	err = nf_ct_frag6_gather(state->net, skb,
-				 nf_ct6_defrag_user(state->hook, skb));
+				 nf_ct6_defrag_user(state->hook, skb),
+				 &IP6CB(skb)->frag_max_size);
 	/* queued */
 	if (err == -EINPROGRESS)
 		return NF_STOLEN;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 4340f25..5984a84 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -493,11 +493,11 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 static int handle_fragments(struct net *net, struct sw_flow_key *key,
 			    u16 zone, struct sk_buff *skb)
 {
-	struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
 	int err;
 
 	if (key->eth.type == htons(ETH_P_IP)) {
 		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+		struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
 
 		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
 		err = ip_defrag(net, skb, user);
@@ -505,12 +505,12 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
 			return err;
 
 		ovs_cb.mru = IPCB(skb)->frag_max_size;
+		*OVS_CB(skb) = ovs_cb;
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
 		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
 
-		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
-		err = nf_ct_frag6_gather(net, skb, user);
+		err = nf_ct_frag6_gather(net, skb, user, &OVS_CB(skb)->mru);
 		if (err) {
 			if (err != -EINPROGRESS)
 				kfree_skb(skb);
@@ -518,7 +518,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
 		}
 
 		key->ip.proto = ipv6_hdr(skb)->nexthdr;
-		ovs_cb.mru = IP6CB(skb)->frag_max_size;
 #endif
 	} else {
 		kfree_skb(skb);
@@ -533,7 +532,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
 	key->ip.frag = OVS_FRAG_TYPE_NONE;
 	skb_clear_hash(skb);
 	skb->ignore_df = 1;
-	*OVS_CB(skb) = ovs_cb;
 
 	return 0;
 }
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 1b9c6d4..20f3d11 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -707,8 +707,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
 
-		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
-		err = nf_ct_frag6_gather(net, skb, user);
+		err = nf_ct_frag6_gather(net, skb, user, NULL);
 		if (err && err != -EINPROGRESS)
 			goto out_free;
 #else
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* [PATCH net-next 3/3] net/sched: act_ct: fix clobber qdisc_skb_cb in defrag
  2020-07-05 14:28 [PATCH net-next 0/3] make nf_ct_frag/6_gather elide the skb CB clear wenxu
  2020-07-05 14:28 ` [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support wenxu
  2020-07-05 14:28 ` [PATCH net-next 2/3] netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB clear wenxu
@ 2020-07-05 14:28 ` wenxu
  2 siblings, 0 replies; 9+ messages in thread
From: wenxu @ 2020-07-05 14:28 UTC (permalink / raw)
  To: davem, pablo; +Cc: netdev, netfilter-devel

From: wenxu <wenxu@ucloud.cn>

Use nf_ct_frag_gather for IPv4 defrag in act_ct so that the skb CB is
no longer cleared. This avoids serious crashes and problems in the ct
subsystem, because some packet schedulers store pointers in the qdisc
CB private area and access the skb in parallel.

Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 net/sched/act_ct.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 20f3d11..75562f4 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -31,6 +31,7 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/nf_conntrack_helper.h>
 #include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/ipv4/nf_defrag_ipv4.h>
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 #include <uapi/linux/netfilter/nf_nat.h>
 
@@ -695,14 +696,18 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 	skb_get(skb);
 
 	if (family == NFPROTO_IPV4) {
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
 
-		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
 		local_bh_disable();
-		err = ip_defrag(net, skb, user);
+		err = nf_ct_frag_gather(net, skb, user, NULL);
 		local_bh_enable();
 		if (err && err != -EINPROGRESS)
 			goto out_free;
+#else
+		err = -EOPNOTSUPP;
+		goto out_free;
+#endif
 	} else { /* NFPROTO_IPV6 */
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support
  2020-07-05 14:28 ` [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support wenxu
@ 2020-07-06  5:40   ` Cong Wang
  2020-07-06 14:38   ` Florian Westphal
  1 sibling, 0 replies; 9+ messages in thread
From: Cong Wang @ 2020-07-06  5:40 UTC (permalink / raw)
  To: wenxu
  Cc: David Miller, Pablo Neira Ayuso, Linux Kernel Network Developers,
	NetFilter

On Sun, Jul 5, 2020 at 7:34 AM <wenxu@ucloud.cn> wrote:
> +static int nf_ct_frag_reinit(struct ipq *qp)
> +{
> +       unsigned int sum_truesize = 0;
> +
> +       if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
> +               refcount_inc(&qp->q.refcnt);
> +               return -ETIMEDOUT;
> +       }
> +
> +       sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
> +       sub_frag_mem_limit(qp->q.fqdir, sum_truesize);
> +
> +       qp->q.flags = 0;
> +       qp->q.len = 0;
> +       qp->q.meat = 0;
> +       qp->q.rb_fragments = RB_ROOT;
> +       qp->q.fragments_tail = NULL;
> +       qp->q.last_run_head = NULL;
> +       qp->iif = 0;
> +       qp->ecn = 0;
> +
> +       return 0;
> +}
> +
> +static struct ipq *ip_find(struct net *net, struct iphdr *iph,
> +                          u32 user, int vif)
> +{
> +       struct frag_v4_compare_key key = {
> +               .saddr = iph->saddr,
> +               .daddr = iph->daddr,
> +               .user = user,
> +               .vif = vif,
> +               .id = iph->id,
> +               .protocol = iph->protocol,
> +       };
> +       struct inet_frag_queue *q;
> +
> +       q = inet_frag_find(net->ipv4.fqdir, &key);
> +       if (!q)
> +               return NULL;
> +
> +       return container_of(q, struct ipq, q);
> +}


Please avoid copy-n-paste code by finding a proper way
to reuse them.

Thanks.

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support
  2020-07-05 14:28 ` [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support wenxu
  2020-07-06  5:40   ` Cong Wang
@ 2020-07-06 14:38   ` Florian Westphal
  2020-07-06 15:08     ` wenxu
  1 sibling, 1 reply; 9+ messages in thread
From: Florian Westphal @ 2020-07-06 14:38 UTC (permalink / raw)
  To: wenxu; +Cc: davem, pablo, netdev, netfilter-devel

wenxu@ucloud.cn <wenxu@ucloud.cn> wrote:
> From: wenxu <wenxu@ucloud.cn>
> 
> Add nf_ct_frag_gather for conntrack defrag; it elides the skb CB
> clear when packets are defragmented by connection tracking.

Why is this patch required?
Can't you rework ip_defrag to avoid the cb clear if you need that?

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support
  2020-07-06 14:38   ` Florian Westphal
@ 2020-07-06 15:08     ` wenxu
  2020-07-06 16:29       ` Florian Westphal
  0 siblings, 1 reply; 9+ messages in thread
From: wenxu @ 2020-07-06 15:08 UTC (permalink / raw)
  To: Florian Westphal; +Cc: davem, pablo, netdev, netfilter-devel


On 2020/7/6 22:38, Florian Westphal wrote:
> wenxu@ucloud.cn <wenxu@ucloud.cn> wrote:
>> From: wenxu <wenxu@ucloud.cn>
>>
>> Add nf_ct_frag_gather for conntrack defrag; it elides the skb CB
>> clear when packets are defragmented by connection tracking.
> Why is this patch required?
> Can't you rework ip_defrag to avoid the cb clear if you need that?

ip_defrag is used by the IP stack and can work with the CB setting.
The conntrack-only defragment case may need to avoid the CB clear, so
it is clearer to add nf_ct_frag_gather for conntrack, like the
nf_ct_frag6_gather function for IPv6.

But this patch should reuse some functions from ip_defrag, as Cong
Wang suggested.

>

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support
  2020-07-06 15:08     ` wenxu
@ 2020-07-06 16:29       ` Florian Westphal
  2020-07-07  4:47         ` wenxu
  0 siblings, 1 reply; 9+ messages in thread
From: Florian Westphal @ 2020-07-06 16:29 UTC (permalink / raw)
  To: wenxu; +Cc: Florian Westphal, davem, pablo, netdev, netfilter-devel

wenxu <wenxu@ucloud.cn> wrote:
> 
> > On 2020/7/6 22:38, Florian Westphal wrote:
> > wenxu@ucloud.cn <wenxu@ucloud.cn> wrote:
> >> From: wenxu <wenxu@ucloud.cn>
> >>
> >> Add nf_ct_frag_gather for conntrack defrag; it elides the skb CB
> >> clear when packets are defragmented by connection tracking.
> > Why is this patch required?
> > Can't you rework ip_defrag to avoid the cb clear if you need that?
> 
> ip_defrag is used by the IP stack and can work with the CB setting.

Yes, but does it have to?

If yes, why does nf_ct_frag not need it whereas ip_defrag has to?

> The conntrack-only defragment case may need to avoid the CB clear, so
> it is clearer to add nf_ct_frag_gather for conntrack, like the
> nf_ct_frag6_gather function for IPv6.

nf_ct_frag6_gather() re-uses less code from ipv6 only for historical
reasons.  If anything, ipv6 conntrack defrag should re-use more code from
ipv6 defrag, rather than making ipv4 conntrack defrag look more like
ipv6 conntrack defrag.


^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support
  2020-07-06 16:29       ` Florian Westphal
@ 2020-07-07  4:47         ` wenxu
  0 siblings, 0 replies; 9+ messages in thread
From: wenxu @ 2020-07-07  4:47 UTC (permalink / raw)
  To: Florian Westphal; +Cc: davem, pablo, netdev, netfilter-devel


On 7/7/2020 12:29 AM, Florian Westphal wrote:
> wenxu <wenxu@ucloud.cn> wrote:
>> On 2020/7/6 22:38, Florian Westphal wrote:
>>> wenxu@ucloud.cn <wenxu@ucloud.cn> wrote:
>>>> From: wenxu <wenxu@ucloud.cn>
>>>>
>>>> Add nf_ct_frag_gather for conntrack defrag; it elides the skb CB
>>>> clear when packets are defragmented by connection tracking.
>>> Why is this patch required?
>>> Can't you rework ip_defrag to avoid the cb clear if you need that?
>> ip_defrag is used by the IP stack and can work with the CB setting.
> Yes, but does it have to?
>
> If yes, why does nf_ct_frag not need it whereas ip_defrag has to?
>
>
Yes, reworking ip_defrag is much better. Thanks.

^ permalink raw reply	[flat|nested] 9+ messages in thread

Thread overview: 9+ messages
2020-07-05 14:28 [PATCH net-next 0/3] make nf_ct_frag/6_gather elide the skb CB clear wenxu
2020-07-05 14:28 ` [PATCH net-next 1/3] netfilter: nf_defrag_ipv4: Add nf_ct_frag_gather support wenxu
2020-07-06  5:40   ` Cong Wang
2020-07-06 14:38   ` Florian Westphal
2020-07-06 15:08     ` wenxu
2020-07-06 16:29       ` Florian Westphal
2020-07-07  4:47         ` wenxu
2020-07-05 14:28 ` [PATCH net-next 2/3] netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB clear wenxu
2020-07-05 14:28 ` [PATCH net-next 3/3] net/sched: act_ct: fix clobber qdisc_skb_cb in defrag wenxu
