netfilter-devel.vger.kernel.org archive mirror
* [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear
@ 2020-07-07  4:55 wenxu
  2020-07-07  4:55 ` [PATCH net-next v2 1/3] net: ip_fragment: Add ip_defrag_ignore_cb support wenxu
                   ` (3 more replies)
  0 siblings, 4 replies; 8+ messages in thread
From: wenxu @ 2020-07-07  4:55 UTC (permalink / raw)
  To: netdev; +Cc: netfilter-devel, fw

From: wenxu <wenxu@ucloud.cn>

Add nf_ct_frag_gather and make nf_ct_frag6_gather elide the CB clear
when packets are defragmented by connection tracking. This lets each
subsystem, such as br_netfilter, openvswitch and act_ct, do defrag
without having to save and restore the CB.
It also avoids serious crashes and problems in the ct subsystem,
because some packet schedulers store pointers in the qdisc CB private
area and access the SKB in parallel.
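
To make the motivation concrete, here is a hedged illustration (not code
from these patches; net, skb, user and err stand for a caller's existing
variables) of why the clear is harmful when defrag runs under the qdisc
layer -- IPCB() and qdisc_skb_cb() both overlay skb->cb[]:

	/* the memset a defrag caller does today also wipes whatever the
	 * packet scheduler stored in the very same skb->cb[] bytes */
	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
	err = ip_defrag(net, skb, user);	/* writes IPCB(skb)->frag_max_size */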

This series follows up on
http://patchwork.ozlabs.org/project/netdev/patch/1593422178-26949-1-git-send-email-wenxu@ucloud.cn/

patch1: add ip_defrag_ignore_cb so IPv4 defrag can elide the CB clear
patch2: make nf_ct_frag6_gather elide the CB clear
patch3: fix clobbering of qdisc_skb_cb in act_ct when doing defrag

v2: reuse some ip_defrag functions in patch1

wenxu (3):
  net: ip_fragment: Add ip_defrag_ignore_cb support
  netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB
    clear
  net/sched: act_ct: fix clobber qdisc_skb_cb in defrag

 include/linux/netfilter_ipv6.h              |  9 ++---
 include/net/ip.h                            |  2 ++
 include/net/netfilter/ipv6/nf_defrag_ipv6.h |  3 +-
 net/bridge/netfilter/nf_conntrack_bridge.c  |  7 ++--
 net/ipv4/ip_fragment.c                      | 55 ++++++++++++++++++++++++-----
 net/ipv6/netfilter/nf_conntrack_reasm.c     | 19 ++++++----
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c   |  3 +-
 net/openvswitch/conntrack.c                 |  8 ++---
 net/sched/act_ct.c                          |  8 ++---
 9 files changed, 77 insertions(+), 37 deletions(-)

-- 
1.8.3.1



* [PATCH net-next v2 1/3] net: ip_fragment: Add ip_defrag_ignore_cb support
  2020-07-07  4:55 [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear wenxu
@ 2020-07-07  4:55 ` wenxu
  2020-07-07  4:55 ` [PATCH net-next v2 2/3] netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB clear wenxu
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 8+ messages in thread
From: wenxu @ 2020-07-07  4:55 UTC (permalink / raw)
  To: netdev; +Cc: netfilter-devel, fw

From: wenxu <wenxu@ucloud.cn>

Add ip_defrag_ignore_cb for conntrack defrag; it elides the CB clear
when packets are defragmented by connection tracking.
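
A minimal usage sketch (assumed caller-side code, not part of this patch;
zone and the drop label are placeholders). frag_max_size may be NULL when
the caller has no use for the reassembled size:

	u16 mru = 0;

	err = ip_defrag_ignore_cb(net, skb, IP_DEFRAG_CONNTRACK_IN + zone, &mru);
	if (err && err != -EINPROGRESS)	/* -EINPROGRESS: fragment was queued */
		goto drop;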

Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 include/net/ip.h       |  2 ++
 net/ipv4/ip_fragment.c | 55 ++++++++++++++++++++++++++++++++++++++++++--------
 2 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 862c954..31779a5 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -671,6 +671,8 @@ static inline bool ip_defrag_user_in_between(u32 user,
 }
 
 int ip_defrag(struct net *net, struct sk_buff *skb, u32 user);
+int ip_defrag_ignore_cb(struct net *net, struct sk_buff *skb,
+			u32 user, u16 *frag_max_size);
 #ifdef CONFIG_INET
 struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user);
 #else
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cfeb889..afc2b3d 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -76,7 +76,8 @@ static u8 ip4_frag_ecn(u8 tos)
 static struct inet_frags ip4_frags;
 
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
-			 struct sk_buff *prev_tail, struct net_device *dev);
+			 struct sk_buff *prev_tail, struct net_device *dev,
+			 bool ignore_skb_cb, u16 *frag_max_size);
 
 
 static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
@@ -269,7 +270,8 @@ static int ip_frag_reinit(struct ipq *qp)
 }
 
 /* Add new segment to existing queue. */
-static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb,
+			 bool ignore_skb_cb, u16 *frag_max_size)
 {
 	struct net *net = qp->q.fqdir->net;
 	int ihl, end, flags, offset;
@@ -282,7 +284,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 	if (qp->q.flags & INET_FRAG_COMPLETE)
 		goto err;
 
-	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+	if ((ignore_skb_cb || !(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE)) &&
 	    unlikely(ip_frag_too_far(qp)) &&
 	    unlikely(err = ip_frag_reinit(qp))) {
 		ipq_kill(qp);
@@ -368,7 +370,8 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = ip_frag_reasm(qp, skb, prev_tail, dev);
+		err = ip_frag_reasm(qp, skb, prev_tail, dev, ignore_skb_cb,
+				    frag_max_size);
 		skb->_skb_refdst = orefdst;
 		if (err)
 			inet_frag_kill(&qp->q);
@@ -400,7 +403,8 @@ static bool ip_frag_coalesce_ok(const struct ipq *qp)
 
 /* Build a new IP datagram from all its fragments. */
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
-			 struct sk_buff *prev_tail, struct net_device *dev)
+			 struct sk_buff *prev_tail, struct net_device *dev,
+			 bool ignore_skb_cb, u16 *frag_max_size)
 {
 	struct net *net = qp->q.fqdir->net;
 	struct iphdr *iph;
@@ -430,7 +434,10 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 			       ip_frag_coalesce_ok(qp));
 
 	skb->dev = dev;
-	IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
+	if (!ignore_skb_cb)
+		IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
+	else if (frag_max_size)
+		*frag_max_size = max(qp->max_df_size, qp->q.max_size);
 
 	iph = ip_hdr(skb);
 	iph->tot_len = htons(len);
@@ -445,7 +452,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 	 * from one very small df-fragment and one large non-df frag.
 	 */
 	if (qp->max_df_size == qp->q.max_size) {
-		IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
+		if (!ignore_skb_cb)
+			IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
 		iph->frag_off = htons(IP_DF);
 	} else {
 		iph->frag_off = 0;
@@ -487,7 +495,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
 
 		spin_lock(&qp->q.lock);
 
-		ret = ip_frag_queue(qp, skb);
+		ret = ip_frag_queue(qp, skb, false, NULL);
 
 		spin_unlock(&qp->q.lock);
 		ipq_put(qp);
@@ -500,6 +508,37 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
 }
 EXPORT_SYMBOL(ip_defrag);
 
+/* Process an incoming IP datagram fragment. */
+int ip_defrag_ignore_cb(struct net *net, struct sk_buff *skb,
+			u32 user, u16 *frag_max_size)
+{
+	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
+	int vif = l3mdev_master_ifindex_rcu(dev);
+	struct ipq *qp;
+
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
+	skb_orphan(skb);
+
+	/* Lookup (or create) queue header */
+	qp = ip_find(net, ip_hdr(skb), user, vif);
+	if (qp) {
+		int ret;
+
+		spin_lock_bh(&qp->q.lock);
+
+		ret = ip_frag_queue(qp, skb, true, frag_max_size);
+
+		spin_unlock_bh(&qp->q.lock);
+		ipq_put(qp);
+		return ret;
+	}
+
+	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
+	kfree_skb(skb);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(ip_defrag_ignore_cb);
+
 struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
 {
 	struct iphdr iph;
-- 
1.8.3.1



* [PATCH net-next v2 2/3] netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB clear
  2020-07-07  4:55 [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear wenxu
  2020-07-07  4:55 ` [PATCH net-next v2 1/3] net: ip_fragment: Add ip_defrag_ignore_cb support wenxu
@ 2020-07-07  4:55 ` wenxu
  2020-07-07  4:55 ` [PATCH net-next v2 3/3] net/sched: act_ct: fix clobber qdisc_skb_cb in defrag wenxu
  2020-07-15 20:26 ` [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear Jakub Kicinski
  3 siblings, 0 replies; 8+ messages in thread
From: wenxu @ 2020-07-07  4:55 UTC (permalink / raw)
  To: netdev; +Cc: netfilter-devel, fw

From: wenxu <wenxu@ucloud.cn>

Make nf_ct_frag6_gather elide the CB clear when packets are defragmented
by connection tracking. This lets each subsystem, such as br_netfilter,
openvswitch and act_ct, do defrag without having to save and restore the
CB, and avoids serious crashes and problems in the ct subsystem, because
some packet schedulers store pointers in the qdisc CB private area and
access the SKB in parallel.
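
For comparison, a hedged before/after sketch of an IPv6 caller (the
variable names are the caller's own, as in the openvswitch hunk below,
not new API):

	/* before: IP6CB must be cleared, then read back after reassembly */
	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
	err = nf_ct_frag6_gather(net, skb, user);
	mru = IP6CB(skb)->frag_max_size;

	/* after: pass a pointer (or NULL), the skb CB is left untouched */
	err = nf_ct_frag6_gather(net, skb, user, &mru);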

Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 include/linux/netfilter_ipv6.h              |  9 +++++----
 include/net/netfilter/ipv6/nf_defrag_ipv6.h |  3 ++-
 net/bridge/netfilter/nf_conntrack_bridge.c  |  7 ++-----
 net/ipv6/netfilter/nf_conntrack_reasm.c     | 19 ++++++++++++-------
 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c   |  3 ++-
 net/openvswitch/conntrack.c                 |  8 +++-----
 net/sched/act_ct.c                          |  3 +--
 7 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index aac42c2..ef9fa28 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -58,7 +58,8 @@ struct nf_ipv6_ops {
 			int (*output)(struct net *, struct sock *, struct sk_buff *));
 	int (*reroute)(struct sk_buff *skb, const struct nf_queue_entry *entry);
 #if IS_MODULE(CONFIG_IPV6)
-	int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user);
+	int (*br_defrag)(struct net *net, struct sk_buff *skb, u32 user,
+			 u16 *frag_max_size);
 	int (*br_fragment)(struct net *net, struct sock *sk,
 			   struct sk_buff *skb,
 			   struct nf_bridge_frag_data *data,
@@ -118,7 +119,7 @@ static inline int nf_ip6_route(struct net *net, struct dst_entry **dst,
 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
 
 static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
-				    u32 user)
+				    u32 user, u16 *frag_max_size)
 {
 #if IS_MODULE(CONFIG_IPV6)
 	const struct nf_ipv6_ops *v6_ops = nf_get_ipv6_ops();
@@ -126,9 +127,9 @@ static inline int nf_ipv6_br_defrag(struct net *net, struct sk_buff *skb,
 	if (!v6_ops)
 		return 1;
 
-	return v6_ops->br_defrag(net, skb, user);
+	return v6_ops->br_defrag(net, skb, user, frag_max_size);
 #elif IS_BUILTIN(CONFIG_IPV6)
-	return nf_ct_frag6_gather(net, skb, user);
+	return nf_ct_frag6_gather(net, skb, user, frag_max_size);
 #else
 	return 1;
 #endif
diff --git a/include/net/netfilter/ipv6/nf_defrag_ipv6.h b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
index 6d31cd0..d83add2 100644
--- a/include/net/netfilter/ipv6/nf_defrag_ipv6.h
+++ b/include/net/netfilter/ipv6/nf_defrag_ipv6.h
@@ -9,7 +9,8 @@
 
 int nf_ct_frag6_init(void);
 void nf_ct_frag6_cleanup(void);
-int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user);
+int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb,
+		       u32 user, u16 *frag_max_size);
 
 struct inet_frags_ctl;
 
diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
index 8096732..a11927b 100644
--- a/net/bridge/netfilter/nf_conntrack_bridge.c
+++ b/net/bridge/netfilter/nf_conntrack_bridge.c
@@ -170,7 +170,6 @@ static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
 {
 	u16 zone_id = NF_CT_DEFAULT_ZONE_ID;
 	enum ip_conntrack_info ctinfo;
-	struct br_input_skb_cb cb;
 	const struct nf_conn *ct;
 	int err;
 
@@ -178,15 +177,13 @@ static unsigned int nf_ct_br_defrag6(struct sk_buff *skb,
 	if (ct)
 		zone_id = nf_ct_zone_id(nf_ct_zone(ct), CTINFO2DIR(ctinfo));
 
-	br_skb_cb_save(skb, &cb, sizeof(struct inet6_skb_parm));
-
 	err = nf_ipv6_br_defrag(state->net, skb,
-				IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id);
+				IP_DEFRAG_CONNTRACK_BRIDGE_IN + zone_id,
+				&BR_INPUT_SKB_CB(skb)->frag_max_size);
 	/* queued */
 	if (err == -EINPROGRESS)
 		return NF_STOLEN;
 
-	br_skb_cb_restore(skb, &cb, IP6CB(skb)->frag_max_size);
 	return err == 0 ? NF_ACCEPT : NF_DROP;
 }
 
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index fed9666..ab28390 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -128,7 +128,8 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net)
 #endif
 
 static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
-			     struct sk_buff *prev_tail, struct net_device *dev);
+			     struct sk_buff *prev_tail, struct net_device *dev,
+			     u16 *frag_max_size);
 
 static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
 {
@@ -167,7 +168,8 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
 
 
 static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
-			     const struct frag_hdr *fhdr, int nhoff)
+			     const struct frag_hdr *fhdr, int nhoff,
+			     u16 *frag_max_size)
 {
 	unsigned int payload_len;
 	struct net_device *dev;
@@ -286,7 +288,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = nf_ct_frag6_reasm(fq, skb, prev, dev);
+		err = nf_ct_frag6_reasm(fq, skb, prev, dev, frag_max_size);
 		skb->_skb_refdst = orefdst;
 
 		/* After queue has assumed skb ownership, only 0 or
@@ -313,7 +315,8 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
  *	the last and the first frames arrived and all the bits are here.
  */
 static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
-			     struct sk_buff *prev_tail, struct net_device *dev)
+			     struct sk_buff *prev_tail, struct net_device *dev,
+			     u16 *frag_max_size)
 {
 	void *reasm_data;
 	int payload_len;
@@ -354,7 +357,8 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
 	skb->dev = dev;
 	ipv6_hdr(skb)->payload_len = htons(payload_len);
 	ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn);
-	IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
+	if (frag_max_size)
+		*frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size;
 
 	/* Yes, and fold redundant checksum back. 8) */
 	if (skb->ip_summed == CHECKSUM_COMPLETE)
@@ -436,7 +440,8 @@ static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb,
 	return 0;
 }
 
-int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
+int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb,
+		       u32 user, u16 *frag_max_size)
 {
 	u16 savethdr = skb->transport_header;
 	int fhoff, nhoff, ret;
@@ -471,7 +476,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 
 	spin_lock_bh(&fq->q.lock);
 
-	ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff);
+	ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff, frag_max_size);
 	if (ret == -EPROTO) {
 		skb->transport_header = savethdr;
 		ret = 0;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 6646a87..7d532ed 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -64,7 +64,8 @@ static unsigned int ipv6_defrag(void *priv,
 #endif
 
 	err = nf_ct_frag6_gather(state->net, skb,
-				 nf_ct6_defrag_user(state->hook, skb));
+				 nf_ct6_defrag_user(state->hook, skb),
+				 &IP6CB(skb)->frag_max_size);
 	/* queued */
 	if (err == -EINPROGRESS)
 		return NF_STOLEN;
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 4340f25..5984a84 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -493,11 +493,11 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
 static int handle_fragments(struct net *net, struct sw_flow_key *key,
 			    u16 zone, struct sk_buff *skb)
 {
-	struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
 	int err;
 
 	if (key->eth.type == htons(ETH_P_IP)) {
 		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
+		struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
 
 		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
 		err = ip_defrag(net, skb, user);
@@ -505,12 +505,12 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
 			return err;
 
 		ovs_cb.mru = IPCB(skb)->frag_max_size;
+		*OVS_CB(skb) = ovs_cb;
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 	} else if (key->eth.type == htons(ETH_P_IPV6)) {
 		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
 
-		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
-		err = nf_ct_frag6_gather(net, skb, user);
+		err = nf_ct_frag6_gather(net, skb, user, &OVS_CB(skb)->mru);
 		if (err) {
 			if (err != -EINPROGRESS)
 				kfree_skb(skb);
@@ -518,7 +518,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
 		}
 
 		key->ip.proto = ipv6_hdr(skb)->nexthdr;
-		ovs_cb.mru = IP6CB(skb)->frag_max_size;
 #endif
 	} else {
 		kfree_skb(skb);
@@ -533,7 +532,6 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
 	key->ip.frag = OVS_FRAG_TYPE_NONE;
 	skb_clear_hash(skb);
 	skb->ignore_df = 1;
-	*OVS_CB(skb) = ovs_cb;
 
 	return 0;
 }
diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 1b9c6d4..20f3d11 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -707,8 +707,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 #if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
 
-		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
-		err = nf_ct_frag6_gather(net, skb, user);
+		err = nf_ct_frag6_gather(net, skb, user, NULL);
 		if (err && err != -EINPROGRESS)
 			goto out_free;
 #else
-- 
1.8.3.1



* [PATCH net-next v2 3/3] net/sched: act_ct: fix clobber qdisc_skb_cb in defrag
  2020-07-07  4:55 [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear wenxu
  2020-07-07  4:55 ` [PATCH net-next v2 1/3] net: ip_fragment: Add ip_defrag_ignore_cb support wenxu
  2020-07-07  4:55 ` [PATCH net-next v2 2/3] netfilter: nf_conntrack_reasm: make nf_ct_frag6_gather elide the CB clear wenxu
@ 2020-07-07  4:55 ` wenxu
  2020-07-15 20:26 ` [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear Jakub Kicinski
  3 siblings, 0 replies; 8+ messages in thread
From: wenxu @ 2020-07-07  4:55 UTC (permalink / raw)
  To: netdev; +Cc: netfilter-devel, fw

From: wenxu <wenxu@ucloud.cn>

Use ip_defrag_ignore_cb to defrag in act_ct so that the CB clear is
elided. This avoids serious crashes and problems in the ct subsystem,
because some packet schedulers store pointers in the qdisc CB private
area and access the SKB in parallel.
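
A hedged summary of what the hunk below reduces to (names follow the
existing act_ct code): the caller-side BH handling can go as well, since
ip_defrag_ignore_cb from patch 1 serialises on qp->q.lock with
spin_lock_bh()/spin_unlock_bh() itself:

	/* after this patch: no CB clear, no local_bh_disable() at the call site */
	err = ip_defrag_ignore_cb(net, skb, user, NULL);
	if (err && err != -EINPROGRESS)
		goto out_free;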

Fixes: b57dc7c13ea9 ("net/sched: Introduce action ct")
Signed-off-by: wenxu <wenxu@ucloud.cn>
---
 net/sched/act_ct.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c
index 20f3d11..a8e9e62 100644
--- a/net/sched/act_ct.c
+++ b/net/sched/act_ct.c
@@ -697,10 +697,7 @@ static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 	if (family == NFPROTO_IPV4) {
 		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
 
-		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
-		local_bh_disable();
-		err = ip_defrag(net, skb, user);
-		local_bh_enable();
+		err = ip_defrag_ignore_cb(net, skb, user, NULL);
 		if (err && err != -EINPROGRESS)
 			goto out_free;
 	} else { /* NFPROTO_IPV6 */
-- 
1.8.3.1



* Re: [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear
  2020-07-07  4:55 [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear wenxu
                   ` (2 preceding siblings ...)
  2020-07-07  4:55 ` [PATCH net-next v2 3/3] net/sched: act_ct: fix clobber qdisc_skb_cb in defrag wenxu
@ 2020-07-15 20:26 ` Jakub Kicinski
  2020-07-15 21:17   ` Florian Westphal
  3 siblings, 1 reply; 8+ messages in thread
From: Jakub Kicinski @ 2020-07-15 20:26 UTC (permalink / raw)
  To: wenxu; +Cc: netdev, netfilter-devel, fw, Cong Wang

On Tue,  7 Jul 2020 12:55:08 +0800 wenxu@ucloud.cn wrote:
> From: wenxu <wenxu@ucloud.cn>
> 
> Add nf_ct_frag_gather and Make nf_ct_frag6_gather elide the CB clear 
> when packets are defragmented by connection tracking. This can make
> each subsystem such as br_netfilter, openvswitch, act_ct do defrag
> without restore the CB. 
> This also avoid serious crashes and problems in  ct subsystem.
> Because Some packet schedulers store pointers in the qdisc CB private
> area and parallel accesses to the SKB.
> 
> This series following up
> http://patchwork.ozlabs.org/project/netdev/patch/1593422178-26949-1-git-send-email-wenxu@ucloud.cn/
> 
> patch1: add nf_ct_frag_gather elide the CB clear
> patch2: make nf_ct_frag6_gather elide the CB clear
> patch3: fix clobber qdisc_skb_cb in act_ct with defrag
> 
> v2: resue some ip_defrag function in patch1

Florian, Cong - are you willing to venture an ack on these? Anyone?


* Re: [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear
  2020-07-15 20:26 ` [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear Jakub Kicinski
@ 2020-07-15 21:17   ` Florian Westphal
  2020-07-16  2:42     ` wenxu
  2020-07-17 15:32     ` Cong Wang
  0 siblings, 2 replies; 8+ messages in thread
From: Florian Westphal @ 2020-07-15 21:17 UTC (permalink / raw)
  To: Jakub Kicinski; +Cc: wenxu, netdev, netfilter-devel, fw, Cong Wang

Jakub Kicinski <kuba@kernel.org> wrote:
> On Tue,  7 Jul 2020 12:55:08 +0800 wenxu@ucloud.cn wrote:
> > From: wenxu <wenxu@ucloud.cn>
> > 
> > Add nf_ct_frag_gather and Make nf_ct_frag6_gather elide the CB clear 
> > when packets are defragmented by connection tracking. This can make
> > each subsystem such as br_netfilter, openvswitch, act_ct do defrag
> > without restore the CB. 
> > This also avoid serious crashes and problems in  ct subsystem.
> > Because Some packet schedulers store pointers in the qdisc CB private
> > area and parallel accesses to the SKB.
> > 
> > This series following up
> > http://patchwork.ozlabs.org/project/netdev/patch/1593422178-26949-1-git-send-email-wenxu@ucloud.cn/
> > 
> > patch1: add nf_ct_frag_gather elide the CB clear
> > patch2: make nf_ct_frag6_gather elide the CB clear
> > patch3: fix clobber qdisc_skb_cb in act_ct with defrag
> > 
> > v2: resue some ip_defrag function in patch1
> 
> Florian, Cong - are you willing to venture an ack on these? Anyone?

Nope, sorry.  Reason is that I can't figure out the need for this series.
Taking a huge step back:

http://patchwork.ozlabs.org/project/netdev/patch/1593422178-26949-1-git-send-email-wenxu@ucloud.cn/

That patch looks ok to me:
I understand the problem statement/commit message and I can see how it's addressed.

I don't understand why the CB clearing must be avoided.

defrag assumes skb ownership -- e.g. it may realloc skb->data
(calls pskb_may_pull), it calls skb_orphan(), etc.

AFAICS, tcf_classify makes the same assumption -- exclusive ownership
and no parallel skb accesses.

So, if in fact the "only" problem is the loss of
qdisc_skb_cb(skb)->pkt_len, then the other patch looks ok to me.

If we indeed have parallel access, then I do not understand how
avoiding the memsets in the defrag path makes things any better
(see above wrt. skb pull and the like).

As for these patches here:

- if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
+ if ((ignore_skb_cb || !(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE)) &&

This is very questionable; we take a different code path depending
on the call site.

Why is it okay to unconditionally take this branch for the act_ct case (ignore_skb_cb set)?


* Re: [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear
  2020-07-15 21:17   ` Florian Westphal
@ 2020-07-16  2:42     ` wenxu
  2020-07-17 15:32     ` Cong Wang
  1 sibling, 0 replies; 8+ messages in thread
From: wenxu @ 2020-07-16  2:42 UTC (permalink / raw)
  To: Florian Westphal, David S. Miller
  Cc: Jakub Kicinski, netdev, netfilter-devel, Cong Wang


On 7/16/2020 5:17 AM, Florian Westphal wrote:
> Jakub Kicinski <kuba@kernel.org> wrote:
>> On Tue,  7 Jul 2020 12:55:08 +0800 wenxu@ucloud.cn wrote:
>>> From: wenxu <wenxu@ucloud.cn>
>>>
>>> Add nf_ct_frag_gather and Make nf_ct_frag6_gather elide the CB clear 
>>> when packets are defragmented by connection tracking. This can make
>>> each subsystem such as br_netfilter, openvswitch, act_ct do defrag
>>> without restore the CB. 
>>> This also avoid serious crashes and problems in  ct subsystem.
>>> Because Some packet schedulers store pointers in the qdisc CB private
>>> area and parallel accesses to the SKB.
>>>
>>> This series following up
>>> http://patchwork.ozlabs.org/project/netdev/patch/1593422178-26949-1-git-send-email-wenxu@ucloud.cn/
>>>
>>> patch1: add nf_ct_frag_gather elide the CB clear
>>> patch2: make nf_ct_frag6_gather elide the CB clear
>>> patch3: fix clobber qdisc_skb_cb in act_ct with defrag
>>>
>>> v2: resue some ip_defrag function in patch1
>> Florian, Cong - are you willing to venture an ack on these? Anyone?
> Nope, sorry.  Reason is that I can't figure out the need for this series.
> Taking a huge step back:
>
> http://patchwork.ozlabs.org/project/netdev/patch/1593422178-26949-1-git-send-email-wenxu@ucloud.cn/
>
> That patch looks ok to me:
> I understand the problem statement/commit message and I can see how its addressed.
>
> I don't understand why the CB clearing must be avoided.
>
> defrag assumes skb ownership -- e.g. it may realloc skb->data
> (calls pskb_may_pull), it calls skb_orphan(), etc.
>
> AFAICS, tcf_classify makes same assumption -- exclusive ownership
> and no parallel skb accesses.
>
> So, if in fact the "only" problem is the loss of
> qdisc_skb_cb(skb)->pkt_len, then the other patch looks ok to me.
>
> If we indeed have parallel access, then I do not understand how
> avoiding the memsets in the defrag path makes things any better
> (see above wrt. skb pull and the like).

Hi David,

What case is there for parallel access to the skb in tcf_classify?

If there is indeed such a case, maybe it cannot do defrag either, since
defrag also accesses and modifies the skb?

BR

wenxu


* Re: [PATCH net-next v2 0/3] make nf_ct_frag/6_gather elide the skb CB clear
  2020-07-15 21:17   ` Florian Westphal
  2020-07-16  2:42     ` wenxu
@ 2020-07-17 15:32     ` Cong Wang
  1 sibling, 0 replies; 8+ messages in thread
From: Cong Wang @ 2020-07-17 15:32 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Jakub Kicinski, wenxu, Linux Kernel Network Developers, NetFilter

On Wed, Jul 15, 2020 at 2:17 PM Florian Westphal <fw@strlen.de> wrote:
>
> Jakub Kicinski <kuba@kernel.org> wrote:
> > On Tue,  7 Jul 2020 12:55:08 +0800 wenxu@ucloud.cn wrote:
> > > From: wenxu <wenxu@ucloud.cn>
> > >
> > > Add nf_ct_frag_gather and Make nf_ct_frag6_gather elide the CB clear
> > > when packets are defragmented by connection tracking. This can make
> > > each subsystem such as br_netfilter, openvswitch, act_ct do defrag
> > > without restore the CB.
> > > This also avoid serious crashes and problems in  ct subsystem.
> > > Because Some packet schedulers store pointers in the qdisc CB private
> > > area and parallel accesses to the SKB.
> > >
> > > This series following up
> > > http://patchwork.ozlabs.org/project/netdev/patch/1593422178-26949-1-git-send-email-wenxu@ucloud.cn/
> > >
> > > patch1: add nf_ct_frag_gather elide the CB clear
> > > patch2: make nf_ct_frag6_gather elide the CB clear
> > > patch3: fix clobber qdisc_skb_cb in act_ct with defrag
> > >
> > > v2: resue some ip_defrag function in patch1
> >
> > Florian, Cong - are you willing to venture an ack on these? Anyone?
>
> Nope, sorry.  Reason is that I can't figure out the need for this series.
> Taking a huge step back:
>
> http://patchwork.ozlabs.org/project/netdev/patch/1593422178-26949-1-git-send-email-wenxu@ucloud.cn/
>
> That patch looks ok to me:
> I understand the problem statement/commit message and I can see how its addressed.
>
> I don't understand why the CB clearing must be avoided.
>
> defrag assumes skb ownership -- e.g. it may realloc skb->data
> (calls pskb_may_pull), it calls skb_orphan(), etc.
>
> AFAICS, tcf_classify makes same assumption -- exclusive ownership
> and no parallel skb accesses.
>
> So, if in fact the "only" problem is the loss of
> qdisc_skb_cb(skb)->pkt_len, then the other patch looks ok to me.
>
> If we indeed have parallel access, then I do not understand how
> avoiding the memsets in the defrag path makes things any better
> (see above wrt. skb pull and the like).

+1

I don't see parallel access here either. The skb can be cloned for a packet
socket or by act_mirred, but its CB is cloned at the same time.

Thanks.

