netdev.vger.kernel.org archive mirror
* [PATCH net-next 1/6] ipv6: frag: remove unused field
@ 2018-03-30  5:22 Eric Dumazet
  2018-03-30  5:22 ` [PATCH net-next 2/6] inet: frags: change inet_frags_init_net() return value Eric Dumazet
                   ` (4 more replies)
  0 siblings, 5 replies; 11+ messages in thread
From: Eric Dumazet @ 2018-03-30  5:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Florian Westphal, Herbert Xu, Thomas Graf,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	Eric Dumazet, Eric Dumazet

The csum field in struct frag_queue is not used; remove it.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/ipv6.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 50a6f0ddb8780f6c9169f4ae0b3b35af2d66cd4b..5c18836672e9d1c560cdce15f5b34928c337abfd 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -603,7 +603,6 @@ struct frag_queue {
 	struct in6_addr		daddr;
 
 	int			iif;
-	unsigned int		csum;
 	__u16			nhoffset;
 	u8			ecn;
 };
-- 
2.17.0.rc1.321.gba9d0f2565-goog


* [PATCH net-next 2/6] inet: frags: change inet_frags_init_net() return value
  2018-03-30  5:22 [PATCH net-next 1/6] ipv6: frag: remove unused field Eric Dumazet
@ 2018-03-30  5:22 ` Eric Dumazet
  2018-03-30  5:22 ` [PATCH net-next 3/6] inet: frags: add a pointer to struct netns_frags Eric Dumazet
                   ` (3 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2018-03-30  5:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Florian Westphal, Herbert Xu, Thomas Graf,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	Eric Dumazet, Eric Dumazet

We will soon initialize one rhashtable per struct netns_frags
in inet_frags_init_net().

This patch changes the return value so that an error can eventually be
propagated to callers.
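
Every per-netns init function then follows the same unwind pattern; here
is a minimal sketch (with hypothetical example_* names, not part of this
patch):

static int __net_init example_frags_init_net(struct net *net)
{
	struct netns_frags *nf = example_frags_ns(net);	/* hypothetical accessor */
	int res;

	res = inet_frags_init_net(nf);
	if (res < 0)		/* becomes possible once a rhashtable is allocated */
		return res;

	res = example_ns_sysctl_register(net);	/* hypothetical sysctl registration */
	if (res < 0)
		inet_frags_exit_net(nf, &example_frags);	/* unwind the successful init */
	return res;
}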

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/inet_frag.h                 |  3 ++-
 net/ieee802154/6lowpan/reassembly.c     | 11 ++++++++---
 net/ipv4/ip_fragment.c                  | 12 +++++++++---
 net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +++++++++---
 net/ipv6/reassembly.c                   | 11 +++++++++--
 5 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 351f0c3cdcd9df16078a40f07963fb605eeaa882..b1d62176f3b4fcf100bd263e8eae0db656a3d9b6 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -104,9 +104,10 @@ struct inet_frags {
 int inet_frags_init(struct inet_frags *);
 void inet_frags_fini(struct inet_frags *);
 
-static inline void inet_frags_init_net(struct netns_frags *nf)
+static inline int inet_frags_init_net(struct netns_frags *nf)
 {
 	atomic_set(&nf->mem, 0);
+	return 0;
 }
 void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
 
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 85bf86ad6b1801066a4252af18b5b511070a9e08..2aaab4bba42961647a4d3d1c0b8497917d5065ce 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -581,14 +581,19 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 {
 	struct netns_ieee802154_lowpan *ieee802154_lowpan =
 		net_ieee802154_lowpan(net);
+	int res;
 
 	ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
 
-	inet_frags_init_net(&ieee802154_lowpan->frags);
-
-	return lowpan_frags_ns_sysctl_register(net);
+	res = inet_frags_init_net(&ieee802154_lowpan->frags);
+	if (res < 0)
+		return res;
+	res = lowpan_frags_ns_sysctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+	return res;
 }
 
 static void __net_exit lowpan_frags_exit_net(struct net *net)
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index bbf1b94942c0ed53d1ddb87e4ee63833c08f2684..e0b39d4ecbd411ff4bb72d1ed973e45bd6da9ef1 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -846,6 +846,8 @@ static void __init ip4_frags_ctl_register(void)
 
 static int __net_init ipv4_frags_init_net(struct net *net)
 {
+	int res;
+
 	/* Fragment cache limits.
 	 *
 	 * The fragment memory accounting code, (tries to) account for
@@ -871,9 +873,13 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 
 	net->ipv4.frags.max_dist = 64;
 
-	inet_frags_init_net(&net->ipv4.frags);
-
-	return ip4_frags_ns_ctl_register(net);
+	res = inet_frags_init_net(&net->ipv4.frags);
+	if (res < 0)
+		return res;
+	res = ip4_frags_ns_ctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+	return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index b84ce3e6d728a5b8af65b91faf42ec640ff03910..6ff41569134ae36809a8b42d8e46d50d19ffde53 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -629,12 +629,18 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
 
 static int nf_ct_net_init(struct net *net)
 {
+	int res;
+
 	net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
-	inet_frags_init_net(&net->nf_frag.frags);
-
-	return nf_ct_frag6_sysctl_register(net);
+	res = inet_frags_init_net(&net->nf_frag.frags);
+	if (res < 0)
+		return res;
+	res = nf_ct_frag6_sysctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+	return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 08a139f14d0f6fa8ca326088cce1144411e09bf5..a8f7a5f0251a7af0b14cc6de5006b924d9d05672 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -711,13 +711,20 @@ static void ip6_frags_sysctl_unregister(void)
 
 static int __net_init ipv6_frags_init_net(struct net *net)
 {
+	int res;
+
 	net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
 
-	inet_frags_init_net(&net->ipv6.frags);
+	res = inet_frags_init_net(&net->ipv6.frags);
+	if (res < 0)
+		return res;
 
-	return ip6_frags_ns_sysctl_register(net);
+	res = ip6_frags_ns_sysctl_register(net);
+	if (res < 0)
+		inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+	return res;
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
-- 
2.17.0.rc1.321.gba9d0f2565-goog


* [PATCH net-next 3/6] inet: frags: add a pointer to struct netns_frags
  2018-03-30  5:22 [PATCH net-next 1/6] ipv6: frag: remove unused field Eric Dumazet
  2018-03-30  5:22 ` [PATCH net-next 2/6] inet: frags: change inet_frags_init_net() return value Eric Dumazet
@ 2018-03-30  5:22 ` Eric Dumazet
  2018-03-30  5:22 ` [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units Eric Dumazet
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2018-03-30  5:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Florian Westphal, Herbert Xu, Thomas Graf,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	Eric Dumazet, Eric Dumazet

In order to simplify the API, add a pointer to struct inet_frags in
struct netns_frags, so that the needed struct inet_frags can always be
found from q->net->f.

These functions no longer need a struct inet_frags parameter (a short
sketch follows the list):

inet_frag_destroy(struct inet_frag_queue *q  /*, struct inet_frags *f */)
inet_frag_put(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frag_kill(struct inet_frag_queue *q /*, struct inet_frags *f */)
inet_frags_exit_net(struct netns_frags *nf /*, struct inet_frags *f */);
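
A minimal sketch of how the helpers recover struct inet_frags through the
new back-pointer (simplified from the diff below):

static inline void inet_frag_put(struct inet_frag_queue *q)
{
	if (refcount_dec_and_test(&q->refcnt))
		inet_frag_destroy(q);
}

void inet_frag_destroy(struct inet_frag_queue *q)
{
	struct inet_frags *f = q->net->f;	/* back-pointer set at netns init time */

	/* ... release all fragment data, then ... */
	if (f->destructor)
		f->destructor(q);
	kmem_cache_free(f->frags_cachep, q);
}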

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/inet_frag.h                 | 11 ++++++-----
 net/ieee802154/6lowpan/reassembly.c     | 13 +++++++------
 net/ipv4/inet_fragment.c                | 17 ++++++++++-------
 net/ipv4/ip_fragment.c                  |  9 +++++----
 net/ipv6/netfilter/nf_conntrack_reasm.c | 14 ++++++++------
 net/ipv6/reassembly.c                   | 15 ++++++++-------
 6 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index b1d62176f3b4fcf100bd263e8eae0db656a3d9b6..69e531ed81894393e07cac9e953825fcb55ef42a 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -10,6 +10,7 @@ struct netns_frags {
 	int			high_thresh;
 	int			low_thresh;
 	int			max_dist;
+	struct inet_frags	*f;
 };
 
 /**
@@ -109,20 +110,20 @@ static inline int inet_frags_init_net(struct netns_frags *nf)
 	atomic_set(&nf->mem, 0);
 	return 0;
 }
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
+void inet_frags_exit_net(struct netns_frags *nf);
 
-void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f);
+void inet_frag_kill(struct inet_frag_queue *q);
+void inet_frag_destroy(struct inet_frag_queue *q);
 struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
 		struct inet_frags *f, void *key, unsigned int hash);
 
 void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
 				   const char *prefix);
 
-static inline void inet_frag_put(struct inet_frag_queue *q, struct inet_frags *f)
+static inline void inet_frag_put(struct inet_frag_queue *q)
 {
 	if (refcount_dec_and_test(&q->refcnt))
-		inet_frag_destroy(q, f);
+		inet_frag_destroy(q);
 }
 
 static inline bool inet_frag_evicting(struct inet_frag_queue *q)
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 2aaab4bba42961647a4d3d1c0b8497917d5065ce..6badc055555b7baedac2051a1aaea15f9e9b180c 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -94,10 +94,10 @@ static void lowpan_frag_expire(struct timer_list *t)
 	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
-	inet_frag_kill(&fq->q, &lowpan_frags);
+	inet_frag_kill(&fq->q);
 out:
 	spin_unlock(&fq->q.lock);
-	inet_frag_put(&fq->q, &lowpan_frags);
+	inet_frag_put(&fq->q);
 }
 
 static inline struct lowpan_frag_queue *
@@ -230,7 +230,7 @@ static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, struct sk_buff *prev,
 	struct sk_buff *fp, *head = fq->q.fragments;
 	int sum_truesize;
 
-	inet_frag_kill(&fq->q, &lowpan_frags);
+	inet_frag_kill(&fq->q);
 
 	/* Make the one we just received the head. */
 	if (prev) {
@@ -438,7 +438,7 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 		ret = lowpan_frag_queue(fq, skb, frag_type);
 		spin_unlock(&fq->q.lock);
 
-		inet_frag_put(&fq->q, &lowpan_frags);
+		inet_frag_put(&fq->q);
 		return ret;
 	}
 
@@ -586,13 +586,14 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 	ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+	ieee802154_lowpan->frags.f = &lowpan_frags;
 
 	res = inet_frags_init_net(&ieee802154_lowpan->frags);
 	if (res < 0)
 		return res;
 	res = lowpan_frags_ns_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+		inet_frags_exit_net(&ieee802154_lowpan->frags);
 	return res;
 }
 
@@ -602,7 +603,7 @@ static void __net_exit lowpan_frags_exit_net(struct net *net)
 		net_ieee802154_lowpan(net);
 
 	lowpan_frags_ns_sysctl_unregister(net);
-	inet_frags_exit_net(&ieee802154_lowpan->frags, &lowpan_frags);
+	inet_frags_exit_net(&ieee802154_lowpan->frags);
 }
 
 static struct pernet_operations lowpan_frags_ops = {
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index e8ec28999f5ce0c5d496e9a97ca1748b18db0cf0..1ac69f65d0dee600d0ab4db20ff5942952932c40 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -219,8 +219,9 @@ void inet_frags_fini(struct inet_frags *f)
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
-void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f)
+void inet_frags_exit_net(struct netns_frags *nf)
 {
+	struct inet_frags *f = nf->f;
 	unsigned int seq;
 	int i;
 
@@ -264,33 +265,34 @@ __acquires(hb->chain_lock)
 	return hb;
 }
 
-static inline void fq_unlink(struct inet_frag_queue *fq, struct inet_frags *f)
+static inline void fq_unlink(struct inet_frag_queue *fq)
 {
 	struct inet_frag_bucket *hb;
 
-	hb = get_frag_bucket_locked(fq, f);
+	hb = get_frag_bucket_locked(fq, fq->net->f);
 	hlist_del(&fq->list);
 	fq->flags |= INET_FRAG_COMPLETE;
 	spin_unlock(&hb->chain_lock);
 }
 
-void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
+void inet_frag_kill(struct inet_frag_queue *fq)
 {
 	if (del_timer(&fq->timer))
 		refcount_dec(&fq->refcnt);
 
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
-		fq_unlink(fq, f);
+		fq_unlink(fq);
 		refcount_dec(&fq->refcnt);
 	}
 }
 EXPORT_SYMBOL(inet_frag_kill);
 
-void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
+void inet_frag_destroy(struct inet_frag_queue *q)
 {
 	struct sk_buff *fp;
 	struct netns_frags *nf;
 	unsigned int sum, sum_truesize = 0;
+	struct inet_frags *f;
 
 	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
 	WARN_ON(del_timer(&q->timer) != 0);
@@ -298,6 +300,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
 	/* Release all fragment data. */
 	fp = q->fragments;
 	nf = q->net;
+	f = nf->f;
 	while (fp) {
 		struct sk_buff *xp = fp->next;
 
@@ -333,7 +336,7 @@ static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
 			refcount_inc(&qp->refcnt);
 			spin_unlock(&hb->chain_lock);
 			qp_in->flags |= INET_FRAG_COMPLETE;
-			inet_frag_put(qp_in, f);
+			inet_frag_put(qp_in);
 			return qp;
 		}
 	}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e0b39d4ecbd411ff4bb72d1ed973e45bd6da9ef1..cd2b4c9419fc1552d367b572926e314b11cb6c00 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -168,7 +168,7 @@ static void ip4_frag_free(struct inet_frag_queue *q)
 
 static void ipq_put(struct ipq *ipq)
 {
-	inet_frag_put(&ipq->q, &ip4_frags);
+	inet_frag_put(&ipq->q);
 }
 
 /* Kill ipq entry. It is not destroyed immediately,
@@ -176,7 +176,7 @@ static void ipq_put(struct ipq *ipq)
  */
 static void ipq_kill(struct ipq *ipq)
 {
-	inet_frag_kill(&ipq->q, &ip4_frags);
+	inet_frag_kill(&ipq->q);
 }
 
 static bool frag_expire_skip_icmp(u32 user)
@@ -872,20 +872,21 @@ static int __net_init ipv4_frags_init_net(struct net *net)
 	net->ipv4.frags.timeout = IP_FRAG_TIME;
 
 	net->ipv4.frags.max_dist = 64;
+	net->ipv4.frags.f = &ip4_frags;
 
 	res = inet_frags_init_net(&net->ipv4.frags);
 	if (res < 0)
 		return res;
 	res = ip4_frags_ns_ctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+		inet_frags_exit_net(&net->ipv4.frags);
 	return res;
 }
 
 static void __net_exit ipv4_frags_exit_net(struct net *net)
 {
 	ip4_frags_ns_ctl_unregister(net);
-	inet_frags_exit_net(&net->ipv4.frags, &ip4_frags);
+	inet_frags_exit_net(&net->ipv4.frags);
 }
 
 static struct pernet_operations ip4_frags_ops = {
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 6ff41569134ae36809a8b42d8e46d50d19ffde53..f69b7ca52727c814eb2887c9deb9f356c56e5442 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -264,7 +264,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 			 * this case. -DaveM
 			 */
 			pr_debug("end of fragment not rounded to 8 bytes.\n");
-			inet_frag_kill(&fq->q, &nf_frags);
+			inet_frag_kill(&fq->q);
 			return -EPROTO;
 		}
 		if (end > fq->q.len) {
@@ -357,7 +357,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb,
 	return 0;
 
 discard_fq:
-	inet_frag_kill(&fq->q, &nf_frags);
+	inet_frag_kill(&fq->q);
 err:
 	return -EINVAL;
 }
@@ -379,7 +379,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev,  struct net_devic
 	int    payload_len;
 	u8 ecn;
 
-	inet_frag_kill(&fq->q, &nf_frags);
+	inet_frag_kill(&fq->q);
 
 	WARN_ON(head == NULL);
 	WARN_ON(NFCT_FRAG6_CB(head)->offset != 0);
@@ -622,7 +622,7 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 
 out_unlock:
 	spin_unlock_bh(&fq->q.lock);
-	inet_frag_put(&fq->q, &nf_frags);
+	inet_frag_put(&fq->q);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
@@ -634,19 +634,21 @@ static int nf_ct_net_init(struct net *net)
 	net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
+	net->nf_frag.frags.f = &nf_frags;
+
 	res = inet_frags_init_net(&net->nf_frag.frags);
 	if (res < 0)
 		return res;
 	res = nf_ct_frag6_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+		inet_frags_exit_net(&net->nf_frag.frags);
 	return res;
 }
 
 static void nf_ct_net_exit(struct net *net)
 {
 	nf_ct_frags6_sysctl_unregister(net);
-	inet_frags_exit_net(&net->nf_frag.frags, &nf_frags);
+	inet_frags_exit_net(&net->nf_frag.frags);
 }
 
 static struct pernet_operations nf_ct_net_ops = {
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index a8f7a5f0251a7af0b14cc6de5006b924d9d05672..8cfea13a179c6f048177ac91fe26c8a5565e5820 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -138,7 +138,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	if (fq->q.flags & INET_FRAG_COMPLETE)
 		goto out;
 
-	inet_frag_kill(&fq->q, frags);
+	inet_frag_kill(&fq->q);
 
 	rcu_read_lock();
 	dev = dev_get_by_index_rcu(net, fq->iif);
@@ -166,7 +166,7 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 	rcu_read_unlock();
 out:
 	spin_unlock(&fq->q.lock);
-	inet_frag_put(&fq->q, frags);
+	inet_frag_put(&fq->q);
 }
 EXPORT_SYMBOL(ip6_expire_frag_queue);
 
@@ -364,7 +364,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb,
 	return -1;
 
 discard_fq:
-	inet_frag_kill(&fq->q, &ip6_frags);
+	inet_frag_kill(&fq->q);
 err:
 	__IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 			IPSTATS_MIB_REASMFAILS);
@@ -391,7 +391,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	int sum_truesize;
 	u8 ecn;
 
-	inet_frag_kill(&fq->q, &ip6_frags);
+	inet_frag_kill(&fq->q);
 
 	ecn = ip_frag_ecn_table[fq->ecn];
 	if (unlikely(ecn == 0xff))
@@ -569,7 +569,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
 
 		spin_unlock(&fq->q.lock);
-		inet_frag_put(&fq->q, &ip6_frags);
+		inet_frag_put(&fq->q);
 		return ret;
 	}
 
@@ -716,6 +716,7 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 	net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
 	net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
 	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
+	net->ipv6.frags.f = &ip6_frags;
 
 	res = inet_frags_init_net(&net->ipv6.frags);
 	if (res < 0)
@@ -723,14 +724,14 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 
 	res = ip6_frags_ns_sysctl_register(net);
 	if (res < 0)
-		inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+		inet_frags_exit_net(&net->ipv6.frags);
 	return res;
 }
 
 static void __net_exit ipv6_frags_exit_net(struct net *net)
 {
 	ip6_frags_ns_sysctl_unregister(net);
-	inet_frags_exit_net(&net->ipv6.frags, &ip6_frags);
+	inet_frags_exit_net(&net->ipv6.frags);
 }
 
 static struct pernet_operations ip6_frags_ops = {
-- 
2.17.0.rc1.321.gba9d0f2565-goog


* [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units
  2018-03-30  5:22 [PATCH net-next 1/6] ipv6: frag: remove unused field Eric Dumazet
  2018-03-30  5:22 ` [PATCH net-next 2/6] inet: frags: change inet_frags_init_net() return value Eric Dumazet
  2018-03-30  5:22 ` [PATCH net-next 3/6] inet: frags: add a pointer to struct netns_frags Eric Dumazet
@ 2018-03-30  5:22 ` Eric Dumazet
  2018-03-30  7:18   ` Herbert Xu
  2018-03-30 11:44   ` Kirill Tkhai
  2018-03-30  5:22 ` [PATCH net-next 5/6] inet: frags: remove some helpers Eric Dumazet
  2018-03-30  5:22 ` [PATCH net-next 6/6] inet: frags: break the 2GB limit for frags storage Eric Dumazet
  4 siblings, 2 replies; 11+ messages in thread
From: Eric Dumazet @ 2018-03-30  5:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Florian Westphal, Herbert Xu, Thomas Graf,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	Eric Dumazet, Eric Dumazet, Nikolay Aleksandrov

Some applications still rely on IP fragmentation, and frankly the Linux
reassembly unit does not hold up under any serious load.

It uses static hash tables of 1024 buckets, with up to 128 items per bucket (!!!)

A work queue is supposed to garbage-collect items when the host is under
memory pressure, and to rebuild the hash table, changing the seed used in
the hash computations.

This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
occurring every 5 seconds if the host is under attack.

Then there is the problem of this hash table being shared by all netns.

It is time to switch to rhashtables, and to allocate one of them per netns
to speed up netns dismantle, since this is a critical metric these days.
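
Per-netns initialization then reduces to the following (condensed from the
inet_frag.h change in this patch):

static inline int inet_frags_init_net(struct netns_frags *nf)
{
	atomic_set(&nf->mem, 0);
	/* one rhashtable per netns, keyed by the per-family rhash_params */
	return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
}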

Lookups now run under RCU. A followup patch will even remove
the refcount hold/release left over from the prior implementation and save
a couple of atomic operations.
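
The fast path becomes a plain RCU lookup (condensed from the
inet_fragment.c change in this patch):

struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
{
	struct inet_frag_queue *fq;

	rcu_read_lock();
	fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
	if (fq) {
		if (!refcount_inc_not_zero(&fq->refcnt))
			fq = NULL;	/* queue is being torn down */
		rcu_read_unlock();
		return fq;
	}
	rcu_read_unlock();

	return inet_frag_create(nf, key);	/* not found: allocate and hash a new queue */
}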

Before this patch, 16 cpus (a 16-RX-queue NIC) could not handle more
than 1 Mpps of fragment DDoS traffic.

After the patch, I reach 7 Mpps without any tuning, and can use up to 2GB
of storage for the fragments.

$ grep FRAG /proc/net/sockstat
FRAG: inuse 1966916 memory 2140004608

A followup patch will change the limits for 64bit arches.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Nikolay Aleksandrov <nikolay@redhat.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Alexander Aring <alex.aring@gmail.com>
Cc: Stefan Schmidt <stefan@osg.samsung.com>
---
 Documentation/networking/ip-sysctl.txt  |   7 +-
 include/net/inet_frag.h                 |  99 +++---
 include/net/ipv6.h                      |  20 +-
 net/ieee802154/6lowpan/6lowpan_i.h      |  26 +-
 net/ieee802154/6lowpan/reassembly.c     | 108 +++----
 net/ipv4/inet_fragment.c                | 399 +++++-------------------
 net/ipv4/ip_fragment.c                  | 165 +++++-----
 net/ipv6/netfilter/nf_conntrack_reasm.c |  62 ++--
 net/ipv6/reassembly.c                   | 152 +++++----
 9 files changed, 344 insertions(+), 694 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 1d1120753ae82d0aee3e934a3d9c074b70dcbca6..c3b65f24e58aa72b720861d816fb76f9956800f0 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -134,13 +134,10 @@ min_adv_mss - INTEGER
 IP Fragmentation:
 
 ipfrag_high_thresh - INTEGER
-	Maximum memory used to reassemble IP fragments. When
-	ipfrag_high_thresh bytes of memory is allocated for this purpose,
-	the fragment handler will toss packets until ipfrag_low_thresh
-	is reached. This also serves as a maximum limit to namespaces
-	different from the initial one.
+	Maximum memory used to reassemble IP fragments.
 
 ipfrag_low_thresh - INTEGER
+	(Obsolete since linux-4.17)
 	Maximum memory used to reassemble IP fragments before the kernel
 	begins to remove incomplete fragment queues to free up resources.
 	The kernel still accepts new fragments for defragmentation.
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 69e531ed81894393e07cac9e953825fcb55ef42a..05099f9f980e2384c0c8cd7e74659656b585cd22 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -2,15 +2,20 @@
 #ifndef __NET_FRAG_H__
 #define __NET_FRAG_H__
 
+#include <linux/rhashtable.h>
+
 struct netns_frags {
-	/* Keep atomic mem on separate cachelines in structs that include it */
-	atomic_t		mem ____cacheline_aligned_in_smp;
 	/* sysctls */
 	int			timeout;
 	int			high_thresh;
 	int			low_thresh;
 	int			max_dist;
 	struct inet_frags	*f;
+
+	/* Keep atomic mem on separate cachelines in structs that include it */
+	atomic_t		mem ____cacheline_aligned_in_smp;
+
+	struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
 };
 
 /**
@@ -26,12 +31,31 @@ enum {
 	INET_FRAG_COMPLETE	= BIT(2),
 };
 
+struct frag_v4_compare_key {
+	__be32		saddr;
+	__be32		daddr;
+	u32		user;
+	u32		vif;
+	__be16		id;
+	u16		protocol;
+};
+
+struct frag_v6_compare_key {
+	struct in6_addr	saddr;
+	struct in6_addr	daddr;
+	u32		user;
+	__be32		id;
+	u32		iif;
+};
+
 /**
  * struct inet_frag_queue - fragment queue
  *
- * @lock: spinlock protecting the queue
+ * @node: rhash node
+ * @key: keys identifying this frag.
  * @timer: queue expiration timer
- * @list: hash bucket list
+ * @net: namespace that this frag belongs to
+ * @lock: spinlock protecting this frag
  * @refcnt: reference count of the queue
  * @fragments: received fragments head
  * @fragments_tail: received fragments tail
@@ -40,66 +64,38 @@ enum {
  * @meat: length of received fragments so far
  * @flags: fragment queue flags
  * @max_size: maximum received fragment size
- * @net: namespace that this frag belongs to
- * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
+ * @rcu: rcu head for deferred freeing
  */
 struct inet_frag_queue {
-	spinlock_t		lock;
+	struct rhash_head	node;
+	union {
+		struct frag_v4_compare_key v4;
+		struct frag_v6_compare_key v6;
+	} key;
 	struct timer_list	timer;
-	struct hlist_node	list;
+	struct netns_frags      *net;
+	spinlock_t		lock;
 	refcount_t		refcnt;
 	struct sk_buff		*fragments;
 	struct sk_buff		*fragments_tail;
 	ktime_t			stamp;
 	int			len;
 	int			meat;
-	__u8			flags;
+	u8			flags;
 	u16			max_size;
-	struct netns_frags	*net;
-	struct hlist_node	list_evictor;
-};
-
-#define INETFRAGS_HASHSZ	1024
-
-/* averaged:
- * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
- *	       rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
- *	       struct frag_queue))
- */
-#define INETFRAGS_MAXDEPTH	128
-
-struct inet_frag_bucket {
-	struct hlist_head	chain;
-	spinlock_t		chain_lock;
+	struct rcu_head		rcu;
 };
 
 struct inet_frags {
-	struct inet_frag_bucket	hash[INETFRAGS_HASHSZ];
-
-	struct work_struct	frags_work;
-	unsigned int next_bucket;
-	unsigned long last_rebuild_jiffies;
-	bool rebuild;
-
-	/* The first call to hashfn is responsible to initialize
-	 * rnd. This is best done with net_get_random_once.
-	 *
-	 * rnd_seqlock is used to let hash insertion detect
-	 * when it needs to re-lookup the hash chain to use.
-	 */
-	u32			rnd;
-	seqlock_t		rnd_seqlock;
 	unsigned int		qsize;
 
-	unsigned int		(*hashfn)(const struct inet_frag_queue *);
-	bool			(*match)(const struct inet_frag_queue *q,
-					 const void *arg);
-	void			(*constructor)(struct inet_frag_queue *q,
+	void			(*constructor)(struct inet_frag_queue *fq,
 					       const void *arg);
-	void			(*destructor)(struct inet_frag_queue *);
+	void			(*destructor)(struct inet_frag_queue *fq);
 	void			(*frag_expire)(struct timer_list *t);
 	struct kmem_cache	*frags_cachep;
 	const char		*frags_cache_name;
+	struct rhashtable_params rhash_params;
 };
 
 int inet_frags_init(struct inet_frags *);
@@ -108,17 +104,13 @@ void inet_frags_fini(struct inet_frags *);
 static inline int inet_frags_init_net(struct netns_frags *nf)
 {
 	atomic_set(&nf->mem, 0);
-	return 0;
+	return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
 }
 void inet_frags_exit_net(struct netns_frags *nf);
 
 void inet_frag_kill(struct inet_frag_queue *q);
 void inet_frag_destroy(struct inet_frag_queue *q);
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
-		struct inet_frags *f, void *key, unsigned int hash);
-
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-				   const char *prefix);
+struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
 
 static inline void inet_frag_put(struct inet_frag_queue *q)
 {
@@ -126,11 +118,6 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
 		inet_frag_destroy(q);
 }
 
-static inline bool inet_frag_evicting(struct inet_frag_queue *q)
-{
-	return !hlist_unhashed(&q->list_evictor);
-}
-
 /* Memory Tracking Functions. */
 
 static inline int frag_mem_limit(struct netns_frags *nf)
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 5c18836672e9d1c560cdce15f5b34928c337abfd..76f84d4be91b92761fb9a26e7f52e2101ee34c0a 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -579,36 +579,20 @@ enum ip6_defrag_users {
 	__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
 };
 
-struct ip6_create_arg {
-	__be32 id;
-	u32 user;
-	const struct in6_addr *src;
-	const struct in6_addr *dst;
-	int iif;
-	u8 ecn;
-};
-
 void ip6_frag_init(struct inet_frag_queue *q, const void *a);
-bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
 
 /*
- *	Equivalent of ipv4 struct ip
+ *	Equivalent of ipv4 struct ipq
  */
 struct frag_queue {
 	struct inet_frag_queue	q;
 
-	__be32			id;		/* fragment id		*/
-	u32			user;
-	struct in6_addr		saddr;
-	struct in6_addr		daddr;
-
 	int			iif;
 	__u16			nhoffset;
 	u8			ecn;
 };
 
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
-			   struct inet_frags *frags);
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
 
 static inline bool ipv6_addr_any(const struct in6_addr *a)
 {
diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
index d8de3bcfb1032a1133402cb2a4c50a2448133846..b8d95cb71c25dd69c8a88b2c886a3f0d2ce1174f 100644
--- a/net/ieee802154/6lowpan/6lowpan_i.h
+++ b/net/ieee802154/6lowpan/6lowpan_i.h
@@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result;
 #define LOWPAN_DISPATCH_FRAG1           0xc0
 #define LOWPAN_DISPATCH_FRAGN           0xe0
 
-struct lowpan_create_arg {
+struct frag_lowpan_compare_key {
 	u16 tag;
 	u16 d_size;
-	const struct ieee802154_addr *src;
-	const struct ieee802154_addr *dst;
+	const struct ieee802154_addr src;
+	const struct ieee802154_addr dst;
 };
 
-/* Equivalent of ipv4 struct ip
+/* Equivalent of ipv4 struct ipq
  */
 struct lowpan_frag_queue {
 	struct inet_frag_queue	q;
-
-	u16			tag;
-	u16			d_size;
-	struct ieee802154_addr	saddr;
-	struct ieee802154_addr	daddr;
 };
 
-static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
-{
-	switch (a->mode) {
-	case IEEE802154_ADDR_LONG:
-		return (((__force u64)a->extended_addr) >> 32) ^
-			(((__force u64)a->extended_addr) & 0xffffffff);
-	case IEEE802154_ADDR_SHORT:
-		return (__force u32)(a->short_addr + (a->pan_id << 16));
-	default:
-		return 0;
-	}
-}
-
 int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
 void lowpan_net_frag_exit(void);
 int lowpan_net_frag_init(void);
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 6badc055555b7baedac2051a1aaea15f9e9b180c..9ee4d22666c26d6d9796d0f484bb4beb265dea42 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
 static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
 			     struct sk_buff *prev, struct net_device *ldev);
 
-static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
-				     const struct ieee802154_addr *saddr,
-				     const struct ieee802154_addr *daddr)
-{
-	net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
-	return jhash_3words(ieee802154_addr_hash(saddr),
-			    ieee802154_addr_hash(daddr),
-			    (__force u32)(tag + (d_size << 16)),
-			    lowpan_frags.rnd);
-}
-
-static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
-{
-	const struct lowpan_frag_queue *fq;
-
-	fq = container_of(q, struct lowpan_frag_queue, q);
-	return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
-}
-
-static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
-{
-	const struct lowpan_frag_queue *fq;
-	const struct lowpan_create_arg *arg = a;
-
-	fq = container_of(q, struct lowpan_frag_queue, q);
-	return	fq->tag == arg->tag && fq->d_size == arg->d_size &&
-		ieee802154_addr_equal(&fq->saddr, arg->src) &&
-		ieee802154_addr_equal(&fq->daddr, arg->dst);
-}
-
 static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
 {
-	const struct lowpan_create_arg *arg = a;
+	const struct frag_lowpan_compare_key *key = a;
 	struct lowpan_frag_queue *fq;
 
 	fq = container_of(q, struct lowpan_frag_queue, q);
 
-	fq->tag = arg->tag;
-	fq->d_size = arg->d_size;
-	fq->saddr = *arg->src;
-	fq->daddr = *arg->dst;
+	BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
+	memcpy(&q->key, key, sizeof(*key));
 }
 
 static void lowpan_frag_expire(struct timer_list *t)
@@ -105,25 +73,20 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
 	const struct ieee802154_addr *src,
 	const struct ieee802154_addr *dst)
 {
-	struct inet_frag_queue *q;
-	struct lowpan_create_arg arg;
-	unsigned int hash;
 	struct netns_ieee802154_lowpan *ieee802154_lowpan =
 		net_ieee802154_lowpan(net);
+	struct frag_lowpan_compare_key key = {
+		.tag = cb->d_tag,
+		.d_size = cb->d_size,
+		.src = *src,
+		.dst = *dst,
+	};
+	struct inet_frag_queue *q;
 
-	arg.tag = cb->d_tag;
-	arg.d_size = cb->d_size;
-	arg.src = src;
-	arg.dst = dst;
-
-	hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
-
-	q = inet_frag_find(&ieee802154_lowpan->frags,
-			   &lowpan_frags, &arg, hash);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	q = inet_frag_find(&ieee802154_lowpan->frags, &key);
+	if (IS_ERR_OR_NULL(q))
 		return NULL;
-	}
+
 	return container_of(q, struct lowpan_frag_queue, q);
 }
 
@@ -588,6 +551,6 @@ static int __net_init lowpan_frags_init_net(struct net *net)
 	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
 	ieee802154_lowpan->frags.f = &lowpan_frags;
 
 	res = inet_frags_init_net(&ieee802154_lowpan->frags);
 	if (res < 0)
 		return res;
@@ -611,6 +575,36 @@ static struct pernet_operations lowpan_frags_ops = {
 	.exit = lowpan_frags_exit_net,
 };
 
+static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
+{
+	return jhash2(data,
+		      sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
+}
+
+static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct inet_frag_queue *fq = data;
+
+	return jhash2((const u32 *)&fq->key,
+		      sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
+}
+
+static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+	const struct frag_lowpan_compare_key *key = arg->key;
+	const struct inet_frag_queue *fq = ptr;
+
+	return !!memcmp(&fq->key, key, sizeof(*key));
+}
+
+static const struct rhashtable_params lowpan_rhash_params = {
+	.head_offset		= offsetof(struct inet_frag_queue, node),
+	.hashfn			= lowpan_key_hashfn,
+	.obj_hashfn		= lowpan_obj_hashfn,
+	.obj_cmpfn		= lowpan_obj_cmpfn,
+	.automatic_shrinking	= true,
+};
+
 int __init lowpan_net_frag_init(void)
 {
 	int ret;
@@ -619,22 +613,24 @@ int __init lowpan_net_frag_init(void)
 	if (ret)
 		return ret;
 
-	ret = register_pernet_subsys(&lowpan_frags_ops);
-	if (ret)
-		goto err_pernet;
-
-	lowpan_frags.hashfn = lowpan_hashfn;
 	lowpan_frags.constructor = lowpan_frag_init;
 	lowpan_frags.destructor = NULL;
 	lowpan_frags.qsize = sizeof(struct frag_queue);
-	lowpan_frags.match = lowpan_frag_match;
 	lowpan_frags.frag_expire = lowpan_frag_expire;
 	lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
+	lowpan_frags.rhash_params = lowpan_rhash_params;
 	ret = inet_frags_init(&lowpan_frags);
 	if (ret)
 		goto err_pernet;
 
+	ret = register_pernet_subsys(&lowpan_frags_ops);
+	if (ret)
+		goto err_pernet_frags;
+
 	return ret;
+
+err_pernet_frags:
+	inet_frags_fini(&lowpan_frags);
 err_pernet:
 	lowpan_frags_sysctl_unregister();
 	return ret;
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 1ac69f65d0dee600d0ab4db20ff5942952932c40..8ccaf605630f14270996ee1b5a37376299d78661 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -25,12 +25,6 @@
 #include <net/inet_frag.h>
 #include <net/inet_ecn.h>
 
-#define INETFRAGS_EVICT_BUCKETS   128
-#define INETFRAGS_EVICT_MAX	  512
-
-/* don't rebuild inetfrag table with new secret more often than this */
-#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
-
 /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
  * Value : 0xff if frame should be dropped.
  *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
@@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = {
 };
 EXPORT_SYMBOL(ip_frag_ecn_table);
 
-static unsigned int
-inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
-{
-	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
-}
-
-static bool inet_frag_may_rebuild(struct inet_frags *f)
-{
-	return time_after(jiffies,
-	       f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
-}
-
-static void inet_frag_secret_rebuild(struct inet_frags *f)
-{
-	int i;
-
-	write_seqlock_bh(&f->rnd_seqlock);
-
-	if (!inet_frag_may_rebuild(f))
-		goto out;
-
-	get_random_bytes(&f->rnd, sizeof(u32));
-
-	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
-		struct inet_frag_bucket *hb;
-		struct inet_frag_queue *q;
-		struct hlist_node *n;
-
-		hb = &f->hash[i];
-		spin_lock(&hb->chain_lock);
-
-		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
-			unsigned int hval = inet_frag_hashfn(f, q);
-
-			if (hval != i) {
-				struct inet_frag_bucket *hb_dest;
-
-				hlist_del(&q->list);
-
-				/* Relink to new hash chain. */
-				hb_dest = &f->hash[hval];
-
-				/* This is the only place where we take
-				 * another chain_lock while already holding
-				 * one.  As this will not run concurrently,
-				 * we cannot deadlock on hb_dest lock below, if its
-				 * already locked it will be released soon since
-				 * other caller cannot be waiting for hb lock
-				 * that we've taken above.
-				 */
-				spin_lock_nested(&hb_dest->chain_lock,
-						 SINGLE_DEPTH_NESTING);
-				hlist_add_head(&q->list, &hb_dest->chain);
-				spin_unlock(&hb_dest->chain_lock);
-			}
-		}
-		spin_unlock(&hb->chain_lock);
-	}
-
-	f->rebuild = false;
-	f->last_rebuild_jiffies = jiffies;
-out:
-	write_sequnlock_bh(&f->rnd_seqlock);
-}
-
-static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
-{
-	if (!hlist_unhashed(&q->list_evictor))
-		return false;
-
-	return q->net->low_thresh == 0 ||
-	       frag_mem_limit(q->net) >= q->net->low_thresh;
-}
-
-static unsigned int
-inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
-{
-	struct inet_frag_queue *fq;
-	struct hlist_node *n;
-	unsigned int evicted = 0;
-	HLIST_HEAD(expired);
-
-	spin_lock(&hb->chain_lock);
-
-	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
-		if (!inet_fragq_should_evict(fq))
-			continue;
-
-		if (!del_timer(&fq->timer))
-			continue;
-
-		hlist_add_head(&fq->list_evictor, &expired);
-		++evicted;
-	}
-
-	spin_unlock(&hb->chain_lock);
-
-	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
-		f->frag_expire(&fq->timer);
-
-	return evicted;
-}
-
-static void inet_frag_worker(struct work_struct *work)
-{
-	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
-	unsigned int i, evicted = 0;
-	struct inet_frags *f;
-
-	f = container_of(work, struct inet_frags, frags_work);
-
-	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
-
-	local_bh_disable();
-
-	for (i = READ_ONCE(f->next_bucket); budget; --budget) {
-		evicted += inet_evict_bucket(f, &f->hash[i]);
-		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
-		if (evicted > INETFRAGS_EVICT_MAX)
-			break;
-	}
-
-	f->next_bucket = i;
-
-	local_bh_enable();
-
-	if (f->rebuild && inet_frag_may_rebuild(f))
-		inet_frag_secret_rebuild(f);
-}
-
-static void inet_frag_schedule_worker(struct inet_frags *f)
-{
-	if (unlikely(!work_pending(&f->frags_work)))
-		schedule_work(&f->frags_work);
-}
-
 int inet_frags_init(struct inet_frags *f)
 {
-	int i;
-
-	INIT_WORK(&f->frags_work, inet_frag_worker);
-
-	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
-		struct inet_frag_bucket *hb = &f->hash[i];
-
-		spin_lock_init(&hb->chain_lock);
-		INIT_HLIST_HEAD(&hb->chain);
-	}
-
-	seqlock_init(&f->rnd_seqlock);
-	f->last_rebuild_jiffies = 0;
 	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
 					    NULL);
 	if (!f->frags_cachep)
@@ -214,93 +59,80 @@ EXPORT_SYMBOL(inet_frags_init);
 
 void inet_frags_fini(struct inet_frags *f)
 {
-	cancel_work_sync(&f->frags_work);
+	rcu_barrier();
 	kmem_cache_destroy(f->frags_cachep);
+	f->frags_cachep = NULL;
 }
 EXPORT_SYMBOL(inet_frags_fini);
 
 void inet_frags_exit_net(struct netns_frags *nf)
 {
-	struct inet_frags *f = nf->f;
-	unsigned int seq;
-	int i;
-
-	nf->low_thresh = 0;
+	struct rhashtable_iter hti;
+	struct inet_frag_queue *fq;
 
-evict_again:
-	local_bh_disable();
-	seq = read_seqbegin(&f->rnd_seqlock);
+	nf->low_thresh = 0; /* prevent creation of new frags */
 
-	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
-		inet_evict_bucket(f, &f->hash[i]);
+	rhashtable_walk_enter(&nf->rhashtable, &hti);
+	do {
+		rhashtable_walk_start(&hti);
 
-	local_bh_enable();
-	cond_resched();
+		while ((fq = rhashtable_walk_next(&hti)) && !IS_ERR(fq)) {
+			if (refcount_inc_not_zero(&fq->refcnt)) {
+				spin_lock_bh(&fq->lock);
+				inet_frag_kill(fq);
+				spin_unlock_bh(&fq->lock);
+				inet_frag_put(fq);
+			}
+		}
 
-	if (read_seqretry(&f->rnd_seqlock, seq) ||
-	    sum_frag_mem_limit(nf))
-		goto evict_again;
+		rhashtable_walk_stop(&hti);
+	} while (cond_resched(), fq == ERR_PTR(-EAGAIN));
+	rhashtable_walk_exit(&hti);
+	rhashtable_destroy(&nf->rhashtable);
 }
 EXPORT_SYMBOL(inet_frags_exit_net);
 
-static struct inet_frag_bucket *
-get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
-__acquires(hb->chain_lock)
-{
-	struct inet_frag_bucket *hb;
-	unsigned int seq, hash;
-
- restart:
-	seq = read_seqbegin(&f->rnd_seqlock);
-
-	hash = inet_frag_hashfn(f, fq);
-	hb = &f->hash[hash];
-
-	spin_lock(&hb->chain_lock);
-	if (read_seqretry(&f->rnd_seqlock, seq)) {
-		spin_unlock(&hb->chain_lock);
-		goto restart;
-	}
-
-	return hb;
-}
-
-static inline void fq_unlink(struct inet_frag_queue *fq)
-{
-	struct inet_frag_bucket *hb;
-
-	hb = get_frag_bucket_locked(fq, fq->net->f);
-	hlist_del(&fq->list);
-	fq->flags |= INET_FRAG_COMPLETE;
-	spin_unlock(&hb->chain_lock);
-}
-
 void inet_frag_kill(struct inet_frag_queue *fq)
 {
 	if (del_timer(&fq->timer))
 		refcount_dec(&fq->refcnt);
 
 	if (!(fq->flags & INET_FRAG_COMPLETE)) {
-		fq_unlink(fq);
+		struct netns_frags *nf = fq->net;
+
+		fq->flags |= INET_FRAG_COMPLETE;
+		rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
 		refcount_dec(&fq->refcnt);
 	}
 }
 EXPORT_SYMBOL(inet_frag_kill);
 
-void inet_frag_destroy(struct inet_frag_queue *q)
+static void inet_frag_destroy_rcu(struct rcu_head *head)
 {
-	struct sk_buff *fp;
+	struct inet_frag_queue *fq = container_of(head, struct inet_frag_queue,
+						 rcu);
+	struct inet_frags *f = fq->net->f;
+
+	if (f->destructor)
+		f->destructor(fq);
+	kmem_cache_free(f->frags_cachep, fq);
+}
+
+void inet_frag_destroy(struct inet_frag_queue *fq)
+{
+	unsigned int sum_truesize;
 	struct netns_frags *nf;
-	unsigned int sum, sum_truesize = 0;
 	struct inet_frags *f;
+	struct sk_buff *fp;
 
-	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
-	WARN_ON(del_timer(&q->timer) != 0);
+	WARN_ON(!(fq->flags & INET_FRAG_COMPLETE));
+	WARN_ON(del_timer(&fq->timer) != 0);
 
 	/* Release all fragment data. */
-	fp = q->fragments;
-	nf = q->net;
+	fp = fq->fragments;
+	nf = fq->net;
 	f = nf->f;
+	sum_truesize = f->qsize;
 	while (fp) {
 		struct sk_buff *xp = fp->next;
 
@@ -308,136 +140,63 @@ void inet_frag_destroy(struct inet_frag_queue *q)
 		kfree_skb(fp);
 		fp = xp;
 	}
-	sum = sum_truesize + f->qsize;
 
-	if (f->destructor)
-		f->destructor(q);
-	kmem_cache_free(f->frags_cachep, q);
+	call_rcu(&fq->rcu, inet_frag_destroy_rcu);
 
-	sub_frag_mem_limit(nf, sum);
+	sub_frag_mem_limit(nf, sum_truesize);
 }
 EXPORT_SYMBOL(inet_frag_destroy);
 
-static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
-						struct inet_frag_queue *qp_in,
-						struct inet_frags *f,
+static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
 						void *arg)
 {
-	struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
-	struct inet_frag_queue *qp;
-
-#ifdef CONFIG_SMP
-	/* With SMP race we have to recheck hash table, because
-	 * such entry could have been created on other cpu before
-	 * we acquired hash bucket lock.
-	 */
-	hlist_for_each_entry(qp, &hb->chain, list) {
-		if (qp->net == nf && f->match(qp, arg)) {
-			refcount_inc(&qp->refcnt);
-			spin_unlock(&hb->chain_lock);
-			qp_in->flags |= INET_FRAG_COMPLETE;
-			inet_frag_put(qp_in);
-			return qp;
-		}
-	}
-#endif
-	qp = qp_in;
-	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
-		refcount_inc(&qp->refcnt);
-
-	refcount_inc(&qp->refcnt);
-	hlist_add_head(&qp->list, &hb->chain);
-
-	spin_unlock(&hb->chain_lock);
-
-	return qp;
-}
-
-static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
-					       struct inet_frags *f,
-					       void *arg)
-{
-	struct inet_frag_queue *q;
+	struct inet_frags *f = nf->f;
+	struct inet_frag_queue *fq;
+	int err;
 
-	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
-		inet_frag_schedule_worker(f);
+	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
 		return NULL;
-	}
 
-	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
-	if (!q)
+	fq = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
+	if (!fq)
 		return NULL;
 
-	q->net = nf;
-	f->constructor(q, arg);
-	add_frag_mem_limit(nf, f->qsize);
-
-	timer_setup(&q->timer, f->frag_expire, 0);
-	spin_lock_init(&q->lock);
-	refcount_set(&q->refcnt, 1);
-
-	return q;
-}
+	fq->net = nf;
+	f->constructor(fq, arg);
 
-static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
-						struct inet_frags *f,
-						void *arg)
-{
-	struct inet_frag_queue *q;
+	timer_setup(&fq->timer, f->frag_expire, 0);
+	spin_lock_init(&fq->lock);
+	refcount_set(&fq->refcnt, 3);
+	mod_timer(&fq->timer, jiffies + nf->timeout);
 
-	q = inet_frag_alloc(nf, f, arg);
-	if (!q)
+	err = rhashtable_insert_fast(&nf->rhashtable, &fq->node,
+				     f->rhash_params);
+	add_frag_mem_limit(nf, f->qsize);
+	if (err < 0) {
+		fq->flags |= INET_FRAG_COMPLETE;
+		inet_frag_kill(fq);
+		inet_frag_destroy(fq);
 		return NULL;
-
-	return inet_frag_intern(nf, q, f, arg);
+	}
+	return fq;
 }
 
-struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
-				       struct inet_frags *f, void *key,
-				       unsigned int hash)
+/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
+struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
 {
-	struct inet_frag_bucket *hb;
-	struct inet_frag_queue *q;
-	int depth = 0;
-
-	if (frag_mem_limit(nf) > nf->low_thresh)
-		inet_frag_schedule_worker(f);
-
-	hash &= (INETFRAGS_HASHSZ - 1);
-	hb = &f->hash[hash];
-
-	spin_lock(&hb->chain_lock);
-	hlist_for_each_entry(q, &hb->chain, list) {
-		if (q->net == nf && f->match(q, key)) {
-			refcount_inc(&q->refcnt);
-			spin_unlock(&hb->chain_lock);
-			return q;
-		}
-		depth++;
-	}
-	spin_unlock(&hb->chain_lock);
+	struct inet_frag_queue *fq;
 
-	if (depth <= INETFRAGS_MAXDEPTH)
-		return inet_frag_create(nf, f, key);
+	rcu_read_lock();
 
-	if (inet_frag_may_rebuild(f)) {
-		if (!f->rebuild)
-			f->rebuild = true;
-		inet_frag_schedule_worker(f);
+	fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
+	if (fq) {
+		if (!refcount_inc_not_zero(&fq->refcnt))
+			fq = NULL;
+		rcu_read_unlock();
+		return fq;
 	}
+	rcu_read_unlock();
 
-	return ERR_PTR(-ENOBUFS);
+	return inet_frag_create(nf, key);
 }
 EXPORT_SYMBOL(inet_frag_find);
-
-void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
-				   const char *prefix)
-{
-	static const char msg[] = "inet_frag_find: Fragment hash bucket"
-		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
-		". Dropping fragment.\n";
-
-	if (PTR_ERR(q) == -ENOBUFS)
-		net_dbg_ratelimited("%s%s", prefix, msg);
-}
-EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index cd2b4c9419fc1552d367b572926e314b11cb6c00..1a7423e8ec0a8f88782ad8c945dc0cd6046f79f0 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -69,15 +69,9 @@ struct ipfrag_skb_cb
 struct ipq {
 	struct inet_frag_queue q;
 
-	u32		user;
-	__be32		saddr;
-	__be32		daddr;
-	__be16		id;
-	u8		protocol;
 	u8		ecn; /* RFC3168 support */
 	u16		max_df_size; /* largest frag with DF set seen */
 	int             iif;
-	int             vif;   /* L3 master device index */
 	unsigned int    rid;
 	struct inet_peer *peer;
 };
@@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net)
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 			 struct net_device *dev);
 
-struct ip4_create_arg {
-	struct iphdr *iph;
-	u32 user;
-	int vif;
-};
-
-static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
-{
-	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
-	return jhash_3words((__force u32)id << 16 | prot,
-			    (__force u32)saddr, (__force u32)daddr,
-			    ip4_frags.rnd);
-}
-
-static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
-{
-	const struct ipq *ipq;
-
-	ipq = container_of(q, struct ipq, q);
-	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
-}
-
-static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
-{
-	const struct ipq *qp;
-	const struct ip4_create_arg *arg = a;
-
-	qp = container_of(q, struct ipq, q);
-	return	qp->id == arg->iph->id &&
-		qp->saddr == arg->iph->saddr &&
-		qp->daddr == arg->iph->daddr &&
-		qp->protocol == arg->iph->protocol &&
-		qp->user == arg->user &&
-		qp->vif == arg->vif;
-}
 
 static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 {
@@ -140,37 +99,23 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
 					       frags);
 	struct net *net = container_of(ipv4, struct net, ipv4);
 
-	const struct ip4_create_arg *arg = a;
+	const struct frag_v4_compare_key *key = a;
 
-	qp->protocol = arg->iph->protocol;
-	qp->id = arg->iph->id;
-	qp->ecn = ip4_frag_ecn(arg->iph->tos);
-	qp->saddr = arg->iph->saddr;
-	qp->daddr = arg->iph->daddr;
-	qp->vif = arg->vif;
-	qp->user = arg->user;
+	q->key.v4 = *key;
+	qp->ecn = 0;
 	qp->peer = q->net->max_dist ?
-		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
+		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
 		NULL;
 }
 
-static void ip4_frag_free(struct inet_frag_queue *q)
+static void ip4_frag_destructor(struct inet_frag_queue *q)
 {
-	struct ipq *qp;
+	struct ipq *qp = container_of(q, struct ipq, q);
 
-	qp = container_of(q, struct ipq, q);
 	if (qp->peer)
 		inet_putpeer(qp->peer);
 }
 
-
-/* Destruction primitives. */
-
-static void ipq_put(struct ipq *ipq)
-{
-	inet_frag_put(&ipq->q);
-}
-
 /* Kill ipq entry. It is not destroyed immediately,
  * because caller (and someone more) holds reference count.
  */
@@ -198,25 +143,25 @@ static void ip_expire(struct timer_list *t)
 	struct net *net;
 
 	qp = container_of(frag, struct ipq, q);
-	net = container_of(qp->q.net, struct net, ipv4.frags);
+	net = container_of(frag->net, struct net, ipv4.frags);
 
 	rcu_read_lock();
-	spin_lock(&qp->q.lock);
+	spin_lock(&frag->lock);
 
-	if (qp->q.flags & INET_FRAG_COMPLETE)
+	if (frag->flags & INET_FRAG_COMPLETE)
 		goto out;
 
 	ipq_kill(qp);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 
-	if (!inet_frag_evicting(&qp->q)) {
-		struct sk_buff *clone, *head = qp->q.fragments;
+	if (true) {
+		struct sk_buff *clone, *head = frag->fragments;
 		const struct iphdr *iph;
 		int err;
 
 		__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-		if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
+		if (!(frag->flags & INET_FRAG_FIRST_IN) || !frag->fragments)
 			goto out;
 
 		head->dev = dev_get_by_index_rcu(net, qp->iif);
@@ -234,7 +179,7 @@ static void ip_expire(struct timer_list *t)
 		/* Only an end host needs to send an ICMP
 		 * "Fragment Reassembly Timeout" message, per RFC792.
 		 */
-		if (frag_expire_skip_icmp(qp->user) &&
+		if (frag_expire_skip_icmp(frag->key.v4.user) &&
 		    (skb_rtable(head)->rt_type != RTN_LOCAL))
 			goto out;
 
@@ -242,7 +187,7 @@ static void ip_expire(struct timer_list *t)
 
 		/* Send an ICMP "Fragment Reassembly Timeout" message. */
 		if (clone) {
-			spin_unlock(&qp->q.lock);
+			spin_unlock(&frag->lock);
 			icmp_send(clone, ICMP_TIME_EXCEEDED,
 				  ICMP_EXC_FRAGTIME, 0);
 			consume_skb(clone);
@@ -250,33 +195,32 @@ static void ip_expire(struct timer_list *t)
 		}
 	}
 out:
-	spin_unlock(&qp->q.lock);
+	spin_unlock(&frag->lock);
 out_rcu_unlock:
 	rcu_read_unlock();
-	ipq_put(qp);
+	inet_frag_put(frag);
 }
 
 /* Find the correct entry in the "incomplete datagrams" queue for
  * this IP datagram, and create new one, if nothing is found.
  */
-static struct ipq *ip_find(struct net *net, struct iphdr *iph,
+static struct ipq *ip_find(struct net *net, const struct iphdr *iph,
 			   u32 user, int vif)
 {
+	struct frag_v4_compare_key key = {
+		.saddr = iph->saddr,
+		.daddr = iph->daddr,
+		.user = user,
+		.vif = vif,
+		.id = iph->id,
+		.protocol = iph->protocol,
+	};
 	struct inet_frag_queue *q;
-	struct ip4_create_arg arg;
-	unsigned int hash;
 
-	arg.iph = iph;
-	arg.user = user;
-	arg.vif = vif;
-
-	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
-
-	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	q = inet_frag_find(&net->ipv4.frags, &key);
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct ipq, q);
 }
 
@@ -310,8 +254,8 @@ static int ip_frag_too_far(struct ipq *qp)
 
 static int ip_frag_reinit(struct ipq *qp)
 {
-	struct sk_buff *fp;
 	unsigned int sum_truesize = 0;
+	struct sk_buff *fp;
 
 	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
 		refcount_inc(&qp->q.refcnt);
@@ -652,11 +596,11 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 	return 0;
 
 out_nomem:
-	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
+	net_dbg_ratelimited("queue_glue: no memory for gluing queue\n");
 	err = -ENOMEM;
 	goto out_fail;
 out_oversize:
-	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
+	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
 out_fail:
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
 	return err;
@@ -682,7 +626,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
 		ret = ip_frag_queue(qp, skb);
 
 		spin_unlock(&qp->q.lock);
-		ipq_put(qp);
+		inet_frag_put(&qp->q);
 		return ret;
 	}
 
@@ -894,17 +838,52 @@ static struct pernet_operations ip4_frags_ops = {
 	.exit = ipv4_frags_exit_net,
 };
 
+
+static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
+{
+	return jhash2(data,
+		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
+}
+
+static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct inet_frag_queue *fq = data;
+
+	return jhash2((const u32 *)&fq->key.v4,
+		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
+}
+
+static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+	const struct frag_v4_compare_key *key = arg->key;
+	const struct inet_frag_queue *fq = ptr;
+
+	return !!memcmp(&fq->key, key, sizeof(*key));
+}
+
+static const struct rhashtable_params ip4_rhash_params = {
+	.head_offset		= offsetof(struct inet_frag_queue, node),
+	.key_offset		= offsetof(struct inet_frag_queue, key),
+	.key_len		= sizeof(struct frag_v4_compare_key),
+	.hashfn			= ip4_key_hashfn,
+	.obj_hashfn		= ip4_obj_hashfn,
+	.obj_cmpfn		= ip4_obj_cmpfn,
+	.automatic_shrinking	= true,
+};
+
 void __init ipfrag_init(void)
 {
-	ip4_frags_ctl_register();
-	register_pernet_subsys(&ip4_frags_ops);
-	ip4_frags.hashfn = ip4_hashfn;
 	ip4_frags.constructor = ip4_frag_init;
-	ip4_frags.destructor = ip4_frag_free;
+	ip4_frags.destructor = ip4_frag_destructor;
 	ip4_frags.qsize = sizeof(struct ipq);
-	ip4_frags.match = ip4_frag_match;
 	ip4_frags.frag_expire = ip_expire;
 	ip4_frags.frags_cache_name = ip_frag_cache_name;
+	ip4_frags.rhash_params = ip4_rhash_params;
+
 	if (inet_frags_init(&ip4_frags))
 		panic("IP: failed to allocate ip4_frags cache\n");
+
+	ip4_frags_ctl_register();
+	register_pernet_subsys(&ip4_frags_ops);
+
 }
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index f69b7ca52727c814eb2887c9deb9f356c56e5442..53859311dea96c03fa5ae8456de32de25009efbe 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
 	return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
 }
 
-static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
-				 const struct in6_addr *daddr)
-{
-	net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
-	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
-			    (__force u32)id, nf_frags.rnd);
-}
-
-
-static unsigned int nf_hashfn(const struct inet_frag_queue *q)
-{
-	const struct frag_queue *nq;
-
-	nq = container_of(q, struct frag_queue, q);
-	return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
-}
-
 static void nf_ct_frag6_expire(struct timer_list *t)
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
@@ -178,34 +161,26 @@ static void nf_ct_frag6_expire(struct timer_list *t)
 	fq = container_of(frag, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, nf_frag.frags);
 
-	ip6_expire_frag_queue(net, fq, &nf_frags);
+	ip6_expire_frag_queue(net, fq);
 }
 
 /* Creation primitives. */
-static inline struct frag_queue *fq_find(struct net *net, __be32 id,
-					 u32 user, struct in6_addr *src,
-					 struct in6_addr *dst, int iif, u8 ecn)
+static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
+				  const struct ipv6hdr *hdr, int iif)
 {
+	struct frag_v6_compare_key key = {
+		.id = id,
+		.saddr = hdr->saddr,
+		.daddr = hdr->daddr,
+		.user = user,
+		.iif = iif,
+	};
 	struct inet_frag_queue *q;
-	struct ip6_create_arg arg;
-	unsigned int hash;
-
-	arg.id = id;
-	arg.user = user;
-	arg.src = src;
-	arg.dst = dst;
-	arg.iif = iif;
-	arg.ecn = ecn;
-
-	local_bh_disable();
-	hash = nf_hash_frag(id, src, dst);
-
-	q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
-	local_bh_enable();
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+
+	q = inet_frag_find(&net->nf_frag.frags, &key);
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct frag_queue, q);
 }
 
@@ -593,8 +568,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
 	fhdr = (struct frag_hdr *)skb_transport_header(skb);
 
 	skb_orphan(skb);
-	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
-		     skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
+	fq = fq_find(net, fhdr->identification, user, hdr,
+		     skb->dev ? skb->dev->ifindex : 0);
 	if (fq == NULL) {
 		pr_debug("Can't find and can't create new queue\n");
 		return -ENOMEM;
@@ -656,17 +631,18 @@ static struct pernet_operations nf_ct_net_ops = {
 	.exit = nf_ct_net_exit,
 };
 
+extern const struct rhashtable_params ip6_rhash_params;
+
 int nf_ct_frag6_init(void)
 {
 	int ret = 0;
 
-	nf_frags.hashfn = nf_hashfn;
 	nf_frags.constructor = ip6_frag_init;
 	nf_frags.destructor = NULL;
 	nf_frags.qsize = sizeof(struct frag_queue);
-	nf_frags.match = ip6_frag_match;
 	nf_frags.frag_expire = nf_ct_frag6_expire;
 	nf_frags.frags_cache_name = nf_frags_cache_name;
+	nf_frags.rhash_params = ip6_rhash_params;
 	ret = inet_frags_init(&nf_frags);
 	if (ret)
 		goto out;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 8cfea13a179c6f048177ac91fe26c8a5565e5820..737b0921ab0c9af198fefdf06d8f4ede91c7f3f6 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -79,59 +79,19 @@ static struct inet_frags ip6_frags;
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 			  struct net_device *dev);
 
-/*
- * callers should be careful not to use the hash value outside the ipfrag_lock
- * as doing so could race with ipfrag_hash_rnd being recalculated.
- */
-static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
-				    const struct in6_addr *daddr)
-{
-	net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
-	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
-			    (__force u32)id, ip6_frags.rnd);
-}
-
-static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
-{
-	const struct frag_queue *fq;
-
-	fq = container_of(q, struct frag_queue, q);
-	return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
-}
-
-bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
-{
-	const struct frag_queue *fq;
-	const struct ip6_create_arg *arg = a;
-
-	fq = container_of(q, struct frag_queue, q);
-	return	fq->id == arg->id &&
-		fq->user == arg->user &&
-		ipv6_addr_equal(&fq->saddr, arg->src) &&
-		ipv6_addr_equal(&fq->daddr, arg->dst) &&
-		(arg->iif == fq->iif ||
-		 !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
-					       IPV6_ADDR_LINKLOCAL)));
-}
-EXPORT_SYMBOL(ip6_frag_match);
-
 void ip6_frag_init(struct inet_frag_queue *q, const void *a)
 {
 	struct frag_queue *fq = container_of(q, struct frag_queue, q);
-	const struct ip6_create_arg *arg = a;
+	const struct frag_v6_compare_key *key = a;
 
-	fq->id = arg->id;
-	fq->user = arg->user;
-	fq->saddr = *arg->src;
-	fq->daddr = *arg->dst;
-	fq->ecn = arg->ecn;
+	q->key.v6 = *key;
+	fq->ecn = 0;
 }
 EXPORT_SYMBOL(ip6_frag_init);
 
-void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
-			   struct inet_frags *frags)
+void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
 {
-	struct net_device *dev = NULL;
+	struct net_device *dev;
 
 	spin_lock(&fq->q.lock);
 
@@ -146,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
 		goto out_rcu_unlock;
 
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
-
-	if (inet_frag_evicting(&fq->q))
-		goto out_rcu_unlock;
-
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
 
 	/* Don't send error if the first segment did not arrive. */
@@ -179,31 +135,29 @@ static void ip6_frag_expire(struct timer_list *t)
 	fq = container_of(frag, struct frag_queue, q);
 	net = container_of(fq->q.net, struct net, ipv6.frags);
 
-	ip6_expire_frag_queue(net, fq, &ip6_frags);
+	ip6_expire_frag_queue(net, fq);
 }
 
 static struct frag_queue *
-fq_find(struct net *net, __be32 id, const struct in6_addr *src,
-	const struct in6_addr *dst, int iif, u8 ecn)
+fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
 {
+	struct frag_v6_compare_key key = {
+		.id = id,
+		.saddr = hdr->saddr,
+		.daddr = hdr->daddr,
+		.user = IP6_DEFRAG_LOCAL_DELIVER,
+		.iif = iif,
+	};
 	struct inet_frag_queue *q;
-	struct ip6_create_arg arg;
-	unsigned int hash;
 
-	arg.id = id;
-	arg.user = IP6_DEFRAG_LOCAL_DELIVER;
-	arg.src = src;
-	arg.dst = dst;
-	arg.iif = iif;
-	arg.ecn = ecn;
+	if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
+					    IPV6_ADDR_LINKLOCAL)))
+		key.iif = 0;
 
-	hash = inet6_hash_frag(id, src, dst);
-
-	q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
-	if (IS_ERR_OR_NULL(q)) {
-		inet_frag_maybe_warn_overflow(q, pr_fmt());
+	q = inet_frag_find(&net->ipv6.frags, &key);
+	if (!q)
 		return NULL;
-	}
+
 	return container_of(q, struct frag_queue, q);
 }
 
@@ -527,10 +481,11 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 
 static int ipv6_frag_rcv(struct sk_buff *skb)
 {
+	struct net *net = dev_net(skb_dst(skb)->dev);
+	const struct ipv6hdr *hdr = ipv6_hdr(skb);
 	struct frag_hdr *fhdr;
 	struct frag_queue *fq;
-	const struct ipv6hdr *hdr = ipv6_hdr(skb);
-	struct net *net = dev_net(skb_dst(skb)->dev);
+	int iif;
 
 	if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
 		goto fail_hdr;
@@ -559,13 +514,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
 		return 1;
 	}
 
-	fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
-		     skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
+	iif = skb->dev ? skb->dev->ifindex : 0;
+	fq = fq_find(net, fhdr->identification, hdr, iif);
 	if (fq) {
 		int ret;
 
 		spin_lock(&fq->q.lock);
 
+		fq->iif = iif;
 		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
 
 		spin_unlock(&fq->q.lock);
@@ -718,6 +674,7 @@ static int __net_init ipv6_frags_init_net(struct net *net)
 	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
 	net->ipv6.frags.f = &ip6_frags;
 
+	net->ipv6.frags.f = &ip6_frags;
 	res = inet_frags_init_net(&net->ipv6.frags);
 	if (res < 0)
 		return res;
@@ -739,14 +696,55 @@ static struct pernet_operations ip6_frags_ops = {
 	.exit = ipv6_frags_exit_net,
 };
 
+static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
+{
+	return jhash2(data,
+		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
+}
+
+static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
+{
+	const struct inet_frag_queue *fq = data;
+
+	return jhash2((const u32 *)&fq->key.v6,
+		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
+}
+
+static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
+{
+	const struct frag_v6_compare_key *key = arg->key;
+	const struct inet_frag_queue *fq = ptr;
+
+	return !!memcmp(&fq->key, key, sizeof(*key));
+}
+
+const struct rhashtable_params ip6_rhash_params = {
+	.head_offset		= offsetof(struct inet_frag_queue, node),
+	.hashfn			= ip6_key_hashfn,
+	.obj_hashfn		= ip6_obj_hashfn,
+	.obj_cmpfn		= ip6_obj_cmpfn,
+	.automatic_shrinking	= true,
+};
+EXPORT_SYMBOL(ip6_rhash_params);
+
 int __init ipv6_frag_init(void)
 {
 	int ret;
 
-	ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+	ip6_frags.constructor = ip6_frag_init;
+	ip6_frags.destructor = NULL;
+	ip6_frags.qsize = sizeof(struct frag_queue);
+	ip6_frags.frag_expire = ip6_frag_expire;
+	ip6_frags.frags_cache_name = ip6_frag_cache_name;
+	ip6_frags.rhash_params = ip6_rhash_params;
+	ret = inet_frags_init(&ip6_frags);
 	if (ret)
 		goto out;
 
+	ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+	if (ret)
+		goto err_protocol;
+
 	ret = ip6_frags_sysctl_register();
 	if (ret)
 		goto err_sysctl;
@@ -755,16 +753,6 @@ int __init ipv6_frag_init(void)
 	if (ret)
 		goto err_pernet;
 
-	ip6_frags.hashfn = ip6_hashfn;
-	ip6_frags.constructor = ip6_frag_init;
-	ip6_frags.destructor = NULL;
-	ip6_frags.qsize = sizeof(struct frag_queue);
-	ip6_frags.match = ip6_frag_match;
-	ip6_frags.frag_expire = ip6_frag_expire;
-	ip6_frags.frags_cache_name = ip6_frag_cache_name;
-	ret = inet_frags_init(&ip6_frags);
-	if (ret)
-		goto err_pernet;
 out:
 	return ret;
 
@@ -772,6 +760,8 @@ int __init ipv6_frag_init(void)
 	ip6_frags_sysctl_unregister();
 err_sysctl:
 	inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
+err_protocol:
+	inet_frags_fini(&ip6_frags);
 	goto out;
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH net-next 5/6] inet: frags: remove some helpers
  2018-03-30  5:22 [PATCH net-next 1/6] ipv6: frag: remove unused field Eric Dumazet
                   ` (2 preceding siblings ...)
  2018-03-30  5:22 ` [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units Eric Dumazet
@ 2018-03-30  5:22 ` Eric Dumazet
  2018-03-30  5:22 ` [PATCH net-next 6/6] inet: frags: break the 2GB limit for frags storage Eric Dumazet
  4 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2018-03-30  5:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Florian Westphal, Herbert Xu, Thomas Graf,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	Eric Dumazet, Eric Dumazet

Remove sum_frag_mem_limit(), ip_frag_mem() & ip6_frag_mem()

Also, since we now use a rhashtable, we can bring back the number of
fragments in "grep FRAG /proc/net/sockstat /proc/net/sockstat6" that was
removed in commit 434d305405ab ("inet: frag: don't account number
of fragment queues").

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/inet_frag.h | 5 -----
 include/net/ip.h        | 1 -
 include/net/ipv6.h      | 7 -------
 net/ipv4/ip_fragment.c  | 5 -----
 net/ipv4/proc.c         | 6 +++---
 net/ipv6/proc.c         | 5 +++--
 6 files changed, 6 insertions(+), 23 deletions(-)

diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 05099f9f980e2384c0c8cd7e74659656b585cd22..fae84c10679c012bddc4367bcd0d44e34bd51372 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -135,11 +135,6 @@ static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
 	atomic_add(i, &nf->mem);
 }
 
-static inline int sum_frag_mem_limit(struct netns_frags *nf)
-{
-	return atomic_read(&nf->mem);
-}
-
 /* RFC 3168 support :
  * We want to check ECN values of all fragments, do detect invalid combinations.
  * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value.
diff --git a/include/net/ip.h b/include/net/ip.h
index 36f8f7811093c37de06194dc7410b7596f8bf9fa..ecffd843e7b896a83416847fdaa452be6223f3dc 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -588,7 +588,6 @@ static inline struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *s
 	return skb;
 }
 #endif
-int ip_frag_mem(struct net *net);
 
 /*
  *	Functions provided by ip_forward.c
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 76f84d4be91b92761fb9a26e7f52e2101ee34c0a..abbed2687fbd61cb47e2b6d0164ab6cf4d40a618 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -379,13 +379,6 @@ static inline bool ipv6_accept_ra(struct inet6_dev *idev)
 	    idev->cnf.accept_ra;
 }
 
-#if IS_ENABLED(CONFIG_IPV6)
-static inline int ip6_frag_mem(struct net *net)
-{
-	return sum_frag_mem_limit(&net->ipv6.frags);
-}
-#endif
-
 #define IPV6_FRAG_HIGH_THRESH	(4 * 1024*1024)	/* 4194304 */
 #define IPV6_FRAG_LOW_THRESH	(3 * 1024*1024)	/* 3145728 */
 #define IPV6_FRAG_TIMEOUT	(60 * HZ)	/* 60 seconds */
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1a7423e8ec0a8f88782ad8c945dc0cd6046f79f0..ef864db73613b491fb430ff3b594c7286705a1b3 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -83,11 +83,6 @@ static u8 ip4_frag_ecn(u8 tos)
 
 static struct inet_frags ip4_frags;
 
-int ip_frag_mem(struct net *net)
-{
-	return sum_frag_mem_limit(&net->ipv4.frags);
-}
-
 static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
 			 struct net_device *dev);
 
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index adfb75340275d240487574257c10feb295df44fe..aacfce0d7d82cf59269a69ef4d6ac8d9955b0bdc 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -54,7 +54,6 @@
 static int sockstat_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq->private;
-	unsigned int frag_mem;
 	int orphans, sockets;
 
 	orphans = percpu_counter_sum_positive(&tcp_orphan_count);
@@ -72,8 +71,9 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
 		   sock_prot_inuse_get(net, &raw_prot));
-	frag_mem = ip_frag_mem(net);
-	seq_printf(seq,  "FRAG: inuse %u memory %u\n", !!frag_mem, frag_mem);
+	seq_printf(seq,  "FRAG: inuse %u memory %u\n",
+		   atomic_read(&net->ipv4.frags.rhashtable.nelems),
+		   frag_mem_limit(&net->ipv4.frags));
 	return 0;
 }
 
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 6e57028d2e9160be264d07f9312658fcb677a568..8befeb91e0712ecc4d05c4c0a6ecca1808dcbcac 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -38,7 +38,6 @@
 static int sockstat6_seq_show(struct seq_file *seq, void *v)
 {
 	struct net *net = seq->private;
-	unsigned int frag_mem = ip6_frag_mem(net);
 
 	seq_printf(seq, "TCP6: inuse %d\n",
 		       sock_prot_inuse_get(net, &tcpv6_prot));
@@ -48,7 +47,9 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
 			sock_prot_inuse_get(net, &udplitev6_prot));
 	seq_printf(seq, "RAW6: inuse %d\n",
 		       sock_prot_inuse_get(net, &rawv6_prot));
-	seq_printf(seq, "FRAG6: inuse %u memory %u\n", !!frag_mem, frag_mem);
+	seq_printf(seq, "FRAG6: inuse %u memory %u\n",
+		   atomic_read(&net->ipv6.frags.rhashtable.nelems),
+		   frag_mem_limit(&net->ipv6.frags));
 	return 0;
 }
 
-- 
2.17.0.rc1.321.gba9d0f2565-goog

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* [PATCH net-next 6/6] inet: frags: break the 2GB limit for frags storage
  2018-03-30  5:22 [PATCH net-next 1/6] ipv6: frag: remove unused field Eric Dumazet
                   ` (3 preceding siblings ...)
  2018-03-30  5:22 ` [PATCH net-next 5/6] inet: frags: remove some helpers Eric Dumazet
@ 2018-03-30  5:22 ` Eric Dumazet
  4 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2018-03-30  5:22 UTC (permalink / raw)
  To: David S . Miller
  Cc: netdev, Florian Westphal, Herbert Xu, Thomas Graf,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	Eric Dumazet, Eric Dumazet

Some users are willing to provision huge amounts of memory to be able
to perform reassembly reasonably well under pressure.

Current memory tracking is using one atomic_t and integers.

Switch to atomic_long_t so that 64bit arches can use more than 2GB,
without any cost for 32bit arches.

Tested:

$ echo 16000000000 >/proc/sys/net/ipv4/ipfrag_high_thresh

<frag DDOS>

$ grep FRAG /proc/net/sockstat
FRAG: inuse 14705885 memory 16000002880

$ nstat -n ; sleep 1 ; nstat | grep Reas
IpReasmReqds                    3317150            0.0
IpReasmFails                    3317112            0.0

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 Documentation/networking/ip-sysctl.txt  |  4 ++--
 include/net/inet_frag.h                 | 20 ++++++++++----------
 net/ieee802154/6lowpan/reassembly.c     | 10 +++++-----
 net/ipv4/ip_fragment.c                  | 10 +++++-----
 net/ipv4/proc.c                         |  2 +-
 net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++++-----
 net/ipv6/proc.c                         |  2 +-
 net/ipv6/reassembly.c                   |  6 +++---
 8 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index c3b65f24e58aa72b720861d816fb76f9956800f0..1c18aa6606b771fb6afb95e55fb880b2484775dd 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -133,10 +133,10 @@ min_adv_mss - INTEGER
 
 IP Fragmentation:
 
-ipfrag_high_thresh - INTEGER
+ipfrag_high_thresh - LONG INTEGER
 	Maximum memory used to reassemble IP fragments.
 
-ipfrag_low_thresh - INTEGER
+ipfrag_low_thresh - LONG INTEGER
 	(Obsolete since linux-4.17)
 	Maximum memory used to reassemble IP fragments before the kernel
 	begins to remove incomplete fragment queues to free up resources.
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index fae84c10679c012bddc4367bcd0d44e34bd51372..b68f5d4338b63701587deb5ba0e48a324920c332 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -6,14 +6,14 @@
 
 struct netns_frags {
 	/* sysctls */
+	long			high_thresh;
+	long			low_thresh;
 	int			timeout;
-	int			high_thresh;
-	int			low_thresh;
 	int			max_dist;
 	struct inet_frags	*f;
 
 	/* Keep atomic mem on separate cachelines in structs that include it */
-	atomic_t		mem ____cacheline_aligned_in_smp;
+	atomic_long_t		mem ____cacheline_aligned_in_smp;
 
 	struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
 };
@@ -103,7 +103,7 @@ void inet_frags_fini(struct inet_frags *);
 
 static inline int inet_frags_init_net(struct netns_frags *nf)
 {
-	atomic_set(&nf->mem, 0);
+	atomic_long_set(&nf->mem, 0);
 	return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
 }
 void inet_frags_exit_net(struct netns_frags *nf);
@@ -120,19 +120,19 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
 
 /* Memory Tracking Functions. */
 
-static inline int frag_mem_limit(struct netns_frags *nf)
+static inline long frag_mem_limit(const struct netns_frags *nf)
 {
-	return atomic_read(&nf->mem);
+	return atomic_long_read(&nf->mem);
 }
 
-static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
+static inline void sub_frag_mem_limit(struct netns_frags *nf, long val)
 {
-	atomic_sub(i, &nf->mem);
+	atomic_long_sub(val, &nf->mem);
 }
 
-static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
+static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
 {
-	atomic_add(i, &nf->mem);
+	atomic_long_add(val, &nf->mem);
 }
 
 /* RFC 3168 support :
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 9ee4d22666c26d6d9796d0f484bb4beb265dea42..d7125507c065cd23501f29664892d24c36050ea7 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -411,23 +411,23 @@ int lowpan_frag_rcv(struct sk_buff *skb, u8 frag_type)
 }
 
 #ifdef CONFIG_SYSCTL
-static int zero;
+static long zero;
 
 static struct ctl_table lowpan_frags_ns_ctl_table[] = {
 	{
 		.procname	= "6lowpanfrag_high_thresh",
 		.data		= &init_net.ieee802154_lowpan.frags.high_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &init_net.ieee802154_lowpan.frags.low_thresh
 	},
 	{
 		.procname	= "6lowpanfrag_low_thresh",
 		.data		= &init_net.ieee802154_lowpan.frags.low_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &init_net.ieee802154_lowpan.frags.high_thresh
 	},
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index ef864db73613b491fb430ff3b594c7286705a1b3..c1516e4a6a02a62972c2aa5d7a61ec3c26c2b73a 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -670,23 +670,23 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
 EXPORT_SYMBOL(ip_check_defrag);
 
 #ifdef CONFIG_SYSCTL
-static int zero;
+static long zero;
 
 static struct ctl_table ip4_frags_ns_ctl_table[] = {
 	{
 		.procname	= "ipfrag_high_thresh",
 		.data		= &init_net.ipv4.frags.high_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &init_net.ipv4.frags.low_thresh
 	},
 	{
 		.procname	= "ipfrag_low_thresh",
 		.data		= &init_net.ipv4.frags.low_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &init_net.ipv4.frags.high_thresh
 	},
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index aacfce0d7d82cf59269a69ef4d6ac8d9955b0bdc..a058de677e947846eb93020e0788148827c8f3cd 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -71,7 +71,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
 		   sock_prot_inuse_get(net, &udplite_prot));
 	seq_printf(seq, "RAW: inuse %d\n",
 		   sock_prot_inuse_get(net, &raw_prot));
-	seq_printf(seq,  "FRAG: inuse %u memory %u\n",
+	seq_printf(seq,  "FRAG: inuse %u memory %lu\n",
 		   atomic_read(&net->ipv4.frags.rhashtable.nelems),
 		   frag_mem_limit(&net->ipv4.frags));
 	return 0;
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index 53859311dea96c03fa5ae8456de32de25009efbe..14310372da06b956eefd6dfcbad9168e928b00e5 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -63,7 +63,7 @@ struct nf_ct_frag6_skb_cb
 static struct inet_frags nf_frags;
 
 #ifdef CONFIG_SYSCTL
-static int zero;
+static long zero;
 
 static struct ctl_table nf_ct_frag6_sysctl_table[] = {
 	{
@@ -76,18 +76,18 @@ static struct ctl_table nf_ct_frag6_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_frag6_low_thresh",
 		.data		= &init_net.nf_frag.frags.low_thresh,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &init_net.nf_frag.frags.high_thresh
 	},
 	{
 		.procname	= "nf_conntrack_frag6_high_thresh",
 		.data		= &init_net.nf_frag.frags.high_thresh,
-		.maxlen		= sizeof(unsigned int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &init_net.nf_frag.frags.low_thresh
 	},
 	{ }
diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c
index 8befeb91e0712ecc4d05c4c0a6ecca1808dcbcac..a85f7e0b14b10f59fdd2ea6901f8e9a95c13654f 100644
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -47,7 +47,7 @@ static int sockstat6_seq_show(struct seq_file *seq, void *v)
 			sock_prot_inuse_get(net, &udplitev6_prot));
 	seq_printf(seq, "RAW6: inuse %d\n",
 		       sock_prot_inuse_get(net, &rawv6_prot));
-	seq_printf(seq, "FRAG6: inuse %u memory %u\n",
+	seq_printf(seq, "FRAG6: inuse %u memory %lu\n",
 		   atomic_read(&net->ipv6.frags.rhashtable.nelems),
 		   frag_mem_limit(&net->ipv6.frags));
 	return 0;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 737b0921ab0c9af198fefdf06d8f4ede91c7f3f6..bedbded6433b78e603a5b48d6ea493d6d90291a8 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -552,15 +552,15 @@ static struct ctl_table ip6_frags_ns_ctl_table[] = {
 	{
 		.procname	= "ip6frag_high_thresh",
 		.data		= &init_net.ipv6.frags.high_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= proc_doulongvec_minmax,
 		.extra1		= &init_net.ipv6.frags.low_thresh
 	},
 	{
 		.procname	= "ip6frag_low_thresh",
 		.data		= &init_net.ipv6.frags.low_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &zero,
-- 
2.17.0.rc1.321.gba9d0f2565-goog

^ permalink raw reply related	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units
  2018-03-30  5:22 ` [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units Eric Dumazet
@ 2018-03-30  7:18   ` Herbert Xu
  2018-03-30 13:30     ` Eric Dumazet
  2018-03-30 11:44   ` Kirill Tkhai
  1 sibling, 1 reply; 11+ messages in thread
From: Herbert Xu @ 2018-03-30  7:18 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S . Miller, netdev, Florian Westphal, Thomas Graf,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	Eric Dumazet, Nikolay Aleksandrov

On Thu, Mar 29, 2018 at 10:22:39PM -0700, Eric Dumazet wrote:
>
>  void inet_frags_exit_net(struct netns_frags *nf)
>  {
> -	struct inet_frags *f =nf->f;
> -	unsigned int seq;
> -	int i;
> -
> -	nf->low_thresh = 0;
> +	struct rhashtable_iter hti;
> +	struct inet_frag_queue *fq;
>  
> -evict_again:
> -	local_bh_disable();
> -	seq = read_seqbegin(&f->rnd_seqlock);
> +	nf->low_thresh = 0; /* prevent creation of new frags */
>  
> -	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
> -		inet_evict_bucket(f, &f->hash[i]);
> +	rhashtable_walk_enter(&nf->rhashtable, &hti);
> +	do {
> +		rhashtable_walk_start(&hti);
>  
> -	local_bh_enable();
> -	cond_resched();
> +		while ((fq = rhashtable_walk_next(&hti)) && !IS_ERR(fq)) {
> +			if (refcount_inc_not_zero(&fq->refcnt)) {
> +				spin_lock_bh(&fq->lock);
> +				inet_frag_kill(fq);
> +				spin_unlock_bh(&fq->lock);
> +				inet_frag_put(fq);
> +			}
> +		}
>  
> -	if (read_seqretry(&f->rnd_seqlock, seq) ||
> -	    sum_frag_mem_limit(nf))
> -		goto evict_again;
> +		rhashtable_walk_stop(&hti);
> +	} while (cond_resched(), fq == ERR_PTR(-EAGAIN));
> +	rhashtable_walk_exit(&hti);
> +	rhashtable_destroy(&nf->rhashtable);
>  }
>  EXPORT_SYMBOL(inet_frags_exit_net);

Instead of using the walk interface, how about
rhashtable_free_and_destroy?
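
A rough, untested sketch of what that could look like (the refcount
handling below is only an assumption, based on the 3-reference scheme
this patch uses: one for the hash table, one for the timer, one for
the lookup caller):

static void inet_frags_free_cb(void *ptr, void *arg)
{
	struct inet_frag_queue *fq = ptr;

	/* If the timer cannot be cancelled, the expire handler is
	 * already running and will drop its own references.
	 */
	if (!del_timer(&fq->timer))
		return;

	spin_lock_bh(&fq->lock);
	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		/* Release the hash table's reference; the table itself
		 * is being torn down, so no rhashtable_remove_fast() here.
		 */
		refcount_dec(&fq->refcnt);
	}
	spin_unlock_bh(&fq->lock);

	inet_frag_put(fq);	/* drop the reference the timer held */
}

void inet_frags_exit_net(struct netns_frags *nf)
{
	nf->low_thresh = 0;	/* prevent creation of new frags */

	rhashtable_free_and_destroy(&nf->rhashtable,
				    inet_frags_free_cb, NULL);
}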

>  void inet_frag_kill(struct inet_frag_queue *fq)
>  {
>  	if (del_timer(&fq->timer))
>  		refcount_dec(&fq->refcnt);
>  
>  	if (!(fq->flags & INET_FRAG_COMPLETE)) {
> -		fq_unlink(fq);
> +		struct netns_frags *nf = fq->net;
> +
> +		fq->flags |= INET_FRAG_COMPLETE;
> +		rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
>  		refcount_dec(&fq->refcnt);
>  	}
>  }

This means that the hash won't inline properly.  I don't know how big
of an issue that is for you.  But you could fix it by doing the same
hack as rhashtable: make inet_frag_kill an inline function and have it
take the rhash_params as an explicit argument.
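
Something like this (untested; passing the params by value from the
header is an assumption here, it simply mirrors what rhashtable.h
itself does for its _fast helpers):

static inline void inet_frag_kill(struct inet_frag_queue *fq,
				  const struct rhashtable_params params)
{
	if (del_timer(&fq->timer))
		refcount_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		struct netns_frags *nf = fq->net;

		fq->flags |= INET_FRAG_COMPLETE;
		/* params is a compile-time constant at every call site,
		 * so rhashtable_remove_fast() can be fully inlined.
		 */
		rhashtable_remove_fast(&nf->rhashtable, &fq->node, params);
		refcount_dec(&fq->refcnt);
	}
}

with call sites then becoming e.g. inet_frag_kill(&qp->q, ip4_rhash_params);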

> -static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
> -						struct inet_frags *f,
> -						void *arg)
> -{
> -	struct inet_frag_queue *q;
> +	timer_setup(&fq->timer, f->frag_expire, 0);
> +	spin_lock_init(&fq->lock);
> +	refcount_set(&fq->refcnt, 3);
> +	mod_timer(&fq->timer, jiffies + nf->timeout);
>  
> -	q = inet_frag_alloc(nf, f, arg);
> -	if (!q)
> +	err = rhashtable_insert_fast(&nf->rhashtable, &fq->node,
> +				     f->rhash_params);

Ditto.

> -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
> -				       struct inet_frags *f, void *key,
> -				       unsigned int hash)
> +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
> +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
>  {
> -	struct inet_frag_bucket *hb;
> -	struct inet_frag_queue *q;
> -	int depth = 0;
> -
> -	if (frag_mem_limit(nf) > nf->low_thresh)
> -		inet_frag_schedule_worker(f);
> -
> -	hash &= (INETFRAGS_HASHSZ - 1);
> -	hb = &f->hash[hash];
> -
> -	spin_lock(&hb->chain_lock);
> -	hlist_for_each_entry(q, &hb->chain, list) {
> -		if (q->net == nf && f->match(q, key)) {
> -			refcount_inc(&q->refcnt);
> -			spin_unlock(&hb->chain_lock);
> -			return q;
> -		}
> -		depth++;
> -	}
> -	spin_unlock(&hb->chain_lock);
> +	struct inet_frag_queue *fq;
>  
> -	if (depth <= INETFRAGS_MAXDEPTH)
> -		return inet_frag_create(nf, f, key);
> +	rcu_read_lock();
>  
> -	if (inet_frag_may_rebuild(f)) {
> -		if (!f->rebuild)
> -			f->rebuild = true;
> -		inet_frag_schedule_worker(f);
> +	fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);

Ditto.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units
  2018-03-30  5:22 ` [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units Eric Dumazet
  2018-03-30  7:18   ` Herbert Xu
@ 2018-03-30 11:44   ` Kirill Tkhai
  2018-03-30 16:09     ` Eric Dumazet
  1 sibling, 1 reply; 11+ messages in thread
From: Kirill Tkhai @ 2018-03-30 11:44 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller
  Cc: netdev, Florian Westphal, Herbert Xu, Thomas Graf,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	Eric Dumazet, Nikolay Aleksandrov

Hi, Eric,

On 30.03.2018 08:22, Eric Dumazet wrote:
> Some applications still rely on IP fragmentation, and to be fair linux
> reassembly unit is not working under any serious load.
> 
> It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!)
> 
> A work queue is supposed to garbage collect items when host is under memory
> pressure, and doing a hash rebuild, changing seed used in hash computations.
> 
> This work queue blocks softirqs for up to 25 ms when doing a hash rebuild,
> occurring every 5 seconds if host is under fire.
> 
> Then there is the problem of sharing this hash table for all netns.
> 
> It is time to switch to rhashtables, and allocate one of them per netns
> to speedup netns dismantle, since this is a critical metric these days.
> 
> Lookup is now using RCU. A followup patch will even remove
> the refcount hold/release left from prior implementation and save
> a couple of atomic operations.
> 
> Before this patch, 16 cpus (16 RX queue NIC) could not handle more
> than 1 Mpps frags DDOS.
> 
> After the patch, I reach 7 Mpps without any tuning, and can use up to 2GB
> of storage for the fragments.

Great results!

Please see some comments below.
 
> $ grep FRAG /proc/net/sockstat
> FRAG: inuse 1966916 memory 2140004608
> 
> A followup patch will change the limits for 64bit arches.
> 
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> Cc: Florian Westphal <fw@strlen.de>
> Cc: Nikolay Aleksandrov <nikolay@redhat.com>
> Cc: Jesper Dangaard Brouer <brouer@redhat.com>
> Cc: Alexander Aring <alex.aring@gmail.com>
> Cc: Stefan Schmidt <stefan@osg.samsung.com>
> ---
>  Documentation/networking/ip-sysctl.txt  |   7 +-
>  include/net/inet_frag.h                 |  99 +++---
>  include/net/ipv6.h                      |  20 +-
>  net/ieee802154/6lowpan/6lowpan_i.h      |  26 +-
>  net/ieee802154/6lowpan/reassembly.c     | 108 +++----
>  net/ipv4/inet_fragment.c                | 399 +++++-------------------
>  net/ipv4/ip_fragment.c                  | 165 +++++-----
>  net/ipv6/netfilter/nf_conntrack_reasm.c |  62 ++--
>  net/ipv6/reassembly.c                   | 152 +++++----
>  9 files changed, 344 insertions(+), 694 deletions(-)
> 
> diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
> index 1d1120753ae82d0aee3e934a3d9c074b70dcbca6..c3b65f24e58aa72b720861d816fb76f9956800f0 100644
> --- a/Documentation/networking/ip-sysctl.txt
> +++ b/Documentation/networking/ip-sysctl.txt
> @@ -134,13 +134,10 @@ min_adv_mss - INTEGER
>  IP Fragmentation:
>  
>  ipfrag_high_thresh - INTEGER
> -	Maximum memory used to reassemble IP fragments. When
> -	ipfrag_high_thresh bytes of memory is allocated for this purpose,
> -	the fragment handler will toss packets until ipfrag_low_thresh
> -	is reached. This also serves as a maximum limit to namespaces
> -	different from the initial one.
> +	Maximum memory used to reassemble IP fragments.
>  
>  ipfrag_low_thresh - INTEGER
> +	(Obsolete since linux-4.17)
>  	Maximum memory used to reassemble IP fragments before the kernel
>  	begins to remove incomplete fragment queues to free up resources.
>  	The kernel still accepts new fragments for defragmentation.
> diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
> index 69e531ed81894393e07cac9e953825fcb55ef42a..05099f9f980e2384c0c8cd7e74659656b585cd22 100644
> --- a/include/net/inet_frag.h
> +++ b/include/net/inet_frag.h
> @@ -2,15 +2,20 @@
>  #ifndef __NET_FRAG_H__
>  #define __NET_FRAG_H__
>  
> +#include <linux/rhashtable.h>
> +
>  struct netns_frags {
> -	/* Keep atomic mem on separate cachelines in structs that include it */
> -	atomic_t		mem ____cacheline_aligned_in_smp;
>  	/* sysctls */
>  	int			timeout;
>  	int			high_thresh;
>  	int			low_thresh;
>  	int			max_dist;
>  	struct inet_frags	*f;
> +
> +	/* Keep atomic mem on separate cachelines in structs that include it */
> +	atomic_t		mem ____cacheline_aligned_in_smp;

The patch is big, and it seems it's possible to extract refactorings like
this into a separate patch or patches. Here, just two lines are moved down.

> +
> +	struct rhashtable       rhashtable ____cacheline_aligned_in_smp;
>  };
>  
>  /**
> @@ -26,12 +31,31 @@ enum {
>  	INET_FRAG_COMPLETE	= BIT(2),
>  };
>  
> +struct frag_v4_compare_key {
> +	__be32		saddr;
> +	__be32		daddr;
> +	u32		user;
> +	u32		vif;
> +	__be16		id;
> +	u16		protocol;
> +};
> +
> +struct frag_v6_compare_key {
> +	struct in6_addr	saddr;
> +	struct in6_addr	daddr;
> +	u32		user;
> +	__be32		id;
> +	u32		iif;
> +};
> +
>  /**
>   * struct inet_frag_queue - fragment queue
>   *
> - * @lock: spinlock protecting the queue
> + * @node: rhash node
> + * @key: keys identifying this frag.
>   * @timer: queue expiration timer
> - * @list: hash bucket list
> + * @net: namespace that this frag belongs to
> + * @lock: spinlock protecting this frag
>   * @refcnt: reference count of the queue
>   * @fragments: received fragments head
>   * @fragments_tail: received fragments tail
> @@ -40,66 +64,38 @@ enum {
>   * @meat: length of received fragments so far
>   * @flags: fragment queue flags
>   * @max_size: maximum received fragment size
> - * @net: namespace that this frag belongs to
> - * @list_evictor: list of queues to forcefully evict (e.g. due to low memory)
> + * @rcu: rcu head for freeing deferall
>   */
>  struct inet_frag_queue {
> -	spinlock_t		lock;
> +	struct rhash_head	node;
> +	union {
> +		struct frag_v4_compare_key v4;
> +		struct frag_v6_compare_key v6;
> +	} key;
>  	struct timer_list	timer;
> -	struct hlist_node	list;
> +	struct netns_frags      *net;
> +	spinlock_t		lock;

Here, lock and net just change their position within the struct.

>  	refcount_t		refcnt;
>  	struct sk_buff		*fragments;
>  	struct sk_buff		*fragments_tail;
>  	ktime_t			stamp;
>  	int			len;
>  	int			meat;
> -	__u8			flags;
> +	u8			flags;

Here, just the type is changed.

>  	u16			max_size;
> -	struct netns_frags	*net;
> -	struct hlist_node	list_evictor;
> -};
> -
> -#define INETFRAGS_HASHSZ	1024
> -
> -/* averaged:
> - * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ /
> - *	       rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or
> - *	       struct frag_queue))
> - */
> -#define INETFRAGS_MAXDEPTH	128
> -
> -struct inet_frag_bucket {
> -	struct hlist_head	chain;
> -	spinlock_t		chain_lock;
> +	struct rcu_head		rcu;
>  };
>  
>  struct inet_frags {
> -	struct inet_frag_bucket	hash[INETFRAGS_HASHSZ];
> -
> -	struct work_struct	frags_work;
> -	unsigned int next_bucket;
> -	unsigned long last_rebuild_jiffies;
> -	bool rebuild;
> -
> -	/* The first call to hashfn is responsible to initialize
> -	 * rnd. This is best done with net_get_random_once.
> -	 *
> -	 * rnd_seqlock is used to let hash insertion detect
> -	 * when it needs to re-lookup the hash chain to use.
> -	 */
> -	u32			rnd;
> -	seqlock_t		rnd_seqlock;
>  	unsigned int		qsize;
>  
> -	unsigned int		(*hashfn)(const struct inet_frag_queue *);
> -	bool			(*match)(const struct inet_frag_queue *q,
> -					 const void *arg);
> -	void			(*constructor)(struct inet_frag_queue *q,
> +	void			(*constructor)(struct inet_frag_queue *fq,

Here, just the parameter name is changed.

>  					       const void *arg);
> -	void			(*destructor)(struct inet_frag_queue *);
> +	void			(*destructor)(struct inet_frag_queue *fq);

The same as above.

>  	void			(*frag_expire)(struct timer_list *t);
>  	struct kmem_cache	*frags_cachep;
>  	const char		*frags_cache_name;
> +	struct rhashtable_params rhash_params;
>  };
>  
>  int inet_frags_init(struct inet_frags *);
> @@ -108,17 +104,13 @@ void inet_frags_fini(struct inet_frags *);
>  static inline int inet_frags_init_net(struct netns_frags *nf)
>  {
>  	atomic_set(&nf->mem, 0);
> -	return 0;
> +	return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params);
>  }
>  void inet_frags_exit_net(struct netns_frags *nf);
>  
>  void inet_frag_kill(struct inet_frag_queue *q);
>  void inet_frag_destroy(struct inet_frag_queue *q);
> -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
> -		struct inet_frags *f, void *key, unsigned int hash);
> -
> -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
> -				   const char *prefix);
> +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key);
>  
>  static inline void inet_frag_put(struct inet_frag_queue *q)
>  {
> @@ -126,11 +118,6 @@ static inline void inet_frag_put(struct inet_frag_queue *q)
>  		inet_frag_destroy(q);
>  }
>  
> -static inline bool inet_frag_evicting(struct inet_frag_queue *q)
> -{
> -	return !hlist_unhashed(&q->list_evictor);
> -}
> -
>  /* Memory Tracking Functions. */
>  
>  static inline int frag_mem_limit(struct netns_frags *nf)
> diff --git a/include/net/ipv6.h b/include/net/ipv6.h
> index 5c18836672e9d1c560cdce15f5b34928c337abfd..76f84d4be91b92761fb9a26e7f52e2101ee34c0a 100644
> --- a/include/net/ipv6.h
> +++ b/include/net/ipv6.h
> @@ -579,36 +579,20 @@ enum ip6_defrag_users {
>  	__IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX,
>  };
>  
> -struct ip6_create_arg {
> -	__be32 id;
> -	u32 user;
> -	const struct in6_addr *src;
> -	const struct in6_addr *dst;
> -	int iif;
> -	u8 ecn;
> -};
> -
>  void ip6_frag_init(struct inet_frag_queue *q, const void *a);
> -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
>  
>  /*
> - *	Equivalent of ipv4 struct ip
> + *	Equivalent of ipv4 struct ipq
>   */
>  struct frag_queue {
>  	struct inet_frag_queue	q;
>  
> -	__be32			id;		/* fragment id		*/
> -	u32			user;
> -	struct in6_addr		saddr;
> -	struct in6_addr		daddr;
> -
>  	int			iif;
>  	__u16			nhoffset;
>  	u8			ecn;
>  };
>  
> -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
> -			   struct inet_frags *frags);
> +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq);
>  
>  static inline bool ipv6_addr_any(const struct in6_addr *a)
>  {
> diff --git a/net/ieee802154/6lowpan/6lowpan_i.h b/net/ieee802154/6lowpan/6lowpan_i.h
> index d8de3bcfb1032a1133402cb2a4c50a2448133846..b8d95cb71c25dd69c8a88b2c886a3f0d2ce1174f 100644
> --- a/net/ieee802154/6lowpan/6lowpan_i.h
> +++ b/net/ieee802154/6lowpan/6lowpan_i.h
> @@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_result;
>  #define LOWPAN_DISPATCH_FRAG1           0xc0
>  #define LOWPAN_DISPATCH_FRAGN           0xe0
>  
> -struct lowpan_create_arg {
> +struct frag_lowpan_compare_key {
>  	u16 tag;
>  	u16 d_size;
> -	const struct ieee802154_addr *src;
> -	const struct ieee802154_addr *dst;
> +	const struct ieee802154_addr src;
> +	const struct ieee802154_addr dst;
>  };
>  
> -/* Equivalent of ipv4 struct ip
> +/* Equivalent of ipv4 struct ipq
>   */
>  struct lowpan_frag_queue {
>  	struct inet_frag_queue	q;
> -
> -	u16			tag;
> -	u16			d_size;
> -	struct ieee802154_addr	saddr;
> -	struct ieee802154_addr	daddr;
>  };
>  
> -static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a)
> -{
> -	switch (a->mode) {
> -	case IEEE802154_ADDR_LONG:
> -		return (((__force u64)a->extended_addr) >> 32) ^
> -			(((__force u64)a->extended_addr) & 0xffffffff);
> -	case IEEE802154_ADDR_SHORT:
> -		return (__force u32)(a->short_addr + (a->pan_id << 16));
> -	default:
> -		return 0;
> -	}
> -}
> -
>  int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type);
>  void lowpan_net_frag_exit(void);
>  int lowpan_net_frag_init(void);
> diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
> index 6badc055555b7baedac2051a1aaea15f9e9b180c..9ee4d22666c26d6d9796d0f484bb4beb265dea42 100644
> --- a/net/ieee802154/6lowpan/reassembly.c
> +++ b/net/ieee802154/6lowpan/reassembly.c
> @@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags;
>  static int lowpan_frag_reasm(struct lowpan_frag_queue *fq,
>  			     struct sk_buff *prev, struct net_device *ldev);
>  
> -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size,
> -				     const struct ieee802154_addr *saddr,
> -				     const struct ieee802154_addr *daddr)
> -{
> -	net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd));
> -	return jhash_3words(ieee802154_addr_hash(saddr),
> -			    ieee802154_addr_hash(daddr),
> -			    (__force u32)(tag + (d_size << 16)),
> -			    lowpan_frags.rnd);
> -}
> -
> -static unsigned int lowpan_hashfn(const struct inet_frag_queue *q)
> -{
> -	const struct lowpan_frag_queue *fq;
> -
> -	fq = container_of(q, struct lowpan_frag_queue, q);
> -	return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr);
> -}
> -
> -static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a)
> -{
> -	const struct lowpan_frag_queue *fq;
> -	const struct lowpan_create_arg *arg = a;
> -
> -	fq = container_of(q, struct lowpan_frag_queue, q);
> -	return	fq->tag == arg->tag && fq->d_size == arg->d_size &&
> -		ieee802154_addr_equal(&fq->saddr, arg->src) &&
> -		ieee802154_addr_equal(&fq->daddr, arg->dst);
> -}
> -
>  static void lowpan_frag_init(struct inet_frag_queue *q, const void *a)
>  {
> -	const struct lowpan_create_arg *arg = a;
> +	const struct frag_lowpan_compare_key *key = a;
>  	struct lowpan_frag_queue *fq;
>  
>  	fq = container_of(q, struct lowpan_frag_queue, q);
>  
> -	fq->tag = arg->tag;
> -	fq->d_size = arg->d_size;
> -	fq->saddr = *arg->src;
> -	fq->daddr = *arg->dst;
> +	BUILD_BUG_ON(sizeof(*key) > sizeof(q->key));
> +	memcpy(&q->key, key, sizeof(*key));
>  }
>  
>  static void lowpan_frag_expire(struct timer_list *t)
> @@ -105,25 +73,20 @@ fq_find(struct net *net, const struct lowpan_802154_cb *cb,
>  	const struct ieee802154_addr *src,
>  	const struct ieee802154_addr *dst)
>  {
> -	struct inet_frag_queue *q;
> -	struct lowpan_create_arg arg;
> -	unsigned int hash;
>  	struct netns_ieee802154_lowpan *ieee802154_lowpan =
>  		net_ieee802154_lowpan(net);
> +	struct frag_lowpan_compare_key key = {
> +		.tag = cb->d_tag,
> +		.d_size = cb->d_size,
> +		.src = *src,
> +		.dst = *dst,
> +	};
> +	struct inet_frag_queue *q;
>  
> -	arg.tag = cb->d_tag;
> -	arg.d_size = cb->d_size;
> -	arg.src = src;
> -	arg.dst = dst;
> -
> -	hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst);
> -
> -	q = inet_frag_find(&ieee802154_lowpan->frags,
> -			   &lowpan_frags, &arg, hash);
> -	if (IS_ERR_OR_NULL(q)) {
> -		inet_frag_maybe_warn_overflow(q, pr_fmt());
> +	q = inet_frag_find(&ieee802154_lowpan->frags, &key);
> +	if (IS_ERR_OR_NULL(q))
>  		return NULL;
> -	}
> +
>  	return container_of(q, struct lowpan_frag_queue, q);
>  }
>  
> @@ -588,6 +551,7 @@ static int __net_init lowpan_frags_init_net(struct net *net)
>  	ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
>  	ieee802154_lowpan->frags.f = &lowpan_frags;
>  
> +	ieee802154_lowpan->frags.f = &lowpan_frags;
>  	res = inet_frags_init_net(&ieee802154_lowpan->frags);
>  	if (res < 0)
>  		return res;
> @@ -611,6 +575,36 @@ static struct pernet_operations lowpan_frags_ops = {
>  	.exit = lowpan_frags_exit_net,
>  };
>  
> +static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	return jhash2(data,
> +		      sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
> +}
> +
> +static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	const struct inet_frag_queue *fq = data;
> +
> +	return jhash2((const u32 *)&fq->key,
> +		      sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed);
> +}
> +
> +static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
> +{
> +	const struct frag_lowpan_compare_key *key = arg->key;
> +	const struct inet_frag_queue *fq = ptr;
> +
> +	return !!memcmp(&fq->key, key, sizeof(*key));
> +}
> +
> +const struct rhashtable_params lowpan_rhash_params = {
> +	.head_offset		= offsetof(struct inet_frag_queue, node),
> +	.hashfn			= lowpan_key_hashfn,
> +	.obj_hashfn		= lowpan_obj_hashfn,
> +	.obj_cmpfn		= lowpan_obj_cmpfn,
> +	.automatic_shrinking	= true,
> +};
> +
>  int __init lowpan_net_frag_init(void)
>  {
>  	int ret;
> @@ -619,22 +613,24 @@ int __init lowpan_net_frag_init(void)
>  	if (ret)
>  		return ret;
>  
> -	ret = register_pernet_subsys(&lowpan_frags_ops);
> -	if (ret)
> -		goto err_pernet;
> -
> -	lowpan_frags.hashfn = lowpan_hashfn;
>  	lowpan_frags.constructor = lowpan_frag_init;
>  	lowpan_frags.destructor = NULL;
>  	lowpan_frags.qsize = sizeof(struct frag_queue);
> -	lowpan_frags.match = lowpan_frag_match;
>  	lowpan_frags.frag_expire = lowpan_frag_expire;
>  	lowpan_frags.frags_cache_name = lowpan_frags_cache_name;
> +	lowpan_frags.rhash_params = lowpan_rhash_params;
>  	ret = inet_frags_init(&lowpan_frags);
>  	if (ret)
>  		goto err_pernet;
>  
> +	ret = register_pernet_subsys(&lowpan_frags_ops);
> +	if (ret)
> +		goto err_pernet_frags;
> +

Can't we move this register_pernet_subsys() into a separate patch?

>  	return ret;
> +
> +err_pernet_frags:
> +	inet_frags_fini(&lowpan_frags);
>  err_pernet:
>  	lowpan_frags_sysctl_unregister();
>  	return ret;
> diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
> index 1ac69f65d0dee600d0ab4db20ff5942952932c40..8ccaf605630f14270996ee1b5a37376299d78661 100644
> --- a/net/ipv4/inet_fragment.c
> +++ b/net/ipv4/inet_fragment.c
> @@ -25,12 +25,6 @@
>  #include <net/inet_frag.h>
>  #include <net/inet_ecn.h>
>  
> -#define INETFRAGS_EVICT_BUCKETS   128
> -#define INETFRAGS_EVICT_MAX	  512
> -
> -/* don't rebuild inetfrag table with new secret more often than this */
> -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ)
> -
>  /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
>   * Value : 0xff if frame should be dropped.
>   *         0 or INET_ECN_CE value, to be ORed in to final iph->tos field
> @@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = {
>  };
>  EXPORT_SYMBOL(ip_frag_ecn_table);
>  
> -static unsigned int
> -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q)
> -{
> -	return f->hashfn(q) & (INETFRAGS_HASHSZ - 1);
> -}
> -
> -static bool inet_frag_may_rebuild(struct inet_frags *f)
> -{
> -	return time_after(jiffies,
> -	       f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL);
> -}
> -
> -static void inet_frag_secret_rebuild(struct inet_frags *f)
> -{
> -	int i;
> -
> -	write_seqlock_bh(&f->rnd_seqlock);
> -
> -	if (!inet_frag_may_rebuild(f))
> -		goto out;
> -
> -	get_random_bytes(&f->rnd, sizeof(u32));
> -
> -	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
> -		struct inet_frag_bucket *hb;
> -		struct inet_frag_queue *q;
> -		struct hlist_node *n;
> -
> -		hb = &f->hash[i];
> -		spin_lock(&hb->chain_lock);
> -
> -		hlist_for_each_entry_safe(q, n, &hb->chain, list) {
> -			unsigned int hval = inet_frag_hashfn(f, q);
> -
> -			if (hval != i) {
> -				struct inet_frag_bucket *hb_dest;
> -
> -				hlist_del(&q->list);
> -
> -				/* Relink to new hash chain. */
> -				hb_dest = &f->hash[hval];
> -
> -				/* This is the only place where we take
> -				 * another chain_lock while already holding
> -				 * one.  As this will not run concurrently,
> -				 * we cannot deadlock on hb_dest lock below, if its
> -				 * already locked it will be released soon since
> -				 * other caller cannot be waiting for hb lock
> -				 * that we've taken above.
> -				 */
> -				spin_lock_nested(&hb_dest->chain_lock,
> -						 SINGLE_DEPTH_NESTING);
> -				hlist_add_head(&q->list, &hb_dest->chain);
> -				spin_unlock(&hb_dest->chain_lock);
> -			}
> -		}
> -		spin_unlock(&hb->chain_lock);
> -	}
> -
> -	f->rebuild = false;
> -	f->last_rebuild_jiffies = jiffies;
> -out:
> -	write_sequnlock_bh(&f->rnd_seqlock);
> -}
> -
> -static bool inet_fragq_should_evict(const struct inet_frag_queue *q)
> -{
> -	if (!hlist_unhashed(&q->list_evictor))
> -		return false;
> -
> -	return q->net->low_thresh == 0 ||
> -	       frag_mem_limit(q->net) >= q->net->low_thresh;
> -}
> -
> -static unsigned int
> -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb)
> -{
> -	struct inet_frag_queue *fq;
> -	struct hlist_node *n;
> -	unsigned int evicted = 0;
> -	HLIST_HEAD(expired);
> -
> -	spin_lock(&hb->chain_lock);
> -
> -	hlist_for_each_entry_safe(fq, n, &hb->chain, list) {
> -		if (!inet_fragq_should_evict(fq))
> -			continue;
> -
> -		if (!del_timer(&fq->timer))
> -			continue;
> -
> -		hlist_add_head(&fq->list_evictor, &expired);
> -		++evicted;
> -	}
> -
> -	spin_unlock(&hb->chain_lock);
> -
> -	hlist_for_each_entry_safe(fq, n, &expired, list_evictor)
> -		f->frag_expire(&fq->timer);
> -
> -	return evicted;
> -}
> -
> -static void inet_frag_worker(struct work_struct *work)
> -{
> -	unsigned int budget = INETFRAGS_EVICT_BUCKETS;
> -	unsigned int i, evicted = 0;
> -	struct inet_frags *f;
> -
> -	f = container_of(work, struct inet_frags, frags_work);
> -
> -	BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ);
> -
> -	local_bh_disable();
> -
> -	for (i = READ_ONCE(f->next_bucket); budget; --budget) {
> -		evicted += inet_evict_bucket(f, &f->hash[i]);
> -		i = (i + 1) & (INETFRAGS_HASHSZ - 1);
> -		if (evicted > INETFRAGS_EVICT_MAX)
> -			break;
> -	}
> -
> -	f->next_bucket = i;
> -
> -	local_bh_enable();
> -
> -	if (f->rebuild && inet_frag_may_rebuild(f))
> -		inet_frag_secret_rebuild(f);
> -}
> -
> -static void inet_frag_schedule_worker(struct inet_frags *f)
> -{
> -	if (unlikely(!work_pending(&f->frags_work)))
> -		schedule_work(&f->frags_work);
> -}
> -
>  int inet_frags_init(struct inet_frags *f)
>  {
> -	int i;
> -
> -	INIT_WORK(&f->frags_work, inet_frag_worker);
> -
> -	for (i = 0; i < INETFRAGS_HASHSZ; i++) {
> -		struct inet_frag_bucket *hb = &f->hash[i];
> -
> -		spin_lock_init(&hb->chain_lock);
> -		INIT_HLIST_HEAD(&hb->chain);
> -	}
> -
> -	seqlock_init(&f->rnd_seqlock);
> -	f->last_rebuild_jiffies = 0;
>  	f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0,
>  					    NULL);
>  	if (!f->frags_cachep)
> @@ -214,93 +59,80 @@ EXPORT_SYMBOL(inet_frags_init);
>  
>  void inet_frags_fini(struct inet_frags *f)
>  {
> -	cancel_work_sync(&f->frags_work);
> +	rcu_barrier();

What does this barrier wait for? This should have a comment.

>  	kmem_cache_destroy(f->frags_cachep);
> +	f->frags_cachep = NULL;
>  }
>  EXPORT_SYMBOL(inet_frags_fini);
>  
>  void inet_frags_exit_net(struct netns_frags *nf)
>  {
> -	struct inet_frags *f =nf->f;
> -	unsigned int seq;
> -	int i;
> -
> -	nf->low_thresh = 0;
> +	struct rhashtable_iter hti;
> +	struct inet_frag_queue *fq;
>  
> -evict_again:
> -	local_bh_disable();
> -	seq = read_seqbegin(&f->rnd_seqlock);
> +	nf->low_thresh = 0; /* prevent creation of new frags */
>  
> -	for (i = 0; i < INETFRAGS_HASHSZ ; i++)
> -		inet_evict_bucket(f, &f->hash[i]);
> +	rhashtable_walk_enter(&nf->rhashtable, &hti);
> +	do {
> +		rhashtable_walk_start(&hti);
>  
> -	local_bh_enable();
> -	cond_resched();
> +		while ((fq = rhashtable_walk_next(&hti)) && !IS_ERR(fq)) {
> +			if (refcount_inc_not_zero(&fq->refcnt)) {
> +				spin_lock_bh(&fq->lock);
> +				inet_frag_kill(fq);
> +				spin_unlock_bh(&fq->lock);
> +				inet_frag_put(fq);
> +			}
> +		}
>  
> -	if (read_seqretry(&f->rnd_seqlock, seq) ||
> -	    sum_frag_mem_limit(nf))
> -		goto evict_again;
> +		rhashtable_walk_stop(&hti);
> +	} while (cond_resched(), fq == ERR_PTR(-EAGAIN));
> +	rhashtable_walk_exit(&hti);
> +	rhashtable_destroy(&nf->rhashtable);
>  }
>  EXPORT_SYMBOL(inet_frags_exit_net);
>  
> -static struct inet_frag_bucket *
> -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f)
> -__acquires(hb->chain_lock)
> -{
> -	struct inet_frag_bucket *hb;
> -	unsigned int seq, hash;
> -
> - restart:
> -	seq = read_seqbegin(&f->rnd_seqlock);
> -
> -	hash = inet_frag_hashfn(f, fq);
> -	hb = &f->hash[hash];
> -
> -	spin_lock(&hb->chain_lock);
> -	if (read_seqretry(&f->rnd_seqlock, seq)) {
> -		spin_unlock(&hb->chain_lock);
> -		goto restart;
> -	}
> -
> -	return hb;
> -}
> -
> -static inline void fq_unlink(struct inet_frag_queue *fq)
> -{
> -	struct inet_frag_bucket *hb;
> -
> -	hb = get_frag_bucket_locked(fq, fq->net->f);
> -	hlist_del(&fq->list);
> -	fq->flags |= INET_FRAG_COMPLETE;
> -	spin_unlock(&hb->chain_lock);
> -}
> -
>  void inet_frag_kill(struct inet_frag_queue *fq)
>  {
>  	if (del_timer(&fq->timer))
>  		refcount_dec(&fq->refcnt);
>  
>  	if (!(fq->flags & INET_FRAG_COMPLETE)) {
> -		fq_unlink(fq);
> +		struct netns_frags *nf = fq->net;
> +
> +		fq->flags |= INET_FRAG_COMPLETE;
> +		rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
>  		refcount_dec(&fq->refcnt);
>  	}
>  }
>  EXPORT_SYMBOL(inet_frag_kill);
>  
> -void inet_frag_destroy(struct inet_frag_queue *q)
> +static void inet_frag_destroy_rcu(struct rcu_head *head)
>  {
> -	struct sk_buff *fp;
> +	struct inet_frag_queue *fq = container_of(head, struct inet_frag_queue,
> +						 rcu);
> +	struct inet_frags *f = fq->net->f;
> +
> +	if (f->destructor)
> +		f->destructor(fq);
> +	kmem_cache_free(f->frags_cachep, fq);
> +}
> +
> +void inet_frag_destroy(struct inet_frag_queue *fq)
> +{
> +	unsigned int sum_truesize;
>  	struct netns_frags *nf;
> -	unsigned int sum, sum_truesize = 0;
>  	struct inet_frags *f;
> +	struct sk_buff *fp;
>  
> -	WARN_ON(!(q->flags & INET_FRAG_COMPLETE));
> -	WARN_ON(del_timer(&q->timer) != 0);
> +	WARN_ON(!(fq->flags & INET_FRAG_COMPLETE));
> +	WARN_ON(del_timer(&fq->timer) != 0);

This is actually the result of renaming the variable.
The type of the variable remains the same, while the name
has changed. So, this should go in a separate patch.

>  
>  	/* Release all fragment data. */
> -	fp = q->fragments;
> -	nf = q->net;
> +	fp = fq->fragments;
> +	nf = fq->net;

The same here

>  	f = nf->f;
> +	sum_truesize = f->qsize;
>  	while (fp) {
>  		struct sk_buff *xp = fp->next;
>  
> @@ -308,136 +140,63 @@ void inet_frag_destroy(struct inet_frag_queue *q)
>  		kfree_skb(fp);
>  		fp = xp;
>  	}
> -	sum = sum_truesize + f->qsize;
>  
> -	if (f->destructor)
> -		f->destructor(q);
> -	kmem_cache_free(f->frags_cachep, q);
> +	call_rcu(&fq->rcu, inet_frag_destroy_rcu);
>  
> -	sub_frag_mem_limit(nf, sum);
> +	sub_frag_mem_limit(nf, sum_truesize);
>  }
>  EXPORT_SYMBOL(inet_frag_destroy);
>  
> -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf,
> -						struct inet_frag_queue *qp_in,
> -						struct inet_frags *f,
> +static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
>  						void *arg)
>  {
> -	struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f);
> -	struct inet_frag_queue *qp;
> -
> -#ifdef CONFIG_SMP
> -	/* With SMP race we have to recheck hash table, because
> -	 * such entry could have been created on other cpu before
> -	 * we acquired hash bucket lock.
> -	 */
> -	hlist_for_each_entry(qp, &hb->chain, list) {
> -		if (qp->net == nf && f->match(qp, arg)) {
> -			refcount_inc(&qp->refcnt);
> -			spin_unlock(&hb->chain_lock);
> -			qp_in->flags |= INET_FRAG_COMPLETE;
> -			inet_frag_put(qp_in);
> -			return qp;
> -		}
> -	}
> -#endif
> -	qp = qp_in;
> -	if (!mod_timer(&qp->timer, jiffies + nf->timeout))
> -		refcount_inc(&qp->refcnt);
> -
> -	refcount_inc(&qp->refcnt);
> -	hlist_add_head(&qp->list, &hb->chain);
> -
> -	spin_unlock(&hb->chain_lock);
> -
> -	return qp;
> -}
> -
> -static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
> -					       struct inet_frags *f,
> -					       void *arg)
> -{
> -	struct inet_frag_queue *q;
> +	struct inet_frags *f = nf->f;
> +	struct inet_frag_queue *fq;
> +	int err;
>  
> -	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) {
> -		inet_frag_schedule_worker(f);
> +	if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh)
>  		return NULL;
> -	}
>  
> -	q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
> -	if (!q)
> +	fq = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC);
> +	if (!fq)
>  		return NULL;

Here we also rename a variable and merge two functions:
inet_frag_alloc() is folded into inet_frag_create(). Can't we do that
in a separate patch?

Also note that git (at least mine) generates a better diff for this
hunk with the following options in .gitconfig:

[diff "default"]
        algorithm = patience
[diff]
        algorithm = patience
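
If I'm not mistaken, the same can also be requested per invocation,
without touching .gitconfig:

	git format-patch --patience ...
	git diff --patience ...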

>  
> -	q->net = nf;
> -	f->constructor(q, arg);
> -	add_frag_mem_limit(nf, f->qsize);
> -
> -	timer_setup(&q->timer, f->frag_expire, 0);
> -	spin_lock_init(&q->lock);
> -	refcount_set(&q->refcnt, 1);
> -
> -	return q;
> -}
> +	fq->net = nf;
> +	f->constructor(fq, arg);

Also renaming.

>  
> -static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf,
> -						struct inet_frags *f,
> -						void *arg)
> -{
> -	struct inet_frag_queue *q;
> +	timer_setup(&fq->timer, f->frag_expire, 0);
> +	spin_lock_init(&fq->lock);
> +	refcount_set(&fq->refcnt, 3);
> +	mod_timer(&fq->timer, jiffies + nf->timeout);
>  
> -	q = inet_frag_alloc(nf, f, arg);
> -	if (!q)
> +	err = rhashtable_insert_fast(&nf->rhashtable, &fq->node,
> +				     f->rhash_params);
> +	add_frag_mem_limit(nf, f->qsize);
> +	if (err < 0) {
> +		fq->flags |= INET_FRAG_COMPLETE;
> +		inet_frag_kill(fq);
> +		inet_frag_destroy(fq);
>  		return NULL;
> -
> -	return inet_frag_intern(nf, q, f, arg);
> +	}
> +	return fq;
>  }
>  
> -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf,
> -				       struct inet_frags *f, void *key,
> -				       unsigned int hash)
> +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */
> +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
>  {
> -	struct inet_frag_bucket *hb;
> -	struct inet_frag_queue *q;
> -	int depth = 0;
> -
> -	if (frag_mem_limit(nf) > nf->low_thresh)
> -		inet_frag_schedule_worker(f);
> -
> -	hash &= (INETFRAGS_HASHSZ - 1);
> -	hb = &f->hash[hash];
> -
> -	spin_lock(&hb->chain_lock);
> -	hlist_for_each_entry(q, &hb->chain, list) {
> -		if (q->net == nf && f->match(q, key)) {
> -			refcount_inc(&q->refcnt);
> -			spin_unlock(&hb->chain_lock);
> -			return q;
> -		}
> -		depth++;
> -	}
> -	spin_unlock(&hb->chain_lock);
> +	struct inet_frag_queue *fq;
>  
> -	if (depth <= INETFRAGS_MAXDEPTH)
> -		return inet_frag_create(nf, f, key);
> +	rcu_read_lock();
>  
> -	if (inet_frag_may_rebuild(f)) {
> -		if (!f->rebuild)
> -			f->rebuild = true;
> -		inet_frag_schedule_worker(f);
> +	fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params);
> +	if (fq) {
> +		if (!refcount_inc_not_zero(&fq->refcnt))
> +			fq = NULL;
> +		rcu_read_unlock();
> +		return fq;
>  	}
> +	rcu_read_unlock();
>  
> -	return ERR_PTR(-ENOBUFS);
> +	return inet_frag_create(nf, key);
>  }
>  EXPORT_SYMBOL(inet_frag_find);
> -
> -void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q,
> -				   const char *prefix)
> -{
> -	static const char msg[] = "inet_frag_find: Fragment hash bucket"
> -		" list length grew over limit " __stringify(INETFRAGS_MAXDEPTH)
> -		". Dropping fragment.\n";
> -
> -	if (PTR_ERR(q) == -ENOBUFS)
> -		net_dbg_ratelimited("%s%s", prefix, msg);
> -}
> -EXPORT_SYMBOL(inet_frag_maybe_warn_overflow);
> diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
> index cd2b4c9419fc1552d367b572926e314b11cb6c00..1a7423e8ec0a8f88782ad8c945dc0cd6046f79f0 100644
> --- a/net/ipv4/ip_fragment.c
> +++ b/net/ipv4/ip_fragment.c
> @@ -69,15 +69,9 @@ struct ipfrag_skb_cb
>  struct ipq {
>  	struct inet_frag_queue q;
>  
> -	u32		user;
> -	__be32		saddr;
> -	__be32		daddr;
> -	__be16		id;
> -	u8		protocol;
>  	u8		ecn; /* RFC3168 support */
>  	u16		max_df_size; /* largest frag with DF set seen */
>  	int             iif;
> -	int             vif;   /* L3 master device index */
>  	unsigned int    rid;
>  	struct inet_peer *peer;
>  };
> @@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net)
>  static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
>  			 struct net_device *dev);
>  
> -struct ip4_create_arg {
> -	struct iphdr *iph;
> -	u32 user;
> -	int vif;
> -};
> -
> -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot)
> -{
> -	net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd));
> -	return jhash_3words((__force u32)id << 16 | prot,
> -			    (__force u32)saddr, (__force u32)daddr,
> -			    ip4_frags.rnd);
> -}
> -
> -static unsigned int ip4_hashfn(const struct inet_frag_queue *q)
> -{
> -	const struct ipq *ipq;
> -
> -	ipq = container_of(q, struct ipq, q);
> -	return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol);
> -}
> -
> -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a)
> -{
> -	const struct ipq *qp;
> -	const struct ip4_create_arg *arg = a;
> -
> -	qp = container_of(q, struct ipq, q);
> -	return	qp->id == arg->iph->id &&
> -		qp->saddr == arg->iph->saddr &&
> -		qp->daddr == arg->iph->daddr &&
> -		qp->protocol == arg->iph->protocol &&
> -		qp->user == arg->user &&
> -		qp->vif == arg->vif;
> -}
>  
>  static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
>  {
> @@ -140,37 +99,23 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
>  					       frags);
>  	struct net *net = container_of(ipv4, struct net, ipv4);
>  
> -	const struct ip4_create_arg *arg = a;
> +	const struct frag_v4_compare_key *key = a;
>  
> -	qp->protocol = arg->iph->protocol;
> -	qp->id = arg->iph->id;
> -	qp->ecn = ip4_frag_ecn(arg->iph->tos);
> -	qp->saddr = arg->iph->saddr;
> -	qp->daddr = arg->iph->daddr;
> -	qp->vif = arg->vif;
> -	qp->user = arg->user;
> +	q->key.v4 = *key;
> +	qp->ecn = 0;
>  	qp->peer = q->net->max_dist ?
> -		inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
> +		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
>  		NULL;
>  }
>  
> -static void ip4_frag_free(struct inet_frag_queue *q)
> +static void ip4_frag_destructor(struct inet_frag_queue *q)

This just renames the function

>  {
> -	struct ipq *qp;
> +	struct ipq *qp = container_of(q, struct ipq, q);
>  
> -	qp = container_of(q, struct ipq, q);

This is also just a refactoring

>  	if (qp->peer)
>  		inet_putpeer(qp->peer);
>  }
>  
> -
> -/* Destruction primitives. */
> -
> -static void ipq_put(struct ipq *ipq)
> -{
> -	inet_frag_put(&ipq->q);
> -}
> -
>  /* Kill ipq entry. It is not destroyed immediately,
>   * because caller (and someone more) holds reference count.
>   */
> @@ -198,25 +143,25 @@ static void ip_expire(struct timer_list *t)
>  	struct net *net;
>  
>  	qp = container_of(frag, struct ipq, q);
> -	net = container_of(qp->q.net, struct net, ipv4.frags);
> +	net = container_of(frag->net, struct net, ipv4.frags);
>  
>  	rcu_read_lock();
> -	spin_lock(&qp->q.lock);
> +	spin_lock(&frag->lock);
>  
> -	if (qp->q.flags & INET_FRAG_COMPLETE)
> +	if (frag->flags & INET_FRAG_COMPLETE)
>  		goto out;
>  
>  	ipq_kill(qp);
>  	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
>  
> -	if (!inet_frag_evicting(&qp->q)) {
> -		struct sk_buff *clone, *head = qp->q.fragments;
> +	if (true) {

This does not look good to me. Better to move this body into a
separate function.
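
Something along these lines, perhaps (untested sketch, the helper name
is just a suggestion; the unlock/goto handling there would need some
care):

/* Hypothetical helper, carved out of ip_expire() */
static void ip_expire_send_icmp(struct net *net, struct ipq *qp)
{
	/* former body of the !inet_frag_evicting() branch:
	 * check INET_FRAG_FIRST_IN, look up head->dev, clone the
	 * head skb and send ICMP_TIME_EXCEEDED / ICMP_EXC_FRAGTIME
	 */
}

so that ip_expire() keeps only the locking and a single call.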

> +		struct sk_buff *clone, *head = frag->fragments;
>  		const struct iphdr *iph;
>  		int err;
>  
>  		__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
>  
> -		if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !qp->q.fragments)
> +		if (!(frag->flags & INET_FRAG_FIRST_IN) || !frag->fragments)
>  			goto out;
>  
>  		head->dev = dev_get_by_index_rcu(net, qp->iif);
> @@ -234,7 +179,7 @@ static void ip_expire(struct timer_list *t)
>  		/* Only an end host needs to send an ICMP
>  		 * "Fragment Reassembly Timeout" message, per RFC792.
>  		 */
> -		if (frag_expire_skip_icmp(qp->user) &&
> +		if (frag_expire_skip_icmp(frag->key.v4.user) &&
>  		    (skb_rtable(head)->rt_type != RTN_LOCAL))
>  			goto out;
>  
> @@ -242,7 +187,7 @@ static void ip_expire(struct timer_list *t)
>  
>  		/* Send an ICMP "Fragment Reassembly Timeout" message. */
>  		if (clone) {
> -			spin_unlock(&qp->q.lock);
> +			spin_unlock(&frag->lock);
>  			icmp_send(clone, ICMP_TIME_EXCEEDED,
>  				  ICMP_EXC_FRAGTIME, 0);
>  			consume_skb(clone);
> @@ -250,33 +195,32 @@ static void ip_expire(struct timer_list *t)
>  		}
>  	}
>  out:
> -	spin_unlock(&qp->q.lock);
> +	spin_unlock(&frag->lock);
>  out_rcu_unlock:
>  	rcu_read_unlock();
> -	ipq_put(qp);
> +	inet_frag_put(frag);
>  }
>  
>  /* Find the correct entry in the "incomplete datagrams" queue for
>   * this IP datagram, and create new one, if nothing is found.
>   */
> -static struct ipq *ip_find(struct net *net, struct iphdr *iph,
> +static struct ipq *ip_find(struct net *net, const struct iphdr *iph,
>  			   u32 user, int vif)
>  {
> +	struct frag_v4_compare_key key = {
> +		.saddr = iph->saddr,
> +		.daddr = iph->daddr,
> +		.user = user,
> +		.vif = vif,
> +		.id = iph->id,
> +		.protocol = iph->protocol,
> +	};
>  	struct inet_frag_queue *q;
> -	struct ip4_create_arg arg;
> -	unsigned int hash;
>  
> -	arg.iph = iph;
> -	arg.user = user;
> -	arg.vif = vif;
> -
> -	hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol);
> -
> -	q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash);
> -	if (IS_ERR_OR_NULL(q)) {
> -		inet_frag_maybe_warn_overflow(q, pr_fmt());
> +	q = inet_frag_find(&net->ipv4.frags, &key);
> +	if (!q)
>  		return NULL;
> -	}
> +
>  	return container_of(q, struct ipq, q);
>  }
>  
> @@ -310,8 +254,8 @@ static int ip_frag_too_far(struct ipq *qp)
>  
>  static int ip_frag_reinit(struct ipq *qp)
>  {
> -	struct sk_buff *fp;
>  	unsigned int sum_truesize = 0;
> +	struct sk_buff *fp;

This just moves the line...

>  
>  	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
>  		refcount_inc(&qp->q.refcnt);
> @@ -652,11 +596,11 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
>  	return 0;
>  
>  out_nomem:
> -	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
> +	net_dbg_ratelimited("queue_glue: no memory for gluing queue\n");

Since the parameter is not changed in type but removed completely,
we may do that in a refactoring patch (together with the above).

>  	err = -ENOMEM;
>  	goto out_fail;
>  out_oversize:
> -	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr);
> +	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
>  out_fail:
>  	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
>  	return err;
> @@ -682,7 +626,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
>  		ret = ip_frag_queue(qp, skb);
>  
>  		spin_unlock(&qp->q.lock);
> -		ipq_put(qp);
> +		inet_frag_put(&qp->q);
>  		return ret;
>  	}
>  
> @@ -894,17 +838,52 @@ static struct pernet_operations ip4_frags_ops = {
>  	.exit = ipv4_frags_exit_net,
>  };
>  
> +
> +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	return jhash2(data,
> +		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
> +}
> +
> +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	const struct inet_frag_queue *fq = data;
> +
> +	return jhash2((const u32 *)&fq->key.v4,
> +		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
> +}
> +
> +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
> +{
> +	const struct frag_v4_compare_key *key = arg->key;
> +	const struct inet_frag_queue *fq = ptr;
> +
> +	return !!memcmp(&fq->key, key, sizeof(*key));
> +}
> +
> +static const struct rhashtable_params ip4_rhash_params = {
> +	.head_offset		= offsetof(struct inet_frag_queue, node),
> +	.key_offset		= offsetof(struct inet_frag_queue, key),
> +	.key_len		= sizeof(struct frag_v4_compare_key),
> +	.hashfn			= ip4_key_hashfn,
> +	.obj_hashfn		= ip4_obj_hashfn,
> +	.obj_cmpfn		= ip4_obj_cmpfn,
> +	.automatic_shrinking	= true,
> +};
> +
>  void __init ipfrag_init(void)
>  {
> -	ip4_frags_ctl_register();
> -	register_pernet_subsys(&ip4_frags_ops);
> -	ip4_frags.hashfn = ip4_hashfn;
>  	ip4_frags.constructor = ip4_frag_init;
> -	ip4_frags.destructor = ip4_frag_free;
> +	ip4_frags.destructor = ip4_frag_destructor;

This just reflects the fact that we renamed the function above.

>  	ip4_frags.qsize = sizeof(struct ipq);
> -	ip4_frags.match = ip4_frag_match;
>  	ip4_frags.frag_expire = ip_expire;
>  	ip4_frags.frags_cache_name = ip_frag_cache_name;
> +	ip4_frags.rhash_params = ip4_rhash_params;
> +
>  	if (inet_frags_init(&ip4_frags))
>  		panic("IP: failed to allocate ip4_frags cache\n");
> +
> +	ip4_frags_ctl_register();
> +	register_pernet_subsys(&ip4_frags_ops);

We may consider doing this move in a refactoring patch.

> +
>  }
> diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
> index f69b7ca52727c814eb2887c9deb9f356c56e5442..53859311dea96c03fa5ae8456de32de25009efbe 100644
> --- a/net/ipv6/netfilter/nf_conntrack_reasm.c
> +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
> @@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h)
>  	return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK);
>  }
>  
> -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr,
> -				 const struct in6_addr *daddr)
> -{
> -	net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd));
> -	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
> -			    (__force u32)id, nf_frags.rnd);
> -}
> -
> -
> -static unsigned int nf_hashfn(const struct inet_frag_queue *q)
> -{
> -	const struct frag_queue *nq;
> -
> -	nq = container_of(q, struct frag_queue, q);
> -	return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
> -}
> -
>  static void nf_ct_frag6_expire(struct timer_list *t)
>  {
>  	struct inet_frag_queue *frag = from_timer(frag, t, timer);
> @@ -178,34 +161,26 @@ static void nf_ct_frag6_expire(struct timer_list *t)
>  	fq = container_of(frag, struct frag_queue, q);
>  	net = container_of(fq->q.net, struct net, nf_frag.frags);
>  
> -	ip6_expire_frag_queue(net, fq, &nf_frags);
> +	ip6_expire_frag_queue(net, fq);
>  }
>  
>  /* Creation primitives. */
> -static inline struct frag_queue *fq_find(struct net *net, __be32 id,
> -					 u32 user, struct in6_addr *src,
> -					 struct in6_addr *dst, int iif, u8 ecn)
> +static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user,
> +				  const struct ipv6hdr *hdr, int iif)
>  {
> +	struct frag_v6_compare_key key = {
> +		.id = id,
> +		.saddr = hdr->saddr,
> +		.daddr = hdr->daddr,
> +		.user = user,
> +		.iif = iif,
> +	};
>  	struct inet_frag_queue *q;
> -	struct ip6_create_arg arg;
> -	unsigned int hash;
> -
> -	arg.id = id;
> -	arg.user = user;
> -	arg.src = src;
> -	arg.dst = dst;
> -	arg.iif = iif;
> -	arg.ecn = ecn;
> -
> -	local_bh_disable();
> -	hash = nf_hash_frag(id, src, dst);
> -
> -	q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash);
> -	local_bh_enable();
> -	if (IS_ERR_OR_NULL(q)) {
> -		inet_frag_maybe_warn_overflow(q, pr_fmt());
> +
> +	q = inet_frag_find(&net->nf_frag.frags, &key);
> +	if (!q)
>  		return NULL;
> -	}
> +
>  	return container_of(q, struct frag_queue, q);
>  }
>  
> @@ -593,8 +568,8 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
>  	fhdr = (struct frag_hdr *)skb_transport_header(skb);
>  
>  	skb_orphan(skb);
> -	fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
> -		     skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
> +	fq = fq_find(net, fhdr->identification, user, hdr,
> +		     skb->dev ? skb->dev->ifindex : 0);
>  	if (fq == NULL) {
>  		pr_debug("Can't find and can't create new queue\n");
>  		return -ENOMEM;
> @@ -656,17 +631,18 @@ static struct pernet_operations nf_ct_net_ops = {
>  	.exit = nf_ct_net_exit,
>  };
>  
> +extern const struct rhashtable_params ip6_rhash_params;

Can't we put this declaration in a header file?
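
Something like the below, e.g. in include/net/ipv6.h (or wherever it
fits best, I have not checked):

/* defined in net/ipv6/reassembly.c, shared with nf_conntrack_reasm */
extern const struct rhashtable_params ip6_rhash_params;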

> +
>  int nf_ct_frag6_init(void)
>  {
>  	int ret = 0;
>  
> -	nf_frags.hashfn = nf_hashfn;
>  	nf_frags.constructor = ip6_frag_init;
>  	nf_frags.destructor = NULL;
>  	nf_frags.qsize = sizeof(struct frag_queue);
> -	nf_frags.match = ip6_frag_match;
>  	nf_frags.frag_expire = nf_ct_frag6_expire;
>  	nf_frags.frags_cache_name = nf_frags_cache_name;
> +	nf_frags.rhash_params = ip6_rhash_params;
>  	ret = inet_frags_init(&nf_frags);
>  	if (ret)
>  		goto out;
> diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
> index 8cfea13a179c6f048177ac91fe26c8a5565e5820..737b0921ab0c9af198fefdf06d8f4ede91c7f3f6 100644
> --- a/net/ipv6/reassembly.c
> +++ b/net/ipv6/reassembly.c
> @@ -79,59 +79,19 @@ static struct inet_frags ip6_frags;
>  static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
>  			  struct net_device *dev);
>  
> -/*
> - * callers should be careful not to use the hash value outside the ipfrag_lock
> - * as doing so could race with ipfrag_hash_rnd being recalculated.
> - */
> -static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr,
> -				    const struct in6_addr *daddr)
> -{
> -	net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd));
> -	return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr),
> -			    (__force u32)id, ip6_frags.rnd);
> -}
> -
> -static unsigned int ip6_hashfn(const struct inet_frag_queue *q)
> -{
> -	const struct frag_queue *fq;
> -
> -	fq = container_of(q, struct frag_queue, q);
> -	return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr);
> -}
> -
> -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
> -{
> -	const struct frag_queue *fq;
> -	const struct ip6_create_arg *arg = a;
> -
> -	fq = container_of(q, struct frag_queue, q);
> -	return	fq->id == arg->id &&
> -		fq->user == arg->user &&
> -		ipv6_addr_equal(&fq->saddr, arg->src) &&
> -		ipv6_addr_equal(&fq->daddr, arg->dst) &&
> -		(arg->iif == fq->iif ||
> -		 !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
> -					       IPV6_ADDR_LINKLOCAL)));
> -}
> -EXPORT_SYMBOL(ip6_frag_match);
> -
>  void ip6_frag_init(struct inet_frag_queue *q, const void *a)
>  {
>  	struct frag_queue *fq = container_of(q, struct frag_queue, q);
> -	const struct ip6_create_arg *arg = a;
> +	const struct frag_v6_compare_key *key = a;
>  
> -	fq->id = arg->id;
> -	fq->user = arg->user;
> -	fq->saddr = *arg->src;
> -	fq->daddr = *arg->dst;
> -	fq->ecn = arg->ecn;
> +	q->key.v6 = *key;
> +	fq->ecn = 0;
>  }
>  EXPORT_SYMBOL(ip6_frag_init);
>  
> -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
> -			   struct inet_frags *frags)
> +void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq)
>  {
> -	struct net_device *dev = NULL;
> +	struct net_device *dev;
>  
>  	spin_lock(&fq->q.lock);
>  
> @@ -146,10 +106,6 @@ void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq,
>  		goto out_rcu_unlock;
>  
>  	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS);
> -
> -	if (inet_frag_evicting(&fq->q))
> -		goto out_rcu_unlock;
> -
>  	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
>  
>  	/* Don't send error if the first segment did not arrive. */
> @@ -179,31 +135,29 @@ static void ip6_frag_expire(struct timer_list *t)
>  	fq = container_of(frag, struct frag_queue, q);
>  	net = container_of(fq->q.net, struct net, ipv6.frags);
>  
> -	ip6_expire_frag_queue(net, fq, &ip6_frags);
> +	ip6_expire_frag_queue(net, fq);
>  }
>  
>  static struct frag_queue *
> -fq_find(struct net *net, __be32 id, const struct in6_addr *src,
> -	const struct in6_addr *dst, int iif, u8 ecn)
> +fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif)
>  {
> +	struct frag_v6_compare_key key = {
> +		.id = id,
> +		.saddr = hdr->saddr,
> +		.daddr = hdr->daddr,
> +		.user = IP6_DEFRAG_LOCAL_DELIVER,
> +		.iif = iif,
> +	};
>  	struct inet_frag_queue *q;
> -	struct ip6_create_arg arg;
> -	unsigned int hash;
>  
> -	arg.id = id;
> -	arg.user = IP6_DEFRAG_LOCAL_DELIVER;
> -	arg.src = src;
> -	arg.dst = dst;
> -	arg.iif = iif;
> -	arg.ecn = ecn;
> +	if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST |
> +					    IPV6_ADDR_LINKLOCAL)))
> +		key.iif = 0;
>  
> -	hash = inet6_hash_frag(id, src, dst);
> -
> -	q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash);
> -	if (IS_ERR_OR_NULL(q)) {
> -		inet_frag_maybe_warn_overflow(q, pr_fmt());
> +	q = inet_frag_find(&net->ipv6.frags, &key);
> +	if (!q)
>  		return NULL;
> -	}
> +
>  	return container_of(q, struct frag_queue, q);
>  }
>  
> @@ -527,10 +481,11 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
>  
>  static int ipv6_frag_rcv(struct sk_buff *skb)
>  {
> +	struct net *net = dev_net(skb_dst(skb)->dev);
> +	const struct ipv6hdr *hdr = ipv6_hdr(skb);
>  	struct frag_hdr *fhdr;
>  	struct frag_queue *fq;
> -	const struct ipv6hdr *hdr = ipv6_hdr(skb);
> -	struct net *net = dev_net(skb_dst(skb)->dev);
> +	int iif;
>  
>  	if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED)
>  		goto fail_hdr;
> @@ -559,13 +514,14 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
>  		return 1;
>  	}
>  
> -	fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
> -		     skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
> +	iif = skb->dev ? skb->dev->ifindex : 0;
> +	fq = fq_find(net, fhdr->identification, hdr, iif);
>  	if (fq) {
>  		int ret;
>  
>  		spin_lock(&fq->q.lock);
>  
> +		fq->iif = iif;
>  		ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
>  
>  		spin_unlock(&fq->q.lock);
> @@ -718,6 +674,7 @@ static int __net_init ipv6_frags_init_net(struct net *net)
>  	net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
>  	net->ipv6.frags.f = &ip6_frags;
>  
> +	net->ipv6.frags.f = &ip6_frags;

This assignment is already made above; we shouldn't do it twice.

>  	res = inet_frags_init_net(&net->ipv6.frags);
>  	if (res < 0)
>  		return res;
> @@ -739,14 +696,55 @@ static struct pernet_operations ip6_frags_ops = {
>  	.exit = ipv6_frags_exit_net,
>  };
>  
> +static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	return jhash2(data,
> +		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
> +}
> +
> +static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed)
> +{
> +	const struct inet_frag_queue *fq = data;
> +
> +	return jhash2((const u32 *)&fq->key.v6,
> +		      sizeof(struct frag_v6_compare_key) / sizeof(u32), seed);
> +}
> +
> +static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
> +{
> +	const struct frag_v6_compare_key *key = arg->key;
> +	const struct inet_frag_queue *fq = ptr;
> +
> +	return !!memcmp(&fq->key, key, sizeof(*key));
> +}
> +
> +const struct rhashtable_params ip6_rhash_params = {
> +	.head_offset		= offsetof(struct inet_frag_queue, node),
> +	.hashfn			= ip6_key_hashfn,
> +	.obj_hashfn		= ip6_obj_hashfn,
> +	.obj_cmpfn		= ip6_obj_cmpfn,
> +	.automatic_shrinking	= true,
> +};
> +EXPORT_SYMBOL(ip6_rhash_params);
> +
>  int __init ipv6_frag_init(void)
>  {
>  	int ret;
>  
> -	ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
> +	ip6_frags.constructor = ip6_frag_init;
> +	ip6_frags.destructor = NULL;
> +	ip6_frags.qsize = sizeof(struct frag_queue);
> +	ip6_frags.frag_expire = ip6_frag_expire;
> +	ip6_frags.frags_cache_name = ip6_frag_cache_name;
> +	ip6_frags.rhash_params = ip6_rhash_params;
> +	ret = inet_frags_init(&ip6_frags);
>  	if (ret)
>  		goto out;
>  
> +	ret = inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT);
> +	if (ret)
> +		goto err_protocol;
> +
>  	ret = ip6_frags_sysctl_register();
>  	if (ret)
>  		goto err_sysctl;
> @@ -755,16 +753,6 @@ int __init ipv6_frag_init(void)
>  	if (ret)
>  		goto err_pernet;
>  
> -	ip6_frags.hashfn = ip6_hashfn;
> -	ip6_frags.constructor = ip6_frag_init;
> -	ip6_frags.destructor = NULL;
> -	ip6_frags.qsize = sizeof(struct frag_queue);
> -	ip6_frags.match = ip6_frag_match;
> -	ip6_frags.frag_expire = ip6_frag_expire;
> -	ip6_frags.frags_cache_name = ip6_frag_cache_name;
> -	ret = inet_frags_init(&ip6_frags);
> -	if (ret)
> -		goto err_pernet;
>  out:
>  	return ret;
>  
> @@ -772,6 +760,8 @@ int __init ipv6_frag_init(void)
>  	ip6_frags_sysctl_unregister();
>  err_sysctl:
>  	inet6_del_protocol(&frag_protocol, IPPROTO_FRAGMENT);
> +err_protocol:
> +	inet_frags_fini(&ip6_frags);
>  	goto out;
>  }

Thanks,
Kirill

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units
  2018-03-30  7:18   ` Herbert Xu
@ 2018-03-30 13:30     ` Eric Dumazet
  2018-03-30 16:37       ` Herbert Xu
  0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2018-03-30 13:30 UTC (permalink / raw)
  To: Herbert Xu, Eric Dumazet
  Cc: David S . Miller, netdev, Florian Westphal, Thomas Graf,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	Eric Dumazet, Nikolay Aleksandrov



On 03/30/2018 12:18 AM, Herbert Xu wrote:

> Instead of using the walk interface, how about
> rhashtable_free_and_destroy?

Oh, I was not aware of this beast ;)

I guess I will need to add a cond_resched() in it, right ?

My rough guess is that freeing 30 million frags would take a few seconds ;)
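
For the exit path I imagine it would boil down to something like this
(totally untested sketch, the callback name is made up):

static void inet_frags_free_cb(void *ptr, void *arg)
{
	struct inet_frag_queue *fq = ptr;

	/* Drop the timer reference if the timer was still pending. */
	if (del_timer_sync(&fq->timer))
		refcount_dec(&fq->refcnt);

	fq->flags |= INET_FRAG_COMPLETE;
	/* Drop the hash table reference; the last put frees the queue. */
	inet_frag_put(fq);
}

void inet_frags_exit_net(struct netns_frags *nf)
{
	nf->low_thresh = 0; /* prevent creation of new frags */

	rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL);
}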

> 
>>  void inet_frag_kill(struct inet_frag_queue *fq)
>>  {
>>  	if (del_timer(&fq->timer))
>>  		refcount_dec(&fq->refcnt);
>>  
>>  	if (!(fq->flags & INET_FRAG_COMPLETE)) {
>> -		fq_unlink(fq);
>> +		struct netns_frags *nf = fq->net;
>> +
>> +		fq->flags |= INET_FRAG_COMPLETE;
>> +		rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params);
>>  		refcount_dec(&fq->refcnt);
>>  	}
>>  }
> 
> This means that the hash won't inline properly.  Don't know how big
> of an issue it is to you.  But you could fix it by doing the same
> hack as rhashtable: make inet_frag_kill an inline function and
> take the rhash_params as an explicit argument.

I will investigate this, thanks !
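
If I understood the idea correctly, that would be something along
these lines (untested, just to check I got the point):

static inline void inet_frag_kill(struct inet_frag_queue *fq,
				  const struct rhashtable_params params)
{
	if (del_timer(&fq->timer))
		refcount_dec(&fq->refcnt);

	if (!(fq->flags & INET_FRAG_COMPLETE)) {
		fq->flags |= INET_FRAG_COMPLETE;
		rhashtable_remove_fast(&fq->net->rhashtable, &fq->node, params);
		refcount_dec(&fq->refcnt);
	}
}

with the per-protocol rhash_params passed down from the callers.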

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units
  2018-03-30 11:44   ` Kirill Tkhai
@ 2018-03-30 16:09     ` Eric Dumazet
  0 siblings, 0 replies; 11+ messages in thread
From: Eric Dumazet @ 2018-03-30 16:09 UTC (permalink / raw)
  To: Kirill Tkhai, Eric Dumazet, David S . Miller
  Cc: netdev, Florian Westphal, Herbert Xu, Thomas Graf,
	Jesper Dangaard Brouer, Alexander Aring, Stefan Schmidt,
	Nikolay Aleksandrov



On 03/30/2018 04:44 AM, Kirill Tkhai wrote:
> Hi, Eric,
> 
> Great results!
> 
> Please, see some comments below.
> 

Thanks a lot Kirill for this detailed review.

I will address/correct all points that you raised in V2.

^ permalink raw reply	[flat|nested] 11+ messages in thread

* Re: [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units
  2018-03-30 13:30     ` Eric Dumazet
@ 2018-03-30 16:37       ` Herbert Xu
  0 siblings, 0 replies; 11+ messages in thread
From: Herbert Xu @ 2018-03-30 16:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Eric Dumazet, David S . Miller, netdev, Florian Westphal,
	Thomas Graf, Jesper Dangaard Brouer, Alexander Aring,
	Stefan Schmidt, Nikolay Aleksandrov

On Fri, Mar 30, 2018 at 06:30:42AM -0700, Eric Dumazet wrote:
> 
> I guess I will need to add a cond_resched() in it, right ?

I only ever run with preemption enabled :)

But yeah we should probably add some cond_rescheds to it.  While
you're at it you might want to add some to these functions too:
 - nested_table_free
 - bucket_table_alloc
 - rhashtable_rehash_table

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 11+ messages in thread

end of thread, other threads:[~2018-03-30 16:37 UTC | newest]

Thread overview: 11+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-03-30  5:22 [PATCH net-next 1/6] ipv6: frag: remove unused field Eric Dumazet
2018-03-30  5:22 ` [PATCH net-next 2/6] inet: frags: change inet_frags_init_net() return value Eric Dumazet
2018-03-30  5:22 ` [PATCH net-next 3/6] inet: frags: add a pointer to struct netns_frags Eric Dumazet
2018-03-30  5:22 ` [PATCH net-next 4/6] inet: frags: use rhashtables for reassembly units Eric Dumazet
2018-03-30  7:18   ` Herbert Xu
2018-03-30 13:30     ` Eric Dumazet
2018-03-30 16:37       ` Herbert Xu
2018-03-30 11:44   ` Kirill Tkhai
2018-03-30 16:09     ` Eric Dumazet
2018-03-30  5:22 ` [PATCH net-next 5/6] inet: frags: remove some helpers Eric Dumazet
2018-03-30  5:22 ` [PATCH net-next 6/6] inet: frags: break the 2GB limit for frags storage Eric Dumazet
