* [PATCH RFC] IPsec performance optimizations
@ 2016-09-23  7:53 Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 01/11] esp4: Avoid skb_cow_data whenever possible Steffen Klassert
                   ` (10 more replies)
  0 siblings, 11 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

This patchset adds several performance optimizations for the ESP IPsec
protocol. This RFC version is intended to be a discussion base for the
IPsec workshop at the netdev 1.2 conference.

The patchset has two parts: patches 1 - 4 are software optimizations.
These patches are complete and could go upstream after some review.

Patches 5 - 11 are needed to create an API for ESP offload to network
devices. Mellanox is preparing the mlx5 driver to use this API, see

https://git.kernel.org/cgit/linux/kernel/git/klassert/linux-stk.git/?h=net-next-ipsec-offload-api3

This part is still under development; changes are very likely before
it can go upstream.

Patches 1 and 2 try to avoid the linearization of ESP packets whenever
possible.

Patch 3 prepares the generic networking codepath for IPsec GRO.

Patch 4 implements a software GRO codepath for ESP on IPv4 and IPv6.

Patch 5 extends the skbuff gso_type to unsigned int. We need a GSO
flag for ESP, but all available gso_type flags are currently in use.

Patch 6 adds the needed netdev features to configure IPsec offloads.

Patch 7 adds gso handlers for esp4 and esp6, currently only used
in combination with ESP hardware offload.

Patches 8 - 9 prepare for IPsec hardware offloading.

Patch 10 implements an IPsec hardware offloading API.

Patch 11 allows for TSO and checksum offloading of the inner IPsec packet.


* [PATCH RFC 01/11] esp4: Avoid skb_cow_data whenever possible
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 02/11] esp6: " Steffen Klassert
                   ` (9 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

This patch tries to avoid skb_cow_data on esp4.

On the encrypt side we add the IPsec tailbits
to the linear part of the buffer if there is
space for them. If there is no space in the
linear part, we add a page fragment with the
tailbits to the buffer and use separate src
and dst scatterlists.

On the decrypt side, we leave the buffer as it is
if it is not cloned.

With this, we can avoid a linearization of the buffer
in most cases.
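
For reviewers, the new output path boils down to the
following control flow (a simplified sketch of the diff
below, error handling omitted):

    /* Simplified sketch of the trailer placement strategy
     * in esp_output() after this patch.
     */
    if (!skb_cloned(skb)) {
            if (tailen <= skb_availroom(skb)) {
                    /* Case 1: tailbits fit into the linear part. */
                    goto skip_cow;
            } else if (skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS &&
                       !skb_has_frag_list(skb)) {
                    /* Case 2: put the tailbits into a page fragment
                     * from x->xfrag and use separate src and dst
                     * scatterlists for the crypto request.
                     */
                    goto skip_cow2;
            }
    }
    /* Case 3: fall back to skb_cow_data(), as before. */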

Joint work with:
Sowmini Varadhan <sowmini.varadhan@oracle.com>
Ilan Tayari <ilant@mellanox.com>

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h |   2 +
 net/ipv4/esp4.c    | 338 +++++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 266 insertions(+), 74 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d2fdd6d..5016fa2 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -213,6 +213,8 @@ struct xfrm_state {
 	/* Last used time */
 	unsigned long		lastused;
 
+	struct page_frag xfrag;
+
 	/* Reference to data common to all the instances of this
 	 * transformer. */
 	const struct xfrm_type	*type;
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index d95631d..a62c6214 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -18,6 +18,8 @@
 #include <net/protocol.h>
 #include <net/udp.h>
 
+#include <linux/highmem.h>
+
 struct esp_skb_cb {
 	struct xfrm_skb_cb xfrm;
 	void *tmp;
@@ -92,11 +94,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
 			     __alignof__(struct scatterlist));
 }
 
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+	struct esp_output_extra *extra = esp_tmp_extra(tmp);
+	struct crypto_aead *aead = x->data;
+	int extralen = 0;
+	u8 *iv;
+	struct aead_request *req;
+	struct scatterlist *sg;
+
+	if (x->props.flags & XFRM_STATE_ESN)
+		extralen += sizeof(*extra);
+
+	extra = esp_tmp_extra(tmp);
+	iv = esp_tmp_iv(aead, tmp, extralen);
+	req = esp_tmp_req(aead, iv);
+
+	/* Unref skb_frag_pages in the src scatterlist if necessary.
+	 * Skip the first sg which comes from skb->data.
+	 */
+	if (req->src != req->dst)
+		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+			put_page(sg_page(sg));
+}
+
 static void esp_output_done(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
+	void *tmp;
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
 
-	kfree(ESP_SKB_CB(skb)->tmp);
+	tmp = ESP_SKB_CB(skb)->tmp;
+	esp_ssg_unref(x, tmp);
+	kfree(tmp);
 	xfrm_output_resume(skb, err);
 }
 
@@ -120,6 +151,29 @@ static void esp_output_restore_header(struct sk_buff *skb)
 				sizeof(__be32));
 }
 
+static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb,
+					       struct ip_esp_hdr *esph,
+					       struct esp_output_extra *extra)
+{
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accommodate the high bits.  We will move it back after
+	 * encryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		extra->esphoff = (unsigned char *)esph -
+				 skb_transport_header(skb);
+		esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
+		extra->seqhi = esph->spi;
+		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+	}
+
+	esph->spi = x->id.spi;
+
+	return esph;
+}
+
 static void esp_output_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -130,16 +184,18 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
 
 static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 {
-	int err;
 	struct esp_output_extra *extra;
+	int err = -ENOMEM;
 	struct ip_esp_hdr *esph;
 	struct crypto_aead *aead;
 	struct aead_request *req;
-	struct scatterlist *sg;
+	struct scatterlist *sg, *dsg;
 	struct sk_buff *trailer;
+	struct page *page;
 	void *tmp;
 	u8 *iv;
 	u8 *tail;
+	u8 *vaddr;
 	int blksize;
 	int clen;
 	int alen;
@@ -149,7 +205,9 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	int nfrags;
 	int assoclen;
 	int extralen;
+	int tailen;
 	__be64 seqno;
+	__u8 proto = *skb_mac_header(skb);
 
 	/* skb is pure payload to encrypt */
 
@@ -169,12 +227,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	plen = clen - skb->len - tfclen;
-
-	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
-	if (err < 0)
-		goto error;
-	nfrags = err;
-
+	tailen = tfclen + plen + alen;
 	assoclen = sizeof(*esph);
 	extralen = 0;
 
@@ -183,35 +236,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += sizeof(__be32);
 	}
 
-	tmp = esp_alloc_tmp(aead, nfrags, extralen);
-	if (!tmp) {
-		err = -ENOMEM;
-		goto error;
-	}
-
-	extra = esp_tmp_extra(tmp);
-	iv = esp_tmp_iv(aead, tmp, extralen);
-	req = esp_tmp_req(aead, iv);
-	sg = esp_req_sg(aead, req);
-
-	/* Fill padding... */
-	tail = skb_tail_pointer(trailer);
-	if (tfclen) {
-		memset(tail, 0, tfclen);
-		tail += tfclen;
-	}
-	do {
-		int i;
-		for (i = 0; i < plen - 2; i++)
-			tail[i] = i + 1;
-	} while (0);
-	tail[plen - 2] = plen - 2;
-	tail[plen - 1] = *skb_mac_header(skb);
-	pskb_put(skb, trailer, clen - skb->len + alen);
-
-	skb_push(skb, -skb_network_offset(skb));
-	esph = ip_esp_hdr(skb);
 	*skb_mac_header(skb) = IPPROTO_ESP;
+	esph = ip_esp_hdr(skb);
 
 	/* this is non-NULL only with UDP Encapsulation */
 	if (x->encap) {
@@ -230,7 +256,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		uh = (struct udphdr *)esph;
 		uh->source = sport;
 		uh->dest = dport;
-		uh->len = htons(skb->len - skb_transport_offset(skb));
+		uh->len = htons(skb->len + tailen
+				- skb_transport_offset(skb));
 		uh->check = 0;
 
 		switch (encap_type) {
@@ -248,31 +275,170 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 		*skb_mac_header(skb) = IPPROTO_UDP;
 	}
 
-	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+	if (!skb_cloned(skb)) {
+		if (tailen <= skb_availroom(skb)) {
+			nfrags = 1;
+			trailer = skb;
+			tail = skb_tail_pointer(trailer);
 
-	aead_request_set_callback(req, 0, esp_output_done, skb);
+			goto skip_cow;
+		} else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+			   && !skb_has_frag_list(skb)) {
+			int allocsize;
+			struct sock *sk = skb->sk;
+			struct page_frag *pfrag = &x->xfrag;
 
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * encryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		extra->esphoff = (unsigned char *)esph -
-				 skb_transport_header(skb);
-		esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4);
-		extra->seqhi = esph->spi;
-		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
-		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+			allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+			spin_lock_bh(&x->lock);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				goto cow;
+			}
+
+			page = pfrag->page;
+			get_page(page);
+
+			vaddr = kmap_atomic(page);
+
+			tail = vaddr + pfrag->offset;
+
+			/* Fill padding... */
+			if (tfclen) {
+				memset(tail, 0, tfclen);
+				tail += tfclen;
+			}
+			do {
+				int i;
+				for (i = 0; i < plen - 2; i++)
+					tail[i] = i + 1;
+			} while (0);
+			tail[plen - 2] = plen - 2;
+			tail[plen - 1] = proto;
+
+			kunmap_atomic(vaddr);
+
+			nfrags = skb_shinfo(skb)->nr_frags;
+
+			__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+					     tailen);
+			skb_shinfo(skb)->nr_frags = ++nfrags;
+
+			pfrag->offset = pfrag->offset + allocsize;
+			nfrags++;
+
+			skb->len += tailen;
+			skb->data_len += tailen;
+			skb->truesize += tailen;
+			if (sk)
+				atomic_add(tailen, &sk->sk_wmem_alloc);
+
+			skb_push(skb, -skb_network_offset(skb));
+
+			esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+			esph->spi = x->id.spi;
+
+			tmp = esp_alloc_tmp(aead, nfrags + 2, extralen);
+			if (!tmp) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+
+			extra = esp_tmp_extra(tmp);
+			iv = esp_tmp_iv(aead, tmp, extralen);
+			req = esp_tmp_req(aead, iv);
+			sg = esp_req_sg(aead, req);
+			dsg = &sg[nfrags];
+
+			esph = esp_output_set_extra(skb, esph, extra);
+
+			sg_init_table(sg, nfrags);
+			skb_to_sgvec(skb, sg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+
+			skb_shinfo(skb)->nr_frags = 1;
+
+			page = pfrag->page;
+			get_page(page);
+			/* replace page frags in skb with new page */
+			__skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+			pfrag->offset = pfrag->offset + allocsize;
+
+			sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+			skb_to_sgvec(skb, dsg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			spin_unlock_bh(&x->lock);
+
+			goto skip_cow2;
+		}
 	}
 
+cow:
+	err = skb_cow_data(skb, tailen, &trailer);
+	if (err < 0)
+		goto error;
+	nfrags = err;
+	tail = skb_tail_pointer(trailer);
+	esph = ip_esp_hdr(skb);
+
+skip_cow:
+	/* Fill padding... */
+	if (tfclen) {
+		memset(tail, 0, tfclen);
+		tail += tfclen;
+	}
+	do {
+		int i;
+		for (i = 0; i < plen - 2; i++)
+			tail[i] = i + 1;
+	} while (0);
+	tail[plen - 2] = plen - 2;
+	tail[plen - 1] = proto;
+	pskb_put(skb, trailer, clen - skb->len + alen);
+
+	skb_push(skb, -skb_network_offset(skb));
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 	esph->spi = x->id.spi;
 
+	tmp = esp_alloc_tmp(aead, nfrags, extralen);
+	if (!tmp) {
+		err = -ENOMEM;
+		goto error;
+	}
+
+	extra = esp_tmp_extra(tmp);
+	iv = esp_tmp_iv(aead, tmp, extralen);
+	req = esp_tmp_req(aead, iv);
+	sg = esp_req_sg(aead, req);
+	dsg = sg;
+
+	esph = esp_output_set_extra(skb, esph, extra);
+
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg,
 		     (unsigned char *)esph - skb->data,
 		     assoclen + ivlen + clen + alen);
 
-	aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+skip_cow2:
+	if ((x->props.flags & XFRM_STATE_ESN))
+		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	else
+		aead_request_set_callback(req, 0, esp_output_done, skb);
+
+	aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
 	aead_request_set_ad(req, assoclen);
 
 	seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -298,6 +464,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 			esp_output_restore_header(skb);
 	}
 
+	if (sg != dsg)
+		esp_ssg_unref(x, tmp);
 	kfree(tmp);
 
 error:
@@ -401,6 +569,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
 	__skb_pull(skb, 4);
 }
 
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accommodate the high bits.  We will move it back after
+	 * decryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		esph = (void *)skb_push(skb, 4);
+		*seqhi = esph->spi;
+		esph->spi = esph->seq_no;
+		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+	}
+}
+
 static void esp_input_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -437,12 +622,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	if (elen <= 0)
 		goto out;
 
-	err = skb_cow_data(skb, 0, &trailer);
-	if (err < 0)
-		goto out;
-
-	nfrags = err;
-
 	assoclen = sizeof(*esph);
 	seqhilen = 0;
 
@@ -451,6 +630,26 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += seqhilen;
 	}
 
+	if (!skb_cloned(skb)) {
+		if (!skb_is_nonlinear(skb)) {
+			nfrags = 1;
+
+			goto skip_cow;
+		} else if (!skb_has_frag_list(skb)) {
+			nfrags = skb_shinfo(skb)->nr_frags;
+			nfrags++;
+
+			goto skip_cow;
+		}
+	}
+
+	err = skb_cow_data(skb, 0, &trailer);
+	if (err < 0)
+		goto out;
+
+	nfrags = err;
+
+skip_cow:
 	err = -ENOMEM;
 	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
 	if (!tmp)
@@ -462,26 +661,17 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 	req = esp_tmp_req(aead, iv);
 	sg = esp_req_sg(aead, req);
 
-	skb->ip_summed = CHECKSUM_NONE;
+	esp_input_set_header(skb, seqhi);
 
-	esph = (struct ip_esp_hdr *)skb->data;
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	aead_request_set_callback(req, 0, esp_input_done, skb);
+	skb->ip_summed = CHECKSUM_NONE;
 
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * decryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		esph = (void *)skb_push(skb, 4);
-		*seqhi = esph->spi;
-		esph->spi = esph->seq_no;
-		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+	if ((x->props.flags & XFRM_STATE_ESN))
 		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
-	}
-
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	else
+		aead_request_set_callback(req, 0, esp_input_done, skb);
 
 	aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
 	aead_request_set_ad(req, assoclen);
-- 
1.9.1


* [PATCH RFC 02/11] esp6: Avoid skb_cow_data whenever possible
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 01/11] esp4: Avoid skb_cow_data whenever possible Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 03/11] net: Prepare for IPsec GRO Steffen Klassert
                   ` (8 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

This patch tries to avoid skb_cow_data on esp6.

On the encrypt side we add the IPsec tailbits
to the linear part of the buffer if there is
space for them. If there is no space in the
part, we add a page fragment with the tailbits to
the buffer and use separate src and dst scatterlists.

On the decrypt side, we leave the buffer as it is
if it is not cloned.

With this, we can avoid a linearization of the buffer
in most cases.

Joint work with:
Sowmini Varadhan <sowmini.varadhan@oracle.com>
Ilan Tayari <ilant@mellanox.com>

Signed-off-by: Sowmini Varadhan <sowmini.varadhan@oracle.com>
Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv6/esp6.c | 302 +++++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 246 insertions(+), 56 deletions(-)

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 060a60b..5cb6b35 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -44,6 +44,8 @@
 #include <net/protocol.h>
 #include <linux/icmpv6.h>
 
+#include <linux/highmem.h>
+
 struct esp_skb_cb {
 	struct xfrm_skb_cb xfrm;
 	void *tmp;
@@ -114,11 +116,40 @@ static inline struct scatterlist *esp_req_sg(struct crypto_aead *aead,
 			     __alignof__(struct scatterlist));
 }
 
+static void esp_ssg_unref(struct xfrm_state *x, void *tmp)
+{
+	__be32 *seqhi;
+	struct crypto_aead *aead = x->data;
+	int seqhilen = 0;
+	u8 *iv;
+	struct aead_request *req;
+	struct scatterlist *sg;
+
+	if (x->props.flags & XFRM_STATE_ESN)
+		seqhilen += sizeof(__be32);
+
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_req(aead, iv);
+
+	/* Unref skb_frag_pages in the src scatterlist if necessary.
+	 * Skip the first sg which comes from skb->data.
+	 */
+	if (req->src != req->dst)
+		for (sg = sg_next(req->src); sg; sg = sg_next(sg))
+			put_page(sg_page(sg));
+}
+
 static void esp_output_done(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
+	void *tmp;
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_state *x = dst->xfrm;
 
-	kfree(ESP_SKB_CB(skb)->tmp);
+	tmp = ESP_SKB_CB(skb)->tmp;
+	esp_ssg_unref(x, tmp);
+	kfree(tmp);
 	xfrm_output_resume(skb, err);
 }
 
@@ -138,6 +169,27 @@ static void esp_output_restore_header(struct sk_buff *skb)
 	esp_restore_header(skb, skb_transport_offset(skb) - sizeof(__be32));
 }
 
+static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb,
+					     struct ip_esp_hdr *esph,
+					     __be32 *seqhi)
+{
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accommodate the high bits.  We will move it back after
+	 * encryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
+		*seqhi = esph->spi;
+		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
+	}
+
+	esph->spi = x->id.spi;
+
+	return esph;
+}
+
 static void esp_output_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -152,8 +204,9 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct ip_esp_hdr *esph;
 	struct crypto_aead *aead;
 	struct aead_request *req;
-	struct scatterlist *sg;
+	struct scatterlist *sg, *dsg;
 	struct sk_buff *trailer;
+	struct page *page;
 	void *tmp;
 	int blksize;
 	int clen;
@@ -164,10 +217,13 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	int nfrags;
 	int assoclen;
 	int seqhilen;
+	int tailen;
 	u8 *iv;
 	u8 *tail;
+	u8 *vaddr;
 	__be32 *seqhi;
 	__be64 seqno;
+	__u8 proto = *skb_mac_header(skb);
 
 	/* skb is pure payload to encrypt */
 	aead = x->data;
@@ -186,11 +242,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	blksize = ALIGN(crypto_aead_blocksize(aead), 4);
 	clen = ALIGN(skb->len + 2 + tfclen, blksize);
 	plen = clen - skb->len - tfclen;
-
-	err = skb_cow_data(skb, tfclen + plen + alen, &trailer);
-	if (err < 0)
-		goto error;
-	nfrags = err;
+	tailen = tfclen + plen + alen;
 
 	assoclen = sizeof(*esph);
 	seqhilen = 0;
@@ -200,19 +252,130 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += seqhilen;
 	}
 
-	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
-	if (!tmp) {
-		err = -ENOMEM;
-		goto error;
+	*skb_mac_header(skb) = IPPROTO_ESP;
+	esph = ip_esp_hdr(skb);
+
+	if (!skb_cloned(skb)) {
+		if (tailen <= skb_availroom(skb)) {
+			nfrags = 1;
+			trailer = skb;
+			tail = skb_tail_pointer(trailer);
+
+			goto skip_cow;
+		} else if ((skb_shinfo(skb)->nr_frags < MAX_SKB_FRAGS)
+			   && !skb_has_frag_list(skb)) {
+			int allocsize;
+			struct sock *sk = skb->sk;
+			struct page_frag *pfrag = &x->xfrag;
+
+			allocsize = ALIGN(tailen, L1_CACHE_BYTES);
+
+			spin_lock_bh(&x->lock);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				goto cow;
+			}
+
+			page = pfrag->page;
+			get_page(page);
+
+			vaddr = kmap_atomic(page);
+
+			tail = vaddr + pfrag->offset;
+
+			/* Fill padding... */
+			if (tfclen) {
+				memset(tail, 0, tfclen);
+				tail += tfclen;
+			}
+			do {
+				int i;
+				for (i = 0; i < plen - 2; i++)
+					tail[i] = i + 1;
+			} while (0);
+			tail[plen - 2] = plen - 2;
+			tail[plen - 1] = proto;
+
+			kunmap_atomic(vaddr);
+
+			nfrags = skb_shinfo(skb)->nr_frags;
+
+			__skb_fill_page_desc(skb, nfrags, page, pfrag->offset,
+					     tailen);
+			skb_shinfo(skb)->nr_frags = ++nfrags;
+
+			pfrag->offset = pfrag->offset + allocsize;
+			nfrags++;
+
+			skb->len += tailen;
+			skb->data_len += tailen;
+			skb->truesize += tailen;
+			if (sk)
+				atomic_add(tailen, &sk->sk_wmem_alloc);
+
+			skb_push(skb, -skb_network_offset(skb));
+
+			esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+			esph->spi = x->id.spi;
+
+			tmp = esp_alloc_tmp(aead, nfrags + 2, seqhilen);
+			if (!tmp) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+			seqhi = esp_tmp_seqhi(tmp);
+			iv = esp_tmp_iv(aead, tmp, seqhilen);
+			req = esp_tmp_req(aead, iv);
+			sg = esp_req_sg(aead, req);
+			dsg = &sg[nfrags];
+
+			esph = esp_output_set_esn(skb, esph, seqhi);
+
+			sg_init_table(sg, nfrags);
+			skb_to_sgvec(skb, sg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES);
+
+			if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) {
+				spin_unlock_bh(&x->lock);
+				err = -ENOMEM;
+				goto error;
+			}
+
+			skb_shinfo(skb)->nr_frags = 1;
+
+			page = pfrag->page;
+			get_page(page);
+			/* replace page frags in skb with new page */
+			__skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len);
+			pfrag->offset = pfrag->offset + allocsize;
+
+			sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1);
+			skb_to_sgvec(skb, dsg,
+				     (unsigned char *)esph - skb->data,
+				     assoclen + ivlen + clen + alen);
+
+			spin_unlock_bh(&x->lock);
+
+			goto skip_cow2;
+		}
 	}
 
-	seqhi = esp_tmp_seqhi(tmp);
-	iv = esp_tmp_iv(aead, tmp, seqhilen);
-	req = esp_tmp_req(aead, iv);
-	sg = esp_req_sg(aead, req);
+cow:
+	err = skb_cow_data(skb, tailen, &trailer);
+	if (err < 0)
+		goto error;
+	nfrags = err;
 
-	/* Fill padding... */
 	tail = skb_tail_pointer(trailer);
+	esph = ip_esp_hdr(skb);
+
+skip_cow:
+	/* Fill padding... */
 	if (tfclen) {
 		memset(tail, 0, tfclen);
 		tail += tfclen;
@@ -223,36 +386,40 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 			tail[i] = i + 1;
 	} while (0);
 	tail[plen - 2] = plen - 2;
-	tail[plen - 1] = *skb_mac_header(skb);
+	tail[plen - 1] = proto;
 	pskb_put(skb, trailer, clen - skb->len + alen);
 
 	skb_push(skb, -skb_network_offset(skb));
-	esph = ip_esp_hdr(skb);
-	*skb_mac_header(skb) = IPPROTO_ESP;
 
 	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+	esph->spi = x->id.spi;
 
-	aead_request_set_callback(req, 0, esp_output_done, skb);
-
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * encryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		esph = (void *)(skb_transport_header(skb) - sizeof(__be32));
-		*seqhi = esph->spi;
-		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi);
-		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
+	if (!tmp) {
+		err = -ENOMEM;
+		goto error;
 	}
 
-	esph->spi = x->id.spi;
+	seqhi = esp_tmp_seqhi(tmp);
+	iv = esp_tmp_iv(aead, tmp, seqhilen);
+	req = esp_tmp_req(aead, iv);
+	sg = esp_req_sg(aead, req);
+	dsg = sg;
+
+	esph = esp_output_set_esn(skb, esph, seqhi);
 
 	sg_init_table(sg, nfrags);
 	skb_to_sgvec(skb, sg,
 		     (unsigned char *)esph - skb->data,
 		     assoclen + ivlen + clen + alen);
 
-	aead_request_set_crypt(req, sg, sg, ivlen + clen, iv);
+skip_cow2:
+	if ((x->props.flags & XFRM_STATE_ESN))
+		aead_request_set_callback(req, 0, esp_output_done_esn, skb);
+	else
+		aead_request_set_callback(req, 0, esp_output_done, skb);
+
+	aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv);
 	aead_request_set_ad(req, assoclen);
 
 	seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low +
@@ -278,6 +445,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 			esp_output_restore_header(skb);
 	}
 
+	if (sg != dsg)
+		esp_ssg_unref(x, tmp);
 	kfree(tmp);
 
 error:
@@ -343,6 +512,23 @@ static void esp_input_restore_header(struct sk_buff *skb)
 	__skb_pull(skb, 4);
 }
 
+static void esp_input_set_header(struct sk_buff *skb, __be32 *seqhi)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)skb->data;
+
+	/* For ESN we move the header forward by 4 bytes to
+	 * accommodate the high bits.  We will move it back after
+	 * decryption.
+	 */
+	if ((x->props.flags & XFRM_STATE_ESN)) {
+		esph = (void *)skb_push(skb, 4);
+		*seqhi = esph->spi;
+		esph->spi = esph->seq_no;
+		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+	}
+}
+
 static void esp_input_done_esn(struct crypto_async_request *base, int err)
 {
 	struct sk_buff *skb = base->data;
@@ -378,14 +564,6 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 		goto out;
 	}
 
-	nfrags = skb_cow_data(skb, 0, &trailer);
-	if (nfrags < 0) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = -ENOMEM;
-
 	assoclen = sizeof(*esph);
 	seqhilen = 0;
 
@@ -394,6 +572,27 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 		assoclen += seqhilen;
 	}
 
+	if (!skb_cloned(skb)) {
+		if (!skb_is_nonlinear(skb)) {
+			nfrags = 1;
+
+			goto skip_cow;
+		} else if (!skb_has_frag_list(skb)) {
+			nfrags = skb_shinfo(skb)->nr_frags;
+			nfrags++;
+
+			goto skip_cow;
+		}
+	}
+
+	nfrags = skb_cow_data(skb, 0, &trailer);
+	if (nfrags < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+skip_cow:
+	ret = -ENOMEM;
 	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
 	if (!tmp)
 		goto out;
@@ -404,26 +603,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 	req = esp_tmp_req(aead, iv);
 	sg = esp_req_sg(aead, req);
 
-	skb->ip_summed = CHECKSUM_NONE;
+	esp_input_set_header(skb, seqhi);
 
-	esph = (struct ip_esp_hdr *)skb->data;
+	sg_init_table(sg, nfrags);
+	skb_to_sgvec(skb, sg, 0, skb->len);
 
-	aead_request_set_callback(req, 0, esp_input_done, skb);
+	skb->ip_summed = CHECKSUM_NONE;
 
-	/* For ESN we move the header forward by 4 bytes to
-	 * accomodate the high bits.  We will move it back after
-	 * decryption.
-	 */
-	if ((x->props.flags & XFRM_STATE_ESN)) {
-		esph = (void *)skb_push(skb, 4);
-		*seqhi = esph->spi;
-		esph->spi = esph->seq_no;
-		esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.input.hi);
+	if ((x->props.flags & XFRM_STATE_ESN))
 		aead_request_set_callback(req, 0, esp_input_done_esn, skb);
-	}
-
-	sg_init_table(sg, nfrags);
-	skb_to_sgvec(skb, sg, 0, skb->len);
+	else
+		aead_request_set_callback(req, 0, esp_input_done, skb);
 
 	aead_request_set_crypt(req, sg, sg, elen + ivlen, iv);
 	aead_request_set_ad(req, assoclen);
-- 
1.9.1


* [PATCH RFC 03/11] net: Prepare for IPsec GRO
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 01/11] esp4: Avoid skb_cow_data whenever possible Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 02/11] esp6: " Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 04/11] esp: Add a software GRO codepath Steffen Klassert
                   ` (7 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

This patch prepares the generic codepath for IPsec GRO.
We introduce a new GRO_CONSUMED return value to reflect
that IPsec can return asynchronously. On IPsec GRO we grab
the packet and reinject it back into layer 2 after IPsec
processing. We also use one xfrm_gro bit on the sk_buff
that is set by IPsec to signal GRO. If this bit is set,
we call napi_gro_receive for the backlog device instead
of __netif_receive_skb.
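
The resulting receive round trip looks like this (a sketch
of the hooks in the diff below; the ESP gro_receive handler
itself is added in a later patch):

    /*
     * dev_gro_receive()
     *   -> IPsec gro_receive handler returns ERR_PTR(-EINPROGRESS)
     *   -> mapped to GRO_CONSUMED, the GRO layer stops here
     *
     * xfrm input processing (possibly asynchronous), then:
     *   -> transport_finish(skb, async)
     *   -> netif_rx(skb) with skb->xfrm_gro set
     *
     * process_backlog()
     *   -> skb_xfrm_gro(skb) ? napi_gro_receive(napi, skb)
     *                        : __netif_receive_skb(skb)
     */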

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/netdevice.h |  1 +
 include/linux/skbuff.h    | 14 +++++++++++++-
 net/core/dev.c            | 17 ++++++++++++++++-
 net/xfrm/Kconfig          |  4 ++++
 4 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a10d8d1..8bbd2d3 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -342,6 +342,7 @@ enum gro_result {
 	GRO_HELD,
 	GRO_NORMAL,
 	GRO_DROP,
+	GRO_CONSUMED,
 };
 typedef enum gro_result gro_result_t;
 
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 4c5662f..f21da42 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -732,7 +732,10 @@ struct sk_buff {
 #ifdef CONFIG_NET_SWITCHDEV
 	__u8			offload_fwd_mark:1;
 #endif
-	/* 2, 4 or 5 bit hole */
+#ifdef CONFIG_XFRM_OFFLOAD
+	__u8			xfrm_gro:1;
+#endif
+	/* 1 to 5 bits hole */
 
 #ifdef CONFIG_NET_SCHED
 	__u16			tc_index;	/* traffic control index */
@@ -3679,6 +3682,15 @@ static inline struct sec_path *skb_sec_path(struct sk_buff *skb)
 #endif
 }
 
+static inline bool skb_xfrm_gro(struct sk_buff *skb)
+{
+#ifdef CONFIG_XFRM_OFFLOAD
+	return skb->xfrm_gro;
+#else
+	return false;
+#endif
+}
+
 /* Keeps track of mac header offset relative to skb->head.
  * It is useful for TSO of Tunneling protocol. e.g. GRE.
  * For non-tunnel skb it points to skb_mac_header() and for
diff --git a/net/core/dev.c b/net/core/dev.c
index 9dbece2..19e621d 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4531,6 +4531,11 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
 	}
 	rcu_read_unlock();
 
+	if (PTR_ERR(pp) == -EINPROGRESS) {
+		ret = GRO_CONSUMED;
+		goto ok;
+	}
+
 	if (&ptype->list == head)
 		goto normal;
 
@@ -4629,6 +4634,9 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 	case GRO_MERGED_FREE:
 		if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
 			skb_dst_drop(skb);
+#ifdef CONFIG_XFRM_OFFLOAD
+			secpath_put(skb->sp);
+#endif
 			kmem_cache_free(skbuff_head_cache, skb);
 		} else {
 			__kfree_skb(skb);
@@ -4637,6 +4645,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
 
 	case GRO_HELD:
 	case GRO_MERGED:
+	case GRO_CONSUMED:
 		break;
 	}
 
@@ -4707,6 +4716,7 @@ static gro_result_t napi_frags_finish(struct napi_struct *napi,
 		break;
 
 	case GRO_MERGED:
+	case GRO_CONSUMED:
 		break;
 	}
 
@@ -4849,7 +4859,12 @@ static int process_backlog(struct napi_struct *napi, int quota)
 
 		while ((skb = __skb_dequeue(&sd->process_queue))) {
 			rcu_read_lock();
-			__netif_receive_skb(skb);
+
+			if (skb_xfrm_gro(skb))
+				napi_gro_receive(napi, skb);
+			else
+				__netif_receive_skb(skb);
+
 			rcu_read_unlock();
 			input_queue_head_incr(sd);
 			if (++work >= quota)
diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig
index bda1a13..442ac61 100644
--- a/net/xfrm/Kconfig
+++ b/net/xfrm/Kconfig
@@ -5,6 +5,10 @@ config XFRM
        bool
        depends on NET
 
+config XFRM_OFFLOAD
+	bool
+	depends on XFRM
+
 config XFRM_ALGO
 	tristate
 	select XFRM
-- 
1.9.1


* [PATCH RFC 04/11] esp: Add a software GRO codepath
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
                   ` (2 preceding siblings ...)
  2016-09-23  7:53 ` [PATCH RFC 03/11] net: Prepare for IPsec GRO Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 05/11] skbuff: Extend gso_type to unsigned int Steffen Klassert
                   ` (6 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

This patch adds GRO callbacks for ESP on ipv4 and ipv6.

In case the GRO layer detects an ESP packet, the
esp{4,6}_gro_receive() function calls the xfrm input layer,
which decapsulates the packet and reinjects it into
layer 2 by calling netif_rx().
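
The xfrm input layer distinguishes its callers by the
encap_type argument; the convention after this patch is
(a summary of the diff below):

    /*
     * encap_type > 0:   UDP encapsulation types, as before
     * encap_type == 0:  plain ESP input
     * encap_type == -1: resumption after async crypto
     * encap_type < -1:  call from the GRO layer (the ESP
     *                   gro_receive handlers pass -2); input
     *                   paths that cannot handle this, like
     *                   vti, return -EOPNOTSUPP so the packet
     *                   falls back to the normal codepath.
     */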

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/Kconfig                |  6 +++
 net/ipv4/Makefile               |  1 +
 net/ipv4/esp4.c                 |  5 +++
 net/ipv4/esp4_offload.c         | 94 +++++++++++++++++++++++++++++++++++++++
 net/ipv4/ip_vti.c               |  4 ++
 net/ipv4/xfrm4_input.c          |  3 ++
 net/ipv4/xfrm4_mode_transport.c |  3 +-
 net/ipv6/Kconfig                |  6 +++
 net/ipv6/Makefile               |  1 +
 net/ipv6/esp6.c                 |  5 +++
 net/ipv6/esp6_offload.c         | 98 +++++++++++++++++++++++++++++++++++++++++
 net/ipv6/xfrm6_input.c          |  3 ++
 net/xfrm/xfrm_input.c           | 26 ++++++++---
 13 files changed, 247 insertions(+), 8 deletions(-)
 create mode 100644 net/ipv4/esp4_offload.c
 create mode 100644 net/ipv6/esp6_offload.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 50d6a9b..44d563a 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -360,6 +360,12 @@ config INET_ESP
 
 	  If unsure, say Y.
 
+config INET_ESP_OFFLOAD
+	tristate "IP: ESP transformation offload"
+	depends on INET_ESP
+	select XFRM_OFFLOAD
+	default n
+
 config INET_IPCOMP
 	tristate "IP: IPComp transformation"
 	select INET_XFRM_TUNNEL
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index 24629b6..03da68e 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -29,6 +29,7 @@ obj-$(CONFIG_NET_IPVTI) += ip_vti.o
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_AH) += ah4.o
 obj-$(CONFIG_INET_ESP) += esp4.o
+obj-$(CONFIG_INET_ESP_OFFLOAD) += esp4_offload.o
 obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
 obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
 obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index a62c6214..cb1c10b 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -635,6 +635,11 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb)
 			nfrags = 1;
 
 			goto skip_cow;
+		} else if (skb_xfrm_gro(skb)) {
+			nfrags = skb_shinfo(skb)->nr_frags;
+			nfrags++;
+
+			goto skip_cow;
 		} else if (!skb_has_frag_list(skb)) {
 			nfrags = skb_shinfo(skb)->nr_frags;
 			nfrags++;
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
new file mode 100644
index 0000000..7277d15
--- /dev/null
+++ b/net/ipv4/esp4_offload.c
@@ -0,0 +1,94 @@
+/*
+ * IPV4 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/udp.h>
+
+static struct sk_buff **esp4_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	int err;
+	if (NAPI_GRO_CB(skb)->flush)
+		goto out;
+
+	skb_pull(skb, skb_gro_offset(skb));
+	skb->xfrm_gro = 1;
+
+	err = xfrm4_rcv_encap(skb, IPPROTO_ESP, 0, -2);
+	if (err == -EOPNOTSUPP) {
+		skb_push(skb, skb_gro_offset(skb));
+		NAPI_GRO_CB(skb)->same_flow = 0;
+		NAPI_GRO_CB(skb)->flush = 1;
+		skb->xfrm_gro = 0;
+		goto out;
+	}
+
+	return ERR_PTR(-EINPROGRESS);
+out:
+	return NULL;
+}
+
+static int esp4_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct crypto_aead *aead = x->data;
+	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data + nhoff);
+	struct packet_offload *ptype;
+	int err = -ENOENT;
+	__be16 type = skb->protocol;
+
+	rcu_read_lock();
+	ptype = gro_find_complete_by_type(type);
+	if (ptype != NULL)
+		err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*esph) + crypto_aead_ivsize(aead));
+
+	rcu_read_unlock();
+
+	return err;
+}
+
+static const struct net_offload esp4_offload = {
+	.callbacks = {
+		.gro_receive = esp4_gro_receive,
+		.gro_complete = esp4_gro_complete,
+	},
+};
+
+static int __init esp4_offload_init(void)
+{
+	return inet_add_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+static void __exit esp4_offload_exit(void)
+{
+	inet_del_offload(&esp4_offload, IPPROTO_ESP);
+}
+
+module_init(esp4_offload_init);
+module_exit(esp4_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index cc701fa..dbcaecf 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -60,6 +60,10 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 				  iph->saddr, iph->daddr, 0);
 	if (tunnel) {
+		/* encap_type < -1 indicates a GRO call; we don't support this. */
+		if (encap_type < -1)
+			return -EOPNOTSUPP;
+
 		if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 			goto drop;
 
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 62e1e72..41578eb 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -53,6 +53,9 @@ int xfrm4_transport_finish(struct sk_buff *skb, int async)
 	iph->tot_len = htons(skb->len);
 	ip_send_check(iph);
 
+	if (skb_xfrm_gro(skb))
+		return 0;
+
 	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
 		dev_net(skb->dev), NULL, skb, skb->dev, NULL,
 		xfrm4_rcv_encap_finish);
diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index fd840c7..7b447f3 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -50,7 +50,8 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb)
 		skb->network_header = skb->transport_header;
 	}
 	ip_hdr(skb)->tot_len = htons(skb->len + ihl);
-	skb_reset_transport_header(skb);
+	if (!skb_xfrm_gro(skb))
+		skb_reset_transport_header(skb);
 	return 0;
 }
 
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 2343e4f..cb107b4 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -75,6 +75,12 @@ config INET6_ESP
 
 	  If unsure, say Y.
 
+config INET6_ESP_OFFLOAD
+	tristate "IPv6: ESP transformation offload"
+	depends on INET6_ESP
+	select XFRM_OFFLOAD
+	default n
+
 config INET6_IPCOMP
 	tristate "IPv6: IPComp transformation"
 	select INET6_XFRM_TUNNEL
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index c174ccb..a854474 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -28,6 +28,7 @@ ipv6-objs += $(ipv6-y)
 
 obj-$(CONFIG_INET6_AH) += ah6.o
 obj-$(CONFIG_INET6_ESP) += esp6.o
+obj-$(CONFIG_INET6_ESP_OFFLOAD) += esp6_offload.o
 obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
 obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o
 obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 5cb6b35..2280bc6 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -577,6 +577,11 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 			nfrags = 1;
 
 			goto skip_cow;
+		} else if (skb_xfrm_gro(skb)) {
+			nfrags = skb_shinfo(skb)->nr_frags;
+			nfrags++;
+
+			goto skip_cow;
 		} else if (!skb_has_frag_list(skb)) {
 			nfrags = skb_shinfo(skb)->nr_frags;
 			nfrags++;
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
new file mode 100644
index 0000000..e0006cf
--- /dev/null
+++ b/net/ipv6/esp6_offload.c
@@ -0,0 +1,98 @@
+/*
+ * IPV6 GSO/GRO offload support
+ * Linux INET implementation
+ *
+ * Copyright (C) 2016 secunet Security Networks AG
+ * Author: Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * ESP GRO support
+ */
+
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/protocol.h>
+#include <crypto/aead.h>
+#include <crypto/authenc.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <net/ip.h>
+#include <net/xfrm.h>
+#include <net/esp.h>
+#include <linux/scatterlist.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/ip6_route.h>
+#include <net/ipv6.h>
+#include <linux/icmpv6.h>
+
+static struct sk_buff **esp6_gro_receive(struct sk_buff **head,
+					 struct sk_buff *skb)
+{
+	int err;
+	if (NAPI_GRO_CB(skb)->flush)
+		goto out;
+
+	skb_pull(skb, skb_gro_offset(skb));
+	skb->xfrm_gro = 1;
+
+	XFRM_SPI_SKB_CB(skb)->family = AF_INET6;
+	XFRM_SPI_SKB_CB(skb)->daddroff = offsetof(struct ipv6hdr, daddr);
+	err = xfrm_input(skb, IPPROTO_ESP, 0, -2);
+	if (err == -EOPNOTSUPP) {
+		skb_push(skb, skb_gro_offset(skb));
+		NAPI_GRO_CB(skb)->same_flow = 0;
+		NAPI_GRO_CB(skb)->flush = 1;
+		skb->xfrm_gro = 0;
+		goto out;
+	}
+
+	return ERR_PTR(-EINPROGRESS);
+out:
+	return NULL;
+}
+
+static int esp6_gro_complete(struct sk_buff *skb, int nhoff)
+{
+	struct xfrm_state *x = xfrm_input_state(skb);
+	struct crypto_aead *aead = x->data;
+	struct ip_esp_hdr *esph = (struct ip_esp_hdr *)(skb->data + nhoff);
+	struct packet_offload *ptype;
+	int err = -ENOENT;
+	__be16 type = skb->protocol;
+
+	rcu_read_lock();
+	ptype = gro_find_complete_by_type(type);
+	if (ptype != NULL)
+		err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*esph) + crypto_aead_ivsize(aead));
+
+	rcu_read_unlock();
+
+	return err;
+}
+
+static const struct net_offload esp6_offload = {
+	.callbacks = {
+		.gro_receive = esp6_gro_receive,
+		.gro_complete = esp6_gro_complete,
+	},
+};
+
+static int __init esp6_offload_init(void)
+{
+	return inet6_add_offload(&esp6_offload, IPPROTO_ESP);
+}
+
+static void __exit esp6_offload_exit(void)
+{
+	inet6_del_offload(&esp6_offload, IPPROTO_ESP);
+}
+
+module_init(esp6_offload_init);
+module_exit(esp6_offload_exit);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Steffen Klassert <steffen.klassert@secunet.com>");
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 00a2d40..6ffc172 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -43,6 +43,9 @@ int xfrm6_transport_finish(struct sk_buff *skb, int async)
 	ipv6_hdr(skb)->payload_len = htons(skb->len);
 	__skb_push(skb, skb->data - skb_network_header(skb));
 
+	if (skb_xfrm_gro(skb))
+		return -1;
+
 	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING,
 		dev_net(skb->dev), NULL, skb, skb->dev, NULL,
 		ip6_rcv_finish);
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 6e3f025..b1c2d77 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -193,13 +193,17 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 	int decaps = 0;
 	int async = 0;
 
-	/* A negative encap_type indicates async resumption. */
 	if (encap_type < 0) {
-		async = 1;
-		x = xfrm_input_state(skb);
-		seq = XFRM_SKB_CB(skb)->seq.input.low;
-		family = x->outer_mode->afinfo->family;
-		goto resume;
+		/* An encap_type of -1 indicates async resumption. */
+		if (encap_type == -1) {
+			async = 1;
+			x = xfrm_input_state(skb);
+			seq = XFRM_SKB_CB(skb)->seq.input.low;
+			family = x->outer_mode->afinfo->family;
+			goto resume;
+		}
+		/* encap_type < -1 indicates a GRO call. */
+		encap_type = 0;
 	}
 
 	daddr = (xfrm_address_t *)(skb_network_header(skb) +
@@ -374,7 +378,15 @@ resume:
 		netif_rx(skb);
 		return 0;
 	} else {
-		return x->inner_mode->afinfo->transport_finish(skb, async);
+		int xfrm_gro = skb_xfrm_gro(skb);
+
+		err = x->inner_mode->afinfo->transport_finish(skb, async);
+		if (xfrm_gro) {
+			netif_rx(skb);
+			return 0;
+		}
+
+		return err;
 	}
 
 drop_unlock:
-- 
1.9.1


* [PATCH RFC 05/11] skbuff: Extend gso_type to unsigned int.
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
                   ` (3 preceding siblings ...)
  2016-09-23  7:53 ` [PATCH RFC 04/11] esp: Add a software GRO codepath Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  2016-09-23 13:19   ` David Laight
  2016-09-23  7:53 ` [PATCH RFC 06/11] net: Add ESP offload features Steffen Klassert
                   ` (5 subsequent siblings)
  10 siblings, 1 reply; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

All available gso_type flags are currently in use,
so extend gso_type to be able to add further flags.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/skbuff.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index f21da42..c1fd854 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -417,7 +417,7 @@ struct skb_shared_info {
 	unsigned short	gso_size;
 	/* Warning: this field is not always filled in (UFO)! */
 	unsigned short	gso_segs;
-	unsigned short  gso_type;
+	unsigned int	gso_type;
 	struct sk_buff	*frag_list;
 	struct skb_shared_hwtstamps hwtstamps;
 	u32		tskey;
-- 
1.9.1


* [PATCH RFC 06/11] net: Add ESP offload features
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
                   ` (4 preceding siblings ...)
  2016-09-23  7:53 ` [PATCH RFC 05/11] skbuff: Extend gso_type to unsigned int Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 07/11] esp: Add gso handlers for esp4 and esp6 Steffen Klassert
                   ` (4 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

This patch adds netdev features to configure IPsec offloads.
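
A driver implementing the offload would advertise the new
bits roughly as follows (hypothetical driver snippet, not
part of this patch):

    /* Hypothetical probe-time setup in an offloading driver. */
    netdev->hw_features |= NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM |
                           NETIF_F_GSO_ESP;
    netdev->features |= netdev->hw_features;

The features then appear as "esp-hw-offload",
"esp-tx-csum-hw-offload" and "tx-esp-segmentation" in
ethtool -k output and can be toggled with ethtool -K.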

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/netdev_features.h | 8 +++++++-
 include/linux/netdevice.h       | 1 +
 include/linux/skbuff.h          | 2 ++
 net/core/ethtool.c              | 3 +++
 4 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 9c6c8ef..79e72e6 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -54,8 +54,9 @@ enum {
 					 */
 	NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */
 	NETIF_F_GSO_SCTP_BIT,		/* ... SCTP fragmentation */
+	NETIF_F_GSO_ESP_BIT,		/* ... ESP with TSO */
 	/**/NETIF_F_GSO_LAST =		/* last bit, see GSO_MASK */
-		NETIF_F_GSO_SCTP_BIT,
+		NETIF_F_GSO_ESP_BIT,
 
 	NETIF_F_FCOE_CRC_BIT,		/* FCoE CRC32 */
 	NETIF_F_SCTP_CRC_BIT,		/* SCTP checksum offload */
@@ -72,6 +73,8 @@ enum {
 	NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
 	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
 	NETIF_F_BUSY_POLL_BIT,		/* Busy poll */
+	NETIF_F_HW_ESP_BIT,		/* Hardware ESP transformation offload */
+	NETIF_F_HW_ESP_TX_CSUM_BIT,	/* ESP with TX checksum offload */
 
 	NETIF_F_HW_TC_BIT,		/* Offload TC infrastructure */
 
@@ -130,12 +133,15 @@ enum {
 #define NETIF_F_GSO_PARTIAL	 __NETIF_F(GSO_PARTIAL)
 #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM)
 #define NETIF_F_GSO_SCTP	__NETIF_F(GSO_SCTP)
+#define NETIF_F_GSO_ESP		__NETIF_F(GSO_ESP)
 #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
 #define NETIF_F_HW_VLAN_STAG_RX	__NETIF_F(HW_VLAN_STAG_RX)
 #define NETIF_F_HW_VLAN_STAG_TX	__NETIF_F(HW_VLAN_STAG_TX)
 #define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
 #define NETIF_F_BUSY_POLL	__NETIF_F(BUSY_POLL)
 #define NETIF_F_HW_TC		__NETIF_F(HW_TC)
+#define NETIF_F_HW_ESP		__NETIF_F(HW_ESP)
+#define NETIF_F_HW_ESP_TX_CSUM	__NETIF_F(HW_ESP_TX_CSUM)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
 	for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 8bbd2d3..c3ef027 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -4090,6 +4090,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 	BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT));
 	BUILD_BUG_ON(SKB_GSO_SCTP    != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT));
+	BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT));
 
 	return (features & feature) == feature;
 }
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c1fd854..85f81ac 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -490,6 +490,8 @@ enum {
 	SKB_GSO_TUNNEL_REMCSUM = 1 << 14,
 
 	SKB_GSO_SCTP = 1 << 15,
+
+	SKB_GSO_ESP = 1 << 16,
 };
 
 #if BITS_PER_LONG > 32
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 9774898..a2a9f01 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -90,6 +90,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation",
 	[NETIF_F_GSO_PARTIAL_BIT] =	 "tx-gso-partial",
 	[NETIF_F_GSO_SCTP_BIT] =	 "tx-sctp-segmentation",
+	[NETIF_F_GSO_ESP_BIT] =		 "tx-esp-segmentation",
 
 	[NETIF_F_FCOE_CRC_BIT] =         "tx-checksum-fcoe-crc",
 	[NETIF_F_SCTP_CRC_BIT] =        "tx-checksum-sctp",
@@ -104,6 +105,8 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
 	[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
 	[NETIF_F_BUSY_POLL_BIT] =        "busy-poll",
 	[NETIF_F_HW_TC_BIT] =		 "hw-tc-offload",
+	[NETIF_F_HW_ESP_BIT] =		 "esp-hw-offload",
+	[NETIF_F_HW_ESP_TX_CSUM_BIT] =	 "esp-tx-csum-hw-offload",
 };
 
 static const char
-- 
1.9.1


* [PATCH RFC 07/11] esp: Add gso handlers for esp4 and esp6
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
                   ` (5 preceding siblings ...)
  2016-09-23  7:53 ` [PATCH RFC 06/11] net: Add ESP offload features Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 08/11] xfrm: Move device notifications to a separate file Steffen Klassert
                   ` (3 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

This patch extends the xfrm_type by an encap function pointer
and implements esp4_gso_encap and esp6_gso_encap. These functions
do the basic ESP encapsulation for a GSO packet. In case the
GSO packet needs to be segmented in software, we add gso_segment
functions. The secpath is extended to pass the needed information
down the layers. This codepath is going to be used for ESP
hardware offloads.
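
The per-segment flow in the new gso_segment callbacks can
be summarized like this (a sketch of the esp4 variant in
the diff below):

    /*
     * For every segment skb2 produced by the inner
     * gso_segment callback:
     *
     *   skb2->sp->flags |= SKB_GSO_SEGMENT;
     *   skb2->sp->seq.low = seq;        per-segment sequence
     *   skb2->sp->seq.hi = xfrm_replay_seqhi(x, ntohl(seq));
     *   x->type->output(x, skb2);       esp_output() adds the
     *                                   ESP header and encrypts
     *   seq++;
     */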

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h      |  14 ++++++-
 net/ipv4/esp4.c         |  29 +++++++++++--
 net/ipv4/esp4_offload.c | 108 ++++++++++++++++++++++++++++++++++++++++++++++++
 net/ipv6/esp6.c         |  27 +++++++++++-
 net/ipv6/esp6_offload.c | 106 +++++++++++++++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_replay.c  |   3 +-
 6 files changed, 280 insertions(+), 7 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 5016fa2..1af8489 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -372,6 +372,7 @@ struct xfrm_type {
 	int			(*init_state)(struct xfrm_state *x);
 	void			(*destructor)(struct xfrm_state *);
 	int			(*input)(struct xfrm_state *, struct sk_buff *skb);
+	void			(*encap)(struct xfrm_state *, struct sk_buff *pskb);
 	int			(*output)(struct xfrm_state *, struct sk_buff *pskb);
 	int			(*reject)(struct xfrm_state *, struct sk_buff *,
 					  const struct flowi *);
@@ -979,7 +980,18 @@ void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
 struct sec_path {
 	atomic_t		refcnt;
 	int			len;
-	struct xfrm_state	*xvec[XFRM_MAX_DEPTH];
+
+	/* Output sequence number for replay protection on offloading. */
+	struct {
+		__u32 low;
+		__u32 hi;
+	} seq;
+
+	struct xfrm_state		*xvec[XFRM_MAX_DEPTH];
+
+	__u8				proto;
+	__u8				flags;
+#define SKB_GSO_SEGMENT			1
 };
 
 static inline int secpath_exists(struct sk_buff *skb)
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index cb1c10b..f61ba3c2 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -182,6 +182,22 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
 	esp_output_done(base, err);
 }
 
+static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ip_esp_hdr *esph;
+	struct iphdr *iph = ip_hdr(skb);
+	int proto = iph->protocol;
+
+	skb_push(skb, -skb_network_offset(skb));
+	esph = ip_esp_hdr(skb);
+	*skb_mac_header(skb) = IPPROTO_ESP;
+
+	esph->spi = x->id.spi;
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+	skb->sp->proto = proto;
+}
+
 static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct esp_output_extra *extra;
@@ -207,7 +223,7 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	int extralen;
 	int tailen;
 	__be64 seqno;
-	__u8 proto = *skb_mac_header(skb);
+	__u8 proto;
 
 	/* skb is pure payload to encrypt */
 
@@ -231,12 +247,18 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 	assoclen = sizeof(*esph);
 	extralen = 0;
 
+	if (skb->sp && (skb->sp->flags & SKB_GSO_SEGMENT)) {
+		proto = skb->sp->proto;
+	} else {
+		proto = *skb_mac_header(skb);
+		*skb_mac_header(skb) = IPPROTO_ESP;
+	}
+
 	if (x->props.flags & XFRM_STATE_ESN) {
 		extralen += sizeof(*extra);
 		assoclen += sizeof(__be32);
 	}
 
-	*skb_mac_header(skb) = IPPROTO_ESP;
 	esph = ip_esp_hdr(skb);
 
 	/* this is non-NULL only with UDP Encapsulation */
@@ -942,7 +964,8 @@ static const struct xfrm_type esp_type =
 	.destructor	= esp_destroy,
 	.get_mtu	= esp4_get_mtu,
 	.input		= esp_input,
-	.output		= esp_output
+	.output		= esp_output,
+	.encap		= esp4_gso_encap,
 };
 
 static struct xfrm4_protocol esp4_protocol = {
diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c
index 7277d15..d0831a8 100644
--- a/net/ipv4/esp4_offload.c
+++ b/net/ipv4/esp4_offload.c
@@ -71,10 +71,118 @@ static int esp4_gro_complete(struct sk_buff *skb, int nhoff)
 	return err;
 }
 
+static struct sk_buff *esp4_gso_segment(struct sk_buff *skb,
+				        netdev_features_t features)
+{
+	struct ip_esp_hdr *esph;
+	struct sk_buff *skb2;
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct xfrm_state *x;
+	struct sec_path *sp;
+	struct crypto_aead *aead;
+	int err = 0;
+	const struct net_offload *ops;
+	int omaclen;
+	__u32 seq;
+	__u32 seqhi;
+
+	sp = skb->sp;
+	if (!sp || !sp->len)
+		goto out;
+
+	seq = sp->seq.low;
+	seqhi = sp->seq.hi;
+
+	x = sp->xvec[sp->len - 1];
+	aead = x->data;
+	esph = ip_esp_hdr(skb);
+
+	omaclen = skb->mac_len;
+	if (esph->spi != x->id.spi)
+		goto out;
+
+	if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+		goto out;
+
+	__skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead));
+
+	skb->encap_hdr_csum = 1;
+
+	if (x->props.mode == XFRM_MODE_TUNNEL) {
+		__skb_push(skb, skb->mac_len);
+		segs = skb_mac_gso_segment(skb, features);
+	} else {
+		skb->transport_header += x->props.header_len;
+		ops = rcu_dereference(inet_offloads[sp->proto]);
+		if (likely(ops && ops->callbacks.gso_segment))
+			segs = ops->callbacks.gso_segment(skb, features);
+	}
+	if (IS_ERR(segs))
+		goto out;
+	if (segs == NULL)
+		return ERR_PTR(-EINVAL);
+	__skb_pull(skb, skb->data - skb_mac_header(skb));
+
+	skb2 = segs;
+	do {
+		struct sk_buff *nskb = skb2->next;
+
+		if (x->props.mode == XFRM_MODE_TUNNEL) {
+			skb2->network_header = skb2->network_header - x->props.header_len;
+			skb2->transport_header = skb2->network_header + sizeof(struct iphdr);
+			skb_reset_mac_len(skb2);
+			skb_pull(skb2, skb2->mac_len + x->props.header_len);
+		} else {
+			/* skb2 mac and data are pointing at the start of
+			 * the MAC header. Pull data forward to point to the
+			 * IP payload past the ESP header (i.e., transport
+			 * data that needs to be encrypted).
+			 * When IPsec transport mode is stacked with a tunnel,
+			 * skb2->data needs to point at the inner IP
+			 * header for tunnelled packets. After ->gso_segment,
+			 * skb2 will have the network/ip header pointing
+			 * at the inner IP header, and the transport_header
+			 * will be pointing at the inner IP payload. Thus we
+			 * need to use omaclen and the outer iphdr length to
+			 * make sure that pointers are set up correctly in
+			 * every case.
+			 */
+			struct iphdr *oiph =
+				(struct iphdr *)(skb2->data + omaclen);
+			int ihl = oiph->ihl * 4;
+
+			 __skb_pull(skb2, omaclen + ihl + x->props.header_len);
+
+			/* move ->transport_header to point to esp header */
+			skb_reset_transport_header(skb2);
+			skb2->transport_header -= x->props.header_len;
+		}
+
+		skb2->sp->flags |= SKB_GSO_SEGMENT;
+		skb2->sp->seq.low = seq;
+		skb2->sp->seq.hi = xfrm_replay_seqhi(x, ntohl(seq));
+
+		err = x->type->output(x, skb2);
+		if (err) {
+			kfree_skb_list(segs);
+			return ERR_PTR(err);
+		}
+
+		seq++;
+
+		skb_push(skb2, skb2->mac_len);
+		skb2 = nskb;
+	} while (skb2);
+
+out:
+	return segs;
+}
+
 static const struct net_offload esp4_offload = {
 	.callbacks = {
 		.gro_receive = esp4_gro_receive,
 		.gro_complete = esp4_gro_complete,
+		.gso_segment = esp4_gso_segment,
 	},
 };
 
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 2280bc6..9bcb32b 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -198,6 +198,22 @@ static void esp_output_done_esn(struct crypto_async_request *base, int err)
 	esp_output_done(base, err);
 }
 
+static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct ip_esp_hdr *esph;
+	struct ipv6hdr *iph = ipv6_hdr(skb);
+	int proto = iph->nexthdr;
+
+	skb_push(skb, -skb_network_offset(skb));
+	esph = ip_esp_hdr(skb);
+	*skb_mac_header(skb) = IPPROTO_ESP;
+
+	esph->spi = x->id.spi;
+	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
+
+	skb->sp->proto = proto;
+}
+
 static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 {
 	int err;
@@ -223,7 +239,7 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	u8 *vaddr;
 	__be32 *seqhi;
 	__be64 seqno;
-	__u8 proto = *skb_mac_header(skb);
+	__u8 proto;
 
 	/* skb is pure payload to encrypt */
 	aead = x->data;
@@ -247,12 +263,18 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 	assoclen = sizeof(*esph);
 	seqhilen = 0;
 
+	if (skb->sp && (skb->sp->flags & SKB_GSO_SEGMENT)) {
+		proto = skb->sp->proto;
+	} else {
+		proto = *skb_mac_header(skb);
+		*skb_mac_header(skb) = IPPROTO_ESP;
+	}
+
 	if (x->props.flags & XFRM_STATE_ESN) {
 		seqhilen += sizeof(__be32);
 		assoclen += seqhilen;
 	}
 
-	*skb_mac_header(skb) = IPPROTO_ESP;
 	esph = ip_esp_hdr(skb);
 
 	if (!skb_cloned(skb)) {
@@ -871,6 +893,7 @@ static const struct xfrm_type esp6_type = {
 	.get_mtu	= esp6_get_mtu,
 	.input		= esp6_input,
 	.output		= esp6_output,
+	.encap		= esp6_gso_encap,
 	.hdr_offset	= xfrm6_find_1stfragopt,
 };
 
diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c
index e0006cf..51efab0 100644
--- a/net/ipv6/esp6_offload.c
+++ b/net/ipv6/esp6_offload.c
@@ -75,10 +75,116 @@ static int esp6_gro_complete(struct sk_buff *skb, int nhoff)
 	return err;
 }
 
+static struct sk_buff *esp6_gso_segment(struct sk_buff *skb,
+				        netdev_features_t features)
+{
+	struct ip_esp_hdr *esph;
+	struct sk_buff *skb2;
+	struct sk_buff *segs = ERR_PTR(-EINVAL);
+	struct xfrm_state *x;
+	struct sec_path *sp;
+	struct crypto_aead *aead;
+	int err = 0;
+	const struct net_offload *ops;
+	int omaclen;
+	__u32 seq;
+	__u32 seqhi;
+
+	sp = skb->sp;
+	if (!sp || !sp->len)
+		goto out;
+
+	seq = sp->seq.low;
+	seqhi = sp->seq.hi;
+
+	x = sp->xvec[sp->len - 1];
+	aead = x->data;
+	esph = ip_esp_hdr(skb);
+
+	omaclen = skb->mac_len;
+	if (esph->spi != x->id.spi)
+		goto out;
+
+	if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)))
+		goto out;
+
+	__skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead));
+
+	skb->encap_hdr_csum = 1;
+
+	if (x->props.mode == XFRM_MODE_TUNNEL) {
+		__skb_push(skb, skb->mac_len);
+		segs = skb_mac_gso_segment(skb, features);
+	} else {
+		skb->transport_header += x->props.header_len;
+		ops = rcu_dereference(inet_offloads[sp->proto]);
+		if (likely(ops && ops->callbacks.gso_segment))
+			segs = ops->callbacks.gso_segment(skb, features);
+	}
+	if (IS_ERR(segs))
+		goto out;
+	if (segs == NULL)
+		return ERR_PTR(-EINVAL);
+	__skb_pull(skb, skb->data - skb_mac_header(skb));
+
+	skb2 = segs;
+	do {
+		struct sk_buff *nskb = skb2->next;
+
+		if (x->props.mode == XFRM_MODE_TUNNEL) {
+			skb2->network_header = skb2->network_header - x->props.header_len;
+			skb2->transport_header = skb2->network_header + sizeof(struct ipv6hdr);
+			skb_reset_mac_len(skb2);
+			skb_pull(skb2, skb2->mac_len + x->props.header_len);
+		} else {
+			/* skb2 mac and data are pointing at the start of
+			 * the MAC header. Pull data forward to point to the
+			 * IP payload past the ESP header (i.e., transport
+			 * data that needs to be encrypted).
+			 * When IPsec transport mode is stacked with a tunnel,
+			 * skb2->data needs to point at the inner IP
+			 * header for tunnelled packets. After ->gso_segment,
+			 * skb2 will have the network/ip header pointing
+			 * at the inner IP header, and the transport_header
+			 * will be pointing at the inner IP payload. Thus we
+			 * need to use omaclen and the outer iphdr length to
+			 * make sure that pointers are set up correctly in
+			 * every case.
+			 */
+
+			 __skb_pull(skb2, omaclen + sizeof(struct ipv6hdr) + x->props.header_len);
+
+			/* move ->transport_header to point to esp header */
+			skb_reset_transport_header(skb2);
+			skb2->transport_header -= x->props.header_len;
+		}
+
+		skb2->sp->flags |= SKB_GSO_SEGMENT;
+		skb2->sp->seq.low = seq;
+		skb2->sp->seq.hi = xfrm_replay_seqhi(x, ntohl(seq));
+
+		err = x->type->output(x, skb2);
+		if (err) {
+			kfree_skb_list(segs);
+			return ERR_PTR(err);
+		}
+
+		seq++;
+
+		skb_push(skb2, skb2->mac_len);
+		skb2 = nskb;
+	} while (skb2);
+
+out:
+	return segs;
+}
+
+
 static const struct net_offload esp6_offload = {
 	.callbacks = {
 		.gro_receive = esp6_gro_receive,
 		.gro_complete = esp6_gro_complete,
+		.gso_segment = esp6_gso_segment,
 	},
 };
 
diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index cdc2e2e..20e68a3 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -45,7 +45,8 @@ u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq)
 
 	return seq_hi;
 }
-
+EXPORT_SYMBOL(xfrm_replay_seqhi);
+
 static void xfrm_replay_notify(struct xfrm_state *x, int event)
 {
 	struct km_event c;
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH RFC 08/11] xfrm: Move device notifications to a separate file
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
                   ` (6 preceding siblings ...)
  2016-09-23  7:53 ` [PATCH RFC 07/11] esp: Add gso handlers for esp4 and esp6 Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 09/11] xfrm: Add an IPsec hardware offloading API Steffen Klassert
                   ` (2 subsequent siblings)
  10 siblings, 0 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

This is needed for the upcoming IPsec device offloading.

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/net/xfrm.h     |  1 +
 net/xfrm/Makefile      |  2 +-
 net/xfrm/xfrm_device.c | 43 +++++++++++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_policy.c | 17 +----------------
 4 files changed, 46 insertions(+), 17 deletions(-)
 create mode 100644 net/xfrm/xfrm_device.c

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 1af8489..a700f29 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -1408,6 +1408,7 @@ struct xfrm6_tunnel {
 void xfrm_init(void);
 void xfrm4_init(void);
 int xfrm_state_init(struct net *net);
+void xfrm_dev_init(void);
 void xfrm_state_fini(struct net *net);
 void xfrm4_state_init(void);
 void xfrm4_protocol_init(void);
diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile
index c0e9619..55b2ac3 100644
--- a/net/xfrm/Makefile
+++ b/net/xfrm/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \
 		      xfrm_input.o xfrm_output.o \
-		      xfrm_sysctl.o xfrm_replay.o
+		      xfrm_sysctl.o xfrm_replay.o xfrm_device.o
 obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o
 obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o
 obj-$(CONFIG_XFRM_USER) += xfrm_user.o
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
new file mode 100644
index 0000000..34a260a
--- /dev/null
+++ b/net/xfrm/xfrm_device.c
@@ -0,0 +1,43 @@
+/*
+ * xfrm_device.c - IPsec device offloading code.
+ *
+ * Copyright (c) 2015 secunet Security Networks AG
+ *
+ * Author:
+ * Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <net/dst.h>
+#include <net/xfrm.h>
+#include <linux/notifier.h>
+
+static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+	switch (event) {
+	case NETDEV_DOWN:
+		xfrm_garbage_collect(dev_net(dev));
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block xfrm_dev_notifier = {
+	.notifier_call	= xfrm_dev_event,
+};
+
+void __net_init xfrm_dev_init(void)
+{
+	register_netdevice_notifier(&xfrm_dev_notifier);
+}
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index fd69866..dfa0d86 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -2921,21 +2921,6 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
 
-static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
-{
-	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
-
-	switch (event) {
-	case NETDEV_DOWN:
-		xfrm_garbage_collect(dev_net(dev));
-	}
-	return NOTIFY_DONE;
-}
-
-static struct notifier_block xfrm_dev_notifier = {
-	.notifier_call	= xfrm_dev_event,
-};
-
 #ifdef CONFIG_XFRM_STATISTICS
 static int __net_init xfrm_statistics_init(struct net *net)
 {
@@ -3012,7 +2997,7 @@ static int __net_init xfrm_policy_init(struct net *net)
 	INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize);
 	INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild);
 	if (net_eq(net, &init_net))
-		register_netdevice_notifier(&xfrm_dev_notifier);
+		xfrm_dev_init();
 	return 0;
 
 out_bydst:
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH RFC 09/11] xfrm: Add an IPsec hardware offloading API
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
                   ` (7 preceding siblings ...)
  2016-09-23  7:53 ` [PATCH RFC 08/11] xfrm: Move device notifications to a separate file Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 10/11] xfrm: Add xfrm_replay_overflow functions for offloading Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 11/11] xfrm: Add encapsulation header offsets while SKB is not encrypted Steffen Klassert
  10 siblings, 0 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

This patch adds all the bits that are needed to do
IPsec hardware offload for IPsec states and ESP packets.
We add xfrmdev_ops to the net_device. xfrmdev_ops has
function pointers that are needed to manage the xfrm
states in the hardware and to make a per-packet
offloading decision.
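
A minimal sketch of how a driver might wire these ops up (the mydrv_*
names are hypothetical; only struct xfrmdev_ops and its four callbacks
come from this patch):

	static int mydrv_xdo_dev_state_add(struct xfrm_state *x)
	{
		/* Program SPI, keys and mode into the hardware SA table
		 * and remember the slot in x->xso.offload_handle.
		 */
		return 0;
	}

	static void mydrv_xdo_dev_state_delete(struct xfrm_state *x)
	{
		/* Invalidate the hardware SA entry. */
	}

	static void mydrv_xdo_dev_state_free(struct xfrm_state *x)
	{
		/* Release the hardware resources behind
		 * x->xso.offload_handle.
		 */
	}

	static bool mydrv_xdo_dev_offload_ok(struct sk_buff *skb,
					     struct xfrm_state *x)
	{
		/* Per-packet decision: return false to make the stack
		 * fall back to the software codepath.
		 */
		return true;
	}

	static const struct xfrmdev_ops mydrv_xfrmdev_ops = {
		.xdo_dev_state_add	= mydrv_xdo_dev_state_add,
		.xdo_dev_state_delete	= mydrv_xdo_dev_state_delete,
		.xdo_dev_state_free	= mydrv_xdo_dev_state_free,
		.xdo_dev_offload_ok	= mydrv_xdo_dev_offload_ok,
	};

The driver would then set netdev->xfrmdev_ops and advertise
NETIF_F_HW_ESP from its probe path.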

Joint work with:
Ilan Tayari <ilant@mellanox.com>
Guy Shapiro <guysh@mellanox.com>

Signed-off-by: Guy Shapiro <guysh@mellanox.com>
Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 include/linux/netdevice.h | 14 ++++++++
 include/net/xfrm.h        | 40 +++++++++++++++++++++
 include/uapi/linux/xfrm.h |  8 +++++
 net/ipv4/esp4.c           | 31 ++++++++++++++++-
 net/ipv4/xfrm4_output.c   |  3 +-
 net/ipv6/esp6.c           | 32 ++++++++++++++++-
 net/xfrm/xfrm_device.c    | 51 ++++++++++++++++++++++++++-
 net/xfrm/xfrm_input.c     | 41 +++++++++++++++++++++-
 net/xfrm/xfrm_output.c    | 61 ++++++++++++++++++++++++++++++--
 net/xfrm/xfrm_policy.c    | 11 +++---
 net/xfrm/xfrm_state.c     | 89 +++++++++++++++++++++++++++++++++++++++++++++++
 net/xfrm/xfrm_user.c      | 81 ++++++++++++++++++++++++++++++++++++++++++
 12 files changed, 449 insertions(+), 13 deletions(-)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index c3ef027..b2a511e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -832,6 +832,16 @@ struct netdev_xdp {
 	};
 };
 
+#ifdef CONFIG_XFRM
+struct xfrmdev_ops {
+	int	(*xdo_dev_state_add) (struct xfrm_state *x);
+	void	(*xdo_dev_state_delete) (struct xfrm_state *x);
+	void	(*xdo_dev_state_free) (struct xfrm_state *x);
+	bool	(*xdo_dev_offload_ok) (struct sk_buff *skb,
+				       struct xfrm_state *x);
+};
+#endif
+
 /*
  * This structure defines the management hooks for network devices.
  * The following hooks can be defined; unless noted otherwise, they are
@@ -1708,6 +1718,10 @@ struct net_device {
 	const struct ndisc_ops *ndisc_ops;
 #endif
 
+#ifdef CONFIG_XFRM
+	const struct xfrmdev_ops *xfrmdev_ops;
+#endif
+
 	const struct header_ops *header_ops;
 
 	unsigned int		flags;
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index a700f29..13060ce 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -120,6 +120,13 @@ struct xfrm_state_walk {
 	struct xfrm_address_filter *filter;
 };
 
+struct xfrm_state_offload {
+	struct net_device	*dev;
+	unsigned long		offload_handle;
+	unsigned int		num_exthdrs;
+	u8			flags;
+};
+
 /* Full description of state of transformer. */
 struct xfrm_state {
 	possible_net_t		xs_net;
@@ -207,6 +214,8 @@ struct xfrm_state {
 	struct xfrm_lifetime_cur curlft;
 	struct tasklet_hrtimer	mtimer;
 
+	struct xfrm_state_offload xso;
+
 	/* used to fix curlft->add_time when changing date */
 	long		saved_tmo;
 
@@ -373,6 +382,7 @@ struct xfrm_type {
 	void			(*destructor)(struct xfrm_state *);
 	int			(*input)(struct xfrm_state *, struct sk_buff *skb);
 	void			(*encap)(struct xfrm_state *, struct sk_buff *pskb);
+	int			(*input_tail)(struct xfrm_state *x, struct sk_buff *skb);
 	int			(*output)(struct xfrm_state *, struct sk_buff *pskb);
 	int			(*reject)(struct xfrm_state *, struct sk_buff *,
 					  const struct flowi *);
@@ -977,6 +987,22 @@ static inline void xfrm_dst_destroy(struct xfrm_dst *xdst)
 
 void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev);
 
+struct xfrm_offload_state {
+	u32			flags;
+#define	SA_DELETE_REQ		1
+#define	CRYPTO_DONE		2
+#define	CRYPTO_NEXT_DONE	4
+	u32			status;
+#define CRYPTO_SUCCESS				1
+#define CRYPTO_GENERIC_ERROR			2
+#define CRYPTO_TRANSPORT_AH_AUTH_FAILED		4
+#define CRYPTO_TRANSPORT_ESP_AUTH_FAILED	8
+#define CRYPTO_TUNNEL_AH_AUTH_FAILED		16
+#define CRYPTO_TUNNEL_ESP_AUTH_FAILED		32
+#define CRYPTO_INVALID_PACKET_SYNTAX		64
+#define CRYPTO_INVALID_PROTOCOL			128
+};
+
 struct sec_path {
 	atomic_t		refcnt;
 	int			len;
@@ -988,6 +1014,7 @@ struct sec_path {
 	} seq;
 
 	struct xfrm_state		*xvec[XFRM_MAX_DEPTH];
+	struct xfrm_offload_state	ovec[XFRM_MAX_DEPTH];
 
 	__u8				proto;
 	__u8				flags;
@@ -1514,6 +1541,7 @@ struct xfrmk_spdinfo {
 struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq);
 int xfrm_state_delete(struct xfrm_state *x);
 int xfrm_state_flush(struct net *net, u8 proto, bool task_valid);
+int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid);
 void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si);
 void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si);
 u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq);
@@ -1593,6 +1621,11 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb)
 }
 #endif
 
+struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif,
+				    const xfrm_address_t *saddr,
+				    const xfrm_address_t *daddr,
+				    int family);
+
 struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp);
 
 void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type);
@@ -1787,6 +1820,13 @@ static inline struct xfrm_state *xfrm_input_state(struct sk_buff *skb)
 {
 	return skb->sp->xvec[skb->sp->len - 1];
 }
+static inline struct xfrm_offload_state *xfrm_offload_input(struct sk_buff *skb)
+{
+	if (!skb->sp || !skb->sp->len)
+		return NULL;
+
+	return &skb->sp->ovec[skb->sp->len - 1];
+}
 #endif
 
 static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m)
diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h
index 1433389..8ceb753 100644
--- a/include/uapi/linux/xfrm.h
+++ b/include/uapi/linux/xfrm.h
@@ -303,6 +303,7 @@ enum xfrm_attr_type_t {
 	XFRMA_PROTO,		/* __u8 */
 	XFRMA_ADDRESS_FILTER,	/* struct xfrm_address_filter */
 	XFRMA_PAD,
+	XFRMA_OFFLOAD_DEV,	/* struct xfrm_state_offload */
 	__XFRMA_MAX
 
 #define XFRMA_MAX (__XFRMA_MAX - 1)
@@ -494,6 +495,13 @@ struct xfrm_address_filter {
 	__u8				dplen;
 };
 
+struct xfrm_user_offload {
+	int				ifindex;
+	__u8				flags;
+};
+#define XFRM_OFFLOAD_IPV6	1
+#define XFRM_OFFLOAD_INBOUND	2
+
 #ifndef __KERNEL__
 /* backwards compatibility for userspace */
 #define XFRMGRP_ACQUIRE		1
diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index f61ba3c2..3d2c749 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -361,6 +361,14 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb)
 			esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 			esph->spi = x->id.spi;
 
+			if (x->xso.offload_handle && skb->sp) {
+				if (skb->sp->flags & SKB_GSO_SEGMENT)
+					esph->seq_no = htonl(skb->sp->seq.low);
+
+				spin_unlock_bh(&x->lock);
+				return 0;
+			}
+
 			tmp = esp_alloc_tmp(aead, nfrags + 2, extralen);
 			if (!tmp) {
 				spin_unlock_bh(&x->lock);
@@ -435,6 +443,13 @@ skip_cow:
 	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 	esph->spi = x->id.spi;
 
+	if (x->xso.offload_handle && skb->sp) {
+		if (skb->sp->flags & SKB_GSO_SEGMENT)
+			esph->seq_no = htonl(skb->sp->seq.low);
+
+		return 0;
+	}
+
 	tmp = esp_alloc_tmp(aead, nfrags, extralen);
 	if (!tmp) {
 		err = -ENOMEM;
@@ -498,6 +513,7 @@ static int esp_input_done2(struct sk_buff *skb, int err)
 {
 	const struct iphdr *iph;
 	struct xfrm_state *x = xfrm_input_state(skb);
+	struct xfrm_offload_state *xo = xfrm_offload_input(skb);
 	struct crypto_aead *aead = x->data;
 	int alen = crypto_aead_authsize(aead);
 	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
@@ -506,7 +522,8 @@ static int esp_input_done2(struct sk_buff *skb, int err)
 	u8 nexthdr[2];
 	int padlen;
 
-	kfree(ESP_SKB_CB(skb)->tmp);
+	if (!(xo->flags & CRYPTO_DONE))
+		kfree(ESP_SKB_CB(skb)->tmp);
 
 	if (unlikely(err))
 		goto out;
@@ -616,6 +633,17 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err)
 	esp_input_done(base, err);
 }
 
+static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct crypto_aead *aead = x->data;
+
+	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
+		return -EINVAL;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return esp_input_done2(skb, 0);
+}
 /*
  * Note: detecting truncated vs. non-truncated authentication data is very
  * expensive, so we only support truncated data, which is the recommended
@@ -964,6 +992,7 @@ static const struct xfrm_type esp_type =
 	.destructor	= esp_destroy,
 	.get_mtu	= esp4_get_mtu,
 	.input		= esp_input,
+	.input_tail	= esp_input_tail,
 	.output		= esp_output,
 	.encap		= esp4_gso_encap,
 };
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 7ee6518..94b8702 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -29,7 +29,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb)
 		goto out;
 
 	mtu = dst_mtu(skb_dst(skb));
-	if (skb->len > mtu) {
+	if ((!skb_is_gso(skb) && skb->len > mtu) ||
+	    (skb_is_gso(skb) && skb_gso_network_seglen(skb) > ip_skb_dst_mtu(skb->sk, skb))) {
 		skb->protocol = htons(ETH_P_IP);
 
 		if (skb->sk)
diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 9bcb32b..49ed382 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -341,6 +341,14 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb)
 			esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 			esph->spi = x->id.spi;
 
+			if (x->xso.offload_handle && skb->sp) {
+				if (skb->sp->flags & SKB_GSO_SEGMENT)
+					esph->seq_no = htonl(skb->sp->seq.low);
+
+				spin_unlock_bh(&x->lock);
+				return 0;
+			}
+
 			tmp = esp_alloc_tmp(aead, nfrags + 2, seqhilen);
 			if (!tmp) {
 				spin_unlock_bh(&x->lock);
@@ -416,6 +424,13 @@ skip_cow:
 	esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low);
 	esph->spi = x->id.spi;
 
+	if (x->xso.offload_handle && skb->sp) {
+		if (skb->sp->flags & SKB_GSO_SEGMENT)
+			esph->seq_no = htonl(skb->sp->seq.low);
+
+		return 0;
+	}
+
 	tmp = esp_alloc_tmp(aead, nfrags, seqhilen);
 	if (!tmp) {
 		err = -ENOMEM;
@@ -478,6 +493,7 @@ error:
 static int esp_input_done2(struct sk_buff *skb, int err)
 {
 	struct xfrm_state *x = xfrm_input_state(skb);
+	struct xfrm_offload_state *xo = xfrm_offload_input(skb);
 	struct crypto_aead *aead = x->data;
 	int alen = crypto_aead_authsize(aead);
 	int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead);
@@ -486,7 +502,8 @@ static int esp_input_done2(struct sk_buff *skb, int err)
 	int padlen;
 	u8 nexthdr[2];
 
-	kfree(ESP_SKB_CB(skb)->tmp);
+	if (!(xo->flags & CRYPTO_DONE))
+		kfree(ESP_SKB_CB(skb)->tmp);
 
 	if (unlikely(err))
 		goto out;
@@ -559,6 +576,18 @@ static void esp_input_done_esn(struct crypto_async_request *base, int err)
 	esp_input_done(base, err);
 }
 
+static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb)
+{
+	struct crypto_aead *aead = x->data;
+
+	if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead)))
+		return -EINVAL;
+
+	skb->ip_summed = CHECKSUM_NONE;
+
+	return esp_input_done2(skb, 0);
+}
+
 static int esp6_input(struct xfrm_state *x, struct sk_buff *skb)
 {
 	struct ip_esp_hdr *esph;
@@ -892,6 +921,7 @@ static const struct xfrm_type esp6_type = {
 	.destructor	= esp6_destroy,
 	.get_mtu	= esp6_get_mtu,
 	.input		= esp6_input,
+	.input_tail	= esp6_input_tail,
 	.output		= esp6_output,
 	.encap		= esp6_gso_encap,
 	.hdr_offset	= xfrm6_find_1stfragopt,
diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c
index 34a260a..7add72f 100644
--- a/net/xfrm/xfrm_device.c
+++ b/net/xfrm/xfrm_device.c
@@ -22,13 +22,62 @@
 #include <net/xfrm.h>
 #include <linux/notifier.h>
 
+static int xfrm_dev_register(struct net_device *dev)
+{
+	if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops)
+		return NOTIFY_BAD;
+	if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) &&
+	    !(dev->features & NETIF_F_HW_ESP))
+		return NOTIFY_BAD;
+
+	return NOTIFY_DONE;
+}
+
+static int xfrm_dev_unregister(struct net_device *dev)
+{
+	return NOTIFY_DONE;
+}
+
+static int xfrm_dev_feat_change(struct net_device *dev)
+{
+	if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops)
+		return NOTIFY_BAD;
+	else if (!(dev->features & NETIF_F_HW_ESP))
+		dev->xfrmdev_ops = NULL;
+
+	if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) &&
+	    !(dev->features & NETIF_F_HW_ESP))
+		return NOTIFY_BAD;
+
+	return NOTIFY_DONE;
+}
+
+static int xfrm_dev_down(struct net_device *dev)
+{
+	if (dev->hw_features & NETIF_F_HW_ESP)
+		xfrm_dev_state_flush(dev_net(dev), dev, true);
+
+	xfrm_garbage_collect(dev_net(dev));
+
+	return NOTIFY_DONE;
+}
+
 static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
 
 	switch (event) {
+	case NETDEV_REGISTER:
+		return xfrm_dev_register(dev);
+
+	case NETDEV_UNREGISTER:
+		return xfrm_dev_unregister(dev);
+
+	case NETDEV_FEAT_CHANGE:
+		return xfrm_dev_feat_change(dev);
+
 	case NETDEV_DOWN:
-		xfrm_garbage_collect(dev_net(dev));
+		return xfrm_dev_down(dev);
 	}
 	return NOTIFY_DONE;
 }
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index b1c2d77..114f7f9 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -111,6 +111,8 @@ struct sec_path *secpath_dup(struct sec_path *src)
 		return NULL;
 
 	sp->len = 0;
+	sp->flags = 0;
+
 	if (src) {
 		int i;
 
@@ -192,6 +194,8 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 	unsigned int family;
 	int decaps = 0;
 	int async = 0;
+	bool crypto_done = false;
+	struct xfrm_offload_state *xo = xfrm_offload_input(skb);
 
 	if (encap_type < 0) {
 		/* An encap_type of -1 indicates async resumption. */
@@ -206,6 +210,37 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 		encap_type = 0;
 	}
 
+	if (xo && (xo->flags & CRYPTO_DONE)) {
+		crypto_done = true;
+		x = xfrm_input_state(skb);
+		family = XFRM_SPI_SKB_CB(skb)->family;
+
+		if (!(xo->status & CRYPTO_SUCCESS)) {
+			if (xo->status &
+			    (CRYPTO_TRANSPORT_AH_AUTH_FAILED |
+			     CRYPTO_TRANSPORT_ESP_AUTH_FAILED |
+			     CRYPTO_TUNNEL_AH_AUTH_FAILED |
+			     CRYPTO_TUNNEL_ESP_AUTH_FAILED)) {
+
+				xfrm_audit_state_icvfail(x, skb,
+							 x->type->proto);
+				x->stats.integrity_failed++;
+				XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR);
+				goto drop;
+			}
+
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR);
+			goto drop;
+		}
+
+		if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR);
+			goto drop;
+		}
+
+		goto lock;
+	}
+
 	daddr = (xfrm_address_t *)(skb_network_header(skb) +
 				   XFRM_SPI_SKB_CB(skb)->daddroff);
 	family = XFRM_SPI_SKB_CB(skb)->family;
@@ -257,6 +292,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 
 		skb->sp->xvec[skb->sp->len++] = x;
 
+lock:
 		spin_lock(&x->lock);
 
 		if (unlikely(x->km.state != XFRM_STATE_VALID)) {
@@ -298,7 +334,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 		skb_dst_force(skb);
 		dev_hold(skb->dev);
 
-		nexthdr = x->type->input(x, skb);
+		if (crypto_done)
+			nexthdr = x->type->input_tail(x, skb);
+		else
+			nexthdr = x->type->input(x, skb);
 
 		if (nexthdr == -EINPROGRESS)
 			return 0;
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index 637387b..fdd5fa1 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -102,9 +102,13 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
 		/* Inner headers are invalid now. */
 		skb->encapsulation = 0;
 
-		err = x->type->output(x, skb);
-		if (err == -EINPROGRESS)
-			goto out;
+		if (skb_shinfo(skb)->gso_type & SKB_GSO_ESP) {
+			x->type->encap(x, skb);
+		} else {
+			err = x->type->output(x, skb);
+			if (err == -EINPROGRESS)
+				goto out;
+		}
 
 resume:
 		if (err) {
@@ -197,11 +201,61 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb
 	return 0;
 }
 
+static bool xfrm_offload_ok(struct sk_buff *skb, struct xfrm_state *x)
+{
+	int mtu;
+	struct dst_entry *dst = skb_dst(skb);
+	struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
+
+	if (x->xso.offload_handle && (x->xso.dev == dst->path->dev) &&
+	    !dst->child->xfrm && x->type->get_mtu) {
+		mtu = x->type->get_mtu(x, xdst->child_mtu_cached);
+
+		if (skb->len <= mtu)
+			goto ok;
+
+		if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
+			goto ok;
+	}
+	return false;
+
+ok:
+	return x->xso.dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x);
+}
+
 int xfrm_output(struct sock *sk, struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
 	int err;
 
+	secpath_reset(skb);
+
+	if (xfrm_offload_ok(skb, x)) {
+		struct sec_path *sp;
+
+		sp = secpath_dup(skb->sp);
+		if (!sp) {
+			XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR);
+			err = -ENOMEM;
+			return err;
+		}
+		if (skb->sp)
+			secpath_put(skb->sp);
+		skb->sp = sp;
+
+		skb->sp->xvec[skb->sp->len++] = x;
+		xfrm_state_hold(x);
+
+		if (skb_is_gso(skb)) {
+			skb_shinfo(skb)->gso_type |= SKB_GSO_ESP;
+
+			return xfrm_output2(net, sk, skb);
+		}
+		if (x->xso.dev->features & NETIF_F_HW_ESP_TX_CSUM)
+			goto out;
+	}
+
 	if (skb_is_gso(skb))
 		return xfrm_output_gso(net, sk, skb);
 
@@ -214,6 +268,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
 		}
 	}
 
+out:
 	return xfrm_output2(net, sk, skb);
 }
 EXPORT_SYMBOL_GPL(xfrm_output);
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index dfa0d86..88e045f 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -121,11 +121,11 @@ static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo)
 	rcu_read_unlock();
 }
 
-static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
-						  int tos, int oif,
-						  const xfrm_address_t *saddr,
-						  const xfrm_address_t *daddr,
-						  int family)
+struct dst_entry *__xfrm_dst_lookup(struct net *net,
+				    int tos, int oif,
+				    const xfrm_address_t *saddr,
+				    const xfrm_address_t *daddr,
+				    int family)
 {
 	struct xfrm_policy_afinfo *afinfo;
 	struct dst_entry *dst;
@@ -140,6 +140,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net,
 
 	return dst;
 }
+EXPORT_SYMBOL(__xfrm_dst_lookup);
 
 static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x,
 						int tos, int oif,
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index ba8bf51..486fc67 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -341,6 +341,19 @@ retry:
 	return mode;
 }
 
+static void xfrm_state_free_offload(struct xfrm_state *x)
+{
+	struct xfrm_state_offload *xso = &x->xso;
+
+	if (xso->dev) {
+		struct net_device *dev = xso->dev;
+
+		dev->xfrmdev_ops->xdo_dev_state_free(x);
+		xso->dev = NULL;
+		dev_put(dev);
+	}
+}
+
 static void xfrm_put_mode(struct xfrm_mode *mode)
 {
 	module_put(mode->owner);
@@ -367,6 +380,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
 		x->type->destructor(x);
 		xfrm_put_type(x->type);
 	}
+	xfrm_state_free_offload(x);
 	security_xfrm_state_free(x);
 	kfree(x);
 }
@@ -531,6 +545,7 @@ EXPORT_SYMBOL(__xfrm_state_destroy);
 int __xfrm_state_delete(struct xfrm_state *x)
 {
 	struct net *net = xs_net(x);
+	struct xfrm_state_offload *xso = &x->xso;
 	int err = -ESRCH;
 
 	if (x->km.state != XFRM_STATE_DEAD) {
@@ -544,6 +559,9 @@ int __xfrm_state_delete(struct xfrm_state *x)
 		net->xfrm.state_num--;
 		spin_unlock(&net->xfrm.xfrm_state_lock);
 
+		if (xso->dev)
+			xso->dev->xfrmdev_ops->xdo_dev_state_delete(x);
+
 		/* All xfrm_state objects are created by xfrm_state_alloc.
 		 * The xfrm_state_alloc call gives a reference, and that
 		 * is what we are dropping here.
@@ -588,12 +606,41 @@ xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid)
 
 	return err;
 }
+
+static inline int
+xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid)
+{
+	int i, err = 0;
+
+	for (i = 0; i <= net->xfrm.state_hmask; i++) {
+		struct xfrm_state *x;
+		struct xfrm_state_offload *xso;
+
+		hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) {
+			xso = &x->xso;
+
+			if (xso->dev == dev &&
+			   (err = security_xfrm_state_delete(x)) != 0) {
+				xfrm_audit_state_delete(x, 0, task_valid);
+				return err;
+			}
+		}
+	}
+
+	return err;
+}
 #else
 static inline int
 xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid)
 {
 	return 0;
 }
+
+static inline int
+xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid)
+{
+	return 0;
+}
 #endif
 
 int xfrm_state_flush(struct net *net, u8 proto, bool task_valid)
@@ -636,6 +683,48 @@ out:
 }
 EXPORT_SYMBOL(xfrm_state_flush);
 
+int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid)
+{
+	int i, err = 0, cnt = 0;
+
+	spin_lock_bh(&net->xfrm.xfrm_state_lock);
+	err = xfrm_dev_state_flush_secctx_check(net, dev, task_valid);
+	if (err)
+		goto out;
+
+	err = -ESRCH;
+	for (i = 0; i <= net->xfrm.state_hmask; i++) {
+		struct xfrm_state *x;
+		struct xfrm_state_offload *xso;
+restart:
+		hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) {
+			xso = &x->xso;
+
+			if (!xfrm_state_kern(x) && xso->dev == dev) {
+				xfrm_state_hold(x);
+				spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+
+				err = xfrm_state_delete(x);
+				xfrm_audit_state_delete(x, err ? 0 : 1,
+							task_valid);
+				xfrm_state_put(x);
+				if (!err)
+					cnt++;
+
+				spin_lock_bh(&net->xfrm.xfrm_state_lock);
+				goto restart;
+			}
+		}
+	}
+	if (cnt)
+		err = 0;
+
+out:
+	spin_unlock_bh(&net->xfrm.xfrm_state_lock);
+	return err;
+}
+EXPORT_SYMBOL(xfrm_dev_state_flush);
+
 void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si)
 {
 	spin_lock_bh(&net->xfrm.xfrm_state_lock);
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index cb65d91..edbdb1b 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -400,6 +400,59 @@ static int attach_aead(struct xfrm_state *x, struct nlattr *rta)
 	return 0;
 }
 
+static int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct nlattr *rta)
+{
+	int err;
+	struct dst_entry *dst;
+	struct net_device *dev;
+	struct xfrm_user_offload *xuo;
+	struct xfrm_state_offload *xso = &x->xso;
+	xfrm_address_t *saddr;
+	xfrm_address_t *daddr;
+
+	if (!rta)
+		return 0;
+
+	xuo = nla_data(rta);
+
+	dev = dev_get_by_index(net, xuo->ifindex);
+	if (!dev) {
+		if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) {
+			saddr = &x->props.saddr;
+			daddr = &x->id.daddr;
+		} else {
+			saddr = &x->id.daddr;
+			daddr = &x->props.saddr;
+		}
+
+		dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, x->props.family);
+		if (IS_ERR(dst))
+			return 0;
+
+		dev = dst->dev;
+
+		dev_hold(dev);
+		dst_release(dst);
+	}
+
+	if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) {
+		dev_put(dev);
+		return 0;
+	}
+
+	xso->dev = dev;
+	xso->num_exthdrs = 1;
+	xso->flags = xuo->flags;
+
+	err = dev->xfrmdev_ops->xdo_dev_state_add(x);
+	if (err) {
+		dev_put(dev);
+		return err;
+	}
+
+	return 0;
+}
+
 static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_esn,
 					 struct nlattr *rp)
 {
@@ -585,6 +638,10 @@ static struct xfrm_state *xfrm_state_construct(struct net *net,
 	    security_xfrm_state_alloc(x, nla_data(attrs[XFRMA_SEC_CTX])))
 		goto error;
 
+	if (attrs[XFRMA_OFFLOAD_DEV] &&
+	    xfrm_dev_state_add(net, x, attrs[XFRMA_OFFLOAD_DEV]))
+		goto error;
+
 	if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn,
 					       attrs[XFRMA_REPLAY_ESN_VAL])))
 		goto error;
@@ -769,6 +826,23 @@ static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb)
 	return 0;
 }
 
+static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb)
+{
+	struct xfrm_user_offload *xuo;
+	struct nlattr *attr;
+
+	attr = nla_reserve(skb, XFRMA_OFFLOAD_DEV, sizeof(*xuo));
+	if (attr == NULL)
+		return -EMSGSIZE;
+
+	xuo = nla_data(attr);
+
+	xuo->ifindex = xso->dev->ifindex;
+	xuo->flags = xso->flags;
+
+	return 0;
+}
+
 static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb)
 {
 	struct xfrm_algo *algo;
@@ -859,6 +933,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x,
 			      &x->replay);
 	if (ret)
 		goto out;
+	if (x->xso.dev)
+		ret = copy_user_offload(&x->xso, skb);
+	if (ret)
+		goto out;
 	if (x->security)
 		ret = copy_sec_ctx(x->security, skb);
 out:
@@ -2396,6 +2474,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = {
 	[XFRMA_SA_EXTRA_FLAGS]	= { .type = NLA_U32 },
 	[XFRMA_PROTO]		= { .type = NLA_U8 },
 	[XFRMA_ADDRESS_FILTER]	= { .len = sizeof(struct xfrm_address_filter) },
+	[XFRMA_OFFLOAD_DEV]	= { .len = sizeof(struct xfrm_user_offload) },
 };
 
 static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = {
@@ -2612,6 +2691,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
 		l += nla_total_size(sizeof(*x->coaddr));
 	if (x->props.extra_flags)
 		l += nla_total_size(sizeof(x->props.extra_flags));
+	if (x->xso.dev)
+		l += nla_total_size(sizeof(x->xso));
 
 	/* Must count x->lastused as it may become non-zero behind our back. */
 	l += nla_total_size_64bit(sizeof(u64));
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH RFC 10/11] xfrm: Add xfrm_replay_overflow functions for offloading
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
                   ` (8 preceding siblings ...)
  2016-09-23  7:53 ` [PATCH RFC 09/11] xfrm: Add an IPsec hardware offloading API Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  2016-09-23  7:53 ` [PATCH RFC 11/11] xfrm: Add encapsulation header offsets while SKB is not encrypted Steffen Klassert
  10 siblings, 0 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Sowmini Varadhan, Ilan Tayari, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

This patch adds functions that handle IPsec sequence
numbers for GSO segments and TSO offloading. We need
to calculate and update the sequence numbers based
on the segments that GSO/TSO will generate. We need
this to keep the software and hardware sequence number
counters in sync.
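
To illustrate the accounting (all numbers made up): with
x->replay.oseq == 100 and a GSO skb that will be split into three
segments (skb_shinfo(skb)->gso_segs == 3), the overflow handlers
added here effectively do

	/* illustration only */
	XFRM_SKB_CB(skb)->seq.output.low = 101;	/* seq of first segment */
	skb->sp->seq.low = 101;			/* handed to the gso handlers */
	x->replay.oseq = 103;			/* counter after all segments */

so the segments carry ESP sequence numbers 101, 102 and 103, and the
software counter ends up where the hardware counter will be.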

Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/xfrm/xfrm_replay.c | 103 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 101 insertions(+), 2 deletions(-)

diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index 20e68a3..401d4a9 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -559,6 +559,83 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq)
 		x->repl->notify(x, XFRM_REPLAY_UPDATE);
 }
 
+static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err = 0;
+	struct net *net = xs_net(x);
+	struct sec_path *sp = skb->sp;
+	__u32 oseq = x->replay.oseq;
+
+	if (!sp)
+		return xfrm_replay_overflow(x, skb);
+
+	if (x->type->flags & XFRM_TYPE_REPLAY_PROT) {
+		if (!skb_is_gso(skb)) {
+			XFRM_SKB_CB(skb)->seq.output.low = ++oseq;
+			sp->seq.low = oseq;
+		} else {
+			XFRM_SKB_CB(skb)->seq.output.low = oseq + 1;
+			sp->seq.low = oseq + 1;
+			oseq += skb_shinfo(skb)->gso_segs;
+		}
+
+		XFRM_SKB_CB(skb)->seq.output.hi = 0;
+		sp->seq.hi = 0;
+		if (unlikely(oseq < x->replay.oseq)) {
+			xfrm_audit_state_replay_overflow(x, skb);
+			err = -EOVERFLOW;
+
+			return err;
+		}
+
+		x->replay.oseq = oseq;
+
+		if (xfrm_aevent_is_on(net))
+			x->repl->notify(x, XFRM_REPLAY_UPDATE);
+	}
+
+	return err;
+}
+
+static int xfrm_replay_overflow_offload_bmp(struct xfrm_state *x, struct sk_buff *skb)
+{
+	int err = 0;
+	struct sec_path *sp = skb->sp;
+	struct xfrm_replay_state_esn *replay_esn = x->replay_esn;
+	struct net *net = xs_net(x);
+	__u32 oseq = replay_esn->oseq;
+
+	if (!sp)
+		return xfrm_replay_overflow_bmp(x, skb);
+
+	if (x->type->flags & XFRM_TYPE_REPLAY_PROT) {
+		if (!skb_is_gso(skb)) {
+			XFRM_SKB_CB(skb)->seq.output.low = ++oseq;
+			sp->seq.low = oseq;
+		} else {
+			XFRM_SKB_CB(skb)->seq.output.low = oseq + 1;
+			sp->seq.low = oseq + 1;
+			oseq += skb_shinfo(skb)->gso_segs;
+		}
+
+		XFRM_SKB_CB(skb)->seq.output.hi = 0;
+		sp->seq.hi = 0;
+		if (unlikely(oseq < replay_esn->oseq)) {
+			xfrm_audit_state_replay_overflow(x, skb);
+			err = -EOVERFLOW;
+
+			return err;
+		} else {
+			replay_esn->oseq = oseq;
+		}
+
+		if (xfrm_aevent_is_on(net))
+			x->repl->notify(x, XFRM_REPLAY_UPDATE);
+	}
+
+	return err;
+}
+
 static const struct xfrm_replay xfrm_replay_legacy = {
 	.advance	= xfrm_replay_advance,
 	.check		= xfrm_replay_check,
@@ -567,6 +644,14 @@ static const struct xfrm_replay xfrm_replay_legacy = {
 	.overflow	= xfrm_replay_overflow,
 };
 
+static const struct xfrm_replay xfrm_replay_offload_legacy = {
+	.advance	= xfrm_replay_advance,
+	.check		= xfrm_replay_check,
+	.recheck	= xfrm_replay_check,
+	.notify		= xfrm_replay_notify,
+	.overflow	= xfrm_replay_overflow_offload,
+};
+
 static const struct xfrm_replay xfrm_replay_bmp = {
 	.advance	= xfrm_replay_advance_bmp,
 	.check		= xfrm_replay_check_bmp,
@@ -575,6 +660,14 @@ static const struct xfrm_replay xfrm_replay_bmp = {
 	.overflow	= xfrm_replay_overflow_bmp,
 };
 
+static const struct xfrm_replay xfrm_replay_offload = {
+	.advance	= xfrm_replay_advance_bmp,
+	.check		= xfrm_replay_check_bmp,
+	.recheck	= xfrm_replay_check_bmp,
+	.notify		= xfrm_replay_notify_bmp,
+	.overflow	= xfrm_replay_overflow_offload_bmp,
+};
+
 static const struct xfrm_replay xfrm_replay_esn = {
 	.advance	= xfrm_replay_advance_esn,
 	.check		= xfrm_replay_check_esn,
@@ -596,10 +689,16 @@ int xfrm_init_replay(struct xfrm_state *x)
 			if (replay_esn->replay_window == 0)
 				return -EINVAL;
 			x->repl = &xfrm_replay_esn;
-		} else
+		} else if (x->xso.offload_handle) {
+			x->repl = &xfrm_replay_offload;
+		} else {
 			x->repl = &xfrm_replay_bmp;
-	} else
+		}
+	} else if (x->xso.offload_handle) {
+		x->repl = &xfrm_replay_offload_legacy;
+	} else {
 		x->repl = &xfrm_replay_legacy;
+	}
 
 	return 0;
 }
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* [PATCH RFC 11/11] xfrm: Add encapsulation header offsets while SKB is not encrypted
  2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
                   ` (9 preceding siblings ...)
  2016-09-23  7:53 ` [PATCH RFC 10/11] xfrm: Add xfrm_replay_overflow functions for offloading Steffen Klassert
@ 2016-09-23  7:53 ` Steffen Klassert
  10 siblings, 0 replies; 14+ messages in thread
From: Steffen Klassert @ 2016-09-23  7:53 UTC (permalink / raw)
  To: netdev
  Cc: Steffen Klassert, Ilan Tayari, Sowmini Varadhan, Boris Pismenny,
	Ariel Levkovich, Hay, Joshua A

From: Ilan Tayari <ilant@mellanox.com>

Both esp4 and esp6 used to assume that the SKB payload is encrypted,
and that the inner_network and inner_transport offsets are therefore
not relevant.
When doing crypto offload in the NIC, this is no longer the case
and the NIC driver needs these offsets so it can do TX TCP checksum
offloading.
This patch sets the inner_network and inner_transport members of
the SKB, as well as encapsulation, to reflect the actual positions
of these headers, and clears them only once encryption is actually
done on the payload.
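
As a rough sketch of what this enables on the driver side (the
function name is hypothetical and the inner protocol is assumed to
be TCP; the accessors are the existing inner-header helpers):

	static void mydrv_setup_tx_csum(struct sk_buff *skb)
	{
		if (skb->encapsulation) {
			/* The payload is still cleartext at this
			 * point, so the inner TCP header can be
			 * located and handed to the checksum engine.
			 */
			struct tcphdr *th = inner_tcp_hdr(skb);
			int thoff = skb_inner_transport_offset(skb);

			/* program the hardware with th and thoff ... */
		}
	}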

Signed-off-by: Ilan Tayari <ilant@mellanox.com>
Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
---
 net/ipv4/xfrm4_mode_transport.c | 2 ++
 net/ipv4/xfrm4_mode_tunnel.c    | 3 +++
 net/ipv6/xfrm6_mode_transport.c | 1 +
 net/ipv6/xfrm6_mode_tunnel.c    | 3 +++
 net/xfrm/xfrm_output.c          | 6 +++---
 5 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c
index 7b447f3..2cb2a50 100644
--- a/net/ipv4/xfrm4_mode_transport.c
+++ b/net/ipv4/xfrm4_mode_transport.c
@@ -23,6 +23,8 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct iphdr *iph = ip_hdr(skb);
 	int ihl = iph->ihl * 4;
 
+	skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+
 	skb_set_network_header(skb, -x->props.header_len);
 	skb->mac_header = skb->network_header +
 			  offsetof(struct iphdr, protocol);
diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c
index 35feda6..b55ad81 100644
--- a/net/ipv4/xfrm4_mode_tunnel.c
+++ b/net/ipv4/xfrm4_mode_tunnel.c
@@ -33,6 +33,9 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct iphdr *top_iph;
 	int flags;
 
+	skb_set_inner_network_header(skb, skb_network_offset(skb));
+	skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+
 	skb_set_network_header(skb, -x->props.header_len);
 	skb->mac_header = skb->network_header +
 			  offsetof(struct iphdr, protocol);
diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c
index 4e34410..66cc00c 100644
--- a/net/ipv6/xfrm6_mode_transport.c
+++ b/net/ipv6/xfrm6_mode_transport.c
@@ -26,6 +26,7 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb)
 	int hdr_len;
 
 	iph = ipv6_hdr(skb);
+	skb_set_inner_transport_header(skb, skb_transport_offset(skb));
 
 	hdr_len = x->type->hdr_offset(x, skb, &prevhdr);
 	skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data);
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
index 372855e..b8af4bb 100644
--- a/net/ipv6/xfrm6_mode_tunnel.c
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -36,6 +36,9 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb)
 	struct ipv6hdr *top_iph;
 	int dsfield;
 
+	skb_set_inner_network_header(skb, skb_network_offset(skb));
+	skb_set_inner_transport_header(skb, skb_transport_offset(skb));
+
 	skb_set_network_header(skb, -x->props.header_len);
 	skb->mac_header = skb->network_header +
 			  offsetof(struct ipv6hdr, nexthdr);
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index fdd5fa1..aa0dd7a 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -99,9 +99,6 @@ static int xfrm_output_one(struct sk_buff *skb, int err)
 
 		skb_dst_force(skb);
 
-		/* Inner headers are invalid now. */
-		skb->encapsulation = 0;
-
 		if (skb_shinfo(skb)->gso_type & SKB_GSO_ESP) {
 			x->type->encap(x, skb);
 		} else {
@@ -243,6 +240,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
 		if (skb->sp)
 			secpath_put(skb->sp);
 		skb->sp = sp;
+		skb->encapsulation = 1;
 
 		skb->sp->xvec[skb->sp->len++] = x;
 		xfrm_state_hold(x);
@@ -254,6 +252,8 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb)
 		}
 		if (x->xso.dev->features & NETIF_F_HW_ESP_TX_CSUM)
 			goto out;
+	} else {
+		skb->encapsulation = 0;
 	}
 
 	if (skb_is_gso(skb))
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* RE: [PATCH RFC 05/11] skbuff: Extend gso_type to unsigned int.
  2016-09-23  7:53 ` [PATCH RFC 05/11] skbuff: Extend gso_type to unsigned int Steffen Klassert
@ 2016-09-23 13:19   ` David Laight
  2016-09-23 15:15     ` Alexander Duyck
  0 siblings, 1 reply; 14+ messages in thread
From: David Laight @ 2016-09-23 13:19 UTC (permalink / raw)
  To: 'Steffen Klassert', netdev
  Cc: Sowmini Varadhan, Ilan Tayari, Boris Pismenny, Ariel Levkovich,
	Hay, Joshua A

From: Steffen Klassert
> Sent: 23 September 2016 08:54
> All available gso_type flags are currently in use,
> so extend gso_type to be able to add further flags.
> 
> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
> ---
>  include/linux/skbuff.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index f21da42..c1fd854 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -417,7 +417,7 @@ struct skb_shared_info {
>  	unsigned short	gso_size;
>  	/* Warning: this field is not always filled in (UFO)! */
>  	unsigned short	gso_segs;
> -	unsigned short  gso_type;
> +	unsigned int	gso_type;
>  	struct sk_buff	*frag_list;
>  	struct skb_shared_hwtstamps hwtstamps;
>  	u32		tskey;

That adds a lot of padding.
I'm not even sure DM will like this structure being extended.
If ktime_t is 64 bit, I think there is already some padding later on.

	David

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH RFC 05/11] skbuff: Extend gso_type to unsigned int.
  2016-09-23 13:19   ` David Laight
@ 2016-09-23 15:15     ` Alexander Duyck
  0 siblings, 0 replies; 14+ messages in thread
From: Alexander Duyck @ 2016-09-23 15:15 UTC (permalink / raw)
  To: David Laight
  Cc: Steffen Klassert, netdev, Sowmini Varadhan, Ilan Tayari,
	Boris Pismenny, Ariel Levkovich, Hay, Joshua A

On Fri, Sep 23, 2016 at 6:19 AM, David Laight <David.Laight@aculab.com> wrote:
> From: Steffen Klassert
>> Sent: 23 September 2016 08:54
>> All available gso_type flags are currently in use,
>> so extend gso_type to be able to add further flags.
>>
>> Signed-off-by: Steffen Klassert <steffen.klassert@secunet.com>
>> ---
>>  include/linux/skbuff.h | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>> index f21da42..c1fd854 100644
>> --- a/include/linux/skbuff.h
>> +++ b/include/linux/skbuff.h
>> @@ -417,7 +417,7 @@ struct skb_shared_info {
>>       unsigned short  gso_size;
>>       /* Warning: this field is not always filled in (UFO)! */
>>       unsigned short  gso_segs;
>> -     unsigned short  gso_type;
>> +     unsigned int    gso_type;
>>       struct sk_buff  *frag_list;
>>       struct skb_shared_hwtstamps hwtstamps;
>>       u32             tskey;
>
> That add a lot of padding.
> I'm not even sure DM will like this structure being extended.
> If ktime_t is 64 bit I think there is already some padding later on.

+1.  Just increasing the size is a bad idea as it will add a 6-byte
hole and push frag 0 outside the first cache line.

You may want to take a look at rearranging the structure a bit.  That
way you could eat into the 4-byte hole that is available on x86_64, so
that instead of shifting everything up by 8 bytes the fields can mostly
stay put with only a bit of rearranging.  This is one of those times a
tool like pahole comes in handy.

Also, one other thing you could look at: if you were to move gso_type
into the slot after dataref, we might be able to make use of the hole
while also saving some initialization time, since the change in size
would push us from 32 to 36 bytes and likely cost the memset operation
at least one extra cycle.  If we could arrange things so that we don't
have to initialize the memory for gso_type unless we are planning to
set gso_size, we can avoid that extra overhead.  It would just be a
matter of validating that nothing reads gso_type unless gso_size is
set, and that gso_type is always written prior to setting gso_size to
a non-zero value.
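
A rough sketch of the rearrangement being suggested (illustrative
only, other fields elided):

	struct skb_shared_info {
		unsigned char	nr_frags;
		__u8		tx_flags;
		unsigned short	gso_size;
		/* Warning: this field is not always filled in (UFO)! */
		unsigned short	gso_segs;
		struct sk_buff	*frag_list;
		struct skb_shared_hwtstamps hwtstamps;
		u32		tskey;
		__be32		ip6_frag_id;
		atomic_t	dataref;
		unsigned int	gso_type;	/* moved next to dataref
						 * to reuse the hole */
		...
	};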

- Alex

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread

Thread overview: 14+ messages
2016-09-23  7:53 [PATCH RFC] IPsec performance optimizations Steffen Klassert
2016-09-23  7:53 ` [PATCH RFC 01/11] esp4: Avoid skb_cow_data whenever possible Steffen Klassert
2016-09-23  7:53 ` [PATCH RFC 02/11] esp6: " Steffen Klassert
2016-09-23  7:53 ` [PATCH RFC 03/11] net: Prepare for IPsec GRO Steffen Klassert
2016-09-23  7:53 ` [PATCH RFC 04/11] esp: Add a software GRO codepath Steffen Klassert
2016-09-23  7:53 ` [PATCH RFC 05/11] skbuff: Extend gso_type to unsigned int Steffen Klassert
2016-09-23 13:19   ` David Laight
2016-09-23 15:15     ` Alexander Duyck
2016-09-23  7:53 ` [PATCH RFC 06/11] net: Add ESP offload features Steffen Klassert
2016-09-23  7:53 ` [PATCH RFC 07/11] esp: Add gso handlers for esp4 and esp6 Steffen Klassert
2016-09-23  7:53 ` [PATCH RFC 08/11] xfrm: Move device notifications to a separate file Steffen Klassert
2016-09-23  7:53 ` [PATCH RFC 09/11] xfrm: Add an IPsec hardware offloading API Steffen Klassert
2016-09-23  7:53 ` [PATCH RFC 10/11] xfrm: Add xfrm_replay_overflow functions for offloading Steffen Klassert
2016-09-23  7:53 ` [PATCH RFC 11/11] xfrm: Add encapsulation header offsets while SKB is not encrypted Steffen Klassert
