All of lore.kernel.org
 help / color / mirror / Atom feed
From: Florian Westphal <fw@strlen.de>
To: <netdev@vger.kernel.org>
Cc: Florian Westphal <fw@strlen.de>
Subject: [RFC PATCH 2/3] sk_buff: add skb extension infrastructure
Date: Mon, 26 Nov 2018 12:38:56 +0100	[thread overview]
Message-ID: <20181126113857.29270-3-fw@strlen.de> (raw)
In-Reply-To: <20181126113857.29270-1-fw@strlen.de>

Add an extension infrastructure for sk_buff:
1. extension memory is released when the sk_buff is freed.
2. extension data is shared (refcounted) after cloning an skb.

This is also how xfrm and bridge netfilter skb-associated data
(skb->sp and skb->nf_bridge) are handled.

Two new members are added to sk_buff:
1. 'active_extensions' byte (filling a hole), telling which extensions
   have been allocated for the skb.
2. extension pointer, located at the end of the sk_buff.
   If active_extensions is 0, its content is undefined.

The 'nf_bridge' pointer is removed in a followup patch, so the sk_buff
size remains the same.

This adds extra code to skb clone and free paths (to deal with
refcount/free of extension area) but replaces the existing code that
deals with skb->nf_bridge.

This patch only adds the basic infrastructure, the nf_bridge conversion
is done in the next patch.

Conversion of skb->sp (ipsec/xfrm secpath) to an skb extension is planned
as a followup.

Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/skbuff.h | 124 +++++++++++++++++++++++++++++++++++++-
 net/Kconfig            |   3 +
 net/core/skbuff.c      | 131 +++++++++++++++++++++++++++++++++++++++++
 net/ipv4/ip_output.c   |   1 +
 net/ipv6/ip6_output.c  |   1 +
 5 files changed, 259 insertions(+), 1 deletion(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 73902acf2b71..832904d71a85 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -245,6 +245,7 @@ struct iov_iter;
 struct napi_struct;
 struct bpf_prog;
 union bpf_attr;
+struct skb_ext;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -633,6 +634,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@queue_mapping: Queue mapping for multiqueue devices
  *	@xmit_more: More SKBs are pending for this queue
  *	@pfmemalloc: skbuff was allocated from PFMEMALLOC reserves
+ *	@active_extensions: active extensions (skb_ext_id types)
  *	@ndisc_nodetype: router type (from link layer)
  *	@ooo_okay: allow the mapping of a socket to a queue to be changed
  *	@l4_hash: indicate hash is a canonical 4-tuple hash over transport
@@ -662,6 +664,7 @@ typedef unsigned char *sk_buff_data_t;
  *	@data: Data head pointer
  *	@truesize: Buffer size
  *	@users: User count - see {datagram,tcp}.c
+ *	@extensions: allocated extensions, valid if active_extensions is nonzero
  */
 
 struct sk_buff {
@@ -744,7 +747,9 @@ struct sk_buff {
 				head_frag:1,
 				xmit_more:1,
 				pfmemalloc:1;
-
+#ifdef CONFIG_SKB_EXTENSIONS
+	__u8			active_extensions;
+#endif
 	/* fields enclosed in headers_start/headers_end are copied
 	 * using a single memcpy() in __copy_skb_header()
 	 */
@@ -866,6 +871,11 @@ struct sk_buff {
 				*data;
 	unsigned int		truesize;
 	refcount_t		users;
+
+#ifdef CONFIG_SKB_EXTENSIONS
+	/* only useable after checking ->active_extensions != 0 */
+	struct skb_ext		*extensions;
+#endif
 };
 
 #ifdef __KERNEL__
@@ -3889,6 +3899,118 @@ static inline void nf_conntrack_get(struct nf_conntrack *nfct)
 		atomic_inc(&nfct->use);
 }
 #endif
+
+#ifdef CONFIG_SKB_EXTENSIONS
+enum skb_ext_id {
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	SKB_EXT_BRIDGE_NF,
+#endif
+	SKB_EXT_NUM, /* number of ids, sizes skb_ext::offset[]; must be last */
+};
+
+/* each extension starts at an offset that is a multiple of this many bytes */
+#define SKB_EXT_ALIGN	8
+/* offsets/len are stored right-shifted by this amount; left-shift
+ * them by it to translate back to bytes
+ */
+#define SKB_EXT_ALIGN_SHIFT 3
+
+/**
+ *	struct skb_ext - sk_buff extensions
+ *	@refcnt: 1 on allocation, deallocated on 0
+ *	@offset: per-extension offset, counted from the start of this
+ *		 structure, in SKB_EXT_ALIGN units; 0 means the extension
+ *		 has not been allocated
+ *	@len: total size currently allocated (this header included),
+ *	      stored in SKB_EXT_ALIGN units
+ *	@data: start of extension data, variable sized
+ *
+ *	Note: offsets and len are stored in chunks of 8 bytes, this allows
+ *	the use of 'u8' types while still addressing up to 255 * 8 = 2040
+ *	bytes (just under 2kb) of header plus extension data.
+ */
+struct skb_ext {
+	refcount_t refcnt;
+	u8 offset[SKB_EXT_NUM]; /* chunks of 8 bytes */
+	u8 len;			/* same, i.e. size == len << 3 */
+	char data[] __aligned(SKB_EXT_ALIGN);
+};
+
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id);
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id);
+void __skb_ext_free(struct skb_ext *ext);
+
+/* Drop one reference on @ext (NULL is ignored) and free the extension
+ * area once the last reference is gone.
+ */
+static inline void __skb_ext_put(struct skb_ext *ext)
+{
+	if (!ext)
+		return;
+
+	if (refcount_dec_and_test(&ext->refcnt))
+		__skb_ext_free(ext);
+}
+
+/* Release @skb's reference on its extension area, if it has one. */
+static inline void skb_ext_put(struct sk_buff *skb)
+{
+	/* ->extensions is only meaningful while ->active_extensions != 0 */
+	if (!skb->active_extensions)
+		return;
+
+	__skb_ext_put(skb->extensions);
+}
+
+/* Take an additional reference on @skb's extension area, if it has one. */
+static inline void skb_ext_get(struct sk_buff *skb)
+{
+	struct skb_ext *area;
+
+	/* ->extensions is only meaningful while ->active_extensions != 0 */
+	if (!skb->active_extensions)
+		return;
+
+	area = skb->extensions;
+	if (area)
+		refcount_inc(&area->refcnt);
+}
+
+/* Make @dst share @src's extension area: copy the active bitmask and,
+ * when any extension is active, the area pointer plus one extra reference.
+ * Whatever @dst referenced before is NOT released here.
+ */
+static inline void __skb_ext_copy(struct sk_buff *dst,
+				  const struct sk_buff *src)
+{
+	struct skb_ext *shared;
+
+	dst->active_extensions = src->active_extensions;
+	if (!src->active_extensions)
+		return;
+
+	shared = src->extensions;
+	dst->extensions = shared;
+	if (shared)
+		refcount_inc(&shared->refcnt);
+}
+
+/* Make @dst share @src's extensions, releasing what @dst held before. */
+static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *src)
+{
+	/* release first: __skb_ext_copy() overwrites dst's extension state */
+	skb_ext_put(dst);
+	__skb_ext_copy(dst, src);
+}
+
+/* True if space for extension @i has been set up inside @ext. */
+static inline bool __skb_ext_exist(const struct skb_ext *ext, enum skb_ext_id i)
+{
+	/* a zero offset marks a slot that was never allocated */
+	return ext->offset[i] != 0;
+}
+
+/* True if extension @id is marked active in @skb's bitmask. */
+static inline bool skb_ext_exist(const struct sk_buff *skb, enum skb_ext_id id)
+{
+	return (skb->active_extensions & (1 << id)) != 0;
+}
+
+/* Deactivate extension @id on @skb; does nothing if it is not active. */
+static inline void skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+	if (!skb_ext_exist(skb, id))
+		return;
+
+	__skb_ext_del(skb, id);
+}
+
+/* Return a pointer to extension @id of @skb, or NULL if @skb does not
+ * carry that extension.
+ */
+static inline void *skb_ext_find(const struct sk_buff *skb, enum skb_ext_id id)
+{
+	struct skb_ext *ext;
+
+	if (!skb_ext_exist(skb, id))
+		return NULL;
+
+	ext = skb->extensions;
+	if (!ext || !__skb_ext_exist(ext, id))
+		return NULL;
+
+	/* offset[] is kept in SKB_EXT_ALIGN units, relative to @ext itself */
+	return (void *)ext + (ext->offset[id] << SKB_EXT_ALIGN_SHIFT);
+}
+#else /* !CONFIG_SKB_EXTENSIONS: the helpers below are empty stubs */
+static inline void skb_ext_put(struct sk_buff *skb) {}
+static inline void skb_ext_get(struct sk_buff *skb) {}
+static inline void skb_ext_del(struct sk_buff *skb, int unused) {}
+static inline void __skb_ext_copy(struct sk_buff *d, const struct sk_buff *s) {}
+static inline void skb_ext_copy(struct sk_buff *dst, const struct sk_buff *s) {}
+#endif /* CONFIG_SKB_EXTENSIONS */
+
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 static inline void nf_bridge_put(struct nf_bridge_info *nf_bridge)
 {
diff --git a/net/Kconfig b/net/Kconfig
index f235edb593ba..93b291292860 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -51,6 +51,9 @@ config NET_INGRESS
 config NET_EGRESS
 	bool
 
+config SKB_EXTENSIONS
+	bool
+
 menu "Networking options"
 
 source "net/packet/Kconfig"
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 02cd7ae3d0fb..e29016030633 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -617,6 +617,7 @@ void skb_release_head_state(struct sk_buff *skb)
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	nf_bridge_put(skb->nf_bridge);
 #endif
+	skb_ext_put(skb);
 }
 
 /* Free everything but the sk_buff shell. */
@@ -796,6 +797,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->dev		= old->dev;
 	memcpy(new->cb, old->cb, sizeof(old->cb));
 	skb_dst_copy(new, old);
+	__skb_ext_copy(new, old);
 #ifdef CONFIG_XFRM
 	new->sp			= secpath_get(old->sp);
 #endif
@@ -5531,3 +5533,132 @@ void skb_condense(struct sk_buff *skb)
 	 */
 	skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
 }
+
+#ifdef CONFIG_SKB_EXTENSIONS
+static const u8 skb_ext_type_len[] = { /* size in bytes, indexed by skb_ext_id */
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	[SKB_EXT_BRIDGE_NF] = sizeof(struct nf_bridge_info),
+#endif
+};
+
+/* Translate the stored offset of extension @id into an address inside @ext. */
+static void *skb_ext_get_ptr(struct skb_ext *ext, enum skb_ext_id id)
+{
+	/* offset[] counts SKB_EXT_ALIGN-sized chunks from the start of @ext */
+	unsigned int byte_off = ext->offset[id] << SKB_EXT_ALIGN_SHIFT;
+
+	return (void *)ext + byte_off;
+}
+
+/* Allocate a private extension area of @len bytes.
+ *
+ * With @old == NULL the new area starts out with all offsets cleared.
+ * Otherwise the first old->len chunks of @old are copied over and the
+ * caller's reference on @old is dropped.
+ *
+ * Returns the new area holding a single reference, or NULL if the
+ * allocation failed (in which case @old is left untouched).
+ */
+static struct skb_ext *skb_ext_cow(unsigned int len,
+				   struct skb_ext *old)
+{
+	struct skb_ext *copy;
+
+	copy = kmalloc(len, GFP_ATOMIC);
+	if (!copy)
+		return NULL;
+
+	if (old)
+		memcpy(copy, old, old->len << SKB_EXT_ALIGN_SHIFT);
+	else
+		memset(copy->offset, 0, sizeof(copy->offset));
+
+	/* set after the memcpy(), which also copied old's refcount */
+	refcount_set(&copy->refcnt, 1);
+
+	if (old)
+		__skb_ext_put(old);
+
+	return copy;
+}
+
+static __always_inline unsigned int skb_ext_total_length(void)
+{
+	return 0 + /* sum of skb_ext_type_len[] over every extension that is
+		    * compiled in; the leading and trailing 0 keep the
+		    * expression well-formed whichever #if blocks are enabled.
+		    * Kept as one expression: skb_ext_add() feeds the result
+		    * to BUILD_BUG_ON().
+		    */
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+		skb_ext_type_len[SKB_EXT_BRIDGE_NF] +
+#endif
+		0;
+}
+
+/**
+ * skb_ext_add - allocate space for given extension, COW if needed
+ * @skb: buffer
+ * @id: extension to allocate space for
+ *
+ * Allocates enough space for the given extension.
+ * If the extension is already present, a pointer to that extension
+ * is returned.
+ *
+ * If the skb was cloned, COW applies and the returned memory can be
+ * modified without changing the extension space of cloned buffers.
+ *
+ * Returns pointer to the extension or NULL on allocation failure.
+ * On failure the skb and its existing extension area are left unchanged.
+ */
+void *skb_ext_add(struct sk_buff *skb, enum skb_ext_id id)
+{
+	unsigned int newlen, newoff, oldlen;
+	struct skb_ext *new, *old = NULL;
+	bool cow_needed = true;
+
+	/* ->active_extensions is a u8 bitmask with one bit per id */
+	BUILD_BUG_ON(SKB_EXT_NUM >= 8);
+	/* ->len is a u8 counting SKB_EXT_ALIGN sized chunks */
+	BUILD_BUG_ON(skb_ext_total_length() > (255 << SKB_EXT_ALIGN_SHIFT));
+
+	if (skb->active_extensions) {
+		old = skb->extensions;
+
+		/* more than one reference: area is shared with a clone */
+		cow_needed = refcount_read(&old->refcnt) > 1;
+
+		if (__skb_ext_exist(old, id)) {
+			if (!cow_needed) {
+				new = old;
+				goto set_active;
+			}
+
+			/* extension was allocated previously and it
+			 * might be used by a cloned skb. COW needed.
+			 */
+			new = skb_ext_cow(old->len << SKB_EXT_ALIGN_SHIFT, old);
+			if (!new)
+				return NULL;
+
+			skb->extensions = new;
+			goto set_active;
+		}
+		oldlen = old->len << SKB_EXT_ALIGN_SHIFT;
+	} else {
+		/* fresh area: the first extension follows the header */
+		oldlen = sizeof(*new);
+	}
+
+	newoff = ALIGN(oldlen, SKB_EXT_ALIGN);
+	/* Round the total up to a whole chunk: ->len is stored in
+	 * SKB_EXT_ALIGN units, so an unaligned size would be truncated.
+	 * A later COW memcpy() would then drop the tail of this extension
+	 * and the next extension's offset would overlap it.
+	 */
+	newlen = ALIGN(newoff + skb_ext_type_len[id], SKB_EXT_ALIGN);
+
+	if (cow_needed)
+		new = skb_ext_cow(newlen, old);
+	else
+		new = krealloc(old, newlen, GFP_ATOMIC);
+	if (!new)
+		return NULL;
+
+	new->offset[id] = newoff >> SKB_EXT_ALIGN_SHIFT;
+	new->len = newlen >> SKB_EXT_ALIGN_SHIFT;
+	skb->extensions = new;
+set_active:
+	skb->active_extensions |= 1 << id;
+	return skb_ext_get_ptr(new, id);
+}
+EXPORT_SYMBOL(skb_ext_add);
+
+/* Clear extension @id from @skb's active bitmask. When that was the last
+ * active extension, also detach the extension area from @skb and drop
+ * the skb's reference on it.
+ */
+void __skb_ext_del(struct sk_buff *skb, enum skb_ext_id id)
+{
+	struct skb_ext *area;
+
+	skb->active_extensions &= ~(1 << id);
+	if (skb->active_extensions)
+		return;
+
+	area = skb->extensions;
+	skb->extensions = NULL;
+	__skb_ext_put(area);
+}
+EXPORT_SYMBOL(__skb_ext_del);
+
+/* Release the memory backing @ext; __skb_ext_put() calls this once the
+ * last reference has been dropped.
+ */
+void __skb_ext_free(struct skb_ext *ext)
+{
+	kfree(ext);
+}
+EXPORT_SYMBOL(__skb_ext_free);
+#endif /* CONFIG_SKB_EXTENSIONS */
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index c09219e7f230..a12e12f983d5 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -533,6 +533,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->tc_index = from->tc_index;
 #endif
 	nf_copy(to, from);
+	skb_ext_copy(to, from);
 #if IS_ENABLED(CONFIG_IP_VS)
 	to->ipvs_property = from->ipvs_property;
 #endif
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 89e0d5118afe..7eeb0f24be87 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -574,6 +574,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
 	to->tc_index = from->tc_index;
 #endif
 	nf_copy(to, from);
+	skb_ext_copy(to, from);
 	skb_copy_secmark(to, from);
 }
 
-- 
2.18.1

  parent reply	other threads:[~2018-11-26 22:35 UTC|newest]

Thread overview: 10+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2018-11-26 11:38 [RFC PATCH 0/3] sk_buff: add skb extension infrastructure Florian Westphal
2018-11-26 11:38 ` [RFC PATCH 1/3] netfilter: avoid using skb->nf_bridge directly Florian Westphal
2018-11-26 11:38 ` Florian Westphal [this message]
2018-11-26 11:38 ` [RFC PATCH 3/3] net: convert bridge_nf to use skb extension infrastructure Florian Westphal
2018-11-26 20:41 ` [RFC PATCH 0/3] sk_buff: add " David Miller
2018-11-26 21:19   ` Florian Westphal
2018-11-26 21:27     ` David Miller
2018-11-26 21:41       ` Florian Westphal
2018-11-26 23:05 ` Eric Dumazet
2018-11-28  0:39   ` David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20181126113857.29270-3-fw@strlen.de \
    --to=fw@strlen.de \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.