All of lore.kernel.org
 help / color / mirror / Atom feed
From: Daniel Mack <daniel-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>
To: htejun-b10kYP2dOMg@public.gmane.org,
	daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org,
	ast-b10kYP2dOMg@public.gmane.org
Cc: davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org,
	kafai-b10kYP2dOMg@public.gmane.org,
	fw-HFFVJYpyMKqzQB+pC5nmwQ@public.gmane.org,
	pablo-Cap9r6Oaw4JrovVCs/uTlw@public.gmane.org,
	harald-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org,
	netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	sargun-GaZTRHToo+CzQB+pC5nmwQ@public.gmane.org,
	cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org,
	Daniel Mack <daniel-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>
Subject: [PATCH v7 2/6] cgroup: add support for eBPF programs
Date: Tue, 25 Oct 2016 12:14:10 +0200	[thread overview]
Message-ID: <1477390454-12553-3-git-send-email-daniel@zonque.org> (raw)
In-Reply-To: <1477390454-12553-1-git-send-email-daniel-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>

This patch adds two sets of eBPF program pointers to struct cgroup.
One for such that are directly pinned to a cgroup, and one for such
that are effective for it.

To illustrate the logic behind that, assume the following example
cgroup hierarchy.

  A - B - C
        \ D - E

If only B has a program attached, it will be effective for B, C, D
and E. If D then attaches a program itself, that will be effective for
both D and E, and the program in B will only affect B and C. Only one
program of a given type is effective for a cgroup.

Attaching and detaching programs will be done through the bpf(2)
syscall. For now, ingress and egress inet socket filtering are the
only supported use-cases.

Signed-off-by: Daniel Mack <daniel-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>
Acked-by: Alexei Starovoitov <ast-DgEjT+Ai2ygdnm+yROfE0A@public.gmane.org>
---
 include/linux/bpf-cgroup.h  |  71 +++++++++++++++++++
 include/linux/cgroup-defs.h |   4 ++
 init/Kconfig                |  12 ++++
 kernel/bpf/Makefile         |   1 +
 kernel/bpf/cgroup.c         | 167 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/cgroup.c             |  18 +++++
 6 files changed, 273 insertions(+)
 create mode 100644 include/linux/bpf-cgroup.h
 create mode 100644 kernel/bpf/cgroup.c

diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
new file mode 100644
index 0000000..fc076de
--- /dev/null
+++ b/include/linux/bpf-cgroup.h
@@ -0,0 +1,71 @@
+#ifndef _BPF_CGROUP_H
+#define _BPF_CGROUP_H
+
+#include <linux/bpf.h>
+#include <linux/jump_label.h>
+#include <uapi/linux/bpf.h>
+
+struct sock;
+struct cgroup;
+struct sk_buff;
+
+#ifdef CONFIG_CGROUP_BPF
+
+extern struct static_key_false cgroup_bpf_enabled_key;
+#define cgroup_bpf_enabled static_branch_unlikely(&cgroup_bpf_enabled_key)
+
+struct cgroup_bpf {
+	/*
+	 * Store two sets of bpf_prog pointers, one for programs that are
+	 * pinned directly to this cgroup, and one for those that are effective
+	 * when this cgroup is accessed.
+	 */
+	struct bpf_prog *prog[MAX_BPF_ATTACH_TYPE];
+	struct bpf_prog *effective[MAX_BPF_ATTACH_TYPE];
+};
+
+void cgroup_bpf_put(struct cgroup *cgrp);
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent);
+
+void __cgroup_bpf_update(struct cgroup *cgrp,
+			 struct cgroup *parent,
+			 struct bpf_prog *prog,
+			 enum bpf_attach_type type);
+
+/* Wrapper for __cgroup_bpf_update() protected by cgroup_mutex */
+void cgroup_bpf_update(struct cgroup *cgrp,
+		       struct bpf_prog *prog,
+		       enum bpf_attach_type type);
+
+int __cgroup_bpf_run_filter(struct sock *sk,
+			    struct sk_buff *skb,
+			    enum bpf_attach_type type);
+
+/* Wrapper for __cgroup_bpf_run_filter() guarded by cgroup_bpf_enabled */
+static inline int cgroup_bpf_run_filter(struct sock *sk,
+					struct sk_buff *skb,
+					enum bpf_attach_type type)
+{
+	if (cgroup_bpf_enabled)
+		return __cgroup_bpf_run_filter(sk, skb, type);
+
+	return 0;
+}
+
+#else
+
+struct cgroup_bpf {};
+static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
+static inline void cgroup_bpf_inherit(struct cgroup *cgrp,
+				      struct cgroup *parent) {}
+
+static inline int cgroup_bpf_run_filter(struct sock *sk,
+					struct sk_buff *skb,
+					enum bpf_attach_type type)
+{
+	return 0;
+}
+
+#endif /* CONFIG_CGROUP_BPF */
+
+#endif /* _BPF_CGROUP_H */
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 5b17de6..861b467 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -16,6 +16,7 @@
 #include <linux/percpu-refcount.h>
 #include <linux/percpu-rwsem.h>
 #include <linux/workqueue.h>
+#include <linux/bpf-cgroup.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -300,6 +301,9 @@ struct cgroup {
 	/* used to schedule release agent */
 	struct work_struct release_agent_work;
 
+	/* used to store eBPF programs */
+	struct cgroup_bpf bpf;
+
 	/* ids of the ancestors at each level including self */
 	int ancestor_ids[];
 };
diff --git a/init/Kconfig b/init/Kconfig
index 34407f1..405120b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1154,6 +1154,18 @@ config CGROUP_PERF
 
 	  Say N if unsure.
 
+config CGROUP_BPF
+	bool "Support for eBPF programs attached to cgroups"
+	depends on BPF_SYSCALL && SOCK_CGROUP_DATA
+	help
+	  Allow attaching eBPF programs to a cgroup using the bpf(2)
+	  syscall command BPF_PROG_ATTACH.
+
+	  In which context these programs are accessed depends on the type
+	  of attachment. For instance, programs that are attached using
+	  BPF_CGROUP_INET_INGRESS will be executed on the ingress path of
+	  inet sockets.
+
 config CGROUP_DEBUG
 	bool "Example controller"
 	default n
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index eed911d..b22256b 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -5,3 +5,4 @@ obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o
 ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
 endif
+obj-$(CONFIG_CGROUP_BPF) += cgroup.o
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
new file mode 100644
index 0000000..a0ab43f
--- /dev/null
+++ b/kernel/bpf/cgroup.c
@@ -0,0 +1,167 @@
+/*
+ * Functions to manage eBPF programs attached to cgroups
+ *
+ * Copyright (c) 2016 Daniel Mack
+ *
+ * This file is subject to the terms and conditions of version 2 of the GNU
+ * General Public License.  See the file COPYING in the main directory of the
+ * Linux distribution for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/atomic.h>
+#include <linux/cgroup.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/bpf-cgroup.h>
+#include <net/sock.h>
+
+DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
+EXPORT_SYMBOL(cgroup_bpf_enabled_key);
+
+/**
+ * cgroup_bpf_put() - put references of all bpf programs
+ * @cgrp: the cgroup to modify
+ */
+void cgroup_bpf_put(struct cgroup *cgrp)
+{
+	unsigned int type;
+
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
+		struct bpf_prog *prog = cgrp->bpf.prog[type];
+
+		if (prog) {
+			bpf_prog_put(prog);
+			static_branch_dec(&cgroup_bpf_enabled_key);
+		}
+	}
+}
+
+/**
+ * cgroup_bpf_inherit() - inherit effective programs from parent
+ * @cgrp: the cgroup to modify
+ * @parent: the parent to inherit from
+ */
+void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
+{
+	unsigned int type;
+
+	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
+		struct bpf_prog *e;
+
+		e = rcu_dereference_protected(parent->bpf.effective[type],
+					      lockdep_is_held(&cgroup_mutex));
+		rcu_assign_pointer(cgrp->bpf.effective[type], e);
+	}
+}
+
+/**
+ * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
+ *                         propagate the change to descendants
+ * @cgrp: The cgroup which descendants to traverse
+ * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
+ * @prog: A new program to pin
+ * @type: Type of pinning operation (ingress/egress)
+ *
+ * Each cgroup has a set of two pointers for bpf programs; one for eBPF
+ * programs it owns, and which is effective for execution.
+ *
+ * If @prog is %NULL, this function attaches a new program to the cgroup and
+ * releases the one that is currently attached, if any. @prog is then made
+ * the effective program of type @type in that cgroup.
+ *
+ * If @prog is %NULL, the currently attached program of type @type is released,
+ * and the effective program of the parent cgroup (if any) is inherited to
+ * @cgrp.
+ *
+ * Then, the descendants of @cgrp are walked and the effective program for
+ * each of them is set to the effective program of @cgrp unless the
+ * descendant has its own program attached, in which case the subbranch is
+ * skipped. This ensures that delegated subcgroups with own programs are left
+ * untouched.
+ *
+ * Must be called with cgroup_mutex held.
+ */
+void __cgroup_bpf_update(struct cgroup *cgrp,
+			 struct cgroup *parent,
+			 struct bpf_prog *prog,
+			 enum bpf_attach_type type)
+{
+	struct bpf_prog *old_prog, *effective;
+	struct cgroup_subsys_state *pos;
+
+	old_prog = xchg(cgrp->bpf.prog + type, prog);
+
+	effective = (!prog && parent) ?
+		rcu_dereference_protected(parent->bpf.effective[type],
+					  lockdep_is_held(&cgroup_mutex)) :
+		prog;
+
+	css_for_each_descendant_pre(pos, &cgrp->self) {
+		struct cgroup *desc = container_of(pos, struct cgroup, self);
+
+		/* skip the subtree if the descendant has its own program */
+		if (desc->bpf.prog[type] && desc != cgrp)
+			pos = css_rightmost_descendant(pos);
+		else
+			rcu_assign_pointer(desc->bpf.effective[type],
+					   effective);
+	}
+
+	if (prog)
+		static_branch_inc(&cgroup_bpf_enabled_key);
+
+	if (old_prog) {
+		bpf_prog_put(old_prog);
+		static_branch_dec(&cgroup_bpf_enabled_key);
+	}
+}
+
+/**
+ * __cgroup_bpf_run_filter() - Run a program for packet filtering
+ * @sk: The socken sending or receiving traffic
+ * @skb: The skb that is being sent or received
+ * @type: The type of program to be exectuted
+ *
+ * If no socket is passed, or the socket is not of type INET or INET6,
+ * this function does nothing and returns 0.
+ *
+ * The program type passed in via @type must be suitable for network
+ * filtering. No further check is performed to assert that.
+ *
+ * This function will return %-EPERM if any if an attached program was found
+ * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ */
+int __cgroup_bpf_run_filter(struct sock *sk,
+			    struct sk_buff *skb,
+			    enum bpf_attach_type type)
+{
+	struct bpf_prog *prog;
+	struct cgroup *cgrp;
+	int ret = 0;
+
+	if (!sk || !sk_fullsock(sk))
+		return 0;
+
+	if (sk->sk_family != AF_INET &&
+	    sk->sk_family != AF_INET6)
+		return 0;
+
+	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+
+	rcu_read_lock();
+
+	prog = rcu_dereference(cgrp->bpf.effective[type]);
+	if (prog) {
+		unsigned int offset = skb->data - skb_network_header(skb);
+
+		__skb_push(skb, offset);
+		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
+		__skb_pull(skb, offset);
+	}
+
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 85bc9be..2ee9ec3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -5074,6 +5074,8 @@ static void css_release_work_fn(struct work_struct *work)
 		if (cgrp->kn)
 			RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv,
 					 NULL);
+
+		cgroup_bpf_put(cgrp);
 	}
 
 	mutex_unlock(&cgroup_mutex);
@@ -5281,6 +5283,9 @@ static struct cgroup *cgroup_create(struct cgroup *parent)
 	if (!cgroup_on_dfl(cgrp))
 		cgrp->subtree_control = cgroup_control(cgrp);
 
+	if (parent)
+		cgroup_bpf_inherit(cgrp, parent);
+
 	cgroup_propagate_control(cgrp);
 
 	/* @cgrp doesn't have dir yet so the following will only create csses */
@@ -6495,6 +6500,19 @@ static __init int cgroup_namespaces_init(void)
 }
 subsys_initcall(cgroup_namespaces_init);
 
+#ifdef CONFIG_CGROUP_BPF
+void cgroup_bpf_update(struct cgroup *cgrp,
+		       struct bpf_prog *prog,
+		       enum bpf_attach_type type)
+{
+	struct cgroup *parent = cgroup_parent(cgrp);
+
+	mutex_lock(&cgroup_mutex);
+	__cgroup_bpf_update(cgrp, parent, prog, type);
+	mutex_unlock(&cgroup_mutex);
+}
+#endif /* CONFIG_CGROUP_BPF */
+
 #ifdef CONFIG_CGROUP_DEBUG
 static struct cgroup_subsys_state *
 debug_css_alloc(struct cgroup_subsys_state *parent_css)
-- 
2.7.4

  parent reply	other threads:[~2016-10-25 10:14 UTC|newest]

Thread overview: 25+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2016-10-25 10:14 [PATCH v7 0/6] Add eBPF hooks for cgroups Daniel Mack
2016-10-25 10:14 ` [PATCH v7 1/6] bpf: add new prog type for cgroup socket filtering Daniel Mack
2016-10-25 10:14 ` [PATCH v7 4/6] net: filter: run cgroup eBPF ingress programs Daniel Mack
     [not found] ` <1477390454-12553-1-git-send-email-daniel-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>
2016-10-25 10:14   ` Daniel Mack [this message]
2016-10-25 10:14   ` [PATCH v7 3/6] bpf: add BPF_PROG_ATTACH and BPF_PROG_DETACH commands Daniel Mack
2016-10-25 10:14   ` [PATCH v7 5/6] net: ipv4, ipv6: run cgroup eBPF egress programs Daniel Mack
2016-10-31 16:40     ` David Miller
     [not found]       ` <20161031.124003.1361406552151798940.davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org>
2016-11-02  1:17         ` Daniel Borkmann
2016-10-25 10:14   ` [PATCH v7 6/6] samples: bpf: add userspace example for attaching eBPF programs to cgroups Daniel Mack
2016-10-26 19:59 ` [PATCH v7 0/6] Add eBPF hooks for cgroups Pablo Neira Ayuso
2016-10-27  3:35   ` Alexei Starovoitov
     [not found]     ` <20161027033502.GA43960-+o4/htvd0TDFYCXBM6kdu7fOX0fSgVTm@public.gmane.org>
2016-10-28 11:28       ` Pablo Neira Ayuso
2016-10-28 15:00         ` David Ahern
2016-10-29  1:42         ` Alexei Starovoitov
2016-10-27  8:40   ` Daniel Mack
     [not found]     ` <c9683122-d770-355b-e275-7c446e6d1d0f-cYrQPVfZoowdnm+yROfE0A@public.gmane.org>
2016-10-28 11:53       ` Pablo Neira Ayuso
2016-10-28 12:07         ` Daniel Mack
2016-10-29  3:51       ` Lorenzo Colitti
     [not found]         ` <CAKD1Yr2aRDNUxX8onReZyURufphxGoSTek=Fjk3Wswq9WOVp4w-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-10-29  4:51           ` Alexei Starovoitov
     [not found]             ` <20161029045107.GA61294-+o4/htvd0TDFYCXBM6kdu7fOX0fSgVTm@public.gmane.org>
2016-10-29  4:59               ` Lorenzo Colitti
     [not found]                 ` <CAKD1Yr2pMk52h7BdRwTvGwnP5+ONmr4ac6cyUBoZ9P+Kt-B8jw-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-10-29  6:24                   ` Alexei Starovoitov
2016-10-29 15:34                     ` Lorenzo Colitti
2016-10-29 20:29                       ` Daniel Borkmann
2016-11-01 15:25                         ` Lorenzo Colitti
     [not found]                           ` <CAKD1Yr02SCHvd-xZJL14d_Ta8Dk4evHZ60zytpU0h4r80FucwA-JsoAwUIsXosN+BqQ9rBEUg@public.gmane.org>
2016-11-01 15:38                             ` David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1477390454-12553-3-git-send-email-daniel@zonque.org \
    --to=daniel-cyrqpvfzoowdnm+yrofe0a@public.gmane.org \
    --cc=ast-b10kYP2dOMg@public.gmane.org \
    --cc=cgroups-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=daniel-FeC+5ew28dpmcu3hnIyYJQ@public.gmane.org \
    --cc=davem-fT/PcQaiUtIeIZ0/mPfg9Q@public.gmane.org \
    --cc=fw-HFFVJYpyMKqzQB+pC5nmwQ@public.gmane.org \
    --cc=harald-H+wXaHxf7aLQT0dZR+AlfA@public.gmane.org \
    --cc=htejun-b10kYP2dOMg@public.gmane.org \
    --cc=kafai-b10kYP2dOMg@public.gmane.org \
    --cc=netdev-u79uwXL29TY76Z2rM5mHXA@public.gmane.org \
    --cc=pablo-Cap9r6Oaw4JrovVCs/uTlw@public.gmane.org \
    --cc=sargun-GaZTRHToo+CzQB+pC5nmwQ@public.gmane.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.