bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
@ 2019-08-27 20:52 Alexei Starovoitov
  2019-08-27 23:01 ` Andy Lutomirski
  2019-08-28 10:38 ` kbuild test robot
  0 siblings, 2 replies; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-27 20:52 UTC (permalink / raw)
  To: luto; +Cc: davem, daniel, netdev, bpf, kernel-team, linux-api

Introduce CAP_BPF that allows loading all types of BPF programs,
creating most map types, loading BTF, and iterating programs and maps.
CAP_BPF alone is not enough to attach or run programs.

Networking:

CAP_BPF and CAP_NET_ADMIN are necessary to:
- attach to cgroup-bpf hooks like INET_INGRESS, INET_SOCK_CREATE, INET4_CONNECT
- run networking bpf programs (like xdp, skb, flow_dissector)

Tracing:

CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
are necessary to:
- attach bpf program to raw tracepoint
- use bpf_trace_printk() in all program types (not only tracing programs)
- create bpf stackmap

To attach bpf to perf_events perf_event_open() needs to succeed as usual.

CAP_BPF controls BPF side.
CAP_NET_ADMIN controls intersection where BPF calls into networking.
perf_paranoid_tracepoint_raw controls intersection where BPF calls into tracing.

In the future CAP_TRACING could be introduced to control
creation of kprobe/uprobe and attaching bpf to perf_events.
In such case bpf_probe_read() thin wrapper would be controlled by CAP_BPF.
Whereas probe_read() would be controlled by CAP_TRACING.
CAP_TRACING would also control generic kprobe+probe_read.
CAP_BPF and CAP_TRACING would be necessary for tracing bpf programs
that want to use bpf_probe_read.

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
---
I would prefer to introduce CAP_TRACING soon, since it
will make the tracing and networking permission models symmetrical.

 include/linux/filter.h                      |  1 +
 include/uapi/linux/capability.h             |  5 ++-
 kernel/bpf/arraymap.c                       |  2 +-
 kernel/bpf/cgroup.c                         |  2 +-
 kernel/bpf/core.c                           | 10 ++++--
 kernel/bpf/cpumap.c                         |  2 +-
 kernel/bpf/hashtab.c                        |  4 +--
 kernel/bpf/lpm_trie.c                       |  2 +-
 kernel/bpf/queue_stack_maps.c               |  2 +-
 kernel/bpf/reuseport_array.c                |  2 +-
 kernel/bpf/stackmap.c                       |  2 +-
 kernel/bpf/syscall.c                        | 32 ++++++++++-------
 kernel/bpf/verifier.c                       |  4 +--
 kernel/trace/bpf_trace.c                    |  2 +-
 net/core/bpf_sk_storage.c                   |  2 +-
 net/core/filter.c                           | 10 +++---
 security/selinux/include/classmap.h         |  4 +--
 tools/testing/selftests/bpf/test_verifier.c | 39 ++++++++++++++++-----
 18 files changed, 84 insertions(+), 43 deletions(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 92c6e31fb008..16cea50af014 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -857,6 +857,7 @@ static inline bool bpf_dump_raw_ok(void)
 	return kallsyms_show_value() == 1;
 }
 
+bool cap_bpf_tracing(void);
 struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off,
 				       const struct bpf_insn *patch, u32 len);
 int bpf_remove_insns(struct bpf_prog *prog, u32 off, u32 cnt);
diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h
index 240fdb9a60f6..b3390f34c9f5 100644
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -366,8 +366,11 @@ struct vfs_ns_cap_data {
 
 #define CAP_AUDIT_READ		37
 
+/* Allow bpf() syscall except attach and tracing */
 
-#define CAP_LAST_CAP         CAP_AUDIT_READ
+#define CAP_BPF			38
+
+#define CAP_LAST_CAP         CAP_BPF
 
 #define cap_valid(x) ((x) >= 0 && (x) <= CAP_LAST_CAP)
 
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index 1c65ce0098a9..045e30b7160d 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -73,7 +73,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
 	int ret, numa_node = bpf_map_attr_numa_node(attr);
 	u32 elem_size, index_mask, max_entries;
-	bool unpriv = !capable(CAP_SYS_ADMIN);
+	bool unpriv = !capable(CAP_BPF);
 	u64 cost, array_size, mask64;
 	struct bpf_map_memory mem;
 	struct bpf_array *array;
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 6a6a154cfa7b..97f733354421 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -795,7 +795,7 @@ cgroup_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 	case BPF_FUNC_get_current_cgroup_id:
 		return &bpf_get_current_cgroup_id_proto;
 	case BPF_FUNC_trace_printk:
-		if (capable(CAP_SYS_ADMIN))
+		if (cap_bpf_tracing())
 			return bpf_get_trace_printk_proto();
 		/* fall through */
 	default:
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 8191a7db2777..5756c8a56f44 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -646,7 +646,7 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
 	if (!bpf_prog_kallsyms_candidate(fp) ||
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_BPF))
 		return;
 
 	spin_lock_bh(&bpf_lock);
@@ -768,7 +768,7 @@ static int bpf_jit_charge_modmem(u32 pages)
 {
 	if (atomic_long_add_return(pages, &bpf_jit_current) >
 	    (bpf_jit_limit >> PAGE_SHIFT)) {
-		if (!capable(CAP_SYS_ADMIN)) {
+		if (!capable(CAP_BPF)) {
 			atomic_long_sub(pages, &bpf_jit_current);
 			return -EPERM;
 		}
@@ -2104,6 +2104,12 @@ int __weak skb_copy_bits(const struct sk_buff *skb, int offset, void *to,
 DEFINE_STATIC_KEY_FALSE(bpf_stats_enabled_key);
 EXPORT_SYMBOL(bpf_stats_enabled_key);
 
+bool cap_bpf_tracing(void)
+{
+	return capable(CAP_SYS_ADMIN) ||
+	       (capable(CAP_BPF) && !perf_paranoid_tracepoint_raw());
+}
+
 /* All definitions of tracepoints related to BPF. */
 #define CREATE_TRACE_POINTS
 #include <linux/bpf_trace.h>
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index ef49e17ae47c..ca483c9a9c2e 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -84,7 +84,7 @@ static struct bpf_map *cpu_map_alloc(union bpf_attr *attr)
 	int ret, cpu;
 	u64 cost;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return ERR_PTR(-EPERM);
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 22066a62c8c9..f459315625ac 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -244,9 +244,9 @@ static int htab_map_alloc_check(union bpf_attr *attr)
 	BUILD_BUG_ON(offsetof(struct htab_elem, fnode.next) !=
 		     offsetof(struct htab_elem, hash_node.pprev));
 
-	if (lru && !capable(CAP_SYS_ADMIN))
+	if (lru && !capable(CAP_BPF))
 		/* LRU implementation is much complicated than other
-		 * maps.  Hence, limit to CAP_SYS_ADMIN for now.
+		 * maps.  Hence, limit to CAP_BPF.
 		 */
 		return -EPERM;
 
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index 56e6c75d354d..a45fa5464d98 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -543,7 +543,7 @@ static struct bpf_map *trie_alloc(union bpf_attr *attr)
 	u64 cost = sizeof(*trie), cost_per_node;
 	int ret;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return ERR_PTR(-EPERM);
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/queue_stack_maps.c b/kernel/bpf/queue_stack_maps.c
index f697647ceb54..ca0ba9edca86 100644
--- a/kernel/bpf/queue_stack_maps.c
+++ b/kernel/bpf/queue_stack_maps.c
@@ -45,7 +45,7 @@ static bool queue_stack_map_is_full(struct bpf_queue_stack *qs)
 /* Called from syscall */
 static int queue_stack_map_alloc_check(union bpf_attr *attr)
 {
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	/* check sanity of attributes */
diff --git a/kernel/bpf/reuseport_array.c b/kernel/bpf/reuseport_array.c
index 50c083ba978c..bfad7d41a061 100644
--- a/kernel/bpf/reuseport_array.c
+++ b/kernel/bpf/reuseport_array.c
@@ -154,7 +154,7 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
 	struct bpf_map_memory mem;
 	u64 array_size;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return ERR_PTR(-EPERM);
 
 	array_size = sizeof(*array);
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 052580c33d26..c540b2b3fc4a 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -90,7 +90,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 	u64 cost, n_buckets;
 	int err;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!cap_bpf_tracing())
 		return ERR_PTR(-EPERM);
 
 	if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index c0f62fd67c6b..ef7b06ca30e5 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1176,7 +1176,7 @@ static int map_freeze(const union bpf_attr *attr)
 		err = -EBUSY;
 		goto err_put;
 	}
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!capable(CAP_BPF)) {
 		err = -EPERM;
 		goto err_put;
 	}
@@ -1634,7 +1634,7 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 
 	if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) &&
 	    (attr->prog_flags & BPF_F_ANY_ALIGNMENT) &&
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_BPF))
 		return -EPERM;
 
 	/* copy eBPF program license from user space */
@@ -1647,11 +1647,11 @@ static int bpf_prog_load(union bpf_attr *attr, union bpf_attr __user *uattr)
 	is_gpl = license_is_gpl_compatible(license);
 
 	if (attr->insn_cnt == 0 ||
-	    attr->insn_cnt > (capable(CAP_SYS_ADMIN) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
+	    attr->insn_cnt > (capable(CAP_BPF) ? BPF_COMPLEXITY_LIMIT_INSNS : BPF_MAXINSNS))
 		return -E2BIG;
 	if (type != BPF_PROG_TYPE_SOCKET_FILTER &&
 	    type != BPF_PROG_TYPE_CGROUP_SKB &&
-	    !capable(CAP_SYS_ADMIN))
+	    !capable(CAP_BPF))
 		return -EPERM;
 
 	bpf_prog_load_fixup_attach_type(attr);
@@ -1802,6 +1802,9 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
 	char tp_name[128];
 	int tp_fd, err;
 
+	if (!cap_bpf_tracing())
+		return -EPERM;
+
 	if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name),
 			      sizeof(tp_name) - 1) < 0)
 		return -EFAULT;
@@ -2080,7 +2083,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
 	struct bpf_prog *prog;
 	int ret = -ENOTSUPP;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_NET_ADMIN) || !capable(CAP_BPF))
+		/* test_run callback is available for networking progs only.
+		 * Add cap_bpf_tracing() above when tracing progs become runable.
+		 */
 		return -EPERM;
 	if (CHECK_ATTR(BPF_PROG_TEST_RUN))
 		return -EINVAL;
@@ -2117,7 +2123,7 @@ static int bpf_obj_get_next_id(const union bpf_attr *attr,
 	if (CHECK_ATTR(BPF_OBJ_GET_NEXT_ID) || next_id >= INT_MAX)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	next_id++;
@@ -2143,7 +2149,7 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_PROG_GET_FD_BY_ID))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	spin_lock_bh(&prog_idr_lock);
@@ -2177,7 +2183,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 	    attr->open_flags & ~BPF_OBJ_FLAG_MASK)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	f_flags = bpf_get_file_flag(attr->open_flags);
@@ -2352,7 +2358,7 @@ static int bpf_prog_get_info_by_fd(struct bpf_prog *prog,
 	info.run_time_ns = stats.nsecs;
 	info.run_cnt = stats.cnt;
 
-	if (!capable(CAP_SYS_ADMIN)) {
+	if (!capable(CAP_BPF)) {
 		info.jited_prog_len = 0;
 		info.xlated_prog_len = 0;
 		info.nr_jited_ksyms = 0;
@@ -2670,7 +2676,7 @@ static int bpf_btf_load(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_BTF_LOAD))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	return btf_new_fd(attr);
@@ -2683,7 +2689,7 @@ static int bpf_btf_get_fd_by_id(const union bpf_attr *attr)
 	if (CHECK_ATTR(BPF_BTF_GET_FD_BY_ID))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	return btf_get_fd_by_id(attr->btf_id);
@@ -2752,7 +2758,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr,
 	if (CHECK_ATTR(BPF_TASK_FD_QUERY))
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!cap_bpf_tracing())
 		return -EPERM;
 
 	if (attr->task_fd_query.flags != 0)
@@ -2820,7 +2826,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
 	union bpf_attr attr = {};
 	int err;
 
-	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_SYS_ADMIN))
+	if (sysctl_unprivileged_bpf_disabled && !capable(CAP_BPF))
 		return -EPERM;
 
 	err = bpf_check_uarg_tail_zero(uattr, sizeof(attr), size);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 10c0ff93f52b..5810e8cc9342 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -987,7 +987,7 @@ static void __mark_reg_unbounded(struct bpf_reg_state *reg)
 	reg->umax_value = U64_MAX;
 
 	/* constant backtracking is enabled for root only for now */
-	reg->precise = capable(CAP_SYS_ADMIN) ? false : true;
+	reg->precise = capable(CAP_BPF) ? false : true;
 }
 
 /* Mark a register as having a completely unknown (scalar) value. */
@@ -9233,7 +9233,7 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr,
 		env->insn_aux_data[i].orig_idx = i;
 	env->prog = *prog;
 	env->ops = bpf_verifier_ops[env->prog->type];
-	is_priv = capable(CAP_SYS_ADMIN);
+	is_priv = capable(CAP_BPF);
 
 	/* grab the mutex to protect few globals used by verifier */
 	if (!is_priv)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index ca1255d14576..2bf58ff5bf75 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1246,7 +1246,7 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info)
 	u32 *ids, prog_cnt, ids_len;
 	int ret;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!cap_bpf_tracing())
 		return -EPERM;
 	if (event->attr.type != PERF_TYPE_TRACEPOINT)
 		return -EINVAL;
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index da5639a5bd3b..0b29f6abbeba 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -616,7 +616,7 @@ static int bpf_sk_storage_map_alloc_check(union bpf_attr *attr)
 	    !attr->btf_key_type_id || !attr->btf_value_type_id)
 		return -EINVAL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return -EPERM;
 
 	if (attr->value_size >= KMALLOC_MAX_SIZE -
diff --git a/net/core/filter.c b/net/core/filter.c
index 0c1059cdad3d..986277abfde2 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5990,7 +5990,7 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 		break;
 	}
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!capable(CAP_BPF))
 		return NULL;
 
 	switch (func_id) {
@@ -5999,7 +5999,9 @@ bpf_base_func_proto(enum bpf_func_id func_id)
 	case BPF_FUNC_spin_unlock:
 		return &bpf_spin_unlock_proto;
 	case BPF_FUNC_trace_printk:
-		return bpf_get_trace_printk_proto();
+		if (cap_bpf_tracing())
+			return bpf_get_trace_printk_proto();
+		/* fall through */
 	default:
 		return NULL;
 	}
@@ -6563,7 +6565,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		return false;
 	case bpf_ctx_range(struct __sk_buff, data):
 	case bpf_ctx_range(struct __sk_buff, data_end):
-		if (!capable(CAP_SYS_ADMIN))
+		if (!capable(CAP_BPF))
 			return false;
 		break;
 	}
@@ -6575,7 +6577,7 @@ static bool cg_skb_is_valid_access(int off, int size,
 		case bpf_ctx_range_till(struct __sk_buff, cb[0], cb[4]):
 			break;
 		case bpf_ctx_range(struct __sk_buff, tstamp):
-			if (!capable(CAP_SYS_ADMIN))
+			if (!capable(CAP_BPF))
 				return false;
 			break;
 		default:
diff --git a/security/selinux/include/classmap.h b/security/selinux/include/classmap.h
index 201f7e588a29..1c925bc04072 100644
--- a/security/selinux/include/classmap.h
+++ b/security/selinux/include/classmap.h
@@ -26,9 +26,9 @@
 	    "audit_control", "setfcap"
 
 #define COMMON_CAP2_PERMS  "mac_override", "mac_admin", "syslog", \
-		"wake_alarm", "block_suspend", "audit_read"
+		"wake_alarm", "block_suspend", "audit_read", "bpf"
 
-#if CAP_LAST_CAP > CAP_AUDIT_READ
+#if CAP_LAST_CAP > CAP_BPF
 #error New capability defined, please update COMMON_CAP2_PERMS.
 #endif
 
diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c
index 44e2d640b088..b31b961f1020 100644
--- a/tools/testing/selftests/bpf/test_verifier.c
+++ b/tools/testing/selftests/bpf/test_verifier.c
@@ -805,10 +805,18 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type,
 	}
 }
 
+struct libcap {
+	struct __user_cap_header_struct hdr;
+	struct __user_cap_data_struct data[2];
+};
+
 static int set_admin(bool admin)
 {
 	cap_t caps;
-	const cap_value_t cap_val = CAP_SYS_ADMIN;
+	/* need CAP_BPF to load progs and CAP_NET_ADMIN to run networking progs */
+	const cap_value_t cap_val[] = {38/*CAP_BPF*/, CAP_NET_ADMIN};
+	const cap_value_t cap_val_admin = CAP_SYS_ADMIN;
+	struct libcap *cap;
 	int ret = -1;
 
 	caps = cap_get_proc();
@@ -816,11 +824,23 @@ static int set_admin(bool admin)
 		perror("cap_get_proc");
 		return -1;
 	}
-	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val,
+	cap = (struct libcap *)caps;
+	if (cap_set_flag(caps, CAP_EFFECTIVE, 1, &cap_val_admin, CAP_CLEAR)) {
+		perror("cap_set_flag clear admin");
+		goto out;
+	}
+	if (cap_set_flag(caps, CAP_EFFECTIVE, 2, cap_val,
 				admin ? CAP_SET : CAP_CLEAR)) {
-		perror("cap_set_flag");
+		perror("cap_set_flag set_or_clear bpf+net");
 		goto out;
 	}
+	/* libcap is likely old and simply ignores CAP_BPF,
+	 * so update effective bits manually
+	 */
+	if (admin)
+		cap->data[1].effective |= 1 << (38 - 32);
+	else
+		cap->data[1].effective &= ~(1 << (38 - 32));
 	if (cap_set_proc(caps)) {
 		perror("cap_set_proc");
 		goto out;
@@ -1013,8 +1033,9 @@ static void do_test_single(struct bpf_test *test, bool unpriv,
 static bool is_admin(void)
 {
 	cap_t caps;
-	cap_flag_value_t sysadmin = CAP_CLEAR;
-	const cap_value_t cap_val = CAP_SYS_ADMIN;
+	cap_flag_value_t bpf_priv = CAP_CLEAR;
+	cap_flag_value_t net_priv = CAP_CLEAR;
+	struct libcap *cap;
 
 #ifdef CAP_IS_SUPPORTED
 	if (!CAP_IS_SUPPORTED(CAP_SETFCAP)) {
@@ -1027,11 +1048,13 @@ static bool is_admin(void)
 		perror("cap_get_proc");
 		return false;
 	}
-	if (cap_get_flag(caps, cap_val, CAP_EFFECTIVE, &sysadmin))
-		perror("cap_get_flag");
+	cap = (struct libcap *)caps;
+	bpf_priv = cap->data[1].effective & (1 << (38/* CAP_BPF */ - 32));
+	if (cap_get_flag(caps, CAP_NET_ADMIN, CAP_EFFECTIVE, &net_priv))
+		perror("cap_get_flag NET");
 	if (cap_free(caps))
 		perror("cap_free");
-	return (sysadmin == CAP_SET);
+	return bpf_priv == CAP_SET && net_priv == CAP_SET;
 }
 
 static void get_unpriv_disabled()
-- 
2.20.0


^ permalink raw reply related	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-27 20:52 [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF Alexei Starovoitov
@ 2019-08-27 23:01 ` Andy Lutomirski
  2019-08-27 23:21   ` Steven Rostedt
                     ` (2 more replies)
  2019-08-28 10:38 ` kbuild test robot
  1 sibling, 3 replies; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-27 23:01 UTC (permalink / raw)
  To: Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, Steven Rostedt
  Cc: David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

[adding some security and tracing folks to cc]

On Tue, Aug 27, 2019 at 1:52 PM Alexei Starovoitov <ast@kernel.org> wrote:
>
> Introduce CAP_BPF that allows loading all types of BPF programs,
> create most map types, load BTF, iterate programs and maps.
> CAP_BPF alone is not enough to attach or run programs.
>
> Networking:
>
> CAP_BPF and CAP_NET_ADMIN are necessary to:
> - attach to cgroup-bpf hooks like INET_INGRESS, INET_SOCK_CREATE, INET4_CONNECT
> - run networking bpf programs (like xdp, skb, flow_dissector)
>
> Tracing:
>
> CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
> are necessary to:
> - attach bpf program to raw tracepoint
> - use bpf_trace_printk() in all program types (not only tracing programs)
> - create bpf stackmap
>
> To attach bpf to perf_events perf_event_open() needs to succeed as usual.
>
> CAP_BPF controls BPF side.
> CAP_NET_ADMIN controls intersection where BPF calls into networking.
> perf_paranoid_tracepoint_raw controls intersection where BPF calls into tracing.
>
> In the future CAP_TRACING could be introduced to control
> creation of kprobe/uprobe and attaching bpf to perf_events.
> In such case bpf_probe_read() thin wrapper would be controlled by CAP_BPF.
> Whereas probe_read() would be controlled by CAP_TRACING.
> CAP_TRACING would also control generic kprobe+probe_read.
> CAP_BPF and CAP_TRACING would be necessary for tracing bpf programs
> that want to use bpf_probe_read.

First, some high-level review:

Can you write up some clear documentation aimed at administrators that
says what CAP_BPF does?  For example, is it expected that CAP_BPF by
itself permits reading all kernel memory?  Why might one grant it?

Can you give at least one fully described use case where CAP_BPF
solves a real-world problem that is not solved by existing mechanisms?

Changing the capability that some existing operation requires could
break existing programs.  The old capability may need to be accepted
as well.

I'm inclined to suggest that CAP_TRACING be figured out or rejected
before something like this gets applied.


>
> Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> ---
> I would prefer to introduce CAP_TRACING soon, since it
> will make tracing and networking permission model symmetrical.
>

Here's my proposal for CAP_TRACING, documentation-style:

--- begin ---

CAP_TRACING enables a task to use various kernel features to trace
running user programs and the kernel itself.  CAP_TRACING also enables
a task to bypass some speculation attack countermeasures.  A task in
the init user namespace with CAP_TRACING will be able to tell exactly
what kernel code is executed and when, and will be able to read kernel
registers and kernel memory.  It will, similarly, be able to read the
state of other user tasks.

Specifically, CAP_TRACING allows the following operations.  It may
allow more operations in the future:

 - Full use of perf_event_open(), similarly to the effect of
kernel.perf_event_paranoid == -1.

 - Loading and attaching tracing BPF programs, including use of BPF
raw tracepoints.

 - Use of BPF stack maps.

 - Use of bpf_probe_read() and bpf_trace_printk().

 - Use of unsafe pointer-to-integer conversions in BPF.

 - Bypassing of BPF's speculation attack hardening measures and
constant blinding.  (Note: other mechanisms might also allow this.)

CAP_TRACING does not override normal permissions on sysfs or debugfs.
This means that, unless a new interface for programming kprobes and
such is added, it does not directly allow use of kprobes.

If CAP_TRACING, by itself, enables a task to crash or otherwise
corrupt the kernel or other tasks, this will be considered a kernel
bug.

CAP_TRACING in a non-init user namespace may, in the future, allow
tracing of other tasks in that user namespace or its descendants.  It
will not enable kernel tracing or tracing of tasks outside the user
namespace in question.

--- end ---

Does this sound good?  The idea here is that CAP_TRACING should be
very useful even without CAP_BPF, which allows CAP_BPF to be less
powerful.

> +bool cap_bpf_tracing(void)
> +{
> +       return capable(CAP_SYS_ADMIN) ||
> +              (capable(CAP_BPF) && !perf_paranoid_tracepoint_raw());
> +}

If auditing is on, this will audit the wrong thing.  James, I think a
helper like:

bool ns_either_cap(struct user_ns *ns, int preferred_cap, int other_cap);

would help.  ns_either_cap returns true if either cap is held (i.e.
effective, as usual).  On success, it audits preferred_cap if held and
other_cap otherwise.  On failure, it audits preferred_cap.  Does this
sound right?

Also, for reference, perf_paranoid_tracepoint_raw() is this:

static inline bool perf_paranoid_tracepoint_raw(void)
{
        return sysctl_perf_event_paranoid > -1;
}

so the overall effect of cap_bpf_tracing() is rather odd, and it seems
to control a few things that don't obviously all have similar security
effects.


> @@ -2080,7 +2083,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
>         struct bpf_prog *prog;
>         int ret = -ENOTSUPP;
>
> -       if (!capable(CAP_SYS_ADMIN))
> +       if (!capable(CAP_NET_ADMIN) || !capable(CAP_BPF))
> +               /* test_run callback is available for networking progs only.
> +                * Add cap_bpf_tracing() above when tracing progs become runable.
> +                */

I think test_run should probably be CAP_SYS_ADMIN forever.  test_run
is the only way that one can run a bpf program and call helper
functions via the program if one doesn't have permission to attach the
program.  Also, if there's a way to run a speculation attack via a bpf
program, test_run will make it much easier to do in a controlled
environment.  Finally, when debugging bpf programs, developers can use
their own computers or a VM.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-27 23:01 ` Andy Lutomirski
@ 2019-08-27 23:21   ` Steven Rostedt
  2019-08-27 23:34     ` Andy Lutomirski
                       ` (2 more replies)
  2019-08-28  0:34   ` Alexei Starovoitov
  2019-08-28  7:14   ` Peter Zijlstra
  2 siblings, 3 replies; 46+ messages in thread
From: Steven Rostedt @ 2019-08-27 23:21 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, David S. Miller,
	Daniel Borkmann, Network Development, bpf, kernel-team,
	Linux API

On Tue, 27 Aug 2019 16:01:08 -0700
Andy Lutomirski <luto@kernel.org> wrote:

> [adding some security and tracing folks to cc]
> 
> On Tue, Aug 27, 2019 at 1:52 PM Alexei Starovoitov <ast@kernel.org> wrote:
> >
> > Introduce CAP_BPF that allows loading all types of BPF programs,
> > create most map types, load BTF, iterate programs and maps.
> > CAP_BPF alone is not enough to attach or run programs.
> >
> > Networking:
> >
> > CAP_BPF and CAP_NET_ADMIN are necessary to:
> > - attach to cgroup-bpf hooks like INET_INGRESS, INET_SOCK_CREATE, INET4_CONNECT
> > - run networking bpf programs (like xdp, skb, flow_dissector)
> >
> > Tracing:
> >
> > CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
> > are necessary to:
> > - attach bpf program to raw tracepoint
> > - use bpf_trace_printk() in all program types (not only tracing programs)
> > - create bpf stackmap
> >
> > To attach bpf to perf_events perf_event_open() needs to succeed as usual.
> >
> > CAP_BPF controls BPF side.
> > CAP_NET_ADMIN controls intersection where BPF calls into networking.
> > perf_paranoid_tracepoint_raw controls intersection where BPF calls into tracing.
> >
> > In the future CAP_TRACING could be introduced to control
> > creation of kprobe/uprobe and attaching bpf to perf_events.
> > In such case bpf_probe_read() thin wrapper would be controlled by CAP_BPF.
> > Whereas probe_read() would be controlled by CAP_TRACING.
> > CAP_TRACING would also control generic kprobe+probe_read.
> > CAP_BPF and CAP_TRACING would be necessary for tracing bpf programs
> > that want to use bpf_probe_read.

No mention of the tracefs (/sys/kernel/tracing) filesystem?
  
> 
> First, some high-level review:
> 
> Can you write up some clear documentation aimed at administrators that
> says what CAP_BPF does?  For example, is it expected that CAP_BPF by
> itself permits reading all kernel memory?  Why might one grant it?
> 
> Can you give at least one fully described use case where CAP_BPF
> solves a real-world problem that is not solved by existing mechanisms?

At least for CAP_TRACING (if it were to allow read/write access
to /sys/kernel/tracing), that would be very useful. It would be useful
to those that basically own their machines, and want to trace their
applications all the way into the kernel without having to run as full
root.


> 
> Changing the capability that some existing operation requires could
> break existing programs.  The old capability may need to be accepted
> as well.
> 
> I'm inclined to suggest that CAP_TRACING be figured out or rejected
> before something like this gets applied.
> 
> 
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> > I would prefer to introduce CAP_TRACING soon, since it
> > will make tracing and networking permission model symmetrical.
> >  
> 
> Here's my proposal for CAP_TRACING, documentation-style:
> 
> --- begin ---
> 
> CAP_TRACING enables a task to use various kernel features to trace
> running user programs and the kernel itself.  CAP_TRACING also enables
> a task to bypass some speculation attack countermeasures.  A task in
> the init user namespace with CAP_TRACING will be able to tell exactly
> what kernel code is executed and when, and will be able to read kernel
> registers and kernel memory.  It will, similarly, be able to read the
> state of other user tasks.
> 
> Specifically, CAP_TRACING allows the following operations.  It may
> allow more operations in the future:
> 
>  - Full use of perf_event_open(), similarly to the effect of
> kernel.perf_event_paranoid == -1.
> 
>  - Loading and attaching tracing BPF programs, including use of BPF
> raw tracepoints.
> 
>  - Use of BPF stack maps.
> 
>  - Use of bpf_probe_read() and bpf_trace_printk().
> 
>  - Use of unsafe pointer-to-integer conversions in BPF.
> 
>  - Bypassing of BPF's speculation attack hardening measures and
> constant blinding.  (Note: other mechanisms might also allow this.)
> 
> CAP_TRACING does not override normal permissions on sysfs or debugfs.
> This means that, unless a new interface for programming kprobes and
> such is added, it does not directly allow use of kprobes.

kprobes can be created in the tracefs filesystem (which is separate from
debugfs; tracefs just gets automatically mounted
in /sys/kernel/debug/tracing when debugfs is mounted) via the
kprobe_events file. /sys/kernel/tracing is just the tracefs
directory without debugfs, and was created specifically to allow
tracing to be accessed without opening up the can of worms in debugfs.

Should we allow CAP_TRACING access to /proc/kallsyms? as it is helpful
to convert perf and trace-cmd's function pointers into names. Once you
allow tracing of the kernel, hiding /proc/kallsyms is pretty useless.

-- Steve

> 
> If CAP_TRACING, by itself, enables a task to crash or otherwise
> corrupt the kernel or other tasks, this will be considered a kernel
> bug.
> 
> CAP_TRACING in a non-init user namespace may, in the future, allow
> tracing of other tasks in that user namespace or its descendants.  It
> will not enable kernel tracing or tracing of tasks outside the user
> namespace in question.
> 
> --- end ---
> 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-27 23:21   ` Steven Rostedt
@ 2019-08-27 23:34     ` Andy Lutomirski
  2019-08-28  0:44       ` Steven Rostedt
  2019-08-28  0:38     ` Alexei Starovoitov
  2019-08-28  3:30     ` Masami Hiramatsu
  2 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-27 23:34 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Tue, Aug 27, 2019 at 4:21 PM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Tue, 27 Aug 2019 16:01:08 -0700
> Andy Lutomirski <luto@kernel.org> wrote:
>
> > [adding some security and tracing folks to cc]
> >
> > On Tue, Aug 27, 2019 at 1:52 PM Alexei Starovoitov <ast@kernel.org> wrote:
> > >
> > > Introduce CAP_BPF that allows loading all types of BPF programs,
> > > create most map types, load BTF, iterate programs and maps.
> > > CAP_BPF alone is not enough to attach or run programs.
> > >
> > > Networking:
> > >
> > > CAP_BPF and CAP_NET_ADMIN are necessary to:
> > > - attach to cgroup-bpf hooks like INET_INGRESS, INET_SOCK_CREATE, INET4_CONNECT
> > > - run networking bpf programs (like xdp, skb, flow_dissector)
> > >
> > > Tracing:
> > >
> > > CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
> > > are necessary to:
> > > - attach bpf program to raw tracepoint
> > > - use bpf_trace_printk() in all program types (not only tracing programs)
> > > - create bpf stackmap
> > >
> > > To attach bpf to perf_events perf_event_open() needs to succeed as usual.
> > >
> > > CAP_BPF controls BPF side.
> > > CAP_NET_ADMIN controls intersection where BPF calls into networking.
> > > perf_paranoid_tracepoint_raw controls intersection where BPF calls into tracing.
> > >
> > > In the future CAP_TRACING could be introduced to control
> > > creation of kprobe/uprobe and attaching bpf to perf_events.
> > > In such case bpf_probe_read() thin wrapper would be controlled by CAP_BPF.
> > > Whereas probe_read() would be controlled by CAP_TRACING.
> > > CAP_TRACING would also control generic kprobe+probe_read.
> > > CAP_BPF and CAP_TRACING would be necessary for tracing bpf programs
> > > that want to use bpf_probe_read.
>
> No mention of the tracefs (/sys/kernel/tracing) file?

See below.  Also, I am embarrassed to admit that I just assumed that
/sys/kernel/debug/tracing was just like any other debugfs directory.

>
>
> >
> > Changing the capability that some existing operation requires could
> > break existing programs.  The old capability may need to be accepted
> > as well.
> >
> > I'm inclined to suggest that CAP_TRACING be figured out or rejected
> > before something like this gets applied.
> >
> >
> > >
> > > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > > ---
> > > I would prefer to introduce CAP_TRACING soon, since it
> > > will make tracing and networking permission model symmetrical.
> > >
> >
> > Here's my proposal for CAP_TRACING, documentation-style:
> >
> > --- begin ---
> >
> > CAP_TRACING enables a task to use various kernel features to trace
> > running user programs and the kernel itself.  CAP_TRACING also enables
> > a task to bypass some speculation attack countermeasures.  A task in
> > the init user namespace with CAP_TRACING will be able to tell exactly
> > what kernel code is executed and when, and will be able to read kernel
> > registers and kernel memory.  It will, similarly, be able to read the
> > state of other user tasks.
> >
> > Specifically, CAP_TRACING allows the following operations.  It may
> > allow more operations in the future:
> >
> >  - Full use of perf_event_open(), similarly to the effect of
> > kernel.perf_event_paranoid == -1.
> >
> >  - Loading and attaching tracing BPF programs, including use of BPF
> > raw tracepoints.
> >
> >  - Use of BPF stack maps.
> >
> >  - Use of bpf_probe_read() and bpf_trace_printk().
> >
> >  - Use of unsafe pointer-to-integer conversions in BPF.
> >
> >  - Bypassing of BPF's speculation attack hardening measures and
> > constant blinding.  (Note: other mechanisms might also allow this.)
> >
> > CAP_TRACING does not override normal permissions on sysfs or debugfs.
> > This means that, unless a new interface for programming kprobes and
> > such is added, it does not directly allow use of kprobes.
>
> kprobes can be created in the tracefs filesystem (which is separate from
> debugfs, tracefs just gets automatically mounted
> in /sys/kernel/debug/tracing when debugfs is mounted) from the
> kprobe_events file. /sys/kernel/tracing is just the tracefs
> directory without debugfs, and was created specifically to allow
> tracing to be accessed without opening up the can of worms in debugfs.

I think that, in principle, CAP_TRACING should allow this, but I'm not
sure how to achieve that.  I suppose we could set up
inode_operations.permission on tracefs, but what exactly would it do?
Would it be just like generic_permission() except that it would look
at CAP_TRACING instead of CAP_DAC_OVERRIDE?  That is, you can use
tracefs if you have CAP_TRACING *or* acl access?  Or would it be:

int tracing_permission(struct inode *inode, int mask)
{
  if (!capable(CAP_TRACING))
    return -EPERM;

  return generic_permission(inode, mask);
}

Which would mean that you need ACL *and* CAP_TRACING, so
administrators would change the mode to 777.  That's a bit scary.

And this still doesn't let people even *find* tracefs, since it's
hidden in debugfs.

So maybe make CAP_TRACING override ACLs but also add /sys/fs/tracing
and mount tracefs there, too, so that regular users can at least find
the mountpoint.

>
> Should we allow CAP_TRACING access to /proc/kallsyms? as it is helpful
> to convert perf and trace-cmd's function pointers into names. Once you
> allow tracing of the kernel, hiding /proc/kallsyms is pretty useless.

I think we should.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-27 23:01 ` Andy Lutomirski
  2019-08-27 23:21   ` Steven Rostedt
@ 2019-08-28  0:34   ` Alexei Starovoitov
  2019-08-28  0:55     ` Andy Lutomirski
  2019-08-28  7:14   ` Peter Zijlstra
  2 siblings, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-28  0:34 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, Steven Rostedt,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Tue, Aug 27, 2019 at 04:01:08PM -0700, Andy Lutomirski wrote:
> [adding some security and tracing folks to cc]
> 
> On Tue, Aug 27, 2019 at 1:52 PM Alexei Starovoitov <ast@kernel.org> wrote:
> >
> > Introduce CAP_BPF that allows loading all types of BPF programs,
> > create most map types, load BTF, iterate programs and maps.
> > CAP_BPF alone is not enough to attach or run programs.
> >
> > Networking:
> >
> > CAP_BPF and CAP_NET_ADMIN are necessary to:
> > - attach to cgroup-bpf hooks like INET_INGRESS, INET_SOCK_CREATE, INET4_CONNECT
> > - run networking bpf programs (like xdp, skb, flow_dissector)
> >
> > Tracing:
> >
> > CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
> > are necessary to:
> > - attach bpf program to raw tracepoint
> > - use bpf_trace_printk() in all program types (not only tracing programs)
> > - create bpf stackmap
> >
> > To attach bpf to perf_events perf_event_open() needs to succeed as usual.
> >
> > CAP_BPF controls BPF side.
> > CAP_NET_ADMIN controls intersection where BPF calls into networking.
> > perf_paranoid_tracepoint_raw controls intersection where BPF calls into tracing.
> >
> > In the future CAP_TRACING could be introduced to control
> > creation of kprobe/uprobe and attaching bpf to perf_events.
> > In such case bpf_probe_read() thin wrapper would be controlled by CAP_BPF.
> > Whereas probe_read() would be controlled by CAP_TRACING.
> > CAP_TRACING would also control generic kprobe+probe_read.
> > CAP_BPF and CAP_TRACING would be necessary for tracing bpf programs
> > that want to use bpf_probe_read.
> 
> First, some high-level review:
> 
> Can you write up some clear documentation aimed at administrators that
> says what CAP_BPF does?  For example, is it expected that CAP_BPF by
> itself permits reading all kernel memory?

hmm. the answer is in the sentence you quoted right above.

> Can you give at least one fully described use case where CAP_BPF
> solves a real-world problem that is not solved by existing mechanisms?

bpftrace binary would be installed with CAP_BPF and CAP_TRACING.
bcc tools would be installed with CAP_BPF and CAP_TRACING.
perf binary would be installed with CAP_TRACING only.
XDP networking daemon would be installed with CAP_BPF and CAP_NET_ADMIN.
None of them would need full root.

> Changing the capability that some existing operation requires could
> break existing programs.  The old capability may need to be accepted
> as well.

As far as I can see there is no ABI breakage. Please point out
which line of the patch may break it.

> I'm inclined to suggest that CAP_TRACING be figured out or rejected
> before something like this gets applied.

that's fair.

> 
> >
> > Signed-off-by: Alexei Starovoitov <ast@kernel.org>
> > ---
> > I would prefer to introduce CAP_TRACING soon, since it
> > will make tracing and networking permission model symmetrical.
> >
> 
> Here's my proposal for CAP_TRACING, documentation-style:
> 
> --- begin ---
> 
> CAP_TRACING enables a task to use various kernel features to trace
> running user programs and the kernel itself.  CAP_TRACING also enables
> a task to bypass some speculation attack countermeasures.  A task in
> the init user namespace with CAP_TRACING will be able to tell exactly
> what kernel code is executed and when, and will be able to read kernel
> registers and kernel memory.  It will, similarly, be able to read the
> state of other user tasks.
> 
> Specifically, CAP_TRACING allows the following operations.  It may
> allow more operations in the future:
> 
>  - Full use of perf_event_open(), similarly to the effect of
> kernel.perf_event_paranoid == -1.

+1

>  - Loading and attaching tracing BPF programs, including use of BPF
> raw tracepoints.

-1

>  - Use of BPF stack maps.

-1

>  - Use of bpf_probe_read() and bpf_trace_printk().

-1

>  - Use of unsafe pointer-to-integer conversions in BPF.

-1

>  - Bypassing of BPF's speculation attack hardening measures and
> constant blinding.  (Note: other mechanisms might also allow this.)

-1
All of the above are allowed by CAP_BPF.
They are not allowed by CAP_TRACING.

> CAP_TRACING does not override normal permissions on sysfs or debugfs.
> This means that, unless a new interface for programming kprobes and
> such is added, it does not directly allow use of kprobes.

kprobes can be created via perf_event_open already.
So above statement contradicts your first statement.

> If CAP_TRACING, by itself, enables a task to crash or otherwise
> corrupt the kernel or other tasks, this will be considered a kernel
> bug.

+1

> CAP_TRACING in a non-init user namespace may, in the future, allow
> tracing of other tasks in that user namespace or its descendants.  It
> will not enable kernel tracing or tracing of tasks outside the user
> namespace in question.

I would avoid describing user ns for now.
There is enough confusion without it.

> --- end ---
> 
> Does this sound good?  The idea here is that CAP_TRACING should be
> very useful even without CAP_BPF, which allows CAP_BPF to be less
> powerful.

As proposed CAP_BPF does not allow tracing or networking on its own.
CAP_BPF only controls BPF side.

For example:
BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
{
        int ret;

        ret = probe_kernel_read(dst, unsafe_ptr, size);
        if (unlikely(ret < 0))
                memset(dst, 0, size);

        return ret;
}

All of BPF (including prototype of bpf_probe_read) is controlled by CAP_BPF.
But the kernel primitive it's using (probe_kernel_read) is controlled by CAP_TRACING.
Hence a task needs _both_ CAP_BPF and CAP_TRACING to attach and run bpf program
that uses bpf_probe_read.

Similar with all other kernel code that BPF helpers may call directly or indirectly.
If there is a way for bpf program to call into piece of code controlled by CAP_TRACING
such helper would need CAP_BPF and CAP_TRACING.
If bpf helper calls into something that may mangle networking packet
such helper would need both CAP_BPF and CAP_NET_ADMIN to execute.

> > @@ -2080,7 +2083,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
> >         struct bpf_prog *prog;
> >         int ret = -ENOTSUPP;
> >
> > -       if (!capable(CAP_SYS_ADMIN))
> > +       if (!capable(CAP_NET_ADMIN) || !capable(CAP_BPF))
> > +               /* test_run callback is available for networking progs only.
> > +                * Add cap_bpf_tracing() above when tracing progs become runable.
> > +                */
> 
> I think test_run should probably be CAP_SYS_ADMIN forever.  test_run
> is the only way that one can run a bpf program and call helper
> functions via the program if one doesn't have permission to attach the
> program.  

Since CAP_BPF + CAP_NET_ADMIN allow attach. It means that a task
with these two permissions will have programs running anyway.
(traffic will flow through netdev, socket events will happen, etc)
Hence no reason to disallow running program via test_run.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-27 23:21   ` Steven Rostedt
  2019-08-27 23:34     ` Andy Lutomirski
@ 2019-08-28  0:38     ` Alexei Starovoitov
  2019-08-28  3:30     ` Masami Hiramatsu
  2 siblings, 0 replies; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-28  0:38 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Tue, Aug 27, 2019 at 07:21:44PM -0400, Steven Rostedt wrote:
> 
> At least for CAP_TRACING (if it were to allow read/write access
> to /sys/kernel/tracing), that would be very useful. It would be useful
> to those that basically own their machines, and want to trace their
> applications all the way into the kernel without having to run as full
> root.

+1

The proposal is to have CAP_TRACING to control perf and ftrace.
perf and trace-cmd binaries could be installed with CAP_TRACING and that's
all they need to do full tracing.

I can craft a patch for perf_event_open side and demo CAP_TRACING.
Once that cap bit is ready you can use it on ftrace side?

> Should we allow CAP_TRACING access to /proc/kallsyms? as it is helpful
> to convert perf and trace-cmd's function pointers into names. Once you
> allow tracing of the kernel, hiding /proc/kallsyms is pretty useless.

yep.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-27 23:34     ` Andy Lutomirski
@ 2019-08-28  0:44       ` Steven Rostedt
  2019-08-28  1:12         ` Andy Lutomirski
  0 siblings, 1 reply; 46+ messages in thread
From: Steven Rostedt @ 2019-08-28  0:44 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, David S. Miller,
	Daniel Borkmann, Network Development, bpf, kernel-team,
	Linux API

On Tue, 27 Aug 2019 16:34:47 -0700
Andy Lutomirski <luto@kernel.org> wrote:

> > > CAP_TRACING does not override normal permissions on sysfs or debugfs.
> > > This means that, unless a new interface for programming kprobes and
> > > such is added, it does not directly allow use of kprobes.  
> >
> > kprobes can be created in the tracefs filesystem (which is separate from
> > debugfs, tracefs just gets automatically mounted
> > in /sys/kernel/debug/tracing when debugfs is mounted) from the
> > kprobe_events file. /sys/kernel/tracing is just the tracefs
> > directory without debugfs, and was created specifically to allow
> > tracing to be accessed without opening up the can of worms in debugfs.  
> 
> I think that, in principle, CAP_TRACING should allow this, but I'm not
> sure how to achieve that.  I suppose we could set up
> inode_operations.permission on tracefs, but what exactly would it do?
> Would it be just like generic_permission() except that it would look
> at CAP_TRACING instead of CAP_DAC_OVERRIDE?  That is, you can use
> tracefs if you have CAP_TRACING *or* acl access?  Or would it be:
> 
> int tracing_permission(struct inode *inode, int mask)
> {
>   if (!capable(CAP_TRACING))
>     return -EPERM;
> 
>   return generic_permission(inode, mask);
> }

Perhaps we should make a group for it?

> 
> Which would mean that you need ACL *and* CAP_TRACING, so
> administrators would change the mode to 777.  That's a bit scary.
> 
> And this still doesn't let people even *find* tracefs, since it's
> hidden in debugfs.
> 
> So maybe make CAP_TRACING override ACLs but also add /sys/fs/tracing
> and mount tracefs there, too, so that regular users can at least find
> the mountpoint.

I think you missed what I said. It's not hidden in /sys/kernel/debug.
If you enable tracefs, you have /sys/kernel/tracing created, and it is
completely separate from debugfs. I only have it *also* automatically
mounted to /sys/kernel/debug/tracing for backward compatibility
reasons, as older versions of trace-cmd will only mount debugfs (as
root), and expect to find it there.

 mount -t tracefs nodev /sys/kernel/tracing

-- Steve

> 
> >
> > Should we allow CAP_TRACING access to /proc/kallsyms? as it is helpful
> > to convert perf and trace-cmd's function pointers into names. Once you
> > allow tracing of the kernel, hiding /proc/kallsyms is pretty useless.  
> 
> I think we should.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  0:34   ` Alexei Starovoitov
@ 2019-08-28  0:55     ` Andy Lutomirski
  2019-08-28  2:00       ` Andy Lutomirski
  2019-08-28  4:43       ` Alexei Starovoitov
  0 siblings, 2 replies; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-28  0:55 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	Steven Rostedt, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API

On Tue, Aug 27, 2019 at 5:34 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Aug 27, 2019 at 04:01:08PM -0700, Andy Lutomirski wrote:
> > [adding some security and tracing folks to cc]
> >
> > On Tue, Aug 27, 2019 at 1:52 PM Alexei Starovoitov <ast@kernel.org> wrote:
> > >
> > > Introduce CAP_BPF that allows loading all types of BPF programs,
> > > create most map types, load BTF, iterate programs and maps.
> > > CAP_BPF alone is not enough to attach or run programs.
> > >
> > > Networking:
> > >
> > > CAP_BPF and CAP_NET_ADMIN are necessary to:
> > > - attach to cgroup-bpf hooks like INET_INGRESS, INET_SOCK_CREATE, INET4_CONNECT
> > > - run networking bpf programs (like xdp, skb, flow_dissector)
> > >
> > > Tracing:
> > >
> > > CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
> > > are necessary to:
> > > - attach bpf program to raw tracepoint
> > > - use bpf_trace_printk() in all program types (not only tracing programs)
> > > - create bpf stackmap
> > >
> > > To attach bpf to perf_events perf_event_open() needs to succeed as usual.
> > >
> > > CAP_BPF controls BPF side.
> > > CAP_NET_ADMIN controls intersection where BPF calls into networking.
> > > perf_paranoid_tracepoint_raw controls intersection where BPF calls into tracing.
> > >
> > > In the future CAP_TRACING could be introduced to control
> > > creation of kprobe/uprobe and attaching bpf to perf_events.
> > > In such case bpf_probe_read() thin wrapper would be controlled by CAP_BPF.
> > > Whereas probe_read() would be controlled by CAP_TRACING.
> > > CAP_TRACING would also control generic kprobe+probe_read.
> > > CAP_BPF and CAP_TRACING would be necessary for tracing bpf programs
> > > that want to use bpf_probe_read.
> >
> > First, some high-level review:
> >
> > Can you write up some clear documentation aimed at administrators that
> > says what CAP_BPF does?  For example, is it expected that CAP_BPF by
> > itself permits reading all kernel memory?
>
> hmm. the answer is in the sentence you quoted right above.

I was hoping for something in Documentation/admin-guide, not in a
changelog that's hard to find.

>
> > Can you give at least one fully described use case where CAP_BPF
> > solves a real-world problem that is not solved by existing mechanisms?
>
> bpftrace binary would be installed with CAP_BPF and CAP_TRACING.
> bcc tools would be installed with CAP_BPF and CAP_TRACING.
> perf binary would be installed with CAP_TRACING only.
> XDP networking daemon would be installed with CAP_BPF and CAP_NET_ADMIN.
> None of them would need full root.

As in just setting these bits in fscaps?  What does this achieve
beyond just installing them setuid-root or with CAP_SYS_ADMIN and
judicious use of capset internally?  For that matter, what prevents
unauthorized users from tracing the system if you do this?  This just
lets anyone trace the system, which seems like a mistake.

Can you clarify your example or give another one?

>
> > Changing the capability that some existing operation requires could
> > break existing programs.  The old capability may need to be accepted
> > as well.
>
> As far as I can see there is no ABI breakage. Please point out
> which line of the patch may break it.

As a more or less arbitrary selection:

 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
        if (!bpf_prog_kallsyms_candidate(fp) ||
-           !capable(CAP_SYS_ADMIN))
+           !capable(CAP_BPF))
                return;

Before your patch, a task with CAP_SYS_ADMIN could do this.  Now it
can't.  Per the usual Linux definition of "ABI break", this is an ABI
break if and only if someone actually did this in a context where they
have CAP_SYS_ADMIN but not all capabilities.  How confident are you
that no one does things like this?
 void bpf_prog_kallsyms_add(struct bpf_prog *fp)
 {
        if (!bpf_prog_kallsyms_candidate(fp) ||
-           !capable(CAP_SYS_ADMIN))
+           !capable(CAP_BPF))
                return;

> > > ---
> > > I would prefer to introduce CAP_TRACING soon, since it
> > > will make tracing and networking permission model symmetrical.
> > >
> >
> > Here's my proposal for CAP_TRACING, documentation-style:
> >
> > --- begin ---
> >
> > CAP_TRACING enables a task to use various kernel features to trace
> > running user programs and the kernel itself.  CAP_TRACING also enables
> > a task to bypass some speculation attack countermeasures.  A task in
> > the init user namespace with CAP_TRACING will be able to tell exactly
> > what kernel code is executed and when, and will be able to read kernel
> > registers and kernel memory.  It will, similarly, be able to read the
> > state of other user tasks.
> >
> > Specifically, CAP_TRACING allows the following operations.  It may
> > allow more operations in the future:
> >
> >  - Full use of perf_event_open(), similarly to the effect of
> > kernel.perf_event_paranoid == -1.
>
> +1
>
> >  - Loading and attaching tracing BPF programs, including use of BPF
> > raw tracepoints.
>
> -1
>
> >  - Use of BPF stack maps.
>
> -1
>
> >  - Use of bpf_probe_read() and bpf_trace_printk().
>
> -1
>
> >  - Use of unsafe pointer-to-integer conversions in BPF.
>
> -1
>
> >  - Bypassing of BPF's speculation attack hardening measures and
> > constant blinding.  (Note: other mechanisms might also allow this.)
>
> -1
> All of the above are allowed by CAP_BPF.
> They are not allowed by CAP_TRACING.

Why?  I don't mean to discount your -1, and you may well have a
compelling reason.  If so, I'll change my proposal.

From the previous discussion, you want to make progress toward solving
a lot of problems with CAP_BPF.  One of them was making BPF
firewalling more generally useful. By making CAP_BPF grant the ability
to read kernel memory, you will make administrators much more nervous
to grant CAP_BPF.  Similarly, and correct me if I'm wrong, most of
these capabilities are primarily or only useful for tracing, so I
don't see why users without CAP_TRACING should get them.
bpf_trace_printk(), in particular, even has "trace" in its name :)

Also, if a task has CAP_TRACING, it's expected to be able to trace the
system -- that's the whole point.  Why shouldn't it be able to use BPF
to trace the system better?

>
> > CAP_TRACING does not override normal permissions on sysfs or debugfs.
> > This means that, unless a new interface for programming kprobes and
> > such is added, it does not directly allow use of kprobes.
>
> kprobes can be created via perf_event_open already.
> So above statement contradicts your first statement.

Hmm.  The way of using perf with kprobes that I'm familiar with is:

# perf probe --add func_name

And this uses "/sys/kernel/debug/tracing//kprobe_events" (with the
double slash!).  Is there indeed another way to do this?

Anyway, I didn't mean to exclude any existing perf_event_open()
mechanism -- what I meant was that, without some extension to my
proposal, /sys/kernel/debug/tracing wouldn't magically become
accessible due to CAP_TRACING.

>
> > If CAP_TRACING, by itself, enables a task to crash or otherwise
> > corrupt the kernel or other tasks, this will be considered a kernel
> > bug.
>
> +1
>
> > CAP_TRACING in a non-init user namespace may, in the future, allow
> > tracing of other tasks in that user namespace or its descendants.  It
> > will not enable kernel tracing or tracing of tasks outside the user
> > namespace in question.
>
> I would avoid describing user ns for now.
> There is enough confusion without it.
>
> > --- end ---
> >
> > Does this sound good?  The idea here is that CAP_TRACING should be
> > very useful even without CAP_BPF, which allows CAP_BPF to be less
> > powerful.
>
> As proposed CAP_BPF does not allow tracing or networking on its own.
> CAP_BPF only controls BPF side.
>
> For example:
> BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
> {
>         int ret;
>
>         ret = probe_kernel_read(dst, unsafe_ptr, size);
>         if (unlikely(ret < 0))
>                 memset(dst, 0, size);
>
>         return ret;
> }
>
> All of BPF (including prototype of bpf_probe_read) is controlled by CAP_BPF.
> But the kernel primitive it's using (probe_kernel_read) is controlled by CAP_TRACING.
> Hence a task needs _both_ CAP_BPF and CAP_TRACING to attach and run bpf program
> that uses bpf_probe_read.
>
> Similar with all other kernel code that BPF helpers may call directly or indirectly.
> If there is a way for bpf program to call into piece of code controlled by CAP_TRACING
> such helper would need CAP_BPF and CAP_TRACING.
> If bpf helper calls into something that may mangle networking packet
> such helper would need both CAP_BPF and CAP_NET_ADMIN to execute.

Why do you want to require CAP_BPF to call into functions like
bpf_probe_read()?  I understand why you want to limit access to bpf,
but I think that CAP_TRACING should be sufficient to allow the tracing
parts of BPF.  After all, a lot of your concerns, especially the ones
involving speculation, don't really apply to users with CAP_TRACING --
users with CAP_TRACING can read kernel memory with or without bpf.

>
> > > @@ -2080,7 +2083,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
> > >         struct bpf_prog *prog;
> > >         int ret = -ENOTSUPP;
> > >
> > > -       if (!capable(CAP_SYS_ADMIN))
> > > +       if (!capable(CAP_NET_ADMIN) || !capable(CAP_BPF))
> > > +               /* test_run callback is available for networking progs only.
> > > +                * Add cap_bpf_tracing() above when tracing progs become runable.
> > > +                */
> >
> > I think test_run should probably be CAP_SYS_ADMIN forever.  test_run
> > is the only way that one can run a bpf program and call helper
> > functions via the program if one doesn't have permission to attach the
> > program.
>
> Since CAP_BPF + CAP_NET_ADMIN allow attach. It means that a task
> with these two permissions will have programs running anyway.
> (traffic will flow through netdev, socket events will happen, etc)
> Hence no reason to disallow running program via test_run.
>

test_run allows fully controlled inputs, in a context where a program
can trivially flush caches, mistrain branch predictors, etc first.  It
seems to me that, if a JITted bpf program contains an exploitable
speculation gadget (MDS, Spectre v1, RSB, or anything else), it will
be *much* easier to exploit it using test_run than using normal
network traffic.  Similarly, normal network traffic will have network
headers that are valid enough to have caused the BPF program to be
invoked in the first place.  test_run can inject arbitrary garbage.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  0:44       ` Steven Rostedt
@ 2019-08-28  1:12         ` Andy Lutomirski
  2019-08-28  2:22           ` Steven Rostedt
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-28  1:12 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API



> On Aug 27, 2019, at 5:44 PM, Steven Rostedt <rostedt@goodmis.org> wrote:
> 
> On Tue, 27 Aug 2019 16:34:47 -0700
> Andy Lutomirski <luto@kernel.org> wrote:
> 
>>>> CAP_TRACING does not override normal permissions on sysfs or debugfs.
>>>> This means that, unless a new interface for programming kprobes and
>>>> such is added, it does not directly allow use of kprobes.  
>>> 
>>> kprobes can be created in the tracefs filesystem (which is separate from
>>> debugfs, tracefs just gets automatically mounted
>>> in /sys/kernel/debug/tracing when debugfs is mounted) from the
>>> kprobe_events file. /sys/kernel/tracing is just the tracefs
>>> directory without debugfs, and was created specifically to allow
>>> tracing to be accessed without opening up the can of worms in debugfs.  
>> 
>> I think that, in principle, CAP_TRACING should allow this, but I'm not
>> sure how to achieve that.  I suppose we could set up
>> inode_operations.permission on tracefs, but what exactly would it do?
>> Would it be just like generic_permission() except that it would look
>> at CAP_TRACING instead of CAP_DAC_OVERRIDE?  That is, you can use
>> tracefs if you have CAP_TRACING *or* acl access?  Or would it be:
>> 
>> int tracing_permission(struct inode *inode, int mask)
>> {
>>  if (!capable(CAP_TRACING))
>>    return -EPERM;
>> 
>>  return generic_permission(inode, mask);
>> }
> 
> Perhaps we should make a group for it?
> 

Hmm. That means that you’d need CAP_TRACING and a group. That’s probably not terrible, but it could be annoying.

>> 
>> Which would mean that you need ACL *and* CAP_TRACING, so
>> administrators would change the mode to 777.  That's a bit scary.
>> 
>> And this still doesn't let people even *find* tracefs, since it's
>> hidden in debugfs.
>> 
>> So maybe make CAP_TRACING override ACLs but also add /sys/fs/tracing
>> and mount tracefs there, too, so that regular users can at least find
>> the mountpoint.
> 
> I think you missed what I said. It's not hidden in /sys/kernel/debug.
> If you enable tracefs, you have /sys/kernel/tracing created, and it is
> completely separate from debugfs. I only have it *also* automatically
> mounted to /sys/kernel/debug/tracing for backward compatibility
> reasons, as older versions of trace-cmd will only mount debugfs (as
> root), and expect to find it there.
> 
> mount -t tracefs nodev /sys/kernel/tracing

Too many slashes :/

A group could work for v1.  Maybe all the tools should get updated to use this path?

> 
> -- Steve
> 
>> 
>>> 
>>> Should we allow CAP_TRACING access to /proc/kallsyms? as it is helpful
>>> to convert perf and trace-cmd's function pointers into names. Once you
>>> allow tracing of the kernel, hiding /proc/kallsyms is pretty useless.  
>> 
>> I think we should.
> 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  0:55     ` Andy Lutomirski
@ 2019-08-28  2:00       ` Andy Lutomirski
  2019-08-28  4:49         ` Alexei Starovoitov
  2019-08-28  4:43       ` Alexei Starovoitov
  1 sibling, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-28  2:00 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	Steven Rostedt, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API

> On Aug 27, 2019, at 5:55 PM, Andy Lutomirski <luto@kernel.org> wrote:
>
> On Tue, Aug 27, 2019 at 5:34 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
>>

> From the previous discussion, you want to make progress toward solving
> a lot of problems with CAP_BPF.  One of them was making BPF
> firewalling more generally useful. By making CAP_BPF grant the ability
> to read kernel memory, you will make administrators much more nervous
> to grant CAP_BPF.  Similarly, and correct me if I'm wrong, most of
> these capabilities are primarily or only useful for tracing, so I
> don't see why users without CAP_TRACING should get them.
> bpf_trace_printk(), in particular, even has "trace" in its name :)
>
> Also, if a task has CAP_TRACING, it's expected to be able to trace the
> system -- that's the whole point.  Why shouldn't it be able to use BPF
> to trace the system better?

Let me put this a bit differently. Part of the point is that
CAP_TRACING should allow a user or program to trace without being able
to corrupt the system. CAP_BPF as you’ve proposed it *can* likely
crash the system.  For example, CAP_BPF allows bpf_map_get_fd_by_id()
in your patch.  If the system uses a BPF firewall that stores some of
its rules in maps, then bpf_map_get_fd_by_id() can be used to get a
writable fd to the map, which can be used to change the map, thus
preventing network access.  This means that no combination of
CAP_TRACING and CAP_BPF ends up allowing tracing without granting the
ability to reconfigure or otherwise corrupt the system.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  1:12         ` Andy Lutomirski
@ 2019-08-28  2:22           ` Steven Rostedt
  0 siblings, 0 replies; 46+ messages in thread
From: Steven Rostedt @ 2019-08-28  2:22 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Tue, 27 Aug 2019 18:12:59 -0700
Andy Lutomirski <luto@amacapital.net> wrote:

> Too many slashes :/
> 
> A group could work for v1.  Maybe all the tools should get updated to use this path?

trace-cmd now does. In fact, if run as root, it will first check if
tracefs is mounted, and if not, it will try to mount it at this
location.

-- Steve

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-27 23:21   ` Steven Rostedt
  2019-08-27 23:34     ` Andy Lutomirski
  2019-08-28  0:38     ` Alexei Starovoitov
@ 2019-08-28  3:30     ` Masami Hiramatsu
  2019-08-28  4:47       ` Alexei Starovoitov
  2 siblings, 1 reply; 46+ messages in thread
From: Masami Hiramatsu @ 2019-08-28  3:30 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Tue, 27 Aug 2019 19:21:44 -0400
Steven Rostedt <rostedt@goodmis.org> wrote:


> > Here's my proposal for CAP_TRACING, documentation-style:
> > 
> > --- begin ---
> > 
> > CAP_TRACING enables a task to use various kernel features to trace
> > running user programs and the kernel itself.  CAP_TRACING also enables
> > a task to bypass some speculation attack countermeasures.  A task in
> > the init user namespace with CAP_TRACING will be able to tell exactly
> > what kernel code is executed and when, and will be able to read kernel
> > registers and kernel memory.  It will, similarly, be able to read the
> > state of other user tasks.
> > 
> > Specifically, CAP_TRACING allows the following operations.  It may
> > allow more operations in the future:
> > 
> >  - Full use of perf_event_open(), similarly to the effect of
> > kernel.perf_event_paranoid == -1.
> > 
> >  - Loading and attaching tracing BPF programs, including use of BPF
> > raw tracepoints.
> > 
> >  - Use of BPF stack maps.
> > 
> >  - Use of bpf_probe_read() and bpf_trace_printk().
> > 
> >  - Use of unsafe pointer-to-integer conversions in BPF.
> > 
> >  - Bypassing of BPF's speculation attack hardening measures and
> > constant blinding.  (Note: other mechanisms might also allow this.)
> > 
> > CAP_TRACING does not override normal permissions on sysfs or debugfs.
> > This means that, unless a new interface for programming kprobes and
> > such is added, it does not directly allow use of kprobes.
> 
> kprobes can be created in the tracefs filesystem (which is separate from
> debugfs, tracefs just gets automatically mounted
> in /sys/kernel/debug/tracing when debugfs is mounted) from the
> kprobe_events file. /sys/kernel/tracing is just the tracefs
> directory without debugfs, and was created specifically to allow
tracing to be accessed without opening up the can of worms in debugfs.

I like the CAP_TRACING for tracefs. Can we make the tracefs itself
check the CAP_TRACING and call file_ops? or each tracefs file-ops
handlers must check it?

> Should we allow CAP_TRACING access to /proc/kallsyms? as it is helpful
> to convert perf and trace-cmd's function pointers into names. Once you
> allow tracing of the kernel, hiding /proc/kallsyms is pretty useless.

Also, there is a blacklist of kprobes under debugfs. If CAP_TRACING
introduced and it allows to access kallsyms, I would like to move the
blacklist under tracefs, or make an alias of blacklist entry on tracefs.

Thank you,

-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  0:55     ` Andy Lutomirski
  2019-08-28  2:00       ` Andy Lutomirski
@ 2019-08-28  4:43       ` Alexei Starovoitov
  2019-08-28  6:12         ` Andy Lutomirski
  1 sibling, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-28  4:43 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, Steven Rostedt,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Tue, Aug 27, 2019 at 05:55:41PM -0700, Andy Lutomirski wrote:
> 
> I was hoping for something in Documentation/admin-guide, not in a
> changelog that's hard to find.

eventually yes.

> >
> > > Changing the capability that some existing operation requires could
> > > break existing programs.  The old capability may need to be accepted
> > > as well.
> >
> > As far as I can see there is no ABI breakage. Please point out
> > which line of the patch may break it.
> 
> As a more or less arbitrary selection:
> 
>  void bpf_prog_kallsyms_add(struct bpf_prog *fp)
>  {
>         if (!bpf_prog_kallsyms_candidate(fp) ||
> -           !capable(CAP_SYS_ADMIN))
> +           !capable(CAP_BPF))
>                 return;
> 
> Before your patch, a task with CAP_SYS_ADMIN could do this.  Now it
> can't.  Per the usual Linux definition of "ABI break", this is an ABI
> break if and only if someone actually did this in a context where they
> have CAP_SYS_ADMIN but not all capabilities.  How confident are you
> that no one does things like this?
>  void bpf_prog_kallsyms_add(struct bpf_prog *fp)
>  {
>         if (!bpf_prog_kallsyms_candidate(fp) ||
> -           !capable(CAP_SYS_ADMIN))
> +           !capable(CAP_BPF))
>                 return;

Yes. I'm confident that apps don't drop everything and
leave cap_sys_admin only before doing bpf() syscall, since it would
break their own use of networking.
Hence I'm not going to do the cap_syslog-like "deprecated" message mess
because of this unfounded concern.
If I turn out to be wrong we will add this "deprecated mess" later.

> 
> From the previous discussion, you want to make progress toward solving
> a lot of problems with CAP_BPF.  One of them was making BPF
> firewalling more generally useful. By making CAP_BPF grant the ability
> to read kernel memory, you will make administrators much more nervous
> to grant CAP_BPF. 

Andy, were your email hacked?
I explained several times that in this proposal 
CAP_BPF _and_ CAP_TRACING _both_ are necessary to read kernel memory.
CAP_BPF alone is _not enough_.

> Similarly, and correct me if I'm wrong, most of
> these capabilities are primarily or only useful for tracing, so I
> don't see why users without CAP_TRACING should get them.
> bpf_trace_printk(), in particular, even has "trace" in its name :)
> 
> Also, if a task has CAP_TRACING, it's expected to be able to trace the
> system -- that's the whole point.  Why shouldn't it be able to use BPF
> to trace the system better?

CAP_TRACING shouldn't be able to do BPF because BPF is not tracing only.

> > For example:
> > BPF_CALL_3(bpf_probe_read, void *, dst, u32, size, const void *, unsafe_ptr)
> > {
> >         int ret;
> >
> >         ret = probe_kernel_read(dst, unsafe_ptr, size);
> >         if (unlikely(ret < 0))
> >                 memset(dst, 0, size);
> >
> >         return ret;
> > }
> >
> > All of BPF (including prototype of bpf_probe_read) is controlled by CAP_BPF.
> > But the kernel primitives its using (probe_kernel_read) is controlled by CAP_TRACING.
> > Hence a task needs _both_ CAP_BPF and CAP_TRACING to attach and run bpf program
> > that uses bpf_probe_read.
> >
> > Similar with all other kernel code that BPF helpers may call directly or indirectly.
> > If there is a way for bpf program to call into piece of code controlled by CAP_TRACING
> > such helper would need CAP_BPF and CAP_TRACING.
> > If bpf helper calls into something that may mangle networking packet
> > such helper would need both CAP_BPF and CAP_NET_ADMIN to execute.
> 
> Why do you want to require CAP_BPF to call into functions like
> bpf_probe_read()?  I understand why you want to limit access to bpf,
> but I think that CAP_TRACING should be sufficient to allow the tracing
> parts of BPF.  After all, a lot of your concerns, especially the ones
> involving speculation, don't really apply to users with CAP_TRACING --
> users with CAP_TRACING can read kernel memory with or without bpf.

Let me try again to explain the concept...

Imagine AUDI logo with 4 circles.
They partially intersect.
The first circle is CAP_TRACING. Second is CAP_BPF. Third is CAP_NET_ADMIN.
Fourth - up to your imagination :)

These capabilities subdivide different parts of root privileges.
CAP_NET_ADMIN is useful on its own.
Just as CAP_TRACING that is useful for perf, ftrace, and probably
other tracing things that don't need bpf.

'bpftrace' is using a lot of tracing and a lot of bpf features,
but not all of bpf and not all tracing.
It falls into intersection of CAP_BPF and CAP_TRACING.

probe_kernel_read is a tracing mechanism.
perf can use it without bpf.
Hence it should be controlled by CAP_TRACING.

bpf_probe_read is a wrapper of that mechanism.
It's a place where BPF and TRACING circles intersect.
A task needs to have both CAP_BPF (to load the program)
and CAP_TRACING (to read kernel memory) to execute bpf_probe_read() helper.

> > > > @@ -2080,7 +2083,10 @@ static int bpf_prog_test_run(const union bpf_attr *attr,
> > > >         struct bpf_prog *prog;
> > > >         int ret = -ENOTSUPP;
> > > >
> > > > -       if (!capable(CAP_SYS_ADMIN))
> > > > +       if (!capable(CAP_NET_ADMIN) || !capable(CAP_BPF))
> > > > +               /* test_run callback is available for networking progs only.
> > > > +                * Add cap_bpf_tracing() above when tracing progs become runable.
> > > > +                */
> > >
> > > I think test_run should probably be CAP_SYS_ADMIN forever.  test_run
> > > is the only way that one can run a bpf program and call helper
> > > functions via the program if one doesn't have permission to attach the
> > > program.
> >
> > Since CAP_BPF + CAP_NET_ADMIN allow attach. It means that a task
> > with these two permissions will have programs running anyway.
> > (traffic will flow through netdev, socket events will happen, etc)
> > Hence no reason to disallow running program via test_run.
> >
> 
> test_run allows fully controlled inputs, in a context where a program
> can trivially flush caches, mistrain branch predictors, etc first.  It
> seems to me that, if a JITted bpf program contains an exploitable
> speculation gadget (MDS, Spectre v1, RSB, or anything else), 

speaking of MDS... I already asked you to help investigate its
applicability with existing bpf exposure. Are you going to do that?

> it will
> be *much* easier to exploit it using test_run than using normal
> network traffic.  Similarly, normal network traffic will have network
> headers that are valid enough to have caused the BPF program to be
> invoked in the first place.  test_run can inject arbitrary garbage.

Please take a look at Jann's var1 exploit. Was it hard to run bpf prog
in controlled environment without test_run command ?


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  3:30     ` Masami Hiramatsu
@ 2019-08-28  4:47       ` Alexei Starovoitov
  0 siblings, 0 replies; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-28  4:47 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Steven Rostedt, Andy Lutomirski, Alexei Starovoitov, Kees Cook,
	LSM List, James Morris, Jann Horn, Peter Zijlstra,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Wed, Aug 28, 2019 at 12:30:41PM +0900, Masami Hiramatsu wrote:
> > kprobes can be created in the tracefs filesystem (which is separate from
> > debugfs, tracefs just gets automatically mounted
> > in /sys/kernel/debug/tracing when debugfs is mounted) from the
> > kprobe_events file. /sys/kernel/tracing is just the tracefs
> > directory without debugfs, and was created specifically to allow
> tracing to be accessed without opening up the can of worms in debugfs.
> 
> I like the CAP_TRACING for tracefs. Can we make the tracefs itself
> check the CAP_TRACING and call file_ops? or each tracefs file-ops
> handlers must check it?

Thanks for the feedback.
I'll hack a prototype of CAP_TRACING for perf bits that I understand
and you folks will be able to use it in ftrace when initial support lands.
imo the question above is an implementation detail that you can resolve later.
I see it as a followup to initial CAP_TRACING drop.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  2:00       ` Andy Lutomirski
@ 2019-08-28  4:49         ` Alexei Starovoitov
  2019-08-28  6:20           ` Andy Lutomirski
  0 siblings, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-28  4:49 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, Steven Rostedt,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Tue, Aug 27, 2019 at 07:00:40PM -0700, Andy Lutomirski wrote:
> 
> Let me put this a bit differently. Part of the point is that
> CAP_TRACING should allow a user or program to trace without being able
> to corrupt the system. CAP_BPF as you’ve proposed it *can* likely
> crash the system.

Really? I'm still waiting for your example where bpf+kprobe crashes the system...


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  4:43       ` Alexei Starovoitov
@ 2019-08-28  6:12         ` Andy Lutomirski
  2019-08-28 22:55           ` Alexei Starovoitov
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-28  6:12 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	Steven Rostedt, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API

On Tue, Aug 27, 2019 at 9:43 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Aug 27, 2019 at 05:55:41PM -0700, Andy Lutomirski wrote:
> >
> > I was hoping for something in Documentation/admin-guide, not in a
> > changelog that's hard to find.
>
> eventually yes.
>
> > >
> > > > Changing the capability that some existing operation requires could
> > > > break existing programs.  The old capability may need to be accepted
> > > > as well.
> > >
> > > As far as I can see there is no ABI breakage. Please point out
> > > which line of the patch may break it.
> >
> > As a more or less arbitrary selection:
> >
> >  void bpf_prog_kallsyms_add(struct bpf_prog *fp)
> >  {
> >         if (!bpf_prog_kallsyms_candidate(fp) ||
> > -           !capable(CAP_SYS_ADMIN))
> > +           !capable(CAP_BPF))
> >                 return;
> >
> > Before your patch, a task with CAP_SYS_ADMIN could do this.  Now it
> > can't.  Per the usual Linux definition of "ABI break", this is an ABI
> > break if and only if someone actually did this in a context where they
> > have CAP_SYS_ADMIN but not all capabilities.  How confident are you
> > that no one does things like this?
> >  void bpf_prog_kallsyms_add(struct bpf_prog *fp)
> >  {
> >         if (!bpf_prog_kallsyms_candidate(fp) ||
> > -           !capable(CAP_SYS_ADMIN))
> > +           !capable(CAP_BPF))
> >                 return;
>
> Yes. I'm confident that apps don't drop everything and
> leave cap_sys_admin only before doing bpf() syscall, since it would
> break their own use of networking.
> Hence I'm not going to do the cap_syslog-like "deprecated" message mess
> because of this unfounded concern.
> If I turn out to be wrong we will add this "deprecated mess" later.
>
> >
> > From the previous discussion, you want to make progress toward solving
> > a lot of problems with CAP_BPF.  One of them was making BPF
> > firewalling more generally useful. By making CAP_BPF grant the ability
> > to read kernel memory, you will make administrators much more nervous
> > to grant CAP_BPF.
>
> Andy, were your email hacked?
> I explained several times that in this proposal
> CAP_BPF _and_ CAP_TRACING _both_ are necessary to read kernel memory.
> CAP_BPF alone is _not enough_.

You have indeed said this many times.  You've stated it as a matter of
fact as though it cannot possibly be discussed.  I'm asking you to
justify it.

> > Similarly, and correct me if I'm wrong, most of
> > these capabilities are primarily or only useful for tracing, so I
> > don't see why users without CAP_TRACING should get them.
> > bpf_trace_printk(), in particular, even has "trace" in its name :)
> >
> > Also, if a task has CAP_TRACING, it's expected to be able to trace the
> > system -- that's the whole point.  Why shouldn't it be able to use BPF
> > to trace the system better?
>
> CAP_TRACING shouldn't be able to do BPF because BPF is not tracing only.

What does "do BPF" even mean?  seccomp() does BPF.  SO_ATTACH_FILTER
does BPF.  Saying that using BPF should require a specific capability
seems kind of like saying that using the network should require a
specific capability.  Linux (and Unixy systems in general) distinguish
between binding low-number ports, binding high-number ports, using raw
sockets, and changing the system's IP address.  These have different
implications and require different capabilities.

It seems like you are specifically trying to add a new switch to turn
as much of BPF as possible on and off.  Why?

> >
> > test_run allows fully controlled inputs, in a context where a program
> > can trivially flush caches, mistrain branch predictors, etc first.  It
> > seems to me that, if a JITted bpf program contains an exploitable
> > speculation gadget (MDS, Spectre v1, RSB, or anything else),
>
> speaking of MDS... I already asked you to help investigate its
> applicability with existing bpf exposure. Are you going to do that?

I am blissfully uninvolved in MDS, and I don't know all that much more
about the overall mechanism than a random reader of tech news :)  ISTM
there are two meaningful ways that BPF could be involved: a BPF
program could leak info into the state exposed by MDS, or a BPF
program could try to read that state.  From what little I understand,
it's essentially inevitable that BPF leaks information into MDS state,
and this is probably even controllable by an attacker that understands
MDS in enough detail.    So the interesting questions are: can BPF be
used to read MDS state and can BPF be used to leak information in a
more useful way than the rest of the kernel to an attacker.

Keeping in mind that the kernel will flush MDS state on every exit to
usermode, I think the most likely attack is to try to read MDS state
with BPF.  This could happen, I suppose -- BPF programs can easily
contain the usual speculation gadgets of "do something and read an
address that depends on the outcome".  Fortunately, outside of
bpf_probe_read(), AFAIK BPF programs can't directly touch user memory,
and an attacker that is allowed to use bpf_probe_read() doesn't need
MDS to read things.

So it's not entirely obvious to me how an attack would be mounted.
test_run would make it a lot easier, I think.

>
> > it will
> > be *much* easier to exploit it using test_run than using normal
> > network traffic.  Similarly, normal network traffic will have network
> > headers that are valid enough to have caused the BPF program to be
> > invoked in the first place.  test_run can inject arbitrary garbage.
>
> Please take a look at Jann's var1 exploit. Was it hard to run bpf prog
> in controlled environment without test_run command ?
>

Can you send me a link?

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  4:49         ` Alexei Starovoitov
@ 2019-08-28  6:20           ` Andy Lutomirski
  2019-08-28 23:38             ` Alexei Starovoitov
  0 siblings, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-28  6:20 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	Steven Rostedt, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API

On Tue, Aug 27, 2019 at 9:49 PM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Tue, Aug 27, 2019 at 07:00:40PM -0700, Andy Lutomirski wrote:
> >
> > Let me put this a bit differently. Part of the point is that
> > CAP_TRACING should allow a user or program to trace without being able
> > to corrupt the system. CAP_BPF as you’ve proposed it *can* likely
> > crash the system.
>
> Really? I'm still waiting for your example where bpf+kprobe crashes the system...
>

That's not what I meant.  bpf+kprobe causing a crash is a bug.  I'm
referring to a totally different issue.  On my laptop:

$ sudo bpftool map
48: hash  name foobar  flags 0x0
    key 8B  value 8B  max_entries 64  memlock 8192B
181: lpm_trie  flags 0x1
    key 8B  value 8B  max_entries 1  memlock 4096B
182: lpm_trie  flags 0x1
    key 20B  value 8B  max_entries 1  memlock 4096B
183: lpm_trie  flags 0x1
    key 8B  value 8B  max_entries 1  memlock 4096B
184: lpm_trie  flags 0x1
    key 20B  value 8B  max_entries 1  memlock 4096B
185: lpm_trie  flags 0x1
    key 8B  value 8B  max_entries 1  memlock 4096B
186: lpm_trie  flags 0x1
    key 20B  value 8B  max_entries 1  memlock 4096B
187: lpm_trie  flags 0x1
    key 8B  value 8B  max_entries 1  memlock 4096B
188: lpm_trie  flags 0x1
    key 20B  value 8B  max_entries 1  memlock 4096B

$ sudo bpftool map dump id 186
key:
00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00
00 00 00 00
value:
02 00 00 00 00 00 00 00
Found 1 element

$ sudo bpftool map delete id 186 key hex 00 00 00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00 00 00
[this worked]

I don't know what my laptop was doing with map id 186 in particular,
but, whatever it was, I definitely broke it.  If a BPF firewall is in
use on something important enough, this could easily remove
connectivity from part or all of the system.  Right now, this needs
CAP_SYS_ADMIN.  With your patch, CAP_BPF is sufficient to do this, but
you *also* need CAP_BPF to trace the system using BPF.  Tracing with
BPF is 'safe' in the absence of bugs.  Modifying other peoples' maps
is not.

One possible answer to this would be to limit CAP_BPF to the subset of
BPF that is totaly safe in the absence of bugs (e.g. loading most
program types if they don't have dangerous BPF_CALL instructions but
not *_BY_ID).  Another answer would be to say that CAP_BPF will not be
needed by future unprivileged bpf mechanisms, and that CAP_TRACING
plus unprivileged bpf will be enough to do most or all interesting BPF
tracing operations.

If the answer is the latter, then maybe it would make sense to try to
implement some of the unprivileged bpf stuff and then to see whether
CAP_BPF is still needed.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-27 23:01 ` Andy Lutomirski
  2019-08-27 23:21   ` Steven Rostedt
  2019-08-28  0:34   ` Alexei Starovoitov
@ 2019-08-28  7:14   ` Peter Zijlstra
  2019-08-28 22:08     ` Alexei Starovoitov
  2 siblings, 1 reply; 46+ messages in thread
From: Peter Zijlstra @ 2019-08-28  7:14 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Masami Hiramatsu, Steven Rostedt, David S. Miller,
	Daniel Borkmann, Network Development, bpf, kernel-team,
	Linux API

On Tue, Aug 27, 2019 at 04:01:08PM -0700, Andy Lutomirski wrote:

> > Tracing:
> >
> > CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
> > are necessary to:

That's not tracing, that's perf.

> > +bool cap_bpf_tracing(void)
> > +{
> > +       return capable(CAP_SYS_ADMIN) ||
> > +              (capable(CAP_BPF) && !perf_paranoid_tracepoint_raw());
> > +}

A whole long time ago, I proposed we introduce CAP_PERF or something
along those lines; as a replacement for that horrible crap Android and
Debian ship. But nobody was ever interested enough.

The nice thing about that is that you can then disallow perf/tracing in
general, but tag the perf executable (and similar tools) with the
capability so that unpriv users can still use it, but only limited
through the tool, not the syscalls directly.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-27 20:52 [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF Alexei Starovoitov
  2019-08-27 23:01 ` Andy Lutomirski
@ 2019-08-28 10:38 ` kbuild test robot
  1 sibling, 0 replies; 46+ messages in thread
From: kbuild test robot @ 2019-08-28 10:38 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: kbuild-all, luto, davem, daniel, netdev, bpf, kernel-team, linux-api

[-- Attachment #1: Type: text/plain, Size: 1553 bytes --]

Hi Alexei,

I love your patch! Yet something to improve:

[auto build test ERROR on bpf-next/master]

url:    https://github.com/0day-ci/linux/commits/Alexei-Starovoitov/bpf-capabilities-introduce-CAP_BPF/20190828-142441
base:   https://kernel.googlesource.com/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: ia64-defconfig (attached as .config)
compiler: ia64-linux-gcc (GCC) 7.4.0
reproduce:
        wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
        chmod +x ~/bin/make.cross
        # save the attached .config to linux build tree
        GCC_VERSION=7.4.0 make.cross ARCH=ia64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot <lkp@intel.com>

All errors (new ones prefixed by >>):

   kernel/bpf/core.c: In function 'cap_bpf_tracing':
>> kernel/bpf/core.c:2110:31: error: implicit declaration of function 'perf_paranoid_tracepoint_raw' [-Werror=implicit-function-declaration]
            (capable(CAP_BPF) && !perf_paranoid_tracepoint_raw());
                                  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors

vim +/perf_paranoid_tracepoint_raw +2110 kernel/bpf/core.c

  2106	
  2107	bool cap_bpf_tracing(void)
  2108	{
  2109		return capable(CAP_SYS_ADMIN) ||
> 2110		       (capable(CAP_BPF) && !perf_paranoid_tracepoint_raw());
  2111	}
  2112	

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 19214 bytes --]

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  7:14   ` Peter Zijlstra
@ 2019-08-28 22:08     ` Alexei Starovoitov
  2019-08-29 13:34       ` Steven Rostedt
  0 siblings, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-28 22:08 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Masami Hiramatsu, Steven Rostedt,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Wed, Aug 28, 2019 at 09:14:21AM +0200, Peter Zijlstra wrote:
> On Tue, Aug 27, 2019 at 04:01:08PM -0700, Andy Lutomirski wrote:
> 
> > > Tracing:
> > >
> > > CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
> > > are necessary to:
> 
> That's not tracing, that's perf.
> 
> > > +bool cap_bpf_tracing(void)
> > > +{
> > > +       return capable(CAP_SYS_ADMIN) ||
> > > +              (capable(CAP_BPF) && !perf_paranoid_tracepoint_raw());
> > > +}
> 
> A whole long time ago, I proposed we introduce CAP_PERF or something
> along those lines; as a replacement for that horrible crap Android and
> Debian ship. But nobody was ever interested enough.
> 
> The nice thing about that is that you can then disallow perf/tracing in
> general, but tag the perf executable (and similar tools) with the
> capability so that unpriv users can still use it, but only limited
> through the tool, not the syscalls directly.

Exactly.
Similar motivation for CAP_BPF as well.

re: your first comment above.
I'm not sure what difference you see in words 'tracing' and 'perf'.
I really hope we don't partition the overall tracing category
into CAP_PERF and CAP_FTRACE only because these pieces are maintained
by different people.
On one side perf_event_open() isn't really doing tracing (as step by
step ftracing of function sequences), but perf_event_open() opens
an event and the sequence of events (may include IP) becomes a trace.
imo CAP_TRACING is the best name to describe the privileged space
of operations possible via perf_event_open, ftrace, kprobe, stack traces, etc.

Another reason is kuprobes. They can be created via perf_event_open
and via tracefs. Are they in CAP_PERF or in CAP_FTRACE ? In both, right?
Should then CAP_KPROBE be used ? that would be an overkill.
It would partition the space even further without obvious need.

Looking from BPF angle... BPF doesn't have integration with ftrace yet.
bpf_trace_printk is using ftrace mechanism, but that's 1% of ftrace.
In the long run I really like to see bpf using all of ftrace.
Whereas bpf is using a lot of 'perf'.
And extending some perf things in bpf specific way.
Take a look at how BPF_F_STACK_BUILD_ID. It's clearly perf/stack_tracing
feature that generic perf can use one day.
Currently it sits in bpf land and accessible via bpf only.
Though its bpf only today I categorize it under CAP_TRACING.

I think CAP_TRACING privilege should allow task to do all of perf_event_open,
kuprobe, stack trace, ftrace, and kallsyms.
We can think of some exceptions that should stay under CAP_SYS_ADMIN,
but most of the functionality available by 'perf' binary should be
usable with CAP_TRACING. 'perf' can do bpf too.
With CAP_BPF it would be all set.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  6:12         ` Andy Lutomirski
@ 2019-08-28 22:55           ` Alexei Starovoitov
  2019-08-29  0:45             ` Andy Lutomirski
  0 siblings, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-28 22:55 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, Steven Rostedt,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Tue, Aug 27, 2019 at 11:12:29PM -0700, Andy Lutomirski wrote:
> >
> > >
> > > From the previous discussion, you want to make progress toward solving
> > > a lot of problems with CAP_BPF.  One of them was making BPF
> > > firewalling more generally useful. By making CAP_BPF grant the ability
> > > to read kernel memory, you will make administrators much more nervous
> > > to grant CAP_BPF.
> >
> > Andy, were your email hacked?
> > I explained several times that in this proposal
> > CAP_BPF _and_ CAP_TRACING _both_ are necessary to read kernel memory.
> > CAP_BPF alone is _not enough_.
> 
> You have indeed said this many times.  You've stated it as a matter of
> fact as though it cannot possibly discussed.  I'm asking you to
> justify it.

That's not how I see it.
I kept stating that both CAP_BPF and CAP_TRACING are necessary to read
kernel memory whereas you kept distorting my statement by dropping second
part and then making claims that "CAP_BPF grant the ability to read
kernel memory, you will make administrators much more nervous".

Just s/CAP_BPF/CAP_BPF and CAP_TRACING/ in this above sentence.
See that meaning suddenly changes?
Now administrators would be worried about tasks that have both at once.
They also would be worried about tasks that have CAP_TRACING alone,
because that's what allows probe_kernel_read().

> It seems like you are specifically trying to add a new switch to turn
> as much of BPF as possible on and off.  Why?

Didn't I explain it several times already with multiple examples
from systemd, daemons, bpftrace ?

Let's try again.
Take your laptop with linux distro.
You're the only user there. I'm assuming you're not sharing it with
partner and kids. This is my definition of 'single user system'.
You can sudo on it at any time, but obviously prefer to run as many
apps as possible without cap_sys_admin.
Now you found some awesome open source app on the web that monitors
the health of the kernel and will pop a nice message on a screen if
something is wrong. Currently this app needs root. You hesitate,
but the apps is so useful and it has strong upstream code review process
that you keep running it 24/7.
This is open source app. New versions come. You upgrade.
You have enough trust in that app that you keep running it as root.
But there is always a chance that a new version does something
accidentally stupid like 'kill -9 -1'. It's an open source app at the end.

Now I come with this CAP* proposal to make this app safer.
I'm not making your system more secure and not making this app
more secure. I can only make your laptop safer for day to day work
by limiting the operations this app can do.
This particular app monitors the kernel via bpf and tracing.
Hence you can give it CAP_TRACING and CAP_BPF and drop the rest.

> > speaking of MDS... I already asked you to help investigate its
> > applicability with existing bpf exposure. Are you going to do that?
> 
> I am blissfully uninvolved in MDS, and I don't know all that much more
> about the overall mechanism than a random reader of tech news :)  ISTM
> there are two meaningful ways that BPF could be involved: a BPF
> program could leak info into the state exposed by MDS, or a BPF
> program could try to read that state.  From what little I understand,
> it's essentially inevitable that BPF leaks information into MDS state,
> and this is probably even controllable by an attacker that understands
> MDS in enough detail.    So the interesting questions are: can BPF be
> used to read MDS state and can BPF be used to leak information in a
> more useful way than the rest of the kernel to an attacker.

agree. that's exactly the question to ask.

> Keeping in mind that the kernel will flush MDS state on every exit to
> usermode, I think the most likely attack is to try to read MDS state
> with BPF.  This could happen, I suppose -- BPF programs can easily
> contain the usual speculation gadgets of "do something and read an
> address that depends on the outcome".  Fortunately, outside of
> bpf_probe_read(), AFAIK BPF programs can't directly touch user memory,
> and an attacker that is allowed to use bpf_probe_read() doesn't need
> MDS to read things.

true as well.
So what do we do with that sentence in Documentation/x86/mds.rst?
Nothing?
New hw bugs will keep coming.
All of them should get similar wording?
Your understanding of MDS and BPF is way above the average.
What other users suppose to do when they read such sentence?
I think they have no choice but to do kernel.unprivileged_bpf_disabled=1.
We, as a kernel community, are forcing the users into it.
Hence I really do not see a value in any proposal today that expands
unprivileged bpf usage.
Since kernel.unprivileged_bpf_disabled=1 all bpf is under cap_sys_admin.
It's not great from security and safety pov. Hence this CAP* proposal.

> > Please take a look at Jann's var1 exploit. Was it hard to run bpf prog
> > in controlled environment without test_run command ?
> >
> 
> Can you send me a link?

https://bugs.chromium.org/p/project-zero/issues/detail?id=1272
writeup_files.tar:kernel_leak_exploit_amd_pro_a8_9600_r7/bpf_stuff.c
Execution is as trivial as write(sockfd, "X", 1) line 405.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28  6:20           ` Andy Lutomirski
@ 2019-08-28 23:38             ` Alexei Starovoitov
  2019-08-29  0:58               ` Andy Lutomirski
  0 siblings, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-28 23:38 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, Steven Rostedt,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Tue, Aug 27, 2019 at 11:20:19PM -0700, Andy Lutomirski wrote:
> On Tue, Aug 27, 2019 at 9:49 PM Alexei Starovoitov
> <alexei.starovoitov@gmail.com> wrote:
> >
> > On Tue, Aug 27, 2019 at 07:00:40PM -0700, Andy Lutomirski wrote:
> > >
> > > Let me put this a bit differently. Part of the point is that
> > > CAP_TRACING should allow a user or program to trace without being able
> > > to corrupt the system. CAP_BPF as you’ve proposed it *can* likely
> > > crash the system.
> >
> > Really? I'm still waiting for your example where bpf+kprobe crashes the system...
> >
> 
> That's not what I meant.  bpf+kprobe causing a crash is a bug.  I'm
> referring to a totally different issue.  On my laptop:
> 
> $ sudo bpftool map
> 48: hash  name foobar  flags 0x0
>     key 8B  value 8B  max_entries 64  memlock 8192B
> 181: lpm_trie  flags 0x1
>     key 8B  value 8B  max_entries 1  memlock 4096B
> 182: lpm_trie  flags 0x1
>     key 20B  value 8B  max_entries 1  memlock 4096B
> 183: lpm_trie  flags 0x1
>     key 8B  value 8B  max_entries 1  memlock 4096B
> 184: lpm_trie  flags 0x1
>     key 20B  value 8B  max_entries 1  memlock 4096B
> 185: lpm_trie  flags 0x1
>     key 8B  value 8B  max_entries 1  memlock 4096B
> 186: lpm_trie  flags 0x1
>     key 20B  value 8B  max_entries 1  memlock 4096B
> 187: lpm_trie  flags 0x1
>     key 8B  value 8B  max_entries 1  memlock 4096B
> 188: lpm_trie  flags 0x1
>     key 20B  value 8B  max_entries 1  memlock 4096B
> 
> $ sudo bpftool map dump id 186
> key:
> 00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00
> 00 00 00 00
> value:
> 02 00 00 00 00 00 00 00
> Found 1 element
> 
> $ sudo bpftool map delete id 186 key hex 00 00 00 00 00 00 00 00 00 00
> 00 00 00 00 00 00 00 00 00 00
> [this worked]
> 
> I don't know what my laptop was doing with map id 186 in particular,
> but, whatever it was, I definitely broke it.  If a BPF firewall is in
> use on something important enough, this could easily remove
> connectivity from part or all of the system.  Right now, this needs
> CAP_SYS_ADMIN.  With your patch, CAP_BPF is sufficient to do this, but
> you *also* need CAP_BPF to trace the system using BPF.  Tracing with
> BPF is 'safe' in the absence of bugs.  Modifying other peoples' maps
> is not.

That lpm_trie is likely systemd implementing IP sandboxing.
Not sure whether it's white or black list.
Deleting an IP address from that map will either allow or disallow
network traffic.
Out of band operation on bpf map broke some bpf program. Sure.
But calling it 'breaking the system' is quite a stretch.
Calling it 'crashing the system' is plain wrong.
Yet you're generalizing this bpf map read/write as
"CAP_BPF as you’ve proposed it *can* likely crash the system."
This is what I have a problem with.

Anyway, changing gears...
Yes. I did propose to make a task with CAP_BPF to be able to
manipulate arbitrary maps in the system.
You could have said that if CAP_BPF is given to 'bpftool'
then any user will be able to mess with other maps because
bpftool is likely chmod-ed 755.
Absolutely correct!
It's not a fault of the CAP_BPF scope.
Just don't give that cap to bpftool or do different acl/chmod.

> If the answer is the latter, then maybe it would make sense to try to
> implement some of the unprivileged bpf stuff and then to see whether
> CAP_BPF is still needed.

<broken_record_mode=on> Nack to extensions to unprivileged bpf.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28 22:55           ` Alexei Starovoitov
@ 2019-08-29  0:45             ` Andy Lutomirski
  2019-08-29  0:53               ` Andy Lutomirski
  2019-08-29  4:07               ` Alexei Starovoitov
  0 siblings, 2 replies; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-29  0:45 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	Steven Rostedt, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API


> On Aug 28, 2019, at 3:55 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
>> On Tue, Aug 27, 2019 at 11:12:29PM -0700, Andy Lutomirski wrote:
>>>> 
>>>> 
>>>> From the previous discussion, you want to make progress toward solving
>>>> a lot of problems with CAP_BPF.  One of them was making BPF
>>>> firewalling more generally useful. By making CAP_BPF grant the ability
>>>> to read kernel memory, you will make administrators much more nervous
>>>> to grant CAP_BPF.
>>> 
>>> Andy, were your email hacked?
>>> I explained several times that in this proposal
>>> CAP_BPF _and_ CAP_TRACING _both_ are necessary to read kernel memory.
>>> CAP_BPF alone is _not enough_.
>> 
>> You have indeed said this many times.  You've stated it as a matter of
>> fact as though it cannot possibly discussed.  I'm asking you to
>> justify it.
> 
> That's not how I see it.
> I kept stating that both CAP_BPF and CAP_TRACING are necessary to read
> kernel memory whereas you kept distorting my statement by dropping second
> part and then making claims that "CAP_BPF grant the ability to read
> kernel memory, you will make administrators much more nervous".

Mea culpa. CAP_BPF does, however, appear to permit breaking kASLR due to unsafe pointer conversions, and it allows reading and writing everyone’s maps.  I stand by my overall point.

> 
> Just s/CAP_BPF/CAP_BPF and CAP_TRACING/ in this above sentence.
> See that meaning suddenly changes?
> Now administrators would be worried about tasks that have both at once.
> They also would be worried about tasks that have CAP_TRACING alone,
> because that's what allows probe_kernel_read().

This is not all what I meant. Of course granting CAP_BPF+CAP_TRACING allows reading kernel memory. This is not at all a problem.  Here is a problem I see:

CAP_TRACING + CAP_BPF allows modification of other people’s maps and potentially other things that should not be implied by CAP_TRACING alone and that don’t need to be available to tracers. So CAP_TRACING, which is powerful but has somewhat limited scope, isn’t fully useful without CAP_BPF, and giving CAP_TRACING *and* CAP_BPF allows things that tracers shouldn’t be able to do. I think this would make the whole mechanism less useful to Android, for example.

(Also, I’m not sure quite what you mean by “CAP_TRACING ... allows probe_kernel_read()”. probe_kernel_read() is a kernel function that can’t be directly called by userspace. CAP_TRACING allows reading kernel memory in plenty of ways regardless.)

> 
>> It seems like you are specifically trying to add a new switch to turn
>> as much of BPF as possible on and off.  Why?
> 
> Didn't I explain it several times already with multiple examples
> from systemd, daemons, bpftrace ?
> 
> Let's try again.
> Take your laptop with linux distro.
> You're the only user there. I'm assuming you're not sharing it with
> partner and kids. This is my definition of 'single user system'.
> You can sudo on it at any time, but obviously prefer to run as many
> apps as possible without cap_sys_admin.
> Now you found some awesome open source app on the web that monitors
> the health of the kernel and will pop a nice message on a screen if
> something is wrong. Currently this app needs root. You hesitate,
> but the apps is so useful and it has strong upstream code review process
> that you keep running it 24/7.
> This is open source app. New versions come. You upgrade.
> You have enough trust in that app that you keep running it as root.
> But there is always a chance that new version doing accidentaly
> something stupid as 'kill -9 -1'. It's an open source app at the end.
> 
> Now I come with this CAP* proposal to make this app safer.
> I'm not making your system more secure and not making this app
> more secure. I can only make your laptop safer for day to day work
> by limiting the operations this app can do.
> This particular app monitros the kernel via bpf and tracing.
> Hence you can give it CAP_TRACING and CAP_BPF and drop the rest.

This won’t make me much more comfortable, since CAP_BPF lets it do an ever-growing set of nasty things. I’d much rather one or both of two things happen:

1. Give it CAP_TRACING only. It can leak my data, but it’s rather hard for it to crash my laptop, lose data, or cause other shenanigans.

2. Improve it a bit so all the privileged ops are wrapped by capset().

Does this make sense?  I’m a security person on occasion. I find vulnerabilities and exploit them deliberately and I break things by accident on a regular basis. In my considered opinion, CAP_TRACING alone, even extended to cover part of BPF as I’ve described, is decently safe. Getting root with just CAP_TRACING will be decently challenging, especially if I don’t get to read things like sshd’s memory, and improvements to mitigate even that could be added.  I am quite confident that attacks starting with CAP_TRACING will have clear audit signatures if auditing is on.  I am also confident that CAP_BPF *will* allow DoS and likely privilege escalation, and this will only get more likely as BPF gets more widely used. And, if BPF-based auditing ever becomes a thing, writing to the audit daemon’s maps will be a great way to cover one’s tracks.


> I think they have no choice but to do kernel.unprivileged_bpf_disabled=1.
> We, as a kernel community, are forcing the users into it.
> Hence I really do not see a value in any proposal today that expands
> unprivileged bpf usage.

I think you’re overemphasizing bpf’s role in the whole speculation mess. I realize that you’ve spent an insane amount of time on mitigations to stupid issues. I’ve spent a less insane amount of time on mitigating similar issues outside of bpf.  It’s a mess.  At the end of the day, the kernel does its best, and new bugs show up. New CPUs will be less buggy. 

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-29  0:45             ` Andy Lutomirski
@ 2019-08-29  0:53               ` Andy Lutomirski
  2019-08-29  4:07               ` Alexei Starovoitov
  1 sibling, 0 replies; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-29  0:53 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	Steven Rostedt, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API



> On Aug 28, 2019, at 5:45 PM, Andy Lutomirski <luto@amacapital.net> wrote:
> 
> 
>>> On Aug 28, 2019, at 3:55 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>> 
>>> On Tue, Aug 27, 2019 at 11:12:29PM -0700, Andy Lutomirski wrote:
>>>>> 
>>>>> 
>>>>> From the previous discussion, you want to make progress toward solving
>>>>> a lot of problems with CAP_BPF.  One of them was making BPF
>>>>> firewalling more generally useful. By making CAP_BPF grant the ability
>>>>> to read kernel memory, you will make administrators much more nervous
>>>>> to grant CAP_BPF.
>>>> 
>>>> Andy, were your email hacked?
>>>> I explained several times that in this proposal
>>>> CAP_BPF _and_ CAP_TRACING _both_ are necessary to read kernel memory.
>>>> CAP_BPF alone is _not enough_.
>>> 
>>> You have indeed said this many times.  You've stated it as a matter of
>>> fact as though it cannot possibly discussed.  I'm asking you to
>>> justify it.
>> 
>> That's not how I see it.
>> I kept stating that both CAP_BPF and CAP_TRACING are necessary to read
>> kernel memory whereas you kept distorting my statement by dropping second
>> part and then making claims that "CAP_BPF grant the ability to read
>> kernel memory, you will make administrators much more nervous".
> 
> Mea culpa. CAP_BPF does, however, appear to permit breaking kASLR due to unsafe pointer conversions, and it allows reading and writing everyone’s maps.  I stand by my overall point.
> 
>> 
>> Just s/CAP_BPF/CAP_BPF and CAP_TRACING/ in this above sentence.
>> See that meaning suddenly changes?
>> Now administrators would be worried about tasks that have both at once.
>> They also would be worried about tasks that have CAP_TRACING alone,
>> because that's what allows probe_kernel_read().
> 
> This is not all what I meant. Of course granting CAP_BPF+CAP_TRACING allows reading kernel memory. This is not at all a problem.  Here is a problem I see:
> 
> CAP_TRACING + CAP_BPF allows modification of other people’s maps and potentially other things that should not be implied by CAP_TRACING alone and that don’t need to be available to tracers. So CAP_TRACING, which is powerful but has somewhat limited scope, isn’t fully useful without CAP_BPF, and giving CAP_TRACING *and* CAP_BPF allows things that teachers shouldn’t be able to do. I think this would make the whole mechanism less useful to Android, for example.
> 
> (Also, I’m not sure quite what you mean by “CAP_TRACING ... allows probe_kernel_read()”. probe_kernel_read() is a kernel function that can’t be directly called by userspace. CAP_TRACING allows reading kernel memory in plenty of ways regardless.)
> 
>> 
>>> It seems like you are specifically trying to add a new switch to turn
>>> as much of BPF as possible on and off.  Why?
>> 
>> Didn't I explain it several times already with multiple examples
>> from systemd, daemons, bpftrace ?
>> 
>> Let's try again.
>> Take your laptop with linux distro.
>> You're the only user there. I'm assuming you're not sharing it with
>> partner and kids. This is my definition of 'single user system'.
>> You can sudo on it at any time, but obviously prefer to run as many
>> apps as possible without cap_sys_admin.
>> Now you found some awesome open source app on the web that monitors
>> the health of the kernel and will pop a nice message on a screen if
>> something is wrong. Currently this app needs root. You hesitate,
>> but the apps is so useful and it has strong upstream code review process
>> that you keep running it 24/7.
>> This is open source app. New versions come. You upgrade.
>> You have enough trust in that app that you keep running it as root.
>> But there is always a chance that new version doing accidentaly
>> something stupid as 'kill -9 -1'. It's an open source app at the end.
>> 
>> Now I come with this CAP* proposal to make this app safer.
>> I'm not making your system more secure and not making this app
>> more secure. I can only make your laptop safer for day to day work
>> by limiting the operations this app can do.
>> This particular app monitros the kernel via bpf and tracing.
>> Hence you can give it CAP_TRACING and CAP_BPF and drop the rest.
> 
> This won’t make me much more comfortable, since CAP_BPF lets it do an ever-growing set of nasty things. I’d much rather one or both of two things happen:
> 
> 1. Give it CAP_TRACING only. It can leak my data, but it’s rather hard for it to crash my laptop, lose data, or cause other shenanigans.
> 
> 2. Improve it a bit do all the privileged ops are wrapped by capset().
> 
> Does this make sense?  I’m a security person on occasion. I find vulnerabilities and exploit them deliberately and I break things by accident on a regular basis. In my considered opinion, CAP_TRACING alone, even extended to cover part of BPF as I’ve described, is decently safe. Getting root with just CAP_TRACING will be decently challenging, especially if I don’t get to read things like sshd’s memory, and improvements to mitigate even that could be added.  I am quite confident that attacks starting with CAP_TRACING will have clear audit signatures if auditing is on.  I am also confident that CAP_BPF *will* allow DoS and likely privilege escalation, and this will only get more likely as BPF gets more widely used. And, if BPF-based auditing ever becomes a thing, writing to the audit daemon’s maps will be a great way to cover one’s tracks.
> 
> 
>> I think they have no choice but to do kernel.unprivileged_bpf_disabled=1.
>> We, as a kernel community, are forcing the users into it.
>> Hence I really do not see a value in any proposal today that expands
>> unprivileged bpf usage.
> 
> I think you’re overemphasizing bpf’s role in the whole speculation mess. I realize that you’ve spent an insane amount of time on mitigations to stupid issues. I’ve spent a less insane amount of time on mitigating similar issues outside of bpf.  It’s a mess.  At the end of the day, the kernel does its best, and new bugs show up. New CPUs will be less buggy.

Bah, accidentally hit send.

If the kernel’s mitigations aren’t good enough or you’re subject to direct user attack (e.g. via insufficient IBPB, SMT attack, etc) then you’re vulnerable. Otherwise you’re less vulnerable. BPF is by no means the whole story. Heck, the kernel *could*, at unfortunate performance cost, more aggressively flush around BPF and effectively treat it like user code.

So I think we should design bpf’s API’s security with the philosophy that speculation attacks are just one more type of bug, and we should make sure that real-world useful configurations don’t give BPF to tasks that don’t need it. My unpriv proposal tries to do this. This is *why* my proposal keeps test_run locked down and restricts running each program type to tasks that are explicitly granted the ability to attach it.

So let’s let CAP_TRACING use bpf. Speculation attacks are mostly irrelevant to tracers anyway, but all the rest of the stuff I’ve been talking is relevant.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28 23:38             ` Alexei Starovoitov
@ 2019-08-29  0:58               ` Andy Lutomirski
  0 siblings, 0 replies; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-29  0:58 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	Steven Rostedt, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API



> On Aug 28, 2019, at 4:38 PM, Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
>> On Tue, Aug 27, 2019 at 11:20:19PM -0700, Andy Lutomirski wrote:
>> On Tue, Aug 27, 2019 at 9:49 PM Alexei Starovoitov
>> <alexei.starovoitov@gmail.com> wrote:
>>> 
>>>> On Tue, Aug 27, 2019 at 07:00:40PM -0700, Andy Lutomirski wrote:
>>>> 
>>>> Let me put this a bit differently. Part of the point is that
>>>> CAP_TRACING should allow a user or program to trace without being able
>>>> to corrupt the system. CAP_BPF as you’ve proposed it *can* likely
>>>> crash the system.
>>> 
>>> Really? I'm still waiting for your example where bpf+kprobe crashes the system...
>>> 
>> 
>> That's not what I meant.  bpf+kprobe causing a crash is a bug.  I'm
>> referring to a totally different issue.  On my laptop:
>> 
>> $ sudo bpftool map
>> 48: hash  name foobar  flags 0x0
>>    key 8B  value 8B  max_entries 64  memlock 8192B
>> 181: lpm_trie  flags 0x1
>>    key 8B  value 8B  max_entries 1  memlock 4096B
>> 182: lpm_trie  flags 0x1
>>    key 20B  value 8B  max_entries 1  memlock 4096B
>> 183: lpm_trie  flags 0x1
>>    key 8B  value 8B  max_entries 1  memlock 4096B
>> 184: lpm_trie  flags 0x1
>>    key 20B  value 8B  max_entries 1  memlock 4096B
>> 185: lpm_trie  flags 0x1
>>    key 8B  value 8B  max_entries 1  memlock 4096B
>> 186: lpm_trie  flags 0x1
>>    key 20B  value 8B  max_entries 1  memlock 4096B
>> 187: lpm_trie  flags 0x1
>>    key 8B  value 8B  max_entries 1  memlock 4096B
>> 188: lpm_trie  flags 0x1
>>    key 20B  value 8B  max_entries 1  memlock 4096B
>> 
>> $ sudo bpftool map dump id 186
>> key:
>> 00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00
>> 00 00 00 00
>> value:
>> 02 00 00 00 00 00 00 00
>> Found 1 element
>> 
>> $ sudo bpftool map delete id 186 key hex 00 00 00 00 00 00 00 00 00 00
>> 00 00 00 00 00 00 00 00 00 00
>> [this worked]
>> 
>> I don't know what my laptop was doing with map id 186 in particular,
>> but, whatever it was, I definitely broke it.  If a BPF firewall is in
>> use on something important enough, this could easily remove
>> connectivity from part or all of the system.  Right now, this needs
>> CAP_SYS_ADMIN.  With your patch, CAP_BPF is sufficient to do this, but
>> you *also* need CAP_BPF to trace the system using BPF.  Tracing with
>> BPF is 'safe' in the absence of bugs.  Modifying other peoples' maps
>> is not.
> 
> That lpm_trie is likely systemd implementing IP sandboxing.
> Not sure whether it's white or black list.
> Deleting an IP address from that map will either allow or disallow
> network traffic.
> Out of band operation on bpf map broke some bpf program. Sure.
> But calling it 'breaking the system' is quite a stretch.
> Calling it 'crashing the system' is plain wrong.
> Yet you're generalizing this bpf map read/write as
> "CAP_BPF as you’ve proposed it *can* likely crash the system."
> This is what I have a problem with.

Well, after I sent that email, firewalld on my laptop exploded and the system eventually hung.  I call that broken, and I really made a minimal effort here to break things.

> 
> Anyway, changing gears...
> Yes. I did propose to make a task with CAP_BPF to be able to
> manipulate arbitrary maps in the system.
> You could have said that if CAP_BPF is given to 'bpftool'
> then any user will be able to mess with other maps because
> bpftool is likely chmod-ed 755.
> Absolutely correct!
> It's not a fault of the CAP_BPF scope.
> Just don't give that cap to bpftool or do different acl/chmod.

I see no reason that allowing a user to use most of bpftool’s functionality necessarily needs to allow that user to corrupt the system. It obviously will expand the attack surface available to that user, but that should be it.

I’m trying to convince you that bpf’s security model can be made better than what you’re proposing. I’m genuinely not trying to get in your way. I’m trying to help you improve bpf.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-29  0:45             ` Andy Lutomirski
  2019-08-29  0:53               ` Andy Lutomirski
@ 2019-08-29  4:07               ` Alexei Starovoitov
  2019-09-28 23:37                 ` Steven Rostedt
  1 sibling, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-29  4:07 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Andy Lutomirski, Alexei Starovoitov, Kees Cook, LSM List,
	James Morris, Jann Horn, Peter Zijlstra, Masami Hiramatsu,
	Steven Rostedt, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API

On Wed, Aug 28, 2019 at 05:45:47PM -0700, Andy Lutomirski wrote:
> > 
> >> It seems like you are specifically trying to add a new switch to turn
> >> as much of BPF as possible on and off.  Why?
> > 
> > Didn't I explain it several times already with multiple examples
> > from systemd, daemons, bpftrace ?
> > 
> > Let's try again.
> > Take your laptop with linux distro.
> > You're the only user there. I'm assuming you're not sharing it with
> > partner and kids. This is my definition of 'single user system'.
> > You can sudo on it at any time, but obviously prefer to run as many
> > apps as possible without cap_sys_admin.
> > Now you found some awesome open source app on the web that monitors
> > the health of the kernel and will pop a nice message on a screen if
> > something is wrong. Currently this app needs root. You hesitate,
> > but the apps is so useful and it has strong upstream code review process
> > that you keep running it 24/7.
> > This is open source app. New versions come. You upgrade.
> > You have enough trust in that app that you keep running it as root.
> > But there is always a chance that new version doing accidentaly
> > something stupid as 'kill -9 -1'. It's an open source app at the end.
> > 
> > Now I come with this CAP* proposal to make this app safer.
> > I'm not making your system more secure and not making this app
> > more secure. I can only make your laptop safer for day to day work
> > by limiting the operations this app can do.
> > This particular app monitros the kernel via bpf and tracing.
> > Hence you can give it CAP_TRACING and CAP_BPF and drop the rest.
> 
> This won’t make me much more comfortable, since CAP_BPF lets it do an ever-growing set of nasty things. I’d much rather one or both of two things happen:
> 
> 1. Give it CAP_TRACING only. It can leak my data, but it’s rather hard for it to crash my laptop, lose data, or cause other shenanigans.
> 
> 2. Improve it a bit do all the privileged ops are wrapped by capset().
> 
> Does this make sense?  I’m a security person on occasion. I find vulnerabilities and exploit them deliberately and I break things by accident on a regular basis. In my considered opinion, CAP_TRACING alone, even extended to cover part of BPF as I’ve described, is decently safe. Getting root with just CAP_TRACING will be decently challenging, especially if I don’t get to read things like sshd’s memory, and improvements to mitigate even that could be added.  I am quite confident that attacks starting with CAP_TRACING will have clear audit signatures if auditing is on.  I am also confident that CAP_BPF *will* allow DoS and likely privilege escalation, and this will only get more likely as BPF gets more widely used. And, if BPF-based auditing ever becomes a thing, writing to the audit daemon’s maps will be a great way to cover one’s tracks.

CAP_TRACING, as I'm proposing it, will allow full tracefs access.
I think Steven and Masami prefer that as well.
That includes kprobe with probe_kernel_read.
That also means mini-DoS by installing kprobes everywhere or running too much ftrace.

CAP_TRACING will allow perf_event_open() too.
Which also means mini-DoS with too many events.

CAP_TRACING with or without CAP_BPF is safe, but it's not secure.
And that's what I need to make above 'open source kernel health app' to be safe.

In real world we have tens of such apps and they use all of the things that
I'm allowing via CAP_BPF + CAP_NET_ADMIN + CAP_TRACING.
Some apps will need only two out of three.
I don't see any further possibility to shrink the scope of the proposal.

> I’m trying to convince you that bpf’s security model can be made better
> than what you’re proposing. I’m genuinely not trying to get in your way.
> I’m trying to help you improve bpf.

If you really want to help please don't reject the real use cases
just because they don't fit into your proposal.

There is not a single feature in BPF land that we did because we simply
wanted to. For every feature we drilled into use cases to make sure
there is a real user behind it.
Same thing with CAP_BPF. I'm defining it to include GET_FD_BY_ID because
apps use it and they need to be made safer.

Anyway the v2 version of the patch with CAP_TRACING and CAP_BPF is on the way.
Hopefully later tonight or tomorrow.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-28 22:08     ` Alexei Starovoitov
@ 2019-08-29 13:34       ` Steven Rostedt
  2019-08-29 15:43         ` Andy Lutomirski
  2019-08-29 17:19         ` Alexei Starovoitov
  0 siblings, 2 replies; 46+ messages in thread
From: Steven Rostedt @ 2019-08-29 13:34 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Peter Zijlstra, Andy Lutomirski, Alexei Starovoitov, Kees Cook,
	LSM List, James Morris, Jann Horn, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Wed, 28 Aug 2019 15:08:28 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Wed, Aug 28, 2019 at 09:14:21AM +0200, Peter Zijlstra wrote:
> > On Tue, Aug 27, 2019 at 04:01:08PM -0700, Andy Lutomirski wrote:
> >   
> > > > Tracing:
> > > >
> > > > CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
> > > > are necessary to:  
> > 
> > That's not tracing, that's perf.
> >   

> re: your first comment above.
> I'm not sure what difference you see in words 'tracing' and 'perf'.
> I really hope we don't partition the overall tracing category
> into CAP_PERF and CAP_FTRACE only because these pieces are maintained
> by different people.

I think Peter meant: It's not tracing, it's profiling.

And there is a bit of separation between the two, although there is an
overlap.

Yes, perf can do tracing but it's designed more for profiling.

> On one side perf_event_open() isn't really doing tracing (as step by
> step ftracing of function sequences), but perf_event_open() opens
> an event and the sequence of events (may include IP) becomes a trace.
> imo CAP_TRACING is the best name to describe the privileged space
> of operations possible via perf_event_open, ftrace, kprobe, stack traces, etc.

I have no issue with what you suggest. I guess it comes down to how
fine grain people want to go. Do we want it to be all or nothing?
Should CAP_TRACING allow for write access to tracefs? Or should we go
with needing both CAP_TRACING and permissions in that directory
(like changing the group ownership of the files at every boot). 

Perhaps we should have a CAP_TRACING_RO, that gives read access to
tracefs (and write if the users have permissions). And have CAP_TRACING
to allow full write access as well (allowing for users to add kprobe
events and enabling tracers like the function tracer).

> 
> Another reason is kuprobes. They can be created via perf_event_open
> and via tracefs. Are they in CAP_PERF or in CAP_FTRACE ? In both, right?
> Should then CAP_KPROBE be used ? that would be an overkill.
> It would partition the space even further without obvious need.
> 
> Looking from BPF angle... BPF doesn't have integration with ftrace yet.
> bpf_trace_printk is using ftrace mechanism, but that's 1% of ftrace.
> In the long run I really like to see bpf using all of ftrace.
> Whereas bpf is using a lot of 'perf'.
> And extending some perf things in bpf specific way.
> Take a look at how BPF_F_STACK_BUILD_ID. It's clearly perf/stack_tracing
> feature that generic perf can use one day.
> Currently it sits in bpf land and accessible via bpf only.
> Though its bpf only today I categorize it under CAP_TRACING.
> 
> I think CAP_TRACING privilege should allow task to do all of perf_event_open,
> kuprobe, stack trace, ftrace, and kallsyms.
> We can think of some exceptions that should stay under CAP_SYS_ADMIN,
> but most of the functionality available by 'perf' binary should be
> usable with CAP_TRACING. 'perf' can do bpf too.
> With CAP_BPF it would be all set.

As the above seems to favor the idea of CAP_TRACING allowing write
access to tracefs, should we have a CAP_TRACING_RO for just read access
and limited perf abilities?

-- Steve

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-29 13:34       ` Steven Rostedt
@ 2019-08-29 15:43         ` Andy Lutomirski
  2019-08-29 17:23           ` Alexei Starovoitov
  2019-08-29 17:19         ` Alexei Starovoitov
  1 sibling, 1 reply; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-29 15:43 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Alexei Starovoitov, Peter Zijlstra, Andy Lutomirski,
	Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API

On Thu, Aug 29, 2019 at 6:34 AM Steven Rostedt <rostedt@goodmis.org> wrote:
>
> On Wed, 28 Aug 2019 15:08:28 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > On Wed, Aug 28, 2019 at 09:14:21AM +0200, Peter Zijlstra wrote:
> > > On Tue, Aug 27, 2019 at 04:01:08PM -0700, Andy Lutomirski wrote:
> > >
> > > > > Tracing:
> > > > >
> > > > > CAP_BPF and perf_paranoid_tracepoint_raw() (which is kernel.perf_event_paranoid == -1)
> > > > > are necessary to:
> > >
> > > That's not tracing, that's perf.
> > >
>
> > re: your first comment above.
> > I'm not sure what difference you see in words 'tracing' and 'perf'.
> > I really hope we don't partition the overall tracing category
> > into CAP_PERF and CAP_FTRACE only because these pieces are maintained
> > by different people.
>
> I think Peter meant: It's not tracing, it's profiling.
>
> And there is a bit of separation between the two, although there is an
> overlap.
>
> Yes, perf can do tracing but it's designed more for profiling.

As I see it, there are a couple of reasons to split something into
multiple capabilities.  If they allow users to do well-defined things
that have materially different risks from the perspective of the
person granting the capabilities, then they can usefully be different.
Similarly, if one carries a risk of accidental use that another does
not, they should usefully be different.  An example of the first is
that CAP_NET_BIND_SERVICE has very different powers from
CAP_NET_ADMIN, whereas CAP_SYS_ADMIN and CAP_PTRACE are really quite
similar from a security perspective.  An example of the latter is that
CAP_DAC_OVERRIDE changes overall open() semantics and CAP_SYS_ADMIN
does not, at least not outside of /proc.

Things having different development histories and different
maintainers doesn't seem like a good reason to split the capabilities
IMO.

>
> > On one side perf_event_open() isn't really doing tracing (as step by
> > step ftracing of function sequences), but perf_event_open() opens
> > an event and the sequence of events (may include IP) becomes a trace.
> > imo CAP_TRACING is the best name to describe the privileged space
> > of operations possible via perf_event_open, ftrace, kprobe, stack traces, etc.
>
> I have no issue with what you suggest. I guess it comes down to how
> fine grain people want to go. Do we want it to be all or nothing?
> Should CAP_TRACING allow for write access to tracefs? Or should we go
> with needing both CAP_TRACING and permissions in that directory
> (like changing the group ownership of the files at every boot).
>
> Perhaps we should have a CAP_TRACING_RO, that gives read access to
> tracefs (and write if the users have permissions). And have CAP_TRACING
> to allow full write access as well (allowing for users to add kprobe
> events and enabling tracers like the function tracer).

I can imagine splitting it into three capabilities:

CAP_TRACE_KERNEL: learn which kernel functions are called when.  This
would allow perf profiling, for example, but not sampling of kernel
regs.

CAP_TRACE_READ_KERNEL_DATA: allow the tracing, profiling, etc features
that can read the kernel's data.  So you get function arguments via
kprobe, kernel regs, and APIs that expose probe_kernel_read()

CAP_TRACE_USER: trace unrelated user processes

I'm not sure the code is written in a way that makes splitting
CAP_TRACE_KERNEL and CAP_TRACE_READ_KERNEL_DATA practical, and I'm not sure that
CAP_TRACE_KERNEL is all that useful except for plain perf record
without CAP_TRACE_READ_KERNEL_DATA.  What do you all think?  I suppose
it could also be:

CAP_PROFILE_KERNEL: Use perf with events that aren't kprobes or
tracepoints.  Does not grant the ability to sample regs or the kernel
stack directly.

CAP_TRACE_KERNEL: Use all of perf, ftrace, kprobe, etc.

CAP_TRACE_USER: Use all of perf with scope limited to user mode and uprobes.

> As the above seems to favor the idea of CAP_TRACING allowing write
> access to tracefs, should we have a CAP_TRACING_RO for just read access
> and limited perf abilities?

How about making a separate cap for limited perf capabilities along
the lines of the above?

For what it's worth, it should be straightforward using full tracing
to read out the kernel's random number pool, for example, but it would
be difficult or impossible to do that using just perf record -e
cycles.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-29 13:34       ` Steven Rostedt
  2019-08-29 15:43         ` Andy Lutomirski
@ 2019-08-29 17:19         ` Alexei Starovoitov
  2019-08-29 17:47           ` Steven Rostedt
  1 sibling, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-29 17:19 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Peter Zijlstra, Andy Lutomirski, Alexei Starovoitov, Kees Cook,
	LSM List, James Morris, Jann Horn, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Thu, Aug 29, 2019 at 09:34:34AM -0400, Steven Rostedt wrote:
> 
> As the above seems to favor the idea of CAP_TRACING allowing write
> access to tracefs, should we have a CAP_TRACING_RO for just read access
> and limited perf abilities?

read only vs writeable is an attribute of the file system.
Bringing such things into caps seem wrong to me.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-29 15:43         ` Andy Lutomirski
@ 2019-08-29 17:23           ` Alexei Starovoitov
  2019-08-29 17:36             ` Andy Lutomirski
  2019-08-29 17:49             ` Steven Rostedt
  0 siblings, 2 replies; 46+ messages in thread
From: Alexei Starovoitov @ 2019-08-29 17:23 UTC (permalink / raw)
  To: Andy Lutomirski
  Cc: Steven Rostedt, Peter Zijlstra, Alexei Starovoitov, Kees Cook,
	LSM List, James Morris, Jann Horn, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Thu, Aug 29, 2019 at 08:43:23AM -0700, Andy Lutomirski wrote:
> 
> I can imagine splitting it into three capabilities:
> 
> CAP_TRACE_KERNEL: learn which kernel functions are called when.  This
> would allow perf profiling, for example, but not sampling of kernel
> regs.
> 
> CAP_TRACE_READ_KERNEL_DATA: allow the tracing, profiling, etc features
> that can read the kernel's data.  So you get function arguments via
> kprobe, kernel regs, and APIs that expose probe_kernel_read()
> 
> CAP_TRACE_USER: trace unrelated user processes
> 
> I'm not sure the code is written in a way that makes splitting
> CAP_TRACE_KERNEL and CAP_TRACE_READ_KERNEL_DATA practical, and I'm not sure that
> CAP_TRACE_KERNEL is all that useful except for plain perf record
> without CAP_TRACE_READ_KERNEL_DATA.  What do you all think?  I suppose
> it could also be:
> 
> CAP_PROFILE_KERNEL: Use perf with events that aren't kprobes or
> tracepoints.  Does not grant the ability to sample regs or the kernel
> stack directly.
> 
> CAP_TRACE_KERNEL: Use all of perf, ftrace, kprobe, etc.
> 
> CAP_TRACE_USER: Use all of perf with scope limited to user mode and uprobes.

imo that makes little sense from security pov, since
such CAP_TRACE_KERNEL (ex kprobe) can trace "unrelated user process"
just as well. Yet not letting it do cleanly via uprobe.
Sort of like giving a spare key for back door of the house and
saying no, you cannot have main door key.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-29 17:23           ` Alexei Starovoitov
@ 2019-08-29 17:36             ` Andy Lutomirski
  2019-08-29 17:49             ` Steven Rostedt
  1 sibling, 0 replies; 46+ messages in thread
From: Andy Lutomirski @ 2019-08-29 17:36 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Steven Rostedt, Peter Zijlstra,
	Alexei Starovoitov, Kees Cook, LSM List, James Morris, Jann Horn,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API

On Thu, Aug 29, 2019 at 10:23 AM Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
>
> On Thu, Aug 29, 2019 at 08:43:23AM -0700, Andy Lutomirski wrote:
> >
> > I can imagine splitting it into three capabilities:
> >
> > CAP_TRACE_KERNEL: learn which kernel functions are called when.  This
> > would allow perf profiling, for example, but not sampling of kernel
> > regs.
> >
> > CAP_TRACE_READ_KERNEL_DATA: allow the tracing, profiling, etc features
> > that can read the kernel's data.  So you get function arguments via
> > kprobe, kernel regs, and APIs that expose probe_kernel_read()
> >
> > CAP_TRACE_USER: trace unrelated user processes
> >
> > I'm not sure the code is written in a way that makes splitting
> > CAP_TRACE_KERNEL and CAP_TRACE_READ_KERNEL_DATA practical, and I'm not sure that
> > CAP_TRACE_KERNEL is all that useful except for plain perf record
> > without CAP_TRACE_READ_KERNEL_DATA.  What do you all think?  I suppose
> > it could also be:
> >
> > CAP_PROFILE_KERNEL: Use perf with events that aren't kprobes or
> > tracepoints.  Does not grant the ability to sample regs or the kernel
> > stack directly.
> >
> > CAP_TRACE_KERNEL: Use all of perf, ftrace, kprobe, etc.
> >
> > CAP_TRACE_USER: Use all of perf with scope limited to user mode and uprobes.
>
> imo that makes little sense from security pov, since
> such CAP_TRACE_KERNEL (ex kprobe) can trace "unrelated user process"
> just as well. Yet not letting it do cleanly via uprobe.
> Sort of like giving a spare key for back door of the house and
> saying no, you cannot have main door key.
>

Not all combinations of capabilities make total sense.  CAP_SETUID,
for example, generally lets you get all the other capabilities.
CAP_TRACE_KERNEL + CAP_TRACE_USER makes sense.  CAP_TRACE_USER by
itself makes sense.  CAP_TRACE_READ_KERNEL_DATA without
CAP_TRACE_KERNEL does not.  I don't think this is a really a problem.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-29 17:19         ` Alexei Starovoitov
@ 2019-08-29 17:47           ` Steven Rostedt
  0 siblings, 0 replies; 46+ messages in thread
From: Steven Rostedt @ 2019-08-29 17:47 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Peter Zijlstra, Andy Lutomirski, Alexei Starovoitov, Kees Cook,
	LSM List, James Morris, Jann Horn, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Thu, 29 Aug 2019 10:19:24 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Thu, Aug 29, 2019 at 09:34:34AM -0400, Steven Rostedt wrote:
> > 
> > As the above seems to favor the idea of CAP_TRACING allowing write
> > access to tracefs, should we have a CAP_TRACING_RO for just read access
> > and limited perf abilities?  
> 
> read only vs writeable is an attribute of the file system.
> Bringing such things into caps seem wrong to me.

So using groups then? I'm fine with that.

-- Steve

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-29 17:23           ` Alexei Starovoitov
  2019-08-29 17:36             ` Andy Lutomirski
@ 2019-08-29 17:49             ` Steven Rostedt
  1 sibling, 0 replies; 46+ messages in thread
From: Steven Rostedt @ 2019-08-29 17:49 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Peter Zijlstra, Alexei Starovoitov, Kees Cook,
	LSM List, James Morris, Jann Horn, Masami Hiramatsu,
	David S. Miller, Daniel Borkmann, Network Development, bpf,
	kernel-team, Linux API

On Thu, 29 Aug 2019 10:23:10 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> > CAP_TRACE_KERNEL: Use all of perf, ftrace, kprobe, etc.
> > 
> > CAP_TRACE_USER: Use all of perf with scope limited to user mode and uprobes.  
> 
> imo that makes little sense from security pov, since
> such CAP_TRACE_KERNEL (ex kprobe) can trace "unrelated user process"
> just as well. Yet not letting it do cleanly via uprobe.
> Sort of like giving a spare key for back door of the house and
> saying no, you cannot have main door key.

I took it as CAP_TRACE_KERNEL as a superset of CAP_TRACE_USER. That is,
if you have CAP_TRACE_KERNEL, by default you get USER. Where as
CAP_TRACE_USER, is much more limiting.

-- Steve

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-08-29  4:07               ` Alexei Starovoitov
@ 2019-09-28 23:37                 ` Steven Rostedt
  2019-09-30 18:31                   ` Kees Cook
  0 siblings, 1 reply; 46+ messages in thread
From: Steven Rostedt @ 2019-09-28 23:37 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Andy Lutomirski, Andy Lutomirski, Alexei Starovoitov, Kees Cook,
	LSM List, James Morris, Jann Horn, Peter Zijlstra,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API

On Wed, 28 Aug 2019 21:07:24 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > 
> > This won’t make me much more comfortable, since CAP_BPF lets it do an ever-growing set of nasty things. I’d much rather one or both of two things happen:
> > 
> > 1. Give it CAP_TRACING only. It can leak my data, but it’s rather hard for it to crash my laptop, lose data, or cause other shenanigans.
> > 
> > 2. Improve it a bit so all the privileged ops are wrapped by capset().
> > 
> > Does this make sense?  I’m a security person on occasion. I find
> > vulnerabilities and exploit them deliberately and I break things by
> > accident on a regular basis. In my considered opinion, CAP_TRACING
> > alone, even extended to cover part of BPF as I’ve described, is
> > decently safe. Getting root with just CAP_TRACING will be decently
> > challenging, especially if I don’t get to read things like sshd’s
> > memory, and improvements to mitigate even that could be added.  I
> > am quite confident that attacks starting with CAP_TRACING will have
> > clear audit signatures if auditing is on.  I am also confident that
> > CAP_BPF *will* allow DoS and likely privilege escalation, and this
> > will only get more likely as BPF gets more widely used. And, if
> > BPF-based auditing ever becomes a thing, writing to the audit
> > daemon’s maps will be a great way to cover one’s tracks.  
> 
> CAP_TRACING, as I'm proposing it, will allow full tracefs access.
> I think Steven and Masami prefer that as well.
> That includes kprobe with probe_kernel_read.
> That also means mini-DoS by installing kprobes everywhere or running
> too much ftrace.

I was talking with Kees at Plumbers about this, and we were talking
about just using simple file permissions. I started playing with some
patches to allow the tracefs be visible but by default it would only be
visible by root.

 rwx------

Then a start up script (or perhaps mount options) could change the
group owner, and change this to:

 rwxrwx---

Where anyone in the group assigned (say "tracing") gets full access to
the file system.

The more I was playing with this, the less I see the need for
CAP_TRACING for ftrace and reading the format files.

-- Steve

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-09-28 23:37                 ` Steven Rostedt
@ 2019-09-30 18:31                   ` Kees Cook
  2019-10-01  1:22                     ` Alexei Starovoitov
  2019-10-03  6:12                     ` Masami Hiramatsu
  0 siblings, 2 replies; 46+ messages in thread
From: Kees Cook @ 2019-09-30 18:31 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Alexei Starovoitov, Andy Lutomirski, Andy Lutomirski,
	Alexei Starovoitov, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, David S. Miller,
	Daniel Borkmann, Network Development, bpf, kernel-team,
	Linux API

On Sat, Sep 28, 2019 at 07:37:27PM -0400, Steven Rostedt wrote:
> On Wed, 28 Aug 2019 21:07:24 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > > 
> > > This won’t make me much more comfortable, since CAP_BPF lets it do an ever-growing set of nasty things. I’d much rather one or both of two things happen:
> > > 
> > > 1. Give it CAP_TRACING only. It can leak my data, but it’s rather hard for it to crash my laptop, lose data, or cause other shenanigans.
> > > 
> > > 2. Improve it a bit so all the privileged ops are wrapped by capset().
> > > 
> > > Does this make sense?  I’m a security person on occasion. I find
> > > vulnerabilities and exploit them deliberately and I break things by
> > > accident on a regular basis. In my considered opinion, CAP_TRACING
> > > alone, even extended to cover part of BPF as I’ve described, is
> > > decently safe. Getting root with just CAP_TRACING will be decently
> > > challenging, especially if I don’t get to read things like sshd’s
> > > memory, and improvements to mitigate even that could be added.  I
> > > am quite confident that attacks starting with CAP_TRACING will have
> > > clear audit signatures if auditing is on.  I am also confident that
> > > CAP_BPF *will* allow DoS and likely privilege escalation, and this
> > > will only get more likely as BPF gets more widely used. And, if
> > > BPF-based auditing ever becomes a thing, writing to the audit
> > > daemon’s maps will be a great way to cover one’s tracks.  
> > 
> > CAP_TRACING, as I'm proposing it, will allow full tracefs access.
> > I think Steven and Masami prefer that as well.
> > That includes kprobe with probe_kernel_read.
> > That also means mini-DoS by installing kprobes everywhere or running
> > too much ftrace.
> 
> I was talking with Kees at Plumbers about this, and we were talking
> about just using simple file permissions. I started playing with some
> patches to allow the tracefs be visible but by default it would only be
> visible by root.
> 
>  rwx------
> 
> Then a start up script (or perhaps mount options) could change the
> group owner, and change this to:
> 
>  rwxrwx---
> 
> Where anyone in the group assigned (say "tracing") gets full access to
> the file system.
> 
> The more I was playing with this, the less I see the need for
> CAP_TRACING for ftrace and reading the format files.

Nice! Thanks for playing with this. I like it because it gives us a way
to push policy into userspace (group membership, etc), and provides a
clean way (hopefully) do separate "read" (kernel memory confidentiality)
from "write" (kernel memory integrity), which wouldn't have been possible
with a single new CAP_...

-Kees

-- 
Kees Cook

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-09-30 18:31                   ` Kees Cook
@ 2019-10-01  1:22                     ` Alexei Starovoitov
  2019-10-01 22:10                       ` Steven Rostedt
  2019-10-03  6:12                     ` Masami Hiramatsu
  1 sibling, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-10-01  1:22 UTC (permalink / raw)
  To: Kees Cook
  Cc: Steven Rostedt, Andy Lutomirski, Andy Lutomirski,
	Alexei Starovoitov, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, David S. Miller,
	Daniel Borkmann, Network Development, bpf, kernel-team,
	Linux API

On Mon, Sep 30, 2019 at 11:31:29AM -0700, Kees Cook wrote:
> On Sat, Sep 28, 2019 at 07:37:27PM -0400, Steven Rostedt wrote:
> > On Wed, 28 Aug 2019 21:07:24 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > > > 
> > > > This won’t make me much more comfortable, since CAP_BPF lets it do an ever-growing set of nasty things. I’d much rather one or both of two things happen:
> > > > 
> > > > 1. Give it CAP_TRACING only. It can leak my data, but it’s rather hard for it to crash my laptop, lose data, or cause other shenanigans.
> > > > 
> > > > 2. Improve it a bit so all the privileged ops are wrapped by capset().
> > > > 
> > > > Does this make sense?  I’m a security person on occasion. I find
> > > > vulnerabilities and exploit them deliberately and I break things by
> > > > accident on a regular basis. In my considered opinion, CAP_TRACING
> > > > alone, even extended to cover part of BPF as I’ve described, is
> > > > decently safe. Getting root with just CAP_TRACING will be decently
> > > > challenging, especially if I don’t get to read things like sshd’s
> > > > memory, and improvements to mitigate even that could be added.  I
> > > > am quite confident that attacks starting with CAP_TRACING will have
> > > > clear audit signatures if auditing is on.  I am also confident that
> > > > CAP_BPF *will* allow DoS and likely privilege escalation, and this
> > > > will only get more likely as BPF gets more widely used. And, if
> > > > BPF-based auditing ever becomes a thing, writing to the audit
> > > > daemon’s maps will be a great way to cover one’s tracks.  
> > > 
> > > CAP_TRACING, as I'm proposing it, will allow full tracefs access.
> > > I think Steven and Masami prefer that as well.
> > > That includes kprobe with probe_kernel_read.
> > > That also means mini-DoS by installing kprobes everywhere or running
> > > too much ftrace.
> > 
> > I was talking with Kees at Plumbers about this, and we were talking
> > about just using simple file permissions. I started playing with some
> > patches to allow the tracefs be visible but by default it would only be
> > visible by root.
> > 
> >  rwx------
> > 
> > Then a start up script (or perhaps mount options) could change the
> > group owner, and change this to:
> > 
> >  rwxrwx---
> > 
> > Where anyone in the group assigned (say "tracing") gets full access to
> > the file system.
> > 
> > The more I was playing with this, the less I see the need for
> > CAP_TRACING for ftrace and reading the format files.
> 
> Nice! Thanks for playing with this. I like it because it gives us a way
> to push policy into userspace (group membership, etc), and provides a
> clean way (hopefully) do separate "read" (kernel memory confidentiality)
> from "write" (kernel memory integrity), which wouldn't have been possible
> with a single new CAP_...

tracefs is a file system, so clearly file based acls are much better fit
for all tracefs operations.
But that is not the case for ftrace overall.
bpf_trace_printk() calls trace_printk() that dumps into trace pipe.
Technically it's ftrace operation, but it cannot be controlled by tracefs
and by file permissions. That's the motivation to guard bpf_trace_printk()
usage from bpf program with CAP_TRACING.

Both 'trace' and 'trace_pipe' have quirky side effects.
Like opening 'trace' file will make all parallel trace_printk() to be ignored.
While reading 'trace_pipe' file will clear it.
The point is that traditional 'read' and 'write' ACLs don't map as-is
to tracefs, so I would be careful categorizing things into
confidentiality vs integrity only based on access type.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-10-01  1:22                     ` Alexei Starovoitov
@ 2019-10-01 22:10                       ` Steven Rostedt
  2019-10-01 22:18                         ` Alexei Starovoitov
  0 siblings, 1 reply; 46+ messages in thread
From: Steven Rostedt @ 2019-10-01 22:10 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Kees Cook, Andy Lutomirski, Andy Lutomirski, Alexei Starovoitov,
	LSM List, James Morris, Jann Horn, Peter Zijlstra,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API

On Mon, 30 Sep 2019 18:22:28 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> tracefs is a file system, so clearly file based acls are much better fit
> for all tracefs operations.
> But that is not the case for ftrace overall.
> bpf_trace_printk() calls trace_printk() that dumps into trace pipe.
> Technically it's ftrace operation, but it cannot be controlled by tracefs
> and by file permissions. That's the motivation to guard bpf_trace_printk()
> usage from bpf program with CAP_TRACING.

BTW, I'd rather have bpf use an event that records a string than using
trace printk itself.

Perhaps something like "bpf_print" event? That could be defined like:

TRACE_EVENT(bpf_print,
	TP_PROTO(const char *msg),
	TP_ARGS(msg),
	TP_STRUCT__entry(
		__string(msg, msg)
	),
	TP_fast_assign(
		__assign_str(msg, msg)
	),
	TP_printk("msg=%s", __get_str(msg))
);

And then you can just format the string from the bpf_trace_printk()
into msg, and then have:

	trace_bpf_print(msg);

The user could then just enable the trace event from the file system. I
could also work on making instances work like /tmp does (with the
sticky bit) in creation. That way people with write access to the
instances directory, can make their own buffers that they can use (and
others can't access).


> 
> Both 'trace' and 'trace_pipe' have quirky side effects.
> Like opening 'trace' file will make all parallel trace_printk() to be ignored.
> While reading 'trace_pipe' file will clear it.
> The point is that traditional 'read' and 'write' ACLs don't map as-is
> to tracefs, so I would be careful categorizing things into
> confidentiality vs integrity only based on access type.

What exactly is the bpf_trace_printk() used for? I may have other ideas
that can help.

-- Steve

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-10-01 22:10                       ` Steven Rostedt
@ 2019-10-01 22:18                         ` Alexei Starovoitov
  2019-10-01 22:47                           ` Steven Rostedt
  0 siblings, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-10-01 22:18 UTC (permalink / raw)
  To: Steven Rostedt, Alexei Starovoitov
  Cc: Kees Cook, Andy Lutomirski, Andy Lutomirski, Alexei Starovoitov,
	LSM List, James Morris, Jann Horn, Peter Zijlstra,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Network Development, bpf, Kernel Team, Linux API

On 10/1/19 3:10 PM, Steven Rostedt wrote:
> On Mon, 30 Sep 2019 18:22:28 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
>> tracefs is a file system, so clearly file based acls are much better fit
>> for all tracefs operations.
>> But that is not the case for ftrace overall.
>> bpf_trace_printk() calls trace_printk() that dumps into trace pipe.
>> Technically it's ftrace operation, but it cannot be controlled by tracefs
>> and by file permissions. That's the motivation to guard bpf_trace_printk()
>> usage from bpf program with CAP_TRACING.
> 
> BTW, I'd rather have bpf use an event that records a string than using
> trace printk itself.
> 
> Perhaps something like "bpf_print" event? That could be defined like:
> 
> TRACE_EVENT(bpf_print,
> 	TP_PROTO(const char *msg),
> 	TP_ARGS(msg),
> 	TP_STRUCT__entry(
> 		__string(msg, msg)
> 	),
> 	TP_fast_assign(
> 		__assign_str(msg, msg)
> 	),
> 	TP_printk("msg=%s", __get_str(msg))
> );
> 
> And then you can just format the string from the bpf_trace_printk()
> into msg, and then have:
> 
> 	trace_bpf_print(msg);

It's an interesting idea, but I don't think it can work.
Please see bpf_trace_printk implementation in kernel/trace/bpf_trace.c
It's a lot more than string printing.

> The user could then just enable the trace event from the file system. I
> could also work on making instances work like /tmp does (with the
> sticky bit) in creation. That way people with write access to the
> instances directory, can make their own buffers that they can use (and
> others can't access).

We tried instances in bcc in the past and eventually removed all the 
support. The overhead of instances is too high to be usable.

> 
> 
>>
>> Both 'trace' and 'trace_pipe' have quirky side effects.
>> Like opening 'trace' file will make all parallel trace_printk() to be ignored.
>> While reading 'trace_pipe' file will clear it.
> >> The point is that traditional 'read' and 'write' ACLs don't map as-is
>> to tracefs, so I would be careful categorizing things into
>> confidentiality vs integrity only based on access type.
> 
> What exactly is the bpf_trace_printk() used for? I may have other ideas
> that can help.

It's debugging of bpf programs. Same is what printk() is used for
by kernel developers.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-10-01 22:18                         ` Alexei Starovoitov
@ 2019-10-01 22:47                           ` Steven Rostedt
  2019-10-02 17:18                             ` Alexei Starovoitov
  0 siblings, 1 reply; 46+ messages in thread
From: Steven Rostedt @ 2019-10-01 22:47 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, Kees Cook, Andy Lutomirski, Andy Lutomirski,
	Alexei Starovoitov, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, David S. Miller,
	Daniel Borkmann, Network Development, bpf, Kernel Team,
	Linux API

On Tue, 1 Oct 2019 22:18:18 +0000
Alexei Starovoitov <ast@fb.com> wrote:

> > And then you can just format the string from the bpf_trace_printk()
> > into msg, and then have:
> > 
> > 	trace_bpf_print(msg);  
> 
> It's an interesting idea, but I don't think it can work.
> Please see bpf_trace_printk implementation in kernel/trace/bpf_trace.c
> It's a lot more than string printing.

Well, trace_printk() is just string printing. I was thinking that the
bpf_trace_printk() could just use a vsnprintf() into a temporary buffer
(like trace_printk() does), and then call the trace event to write it
out.

> 
> > The user could then just enable the trace event from the file system. I
> > could also work on making instances work like /tmp does (with the
> > sticky bit) in creation. That way people with write access to the
> > instances directory, can make their own buffers that they can use (and
> > others can't access).  
> 
> We tried instances in bcc in the past and eventually removed all the 
> support. The overhead of instances is too high to be usable.

What overhead? An ftrace instance should not have any more overhead than
the root one does (it's the same code). Or are you talking about memory
overhead?

> 
> > 
> >   
> >>
> >> Both 'trace' and 'trace_pipe' have quirky side effects.
> >> Like opening 'trace' file will make all parallel trace_printk() to be ignored.
> >> While reading 'trace_pipe' file will clear it.
> >> The point that traditional 'read' and 'write' ACLs don't map as-is
> >> to tracefs, so I would be careful categorizing things into
> >> confidentiality vs integrity only based on access type.  
> > 
> > What exactly is the bpf_trace_printk() used for? I may have other ideas
> > that can help.  
> 
> It's debugging of bpf programs. Same is what printk() is used for
> by kernel developers.
> 

How is it extracted? Just read from the trace or trace_pipe file?

-- Steve

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-10-01 22:47                           ` Steven Rostedt
@ 2019-10-02 17:18                             ` Alexei Starovoitov
  2019-10-02 23:00                               ` Steven Rostedt
  0 siblings, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-10-02 17:18 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Alexei Starovoitov, Kees Cook, Andy Lutomirski, Andy Lutomirski,
	Alexei Starovoitov, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, David S. Miller,
	Daniel Borkmann, Network Development, bpf, Kernel Team,
	Linux API

On 10/1/19 3:47 PM, Steven Rostedt wrote:
> On Tue, 1 Oct 2019 22:18:18 +0000
> Alexei Starovoitov <ast@fb.com> wrote:
> 
>>> And then you can just format the string from the bpf_trace_printk()
>>> into msg, and then have:
>>>
>>> 	trace_bpf_print(msg);
>>
>> It's an interesting idea, but I don't think it can work.
>> Please see bpf_trace_printk implementation in kernel/trace/bpf_trace.c
>> It's a lot more than string printing.
> 
> Well, trace_printk() is just string printing. I was thinking that the
> bpf_trace_printk() could just use a vsnprintf() into a temporary buffer
> (like trace_printk() does), and then call the trace event to write it
> out.

are you proposing to replicate get_trace_buf() functionality
into bpf_trace_printk?
So print into temp string buffer is done twice?
I'm not excited about such hack.
And what's the goal? so that trace_bpf_print(string_msg);
can go through _run-time_ check whether that particular trace event
was allowed in tracefs ?
That's not how file system acls are typically designed.
The permission check is at open(). Not at write().
If I understood you correctly you're proposing to check permissions
at bpf program run-time which is no good.

bpf_trace_printk() already has one small buffer for
probe_kernel_read-ing an unknown string to pass into %s.
That's not ftrace. That's core tracing. That aspect is covered by 
CAP_TRACING as well.


>>
>>> The user could then just enable the trace event from the file system. I
>>> could also work on making instances work like /tmp does (with the
>>> sticky bit) in creation. That way people with write access to the
>>> instances directory, can make their own buffers that they can use (and
>>> others can't access).
>>
>> We tried instances in bcc in the past and eventually removed all the
>> support. The overhead of instances is too high to be usable.
> 
> What overhead? An ftrace instance should not have any more overhead than
> the root one does (it's the same code). Or are you talking about memory
> overhead?

Yes. Memory overhead. Human users doing cat/echo into tracefs won't be
creating many instances, so that's the only practical usage of them.

> 
>>
>>>
>>>    
>>>>
>>>> Both 'trace' and 'trace_pipe' have quirky side effects.
>>>> Like opening 'trace' file will make all parallel trace_printk() to be ignored.
>>>> While reading 'trace_pipe' file will clear it.
>>>> The point that traditional 'read' and 'write' ACLs don't map as-is
>>>> to tracefs, so I would be careful categorizing things into
>>>> confidentiality vs integrity only based on access type.
>>>
>>> What exactly is the bpf_trace_printk() used for? I may have other ideas
>>> that can help.
>>
>> It's debugging of bpf programs. Same is what printk() is used for
>> by kernel developers.
>>
> 
> How is it extracted? Just read from the trace or trace_pipe file?

yep. Just like kernel devs look at dmesg when they sprinkle printk.
btw, if you can fix 'trace' file issue that stops all trace_printk
while 'trace' file is open that would be great.
Some users have been bitten by this behavior. We even documented it.

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-10-02 17:18                             ` Alexei Starovoitov
@ 2019-10-02 23:00                               ` Steven Rostedt
  2019-10-03 16:18                                 ` trace_printk issue. Was: " Alexei Starovoitov
  0 siblings, 1 reply; 46+ messages in thread
From: Steven Rostedt @ 2019-10-02 23:00 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, Kees Cook, Andy Lutomirski, Andy Lutomirski,
	Alexei Starovoitov, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, David S. Miller,
	Daniel Borkmann, Network Development, bpf, Kernel Team,
	Linux API

On Wed, 2 Oct 2019 17:18:21 +0000
Alexei Starovoitov <ast@fb.com> wrote:

> >> It's an interesting idea, but I don't think it can work.
> >> Please see bpf_trace_printk implementation in kernel/trace/bpf_trace.c
> >> It's a lot more than string printing.  
> > 
> > Well, trace_printk() is just string printing. I was thinking that the
> > bpf_trace_printk() could just use a vsnprintf() into a temporary buffer
> > (like trace_printk() does), and then call the trace event to write it
> > out.  
> 
> are you proposing to replicate get_trace_buf() functionality
> into bpf_trace_printk?

No, do you need bpf_trace_printk() to run in all contexts?
trace_printk() does the get_trace_buf() dance so that it can be called
without locks and from any context including NMIs.

> So print into temp string buffer is done twice?
> I'm not excited about such hack.
> And what's the goal? so that trace_bpf_print(string_msg);
> can go through _run-time_ check whether that particular trace event
> was allowed in tracefs ?

No, just to use a standard event instead of hacking into
trace_printk().

> That's not how file system acls are typically designed.
> The permission check is at open(). Not at write().
> If I understood you correctly you're proposing to check permissions
> at bpf program run-time which is no good.
> 
> bpf_trace_printk() already has one small buffer for
> probe_kernel_read-ing an unknown string to pass into %s.
> That's not ftrace. That's core tracing. That aspect is covered by 
> CAP_TRACING as well.

Then use that buffer.

> 
> 
> >>  
> >>> The user could then just enable the trace event from the file system. I
> >>> could also work on making instances work like /tmp does (with the
> >>> sticky bit) in creation. That way people with write access to the
> >>> instances directory, can make their own buffers that they can use (and
> >>> others can't access).  
> >>
> >> We tried instances in bcc in the past and eventually removed all the
> >> support. The overhead of instances is too high to be usable.  
> > 
> > What overhead? An ftrace instance should not have any more overhead than
> > the root one does (it's the same code). Or are you talking about memory
> > overhead?  
> 
> Yes. Memory overhead. Human users doing cat/echo into tracefs won't be
> creating many instances, so that's the only practical usage of them.

If it's a real event, it can go into any of the ftrace buffers (top
level or instance), but it gives you the choice.

> 
> >   
> >>  
> >>>
> >>>      
> >>>>
> >>>> Both 'trace' and 'trace_pipe' have quirky side effects.
> >>>> Like opening 'trace' file will make all parallel trace_printk() to be ignored.
> >>>> While reading 'trace_pipe' file will clear it.
> >>>> The point that traditional 'read' and 'write' ACLs don't map as-is
> >>>> to tracefs, so I would be careful categorizing things into
> >>>> confidentiality vs integrity only based on access type.  
> >>>
> >>> What exactly is the bpf_trace_printk() used for? I may have other ideas
> >>> that can help.  
> >>
> >> It's debugging of bpf programs. Same is what printk() is used for
> >> by kernel developers.
> >>  
> > 
> > How is it extracted? Just read from the trace or trace_pipe file?  
> 
> yep. Just like kernel devs look at dmesg when they sprinkle printk.
> btw, if you can fix 'trace' file issue that stops all trace_printk
> while 'trace' file is open that would be great.
> Some users have been bitten by this behavior. We even documented it.

The behavior is documented as well in the ftrace documentation. That's
why we suggest the trace_pipe redirected into a file so that you don't
lose data (unless the writer goes too fast). If you prefer a producer
consumer where you lose newer events (like perf does), you can turn off
overwrite mode, and it will drop events when the buffer is full (see
options/overwrite).

-- Steve

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-09-30 18:31                   ` Kees Cook
  2019-10-01  1:22                     ` Alexei Starovoitov
@ 2019-10-03  6:12                     ` Masami Hiramatsu
  2019-10-03 16:20                       ` Alexei Starovoitov
  1 sibling, 1 reply; 46+ messages in thread
From: Masami Hiramatsu @ 2019-10-03  6:12 UTC (permalink / raw)
  To: Kees Cook
  Cc: Steven Rostedt, Alexei Starovoitov, Andy Lutomirski,
	Andy Lutomirski, Alexei Starovoitov, LSM List, James Morris,
	Jann Horn, Peter Zijlstra, Masami Hiramatsu, David S. Miller,
	Daniel Borkmann, Network Development, bpf, kernel-team,
	Linux API

On Mon, 30 Sep 2019 11:31:29 -0700
Kees Cook <keescook@chromium.org> wrote:

> On Sat, Sep 28, 2019 at 07:37:27PM -0400, Steven Rostedt wrote:
> > On Wed, 28 Aug 2019 21:07:24 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > > > 
> > > > This won’t make me much more comfortable, since CAP_BPF lets it do an ever-growing set of nasty things. I’d much rather one or both of two things happen:
> > > > 
> > > > 1. Give it CAP_TRACING only. It can leak my data, but it’s rather hard for it to crash my laptop, lose data, or cause other shenanigans.
> > > > 
> > > > 2. Improve it a bit so all the privileged ops are wrapped by capset().
> > > > 
> > > > Does this make sense?  I’m a security person on occasion. I find
> > > > vulnerabilities and exploit them deliberately and I break things by
> > > > accident on a regular basis. In my considered opinion, CAP_TRACING
> > > > alone, even extended to cover part of BPF as I’ve described, is
> > > > decently safe. Getting root with just CAP_TRACING will be decently
> > > > challenging, especially if I don’t get to read things like sshd’s
> > > > memory, and improvements to mitigate even that could be added.  I
> > > > am quite confident that attacks starting with CAP_TRACING will have
> > > > clear audit signatures if auditing is on.  I am also confident that
> > > > CAP_BPF *will* allow DoS and likely privilege escalation, and this
> > > > will only get more likely as BPF gets more widely used. And, if
> > > > BPF-based auditing ever becomes a thing, writing to the audit
> > > > daemon’s maps will be a great way to cover one’s tracks.  
> > > 
> > > CAP_TRACING, as I'm proposing it, will allow full tracefs access.
> > > I think Steven and Masami prefer that as well.
> > > That includes kprobe with probe_kernel_read.
> > > That also means mini-DoS by installing kprobes everywhere or running
> > > too much ftrace.
> > 
> > I was talking with Kees at Plumbers about this, and we were talking
> > about just using simple file permissions. I started playing with some
> > patches to allow the tracefs be visible but by default it would only be
> > visible by root.
> > 
> >  rwx------
> > 
> > Then a start up script (or perhaps mount options) could change the
> > group owner, and change this to:
> > 
> >  rwxrwx---
> > 
> > Where anyone in the group assigned (say "tracing") gets full access to
> > the file system.

Does it for "all" files under tracefs?

> > 
> > The more I was playing with this, the less I see the need for
> > CAP_TRACING for ftrace and reading the format files.
> 
> Nice! Thanks for playing with this. I like it because it gives us a way
> to push policy into userspace (group membership, etc), and provides a
> clean way (hopefully) do separate "read" (kernel memory confidentiality)
> from "write" (kernel memory integrity), which wouldn't have been possible
> with a single new CAP_...

 From the confidentiality point of view, if tracefs exposes traced data,
it might include in-kernel pointers and symbols, but the user still can't
see /proc/kallsyms. This means we still have several different confidentiality
levels for each interface.

Anyway, adding a tracefs mount option for allowing a user group to access
event format data will be a good idea. But even so, I think we still
need the CAP_TRACING for allowing control of intrusive tracing, like kprobes
and bpf etc. (Or, do we keep those for CAP_SYS_ADMIN??)

BTW, should we request CAP_SYS_PTRACE for ftrace uprobe interface too?
It might break any user-space program (including init) if a user puts a
probe on a wrong address (e.g. a non-instruction boundary on x86).

Thank you,

-- 
Masami Hiramatsu <mhiramat@kernel.org>

^ permalink raw reply	[flat|nested] 46+ messages in thread

* trace_printk issue. Was: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-10-02 23:00                               ` Steven Rostedt
@ 2019-10-03 16:18                                 ` Alexei Starovoitov
  2019-10-03 16:41                                   ` Steven Rostedt
  0 siblings, 1 reply; 46+ messages in thread
From: Alexei Starovoitov @ 2019-10-03 16:18 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: Alexei Starovoitov, Kees Cook, Andy Lutomirski, Andy Lutomirski,
	Alexei Starovoitov, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, David S. Miller,
	Daniel Borkmann, Network Development, bpf, Kernel Team,
	Linux API

On Wed, Oct 02, 2019 at 07:00:27PM -0400, Steven Rostedt wrote:
> > >>>>
> > >>>> Both 'trace' and 'trace_pipe' have quirky side effects.
> > >>>> Like opening 'trace' file will make all parallel trace_printk() to be ignored.
> > >>>> While reading 'trace_pipe' file will clear it.
> > >>>> The point that traditional 'read' and 'write' ACLs don't map as-is
> > >>>> to tracefs, so I would be careful categorizing things into
> > >>>> confidentiality vs integrity only based on access type.  
> > >>>
> > >>> What exactly is the bpf_trace_printk() used for? I may have other ideas
> > >>> that can help.  
> > >>
> > >> It's debugging of bpf programs. Same is what printk() is used for
> > >> by kernel developers.
> > >>  
> > > 
> > > How is it extracted? Just read from the trace or trace_pipe file?  
> > 
> > yep. Just like kernel devs look at dmesg when they sprinkle printk.
> > btw, if you can fix 'trace' file issue that stops all trace_printk
> > while 'trace' file is open that would be great.
> > Some users have been bitten by this behavior. We even documented it.
> 
> The behavior is documented as well in the ftrace documentation. That's
> why we suggest the trace_pipe redirected into a file so that you don't
> lose data (unless the writer goes too fast). If you prefer a producer
> consumer where you lose newer events (like perf does), you can turn off
> overwrite mode, and it will drop events when the buffer is full (see
> options/overwrite).

I think dropping last events is just as bad. Is there a mode to overwrite old
and keep the last N (like perf does) ?
That aside having 'trace' file open should NOT drop trace_printks.
My point that bpf_trace_printk is just as important to bpf developers as
printk to kernel developers.
Imagine kernel developer losing their printk-s only because they typed
"dmesg" in another terminal?
It's completely unexpected and breaks developer trust in debugging mechanism.
Peter Wu brought this issue to my attention in
commit 55c33dfbeb83 ("bpf: clarify when bpf_trace_printk discards lines").
And later sent similar doc fix to ftrace.rst.
To be honest if I knew of this trace_printk quirk I would not have picked it
as a debugging mechanism for bpf.
I urge you to fix it.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-10-03  6:12                     ` Masami Hiramatsu
@ 2019-10-03 16:20                       ` Alexei Starovoitov
  0 siblings, 0 replies; 46+ messages in thread
From: Alexei Starovoitov @ 2019-10-03 16:20 UTC (permalink / raw)
  To: Masami Hiramatsu
  Cc: Kees Cook, Steven Rostedt, Andy Lutomirski, Andy Lutomirski,
	Alexei Starovoitov, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, David S. Miller, Daniel Borkmann,
	Network Development, bpf, kernel-team, Linux API

On Thu, Oct 03, 2019 at 03:12:04PM +0900, Masami Hiramatsu wrote:
> On Mon, 30 Sep 2019 11:31:29 -0700
> Kees Cook <keescook@chromium.org> wrote:
> 
> > On Sat, Sep 28, 2019 at 07:37:27PM -0400, Steven Rostedt wrote:
> > > On Wed, 28 Aug 2019 21:07:24 -0700
> > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > > > > 
> > > > > This won’t make me much more comfortable, since CAP_BPF lets it do an ever-growing set of nasty things. I’d much rather one or both of two things happen:
> > > > > 
> > > > > 1. Give it CAP_TRACING only. It can leak my data, but it’s rather hard for it to crash my laptop, lose data, or cause other shenanigans.
> > > > > 
> > > > > 2. Improve it a bit so all the privileged ops are wrapped by capset().
> > > > > 
> > > > > Does this make sense?  I’m a security person on occasion. I find
> > > > > vulnerabilities and exploit them deliberately and I break things by
> > > > > accident on a regular basis. In my considered opinion, CAP_TRACING
> > > > > alone, even extended to cover part of BPF as I’ve described, is
> > > > > decently safe. Getting root with just CAP_TRACING will be decently
> > > > > challenging, especially if I don’t get to read things like sshd’s
> > > > > memory, and improvements to mitigate even that could be added.  I
> > > > > am quite confident that attacks starting with CAP_TRACING will have
> > > > > clear audit signatures if auditing is on.  I am also confident that
> > > > > CAP_BPF *will* allow DoS and likely privilege escalation, and this
> > > > > will only get more likely as BPF gets more widely used. And, if
> > > > > BPF-based auditing ever becomes a thing, writing to the audit
> > > > > daemon’s maps will be a great way to cover one’s tracks.  
> > > > 
> > > > CAP_TRACING, as I'm proposing it, will allow full tracefs access.
> > > > I think Steven and Masami prefer that as well.
> > > > That includes kprobe with probe_kernel_read.
> > > > That also means mini-DoS by installing kprobes everywhere or running
> > > > too much ftrace.
> > > 
> > > I was talking with Kees at Plumbers about this, and we were talking
> > > about just using simple file permissions. I started playing with some
> > > patches to allow the tracefs be visible but by default it would only be
> > > visible by root.
> > > 
> > >  rwx------
> > > 
> > > Then a start up script (or perhaps mount options) could change the
> > > group owner, and change this to:
> > > 
> > >  rwxrwx---
> > > 
> > > Where anyone in the group assigned (say "tracing") gets full access to
> > > the file system.
> 
> Does it for "all" files under tracefs?
> 
> > > 
> > > The more I was playing with this, the less I see the need for
> > > CAP_TRACING for ftrace and reading the format files.
> > 
> > Nice! Thanks for playing with this. I like it because it gives us a way
> > to push policy into userspace (group membership, etc), and provides a
> > clean way (hopefully) do separate "read" (kernel memory confidentiality)
> > from "write" (kernel memory integrity), which wouldn't have been possible
> > with a single new CAP_...
> 
>  From the confidentiality point of view, if tracefs exposes traced data,
> it might include in-kernel pointer and symbols, but the user still can't
> see /proc/kallsyms. This means we still have several different confidentiality
> for each interface.
> 
> Anyway, adding a tracefs mount option for allowing a user group to access
> event format data will be a good idea. But even though, I  think we still
> need the CAP_TRACING for allowing control of intrusive tracing, like kprobes
> and bpf etc. (Or, do we keep those for CAP_SYS_ADMIN??)

No doubt. This thread is only about tracefs wanting to do its own fs based controls.


^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: trace_printk issue. Was: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-10-03 16:18                                 ` trace_printk issue. Was: " Alexei Starovoitov
@ 2019-10-03 16:41                                   ` Steven Rostedt
  2019-10-04 19:56                                     ` Alexei Starovoitov
  0 siblings, 1 reply; 46+ messages in thread
From: Steven Rostedt @ 2019-10-03 16:41 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Alexei Starovoitov, Kees Cook, Andy Lutomirski, Andy Lutomirski,
	Alexei Starovoitov, LSM List, James Morris, Jann Horn,
	Peter Zijlstra, Masami Hiramatsu, David S. Miller,
	Daniel Borkmann, Network Development, bpf, Kernel Team,
	Linux API

On Thu, 3 Oct 2019 09:18:40 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> I think dropping last events is just as bad. Is there a mode to overwrite old
> and keep the last N (like perf does) ?

Well, it drops it by pages. Thus you should always have the last page
of events.

> Peter Wu brought this issue to my attention in
> commit 55c33dfbeb83 ("bpf: clarify when bpf_trace_printk discards lines").
> And later sent similar doc fix to ftrace.rst.

It was documented there, he just elaborated on it more:

        This file holds the output of the trace in a human
        readable format (described below). Note, tracing is temporarily
-       disabled while this file is being read (opened).
+       disabled when the file is open for reading. Once all readers
+       are closed, tracing is re-enabled.


> To be honest if I knew of this trace_printk quirk I would not have picked it
> as a debugging mechanism for bpf.
> I urge you to fix it.

It's not a trivial fix by far.

Note, trying to read the trace file without disabling the writes to it,
will most likely make reading it when function tracing enabled totally
garbage, as the buffer will most likely be filled for every read event.
That is, each read event will not be related to the next event that is
read, making it very confusing.

Although, I may be able to make it work per page. That way you get at
least a page worth of events.

Now, I could also make it where you have to stop tracing to read the
trace file. That is, if you try to open the trace files while the
buffer is active, it will error -EBUSY. Forcing you to stop tracing to
read it, otherwise you would need to read the trace_pipe. At least this
way you will not get surprised that events were dropped.

-- Steve

^ permalink raw reply	[flat|nested] 46+ messages in thread

* Re: trace_printk issue. Was: [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF
  2019-10-03 16:41                                   ` Steven Rostedt
@ 2019-10-04 19:56                                     ` Alexei Starovoitov
  0 siblings, 0 replies; 46+ messages in thread
From: Alexei Starovoitov @ 2019-10-04 19:56 UTC (permalink / raw)
  To: Steven Rostedt, Alexei Starovoitov
  Cc: Kees Cook, Andy Lutomirski, Andy Lutomirski, Alexei Starovoitov,
	LSM List, James Morris, Jann Horn, Peter Zijlstra,
	Masami Hiramatsu, David S. Miller, Daniel Borkmann,
	Network Development, bpf, Kernel Team, Linux API

On 10/3/19 9:41 AM, Steven Rostedt wrote:
> On Thu, 3 Oct 2019 09:18:40 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
>> I think dropping last events is just as bad. Is there a mode to overwrite old
>> and keep the last N (like perf does) ?
> 
> Well, it drops it by pages. Thus you should always have the last page
> of events.
> 
>> Peter Wu brought this issue to my attention in
>> commit 55c33dfbeb83 ("bpf: clarify when bpf_trace_printk discards lines").
>> And later sent similar doc fix to ftrace.rst.
> 
> It was documented there, he just elaborated on it more:
> 
>          This file holds the output of the trace in a human
>          readable format (described below). Note, tracing is temporarily
> -       disabled while this file is being read (opened).
> +       disabled when the file is open for reading. Once all readers
> +       are closed, tracing is re-enabled.
> 
> 
>> To be honest if I knew of this trace_printk quirk I would not have picked it
>> as a debugging mechanism for bpf.
>> I urge you to fix it.
> 
> It's not a trivial fix by far.
> 
> Note, trying to read the trace file without disabling the writes to it,
> will most likely make reading it when function tracing enabled totally
> garbage, as the buffer will most likely be filled for every read event.
> That is, each read event will not be related to the next event that is
> read, making it very confusing.
> 
> Although, I may be able to make it work per page. That way you get at
> least a page worth of events.

That sounds much better. As long as trace_printk() doesn't disappear
into the void, it's good.

But the part I'm not getting is why trace_printk() has
if (tracing_disabled) goto out;

It's a concurrent ring buffer. One cpu can write into it while
another reading. What is the point disabling trace_printk in particular?
Each __buffer_unlock_commit is an atomic ring buffer update,
so read from trace will either see it as a whole or won't see it.
'trace_pipe' clearly works fine. Why 'trace' is any different?
Just keep tracing enabled and keep reading it until the end of current
ring buffer. Whether open() determines current or it reads until next=0
is an implementation detail.

^ permalink raw reply	[flat|nested] 46+ messages in thread

end of thread, other threads:[~2019-10-04 19:57 UTC | newest]

Thread overview: 46+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-08-27 20:52 [PATCH bpf-next] bpf, capabilities: introduce CAP_BPF Alexei Starovoitov
2019-08-27 23:01 ` Andy Lutomirski
2019-08-27 23:21   ` Steven Rostedt
2019-08-27 23:34     ` Andy Lutomirski
2019-08-28  0:44       ` Steven Rostedt
2019-08-28  1:12         ` Andy Lutomirski
2019-08-28  2:22           ` Steven Rostedt
2019-08-28  0:38     ` Alexei Starovoitov
2019-08-28  3:30     ` Masami Hiramatsu
2019-08-28  4:47       ` Alexei Starovoitov
2019-08-28  0:34   ` Alexei Starovoitov
2019-08-28  0:55     ` Andy Lutomirski
2019-08-28  2:00       ` Andy Lutomirski
2019-08-28  4:49         ` Alexei Starovoitov
2019-08-28  6:20           ` Andy Lutomirski
2019-08-28 23:38             ` Alexei Starovoitov
2019-08-29  0:58               ` Andy Lutomirski
2019-08-28  4:43       ` Alexei Starovoitov
2019-08-28  6:12         ` Andy Lutomirski
2019-08-28 22:55           ` Alexei Starovoitov
2019-08-29  0:45             ` Andy Lutomirski
2019-08-29  0:53               ` Andy Lutomirski
2019-08-29  4:07               ` Alexei Starovoitov
2019-09-28 23:37                 ` Steven Rostedt
2019-09-30 18:31                   ` Kees Cook
2019-10-01  1:22                     ` Alexei Starovoitov
2019-10-01 22:10                       ` Steven Rostedt
2019-10-01 22:18                         ` Alexei Starovoitov
2019-10-01 22:47                           ` Steven Rostedt
2019-10-02 17:18                             ` Alexei Starovoitov
2019-10-02 23:00                               ` Steven Rostedt
2019-10-03 16:18                                 ` trace_printk issue. Was: " Alexei Starovoitov
2019-10-03 16:41                                   ` Steven Rostedt
2019-10-04 19:56                                     ` Alexei Starovoitov
2019-10-03  6:12                     ` Masami Hiramatsu
2019-10-03 16:20                       ` Alexei Starovoitov
2019-08-28  7:14   ` Peter Zijlstra
2019-08-28 22:08     ` Alexei Starovoitov
2019-08-29 13:34       ` Steven Rostedt
2019-08-29 15:43         ` Andy Lutomirski
2019-08-29 17:23           ` Alexei Starovoitov
2019-08-29 17:36             ` Andy Lutomirski
2019-08-29 17:49             ` Steven Rostedt
2019-08-29 17:19         ` Alexei Starovoitov
2019-08-29 17:47           ` Steven Rostedt
2019-08-28 10:38 ` kbuild test robot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).