bpf.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH bpf-next RFC 0/3] bpf: dynamic map-value config layout via BTF
@ 2020-05-29 15:59 Jesper Dangaard Brouer
  2020-05-29 15:59 ` [PATCH bpf-next RFC 1/3] bpf: move struct bpf_devmap_val out of UAPI Jesper Dangaard Brouer
                   ` (2 more replies)
  0 siblings, 3 replies; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2020-05-29 15:59 UTC (permalink / raw)
  To: David Ahern, bpf, netdev
  Cc: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	Andrii Nakryiko

This patchset is based on top of David Ahern's work V3: "bpf: Add support
for XDP programs in DEVMAP entries"[1]. The purpose is to address the kABI
interfaces that are introduced in that patchset, before it is released.

[1] https://lore.kernel.org/netdev/20200529052057.69378-1-dsahern@kernel.org

The map-values of these special maps are evolving into a configuration
interface between userspace and kernel. The approach in [1] is to expose a
binary struct layout that can only be grown at the end of the struct.

With the BTF technology it is possible to create an interface that is much
more dynamic and flexible.

---

Jesper Dangaard Brouer (3):
      bpf: move struct bpf_devmap_val out of UAPI
      bpf: devmap dynamic map-value storage area based on BTF
      samples/bpf: change xdp_fwd to use new BTF config interface


 include/uapi/linux/bpf.h                           |    9 -
 kernel/bpf/devmap.c                                |  227 +++++++++++++++++---
 samples/bpf/xdp_fwd.h                              |   24 ++
 samples/bpf/xdp_fwd_kern.c                         |    5 
 samples/bpf/xdp_fwd_user.c                         |    9 +
 tools/include/uapi/linux/bpf.h                     |    9 -
 .../selftests/bpf/prog_tests/xdp_devmap_attach.c   |   18 +-
 .../bpf/progs/test_xdp_with_devmap_helpers.c       |   10 +
 8 files changed, 252 insertions(+), 59 deletions(-)
 create mode 100644 samples/bpf/xdp_fwd.h

--


^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH bpf-next RFC 1/3] bpf: move struct bpf_devmap_val out of UAPI
  2020-05-29 15:59 [PATCH bpf-next RFC 0/3] bpf: dynamic map-value config layout via BTF Jesper Dangaard Brouer
@ 2020-05-29 15:59 ` Jesper Dangaard Brouer
  2020-05-29 16:06   ` David Ahern
  2020-05-29 15:59 ` [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF Jesper Dangaard Brouer
  2020-05-29 15:59 ` [PATCH bpf-next RFC 3/3] samples/bpf: change xdp_fwd to use new BTF config interface Jesper Dangaard Brouer
  2 siblings, 1 reply; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2020-05-29 15:59 UTC (permalink / raw)
  To: David Ahern, bpf, netdev
  Cc: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	Andrii Nakryiko

The struct bpf_devmap_val doesn't belong in uapi/linux/bpf.h, because this
is a struct that BPF-progs can define themselves, and can provide different
sizes to the kernel.

While moving the struct, change the union to be a named union, with the name
"bpf_prog". This makes it easier to identify with BTF in the next patch.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 include/uapi/linux/bpf.h                           |    9 -------
 kernel/bpf/devmap.c                                |   25 ++++++++++++++------
 tools/include/uapi/linux/bpf.h                     |    9 -------
 .../selftests/bpf/prog_tests/xdp_devmap_attach.c   |   18 ++++++++++----
 .../bpf/progs/test_xdp_with_devmap_helpers.c       |   10 +++++++-
 5 files changed, 39 insertions(+), 32 deletions(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 61ae81bf67de..970c44ecc472 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -3628,15 +3628,6 @@ struct xdp_md {
 	__u32 egress_ifindex;  /* txq->dev->ifindex */
 };
 
-/* DEVMAP values */
-struct bpf_devmap_val {
-	__u32 ifindex;   /* device index */
-	union {
-		int   bpf_prog_fd;  /* prog fd on map write */
-		__u32 bpf_prog_id;  /* prog id on map read */
-	};
-};
-
 enum sk_action {
 	SK_DROP = 0,
 	SK_PASS,
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index defdd22caa4b..4ab67b2d8159 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -60,6 +60,15 @@ struct xdp_dev_bulk_queue {
 	unsigned int count;
 };
 
+/* DEVMAP values */
+struct bpf_devmap_val {
+	__u32 ifindex;   /* device index */
+	union {
+		int   fd;  /* prog fd on map write */
+		__u32 id;  /* prog id on map read */
+	} bpf_prog;
+};
+
 struct bpf_dtab_netdev {
 	struct net_device *dev; /* must be first member, due to tracepoint */
 	struct hlist_node index_hlist;
@@ -117,7 +126,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
 	 */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
 	    (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
-	     valsize != offsetofend(struct bpf_devmap_val, bpf_prog_fd)) ||
+	     valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
 	    attr->map_flags & ~DEV_CREATE_FLAG_MASK)
 		return -EINVAL;
 
@@ -609,8 +618,8 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
 	if (!dev->dev)
 		goto err_out;
 
-	if (val->bpf_prog_fd >= 0) {
-		prog = bpf_prog_get_type_dev(val->bpf_prog_fd,
+	if (val->bpf_prog.fd >= 0) {
+		prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
 					     BPF_PROG_TYPE_XDP, false);
 		if (IS_ERR(prog))
 			goto err_put_dev;
@@ -622,10 +631,10 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
 	dev->dtab = dtab;
 	if (prog) {
 		dev->xdp_prog = prog;
-		dev->val.bpf_prog_id = prog->aux->id;
+		dev->val.bpf_prog.id = prog->aux->id;
 	} else {
 		dev->xdp_prog = NULL;
-		dev->val.bpf_prog_id = 0;
+		dev->val.bpf_prog.id = 0;
 	}
 	dev->val.ifindex = val->ifindex;
 
@@ -643,7 +652,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
 				 void *key, void *value, u64 map_flags)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct bpf_devmap_val val = { .bpf_prog_fd = -1 };
+	struct bpf_devmap_val val = { .bpf_prog.fd = -1 };
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 i = *(u32 *)key;
 
@@ -660,7 +669,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
 	if (!val.ifindex) {
 		dev = NULL;
 		/* can not specify fd if ifindex is 0 */
-		if (val.bpf_prog_fd != -1)
+		if (val.bpf_prog.fd != -1)
 			return -EINVAL;
 	} else {
 		dev = __dev_map_alloc_node(net, dtab, &val, i);
@@ -690,7 +699,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
 				     void *key, void *value, u64 map_flags)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	struct bpf_devmap_val val = { .bpf_prog_fd = -1 };
+	struct bpf_devmap_val val = { .bpf_prog.fd = -1 };
 	struct bpf_dtab_netdev *dev, *old_dev;
 	u32 idx = *(u32 *)key;
 	unsigned long flags;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 61ae81bf67de..970c44ecc472 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -3628,15 +3628,6 @@ struct xdp_md {
 	__u32 egress_ifindex;  /* txq->dev->ifindex */
 };
 
-/* DEVMAP values */
-struct bpf_devmap_val {
-	__u32 ifindex;   /* device index */
-	union {
-		int   bpf_prog_fd;  /* prog fd on map write */
-		__u32 bpf_prog_id;  /* prog id on map read */
-	};
-};
-
 enum sk_action {
 	SK_DROP = 0,
 	SK_PASS,
diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c
index caeea19f2772..b72a696fc6a8 100644
--- a/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c
+++ b/tools/testing/selftests/bpf/prog_tests/xdp_devmap_attach.c
@@ -8,11 +8,19 @@
 
 #define IFINDEX_LO 1
 
+struct _bpf_devmap_val {
+	__u32 ifindex;   /* device index */
+	union {
+		int   fd;  /* prog fd on map write */
+		__u32 id;  /* prog id on map read */
+	} bpf_prog;
+};
+
 void test_xdp_with_devmap_helpers(void)
 {
 	struct test_xdp_with_devmap_helpers *skel;
 	struct bpf_prog_info info = {};
-	struct bpf_devmap_val val = {
+	struct _bpf_devmap_val val = {
 		.ifindex = IFINDEX_LO,
 	};
 	__u32 id, len = sizeof(info);
@@ -40,15 +48,15 @@ void test_xdp_with_devmap_helpers(void)
 	if (CHECK_FAIL(err))
 		goto out_close;
 
-	val.bpf_prog_fd = dm_fd;
+	val.bpf_prog.fd = dm_fd;
 	err = bpf_map_update_elem(map_fd, &idx, &val, 0);
 	CHECK(err, "Add program to devmap entry",
 	      "err %d errno %d\n", err, errno);
 
 	err = bpf_map_lookup_elem(map_fd, &id, &val);
 	CHECK(err, "Read devmap entry", "err %d errno %d\n", err, errno);
-	CHECK(info.id != val.bpf_prog_id, "Expected program id in devmap entry",
-	      "expected %u read %u\n", info.id, val.bpf_prog_id);
+	CHECK(info.id != val.bpf_prog.id, "Expected program id in devmap entry",
+	      "expected %u read %u\n", info.id, val.bpf_prog.id);
 
 	/* can not attach BPF_XDP_DEVMAP program to a device */
 	err = bpf_set_link_xdp_fd(IFINDEX_LO, dm_fd, XDP_FLAGS_SKB_MODE);
@@ -56,7 +64,7 @@ void test_xdp_with_devmap_helpers(void)
 	      "should have failed\n");
 
 	val.ifindex = 1;
-	val.bpf_prog_fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
+	val.bpf_prog.fd = bpf_program__fd(skel->progs.xdp_dummy_prog);
 	err = bpf_map_update_elem(map_fd, &idx, &val, 0);
 	CHECK(err == 0, "Add non-BPF_XDP_DEVMAP program to devmap entry",
 	      "should have failed\n");
diff --git a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c
index 645f7f415857..126f6de514a1 100644
--- a/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c
+++ b/tools/testing/selftests/bpf/progs/test_xdp_with_devmap_helpers.c
@@ -3,10 +3,18 @@
 #include <linux/bpf.h>
 #include <bpf/bpf_helpers.h>
 
+struct _bpf_devmap_val {
+	__u32 ifindex;   /* device index */
+	union {
+		int   fd;  /* prog fd on map write */
+		__u32 id;  /* prog id on map read */
+	} bpf_prog;
+};
+
 struct {
 	__uint(type, BPF_MAP_TYPE_DEVMAP);
 	__uint(key_size, sizeof(__u32));
-	__uint(value_size, sizeof(struct bpf_devmap_val));
+	__uint(value_size, sizeof(struct _bpf_devmap_val));
 	__uint(max_entries, 4);
 } dm_ports SEC(".maps");
 



^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-05-29 15:59 [PATCH bpf-next RFC 0/3] bpf: dynamic map-value config layout via BTF Jesper Dangaard Brouer
  2020-05-29 15:59 ` [PATCH bpf-next RFC 1/3] bpf: move struct bpf_devmap_val out of UAPI Jesper Dangaard Brouer
@ 2020-05-29 15:59 ` Jesper Dangaard Brouer
  2020-05-29 16:39   ` Toke Høiland-Jørgensen
                     ` (2 more replies)
  2020-05-29 15:59 ` [PATCH bpf-next RFC 3/3] samples/bpf: change xdp_fwd to use new BTF config interface Jesper Dangaard Brouer
  2 siblings, 3 replies; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2020-05-29 15:59 UTC (permalink / raw)
  To: David Ahern, bpf, netdev
  Cc: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	Andrii Nakryiko

The devmap map-value can be read from BPF-prog side, and could be used for a
storage area per device. This could e.g. contain info on headers that need
to be added when a packet egresses this device.

This patch adds a dynamic storage member to struct bpf_devmap_val. More
importantly, struct bpf_devmap_val is made dynamic by leveraging and
requiring BTF for struct sizes above 4 bytes. The only mandatory struct
member is 'ifindex' with a fixed offset of zero.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 185 insertions(+), 31 deletions(-)

diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 4ab67b2d8159..9cf2dadcc0fe 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,6 +48,7 @@
 #include <net/xdp.h>
 #include <linux/filter.h>
 #include <trace/events/xdp.h>
+#include <linux/btf.h>
 
 #define DEV_CREATE_FLAG_MASK \
 	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue {
 	unsigned int count;
 };
 
-/* DEVMAP values */
+/* DEVMAP map-value layout.
+ *
+ * The struct data-layout of map-value is a configuration interface.
+ * BPF-prog side have read-only access to this memory.
+ *
+ * The layout might be different than below, because some struct members are
+ * optional.  This is made dynamic by requiring userspace provides an BTF
+ * description of the struct layout, when creating the BPF-map. Struct names
+ * are important and part of API, as BTF use these names to identify members.
+ */
 struct bpf_devmap_val {
-	__u32 ifindex;   /* device index */
+	__u32 ifindex;   /* device index - mandatory */
 	union {
 		int   fd;  /* prog fd on map write */
 		__u32 id;  /* prog id on map read */
 	} bpf_prog;
+	struct {
+		/* This 'storage' member is meant as a dynamically sized area,
+		 * that BPF developer can redefine.  As other members are added
+		 * overtime, this area can shrink, as size can be regained by
+		 * not using members above. Add new members above this struct.
+		 */
+		unsigned char data[24];
+	} storage;
 };
 
 struct bpf_dtab_netdev {
@@ -79,10 +97,18 @@ struct bpf_dtab_netdev {
 	struct bpf_devmap_val val;
 };
 
+struct bpf_devmap_val_cfg {
+	struct {
+		int ifindex;
+		int bpf_prog;
+	} btf_offset;
+};
+
 struct bpf_dtab {
 	struct bpf_map map;
 	struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
 	struct list_head list;
+	struct bpf_devmap_val_cfg cfg;
 
 	/* these are only used for DEVMAP_HASH type maps */
 	struct hlist_head *dev_index_head;
@@ -116,20 +142,24 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
 
 static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
 {
-	__u32 valsize = attr->value_size;
 	u64 cost = 0;
 	int err;
 
-	/* check sanity of attributes. 2 value sizes supported:
-	 * 4 bytes: ifindex
-	 * 8 bytes: ifindex + prog fd
-	 */
+	/* Value contents validated in dev_map_check_btf */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
-	     valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
+	    attr->value_size > sizeof(struct bpf_devmap_val) ||
 	    attr->map_flags & ~DEV_CREATE_FLAG_MASK)
 		return -EINVAL;
 
+	/* Enforce BTF for userspace, unless dealing with legacy kABI */
+	if (attr->value_size != 4 &&
+	    (!attr->btf_key_type_id || !attr->btf_value_type_id))
+		return -EOPNOTSUPP;
+
+	/* Mark BTF offset's as invalid */
+	dtab->cfg.btf_offset.ifindex  = -1;
+	dtab->cfg.btf_offset.bpf_prog = -1;
+
 	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
 	 * verifier prevents writes from the BPF side
 	 */
@@ -199,6 +229,119 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	return &dtab->map;
 }
 
+struct expect {
+	u8 btf_kind;
+	bool mandatory;
+	int bit_offset;
+	int size;
+	const char *name;
+};
+
+static int btf_find_expect_layout_offset(const struct btf *btf,
+					 const struct btf_type *value_type,
+					 const struct expect *layout)
+{
+	const struct btf_member *member;
+	u32 i, off = -ENOENT;
+
+	for_each_member(i, value_type, member) {
+		const struct btf_type *member_type;
+		const char *member_name;
+
+		member_type = btf_type_skip_modifiers(btf, member->type, NULL);
+		if (BTF_INFO_KIND(member_type->info) != layout->btf_kind) {
+			continue;
+		}
+
+		member_name = btf_name_by_offset(btf, member->name_off);
+		if (!member_name)
+			return -EINVAL;
+
+		if (strcmp(layout->name, member_name))
+			continue;
+
+		if (layout->size > 0 &&  // btf_type_has_size(member_type) &&
+		    member_type->size != layout->size)
+			continue;
+
+		off = btf_member_bit_offset(value_type, member);
+		if (layout->bit_offset > 0 &&
+		    layout->bit_offset != off) {
+			off = -ENOENT;
+			continue;
+		}
+
+		return off;
+	}
+	return off;
+}
+
+/* Expected BTF layout that match struct bpf_devmap_val */
+static const struct expect layout[] = {
+	{BTF_KIND_INT,		true,	 0,	 4,	"ifindex"},
+	{BTF_KIND_UNION,	false,	32,	 4,	"bpf_prog"},
+	{BTF_KIND_STRUCT,	false,	-1,	-1,	"storage"}
+};
+
+static int dev_map_check_btf(const struct bpf_map *map,
+			     const struct btf *btf,
+			     const struct btf_type *key_type,
+			     const struct btf_type *value_type)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	u32 found_members_cnt = 0;
+	u32 int_data;
+	int off;
+	u32 i;
+
+	/* Validate KEY type and size */
+	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
+		return -EOPNOTSUPP;
+
+	int_data = *(u32 *)(key_type + 1);
+	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
+		return -EOPNOTSUPP;
+
+	/* Validate VALUE have layout that match/map-to struct bpf_devmap_val
+	 * - With a flexible size of member 'storage'.
+	 */
+
+	if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
+		return -EOPNOTSUPP;
+
+	/* Struct/union members in BTF must not exceed (max) expected members */
+	if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
+			return -E2BIG;
+
+	for (i = 0; i < ARRAY_SIZE(layout); i++) {
+		off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
+
+		if (off < 0 && layout[i].mandatory)
+			return -EUCLEAN;
+
+		if (off >= 0)
+			found_members_cnt++;
+
+		/* Transfer layout config to map */
+		switch (i) {
+		case 0:
+			dtab->cfg.btf_offset.ifindex = off;
+			break;
+		case 1:
+			dtab->cfg.btf_offset.bpf_prog = off;
+			break;
+		default:
+			break;
+		}
+	}
+
+	/* Detect if BTF/vlen have members that were not found */
+	if (btf_type_vlen(value_type) > found_members_cnt)
+		return -E2BIG;
+
+	return 0;
+}
+
 static void dev_map_free(struct bpf_map *map)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -601,42 +744,53 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
 	return ret;
 }
 
+static inline bool map_value_has_bpf_prog(const struct bpf_dtab *dtab)
+{
+	return dtab->cfg.btf_offset.bpf_prog >= 0;
+}
+
 static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
-						    struct bpf_dtab *dtab,
+						    struct bpf_map *map,
 						    struct bpf_devmap_val *val,
 						    unsigned int idx)
 {
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
 	struct bpf_prog *prog = NULL;
 	struct bpf_dtab_netdev *dev;
 
-	dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
+	dev = kzalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
 			   dtab->map.numa_node);
 	if (!dev)
 		return ERR_PTR(-ENOMEM);
 
+	/* Member: ifindex is mandatory, both BTF and kABI */
 	dev->dev = dev_get_by_index(net, val->ifindex);
 	if (!dev->dev)
 		goto err_out;
 
-	if (val->bpf_prog.fd >= 0) {
-		prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
-					     BPF_PROG_TYPE_XDP, false);
-		if (IS_ERR(prog))
-			goto err_put_dev;
-		if (prog->expected_attach_type != BPF_XDP_DEVMAP)
-			goto err_put_prog;
+	/* Member: bpf_prog union is optional, but have fixed offset if exist */
+	if (map_value_has_bpf_prog(dtab)) {
+		if (val->bpf_prog.fd >= 0) {
+			prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
+						     BPF_PROG_TYPE_XDP, false);
+			if (IS_ERR(prog))
+				goto err_put_dev;
+			if (prog->expected_attach_type != BPF_XDP_DEVMAP)
+				goto err_put_prog;
+		}
+		if (prog) {
+			dev->xdp_prog = prog;
+			val->bpf_prog.id = prog->aux->id;
+		} else {
+			dev->xdp_prog = NULL;
+			val->bpf_prog.id = 0;
+		}
 	}
-
 	dev->idx = idx;
 	dev->dtab = dtab;
-	if (prog) {
-		dev->xdp_prog = prog;
-		dev->val.bpf_prog.id = prog->aux->id;
-	} else {
-		dev->xdp_prog = NULL;
-		dev->val.bpf_prog.id = 0;
-	}
-	dev->val.ifindex = val->ifindex;
+
+	/* After adjustment copy map value to get storage area */
+	memcpy(&dev->val, val, map->value_size);
 
 	return dev;
 err_put_prog:
@@ -672,7 +826,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
 		if (val.bpf_prog.fd != -1)
 			return -EINVAL;
 	} else {
-		dev = __dev_map_alloc_node(net, dtab, &val, i);
+		dev = __dev_map_alloc_node(net, map, &val, i);
 		if (IS_ERR(dev))
 			return PTR_ERR(dev);
 	}
@@ -717,7 +871,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
 	if (old_dev && (map_flags & BPF_NOEXIST))
 		goto out_err;
 
-	dev = __dev_map_alloc_node(net, dtab, &val, idx);
+	dev = __dev_map_alloc_node(net, map, &val, idx);
 	if (IS_ERR(dev)) {
 		err = PTR_ERR(dev);
 		goto out_err;
@@ -762,7 +916,7 @@ const struct bpf_map_ops dev_map_ops = {
 	.map_lookup_elem = dev_map_lookup_elem,
 	.map_update_elem = dev_map_update_elem,
 	.map_delete_elem = dev_map_delete_elem,
-	.map_check_btf = map_check_no_btf,
+	.map_check_btf = dev_map_check_btf,
 };
 
 const struct bpf_map_ops dev_map_hash_ops = {
@@ -772,7 +926,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
 	.map_lookup_elem = dev_map_hash_lookup_elem,
 	.map_update_elem = dev_map_hash_update_elem,
 	.map_delete_elem = dev_map_hash_delete_elem,
-	.map_check_btf = map_check_no_btf,
+	.map_check_btf = dev_map_check_btf,
 };
 
 static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,



^ permalink raw reply	[flat|nested] 17+ messages in thread

* [PATCH bpf-next RFC 3/3] samples/bpf: change xdp_fwd to use new BTF config interface
  2020-05-29 15:59 [PATCH bpf-next RFC 0/3] bpf: dynamic map-value config layout via BTF Jesper Dangaard Brouer
  2020-05-29 15:59 ` [PATCH bpf-next RFC 1/3] bpf: move struct bpf_devmap_val out of UAPI Jesper Dangaard Brouer
  2020-05-29 15:59 ` [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF Jesper Dangaard Brouer
@ 2020-05-29 15:59 ` Jesper Dangaard Brouer
  2 siblings, 0 replies; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2020-05-29 15:59 UTC (permalink / raw)
  To: David Ahern, bpf, netdev
  Cc: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	Andrii Nakryiko

This enables BTF for the samples/bpf/xdp_fwd program, and demonstrates
how the BPF developer can define their own version of bpf_devmap_val.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
 samples/bpf/xdp_fwd.h      |   24 ++++++++++++++++++++++++
 samples/bpf/xdp_fwd_kern.c |    5 +++--
 samples/bpf/xdp_fwd_user.c |    9 ++++++++-
 3 files changed, 35 insertions(+), 3 deletions(-)
 create mode 100644 samples/bpf/xdp_fwd.h

diff --git a/samples/bpf/xdp_fwd.h b/samples/bpf/xdp_fwd.h
new file mode 100644
index 000000000000..8abb2a417117
--- /dev/null
+++ b/samples/bpf/xdp_fwd.h
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef _SAMPLES_BPF_XDP_FWD_H
+#define _SAMPLES_BPF_XDP_FWD_H
+
+#define ENABLE_BPF_PROG 1
+
+/* Notice XDP prog can redefine this struct, which through BTF affect
+ * what kernel-side config options are available.
+ */
+struct bpf_devmap_val {
+	__u32 ifindex;   /* device index - mandatory */
+#ifdef ENABLE_BPF_PROG
+	union {
+		int   fd;  /* prog fd on map write */
+		__u32 id;  /* prog id on map read */
+	} bpf_prog;
+#endif
+	struct {
+		unsigned char data2[2];
+		__u16 vlan_hdr;
+	} storage;
+};
+
+#endif /* _SAMPLES_BPF_XDP_FWD_H */
diff --git a/samples/bpf/xdp_fwd_kern.c b/samples/bpf/xdp_fwd_kern.c
index 54c099cbd639..bb1ee44dcd60 100644
--- a/samples/bpf/xdp_fwd_kern.c
+++ b/samples/bpf/xdp_fwd_kern.c
@@ -20,13 +20,14 @@
 #include <linux/ipv6.h>
 
 #include <bpf/bpf_helpers.h>
+#include "xdp_fwd.h"
 
 #define IPV6_FLOWINFO_MASK              cpu_to_be32(0x0FFFFFFF)
 
 struct {
 	__uint(type, BPF_MAP_TYPE_DEVMAP);
-	__uint(key_size, sizeof(int));
-	__uint(value_size, sizeof(int));
+	__type(key, u32);
+	__type(value, struct bpf_devmap_val);
 	__uint(max_entries, 64);
 } xdp_tx_ports SEC(".maps");
 
diff --git a/samples/bpf/xdp_fwd_user.c b/samples/bpf/xdp_fwd_user.c
index 74a4583d0d86..4ddf70bcedde 100644
--- a/samples/bpf/xdp_fwd_user.c
+++ b/samples/bpf/xdp_fwd_user.c
@@ -26,13 +26,20 @@
 
 #include <bpf/libbpf.h>
 #include <bpf/bpf.h>
+#include "xdp_fwd.h"
 
 static __u32 xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
 
 static int do_attach(int idx, int prog_fd, int map_fd, const char *name)
 {
+	struct bpf_devmap_val val = { 0 };
 	int err;
 
+	val.ifindex = idx;
+#ifdef ENABLE_BPF_PROG
+	val.bpf_prog.fd = - 1;
+#endif
+
 	err = bpf_set_link_xdp_fd(idx, prog_fd, xdp_flags);
 	if (err < 0) {
 		printf("ERROR: failed to attach program to %s\n", name);
@@ -40,7 +47,7 @@ static int do_attach(int idx, int prog_fd, int map_fd, const char *name)
 	}
 
 	/* Adding ifindex as a possible egress TX port */
-	err = bpf_map_update_elem(map_fd, &idx, &idx, 0);
+	err = bpf_map_update_elem(map_fd, &idx, &val, 0);
 	if (err)
 		printf("ERROR: failed using device %s as TX-port\n", name);
 



^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 1/3] bpf: move struct bpf_devmap_val out of UAPI
  2020-05-29 15:59 ` [PATCH bpf-next RFC 1/3] bpf: move struct bpf_devmap_val out of UAPI Jesper Dangaard Brouer
@ 2020-05-29 16:06   ` David Ahern
  2020-05-29 16:28     ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 17+ messages in thread
From: David Ahern @ 2020-05-29 16:06 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, bpf, netdev
  Cc: Daniel Borkmann, Alexei Starovoitov, Andrii Nakryiko

On 5/29/20 9:59 AM, Jesper Dangaard Brouer wrote:
> @@ -60,6 +60,15 @@ struct xdp_dev_bulk_queue {
>  	unsigned int count;
>  };
>  
> +/* DEVMAP values */
> +struct bpf_devmap_val {
> +	__u32 ifindex;   /* device index */
> +	union {
> +		int   fd;  /* prog fd on map write */
> +		__u32 id;  /* prog id on map read */
> +	} bpf_prog;
> +};
> +

I can pick up this name change for v4.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 1/3] bpf: move struct bpf_devmap_val out of UAPI
  2020-05-29 16:06   ` David Ahern
@ 2020-05-29 16:28     ` Jesper Dangaard Brouer
  0 siblings, 0 replies; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2020-05-29 16:28 UTC (permalink / raw)
  To: David Ahern
  Cc: bpf, netdev, Daniel Borkmann, Alexei Starovoitov,
	Andrii Nakryiko, brouer

On Fri, 29 May 2020 10:06:25 -0600
David Ahern <dsahern@gmail.com> wrote:

> On 5/29/20 9:59 AM, Jesper Dangaard Brouer wrote:
> > @@ -60,6 +60,15 @@ struct xdp_dev_bulk_queue {
> >  	unsigned int count;
> >  };
> >  
> > +/* DEVMAP values */
> > +struct bpf_devmap_val {
> > +	__u32 ifindex;   /* device index */
> > +	union {
> > +		int   fd;  /* prog fd on map write */
> > +		__u32 id;  /* prog id on map read */
> > +	} bpf_prog;
> > +};
> > +  
> 
> I can pick up this name change for v4.

Great - I would appreciate that, as this will make my patchset compatible
with yours :-)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-05-29 15:59 ` [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF Jesper Dangaard Brouer
@ 2020-05-29 16:39   ` Toke Høiland-Jørgensen
  2020-06-02  8:59     ` Jesper Dangaard Brouer
  2020-05-30  7:19   ` Andrii Nakryiko
  2020-06-01 21:30   ` Alexei Starovoitov
  2 siblings, 1 reply; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2020-05-29 16:39 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, David Ahern, bpf, netdev
  Cc: Jesper Dangaard Brouer, Daniel Borkmann, Alexei Starovoitov,
	Andrii Nakryiko

Jesper Dangaard Brouer <brouer@redhat.com> writes:

> The devmap map-value can be read from BPF-prog side, and could be used for a
> storage area per device. This could e.g. contain info on headers that need
> to be added when packet egress this device.
>
> This patchset adds a dynamic storage member to struct bpf_devmap_val. More
> importantly the struct bpf_devmap_val is made dynamic via leveraging and
> requiring BTF for struct sizes above 4. The only mandatory struct member is
> 'ifindex' with a fixed offset of zero.
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 185 insertions(+), 31 deletions(-)
>
> diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> index 4ab67b2d8159..9cf2dadcc0fe 100644
> --- a/kernel/bpf/devmap.c
> +++ b/kernel/bpf/devmap.c
> @@ -48,6 +48,7 @@
>  #include <net/xdp.h>
>  #include <linux/filter.h>
>  #include <trace/events/xdp.h>
> +#include <linux/btf.h>
>  
>  #define DEV_CREATE_FLAG_MASK \
>  	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
> @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue {
>  	unsigned int count;
>  };
>  
> -/* DEVMAP values */
> +/* DEVMAP map-value layout.
> + *
> + * The struct data-layout of map-value is a configuration interface.
> + * BPF-prog side have read-only access to this memory.
> + *
> + * The layout might be different than below, because some struct members are
> + * optional.  This is made dynamic by requiring userspace provides an BTF
> + * description of the struct layout, when creating the BPF-map. Struct names
> + * are important and part of API, as BTF use these names to identify members.
> + */
>  struct bpf_devmap_val {
> -	__u32 ifindex;   /* device index */
> +	__u32 ifindex;   /* device index - mandatory */
>  	union {
>  		int   fd;  /* prog fd on map write */
>  		__u32 id;  /* prog id on map read */
>  	} bpf_prog;
> +	struct {
> +		/* This 'storage' member is meant as a dynamically sized area,
> +		 * that BPF developer can redefine.  As other members are added
> +		 * overtime, this area can shrink, as size can be regained by
> +		 * not using members above. Add new members above this struct.
> +		 */
> +		unsigned char data[24];
> +	} storage;

Why is this needed? Userspace already passes in the value_size, so why
can't the kernel just use the BTF to pick out the values it cares about
and let the rest be up to userspace?

>  };
>  
>  struct bpf_dtab_netdev {
> @@ -79,10 +97,18 @@ struct bpf_dtab_netdev {
>  	struct bpf_devmap_val val;
>  };
>  
> +struct bpf_devmap_val_cfg {
> +	struct {
> +		int ifindex;
> +		int bpf_prog;
> +	} btf_offset;
> +};
> +
>  struct bpf_dtab {
>  	struct bpf_map map;
>  	struct bpf_dtab_netdev **netdev_map; /* DEVMAP type only */
>  	struct list_head list;
> +	struct bpf_devmap_val_cfg cfg;
>  
>  	/* these are only used for DEVMAP_HASH type maps */
>  	struct hlist_head *dev_index_head;
> @@ -116,20 +142,24 @@ static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
>  
>  static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
>  {
> -	__u32 valsize = attr->value_size;
>  	u64 cost = 0;
>  	int err;
>  
> -	/* check sanity of attributes. 2 value sizes supported:
> -	 * 4 bytes: ifindex
> -	 * 8 bytes: ifindex + prog fd
> -	 */
> +	/* Value contents validated in dev_map_check_btf */
>  	if (attr->max_entries == 0 || attr->key_size != 4 ||
> -	    (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
> -	     valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
> +	    attr->value_size > sizeof(struct bpf_devmap_val) ||
>  	    attr->map_flags & ~DEV_CREATE_FLAG_MASK)
>  		return -EINVAL;
>  
> +	/* Enforce BTF for userspace, unless dealing with legacy kABI */
> +	if (attr->value_size != 4 &&
> +	    (!attr->btf_key_type_id || !attr->btf_value_type_id))
> +		return -EOPNOTSUPP;
> +
> +	/* Mark BTF offset's as invalid */
> +	dtab->cfg.btf_offset.ifindex  = -1;
> +	dtab->cfg.btf_offset.bpf_prog = -1;
> +
>  	/* Lookup returns a pointer straight to dev->ifindex, so make sure the
>  	 * verifier prevents writes from the BPF side
>  	 */
> @@ -199,6 +229,119 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
>  	return &dtab->map;
>  }
>  
> +struct expect {
> +	u8 btf_kind;
> +	bool mandatory;
> +	int bit_offset;
> +	int size;
> +	const char *name;
> +};
> +
> +static int btf_find_expect_layout_offset(const struct btf *btf,
> +					 const struct btf_type *value_type,
> +					 const struct expect *layout)
> +{
> +	const struct btf_member *member;
> +	u32 i, off = -ENOENT;
> +
> +	for_each_member(i, value_type, member) {
> +		const struct btf_type *member_type;
> +		const char *member_name;
> +
> +		member_type = btf_type_skip_modifiers(btf, member->type, NULL);
> +		if (BTF_INFO_KIND(member_type->info) != layout->btf_kind) {
> +			continue;
> +		}
> +
> +		member_name = btf_name_by_offset(btf, member->name_off);
> +		if (!member_name)
> +			return -EINVAL;
> +
> +		if (strcmp(layout->name, member_name))
> +			continue;
> +
> +		if (layout->size > 0 &&  // btf_type_has_size(member_type) &&
> +		    member_type->size != layout->size)
> +			continue;
> +
> +		off = btf_member_bit_offset(value_type, member);
> +		if (layout->bit_offset > 0 &&
> +		    layout->bit_offset != off) {
> +			off = -ENOENT;
> +			continue;
> +		}

Won't this enforced offset cause problems for extensibility? Say we
introduce a third struct member that the kernel understands, e.g.
another u32 with expect->bit_offset=64. That would mean you can no
longer skip members, since that would make any subsequent offset tests
fail? Or am I misunderstanding how this is supposed to work?

> +
> +		return off;
> +	}
> +	return off;
> +}
> +
> +/* Expected BTF layout that match struct bpf_devmap_val */
> +static const struct expect layout[] = {
> +	{BTF_KIND_INT,		true,	 0,	 4,	"ifindex"},
> +	{BTF_KIND_UNION,	false,	32,	 4,	"bpf_prog"},
> +	{BTF_KIND_STRUCT,	false,	-1,	-1,	"storage"}
> +};
> +
> +static int dev_map_check_btf(const struct bpf_map *map,
> +			     const struct btf *btf,
> +			     const struct btf_type *key_type,
> +			     const struct btf_type *value_type)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +	u32 found_members_cnt = 0;
> +	u32 int_data;
> +	int off;
> +	u32 i;
> +
> +	/* Validate KEY type and size */
> +	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> +		return -EOPNOTSUPP;
> +
> +	int_data = *(u32 *)(key_type + 1);
> +	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> +		return -EOPNOTSUPP;
> +
> +	/* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> +	 * - With a flexible size of member 'storage'.
> +	 */
> +
> +	if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> +		return -EOPNOTSUPP;
> +
> +	/* Struct/union members in BTF must not exceed (max) expected members */
> +	if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> +			return -E2BIG;
> +
> +	for (i = 0; i < ARRAY_SIZE(layout); i++) {
> +		off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> +
> +		if (off < 0 && layout[i].mandatory)
> +			return -EUCLEAN;
> +
> +		if (off >= 0)
> +			found_members_cnt++;
> +
> +		/* Transfer layout config to map */
> +		switch (i) {
> +		case 0:
> +			dtab->cfg.btf_offset.ifindex = off;
> +			break;
> +		case 1:
> +			dtab->cfg.btf_offset.bpf_prog = off;
> +			break;
> +		default:
> +			break;
> +		}
> +	}
> +
> +	/* Detect if BTF/vlen have members that were not found */
> +	if (btf_type_vlen(value_type) > found_members_cnt)
> +		return -E2BIG;
> +
> +	return 0;
> +}
> +
>  static void dev_map_free(struct bpf_map *map)
>  {
>  	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> @@ -601,42 +744,53 @@ static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
>  	return ret;
>  }
>  
> +static inline bool map_value_has_bpf_prog(const struct bpf_dtab *dtab)
> +{
> +	return dtab->cfg.btf_offset.bpf_prog >= 0;
> +}
> +
>  static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
> -						    struct bpf_dtab *dtab,
> +						    struct bpf_map *map,
>  						    struct bpf_devmap_val *val,
>  						    unsigned int idx)
>  {
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
>  	struct bpf_prog *prog = NULL;
>  	struct bpf_dtab_netdev *dev;
>  
> -	dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
> +	dev = kzalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
>  			   dtab->map.numa_node);
>  	if (!dev)
>  		return ERR_PTR(-ENOMEM);
>  
> +	/* Member: ifindex is mandatory, both BTF and kABI */
>  	dev->dev = dev_get_by_index(net, val->ifindex);
>  	if (!dev->dev)
>  		goto err_out;
>  
> -	if (val->bpf_prog.fd >= 0) {
> -		prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
> -					     BPF_PROG_TYPE_XDP, false);
> -		if (IS_ERR(prog))
> -			goto err_put_dev;
> -		if (prog->expected_attach_type != BPF_XDP_DEVMAP)
> -			goto err_put_prog;
> +	/* Member: bpf_prog union is optional, but have fixed offset if exist */
> +	if (map_value_has_bpf_prog(dtab)) {
> +		if (val->bpf_prog.fd >= 0) {
> +			prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
> +						     BPF_PROG_TYPE_XDP, false);
> +			if (IS_ERR(prog))
> +				goto err_put_dev;
> +			if (prog->expected_attach_type != BPF_XDP_DEVMAP)
> +				goto err_put_prog;
> +		}
> +		if (prog) {
> +			dev->xdp_prog = prog;
> +			val->bpf_prog.id = prog->aux->id;
> +		} else {
> +			dev->xdp_prog = NULL;
> +			val->bpf_prog.id = 0;
> +		}
>  	}
> -
>  	dev->idx = idx;
>  	dev->dtab = dtab;
> -	if (prog) {
> -		dev->xdp_prog = prog;
> -		dev->val.bpf_prog.id = prog->aux->id;
> -	} else {
> -		dev->xdp_prog = NULL;
> -		dev->val.bpf_prog.id = 0;
> -	}
> -	dev->val.ifindex = val->ifindex;
> +
> +	/* After adjustment copy map value to get storage area */
> +	memcpy(&dev->val, val, map->value_size);
>  
>  	return dev;
>  err_put_prog:
> @@ -672,7 +826,7 @@ static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
>  		if (val.bpf_prog.fd != -1)
>  			return -EINVAL;
>  	} else {
> -		dev = __dev_map_alloc_node(net, dtab, &val, i);
> +		dev = __dev_map_alloc_node(net, map, &val, i);
>  		if (IS_ERR(dev))
>  			return PTR_ERR(dev);
>  	}
> @@ -717,7 +871,7 @@ static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
>  	if (old_dev && (map_flags & BPF_NOEXIST))
>  		goto out_err;
>  
> -	dev = __dev_map_alloc_node(net, dtab, &val, idx);
> +	dev = __dev_map_alloc_node(net, map, &val, idx);
>  	if (IS_ERR(dev)) {
>  		err = PTR_ERR(dev);
>  		goto out_err;
> @@ -762,7 +916,7 @@ const struct bpf_map_ops dev_map_ops = {
>  	.map_lookup_elem = dev_map_lookup_elem,
>  	.map_update_elem = dev_map_update_elem,
>  	.map_delete_elem = dev_map_delete_elem,
> -	.map_check_btf = map_check_no_btf,
> +	.map_check_btf = dev_map_check_btf,
>  };
>  
>  const struct bpf_map_ops dev_map_hash_ops = {
> @@ -772,7 +926,7 @@ const struct bpf_map_ops dev_map_hash_ops = {
>  	.map_lookup_elem = dev_map_hash_lookup_elem,
>  	.map_update_elem = dev_map_hash_update_elem,
>  	.map_delete_elem = dev_map_hash_delete_elem,
> -	.map_check_btf = map_check_no_btf,
> +	.map_check_btf = dev_map_check_btf,
>  };
>  
>  static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-05-29 15:59 ` [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF Jesper Dangaard Brouer
  2020-05-29 16:39   ` Toke Høiland-Jørgensen
@ 2020-05-30  7:19   ` Andrii Nakryiko
  2020-05-30 14:36     ` Jesper Dangaard Brouer
  2020-06-01 21:30   ` Alexei Starovoitov
  2 siblings, 1 reply; 17+ messages in thread
From: Andrii Nakryiko @ 2020-05-30  7:19 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David Ahern, bpf, Networking, Daniel Borkmann, Alexei Starovoitov

On Fri, May 29, 2020 at 8:59 AM Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> The devmap map-value can be read from BPF-prog side, and could be used for a
> storage area per device. This could e.g. contain info on headers that need

If BPF program needs a storage area per device, why can't it just use
a separate map or just plain array (both keyed by ifindex) to store
whatever it needs per-device? It's not clear why this flexibility and
complexity is needed from the description above.

> to be added when packet egress this device.
>
> This patchset adds a dynamic storage member to struct bpf_devmap_val. More
> importantly the struct bpf_devmap_val is made dynamic via leveraging and
> requiring BTF for struct sizes above 4. The only mandatory struct member is
> 'ifindex' with a fixed offset of zero.
>
> Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> ---
>  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
>  1 file changed, 185 insertions(+), 31 deletions(-)
>

[...]

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-05-30  7:19   ` Andrii Nakryiko
@ 2020-05-30 14:36     ` Jesper Dangaard Brouer
  0 siblings, 0 replies; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2020-05-30 14:36 UTC (permalink / raw)
  To: Andrii Nakryiko
  Cc: David Ahern, bpf, Networking, Daniel Borkmann,
	Alexei Starovoitov, brouer

On Sat, 30 May 2020 00:19:50 -0700
Andrii Nakryiko <andrii.nakryiko@gmail.com> wrote:

> On Fri, May 29, 2020 at 8:59 AM Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
> >
> > The devmap map-value can be read from BPF-prog side, and could be used for a
> > storage area per device. This could e.g. contain info on headers that need  
> 
> If BPF program needs a storage area per device, why can't it just use
> a separate map or just plain array (both keyed by ifindex) to store
> whatever it needs per-device? It's not clear why this flexibility and
> complexity is needed from the description above.

Sorry, I thought it was obvious: it is for performance reasons and to
reduce the number of maps needed.  We do a lookup in the devmap anyhow,
thus this memory will be cache-hot.  Doing another lookup in a separate
map, which is not guaranteed to be cache-hot, will be wasting cycles.

> > to be added when packet egress this device.
> >
> > This patchset adds a dynamic storage member to struct bpf_devmap_val. More
> > importantly the struct bpf_devmap_val is made dynamic via leveraging and
> > requiring BTF for struct sizes above 4. The only mandatory struct member is
> > 'ifindex' with a fixed offset of zero.
> >
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > ---
> >  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
> >  1 file changed, 185 insertions(+), 31 deletions(-)
> >  
> 
> [...]
> 



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-05-29 15:59 ` [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF Jesper Dangaard Brouer
  2020-05-29 16:39   ` Toke Høiland-Jørgensen
  2020-05-30  7:19   ` Andrii Nakryiko
@ 2020-06-01 21:30   ` Alexei Starovoitov
  2020-06-02  7:00     ` Jesper Dangaard Brouer
  2 siblings, 1 reply; 17+ messages in thread
From: Alexei Starovoitov @ 2020-06-01 21:30 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David Ahern, bpf, netdev, Daniel Borkmann, Andrii Nakryiko

On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote:
> +
> +/* Expected BTF layout that match struct bpf_devmap_val */
> +static const struct expect layout[] = {
> +	{BTF_KIND_INT,		true,	 0,	 4,	"ifindex"},
> +	{BTF_KIND_UNION,	false,	32,	 4,	"bpf_prog"},
> +	{BTF_KIND_STRUCT,	false,	-1,	-1,	"storage"}
> +};
> +
> +static int dev_map_check_btf(const struct bpf_map *map,
> +			     const struct btf *btf,
> +			     const struct btf_type *key_type,
> +			     const struct btf_type *value_type)
> +{
> +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> +	u32 found_members_cnt = 0;
> +	u32 int_data;
> +	int off;
> +	u32 i;
> +
> +	/* Validate KEY type and size */
> +	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> +		return -EOPNOTSUPP;
> +
> +	int_data = *(u32 *)(key_type + 1);
> +	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> +		return -EOPNOTSUPP;
> +
> +	/* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> +	 * - With a flexible size of member 'storage'.
> +	 */
> +
> +	if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> +		return -EOPNOTSUPP;
> +
> +	/* Struct/union members in BTF must not exceed (max) expected members */
> +	if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> +			return -E2BIG;
> +
> +	for (i = 0; i < ARRAY_SIZE(layout); i++) {
> +		off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> +
> +		if (off < 0 && layout[i].mandatory)
> +			return -EUCLEAN;
> +
> +		if (off >= 0)
> +			found_members_cnt++;
> +
> +		/* Transfer layout config to map */
> +		switch (i) {
> +		case 0:
> +			dtab->cfg.btf_offset.ifindex = off;
> +			break;
> +		case 1:
> +			dtab->cfg.btf_offset.bpf_prog = off;
> +			break;
> +		default:
> +			break;
> +		}
> +	}
> +
> +	/* Detect if BTF/vlen have members that were not found */
> +	if (btf_type_vlen(value_type) > found_members_cnt)
> +		return -E2BIG;
> +
> +	return 0;
> +}

This layout validation looks really weird to me.
That layout[] array sort of complements BTF to describe the data,
but describing the layout twice feels like a hack.
I'm with Andrii here. Separate array indexed by ifindex or global array
without map_lookup() can be used with good performance.
I don't think such a devmap extension is necessary.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-06-01 21:30   ` Alexei Starovoitov
@ 2020-06-02  7:00     ` Jesper Dangaard Brouer
  2020-06-02 18:27       ` Alexei Starovoitov
  0 siblings, 1 reply; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2020-06-02  7:00 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Ahern, bpf, netdev, Daniel Borkmann, Andrii Nakryiko,
	brouer, Lorenzo Bianconi

On Mon, 1 Jun 2020 14:30:12 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote:
> > +
> > +/* Expected BTF layout that match struct bpf_devmap_val */
> > +static const struct expect layout[] = {
> > +	{BTF_KIND_INT,		true,	 0,	 4,	"ifindex"},
> > +	{BTF_KIND_UNION,	false,	32,	 4,	"bpf_prog"},
> > +	{BTF_KIND_STRUCT,	false,	-1,	-1,	"storage"}
> > +};
> > +
> > +static int dev_map_check_btf(const struct bpf_map *map,
> > +			     const struct btf *btf,
> > +			     const struct btf_type *key_type,
> > +			     const struct btf_type *value_type)
> > +{
> > +	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> > +	u32 found_members_cnt = 0;
> > +	u32 int_data;
> > +	int off;
> > +	u32 i;
> > +
> > +	/* Validate KEY type and size */
> > +	if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> > +		return -EOPNOTSUPP;
> > +
> > +	int_data = *(u32 *)(key_type + 1);
> > +	if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> > +		return -EOPNOTSUPP;
> > +
> > +	/* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> > +	 * - With a flexible size of member 'storage'.
> > +	 */
> > +
> > +	if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> > +		return -EOPNOTSUPP;
> > +
> > +	/* Struct/union members in BTF must not exceed (max) expected members */
> > +	if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> > +			return -E2BIG;
> > +
> > +	for (i = 0; i < ARRAY_SIZE(layout); i++) {
> > +		off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> > +
> > +		if (off < 0 && layout[i].mandatory)
> > +			return -EUCLEAN;
> > +
> > +		if (off >= 0)
> > +			found_members_cnt++;
> > +
> > +		/* Transfer layout config to map */
> > +		switch (i) {
> > +		case 0:
> > +			dtab->cfg.btf_offset.ifindex = off;
> > +			break;
> > +		case 1:
> > +			dtab->cfg.btf_offset.bpf_prog = off;
> > +			break;
> > +		default:
> > +			break;
> > +		}
> > +	}
> > +
> > +	/* Detect if BTF/vlen have members that were not found */
> > +	if (btf_type_vlen(value_type) > found_members_cnt)
> > +		return -E2BIG;
> > +
> > +	return 0;
> > +}  
> 
> This layout validation looks really weird to me.
> That layout[] array sort of complements BTF to describe the data,
> but double describe of the layout feels like hack.

This is the kind of feedback I'm looking for.  I want to make the
map-value more dynamic.  It seems so old school to keep extending the
map-value with a size and fixed binary layout, when we have BTF
available.  I'm open to input on how to better verify/parse/describe the
expected BTF layout on the kernel-code side.

The patch demonstrates that this is possible, I'm open for changes.
E.g. devmap is now extended with a bpf_prog, but most end-users will
not be using this feature. Today they can use value_size=4 to avoid
using this field. When we extend map-value again, then end-users are
forced into providing 'bpf_prog.fd' if they want to use the newer
options.  In this patch end-users don't need to provide 'bpf_prog' if
they don't use it. Via BTF we can see this struct member can be skipped.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-05-29 16:39   ` Toke Høiland-Jørgensen
@ 2020-06-02  8:59     ` Jesper Dangaard Brouer
  2020-06-02  9:23       ` Toke Høiland-Jørgensen
  0 siblings, 1 reply; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2020-06-02  8:59 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Ahern, bpf, netdev, Daniel Borkmann, Alexei Starovoitov,
	Andrii Nakryiko, brouer

On Fri, 29 May 2020 18:39:40 +0200
Toke Høiland-Jørgensen <toke@redhat.com> wrote:

> Jesper Dangaard Brouer <brouer@redhat.com> writes:
> 
> > The devmap map-value can be read from BPF-prog side, and could be used for a
> > storage area per device. This could e.g. contain info on headers that need
> > to be added when packet egress this device.
> >
> > This patchset adds a dynamic storage member to struct bpf_devmap_val. More
> > importantly the struct bpf_devmap_val is made dynamic via leveraging and
> > requiring BTF for struct sizes above 4. The only mandatory struct member is
> > 'ifindex' with a fixed offset of zero.
> >
> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > ---
> >  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
> >  1 file changed, 185 insertions(+), 31 deletions(-)
> >
> > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> > index 4ab67b2d8159..9cf2dadcc0fe 100644
[...]
> > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue {
> >  	unsigned int count;
> >  };
> >  
> > -/* DEVMAP values */
> > +/* DEVMAP map-value layout.
> > + *
> > + * The struct data-layout of map-value is a configuration interface.
> > + * BPF-prog side have read-only access to this memory.
> > + *
> > + * The layout might be different than below, because some struct members are
> > + * optional.  This is made dynamic by requiring userspace provides an BTF
> > + * description of the struct layout, when creating the BPF-map. Struct names
> > + * are important and part of API, as BTF use these names to identify members.
> > + */
> >  struct bpf_devmap_val {
> > -	__u32 ifindex;   /* device index */
> > +	__u32 ifindex;   /* device index - mandatory */
> >  	union {
> >  		int   fd;  /* prog fd on map write */
> >  		__u32 id;  /* prog id on map read */
> >  	} bpf_prog;
> > +	struct {
> > +		/* This 'storage' member is meant as a dynamically sized area,
> > +		 * that BPF developer can redefine.  As other members are added
> > +		 * overtime, this area can shrink, as size can be regained by
> > +		 * not using members above. Add new members above this struct.
> > +		 */
> > +		unsigned char data[24];
> > +	} storage;  
> 
> Why is this needed? Userspace already passes in the value_size, so why
> can't the kernel just use the BTF to pick out the values it cares about
> and let the rest be up to userspace?

The kernel cannot just ignore unknown struct members, due to forward
compatibility. An older kernel that sees a new struct member, cannot
know what this struct member is used for.  Thus, later I'm rejecting
map creation if I detect members kernel doesn't know about.

This means, that I need to create a named area (e.g. named 'storage')
that users can define their own layout within.

This might be difficult to comprehend for other kernel developers,
because usually we create forward compatibility via walking the binary
struct and then assume that if an unknown area (in end-of-struct)
contains zeros, then it means end-user isn't using that unknown feature.
This doesn't work when the default value, as in this exact case, needs
to be minus-1 to describe "unused" as this is a file descriptor.

Forward compatibility is different here.  If the end-user includes the
member in their BTF description, that means they intend to use it.
Thus, the kernel needs to reject map-create if it sees unknown members.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-06-02  8:59     ` Jesper Dangaard Brouer
@ 2020-06-02  9:23       ` Toke Høiland-Jørgensen
  2020-06-02 10:01         ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 17+ messages in thread
From: Toke Høiland-Jørgensen @ 2020-06-02  9:23 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David Ahern, bpf, netdev, Daniel Borkmann, Alexei Starovoitov,
	Andrii Nakryiko, brouer

Jesper Dangaard Brouer <brouer@redhat.com> writes:

> On Fri, 29 May 2020 18:39:40 +0200
> Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
>> Jesper Dangaard Brouer <brouer@redhat.com> writes:
>> 
>> > The devmap map-value can be read from BPF-prog side, and could be used for a
>> > storage area per device. This could e.g. contain info on headers that need
>> > to be added when packet egress this device.
>> >
>> > This patchset adds a dynamic storage member to struct bpf_devmap_val. More
>> > importantly the struct bpf_devmap_val is made dynamic via leveraging and
>> > requiring BTF for struct sizes above 4. The only mandatory struct member is
>> > 'ifindex' with a fixed offset of zero.
>> >
>> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
>> > ---
>> >  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
>> >  1 file changed, 185 insertions(+), 31 deletions(-)
>> >
>> > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
>> > index 4ab67b2d8159..9cf2dadcc0fe 100644
> [...]
>> > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue {
>> >  	unsigned int count;
>> >  };
>> >  
>> > -/* DEVMAP values */
>> > +/* DEVMAP map-value layout.
>> > + *
>> > + * The struct data-layout of map-value is a configuration interface.
>> > + * BPF-prog side have read-only access to this memory.
>> > + *
>> > + * The layout might be different than below, because some struct members are
>> > + * optional.  This is made dynamic by requiring userspace provides an BTF
>> > + * description of the struct layout, when creating the BPF-map. Struct names
>> > + * are important and part of API, as BTF use these names to identify members.
>> > + */
>> >  struct bpf_devmap_val {
>> > -	__u32 ifindex;   /* device index */
>> > +	__u32 ifindex;   /* device index - mandatory */
>> >  	union {
>> >  		int   fd;  /* prog fd on map write */
>> >  		__u32 id;  /* prog id on map read */
>> >  	} bpf_prog;
>> > +	struct {
>> > +		/* This 'storage' member is meant as a dynamically sized area,
>> > +		 * that BPF developer can redefine.  As other members are added
>> > +		 * overtime, this area can shrink, as size can be regained by
>> > +		 * not using members above. Add new members above this struct.
>> > +		 */
>> > +		unsigned char data[24];
>> > +	} storage;  
>> 
>> Why is this needed? Userspace already passes in the value_size, so why
>> can't the kernel just use the BTF to pick out the values it cares about
>> and let the rest be up to userspace?
>
> The kernel cannot just ignore unknown struct members, due to forward
> compatibility. An older kernel that sees a new struct member, cannot
> know what this struct member is used for.  Thus, later I'm rejecting
> map creation if I detect members kernel doesn't know about.
>
> This means, that I need to create a named area (e.g. named 'storage')
> that users can define their own layout within.
>
> This might be difficult to comprehend for other kernel developers,
> because usually we create forward compatibility via walking the binary
> struct and then assume that if an unknown area (in end-of-struct)
> contains zeros, then it means end-user isn't using that unknown feature.
> This doesn't work when the default value, as in this exact case, need
> to be minus-1 do describe "unused" as this is a file descriptor.
>
> Forward compatibility is different here.  If the end-user include the
> member in their BTF description, that means they intend to use it.
> Thus, kernel need to reject map-create if it sees unknown members.

Ah, right, of course. You could still allow such a "user-defined" member
to be any size userspace likes, though, couldn't you?

-Toke


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-06-02  9:23       ` Toke Høiland-Jørgensen
@ 2020-06-02 10:01         ` Jesper Dangaard Brouer
  0 siblings, 0 replies; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2020-06-02 10:01 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen
  Cc: David Ahern, bpf, netdev, Daniel Borkmann, Alexei Starovoitov,
	Andrii Nakryiko, brouer

On Tue, 02 Jun 2020 11:23:24 +0200
Toke Høiland-Jørgensen <toke@redhat.com> wrote:

> Jesper Dangaard Brouer <brouer@redhat.com> writes:
> 
> > On Fri, 29 May 2020 18:39:40 +0200
> > Toke Høiland-Jørgensen <toke@redhat.com> wrote:
> >  
> >> Jesper Dangaard Brouer <brouer@redhat.com> writes:
> >>   
> >> > The devmap map-value can be read from BPF-prog side, and could be used for a
> >> > storage area per device. This could e.g. contain info on headers that need
> >> > to be added when packet egress this device.
> >> >
> >> > This patchset adds a dynamic storage member to struct bpf_devmap_val. More
> >> > importantly the struct bpf_devmap_val is made dynamic via leveraging and
> >> > requiring BTF for struct sizes above 4. The only mandatory struct member is
> >> > 'ifindex' with a fixed offset of zero.
> >> >
> >> > Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
> >> > ---
> >> >  kernel/bpf/devmap.c |  216 ++++++++++++++++++++++++++++++++++++++++++++-------
> >> >  1 file changed, 185 insertions(+), 31 deletions(-)
> >> >
> >> > diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
> >> > index 4ab67b2d8159..9cf2dadcc0fe 100644  
> > [...]  
> >> > @@ -60,13 +61,30 @@ struct xdp_dev_bulk_queue {
> >> >  	unsigned int count;
> >> >  };
> >> >  
> >> > -/* DEVMAP values */
> >> > +/* DEVMAP map-value layout.
> >> > + *
> >> > + * The struct data-layout of map-value is a configuration interface.
> >> > + * BPF-prog side have read-only access to this memory.
> >> > + *
> >> > + * The layout might be different than below, because some struct members are
> >> > + * optional.  This is made dynamic by requiring userspace provides an BTF
> >> > + * description of the struct layout, when creating the BPF-map. Struct names
> >> > + * are important and part of API, as BTF use these names to identify members.
> >> > + */
> >> >  struct bpf_devmap_val {
> >> > -	__u32 ifindex;   /* device index */
> >> > +	__u32 ifindex;   /* device index - mandatory */
> >> >  	union {
> >> >  		int   fd;  /* prog fd on map write */
> >> >  		__u32 id;  /* prog id on map read */
> >> >  	} bpf_prog;
> >> > +	struct {
> >> > +		/* This 'storage' member is meant as a dynamically sized area,
> >> > +		 * that BPF developer can redefine.  As other members are added
> >> > +		 * overtime, this area can shrink, as size can be regained by
> >> > +		 * not using members above. Add new members above this struct.
> >> > +		 */
> >> > +		unsigned char data[24];
> >> > +	} storage;    
> >> 
> >> Why is this needed? Userspace already passes in the value_size, so why
> >> can't the kernel just use the BTF to pick out the values it cares about
> >> and let the rest be up to userspace?  
> >
> > The kernel cannot just ignore unknown struct members, due to forward
> > compatibility. An older kernel that sees a new struct member, cannot
> > know what this struct member is used for.  Thus, later I'm rejecting
> > map creation if I detect members kernel doesn't know about.
> >
> > This means, that I need to create a named area (e.g. named 'storage')
> > that users can define their own layout within.
> >
> > This might be difficult to comprehend for other kernel developers,
> > because usually we create forward compatibility via walking the binary
> > struct and then assume that if an unknown area (in end-of-struct)
> > contains zeros, then it means end-user isn't using that unknown feature.
> > This doesn't work when the default value, as in this exact case, need
> > to be minus-1 do describe "unused" as this is a file descriptor.
> >
> > Forward compatibility is different here.  If the end-user include the
> > member in their BTF description, that means they intend to use it.
> > Thus, kernel need to reject map-create if it sees unknown members.  
> 
> Ah, right, of course. You could still allow such a "user-defined" member
> to be any size userspace likes, though, couldn't you?

Yes.  In this implementation the "user-defined" member 'storage' does have
a variable size (and can be absent).  Do you mean that I have
limited the total size of the struct to 32 bytes?
(Which is true, and that can also be made dynamic, but I was trying to
limit the scope of the patch.  It is hard enough to wrap one's head around
the fact that the binary struct from userspace is becoming dynamic.)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-06-02  7:00     ` Jesper Dangaard Brouer
@ 2020-06-02 18:27       ` Alexei Starovoitov
  2020-06-03  9:11         ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 17+ messages in thread
From: Alexei Starovoitov @ 2020-06-02 18:27 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David Ahern, bpf, Network Development, Daniel Borkmann,
	Andrii Nakryiko, Lorenzo Bianconi

On Tue, Jun 2, 2020 at 12:00 AM Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
>
> On Mon, 1 Jun 2020 14:30:12 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>
> > On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote:
> > > +
> > > +/* Expected BTF layout that match struct bpf_devmap_val */
> > > +static const struct expect layout[] = {
> > > +   {BTF_KIND_INT,          true,    0,      4,     "ifindex"},
> > > +   {BTF_KIND_UNION,        false,  32,      4,     "bpf_prog"},
> > > +   {BTF_KIND_STRUCT,       false,  -1,     -1,     "storage"}
> > > +};
> > > +
> > > +static int dev_map_check_btf(const struct bpf_map *map,
> > > +                        const struct btf *btf,
> > > +                        const struct btf_type *key_type,
> > > +                        const struct btf_type *value_type)
> > > +{
> > > +   struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> > > +   u32 found_members_cnt = 0;
> > > +   u32 int_data;
> > > +   int off;
> > > +   u32 i;
> > > +
> > > +   /* Validate KEY type and size */
> > > +   if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> > > +           return -EOPNOTSUPP;
> > > +
> > > +   int_data = *(u32 *)(key_type + 1);
> > > +   if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> > > +           return -EOPNOTSUPP;
> > > +
> > > +   /* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> > > +    * - With a flexible size of member 'storage'.
> > > +    */
> > > +
> > > +   if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> > > +           return -EOPNOTSUPP;
> > > +
> > > +   /* Struct/union members in BTF must not exceed (max) expected members */
> > > +   if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> > > +                   return -E2BIG;
> > > +
> > > +   for (i = 0; i < ARRAY_SIZE(layout); i++) {
> > > +           off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> > > +
> > > +           if (off < 0 && layout[i].mandatory)
> > > +                   return -EUCLEAN;
> > > +
> > > +           if (off >= 0)
> > > +                   found_members_cnt++;
> > > +
> > > +           /* Transfer layout config to map */
> > > +           switch (i) {
> > > +           case 0:
> > > +                   dtab->cfg.btf_offset.ifindex = off;
> > > +                   break;
> > > +           case 1:
> > > +                   dtab->cfg.btf_offset.bpf_prog = off;
> > > +                   break;
> > > +           default:
> > > +                   break;
> > > +           }
> > > +   }
> > > +
> > > +   /* Detect if BTF/vlen have members that were not found */
> > > +   if (btf_type_vlen(value_type) > found_members_cnt)
> > > +           return -E2BIG;
> > > +
> > > +   return 0;
> > > +}
> >
> > This layout validation looks really weird to me.
> > That layout[] array sort of complements BTF to describe the data,
> > but double describe of the layout feels like hack.
>
> This is the kind of feedback I'm looking for.  I want to make the
> map-value more dynamic.  It seems so old school to keep extending the
> map-value with a size and fixed binary layout, when we have BTF
> available.  I'm open to input on how to better verify/parse/desc the
> expected BTF layout for kernel-code side.
>
> The patch demonstrates that this is possible, I'm open for changes.
> E.g. devmap is now extended with a bpf_prog, but most end-users will
> not be using this feature. Today they can use value_size=4 to avoid
> using this field. When we extend map-value again, then end-users are
> force into providing 'bpf_prog.fd' if they want to use the newer
> options.  In this patch end-users don't need to provide 'bpf_prog' if
> they don't use it. Via BTF we can see this struct member can be skipped.

I think 'struct bpf_devmap_val' should be in uapi/bpf.h.
That's what it is and it will be extended with new fields at the end
just like all other structs in uapi/bpf.h
I don't think BTF can become a substitute for uapi
where uapi struct has to have all fields defined and backwards supported
by the kernel.
BTF is for flexible structs where fields may disappear.
BTF is there to define a meaning of a binary blob.
'struct bpf_devmap_val' is not such a thing. It's very much known, with
fixed fields and fixed meaning.

^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-06-02 18:27       ` Alexei Starovoitov
@ 2020-06-03  9:11         ` Jesper Dangaard Brouer
  2020-06-03 16:20           ` Alexei Starovoitov
  0 siblings, 1 reply; 17+ messages in thread
From: Jesper Dangaard Brouer @ 2020-06-03  9:11 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: David Ahern, bpf, Network Development, Daniel Borkmann,
	Andrii Nakryiko, Lorenzo Bianconi, brouer

On Tue, 2 Jun 2020 11:27:03 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Tue, Jun 2, 2020 at 12:00 AM Jesper Dangaard Brouer
> <brouer@redhat.com> wrote:
> >
> > On Mon, 1 Jun 2020 14:30:12 -0700
> > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> >  
> > > On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote:  
> > > > +
> > > > +/* Expected BTF layout that match struct bpf_devmap_val */
> > > > +static const struct expect layout[] = {
> > > > +   {BTF_KIND_INT,          true,    0,      4,     "ifindex"},
> > > > +   {BTF_KIND_UNION,        false,  32,      4,     "bpf_prog"},
> > > > +   {BTF_KIND_STRUCT,       false,  -1,     -1,     "storage"}
> > > > +};
> > > > +
> > > > +static int dev_map_check_btf(const struct bpf_map *map,
> > > > +                        const struct btf *btf,
> > > > +                        const struct btf_type *key_type,
> > > > +                        const struct btf_type *value_type)
> > > > +{
> > > > +   struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> > > > +   u32 found_members_cnt = 0;
> > > > +   u32 int_data;
> > > > +   int off;
> > > > +   u32 i;
> > > > +
> > > > +   /* Validate KEY type and size */
> > > > +   if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> > > > +           return -EOPNOTSUPP;
> > > > +
> > > > +   int_data = *(u32 *)(key_type + 1);
> > > > +   if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> > > > +           return -EOPNOTSUPP;
> > > > +
> > > > +   /* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> > > > +    * - With a flexible size of member 'storage'.
> > > > +    */
> > > > +
> > > > +   if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> > > > +           return -EOPNOTSUPP;
> > > > +
> > > > +   /* Struct/union members in BTF must not exceed (max) expected members */
> > > > +   if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> > > > +                   return -E2BIG;
> > > > +
> > > > +   for (i = 0; i < ARRAY_SIZE(layout); i++) {
> > > > +           off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> > > > +
> > > > +           if (off < 0 && layout[i].mandatory)
> > > > +                   return -EUCLEAN;
> > > > +
> > > > +           if (off >= 0)
> > > > +                   found_members_cnt++;
> > > > +
> > > > +           /* Transfer layout config to map */
> > > > +           switch (i) {
> > > > +           case 0:
> > > > +                   dtab->cfg.btf_offset.ifindex = off;
> > > > +                   break;
> > > > +           case 1:
> > > > +                   dtab->cfg.btf_offset.bpf_prog = off;
> > > > +                   break;
> > > > +           default:
> > > > +                   break;
> > > > +           }
> > > > +   }
> > > > +
> > > > +   /* Detect if BTF/vlen have members that were not found */
> > > > +   if (btf_type_vlen(value_type) > found_members_cnt)
> > > > +           return -E2BIG;
> > > > +
> > > > +   return 0;
> > > > +}  
> > >
> > > This layout validation looks really weird to me.
> > > That layout[] array sort of complements BTF to describe the data,
> > > but double describe of the layout feels like hack.  
> >
> > This is the kind of feedback I'm looking for.  I want to make the
> > map-value more dynamic.  It seems so old school to keep extending the
> > map-value with a size and fixed binary layout, when we have BTF
> > available.  I'm open to input on how to better verify/parse/desc the
> > expected BTF layout for kernel-code side.
> >
> > The patch demonstrates that this is possible, I'm open for changes.
> > E.g. devmap is now extended with a bpf_prog, but most end-users will
> > not be using this feature. Today they can use value_size=4 to avoid
> > using this field. When we extend map-value again, then end-users are
> > force into providing 'bpf_prog.fd' if they want to use the newer
> > options.  In this patch end-users don't need to provide 'bpf_prog' if
> > they don't use it. Via BTF we can see this struct member can be skipped.  
> 
> I think 'struct bpf_devmap_val' should be in uapi/bpf.h.

I disagree.

> That's what it is and it will be extended with new fields at the end
> just like all other structs in uapi/bpf.h

This only works when the default value of a newly added field is zero,
i.e. when a value of zero means the feature is not used.  In this specific
case devmap adds a file-descriptor field, which has to be -1 for the
feature to be unused.

Thus, when programs get compiled with this new UAPI header, they will
start to fail, because they try to map-insert file-descriptor zero.


> I don't think BTF can become a substitute for uapi
> where uapi struct has to have all fields defined and backwards supported
> by the kernel.
> BTF is for flexible structs where fields may disappear.

Then BTF is perfect for this, as e.g. I want to remove field/member
'ifindex' for the HASH-variant of devmap, and instead use the key as
the ifindex.


> BTF is there to define a meaning of a binary blob.
> 'struct bpf_devmap_val' is not such thing. It's very much known with
> fixed fields and fixed meaning.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 17+ messages in thread

* Re: [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF
  2020-06-03  9:11         ` Jesper Dangaard Brouer
@ 2020-06-03 16:20           ` Alexei Starovoitov
  0 siblings, 0 replies; 17+ messages in thread
From: Alexei Starovoitov @ 2020-06-03 16:20 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David Ahern, bpf, Network Development, Daniel Borkmann,
	Andrii Nakryiko, Lorenzo Bianconi

On Wed, Jun 03, 2020 at 11:11:58AM +0200, Jesper Dangaard Brouer wrote:
> On Tue, 2 Jun 2020 11:27:03 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> > On Tue, Jun 2, 2020 at 12:00 AM Jesper Dangaard Brouer
> > <brouer@redhat.com> wrote:
> > >
> > > On Mon, 1 Jun 2020 14:30:12 -0700
> > > Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> > >  
> > > > On Fri, May 29, 2020 at 05:59:45PM +0200, Jesper Dangaard Brouer wrote:  
> > > > > +
> > > > > +/* Expected BTF layout that match struct bpf_devmap_val */
> > > > > +static const struct expect layout[] = {
> > > > > +   {BTF_KIND_INT,          true,    0,      4,     "ifindex"},
> > > > > +   {BTF_KIND_UNION,        false,  32,      4,     "bpf_prog"},
> > > > > +   {BTF_KIND_STRUCT,       false,  -1,     -1,     "storage"}
> > > > > +};
> > > > > +
> > > > > +static int dev_map_check_btf(const struct bpf_map *map,
> > > > > +                        const struct btf *btf,
> > > > > +                        const struct btf_type *key_type,
> > > > > +                        const struct btf_type *value_type)
> > > > > +{
> > > > > +   struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
> > > > > +   u32 found_members_cnt = 0;
> > > > > +   u32 int_data;
> > > > > +   int off;
> > > > > +   u32 i;
> > > > > +
> > > > > +   /* Validate KEY type and size */
> > > > > +   if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT)
> > > > > +           return -EOPNOTSUPP;
> > > > > +
> > > > > +   int_data = *(u32 *)(key_type + 1);
> > > > > +   if (BTF_INT_BITS(int_data) != 32 || BTF_INT_OFFSET(int_data) != 0)
> > > > > +           return -EOPNOTSUPP;
> > > > > +
> > > > > +   /* Validate VALUE have layout that match/map-to struct bpf_devmap_val
> > > > > +    * - With a flexible size of member 'storage'.
> > > > > +    */
> > > > > +
> > > > > +   if (BTF_INFO_KIND(value_type->info) != BTF_KIND_STRUCT)
> > > > > +           return -EOPNOTSUPP;
> > > > > +
> > > > > +   /* Struct/union members in BTF must not exceed (max) expected members */
> > > > > +   if (btf_type_vlen(value_type) > ARRAY_SIZE(layout))
> > > > > +                   return -E2BIG;
> > > > > +
> > > > > +   for (i = 0; i < ARRAY_SIZE(layout); i++) {
> > > > > +           off = btf_find_expect_layout_offset(btf, value_type, &layout[i]);
> > > > > +
> > > > > +           if (off < 0 && layout[i].mandatory)
> > > > > +                   return -EUCLEAN;
> > > > > +
> > > > > +           if (off >= 0)
> > > > > +                   found_members_cnt++;
> > > > > +
> > > > > +           /* Transfer layout config to map */
> > > > > +           switch (i) {
> > > > > +           case 0:
> > > > > +                   dtab->cfg.btf_offset.ifindex = off;
> > > > > +                   break;
> > > > > +           case 1:
> > > > > +                   dtab->cfg.btf_offset.bpf_prog = off;
> > > > > +                   break;
> > > > > +           default:
> > > > > +                   break;
> > > > > +           }
> > > > > +   }
> > > > > +
> > > > > +   /* Detect if BTF/vlen have members that were not found */
> > > > > +   if (btf_type_vlen(value_type) > found_members_cnt)
> > > > > +           return -E2BIG;
> > > > > +
> > > > > +   return 0;
> > > > > +}  
> > > >
> > > > This layout validation looks really weird to me.
> > > > That layout[] array sort of complements BTF to describe the data,
> > > > but double describe of the layout feels like hack.  
> > >
> > > This is the kind of feedback I'm looking for.  I want to make the
> > > map-value more dynamic.  It seems so old school to keep extending the
> > > map-value with a size and fixed binary layout, when we have BTF
> > > available.  I'm open to input on how to better verify/parse/desc the
> > > expected BTF layout for kernel-code side.
> > >
> > > The patch demonstrates that this is possible, I'm open for changes.
> > > E.g. devmap is now extended with a bpf_prog, but most end-users will
> > > not be using this feature. Today they can use value_size=4 to avoid
> > > using this field. When we extend map-value again, then end-users are
> > > force into providing 'bpf_prog.fd' if they want to use the newer
> > > options.  In this patch end-users don't need to provide 'bpf_prog' if
> > > they don't use it. Via BTF we can see this struct member can be skipped.  
> > 
> > I think 'struct bpf_devmap_val' should be in uapi/bpf.h.
> 
> I disagree.
>
> > That's what it is and it will be extended with new fields at the end
> > just like all other structs in uapi/bpf.h
> 
> This only works when new fields added will be zero, meaning that
> default value of zero means the feature is not used.  In this specific
> case devmap adds a file-descriptor field, that have to be -1 for the
> feature to be unused.
> 
> Thus, when programs gets compiled with this new UAPI header, they will
> start to fail, because they try to map-insert file-descriptor zero.

No, because there is a size that has to be specified.
There are plenty of other uapi structs that have non-zero values
in newly added fields.

> 
> > I don't think BTF can become a substitute for uapi
> > where uapi struct has to have all fields defined and backwards supported
> > by the kernel.
> > BTF is for flexible structs where fields may disappear.
> 
> Then BTF is perfect for this, as e.g. I want to remove field/member
> 'ifindex' for the HASH-variant of devmap, and instead use the key as
> the ifindex.

nack to that.

^ permalink raw reply	[flat|nested] 17+ messages in thread

end of thread, other threads:[~2020-06-03 16:21 UTC | newest]

Thread overview: 17+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-05-29 15:59 [PATCH bpf-next RFC 0/3] bpf: dynamic map-value config layout via BTF Jesper Dangaard Brouer
2020-05-29 15:59 ` [PATCH bpf-next RFC 1/3] bpf: move struct bpf_devmap_val out of UAPI Jesper Dangaard Brouer
2020-05-29 16:06   ` David Ahern
2020-05-29 16:28     ` Jesper Dangaard Brouer
2020-05-29 15:59 ` [PATCH bpf-next RFC 2/3] bpf: devmap dynamic map-value storage area based on BTF Jesper Dangaard Brouer
2020-05-29 16:39   ` Toke Høiland-Jørgensen
2020-06-02  8:59     ` Jesper Dangaard Brouer
2020-06-02  9:23       ` Toke Høiland-Jørgensen
2020-06-02 10:01         ` Jesper Dangaard Brouer
2020-05-30  7:19   ` Andrii Nakryiko
2020-05-30 14:36     ` Jesper Dangaard Brouer
2020-06-01 21:30   ` Alexei Starovoitov
2020-06-02  7:00     ` Jesper Dangaard Brouer
2020-06-02 18:27       ` Alexei Starovoitov
2020-06-03  9:11         ` Jesper Dangaard Brouer
2020-06-03 16:20           ` Alexei Starovoitov
2020-05-29 15:59 ` [PATCH bpf-next RFC 3/3] samples/bpf: change xdp_fwd to use new BTF config interface Jesper Dangaard Brouer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).