* [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
@ 2016-04-08  4:48 Brenden Blanco
  2016-04-08  4:48 ` [RFC PATCH v2 2/5] net: add ndo to set bpf prog in adapter rx Brenden Blanco
                   ` (5 more replies)
  0 siblings, 6 replies; 34+ messages in thread
From: Brenden Blanco @ 2016-04-08  4:48 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, ogerlitz,
	daniel, brouer, eric.dumazet, ecree, john.fastabend, tgraf,
	johannes, eranlinuxmellanox, lorenzo

Add a new bpf prog type that is intended to run in early stages of the
packet rx path. Only minimal packet metadata will be available, hence a
new context type, struct bpf_phys_dev_md, is exposed to userspace. So
far only expose the readable packet length, and only in read mode.

The PHYS_DEV name is chosen to represent that the program is meant only
for physical adapters, rather than all netdevs.

While the user visible struct is new, the underlying context must be
implemented as a minimal skb in order for the packet load_* instructions
to work. The skb filled in by the driver must have skb->len, skb->head,
and skb->data set, and skb->data_len == 0.
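As a rough illustration of the user side, a minimal restricted-C program
against this context could look like the sketch below (the section name,
bpf_helpers.h and license handling follow the samples/bpf conventions used
by the sample in patch 5; the 60-byte threshold is arbitrary):

  #include <uapi/linux/bpf.h>
  #include "bpf_helpers.h"

  SEC("phys_dev1")
  int bpf_len_filter(struct bpf_phys_dev_md *ctx)
  {
  	/* len is the only field exposed so far, and only readable */
  	if (ctx->len < 60)
  		return BPF_PHYS_DEV_DROP;

  	return BPF_PHYS_DEV_OK;
  }

  char _license[] SEC("license") = "GPL";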

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 include/uapi/linux/bpf.h | 14 ++++++++++
 kernel/bpf/verifier.c    |  1 +
 net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 83 insertions(+)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 70eda5a..3018d83 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -93,6 +93,7 @@ enum bpf_prog_type {
 	BPF_PROG_TYPE_SCHED_CLS,
 	BPF_PROG_TYPE_SCHED_ACT,
 	BPF_PROG_TYPE_TRACEPOINT,
+	BPF_PROG_TYPE_PHYS_DEV,
 };
 
 #define BPF_PSEUDO_MAP_FD	1
@@ -368,6 +369,19 @@ struct __sk_buff {
 	__u32 tc_classid;
 };
 
+/* user return codes for PHYS_DEV prog type */
+enum bpf_phys_dev_action {
+	BPF_PHYS_DEV_DROP,
+	BPF_PHYS_DEV_OK,
+};
+
+/* user accessible metadata for PHYS_DEV packet hook
+ * new fields must be added to the end of this structure
+ */
+struct bpf_phys_dev_md {
+	__u32 len;
+};
+
 struct bpf_tunnel_key {
 	__u32 tunnel_id;
 	union {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 58792fe..877542d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1344,6 +1344,7 @@ static bool may_access_skb(enum bpf_prog_type type)
 	case BPF_PROG_TYPE_SOCKET_FILTER:
 	case BPF_PROG_TYPE_SCHED_CLS:
 	case BPF_PROG_TYPE_SCHED_ACT:
+	case BPF_PROG_TYPE_PHYS_DEV:
 		return true;
 	default:
 		return false;
diff --git a/net/core/filter.c b/net/core/filter.c
index e8486ba..4f73fb9 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2021,6 +2021,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
 	}
 }
 
+static const struct bpf_func_proto *
+phys_dev_func_proto(enum bpf_func_id func_id)
+{
+	return sk_filter_func_proto(func_id);
+}
+
 static bool __is_valid_access(int off, int size, enum bpf_access_type type)
 {
 	/* check bounds */
@@ -2076,6 +2082,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
 	return __is_valid_access(off, size, type);
 }
 
+static bool __is_valid_phys_dev_access(int off, int size,
+				       enum bpf_access_type type)
+{
+	if (off < 0 || off >= sizeof(struct bpf_phys_dev_md))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	if (size != 4)
+		return false;
+
+	return true;
+}
+
+static bool phys_dev_is_valid_access(int off, int size,
+				     enum bpf_access_type type)
+{
+	if (type == BPF_WRITE)
+		return false;
+
+	switch (off) {
+	case offsetof(struct bpf_phys_dev_md, len):
+		break;
+	default:
+		return false;
+	}
+	return __is_valid_phys_dev_access(off, size, type);
+}
+
 static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 				      int src_reg, int ctx_off,
 				      struct bpf_insn *insn_buf,
@@ -2213,6 +2249,26 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
 	return insn - insn_buf;
 }
 
+static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
+					   int dst_reg, int src_reg,
+					   int ctx_off,
+					   struct bpf_insn *insn_buf,
+					   struct bpf_prog *prog)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (ctx_off) {
+	case offsetof(struct bpf_phys_dev_md, len):
+		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
+
+		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
+				      offsetof(struct sk_buff, len));
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
 static const struct bpf_verifier_ops sk_filter_ops = {
 	.get_func_proto = sk_filter_func_proto,
 	.is_valid_access = sk_filter_is_valid_access,
@@ -2225,6 +2281,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
 	.convert_ctx_access = bpf_net_convert_ctx_access,
 };
 
+static const struct bpf_verifier_ops phys_dev_ops = {
+	.get_func_proto = phys_dev_func_proto,
+	.is_valid_access = phys_dev_is_valid_access,
+	.convert_ctx_access = bpf_phys_dev_convert_ctx_access,
+};
+
 static struct bpf_prog_type_list sk_filter_type __read_mostly = {
 	.ops = &sk_filter_ops,
 	.type = BPF_PROG_TYPE_SOCKET_FILTER,
@@ -2240,11 +2302,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
 	.type = BPF_PROG_TYPE_SCHED_ACT,
 };
 
+static struct bpf_prog_type_list phys_dev_type __read_mostly = {
+	.ops = &phys_dev_ops,
+	.type = BPF_PROG_TYPE_PHYS_DEV,
+};
+
 static int __init register_sk_filter_ops(void)
 {
 	bpf_register_prog_type(&sk_filter_type);
 	bpf_register_prog_type(&sched_cls_type);
 	bpf_register_prog_type(&sched_act_type);
+	bpf_register_prog_type(&phys_dev_type);
 
 	return 0;
 }
-- 
2.8.0


* [RFC PATCH v2 2/5] net: add ndo to set bpf prog in adapter rx
  2016-04-08  4:48 [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Brenden Blanco
@ 2016-04-08  4:48 ` Brenden Blanco
  2016-04-08  9:38   ` Jesper Dangaard Brouer
  2016-04-08  4:48 ` [RFC PATCH v2 3/5] rtnl: add option for setting link bpf prog Brenden Blanco
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 34+ messages in thread
From: Brenden Blanco @ 2016-04-08  4:48 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, ogerlitz,
	daniel, brouer, eric.dumazet, ecree, john.fastabend, tgraf,
	johannes, eranlinuxmellanox, lorenzo

Add two new set/get netdev ops for drivers implementing the
BPF_PROG_TYPE_PHYS_DEV filter.
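The ndo documentation in the hunk below spells out the refcount contract:
the callee puts any old prog it was storing, but never the newly passed-in
one (dev_change_bpf_fd() drops that reference itself if the ndo fails). A
minimal driver-side sketch, with the foo_* names purely illustrative
(patch 4 adds the real mlx4 implementation):

  static int foo_bpf_set(struct net_device *dev, struct bpf_prog *prog)
  {
  	struct foo_priv *priv = netdev_priv(dev);
  	struct bpf_prog *old_prog;

  	/* publish the new prog (or NULL to clear) for the RX path */
  	old_prog = xchg(&priv->prog, prog);
  	if (old_prog) {
  		/* wait until no RX softirq can still see old_prog */
  		synchronize_net();
  		/* drop only the reference we were holding ourselves */
  		bpf_prog_put(old_prog);
  	}

  	return 0;
  }

  static bool foo_bpf_get(struct net_device *dev)
  {
  	struct foo_priv *priv = netdev_priv(dev);

  	return !!priv->prog;
  }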

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 include/linux/netdevice.h | 13 +++++++++++++
 net/core/dev.c            | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index cb4e508..3acf732 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -61,6 +61,7 @@ struct wireless_dev;
 /* 802.15.4 specific */
 struct wpan_dev;
 struct mpls_dev;
+struct bpf_prog;
 
 void netdev_set_default_ethtool_ops(struct net_device *dev,
 				    const struct ethtool_ops *ops);
@@ -1102,6 +1103,14 @@ struct tc_to_netdev {
  *	appropriate rx headroom value allows avoiding skb head copy on
  *	forward. Setting a negative value resets the rx headroom to the
  *	default value.
+ * int  (*ndo_bpf_set)(struct net_device *dev, struct bpf_prog *prog);
+ *	This function is used to set or clear a bpf program used in the
+ *	earliest stages of packet rx. The prog will have been loaded as
+ *	BPF_PROG_TYPE_PHYS_DEV. The callee is responsible for calling
+ *	bpf_prog_put on any old progs that are stored, but not on the passed
+ *	in prog.
+ * bool (*ndo_bpf_get)(struct net_device *dev);
+ *	This function is used to check if a bpf program is set on the device.
  *
  */
 struct net_device_ops {
@@ -1292,6 +1301,9 @@ struct net_device_ops {
 						       struct sk_buff *skb);
 	void			(*ndo_set_rx_headroom)(struct net_device *dev,
 						       int needed_headroom);
+	int			(*ndo_bpf_set)(struct net_device *dev,
+					       struct bpf_prog *prog);
+	bool			(*ndo_bpf_get)(struct net_device *dev);
 };
 
 /**
@@ -3251,6 +3263,7 @@ int dev_get_phys_port_id(struct net_device *dev,
 int dev_get_phys_port_name(struct net_device *dev,
 			   char *name, size_t len);
 int dev_change_proto_down(struct net_device *dev, bool proto_down);
+int dev_change_bpf_fd(struct net_device *dev, int fd);
 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev);
 struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
 				    struct netdev_queue *txq, int *ret);
diff --git a/net/core/dev.c b/net/core/dev.c
index 273f10d..7cf749c 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -94,6 +94,7 @@
 #include <linux/ethtool.h>
 #include <linux/notifier.h>
 #include <linux/skbuff.h>
+#include <linux/bpf.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/busy_poll.h>
@@ -6483,6 +6484,43 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down)
 EXPORT_SYMBOL(dev_change_proto_down);
 
 /**
+ *	dev_change_bpf_fd - set or clear a bpf program for a device
+ *	@dev: device
+ *	@fd: new program fd or negative value to clear
+ *
+ *	Set or clear a bpf program for a device
+ */
+int dev_change_bpf_fd(struct net_device *dev, int fd)
+{
+	const struct net_device_ops *ops = dev->netdev_ops;
+	struct bpf_prog *prog = NULL;
+	int err;
+
+	if (!ops->ndo_bpf_set)
+		return -EOPNOTSUPP;
+	if (!netif_device_present(dev))
+		return -ENODEV;
+
+	if (fd >= 0) {
+		prog = bpf_prog_get(fd);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+
+		if (prog->type != BPF_PROG_TYPE_PHYS_DEV) {
+			bpf_prog_put(prog);
+			return -EINVAL;
+		}
+	}
+
+	err = ops->ndo_bpf_set(dev, prog);
+	if (err < 0 && prog)
+		bpf_prog_put(prog);
+
+	return err;
+}
+EXPORT_SYMBOL(dev_change_bpf_fd);
+
+/**
  *	dev_new_index	-	allocate an ifindex
  *	@net: the applicable net namespace
  *
-- 
2.8.0


* [RFC PATCH v2 3/5] rtnl: add option for setting link bpf prog
  2016-04-08  4:48 [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Brenden Blanco
  2016-04-08  4:48 ` [RFC PATCH v2 2/5] net: add ndo to set bpf prog in adapter rx Brenden Blanco
@ 2016-04-08  4:48 ` Brenden Blanco
  2016-04-08  4:48 ` [RFC PATCH v2 4/5] mlx4: add support for fast rx drop bpf program Brenden Blanco
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 34+ messages in thread
From: Brenden Blanco @ 2016-04-08  4:48 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, ogerlitz,
	daniel, brouer, eric.dumazet, ecree, john.fastabend, tgraf,
	johannes, eranlinuxmellanox, lorenzo

Sets the bpf program represented by fd as an early filter in the rx path
of the netdev. The fd must have been created as BPF_PROG_TYPE_PHYS_DEV.
Providing a negative value as fd clears the program. Getting the fd back
via rtnl is not possible; reading this value therefore merely provides a
bool indicating whether a program is set on the link or not.
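From userspace the intended call pattern is simply an RTM_SETLINK with an
s32 IFLA_BPF_FD attribute; for example, using the raw-netlink
set_link_bpf_fd() helper that the sample in patch 5 adds (sketch):

  	/* attach: fd refers to a prog loaded as BPF_PROG_TYPE_PHYS_DEV */
  	if (set_link_bpf_fd(ifindex, prog_fd) < 0)
  		return 1;

  	/* ... run ... */

  	/* clear: any negative fd removes the program from the link */
  	set_link_bpf_fd(ifindex, -1);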

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 include/uapi/linux/if_link.h |  1 +
 net/core/rtnetlink.c         | 12 ++++++++++++
 2 files changed, 13 insertions(+)

diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 9427f17..ccad234 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -155,6 +155,7 @@ enum {
 	IFLA_PROTO_DOWN,
 	IFLA_GSO_MAX_SEGS,
 	IFLA_GSO_MAX_SIZE,
+	IFLA_BPF_FD,
 	__IFLA_MAX
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a75f7e9..96579ce 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -910,6 +910,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
 	       + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
 	       + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
 	       + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
+	       + nla_total_size(4) /* IFLA_BPF_FD */
 	       + nla_total_size(1); /* IFLA_PROTO_DOWN */
 
 }
@@ -1242,6 +1243,9 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
 	     nla_put_string(skb, IFLA_IFALIAS, dev->ifalias)) ||
 	    nla_put_u32(skb, IFLA_CARRIER_CHANGES,
 			atomic_read(&dev->carrier_changes)) ||
+	    (dev->netdev_ops->ndo_bpf_get &&
+	     nla_put_s32(skb, IFLA_BPF_FD,
+			 dev->netdev_ops->ndo_bpf_get(dev))) ||
 	    nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
 		goto nla_put_failure;
 
@@ -1375,6 +1379,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = {
 	[IFLA_PHYS_SWITCH_ID]	= { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN },
 	[IFLA_LINK_NETNSID]	= { .type = NLA_S32 },
 	[IFLA_PROTO_DOWN]	= { .type = NLA_U8 },
+	[IFLA_BPF_FD]		= { .type = NLA_S32 },
 };
 
 static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
@@ -2029,6 +2034,13 @@ static int do_setlink(const struct sk_buff *skb,
 		status |= DO_SETLINK_NOTIFY;
 	}
 
+	if (tb[IFLA_BPF_FD]) {
+		err = dev_change_bpf_fd(dev, nla_get_s32(tb[IFLA_BPF_FD]));
+		if (err)
+			goto errout;
+		status |= DO_SETLINK_NOTIFY;
+	}
+
 errout:
 	if (status & DO_SETLINK_MODIFIED) {
 		if (status & DO_SETLINK_NOTIFY)
-- 
2.8.0


* [RFC PATCH v2 4/5] mlx4: add support for fast rx drop bpf program
  2016-04-08  4:48 [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Brenden Blanco
  2016-04-08  4:48 ` [RFC PATCH v2 2/5] net: add ndo to set bpf prog in adapter rx Brenden Blanco
  2016-04-08  4:48 ` [RFC PATCH v2 3/5] rtnl: add option for setting link bpf prog Brenden Blanco
@ 2016-04-08  4:48 ` Brenden Blanco
  2016-04-08 11:41   ` Jesper Dangaard Brouer
  2016-04-08  4:48 ` [RFC PATCH v2 5/5] Add sample for adding simple drop program to link Brenden Blanco
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 34+ messages in thread
From: Brenden Blanco @ 2016-04-08  4:48 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, ogerlitz,
	daniel, brouer, eric.dumazet, ecree, john.fastabend, tgraf,
	johannes, eranlinuxmellanox, lorenzo

Add support for the BPF_PROG_TYPE_PHYS_DEV hook in mlx4 driver. Since
bpf programs require a skb context to navigate the packet, build a
percpu fake skb with the minimal fields. This avoids the costly
allocation for packets that end up being dropped.

Since mlx4 is so far the only user of bpf_phys_dev_md, the build
function is defined locally.

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 65 ++++++++++++++++++++++++++
 drivers/net/ethernet/mellanox/mlx4/en_rx.c     | 25 ++++++++--
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h   |  6 +++
 3 files changed, 92 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index b4b258c..b228651 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -31,6 +31,7 @@
  *
  */
 
+#include <linux/bpf.h>
 #include <linux/etherdevice.h>
 #include <linux/tcp.h>
 #include <linux/if_vlan.h>
@@ -1966,6 +1967,9 @@ void mlx4_en_free_resources(struct mlx4_en_priv *priv)
 			mlx4_en_destroy_cq(priv, &priv->rx_cq[i]);
 	}
 
+	if (priv->prog)
+		bpf_prog_put(priv->prog);
+
 }
 
 int mlx4_en_alloc_resources(struct mlx4_en_priv *priv)
@@ -2078,6 +2082,11 @@ static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu)
 		en_err(priv, "Bad MTU size:%d.\n", new_mtu);
 		return -EPERM;
 	}
+	if (priv->prog && MLX4_EN_EFF_MTU(new_mtu) > FRAG_SZ0) {
+		en_err(priv, "MTU size:%d requires frags but bpf prog running",
+		       new_mtu);
+		return -EOPNOTSUPP;
+	}
 	dev->mtu = new_mtu;
 
 	if (netif_running(dev)) {
@@ -2456,6 +2465,58 @@ static int mlx4_en_set_tx_maxrate(struct net_device *dev, int queue_index, u32 m
 	return err;
 }
 
+static DEFINE_PER_CPU(struct sk_buff, percpu_bpf_phys_dev_md);
+
+static void build_bpf_phys_dev_md(struct sk_buff *skb, void *data,
+				  unsigned int length)
+{
+	/* data_len is intentionally not set here so that skb_is_nonlinear()
+	 * returns false
+	 */
+
+	skb->len = length;
+	skb->head = data;
+	skb->data = data;
+}
+
+int mlx4_call_bpf(struct bpf_prog *prog, void *data, unsigned int length)
+{
+	struct sk_buff *skb = this_cpu_ptr(&percpu_bpf_phys_dev_md);
+	int ret;
+
+	build_bpf_phys_dev_md(skb, data, length);
+
+	rcu_read_lock();
+	ret = BPF_PROG_RUN(prog, (void *)skb);
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int mlx4_bpf_set(struct net_device *dev, struct bpf_prog *prog)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+
+	if (priv->num_frags > 1)
+		return -EOPNOTSUPP;
+
+	old_prog = xchg(&priv->prog, prog);
+	if (old_prog) {
+		synchronize_net();
+		bpf_prog_put(old_prog);
+	}
+
+	return 0;
+}
+
+static bool mlx4_bpf_get(struct net_device *dev)
+{
+	struct mlx4_en_priv *priv = netdev_priv(dev);
+
+	return !!priv->prog;
+}
+
 static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_open		= mlx4_en_open,
 	.ndo_stop		= mlx4_en_close,
@@ -2486,6 +2547,8 @@ static const struct net_device_ops mlx4_netdev_ops = {
 	.ndo_features_check	= mlx4_en_features_check,
 #endif
 	.ndo_set_tx_maxrate	= mlx4_en_set_tx_maxrate,
+	.ndo_bpf_set		= mlx4_bpf_set,
+	.ndo_bpf_get		= mlx4_bpf_get,
 };
 
 static const struct net_device_ops mlx4_netdev_ops_master = {
@@ -2524,6 +2587,8 @@ static const struct net_device_ops mlx4_netdev_ops_master = {
 	.ndo_features_check	= mlx4_en_features_check,
 #endif
 	.ndo_set_tx_maxrate	= mlx4_en_set_tx_maxrate,
+	.ndo_bpf_set		= mlx4_bpf_set,
+	.ndo_bpf_get		= mlx4_bpf_get,
 };
 
 struct mlx4_en_bond {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 86bcfe5..287da02 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -748,6 +748,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	struct mlx4_en_rx_ring *ring = priv->rx_ring[cq->ring];
 	struct mlx4_en_rx_alloc *frags;
 	struct mlx4_en_rx_desc *rx_desc;
+	struct bpf_prog *prog;
 	struct sk_buff *skb;
 	int index;
 	int nr;
@@ -764,6 +765,8 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 	if (budget <= 0)
 		return polled;
 
+	prog = READ_ONCE(priv->prog);
+
 	/* We assume a 1:1 mapping between CQEs and Rx descriptors, so Rx
 	 * descriptor offset can be deduced from the CQE index instead of
 	 * reading 'cqe->index' */
@@ -840,6 +843,23 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
 		l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
 			(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
 
+		/* A bpf program gets first chance to drop the packet. It may
+		 * read bytes but not past the end of the frag.
+		 */
+		if (prog) {
+			struct ethhdr *ethh;
+			dma_addr_t dma;
+
+			dma = be64_to_cpu(rx_desc->data[0].addr);
+			dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
+						DMA_FROM_DEVICE);
+			ethh = page_address(frags[0].page) +
+							frags[0].page_offset;
+			if (mlx4_call_bpf(prog, ethh, frags[0].page_size) ==
+							BPF_PHYS_DEV_DROP)
+				goto next;
+		}
+
 		if (likely(dev->features & NETIF_F_RXCSUM)) {
 			if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
 						      MLX4_CQE_STATUS_UDP)) {
@@ -1067,10 +1087,7 @@ static const int frag_sizes[] = {
 void mlx4_en_calc_rx_buf(struct net_device *dev)
 {
 	struct mlx4_en_priv *priv = netdev_priv(dev);
-	/* VLAN_HLEN is added twice,to support skb vlan tagged with multiple
-	 * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
-	 */
-	int eff_mtu = dev->mtu + ETH_HLEN + (2 * VLAN_HLEN);
+	int eff_mtu = MLX4_EN_EFF_MTU(dev->mtu);
 	int buf_size = 0;
 	int i = 0;
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
index d12ab6a..40eb32d 100644
--- a/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
+++ b/drivers/net/ethernet/mellanox/mlx4/mlx4_en.h
@@ -164,6 +164,10 @@ enum {
 #define MLX4_LOOPBACK_TEST_PAYLOAD (HEADER_COPY_SIZE - ETH_HLEN)
 
 #define MLX4_EN_MIN_MTU		46
+/* VLAN_HLEN is added twice,to support skb vlan tagged with multiple
+ * headers. (For example: ETH_P_8021Q and ETH_P_8021AD).
+ */
+#define MLX4_EN_EFF_MTU(mtu)	((mtu) + ETH_HLEN + (2 * VLAN_HLEN))
 #define ETH_BCAST		0xffffffffffffULL
 
 #define MLX4_EN_LOOPBACK_RETRIES	5
@@ -568,6 +572,7 @@ struct mlx4_en_priv {
 	struct hlist_head mac_hash[MLX4_EN_MAC_HASH_SIZE];
 	struct hwtstamp_config hwtstamp_config;
 	u32 counter_index;
+	struct bpf_prog *prog;
 
 #ifdef CONFIG_MLX4_EN_DCB
 	struct ieee_ets ets;
@@ -682,6 +687,7 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv);
 void mlx4_en_destroy_drop_qp(struct mlx4_en_priv *priv);
 int mlx4_en_free_tx_buf(struct net_device *dev, struct mlx4_en_tx_ring *ring);
 void mlx4_en_rx_irq(struct mlx4_cq *mcq);
+int mlx4_call_bpf(struct bpf_prog *prog, void *data, unsigned int length);
 
 int mlx4_SET_MCAST_FLTR(struct mlx4_dev *dev, u8 port, u64 mac, u64 clear, u8 mode);
 int mlx4_SET_VLAN_FLTR(struct mlx4_dev *dev, struct mlx4_en_priv *priv);
-- 
2.8.0


* [RFC PATCH v2 5/5] Add sample for adding simple drop program to link
  2016-04-08  4:48 [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Brenden Blanco
                   ` (2 preceding siblings ...)
  2016-04-08  4:48 ` [RFC PATCH v2 4/5] mlx4: add support for fast rx drop bpf program Brenden Blanco
@ 2016-04-08  4:48 ` Brenden Blanco
  2016-04-09 14:48   ` Jamal Hadi Salim
  2016-04-08 10:36 ` [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Jesper Dangaard Brouer
  2016-04-09 11:17 ` Tom Herbert
  5 siblings, 1 reply; 34+ messages in thread
From: Brenden Blanco @ 2016-04-08  4:48 UTC (permalink / raw)
  To: davem
  Cc: Brenden Blanco, netdev, tom, alexei.starovoitov, ogerlitz,
	daniel, brouer, eric.dumazet, ecree, john.fastabend, tgraf,
	johannes, eranlinuxmellanox, lorenzo

Add a sample program that only drops packets at the
BPF_PROG_TYPE_PHYS_DEV hook of a link. With the drop-only program,
observed single core rate is ~19.5Mpps.

Other tests were run as well; for instance, without the dropcnt increment
or without reading from the packet header, the packet rate was mostly
unchanged.

$ perf record -a samples/bpf/netdrvx1 $(</sys/class/net/eth0/ifindex)
proto 17:   19596362 drops/s

./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
Running... ctrl^C to stop
Device: eth4@0
Result: OK: 7873817(c7872245+d1572) usec, 38801823 (60byte,0frags)
  4927955pps 2365Mb/sec (2365418400bps) errors: 0
Device: eth4@1
Result: OK: 7873817(c7872123+d1693) usec, 38587342 (60byte,0frags)
  4900715pps 2352Mb/sec (2352343200bps) errors: 0
Device: eth4@2
Result: OK: 7873817(c7870929+d2888) usec, 38718848 (60byte,0frags)
  4917417pps 2360Mb/sec (2360360160bps) errors: 0
Device: eth4@3
Result: OK: 7873818(c7872193+d1625) usec, 38796346 (60byte,0frags)
  4927259pps 2365Mb/sec (2365084320bps) errors: 0

perf report --no-children:
 29.48%  ksoftirqd/6  [mlx4_en]         [k] mlx4_en_process_rx_cq
 18.17%  ksoftirqd/6  [mlx4_en]         [k] mlx4_en_alloc_frags
  8.19%  ksoftirqd/6  [mlx4_en]         [k] mlx4_en_free_frag
  5.35%  ksoftirqd/6  [kernel.vmlinux]  [k] get_page_from_freelist
  2.92%  ksoftirqd/6  [kernel.vmlinux]  [k] free_pages_prepare
  2.90%  ksoftirqd/6  [mlx4_en]         [k] mlx4_call_bpf
  2.72%  ksoftirqd/6  [fjes]            [k] 0x000000000000af66
  2.37%  ksoftirqd/6  [kernel.vmlinux]  [k] swiotlb_sync_single_for_cpu
  1.92%  ksoftirqd/6  [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
  1.83%  ksoftirqd/6  [kernel.vmlinux]  [k] free_one_page
  1.70%  ksoftirqd/6  [kernel.vmlinux]  [k] swiotlb_sync_single
  1.69%  ksoftirqd/6  [kernel.vmlinux]  [k] bpf_map_lookup_elem
  1.33%  swapper      [kernel.vmlinux]  [k] intel_idle
  1.32%  ksoftirqd/6  [fjes]            [k] 0x000000000000af90
  1.21%  ksoftirqd/6  [kernel.vmlinux]  [k] sk_load_byte_positive_offset
  1.07%  ksoftirqd/6  [kernel.vmlinux]  [k] __alloc_pages_nodemask
  0.89%  ksoftirqd/6  [kernel.vmlinux]  [k] __rmqueue
  0.84%  ksoftirqd/6  [mlx4_en]         [k] mlx4_alloc_pages.isra.23
  0.79%  ksoftirqd/6  [kernel.vmlinux]  [k] net_rx_action

machine specs:
 receiver - Intel E5-1630 v3 @ 3.70GHz
 sender - Intel E5645 @ 2.40GHz
 Mellanox ConnectX-3 @40G

Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
---
 samples/bpf/Makefile        |   4 ++
 samples/bpf/bpf_load.c      |   8 +++
 samples/bpf/netdrvx1_kern.c |  26 ++++++++
 samples/bpf/netdrvx1_user.c | 155 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 193 insertions(+)
 create mode 100644 samples/bpf/netdrvx1_kern.c
 create mode 100644 samples/bpf/netdrvx1_user.c

diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile
index 9959771..19bb926 100644
--- a/samples/bpf/Makefile
+++ b/samples/bpf/Makefile
@@ -20,6 +20,7 @@ hostprogs-y += offwaketime
 hostprogs-y += spintest
 hostprogs-y += map_perf_test
 hostprogs-y += test_overhead
+hostprogs-y += netdrvx1
 
 test_verifier-objs := test_verifier.o libbpf.o
 test_maps-objs := test_maps.o libbpf.o
@@ -40,6 +41,7 @@ offwaketime-objs := bpf_load.o libbpf.o offwaketime_user.o
 spintest-objs := bpf_load.o libbpf.o spintest_user.o
 map_perf_test-objs := bpf_load.o libbpf.o map_perf_test_user.o
 test_overhead-objs := bpf_load.o libbpf.o test_overhead_user.o
+netdrvx1-objs := bpf_load.o libbpf.o netdrvx1_user.o
 
 # Tell kbuild to always build the programs
 always := $(hostprogs-y)
@@ -60,6 +62,7 @@ always += spintest_kern.o
 always += map_perf_test_kern.o
 always += test_overhead_tp_kern.o
 always += test_overhead_kprobe_kern.o
+always += netdrvx1_kern.o
 
 HOSTCFLAGS += -I$(objtree)/usr/include
 
@@ -80,6 +83,7 @@ HOSTLOADLIBES_offwaketime += -lelf
 HOSTLOADLIBES_spintest += -lelf
 HOSTLOADLIBES_map_perf_test += -lelf -lrt
 HOSTLOADLIBES_test_overhead += -lelf -lrt
+HOSTLOADLIBES_netdrvx1 += -lelf
 
 # point this to your LLVM backend with bpf support
 LLC=$(srctree)/tools/bpf/llvm/bld/Debug+Asserts/bin/llc
diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c
index 022af71..c7b2245 100644
--- a/samples/bpf/bpf_load.c
+++ b/samples/bpf/bpf_load.c
@@ -50,6 +50,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 	bool is_kprobe = strncmp(event, "kprobe/", 7) == 0;
 	bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0;
 	bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0;
+	bool is_phys_dev = strncmp(event, "phys_dev", 8) == 0;
 	enum bpf_prog_type prog_type;
 	char buf[256];
 	int fd, efd, err, id;
@@ -66,6 +67,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 		prog_type = BPF_PROG_TYPE_KPROBE;
 	} else if (is_tracepoint) {
 		prog_type = BPF_PROG_TYPE_TRACEPOINT;
+	} else if (is_phys_dev) {
+		prog_type = BPF_PROG_TYPE_PHYS_DEV;
 	} else {
 		printf("Unknown event '%s'\n", event);
 		return -1;
@@ -79,6 +82,9 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size)
 
 	prog_fd[prog_cnt++] = fd;
 
+	if (is_phys_dev)
+		return 0;
+
 	if (is_socket) {
 		event += 6;
 		if (*event != '/')
@@ -319,6 +325,7 @@ int load_bpf_file(char *path)
 			if (memcmp(shname_prog, "kprobe/", 7) == 0 ||
 			    memcmp(shname_prog, "kretprobe/", 10) == 0 ||
 			    memcmp(shname_prog, "tracepoint/", 11) == 0 ||
+			    memcmp(shname_prog, "phys_dev", 8) == 0 ||
 			    memcmp(shname_prog, "socket", 6) == 0)
 				load_and_attach(shname_prog, insns, data_prog->d_size);
 		}
@@ -336,6 +343,7 @@ int load_bpf_file(char *path)
 		if (memcmp(shname, "kprobe/", 7) == 0 ||
 		    memcmp(shname, "kretprobe/", 10) == 0 ||
 		    memcmp(shname, "tracepoint/", 11) == 0 ||
+		    memcmp(shname, "phys_dev", 8) == 0 ||
 		    memcmp(shname, "socket", 6) == 0)
 			load_and_attach(shname, data->d_buf, data->d_size);
 	}
diff --git a/samples/bpf/netdrvx1_kern.c b/samples/bpf/netdrvx1_kern.c
new file mode 100644
index 0000000..849802d
--- /dev/null
+++ b/samples/bpf/netdrvx1_kern.c
@@ -0,0 +1,26 @@
+#include <uapi/linux/bpf.h>
+#include <uapi/linux/if_ether.h>
+#include <uapi/linux/if_packet.h>
+#include <uapi/linux/ip.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") dropcnt = {
+	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = 256,
+};
+
+SEC("phys_dev1")
+int bpf_prog1(struct bpf_phys_dev_md *ctx)
+{
+	int index = load_byte(ctx, ETH_HLEN + offsetof(struct iphdr, protocol));
+	long *value;
+
+	value = bpf_map_lookup_elem(&dropcnt, &index);
+	if (value)
+		*value += 1;
+
+	return BPF_PHYS_DEV_DROP;
+}
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/netdrvx1_user.c b/samples/bpf/netdrvx1_user.c
new file mode 100644
index 0000000..9e6ec9a
--- /dev/null
+++ b/samples/bpf/netdrvx1_user.c
@@ -0,0 +1,155 @@
+#include <linux/bpf.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <assert.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/socket.h>
+#include <unistd.h>
+#include "bpf_load.h"
+#include "libbpf.h"
+
+static int set_link_bpf_fd(int ifindex, int fd)
+{
+	struct sockaddr_nl sa;
+	int sock, seq = 0, len, ret = -1;
+	char buf[4096];
+	struct rtattr *rta;
+	struct {
+		struct nlmsghdr  nh;
+		struct ifinfomsg ifinfo;
+		char             attrbuf[64];
+	} req;
+	struct nlmsghdr *nh;
+	struct nlmsgerr *err;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.nl_family = AF_NETLINK;
+
+	sock = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
+	if (sock < 0) {
+		printf("open netlink socket: %s\n", strerror(errno));
+		return -1;
+	}
+
+	if (bind(sock, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
+		printf("bind to netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	memset(&req, 0, sizeof(req));
+	req.nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	req.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
+	req.nh.nlmsg_type = RTM_SETLINK;
+	req.nh.nlmsg_pid = 0;
+	req.nh.nlmsg_seq = ++seq;
+	req.ifinfo.ifi_family = AF_UNSPEC;
+	req.ifinfo.ifi_index = ifindex;
+	rta = (struct rtattr *)(((char *) &req)
+				+ NLMSG_ALIGN(req.nh.nlmsg_len));
+	rta->rta_type = 42/*IFLA_BPF_FD*/;
+	rta->rta_len = RTA_LENGTH(sizeof(unsigned int));
+	req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len)
+		+ RTA_LENGTH(sizeof(fd));
+	memcpy(RTA_DATA(rta), &fd, sizeof(fd));
+	if (send(sock, &req, req.nh.nlmsg_len, 0) < 0) {
+		printf("send to netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	len = recv(sock, buf, sizeof(buf), 0);
+	if (len < 0) {
+		printf("recv from netlink: %s\n", strerror(errno));
+		goto cleanup;
+	}
+
+	for (nh = (struct nlmsghdr *)buf; NLMSG_OK(nh, len);
+	     nh = NLMSG_NEXT(nh, len)) {
+		if (nh->nlmsg_pid != getpid()) {
+			printf("Wrong pid %d, expected %d\n",
+			       nh->nlmsg_pid, getpid());
+			goto cleanup;
+		}
+		if (nh->nlmsg_seq != seq) {
+			printf("Wrong seq %d, expected %d\n",
+			       nh->nlmsg_seq, seq);
+			goto cleanup;
+		}
+		switch (nh->nlmsg_type) {
+		case NLMSG_ERROR:
+			err = (struct nlmsgerr *)NLMSG_DATA(nh);
+			if (!err->error)
+				continue;
+			printf("nlmsg error %s\n", strerror(-err->error));
+			goto cleanup;
+		case NLMSG_DONE:
+			break;
+		}
+	}
+
+	ret = 0;
+
+cleanup:
+	close(sock);
+	return ret;
+}
+
+/* simple per-protocol drop counter
+ */
+static void poll_stats(int secs)
+{
+	unsigned int nr_cpus = sysconf(_SC_NPROCESSORS_CONF);
+	__u64 values[nr_cpus];
+	__u32 key;
+	int i;
+
+	sleep(secs);
+
+	for (key = 0; key < 256; key++) {
+		__u64 sum = 0;
+
+		assert(bpf_lookup_elem(map_fd[0], &key, values) == 0);
+		for (i = 0; i < nr_cpus; i++)
+			sum += values[i];
+		if (sum)
+			printf("proto %u: %10llu drops/s\n", key, sum/secs);
+	}
+}
+
+int main(int ac, char **argv)
+{
+	char filename[256];
+	int ifindex;
+
+	snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+	if (ac != 2) {
+		printf("usage: %s IFINDEX\n", argv[0]);
+		return 1;
+	}
+
+	ifindex = strtoul(argv[1], NULL, 0);
+
+	if (load_bpf_file(filename)) {
+		printf("%s", bpf_log_buf);
+		return 1;
+	}
+
+	if (!prog_fd[0]) {
+		printf("load_bpf_file: %s\n", strerror(errno));
+		return 1;
+	}
+
+	if (set_link_bpf_fd(ifindex, prog_fd[0]) < 0) {
+		printf("link set bpf fd failed\n");
+		return 1;
+	}
+
+	poll_stats(5);
+
+	set_link_bpf_fd(ifindex, -1);
+
+	return 0;
+}
-- 
2.8.0


* Re: [RFC PATCH v2 2/5] net: add ndo to set bpf prog in adapter rx
  2016-04-08  4:48 ` [RFC PATCH v2 2/5] net: add ndo to set bpf prog in adapter rx Brenden Blanco
@ 2016-04-08  9:38   ` Jesper Dangaard Brouer
  2016-04-08 16:39     ` Brenden Blanco
  0 siblings, 1 reply; 34+ messages in thread
From: Jesper Dangaard Brouer @ 2016-04-08  9:38 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, brouer


On Thu,  7 Apr 2016 21:48:47 -0700 Brenden Blanco <bblanco@plumgrid.com> wrote:

> Add two new set/get netdev ops for drivers implementing the
> BPF_PROG_TYPE_PHYS_DEV filter.
> 
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
[...]
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index cb4e508..3acf732 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
[...]
> @@ -1102,6 +1103,14 @@ struct tc_to_netdev {
>   *	appropriate rx headroom value allows avoiding skb head copy on
>   *	forward. Setting a negative value resets the rx headroom to the
>   *	default value.
> + * int  (*ndo_bpf_set)(struct net_device *dev, struct bpf_prog *prog);
> + *	This function is used to set or clear a bpf program used in the
> + *	earliest stages of packet rx. The prog will have been loaded as
> + *	BPF_PROG_TYPE_PHYS_DEV. The callee is responsible for calling
> + *	bpf_prog_put on any old progs that are stored, but not on the passed
> + *	in prog.
> + * bool (*ndo_bpf_get)(struct net_device *dev);
> + *	This function is used to check if a bpf program is set on the device.
>   *

This interface is for the entire device, right?  I can imagine that users
want to attach an eBPF program per RX queue.  Can we adapt the interface
to support this? (could default to adding it to all queues)


I'm also wondering if we should add a "flags" parameter.  Or maybe we
can extend 'struct bpf_prog' with what I have in mind.

When the eBPF program is attached to an RX queue, I want to know if the
program wants to modify packet-data, up-front.

The problem is that most drivers use dma_sync, which means that data is
considered 'read-only' (the "considered" part depends on the DMA engine,
and we might find a DMA loop-hole for some configs).
  If the program wants to write, the driver has the option of
reconfiguring the driver routine to use dma_unmap, before handing over
the page.  Or the driver can also choose to realloc the specific RX ring
queue pages as single pages (using dma_map/unmap consistently).
 This also allows us to give a return code indicating that a given driver
does not support writable packet-pages (rejecting the program).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer


* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08  4:48 [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Brenden Blanco
                   ` (3 preceding siblings ...)
  2016-04-08  4:48 ` [RFC PATCH v2 5/5] Add sample for adding simple drop program to link Brenden Blanco
@ 2016-04-08 10:36 ` Jesper Dangaard Brouer
  2016-04-08 11:09   ` Daniel Borkmann
  2016-04-08 12:33   ` Jesper Dangaard Brouer
  2016-04-09 11:17 ` Tom Herbert
  5 siblings, 2 replies; 34+ messages in thread
From: Jesper Dangaard Brouer @ 2016-04-08 10:36 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, brouer

On Thu,  7 Apr 2016 21:48:46 -0700
Brenden Blanco <bblanco@plumgrid.com> wrote:

> Add a new bpf prog type that is intended to run in early stages of the
> packet rx path. Only minimal packet metadata will be available, hence a
> new context type, struct bpf_phys_dev_md, is exposed to userspace. So
> far only expose the readable packet length, and only in read mode.
> 
> The PHYS_DEV name is chosen to represent that the program is meant only
> for physical adapters, rather than all netdevs.
> 
> While the user visible struct is new, the underlying context must be
> implemented as a minimal skb in order for the packet load_* instructions
> to work. The skb filled in by the driver must have skb->len, skb->head,
> and skb->data set, and skb->data_len == 0.
> 
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> ---
>  include/uapi/linux/bpf.h | 14 ++++++++++
>  kernel/bpf/verifier.c    |  1 +
>  net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
> 
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 70eda5a..3018d83 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -93,6 +93,7 @@ enum bpf_prog_type {
>  	BPF_PROG_TYPE_SCHED_CLS,
>  	BPF_PROG_TYPE_SCHED_ACT,
>  	BPF_PROG_TYPE_TRACEPOINT,
> +	BPF_PROG_TYPE_PHYS_DEV,
>  };
>  
>  #define BPF_PSEUDO_MAP_FD	1
> @@ -368,6 +369,19 @@ struct __sk_buff {
>  	__u32 tc_classid;
>  };
>  
> +/* user return codes for PHYS_DEV prog type */
> +enum bpf_phys_dev_action {
> +	BPF_PHYS_DEV_DROP,
> +	BPF_PHYS_DEV_OK,
> +};

I can imagine these extra return codes:

 BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
 BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
 BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */

The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
which we can look at when we get that far...

For the "MODIFIED" case, people caring about checksumming, might want
to voice their concern if we want additional info or return codes,
indicating if software need to recalc CSUM (or e.g. if only IP-pseudo
hdr was modified).
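In uapi terms that would simply mean growing the enum from patch 1,
roughly:

  enum bpf_phys_dev_action {
  	BPF_PHYS_DEV_DROP,
  	BPF_PHYS_DEV_OK,
  	BPF_PHYS_DEV_MODIFIED,	/* packet page/payload modified */
  	BPF_PHYS_DEV_STOLEN,	/* e.g. forward use-case */
  	BPF_PHYS_DEV_SHARED,	/* queued for async processing */
  };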


> +/* user accessible metadata for PHYS_DEV packet hook
> + * new fields must be added to the end of this structure
> + */
> +struct bpf_phys_dev_md {
> +	__u32 len;
> +};

This is userspace visible.  I can easily imagine this structure will get
extended.  How does a userspace eBPF program know that the struct got
extended? (bet you got some scheme, normally I would add a "version" as
first elem).

BTW, how and where is this "struct bpf_phys_dev_md" allocated?


>  struct bpf_tunnel_key {
>  	__u32 tunnel_id;
>  	union {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 58792fe..877542d 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1344,6 +1344,7 @@ static bool may_access_skb(enum bpf_prog_type type)
>  	case BPF_PROG_TYPE_SOCKET_FILTER:
>  	case BPF_PROG_TYPE_SCHED_CLS:
>  	case BPF_PROG_TYPE_SCHED_ACT:
> +	case BPF_PROG_TYPE_PHYS_DEV:
>  		return true;
>  	default:
>  		return false;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e8486ba..4f73fb9 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2021,6 +2021,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
>  	}
>  }
>  
> +static const struct bpf_func_proto *
> +phys_dev_func_proto(enum bpf_func_id func_id)
> +{
> +	return sk_filter_func_proto(func_id);
> +}
> +
>  static bool __is_valid_access(int off, int size, enum bpf_access_type type)
>  {
>  	/* check bounds */
> @@ -2076,6 +2082,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
>  	return __is_valid_access(off, size, type);
>  }
>  
> +static bool __is_valid_phys_dev_access(int off, int size,
> +				       enum bpf_access_type type)
> +{
> +	if (off < 0 || off >= sizeof(struct bpf_phys_dev_md))
> +		return false;
> +
> +	if (off % size != 0)
> +		return false;
> +
> +	if (size != 4)
> +		return false;
> +
> +	return true;
> +}
> +
> +static bool phys_dev_is_valid_access(int off, int size,
> +				     enum bpf_access_type type)
> +{
> +	if (type == BPF_WRITE)
> +		return false;
> +
> +	switch (off) {
> +	case offsetof(struct bpf_phys_dev_md, len):
> +		break;
> +	default:
> +		return false;
> +	}
> +	return __is_valid_phys_dev_access(off, size, type);
> +}
> +
>  static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>  				      int src_reg, int ctx_off,
>  				      struct bpf_insn *insn_buf,
> @@ -2213,6 +2249,26 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>  	return insn - insn_buf;
>  }
>  
> +static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
> +					   int dst_reg, int src_reg,
> +					   int ctx_off,
> +					   struct bpf_insn *insn_buf,
> +					   struct bpf_prog *prog)
> +{
> +	struct bpf_insn *insn = insn_buf;
> +
> +	switch (ctx_off) {
> +	case offsetof(struct bpf_phys_dev_md, len):
> +		BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
> +
> +		*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
> +				      offsetof(struct sk_buff, len));
> +		break;
> +	}
> +
> +	return insn - insn_buf;
> +}
> +
>  static const struct bpf_verifier_ops sk_filter_ops = {
>  	.get_func_proto = sk_filter_func_proto,
>  	.is_valid_access = sk_filter_is_valid_access,
> @@ -2225,6 +2281,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
>  	.convert_ctx_access = bpf_net_convert_ctx_access,
>  };
>  
> +static const struct bpf_verifier_ops phys_dev_ops = {
> +	.get_func_proto = phys_dev_func_proto,
> +	.is_valid_access = phys_dev_is_valid_access,
> +	.convert_ctx_access = bpf_phys_dev_convert_ctx_access,
> +};
> +
>  static struct bpf_prog_type_list sk_filter_type __read_mostly = {
>  	.ops = &sk_filter_ops,
>  	.type = BPF_PROG_TYPE_SOCKET_FILTER,
> @@ -2240,11 +2302,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
>  	.type = BPF_PROG_TYPE_SCHED_ACT,
>  };
>  
> +static struct bpf_prog_type_list phys_dev_type __read_mostly = {
> +	.ops = &phys_dev_ops,
> +	.type = BPF_PROG_TYPE_PHYS_DEV,
> +};
> +
>  static int __init register_sk_filter_ops(void)
>  {
>  	bpf_register_prog_type(&sk_filter_type);
>  	bpf_register_prog_type(&sched_cls_type);
>  	bpf_register_prog_type(&sched_act_type);
> +	bpf_register_prog_type(&phys_dev_type);
>  
>  	return 0;
>  }



-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer


* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08 10:36 ` [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Jesper Dangaard Brouer
@ 2016-04-08 11:09   ` Daniel Borkmann
  2016-04-08 16:48     ` Brenden Blanco
  2016-04-08 12:33   ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 34+ messages in thread
From: Daniel Borkmann @ 2016-04-08 11:09 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, eric.dumazet,
	ecree, john.fastabend, tgraf, johannes, eranlinuxmellanox,
	lorenzo

On 04/08/2016 12:36 PM, Jesper Dangaard Brouer wrote:
> On Thu,  7 Apr 2016 21:48:46 -0700
> Brenden Blanco <bblanco@plumgrid.com> wrote:
>
>> Add a new bpf prog type that is intended to run in early stages of the
>> packet rx path. Only minimal packet metadata will be available, hence a
>> new context type, struct bpf_phys_dev_md, is exposed to userspace. So
>> far only expose the readable packet length, and only in read mode.
>>
>> The PHYS_DEV name is chosen to represent that the program is meant only
>> for physical adapters, rather than all netdevs.
>>
>> While the user visible struct is new, the underlying context must be
>> implemented as a minimal skb in order for the packet load_* instructions
>> to work. The skb filled in by the driver must have skb->len, skb->head,
>> and skb->data set, and skb->data_len == 0.
>>
>> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
>> ---
>>   include/uapi/linux/bpf.h | 14 ++++++++++
>>   kernel/bpf/verifier.c    |  1 +
>>   net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>>   3 files changed, 83 insertions(+)
>>
>> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
>> index 70eda5a..3018d83 100644
>> --- a/include/uapi/linux/bpf.h
>> +++ b/include/uapi/linux/bpf.h
>> @@ -93,6 +93,7 @@ enum bpf_prog_type {
>>   	BPF_PROG_TYPE_SCHED_CLS,
>>   	BPF_PROG_TYPE_SCHED_ACT,
>>   	BPF_PROG_TYPE_TRACEPOINT,
>> +	BPF_PROG_TYPE_PHYS_DEV,
>>   };
>>
>>   #define BPF_PSEUDO_MAP_FD	1
>> @@ -368,6 +369,19 @@ struct __sk_buff {
>>   	__u32 tc_classid;
>>   };
>>
>> +/* user return codes for PHYS_DEV prog type */
>> +enum bpf_phys_dev_action {
>> +	BPF_PHYS_DEV_DROP,
>> +	BPF_PHYS_DEV_OK,
>> +};
>
> I can imagine these extra return codes:
>
>   BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
>   BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
>   BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
>
> The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> which we can look at when we get that far...

I'd probably let the tcpdump case be handled by the rest of the stack.
Forwarding could be to some txqueue or perhaps directly to a virtual net
device e.g. the veth end sitting in a container (where fake skb would
need to be promoted to a real one). Or, perhaps instead of veth end,
directly demuxed into a target socket queue in that container ...
Alternatively for tcpdump use-case, you could also do sampling on the
bpf_phys_dev with eBPF maps.
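A minimal sketch of that last idea, inside such a phys_dev program, using
only the ctx->len field exposed so far and a hypothetical per-cpu array
map named len_hist:

  	__u32 key = ctx->len / 64;	/* bucket packets by length */
  	long *value;

  	value = bpf_map_lookup_elem(&len_hist, &key);
  	if (value)
  		*value += 1;

  	return BPF_PHYS_DEV_OK;	/* and let the packet continue */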

> For the "MODIFIED" case, people caring about checksumming, might want
> to voice their concern if we want additional info or return codes,
> indicating if software need to recalc CSUM (or e.g. if only IP-pseudo
> hdr was modified).
>
>> +/* user accessible metadata for PHYS_DEV packet hook
>> + * new fields must be added to the end of this structure
>> + */
>> +struct bpf_phys_dev_md {
>> +	__u32 len;
>> +};
>
> This is userspace visible.  I can easily imagine this structure will get
> extended.  How does a userspace eBPF program know that the struct got
> extended? (bet you got some scheme, normally I would add a "version" as
> first elem).
>
> BTW, how and where is this "struct bpf_phys_dev_md" allocated?

It isn't, see bpf_phys_dev_convert_ctx_access(). At load time the verifier
will convert/rewrite this based on offsetof() to a real access of the
per-cpu sk_buff; that's the only purpose.
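Concretely, a program access like ctx->len ends up rewritten roughly as
follows (sketch of what the hunk in patch 1 emits):

  	/* program was compiled against:
  	 *   r0 = *(u32 *)(ctx + offsetof(struct bpf_phys_dev_md, len))
  	 * after the verifier rewrite it actually runs:
  	 *   r0 = *(u32 *)(ctx + offsetof(struct sk_buff, len))
  	 * where ctx is the per-cpu sk_buff the driver filled in.
  	 */
  	*insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
  			      offsetof(struct sk_buff, len));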

Cheers,
Daniel


* Re: [RFC PATCH v2 4/5] mlx4: add support for fast rx drop bpf program
  2016-04-08  4:48 ` [RFC PATCH v2 4/5] mlx4: add support for fast rx drop bpf program Brenden Blanco
@ 2016-04-08 11:41   ` Jesper Dangaard Brouer
  2016-04-08 17:04     ` Brenden Blanco
  0 siblings, 1 reply; 34+ messages in thread
From: Jesper Dangaard Brouer @ 2016-04-08 11:41 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, brouer


On Thu,  7 Apr 2016 21:48:49 -0700 Brenden Blanco <bblanco@plumgrid.com> wrote:

> +int mlx4_call_bpf(struct bpf_prog *prog, void *data, unsigned int length)
> +{
> +	struct sk_buff *skb = this_cpu_ptr(&percpu_bpf_phys_dev_md);
> +	int ret;
> +
> +	build_bpf_phys_dev_md(skb, data, length);
> +
> +	rcu_read_lock();
> +	ret = BPF_PROG_RUN(prog, (void *)skb);
> +	rcu_read_unlock();
> +
> +	return ret;
> +}
[...]

> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> index 86bcfe5..287da02 100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> @@ -840,6 +843,23 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
>  		l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
>  			(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
>  
> +		/* A bpf program gets first chance to drop the packet. It may
> +		 * read bytes but not past the end of the frag.
> +		 */
> +		if (prog) {
> +			struct ethhdr *ethh;
> +			dma_addr_t dma;
> +
> +			dma = be64_to_cpu(rx_desc->data[0].addr);
> +			dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
> +						DMA_FROM_DEVICE);
> +			ethh = page_address(frags[0].page) +
> +							frags[0].page_offset;
> +			if (mlx4_call_bpf(prog, ethh, frags[0].page_size) ==
> +							BPF_PHYS_DEV_DROP)
> +				goto next;
> +		}
> +

Does the program need to know if the packet-page we hand over is
considered 'read-only' at the call site?  Or do we rely on my idea of
requiring the registration to "know" this...

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer


* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08 10:36 ` [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Jesper Dangaard Brouer
  2016-04-08 11:09   ` Daniel Borkmann
@ 2016-04-08 12:33   ` Jesper Dangaard Brouer
  2016-04-08 17:02     ` Brenden Blanco
  2016-04-08 17:26     ` Alexei Starovoitov
  1 sibling, 2 replies; 34+ messages in thread
From: Jesper Dangaard Brouer @ 2016-04-08 12:33 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, brouer, linux-mm


On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:

> > +/* user return codes for PHYS_DEV prog type */
> > +enum bpf_phys_dev_action {
> > +	BPF_PHYS_DEV_DROP,
> > +	BPF_PHYS_DEV_OK,
> > +};  
> 
> I can imagine these extra return codes:
> 
>  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
>  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
>  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> 
> The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> which we can look at when we get that far...

I want to point out something which is quite FUNDAMENTAL for
understanding these return codes (and the network stack).


At driver RX time, the network stack basically has two ways of
building an SKB, which is then sent up the stack.

Option-A (fastest): The packet page is writable. The SKB can be
allocated and skb->data/head can point directly to the page.  And
we place/write skb_shared_info in the end/tail-room. (This is done by
calling build_skb()).

Option-B (slower): The packet page is read-only.  The SKB cannot point
skb->data/head directly to the page, because skb_shared_info needs to be
written into skb->end (slightly hidden via skb_shinfo() casting).  To
get around this, a separate piece of memory is allocated (sped up by
__alloc_page_frag) for skb->data/head to point at, so skb_shared_info can
be written. (This is done when calling netdev/napi_alloc_skb()).
  Drivers then need to copy over the packet headers, and assign + adjust
the skb_shinfo(skb)->frags[0] offset to skip the copied headers.
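In driver terms the two options boil down to roughly the following
(illustrative sketch; page/offset/len/headlen/truesize/napi are the usual
driver-local variables, and headroom bookkeeping is omitted):

  	/* Option-A: page is writable, skb->data/head point straight at it */
  	skb = build_skb(page_address(page) + offset, truesize);
  	skb_put(skb, len);

  	/* Option-B: page is read-only; allocate a separate linear area,
  	 * copy the headers there and attach the page as a frag
  	 */
  	skb = napi_alloc_skb(napi, headlen);
  	skb_copy_to_linear_data(skb, page_address(page) + offset, headlen);
  	skb_put(skb, headlen);
  	skb_add_rx_frag(skb, 0, page, offset + headlen,
  			len - headlen, truesize);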


Unfortunately most drivers use option-B, due to the cost of calling the
page allocator.  It is only slightly more expensive to get a larger
compound page from the page allocator, which can then be partitioned into
page-fragments, thus amortizing the page alloc cost.  Unfortunately the
cost is added later, when constructing the SKB.
 Another reason for option-B is that archs with expensive IOMMU
requirements (like PowerPC) don't need to dma_unmap on every packet,
but only at the compound page level.

Side-note: Most drivers have a "copy-break" optimization, especially
for option-B, when copying header data anyhow. For a small packet, one
might as well free (or recycle) the RX page if the header size fits into
the newly allocated memory (for skb_shared_info).


For the early filter drop (DDoS use-case), it does not matter that the
packet-page is read-only.

BUT for the future XDP (eXpress Data Path) use-case it does matter.  If
we ever want to see speeds comparable to DPDK, then drivers need to
implement option-A, as this allows forwarding at the packet-page
level.

I hope my future page-pool facility can remove/hide the cost of calling
the page allocator.


Back to the return codes, thus:
-------------------------------
BPF_PHYS_DEV_SHARED requires the driver to use option-B when constructing
the SKB, and to treat packet data as read-only.

BPF_PHYS_DEV_MODIFIED requires the driver to provide a writable packet-page.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer



* Re: [RFC PATCH v2 2/5] net: add ndo to set bpf prog in adapter rx
  2016-04-08  9:38   ` Jesper Dangaard Brouer
@ 2016-04-08 16:39     ` Brenden Blanco
  0 siblings, 0 replies; 34+ messages in thread
From: Brenden Blanco @ 2016-04-08 16:39 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo

On Fri, Apr 08, 2016 at 11:38:58AM +0200, Jesper Dangaard Brouer wrote:
> 
> On Thu,  7 Apr 2016 21:48:47 -0700 Brenden Blanco <bblanco@plumgrid.com> wrote:
> 
> > Add two new set/get netdev ops for drivers implementing the
> > BPF_PROG_TYPE_PHYS_DEV filter.
> > 
> > Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> [...]
> > 
> > diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> > index cb4e508..3acf732 100644
> > --- a/include/linux/netdevice.h
> > +++ b/include/linux/netdevice.h
> [...]
> > @@ -1102,6 +1103,14 @@ struct tc_to_netdev {
> >   *	appropriate rx headroom value allows avoiding skb head copy on
> >   *	forward. Setting a negative value resets the rx headroom to the
> >   *	default value.
> > + * int  (*ndo_bpf_set)(struct net_device *dev, struct bpf_prog *prog);
> > + *	This function is used to set or clear a bpf program used in the
> > + *	earliest stages of packet rx. The prog will have been loaded as
> > + *	BPF_PROG_TYPE_PHYS_DEV. The callee is responsible for calling
> > + *	bpf_prog_put on any old progs that are stored, but not on the passed
> > + *	in prog.
> > + * bool (*ndo_bpf_get)(struct net_device *dev);
> > + *	This function is used to check if a bpf program is set on the device.
> >   *
> 
> This interface for the entire device, right.  I can imagine that users
> want to attach a eBPF program per RX queue.  Can we adapt the interface
> to support this? (could default to adding it all queues)
> 
Currently yes, for the entire device. I don't see rx queue exposed in
common setlink APIs. Wouldn't this be available only through ethtool,
generally? That would be a significant change to the api, but not a lot
of code. I would defer to others on which is cleaner. An alternative
could be to always run the program, but expose the queue number in
struct bpf_phys_dev_md. That is not as flexible since the program is
still shared, but maybe still useful.
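i.e., roughly the following, with the rx_queue field being purely
hypothetical:

  struct bpf_phys_dev_md {
  	__u32 len;
  	__u32 rx_queue;	/* hypothetical: queue the packet arrived on */
  };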
> 
> I'm also wondering if we should add a "flags" parameter.  Or maybe we
> can extend 'struct bpf_prog' with I have in mind.
> 
> When the eBPF program is attached to a RX queue, I want to know if the
> program want to modify packet-data, up-front.
> 
> The problem is that most drivers use dma_sync, which means that data is
> considered 'read-only' (the "considered" part depend on DMA engine, and
> we might find a DMA loop-hole for some configs).
>   If the program want to write, the driver have the option of
> reconfiguring the driver routine to use dma_unmap, before handing over
> the page.  Or driver can also choose to realloc the specific RX ring
> queue pages as single pages (using dma_map/unmap consistently).
>  This also allow us to give a return code indicating given driver does
> not support writable packet-pages (rejecting program).
When write-mode is enabled for this prog type, we'll add the flag. I
don't want to add unused flags prematurely. When we add such support, it
should be available in the bpf_prog struct, similar to the cb_access or
dst_needed bool fields.
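For example, it could be one more bit right next to those (the new name below
is purely hypothetical):

  /* in struct bpf_prog, alongside the existing bits */
  u16  cb_access:1,    /* existing: is the control block accessed? */
       dst_needed:1,   /* existing: do we need a dst entry? */
       pkt_write:1;    /* hypothetical: prog may modify packet data */

Drivers could then look at prog->pkt_write in their ndo_bpf_set() and refuse
programs they cannot support.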
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08 11:09   ` Daniel Borkmann
@ 2016-04-08 16:48     ` Brenden Blanco
  0 siblings, 0 replies; 34+ messages in thread
From: Brenden Blanco @ 2016-04-08 16:48 UTC (permalink / raw)
  To: Daniel Borkmann
  Cc: Jesper Dangaard Brouer, davem, netdev, tom, alexei.starovoitov,
	ogerlitz, eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo

On Fri, Apr 08, 2016 at 01:09:30PM +0200, Daniel Borkmann wrote:
> On 04/08/2016 12:36 PM, Jesper Dangaard Brouer wrote:
> >On Thu,  7 Apr 2016 21:48:46 -0700
> >Brenden Blanco <bblanco@plumgrid.com> wrote:
> >
> >>Add a new bpf prog type that is intended to run in early stages of the
> >>packet rx path. Only minimal packet metadata will be available, hence a
> >>new context type, struct bpf_phys_dev_md, is exposed to userspace. So
> >>far only expose the readable packet length, and only in read mode.
> >>
> >>The PHYS_DEV name is chosen to represent that the program is meant only
> >>for physical adapters, rather than all netdevs.
> >>
> >>While the user visible struct is new, the underlying context must be
> >>implemented as a minimal skb in order for the packet load_* instructions
> >>to work. The skb filled in by the driver must have skb->len, skb->head,
> >>and skb->data set, and skb->data_len == 0.
> >>
> >>Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> >>---
> >>  include/uapi/linux/bpf.h | 14 ++++++++++
> >>  kernel/bpf/verifier.c    |  1 +
> >>  net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
> >>  3 files changed, 83 insertions(+)
> >>
> >>diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> >>index 70eda5a..3018d83 100644
> >>--- a/include/uapi/linux/bpf.h
> >>+++ b/include/uapi/linux/bpf.h
> >>@@ -93,6 +93,7 @@ enum bpf_prog_type {
> >>  	BPF_PROG_TYPE_SCHED_CLS,
> >>  	BPF_PROG_TYPE_SCHED_ACT,
> >>  	BPF_PROG_TYPE_TRACEPOINT,
> >>+	BPF_PROG_TYPE_PHYS_DEV,
> >>  };
> >>
> >>  #define BPF_PSEUDO_MAP_FD	1
> >>@@ -368,6 +369,19 @@ struct __sk_buff {
> >>  	__u32 tc_classid;
> >>  };
> >>
> >>+/* user return codes for PHYS_DEV prog type */
> >>+enum bpf_phys_dev_action {
> >>+	BPF_PHYS_DEV_DROP,
> >>+	BPF_PHYS_DEV_OK,
> >>+};
> >
> >I can imagine these extra return codes:
> >
> >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> >
> >The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> >which we can look at when we get that far...
> 
> I'd probably let the tcpdump case be handled by the rest of the stack.
> Forwarding could be to some txqueue or perhaps directly to a virtual net
> device e.g. the veth end sitting in a container (where fake skb would
> need to be promoted to a real one). Or, perhaps instead of veth end,
> directly demuxed into a target socket queue in that container ...
> Alternatively for tcpdump use-case, you could also do sampling on the
> bpf_phy_dev with eBPF maps.
+1, there is plenty of infrastructure to deal with this already.
> 
> >For the "MODIFIED" case, people caring about checksumming, might want
> >to voice their concern if we want additional info or return codes,
> >indicating if software need to recalc CSUM (or e.g. if only IP-pseudo
> >hdr was modified).
> >
> >>+/* user accessible metadata for PHYS_DEV packet hook
> >>+ * new fields must be added to the end of this structure
> >>+ */
> >>+struct bpf_phys_dev_md {
> >>+	__u32 len;
> >>+};
> >
> >This is userspace visible.  I can easily imagine this structure will get
> >extended.  How does a userspace eBPF program know that the struct got
> >extended? (bet you got some scheme, normally I would add a "version" as
> >first elem).
Since fields are only ever added to the end, programs that access beyond
the struct as understood by the running kernel will be rejected at load
time. The struct ordering is a hard requirement. We've gone through quite
a few upgrades of this style of struct and not had any issues that I am
aware of.
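For illustration, with only 'len' defined so far, a program compiled against a
newer header that did something like the following would load fine on that
newer kernel, but be rejected by phys_dev_is_valid_access() on today's kernel
(the second field is of course made up):

  struct bpf_phys_dev_md {
          __u32 len;
          __u32 some_future_field;        /* not known to today's kernel */
  };

  int filter(struct bpf_phys_dev_md *ctx)
  {
          if (ctx->len < 64)              /* offset 0: allowed */
                  return BPF_PHYS_DEV_DROP;
          if (ctx->some_future_field)     /* offset 4: fails verification here */
                  return BPF_PHYS_DEV_DROP;
          return BPF_PHYS_DEV_OK;
  }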
> >
> >BTW, how and where is this "struct bpf_phys_dev_md" allocated?
> 
> It isn't, see bpf_phys_dev_convert_ctx_access(). At load time the verifier
> will convert/rewrite this based on offsetof() to a real access of the per
> cpu sk_buff, that's the only purpose.
> 
> Cheers,
> Daniel

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08 12:33   ` Jesper Dangaard Brouer
@ 2016-04-08 17:02     ` Brenden Blanco
  2016-04-08 19:05       ` Jesper Dangaard Brouer
  2016-04-08 17:26     ` Alexei Starovoitov
  1 sibling, 1 reply; 34+ messages in thread
From: Brenden Blanco @ 2016-04-08 17:02 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, linux-mm

On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> 
> On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> 
> > > +/* user return codes for PHYS_DEV prog type */
> > > +enum bpf_phys_dev_action {
> > > +	BPF_PHYS_DEV_DROP,
> > > +	BPF_PHYS_DEV_OK,
> > > +};  
> > 
> > I can imagine these extra return codes:
> > 
> >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > 
> > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > which we can look at when we get that far...
> 
> I want to point out something which is quite FUNDAMENTAL, for
> understanding these return codes (and network stack).
> 
> 
> At driver RX time, the network stack basically have two ways of
> building an SKB, which is send up the stack.
> 
> Option-A (fastest): The packet page is writable. The SKB can be
> allocated and skb->data/head can point directly to the page.  And
> we place/write skb_shared_info in the end/tail-room. (This is done by
> calling build_skb()).
> 
> Option-B (slower): The packet page is read-only.  The SKB cannot point
> skb->data/head directly to the page, because skb_shared_info need to be
> written into skb->end (slightly hidden via skb_shinfo() casting).  To
> get around this, a separate piece of memory is allocated (speedup by
> __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> be written. (This is done when calling netdev/napi_alloc_skb()).
>   Drivers then need to copy over packet headers, and assign + adjust
> skb_shinfo(skb)->frags[0] offset to skip copied headers.
> 
> 
> Unfortunately most drivers use option-B.  Due to cost of calling the
> page allocator.  It is only slightly most expensive to get a larger
> compound page from the page allocator, which then can be partitioned into
> page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> cost is added later, when constructing the SKB.
>  Another reason for option-B, is that archs with expensive IOMMU
> requirements (like PowerPC), don't need to dma_unmap on every packet,
> but only on the compound page level.
> 
> Side-note: Most drivers have a "copy-break" optimization.  Especially
> for option-B, when copying header data anyhow. For small packet, one
> might as well free (or recycle) the RX page, if header size fits into
> the newly allocated memory (for skb_shared_info).
> 
> 
> For the early filter drop (DDoS use-case), it does not matter that the
> packet-page is read-only.
> 
> BUT for the future XDP (eXpress Data Path) use-case it does matter.  If
> we ever want to see speeds comparable to DPDK, then drivers to
> need to implement option-A, as this allow forwarding at the packet-page
> level.
> 
> I hope, my future page-pool facility can remove/hide the cost calling
> the page allocator.
> 
Can't wait! This will open up a lot of doors.
> 
> Back to the return codes, thus:
> -------------------------------
> BPF_PHYS_DEV_SHARED requires driver use option-B, when constructing
> the SKB, and treat packet data as read-only.
> 
> BPF_PHYS_DEV_MODIFIED requires driver to provide a writable packet-page.
I understand the driver/hw requirement, but I think the codes themselves
need some tweaking. For instance, if the packet is both modified and
forwarded, should the flags be ORed together? Or is the need for this
return code made obsolete if the driver knows ahead of time via struct
bpf_prog flags that the prog intends to modify the packet, and can set
up the page accordingly?
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 4/5] mlx4: add support for fast rx drop bpf program
  2016-04-08 11:41   ` Jesper Dangaard Brouer
@ 2016-04-08 17:04     ` Brenden Blanco
  0 siblings, 0 replies; 34+ messages in thread
From: Brenden Blanco @ 2016-04-08 17:04 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo

On Fri, Apr 08, 2016 at 01:41:58PM +0200, Jesper Dangaard Brouer wrote:
> 
> On Thu,  7 Apr 2016 21:48:49 -0700 Brenden Blanco <bblanco@plumgrid.com> wrote:
> 
> > +int mlx4_call_bpf(struct bpf_prog *prog, void *data, unsigned int length)
> > +{
> > +	struct sk_buff *skb = this_cpu_ptr(&percpu_bpf_phys_dev_md);
> > +	int ret;
> > +
> > +	build_bpf_phys_dev_md(skb, data, length);
> > +
> > +	rcu_read_lock();
> > +	ret = BPF_PROG_RUN(prog, (void *)skb);
> > +	rcu_read_unlock();
> > +
> > +	return ret;
> > +}
> [...]
> 
> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> > index 86bcfe5..287da02 100644
> > --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
> > @@ -840,6 +843,23 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
> >  		l2_tunnel = (dev->hw_enc_features & NETIF_F_RXCSUM) &&
> >  			(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
> >  
> > +		/* A bpf program gets first chance to drop the packet. It may
> > +		 * read bytes but not past the end of the frag.
> > +		 */
> > +		if (prog) {
> > +			struct ethhdr *ethh;
> > +			dma_addr_t dma;
> > +
> > +			dma = be64_to_cpu(rx_desc->data[0].addr);
> > +			dma_sync_single_for_cpu(priv->ddev, dma, sizeof(*ethh),
> > +						DMA_FROM_DEVICE);
> > +			ethh = page_address(frags[0].page) +
> > +							frags[0].page_offset;
> > +			if (mlx4_call_bpf(prog, ethh, frags[0].page_size) ==
> > +							BPF_PHYS_DEV_DROP)
> > +				goto next;
> > +		}
> > +
> 
> Do the program need to know if the packet-page we handover is
> considered 'read-only' at the call site?  Or do we rely on my idea
> requiring the registration to "know" this...
It seems that we're leaning towards the latter at this point. In either
case, we won't allow buggy situations; let's see which approach
works better when we have RFC code in hand.
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08 12:33   ` Jesper Dangaard Brouer
  2016-04-08 17:02     ` Brenden Blanco
@ 2016-04-08 17:26     ` Alexei Starovoitov
  2016-04-08 20:08       ` Jesper Dangaard Brouer
  1 sibling, 1 reply; 34+ messages in thread
From: Alexei Starovoitov @ 2016-04-08 17:26 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Brenden Blanco, davem, netdev, tom, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, linux-mm

On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> 
> On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> 
> > > +/* user return codes for PHYS_DEV prog type */
> > > +enum bpf_phys_dev_action {
> > > +	BPF_PHYS_DEV_DROP,
> > > +	BPF_PHYS_DEV_OK,
> > > +};  
> > 
> > I can imagine these extra return codes:
> > 
> >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > 
> > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > which we can look at when we get that far...
> 
> I want to point out something which is quite FUNDAMENTAL, for
> understanding these return codes (and network stack).
> 
> 
> At driver RX time, the network stack basically have two ways of
> building an SKB, which is send up the stack.
> 
> Option-A (fastest): The packet page is writable. The SKB can be
> allocated and skb->data/head can point directly to the page.  And
> we place/write skb_shared_info in the end/tail-room. (This is done by
> calling build_skb()).
> 
> Option-B (slower): The packet page is read-only.  The SKB cannot point
> skb->data/head directly to the page, because skb_shared_info need to be
> written into skb->end (slightly hidden via skb_shinfo() casting).  To
> get around this, a separate piece of memory is allocated (speedup by
> __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> be written. (This is done when calling netdev/napi_alloc_skb()).
>   Drivers then need to copy over packet headers, and assign + adjust
> skb_shinfo(skb)->frags[0] offset to skip copied headers.
> 
> 
> Unfortunately most drivers use option-B.  Due to cost of calling the
> page allocator.  It is only slightly most expensive to get a larger
> compound page from the page allocator, which then can be partitioned into
> page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> cost is added later, when constructing the SKB.
>  Another reason for option-B, is that archs with expensive IOMMU
> requirements (like PowerPC), don't need to dma_unmap on every packet,
> but only on the compound page level.
> 
> Side-note: Most drivers have a "copy-break" optimization.  Especially
> for option-B, when copying header data anyhow. For small packet, one
> might as well free (or recycle) the RX page, if header size fits into
> the newly allocated memory (for skb_shared_info).

I think you guys are going into overdesign territory, so
. nack on read-only pages
. nack on copy-break approach
. nack on per-ring programs
. nack on modified/stolen/shared return codes

The whole thing must be dead simple to use. The above is not simple by any means.
The programs must see writable pages only and return codes:
drop, pass to stack, redirect to xmit.
If a program wishes to modify packets before passing them to the stack, it
shouldn't need to deal with different return values.
No special handling for small or large packets. No header splits.
The program must not be aware of any such things.
Drivers can use DMA_BIDIRECTIONAL to allow the received page to be
modified by the program and immediately sent to xmit.
No dma map/unmap/sync per packet. If some odd architectures/dma setups
cannot do it, then XDP will not be applicable there.
We are not going to sacrifice performance for generality.
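In other words, the complete set of return codes should look no more
complicated than this (TX is not part of this patchset, it's shown only to
make the point):

  enum bpf_phys_dev_action {
          BPF_PHYS_DEV_DROP,      /* drop the packet */
          BPF_PHYS_DEV_OK,        /* pass it up the stack */
          BPF_PHYS_DEV_TX,        /* hypothetical: send it straight back out */
  };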


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08 17:02     ` Brenden Blanco
@ 2016-04-08 19:05       ` Jesper Dangaard Brouer
  0 siblings, 0 replies; 34+ messages in thread
From: Jesper Dangaard Brouer @ 2016-04-08 19:05 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, brouer

On Fri, 8 Apr 2016 10:02:00 -0700
Brenden Blanco <bblanco@plumgrid.com> wrote:

> On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> > 
> > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> >   
> > > > +/* user return codes for PHYS_DEV prog type */
> > > > +enum bpf_phys_dev_action {
> > > > +	BPF_PHYS_DEV_DROP,
> > > > +	BPF_PHYS_DEV_OK,
> > > > +};    
> > > 
> > > I can imagine these extra return codes:
> > > 
> > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> > >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> > >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > > 
> > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > > which we can look at when we get that far...  
> > 
> > I want to point out something which is quite FUNDAMENTAL, for
> > understanding these return codes (and network stack).
> > 
> > 
> > At driver RX time, the network stack basically have two ways of
> > building an SKB, which is send up the stack.
> > 
> > Option-A (fastest): The packet page is writable. The SKB can be
> > allocated and skb->data/head can point directly to the page.  And
> > we place/write skb_shared_info in the end/tail-room. (This is done by
> > calling build_skb()).
> > 
> > Option-B (slower): The packet page is read-only.  The SKB cannot point
> > skb->data/head directly to the page, because skb_shared_info need to be
> > written into skb->end (slightly hidden via skb_shinfo() casting).  To
> > get around this, a separate piece of memory is allocated (speedup by
> > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> > be written. (This is done when calling netdev/napi_alloc_skb()).
> >   Drivers then need to copy over packet headers, and assign + adjust
> > skb_shinfo(skb)->frags[0] offset to skip copied headers.
> > 
> > 
> > Unfortunately most drivers use option-B.  Due to cost of calling the
> > page allocator.  It is only slightly most expensive to get a larger
> > compound page from the page allocator, which then can be partitioned into
> > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> > cost is added later, when constructing the SKB.
> >  Another reason for option-B, is that archs with expensive IOMMU
> > requirements (like PowerPC), don't need to dma_unmap on every packet,
> > but only on the compound page level.
> > 
> > Side-note: Most drivers have a "copy-break" optimization.  Especially
> > for option-B, when copying header data anyhow. For small packet, one
> > might as well free (or recycle) the RX page, if header size fits into
> > the newly allocated memory (for skb_shared_info).
> > 
> > 
> > For the early filter drop (DDoS use-case), it does not matter that the
> > packet-page is read-only.
> > 
> > BUT for the future XDP (eXpress Data Path) use-case it does matter.  If
> > we ever want to see speeds comparable to DPDK, then drivers to
> > need to implement option-A, as this allow forwarding at the packet-page
> > level.
> > 
> > I hope, my future page-pool facility can remove/hide the cost calling
> > the page allocator.
> >   
> Can't wait! This will open up a lot of doors.
>

If you are talking about the page-pool, then it is just one piece of the
puzzle, not the silver bullet ;-)

> > 
> > Back to the return codes, thus:
> > -------------------------------
> > BPF_PHYS_DEV_SHARED requires driver use option-B, when constructing
> > the SKB, and treat packet data as read-only.
> > 
> > BPF_PHYS_DEV_MODIFIED requires driver to provide a writable packet-page.  
>
> I understand the driver/hw requirement, but the codes themselves I think
> need some tweaking.

I'm very open to changing these return codes. I'm just trying to open
up the discussion.


> For instance, if the packet is both modified and forwarded, should
> the flags be ORed together? 

I didn't see these as bit-flags. I assumed that if you want to forward
the packet, then you need to steal it (BPF_PHYS_DEV_STOLEN) and cannot
return it to the stack.

I'm open to changing this to bit-flags, BUT we just have to take care
not to introduce too many things we need to check, due to performance
issues.


> Or is the need for this return code made obsolete if the driver knows
> ahead of time via struct bpf_prog flags that the prog intends to
> modify the packet, and can set up the page accordingly?

Yes, maybe we can drop the modified (BPF_PHYS_DEV_MODIFIED) return code.
I was just thinking this could be used to indicate if the checksum
would need to be recalculated.  If the usual checksum people don't
care, we should drop this indication.

Think about it performance-wise... if we know the program _can_ modify
the packet (but don't know whether it actually did), then we would have to
mark the SKB handed to the stack as needing its checksum recalculated, always...
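In driver terms that would mean something like the following when building
the SKB (prog->pkt_write being a hypothetical per-program flag):

  /* once the program may have rewritten packet bytes, any checksum
   * result the hardware reported can no longer be trusted */
  if (prog && prog->pkt_write)
          skb->ip_summed = CHECKSUM_NONE;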

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08 17:26     ` Alexei Starovoitov
@ 2016-04-08 20:08       ` Jesper Dangaard Brouer
  2016-04-08 21:34         ` Alexei Starovoitov
  0 siblings, 1 reply; 34+ messages in thread
From: Jesper Dangaard Brouer @ 2016-04-08 20:08 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Brenden Blanco, davem, netdev, tom, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, linux-mm, brouer

On Fri, 8 Apr 2016 10:26:53 -0700
Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:

> On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> > 
> > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> >   
> > > > +/* user return codes for PHYS_DEV prog type */
> > > > +enum bpf_phys_dev_action {
> > > > +	BPF_PHYS_DEV_DROP,
> > > > +	BPF_PHYS_DEV_OK,
> > > > +};    
> > > 
> > > I can imagine these extra return codes:
> > > 
> > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> > >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> > >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > > 
> > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > > which we can look at when we get that far...  
> > 
> > I want to point out something which is quite FUNDAMENTAL, for
> > understanding these return codes (and network stack).
> > 
> > 
> > At driver RX time, the network stack basically have two ways of
> > building an SKB, which is send up the stack.
> > 
> > Option-A (fastest): The packet page is writable. The SKB can be
> > allocated and skb->data/head can point directly to the page.  And
> > we place/write skb_shared_info in the end/tail-room. (This is done by
> > calling build_skb()).
> > 
> > Option-B (slower): The packet page is read-only.  The SKB cannot point
> > skb->data/head directly to the page, because skb_shared_info need to be
> > written into skb->end (slightly hidden via skb_shinfo() casting).  To
> > get around this, a separate piece of memory is allocated (speedup by
> > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> > be written. (This is done when calling netdev/napi_alloc_skb()).
> >   Drivers then need to copy over packet headers, and assign + adjust
> > skb_shinfo(skb)->frags[0] offset to skip copied headers.
> > 
> > 
> > Unfortunately most drivers use option-B.  Due to cost of calling the
> > page allocator.  It is only slightly most expensive to get a larger
> > compound page from the page allocator, which then can be partitioned into
> > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> > cost is added later, when constructing the SKB.
> >  Another reason for option-B, is that archs with expensive IOMMU
> > requirements (like PowerPC), don't need to dma_unmap on every packet,
> > but only on the compound page level.
> > 
> > Side-note: Most drivers have a "copy-break" optimization.  Especially
> > for option-B, when copying header data anyhow. For small packet, one
> > might as well free (or recycle) the RX page, if header size fits into
> > the newly allocated memory (for skb_shared_info).  
> 
> I think you guys are going into overdesign territory, so
> . nack on read-only pages

Unfortunately you cannot just ignore or nack read-only pages. They are
a fact in the current drivers.

Most drivers today (at least the ones we care about) only deliver
read-only pages.  If you don't accept read-only pages day-1, then you
first have to rewrite a lot of drivers... and that will stall the
project!  How will you deal with this fact?

The early drop filter use-case in this patchset can ignore read-only
pages.  But ABI-wise we need to deal with the future case where we do
need/require writable pages.  A simple needs-writable-pages flag in the
API could help us move forward.
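E.g. something as small as this in a driver's ndo_bpf_set() implementation
(both field names are hypothetical):

  /* reject a program that wants to write packet data when this
   * driver/ring only hands out read-only pages */
  if (prog && prog->pkt_write && !priv->rx_pages_writable)
          return -EOPNOTSUPP;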


> . nack on copy-break approach

Copy-break can be ignored.  It sort of happens at a higher level in the
driver. (Eric likely wants/cares that this happens for local socket delivery.)


> . nack on per-ring programs

Hmmm... I don't see it as a lot more complicated to attach the program
to the ring.  But maybe we can extend the API later, and thus postpone that
discussion.

> . nack on modified/stolen/shared return codes
> 
> The whole thing must be dead simple to use. Above is not simple by any means.

Maybe you missed that the above was a description of how the current
network stack handles this, which is not simple... and which is the root
of the whole performance issue.


> The programs must see writeable pages only and return codes:
> drop, pass to stack, redirect to xmit.
> If program wishes to modify packets before passing it to stack, it
> shouldn't need to deal with different return values.

> No special things to deal with small or large packets. No header splits.
> Program must not be aware of any such things.

I agree on this.  This layer only deals with packets at the page level:
single packets stored in contiguous memory.


> Drivers can use DMA_BIDIRECTIONAL to allow received page to be
> modified by the program and immediately sent to xmit.

We just have to verify that DMA_BIDIRECTIONAL does not add extra
overhead (DMA-API-HOWTO.txt explicitly states that it likely does, but I'd
like to verify this with a micro benchmark).
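Concretely, the benchmark would compare the driver's RX page mapping done as
today versus bidirectionally, roughly:

  dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
  /* versus */
  dma = dma_map_page(priv->ddev, page, 0, PAGE_SIZE, DMA_BIDIRECTIONAL);

measured on the RX ring allocation path.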

> No dma map/unmap/sync per packet. If some odd architectures/dma setups
> cannot do it, then XDP will not be applicable there.

I do like the idea of rejecting XDP eBPF programs when the DMA
setup is not compatible, or when the driver does not implement e.g.
writable DMA pages.

Customers wanting this feature will then go buy the NIC which supports
it.  There is nothing more motivating for NIC vendors than seeing
customers buy the competitor's hardware. And it only requires a driver
change to capture this market...


> We are not going to sacrifice performance for generality.

Agree.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08 20:08       ` Jesper Dangaard Brouer
@ 2016-04-08 21:34         ` Alexei Starovoitov
  2016-04-09 11:29           ` Tom Herbert
  0 siblings, 1 reply; 34+ messages in thread
From: Alexei Starovoitov @ 2016-04-08 21:34 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Brenden Blanco, davem, netdev, tom, ogerlitz, daniel,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo, linux-mm

On Fri, Apr 08, 2016 at 10:08:08PM +0200, Jesper Dangaard Brouer wrote:
> On Fri, 8 Apr 2016 10:26:53 -0700
> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
> 
> > On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
> > > 
> > > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
> > >   
> > > > > +/* user return codes for PHYS_DEV prog type */
> > > > > +enum bpf_phys_dev_action {
> > > > > +	BPF_PHYS_DEV_DROP,
> > > > > +	BPF_PHYS_DEV_OK,
> > > > > +};    
> > > > 
> > > > I can imagine these extra return codes:
> > > > 
> > > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
> > > >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
> > > >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
> > > > 
> > > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
> > > > which we can look at when we get that far...  
> > > 
> > > I want to point out something which is quite FUNDAMENTAL, for
> > > understanding these return codes (and network stack).
> > > 
> > > 
> > > At driver RX time, the network stack basically have two ways of
> > > building an SKB, which is send up the stack.
> > > 
> > > Option-A (fastest): The packet page is writable. The SKB can be
> > > allocated and skb->data/head can point directly to the page.  And
> > > we place/write skb_shared_info in the end/tail-room. (This is done by
> > > calling build_skb()).
> > > 
> > > Option-B (slower): The packet page is read-only.  The SKB cannot point
> > > skb->data/head directly to the page, because skb_shared_info need to be
> > > written into skb->end (slightly hidden via skb_shinfo() casting).  To
> > > get around this, a separate piece of memory is allocated (speedup by
> > > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
> > > be written. (This is done when calling netdev/napi_alloc_skb()).
> > >   Drivers then need to copy over packet headers, and assign + adjust
> > > skb_shinfo(skb)->frags[0] offset to skip copied headers.
> > > 
> > > 
> > > Unfortunately most drivers use option-B.  Due to cost of calling the
> > > page allocator.  It is only slightly most expensive to get a larger
> > > compound page from the page allocator, which then can be partitioned into
> > > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
> > > cost is added later, when constructing the SKB.
> > >  Another reason for option-B, is that archs with expensive IOMMU
> > > requirements (like PowerPC), don't need to dma_unmap on every packet,
> > > but only on the compound page level.
> > > 
> > > Side-note: Most drivers have a "copy-break" optimization.  Especially
> > > for option-B, when copying header data anyhow. For small packet, one
> > > might as well free (or recycle) the RX page, if header size fits into
> > > the newly allocated memory (for skb_shared_info).  
> > 
> > I think you guys are going into overdesign territory, so
> > . nack on read-only pages
> 
> Unfortunately you cannot just ignore or nack read-only pages. They are
> a fact in the current drivers.
> 
> Most drivers today (at-least the ones we care about) only deliver
> read-only pages.  If you don't accept read-only pages day-1, then you
> first have to rewrite a lot of drivers... and that will stall the
> project!  How will you deal with this fact?
> 
> The early drop filter use-case in this patchset, can ignore read-only
> pages.  But ABI wise we need to deal with the future case where we do
> need/require writeable pages.  A simple need-writable pages in the API
> could help us move forward.

the program should never need to worry about whether the dma buffer is
writable or not. Complicating drivers, api, abi and usability
for the single use case of fast packet drop is not acceptable.
XDP is not going to be a fit for all drivers and all architectures.
That is the crucial 'performance vs generality' aspect of the design.
All kernel-bypasses take advantage of a specific architecture.
We have to take advantage of it as well. If it doesn't fit
powerpc with iommu, so be it. XDP will return -enotsupp.
That is a fundamental point. We have to cut such corners and avoid
all cases where unnecessary generality hurts performance.
Read-only pages are clearly such a thing.

> > The whole thing must be dead simple to use. Above is not simple by any means.
> 
> Maybe you missed that the above was a description of how the current
> network stack handles this, which is not simple... which is root of the
> hole performance issue.

Disagree. The stack has copy-break, gro, gso and everything else because
it's serving the _host_ use case. XDP is the packet-forwarder use case.
The requirements are completely different. Ex. the host needs gso
in the core and drivers. It needs to deliver data all the way
to user space and back. That is hard and that's where the complexity
comes from. For a packet forwarder none of it is needed. So saying
"look, we have this complexity, so XDP needs it too" is a flawed argument.
The kernel is serving the host and its applications.
XDP is a pure packet-in/packet-out framework to achieve better
performance than kernel-bypass, since the kernel is the right
place to do it. It has clean access to interrupts, per-cpu data,
the scheduler, device registers and so on.
Though there are only two broad use cases, packet drop and forward,
they cover a ton of real cases: firewalls, dos prevention,
load balancers, nat, etc. In other words, mostly stateless ones.
As soon as a packet needs to be queued somewhere, we have to
instantiate an skb and pass it to the stack.
So no queues in XDP and no 'stolen' and 'shared' return codes.
The program always runs to completion on a single packet.
There is no header vs payload split. There is no header
from the program's point of view. It's raw bytes in a dma buffer.

> I do like the idea of rejecting XDP eBPF programs based on the DMA
> setup is not compatible, or if the driver does not implement e.g.
> writable DMA pages.

exactly.

> Customers wanting this feature will then go buy the NIC which support
> this feature.  There is nothing more motivating for NIC vendors seeing
> customers buying the competitors hardware. And it only require a driver
> change to get this market...

exactly.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08  4:48 [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Brenden Blanco
                   ` (4 preceding siblings ...)
  2016-04-08 10:36 ` [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Jesper Dangaard Brouer
@ 2016-04-09 11:17 ` Tom Herbert
  2016-04-09 12:27   ` Jesper Dangaard Brouer
  2016-04-09 17:00   ` Alexei Starovoitov
  5 siblings, 2 replies; 34+ messages in thread
From: Tom Herbert @ 2016-04-09 11:17 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: David S. Miller, Linux Kernel Network Developers,
	Alexei Starovoitov, Or Gerlitz, Daniel Borkmann,
	Jesper Dangaard Brouer, Eric Dumazet, Edward Cree,
	john fastabend, Thomas Graf, Johannes Berg, eranlinuxmellanox,
	Lorenzo Colitti

On Fri, Apr 8, 2016 at 1:48 AM, Brenden Blanco <bblanco@plumgrid.com> wrote:
> Add a new bpf prog type that is intended to run in early stages of the
> packet rx path. Only minimal packet metadata will be available, hence a
> new context type, struct bpf_phys_dev_md, is exposed to userspace. So
> far only expose the readable packet length, and only in read mode.
>
> The PHYS_DEV name is chosen to represent that the program is meant only
> for physical adapters, rather than all netdevs.
>
> While the user visible struct is new, the underlying context must be
> implemented as a minimal skb in order for the packet load_* instructions
> to work. The skb filled in by the driver must have skb->len, skb->head,
> and skb->data set, and skb->data_len == 0.
>
> Signed-off-by: Brenden Blanco <bblanco@plumgrid.com>
> ---
>  include/uapi/linux/bpf.h | 14 ++++++++++
>  kernel/bpf/verifier.c    |  1 +
>  net/core/filter.c        | 68 ++++++++++++++++++++++++++++++++++++++++++++++++
>  3 files changed, 83 insertions(+)
>
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index 70eda5a..3018d83 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -93,6 +93,7 @@ enum bpf_prog_type {
>         BPF_PROG_TYPE_SCHED_CLS,
>         BPF_PROG_TYPE_SCHED_ACT,
>         BPF_PROG_TYPE_TRACEPOINT,
> +       BPF_PROG_TYPE_PHYS_DEV,
>  };
>
>  #define BPF_PSEUDO_MAP_FD      1
> @@ -368,6 +369,19 @@ struct __sk_buff {
>         __u32 tc_classid;
>  };
>
> +/* user return codes for PHYS_DEV prog type */
> +enum bpf_phys_dev_action {
> +       BPF_PHYS_DEV_DROP,
> +       BPF_PHYS_DEV_OK,

I don't like OK. Maybe this should mean LOCAL. We also need FORWARD (not sure
how to handle GRO yet).

I would suggest that we format the return code as code:subcode, where
the above are the codes and the subcode is interpreted relative to the major
code. For instance, in forwarding the subcodes could indicate a forwarding
instruction (maybe a queue). DROP subcodes might answer why.
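Roughly along these lines (macro names and field widths are just for
illustration):

  #define XDP_RET(code, subcode)   (((code) << 16) | ((subcode) & 0xffff))
  #define XDP_RET_CODE(ret)        ((ret) >> 16)
  #define XDP_RET_SUBCODE(ret)     ((ret) & 0xffff)

  /* e.g. a forwarding program could return XDP_RET(XDP_FORWARD, txq) */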

> +};
> +
> +/* user accessible metadata for PHYS_DEV packet hook
> + * new fields must be added to the end of this structure
> + */
> +struct bpf_phys_dev_md {
> +       __u32 len;
> +};
> +

The naming is verbose and too tied to a specific implementation. The
metadata structure is just a generic abstraction of a receive
descriptor; it is not specific to BPF or physical devices. For instance,
the BPF programs for this should be exportable to a device, userspace,
other OSes, etc. Also, inevitably someone will have a reason to use an
alternative to BPF-based filtering built on the same interface abstraction.

So for the above I suggest we simply name the structure xdp_md.
Similarly, drop codes can be XDP_DROP, etc. Also, these should go in a
new header, say xdp.h, which will also be in uapi.

One other API issue is how to deal with encapsulation. In this case a
header may be prepended to the packet, I assume there are BPF helper
functions and we don't need to return a new length or start?

Tom

>  struct bpf_tunnel_key {
>         __u32 tunnel_id;
>         union {
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index 58792fe..877542d 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -1344,6 +1344,7 @@ static bool may_access_skb(enum bpf_prog_type type)
>         case BPF_PROG_TYPE_SOCKET_FILTER:
>         case BPF_PROG_TYPE_SCHED_CLS:
>         case BPF_PROG_TYPE_SCHED_ACT:
> +       case BPF_PROG_TYPE_PHYS_DEV:
>                 return true;
>         default:
>                 return false;
> diff --git a/net/core/filter.c b/net/core/filter.c
> index e8486ba..4f73fb9 100644
> --- a/net/core/filter.c
> +++ b/net/core/filter.c
> @@ -2021,6 +2021,12 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
>         }
>  }
>
> +static const struct bpf_func_proto *
> +phys_dev_func_proto(enum bpf_func_id func_id)
> +{
> +       return sk_filter_func_proto(func_id);
> +}
> +
>  static bool __is_valid_access(int off, int size, enum bpf_access_type type)
>  {
>         /* check bounds */
> @@ -2076,6 +2082,36 @@ static bool tc_cls_act_is_valid_access(int off, int size,
>         return __is_valid_access(off, size, type);
>  }
>
> +static bool __is_valid_phys_dev_access(int off, int size,
> +                                      enum bpf_access_type type)
> +{
> +       if (off < 0 || off >= sizeof(struct bpf_phys_dev_md))
> +               return false;
> +
> +       if (off % size != 0)
> +               return false;
> +
> +       if (size != 4)
> +               return false;
> +
> +       return true;
> +}
> +
> +static bool phys_dev_is_valid_access(int off, int size,
> +                                    enum bpf_access_type type)
> +{
> +       if (type == BPF_WRITE)
> +               return false;
> +
> +       switch (off) {
> +       case offsetof(struct bpf_phys_dev_md, len):
> +               break;
> +       default:
> +               return false;
> +       }
> +       return __is_valid_phys_dev_access(off, size, type);
> +}
> +
>  static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>                                       int src_reg, int ctx_off,
>                                       struct bpf_insn *insn_buf,
> @@ -2213,6 +2249,26 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
>         return insn - insn_buf;
>  }
>
> +static u32 bpf_phys_dev_convert_ctx_access(enum bpf_access_type type,
> +                                          int dst_reg, int src_reg,
> +                                          int ctx_off,
> +                                          struct bpf_insn *insn_buf,
> +                                          struct bpf_prog *prog)
> +{
> +       struct bpf_insn *insn = insn_buf;
> +
> +       switch (ctx_off) {
> +       case offsetof(struct bpf_phys_dev_md, len):
> +               BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
> +
> +               *insn++ = BPF_LDX_MEM(BPF_W, dst_reg, src_reg,
> +                                     offsetof(struct sk_buff, len));
> +               break;
> +       }
> +
> +       return insn - insn_buf;
> +}
> +
>  static const struct bpf_verifier_ops sk_filter_ops = {
>         .get_func_proto = sk_filter_func_proto,
>         .is_valid_access = sk_filter_is_valid_access,
> @@ -2225,6 +2281,12 @@ static const struct bpf_verifier_ops tc_cls_act_ops = {
>         .convert_ctx_access = bpf_net_convert_ctx_access,
>  };
>
> +static const struct bpf_verifier_ops phys_dev_ops = {
> +       .get_func_proto = phys_dev_func_proto,
> +       .is_valid_access = phys_dev_is_valid_access,
> +       .convert_ctx_access = bpf_phys_dev_convert_ctx_access,
> +};
> +
>  static struct bpf_prog_type_list sk_filter_type __read_mostly = {
>         .ops = &sk_filter_ops,
>         .type = BPF_PROG_TYPE_SOCKET_FILTER,
> @@ -2240,11 +2302,17 @@ static struct bpf_prog_type_list sched_act_type __read_mostly = {
>         .type = BPF_PROG_TYPE_SCHED_ACT,
>  };
>
> +static struct bpf_prog_type_list phys_dev_type __read_mostly = {
> +       .ops = &phys_dev_ops,
> +       .type = BPF_PROG_TYPE_PHYS_DEV,
> +};
> +
>  static int __init register_sk_filter_ops(void)
>  {
>         bpf_register_prog_type(&sk_filter_type);
>         bpf_register_prog_type(&sched_cls_type);
>         bpf_register_prog_type(&sched_act_type);
> +       bpf_register_prog_type(&phys_dev_type);
>
>         return 0;
>  }
> --
> 2.8.0
>

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-08 21:34         ` Alexei Starovoitov
@ 2016-04-09 11:29           ` Tom Herbert
  2016-04-09 15:29             ` Jamal Hadi Salim
  0 siblings, 1 reply; 34+ messages in thread
From: Tom Herbert @ 2016-04-09 11:29 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Jesper Dangaard Brouer, Brenden Blanco, David S. Miller,
	Linux Kernel Network Developers, Or Gerlitz, Daniel Borkmann,
	Eric Dumazet, Edward Cree, john fastabend, Thomas Graf,
	Johannes Berg, eranlinuxmellanox, Lorenzo Colitti, linux-mm

On Fri, Apr 8, 2016 at 6:34 PM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Fri, Apr 08, 2016 at 10:08:08PM +0200, Jesper Dangaard Brouer wrote:
>> On Fri, 8 Apr 2016 10:26:53 -0700
>> Alexei Starovoitov <alexei.starovoitov@gmail.com> wrote:
>>
>> > On Fri, Apr 08, 2016 at 02:33:40PM +0200, Jesper Dangaard Brouer wrote:
>> > >
>> > > On Fri, 8 Apr 2016 12:36:14 +0200 Jesper Dangaard Brouer <brouer@redhat.com> wrote:
>> > >
>> > > > > +/* user return codes for PHYS_DEV prog type */
>> > > > > +enum bpf_phys_dev_action {
>> > > > > +     BPF_PHYS_DEV_DROP,
>> > > > > +     BPF_PHYS_DEV_OK,
>> > > > > +};
>> > > >
>> > > > I can imagine these extra return codes:
>> > > >
>> > > >  BPF_PHYS_DEV_MODIFIED,   /* Packet page/payload modified */
>> > > >  BPF_PHYS_DEV_STOLEN,     /* E.g. forward use-case */
>> > > >  BPF_PHYS_DEV_SHARED,     /* Queue for async processing, e.g. tcpdump use-case */
>> > > >
>> > > > The "STOLEN" and "SHARED" use-cases require some refcnt manipulations,
>> > > > which we can look at when we get that far...
>> > >
>> > > I want to point out something which is quite FUNDAMENTAL, for
>> > > understanding these return codes (and network stack).
>> > >
>> > >
>> > > At driver RX time, the network stack basically have two ways of
>> > > building an SKB, which is send up the stack.
>> > >
>> > > Option-A (fastest): The packet page is writable. The SKB can be
>> > > allocated and skb->data/head can point directly to the page.  And
>> > > we place/write skb_shared_info in the end/tail-room. (This is done by
>> > > calling build_skb()).
>> > >
>> > > Option-B (slower): The packet page is read-only.  The SKB cannot point
>> > > skb->data/head directly to the page, because skb_shared_info need to be
>> > > written into skb->end (slightly hidden via skb_shinfo() casting).  To
>> > > get around this, a separate piece of memory is allocated (speedup by
>> > > __alloc_page_frag) for pointing skb->data/head, so skb_shared_info can
>> > > be written. (This is done when calling netdev/napi_alloc_skb()).
>> > >   Drivers then need to copy over packet headers, and assign + adjust
>> > > skb_shinfo(skb)->frags[0] offset to skip copied headers.
>> > >
>> > >
>> > > Unfortunately most drivers use option-B.  Due to cost of calling the
>> > > page allocator.  It is only slightly most expensive to get a larger
>> > > compound page from the page allocator, which then can be partitioned into
>> > > page-fragments, thus amortizing the page alloc cost.  Unfortunately the
>> > > cost is added later, when constructing the SKB.
>> > >  Another reason for option-B, is that archs with expensive IOMMU
>> > > requirements (like PowerPC), don't need to dma_unmap on every packet,
>> > > but only on the compound page level.
>> > >
>> > > Side-note: Most drivers have a "copy-break" optimization.  Especially
>> > > for option-B, when copying header data anyhow. For small packet, one
>> > > might as well free (or recycle) the RX page, if header size fits into
>> > > the newly allocated memory (for skb_shared_info).
>> >
>> > I think you guys are going into overdesign territory, so
>> > . nack on read-only pages
>>
>> Unfortunately you cannot just ignore or nack read-only pages. They are
>> a fact in the current drivers.
>>
>> Most drivers today (at-least the ones we care about) only deliver
>> read-only pages.  If you don't accept read-only pages day-1, then you
>> first have to rewrite a lot of drivers... and that will stall the
>> project!  How will you deal with this fact?
>>
>> The early drop filter use-case in this patchset, can ignore read-only
>> pages.  But ABI wise we need to deal with the future case where we do
>> need/require writeable pages.  A simple need-writable pages in the API
>> could help us move forward.
>
> the program should never need to worry about whether dma buffer is
> writeable or not. Complicating drivers, api, abi, usability
> for the single use case of fast packet drop is not acceptable.
> XDP is not going to be a fit for all drivers and all architectures.
> That is cruicial 'performance vs generality' aspect of the design.
> All kernel-bypasses are taking advantage of specific architecture.
> We have to take advantage of it as well. If it doesn't fit
> powerpc with iommu, so be it. XDP will return -enotsupp.
> That is fundamental point. We have to cut such corners and avoid
> all cases where unnecessary generality hurts performance.
> Read-only pages is clearly such thing.
>
+1. Forwarding, which will be a common application, almost always
requires modification (decrementing the TTL), and header/data split has
always been a weak feature, since the device has to apply some arbitrary
rules about which headers need to be split out (either implementing
protocol-specific parsing or using some fixed length).
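For reference, this is the minimal rewrite an IPv4 forwarder has to do on the
packet bytes (it mirrors the kernel's ip_decrease_ttl() incremental checksum
update), which is exactly why read-only pages don't work for that case:

  static inline void decrease_ttl(struct iphdr *iph)
  {
          __u32 check = iph->check;

          check += htons(0x0100);         /* RFC 1141 incremental update */
          iph->check = (__sum16)(check + (check >= 0xFFFF));
          iph->ttl--;
  }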

>> > The whole thing must be dead simple to use. Above is not simple by any means.
>>
>> Maybe you missed that the above was a description of how the current
>> network stack handles this, which is not simple... which is root of the
>> hole performance issue.
>
> Disagree. The stack has copy-break, gro, gso and everything else because
> it's serving _host_ use case. XDP is packet forwarder use case.
> The requirements are completely different. Ex. the host needs gso
> in the core and drivers. It needs to deliver data all the way
> to user space and back. That is hard and that's where complexity
> comes from. For packet forwarder none of it is needed. So saying,
> look we have this complexity, so XDP needs it too, is flawed argument.
> The kernel is serving host and applications.
> XDP is pure packet-in/packet-out framework to achieve better
> performance than kernel-bypass, since kernel is the right
> place to do it. It has clean access to interrupts, per-cpu,
> scheduler, device registers and so on.
> Though there are only two broad use cases packet drop and forward,
> they cover a ton of real cases: firewalls, dos prevention,
> load balancer, nat, etc. In other words mostly stateless.
> As soon as packet needs to be queued somewhere we have to
> instantiate skb and pass it to the stack.
> So no queues in XDP and no 'stolen' and 'shared' return codes.
> The program always runs to completion with single packet.
> There is no header vs payload split. There is no header
> from program point of view. It's raw bytes in dma buffer.
>
Exactly. We are rethinking the low-level data path for performance. An
all-encompassing solution that covers every existing driver model only
results in complexity, which is what makes things "slow" in the first
place. Drivers need to change to implement XDP, but the model is as
simple as we can make it; for instance, we are placing very few
requirements on device features.

Tom


>> I do like the idea of rejecting XDP eBPF programs based on the DMA
>> setup is not compatible, or if the driver does not implement e.g.
>> writable DMA pages.
>
> exactly.
>
>> Customers wanting this feature will then go buy the NIC which support
>> this feature.  There is nothing more motivating for NIC vendors seeing
>> customers buying the competitors hardware. And it only require a driver
>> change to get this market...
>
> exactly.
>


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-09 11:17 ` Tom Herbert
@ 2016-04-09 12:27   ` Jesper Dangaard Brouer
  2016-04-09 13:17     ` Tom Herbert
  2016-04-09 17:00   ` Alexei Starovoitov
  1 sibling, 1 reply; 34+ messages in thread
From: Jesper Dangaard Brouer @ 2016-04-09 12:27 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Brenden Blanco, David S. Miller, Linux Kernel Network Developers,
	Alexei Starovoitov, Or Gerlitz, Daniel Borkmann, Eric Dumazet,
	Edward Cree, john fastabend, Thomas Graf, Johannes Berg,
	eranlinuxmellanox, Lorenzo Colitti, brouer

On Sat, 9 Apr 2016 08:17:04 -0300
Tom Herbert <tom@herbertland.com> wrote:

> One other API issue is how to deal with encapsulation. In this case a
> header may be prepended to the packet, I assume there are BPF helper
> functions and we don't need to return a new length or start?

That reminds me.  Does the BPF program need to know the head-room, then?

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-09 12:27   ` Jesper Dangaard Brouer
@ 2016-04-09 13:17     ` Tom Herbert
  0 siblings, 0 replies; 34+ messages in thread
From: Tom Herbert @ 2016-04-09 13:17 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Brenden Blanco, David S. Miller, Linux Kernel Network Developers,
	Alexei Starovoitov, Or Gerlitz, Daniel Borkmann, Eric Dumazet,
	Edward Cree, john fastabend, Thomas Graf, Johannes Berg,
	eranlinuxmellanox, Lorenzo Colitti

On Sat, Apr 9, 2016 at 9:27 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
> On Sat, 9 Apr 2016 08:17:04 -0300
> Tom Herbert <tom@herbertland.com> wrote:
>
>> One other API issue is how to deal with encapsulation. In this case a
>> header may be prepended to the packet, I assume there are BPF helper
>> functions and we don't need to return a new length or start?
>
> That reminds me.  Do the BPF program need to know the head-room, then?
>
Right, that is basically my question. Can we have a helper function in
BPF that will prepend n bytes to the buffer? (I don't think we want to
expose a notion of headroom.)
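Something along these lines, i.e. the helper hides the headroom bookkeeping
entirely (hypothetical, nothing like it exists in this patchset):

  /* hypothetical helper: grow the packet at the front by 'len' bytes and
   * return a pointer to the new start, or NULL if the driver did not
   * reserve enough headroom for the encapsulation header */
  void *bpf_packet_prepend(void *ctx, __u32 len);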

Tom

> --
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 5/5] Add sample for adding simple drop program to link
  2016-04-08  4:48 ` [RFC PATCH v2 5/5] Add sample for adding simple drop program to link Brenden Blanco
@ 2016-04-09 14:48   ` Jamal Hadi Salim
  2016-04-09 16:43     ` Brenden Blanco
  0 siblings, 1 reply; 34+ messages in thread
From: Jamal Hadi Salim @ 2016-04-09 14:48 UTC (permalink / raw)
  To: Brenden Blanco, davem
  Cc: netdev, tom, alexei.starovoitov, ogerlitz, daniel, brouer,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo

On 16-04-08 12:48 AM, Brenden Blanco wrote:
> Add a sample program that only drops packets at the
> BPF_PROG_TYPE_PHYS_DEV hook of a link. With the drop-only program,
> observed single core rate is ~19.5Mpps.
>
> Other tests were run, for instance without the dropcnt increment or
> without reading from the packet header, the packet rate was mostly
> unchanged.
>
> $ perf record -a samples/bpf/netdrvx1 $(</sys/class/net/eth0/ifindex)
> proto 17:   19596362 drops/s
>
> ./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
> Running... ctrl^C to stop
> Device: eth4@0
> Result: OK: 7873817(c7872245+d1572) usec, 38801823 (60byte,0frags)
>    4927955pps 2365Mb/sec (2365418400bps) errors: 0
> Device: eth4@1
> Result: OK: 7873817(c7872123+d1693) usec, 38587342 (60byte,0frags)
>    4900715pps 2352Mb/sec (2352343200bps) errors: 0
> Device: eth4@2
> Result: OK: 7873817(c7870929+d2888) usec, 38718848 (60byte,0frags)
>    4917417pps 2360Mb/sec (2360360160bps) errors: 0
> Device: eth4@3
> Result: OK: 7873818(c7872193+d1625) usec, 38796346 (60byte,0frags)
>    4927259pps 2365Mb/sec (2365084320bps) errors: 0
>
> perf report --no-children:
>   29.48%  ksoftirqd/6  [mlx4_en]         [k] mlx4_en_process_rx_cq
>   18.17%  ksoftirqd/6  [mlx4_en]         [k] mlx4_en_alloc_frags
>    8.19%  ksoftirqd/6  [mlx4_en]         [k] mlx4_en_free_frag
>    5.35%  ksoftirqd/6  [kernel.vmlinux]  [k] get_page_from_freelist
>    2.92%  ksoftirqd/6  [kernel.vmlinux]  [k] free_pages_prepare
>    2.90%  ksoftirqd/6  [mlx4_en]         [k] mlx4_call_bpf
>    2.72%  ksoftirqd/6  [fjes]            [k] 0x000000000000af66
>    2.37%  ksoftirqd/6  [kernel.vmlinux]  [k] swiotlb_sync_single_for_cpu
>    1.92%  ksoftirqd/6  [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
>    1.83%  ksoftirqd/6  [kernel.vmlinux]  [k] free_one_page
>    1.70%  ksoftirqd/6  [kernel.vmlinux]  [k] swiotlb_sync_single
>    1.69%  ksoftirqd/6  [kernel.vmlinux]  [k] bpf_map_lookup_elem
>    1.33%  swapper      [kernel.vmlinux]  [k] intel_idle
>    1.32%  ksoftirqd/6  [fjes]            [k] 0x000000000000af90
>    1.21%  ksoftirqd/6  [kernel.vmlinux]  [k] sk_load_byte_positive_offset
>    1.07%  ksoftirqd/6  [kernel.vmlinux]  [k] __alloc_pages_nodemask
>    0.89%  ksoftirqd/6  [kernel.vmlinux]  [k] __rmqueue
>    0.84%  ksoftirqd/6  [mlx4_en]         [k] mlx4_alloc_pages.isra.23
>    0.79%  ksoftirqd/6  [kernel.vmlinux]  [k] net_rx_action
>
> machine specs:
>   receiver - Intel E5-1630 v3 @ 3.70GHz
>   sender - Intel E5645 @ 2.40GHz
>   Mellanox ConnectX-3 @40G
>


Ok, sorry - should have looked this far before sending earlier email.
So when you run concurrently you see about 5Mpps per core but if you
shoot all traffic at a single core you see 20Mpps?

Devil's advocate question:
If the bottleneck is the driver - is there an advantage in adding the
bpf code at all in the driver?
I am more curious than before to see the comparison for the same bpf code
running at tc level vs in the driver..

cheers,
jamal

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-09 11:29           ` Tom Herbert
@ 2016-04-09 15:29             ` Jamal Hadi Salim
  2016-04-09 17:26               ` Alexei Starovoitov
  0 siblings, 1 reply; 34+ messages in thread
From: Jamal Hadi Salim @ 2016-04-09 15:29 UTC (permalink / raw)
  To: Tom Herbert, Alexei Starovoitov
  Cc: Jesper Dangaard Brouer, Brenden Blanco, David S. Miller,
	Linux Kernel Network Developers, Or Gerlitz, Daniel Borkmann,
	Eric Dumazet, Edward Cree, john fastabend, Thomas Graf,
	Johannes Berg, eranlinuxmellanox, Lorenzo Colitti, linux-mm

On 16-04-09 07:29 AM, Tom Herbert wrote:

> +1. Forwarding which will be a common application almost always
> requires modification (decrement TTL), and header data split has
> always been a weak feature since the device has to have some arbitrary
> rules about what headers need to be split out (either implements
> protocol specific parsing or some fixed length).

Then this is sensible. I was cruising the threads and
confused by your earlier emails Tom because you talked
about XPS etc. It sounded like the idea evolved into putting
the whole freaking stack on bpf.
If this is _forwarding only_ it may be useful to look at
Alexey's old code, in particular the DMA bits;
he built his own lookup algorithm, but it sounds like bpf is
a much better fit today.

cheers,
jamal


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 5/5] Add sample for adding simple drop program to link
  2016-04-09 14:48   ` Jamal Hadi Salim
@ 2016-04-09 16:43     ` Brenden Blanco
  2016-04-09 17:27       ` Jamal Hadi Salim
  0 siblings, 1 reply; 34+ messages in thread
From: Brenden Blanco @ 2016-04-09 16:43 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel, brouer,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo

On Sat, Apr 09, 2016 at 10:48:05AM -0400, Jamal Hadi Salim wrote:
> On 16-04-08 12:48 AM, Brenden Blanco wrote:
> >Add a sample program that only drops packets at the
> >BPF_PROG_TYPE_PHYS_DEV hook of a link. With the drop-only program,
> >observed single core rate is ~19.5Mpps.
> >
> >Other tests were run, for instance without the dropcnt increment or
> >without reading from the packet header, the packet rate was mostly
> >unchanged.
> >
> >$ perf record -a samples/bpf/netdrvx1 $(</sys/class/net/eth0/ifindex)
> >proto 17:   19596362 drops/s
> >
> >./pktgen_sample03_burst_single_flow.sh -i $DEV -d $IP -m $MAC -t 4
> >Running... ctrl^C to stop
> >Device: eth4@0
> >Result: OK: 7873817(c7872245+d1572) usec, 38801823 (60byte,0frags)
> >   4927955pps 2365Mb/sec (2365418400bps) errors: 0
> >Device: eth4@1
> >Result: OK: 7873817(c7872123+d1693) usec, 38587342 (60byte,0frags)
> >   4900715pps 2352Mb/sec (2352343200bps) errors: 0
> >Device: eth4@2
> >Result: OK: 7873817(c7870929+d2888) usec, 38718848 (60byte,0frags)
> >   4917417pps 2360Mb/sec (2360360160bps) errors: 0
> >Device: eth4@3
> >Result: OK: 7873818(c7872193+d1625) usec, 38796346 (60byte,0frags)
> >   4927259pps 2365Mb/sec (2365084320bps) errors: 0
> >
> >perf report --no-children:
> >  29.48%  ksoftirqd/6  [mlx4_en]         [k] mlx4_en_process_rx_cq
> >  18.17%  ksoftirqd/6  [mlx4_en]         [k] mlx4_en_alloc_frags
> >   8.19%  ksoftirqd/6  [mlx4_en]         [k] mlx4_en_free_frag
> >   5.35%  ksoftirqd/6  [kernel.vmlinux]  [k] get_page_from_freelist
> >   2.92%  ksoftirqd/6  [kernel.vmlinux]  [k] free_pages_prepare
> >   2.90%  ksoftirqd/6  [mlx4_en]         [k] mlx4_call_bpf
> >   2.72%  ksoftirqd/6  [fjes]            [k] 0x000000000000af66
> >   2.37%  ksoftirqd/6  [kernel.vmlinux]  [k] swiotlb_sync_single_for_cpu
> >   1.92%  ksoftirqd/6  [kernel.vmlinux]  [k] percpu_array_map_lookup_elem
> >   1.83%  ksoftirqd/6  [kernel.vmlinux]  [k] free_one_page
> >   1.70%  ksoftirqd/6  [kernel.vmlinux]  [k] swiotlb_sync_single
> >   1.69%  ksoftirqd/6  [kernel.vmlinux]  [k] bpf_map_lookup_elem
> >   1.33%  swapper      [kernel.vmlinux]  [k] intel_idle
> >   1.32%  ksoftirqd/6  [fjes]            [k] 0x000000000000af90
> >   1.21%  ksoftirqd/6  [kernel.vmlinux]  [k] sk_load_byte_positive_offset
> >   1.07%  ksoftirqd/6  [kernel.vmlinux]  [k] __alloc_pages_nodemask
> >   0.89%  ksoftirqd/6  [kernel.vmlinux]  [k] __rmqueue
> >   0.84%  ksoftirqd/6  [mlx4_en]         [k] mlx4_alloc_pages.isra.23
> >   0.79%  ksoftirqd/6  [kernel.vmlinux]  [k] net_rx_action
> >
> >machine specs:
> >  receiver - Intel E5-1630 v3 @ 3.70GHz
> >  sender - Intel E5645 @ 2.40GHz
> >  Mellanox ConnectX-3 @40G
> >
> 
> 
> Ok, sorry - should have looked this far before sending earlier email.
> So when you run concurrently you see about 5Mpps per core but if you
> shoot all traffic at a single core you see 20Mpps?
No, only sender is multiple, receiver is still single core. The flow is
the same in all 4 of the send threads. Note that only ksoftirqd/6 is
active.
> 
> Devil's advocate question:
> If the bottleneck is the driver - is there an advantage in adding the
> bpf code at all in the driver?
Only by adding this hook into the driver has it become the bottleneck.
Prior to this, the bottleneck was later in the codepath, primarily in
allocations.

If a packet is to be dropped, and a determination can be made with fewer
cpu cycles spent, then there is more time for the goodput.

Beyond that, even if the skb allocation gets 10x or 100x or whatever
improvement, there is still a non-zero cost associated, and dropping bad
packets with minimal time spent has value. The same argument holds for
physical nic forwarding decisions.
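
As a rough back-of-envelope illustration (not part of the series, and
assuming the receiving core is the only bottleneck and runs at its
nominal 3.70GHz): ~19.5Mpps for the early drop works out to roughly 190
cycles per packet, versus roughly 570 cycles at the ~6.5Mpps clsact
number given below.

/* trivial arithmetic sketch for the cycle budgets quoted above */
#include <stdio.h>

int main(void)
{
        const double hz = 3.70e9;       /* E5-1630 v3 receiver core */
        const double drv_pps = 19.5e6;  /* drop in the driver hook */
        const double tc_pps = 6.5e6;    /* drop in clsact/cls_bpf */

        printf("driver hook: ~%.0f cycles/pkt\n", hz / drv_pps); /* ~190 */
        printf("clsact da:   ~%.0f cycles/pkt\n", hz / tc_pps);  /* ~569 */
        return 0;
}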

> I am more curious than before to see the comparison for the same bpf code
> running at tc level vs in the driver..
Here is a perf report for drop in the clsact qdisc with direct-action,
which Daniel earlier showed to have the best performance to-date. On my
machine, this gets about 6.5Mpps drop single core. Drop due to failed
IP lookup (not shown here) is worse @4.5Mpps.

  9.24%  ksoftirqd/3  [mlx4_en]          [k] mlx4_en_process_rx_cq
  8.50%  ksoftirqd/3  [kernel.vmlinux]   [k] dev_gro_receive
  7.24%  ksoftirqd/3  [kernel.vmlinux]   [k] __netif_receive_skb_core
  5.47%  ksoftirqd/3  [mlx4_en]          [k] mlx4_en_complete_rx_desc
  4.74%  ksoftirqd/3  [kernel.vmlinux]   [k] kmem_cache_free
  3.94%  ksoftirqd/3  [mlx4_en]          [k] mlx4_en_alloc_frags
  3.42%  ksoftirqd/3  [kernel.vmlinux]   [k] napi_gro_frags
  3.34%  ksoftirqd/3  [kernel.vmlinux]   [k] inet_gro_receive
  3.32%  ksoftirqd/3  [kernel.vmlinux]   [k] __build_skb
  3.28%  ksoftirqd/3  [kernel.vmlinux]   [k] __napi_alloc_skb
  2.94%  ksoftirqd/3  [cls_bpf]          [k] cls_bpf_classify
  2.88%  ksoftirqd/3  [kernel.vmlinux]   [k] ktime_get_with_offset
  2.50%  ksoftirqd/3  [kernel.vmlinux]   [k] eth_type_trans
  2.40%  ksoftirqd/3  [kernel.vmlinux]   [k] kmem_cache_alloc
  2.29%  ksoftirqd/3  [kernel.vmlinux]   [k] skb_release_data
  2.25%  ksoftirqd/3  [kernel.vmlinux]   [k] gro_pull_from_frag0
  2.09%  ksoftirqd/3  [kernel.vmlinux]   [k] netif_receive_skb_internal
  1.99%  ksoftirqd/3  [kernel.vmlinux]   [k] memcpy_erms
  1.73%  ksoftirqd/3  [kernel.vmlinux]   [k] napi_get_frags
  1.66%  ksoftirqd/3  [kernel.vmlinux]   [k] __udp4_lib_lookup
  1.60%  ksoftirqd/3  [kernel.vmlinux]   [k] tc_classify
  1.25%  ksoftirqd/3  [kernel.vmlinux]   [k] kfree_skb
  1.24%  ksoftirqd/3  [kernel.vmlinux]   [k] get_page_from_freelist
  1.24%  ksoftirqd/3  [kernel.vmlinux]   [k] skb_gro_reset_offset
  1.16%  ksoftirqd/3  [kernel.vmlinux]   [k] udp4_gro_receive
  1.12%  ksoftirqd/3  [kernel.vmlinux]   [k] udp_gro_receive
  0.93%  ksoftirqd/3  [kernel.vmlinux]   [k] __free_page_frag
  0.91%  ksoftirqd/3  [kernel.vmlinux]   [k] skb_release_head_state
  0.89%  ksoftirqd/3  [kernel.vmlinux]   [k] __alloc_page_frag
  0.88%  ksoftirqd/3  [kernel.vmlinux]   [k] udp4_lib_lookup_skb
  0.83%  swapper      [kernel.vmlinux]   [k] intel_idle
  0.81%  ksoftirqd/3  [kernel.vmlinux]   [k] kfree_skbmem
  0.77%  ksoftirqd/3  [kernel.vmlinux]   [k] skb_release_all
  0.76%  ksoftirqd/3  [mlx4_en]          [k] mlx4_en_free_frag
  0.68%  ksoftirqd/3  [kernel.vmlinux]   [k] __netif_receive_skb
  0.64%  ksoftirqd/3  [kernel.vmlinux]   [k] free_pages_prepare
  0.53%  ksoftirqd/3  [kernel.vmlinux]   [k] read_tsc
  0.43%  ksoftirqd/3  [kernel.vmlinux]   [k] swiotlb_sync_single
  0.38%  ksoftirqd/3  [kernel.vmlinux]   [k] __memcpy
  0.37%  ksoftirqd/3  [kernel.vmlinux]   [k] bpf_map_lookup_elem
  0.35%  ksoftirqd/3  [kernel.vmlinux]   [k] __memcg_kmem_put_cache
  0.32%  ksoftirqd/3  [kernel.vmlinux]   [k] swiotlb_sync_single_for_cpu
  0.32%  ksoftirqd/3  [kernel.vmlinux]   [k] free_one_page
  0.25%  ksoftirqd/3  [kernel.vmlinux]   [k] __alloc_pages_nodemask
  0.23%  ksoftirqd/3  [kernel.vmlinux]   [k] net_rx_action
  0.22%  ksoftirqd/3  [kernel.vmlinux]   [k] __free_pages_ok
  0.21%  ksoftirqd/3  [mlx4_en]          [k] mlx4_alloc_pages.isra.23
  0.17%  ksoftirqd/3  [kernel.vmlinux]   [k] percpu_array_map_lookup_elem
  0.17%  ksoftirqd/3  [kernel.vmlinux]   [k] PageHuge
  0.15%  ksoftirqd/3  [wmi]              [k] 0x0000000000005d49
  0.15%  ksoftirqd/3  [kernel.vmlinux]   [k] __rmqueue
  0.13%  ksoftirqd/3  [wmi]              [k] 0x0000000000005d60

> 
> cheers,
> jamal

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-09 11:17 ` Tom Herbert
  2016-04-09 12:27   ` Jesper Dangaard Brouer
@ 2016-04-09 17:00   ` Alexei Starovoitov
  1 sibling, 0 replies; 34+ messages in thread
From: Alexei Starovoitov @ 2016-04-09 17:00 UTC (permalink / raw)
  To: Tom Herbert
  Cc: Brenden Blanco, David S. Miller, Linux Kernel Network Developers,
	Or Gerlitz, Daniel Borkmann, Jesper Dangaard Brouer,
	Eric Dumazet, Edward Cree, john fastabend, Thomas Graf,
	Johannes Berg, eranlinuxmellanox, Lorenzo Colitti

On Sat, Apr 09, 2016 at 08:17:04AM -0300, Tom Herbert wrote:
> >
> > +/* user return codes for PHYS_DEV prog type */
> > +enum bpf_phys_dev_action {
> > +       BPF_PHYS_DEV_DROP,
> > +       BPF_PHYS_DEV_OK,
> 
> I don't like OK. Maybe this means LOCAL. We also need FORWARD (not sure
> how to handle GRO yet).
> 
> I would suggest that we format the return code as code:subcode, where
> the above are codes. subcode is relevant to major code. For instance
> in forwarding the subcodes indicate a forwarding instruction (maybe a
> queue). DROP subcodes might answer why.

For tc redirect we use a hidden percpu variable to pass additional
info together with the return code. The cost of it is an extra
bpf_redirect() call. Here we can do better and embed such info for
xmit, but subcodes for drop are a slippery slope, since it adds
concepts to the design that are not going to be used by everyone.
If necessary bpf programs can count drop reasons internally.
Drops due to protocol!=ipv6 or drops due to ip frags present
will be program internal reasons. No need to expose them in api.

We need to get the xmit part implemented first and see how it looks
before deciding on this part of the api.
Right now I think we do not actually need a tx queue number.
The prog should just return 'XMIT' and the xdp (driver) side will decide
which tx queue to use.
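
To make the "count drop reasons internally" point concrete, here is a
minimal sketch of such a program against the uapi from patch 1/5. The
map definition and SEC() macros follow the samples/bpf style; the
section name, the runt-frame check and the reason numbering are made up
for illustration and are not necessarily what the sample in patch 5/5
does.

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

/* per-cpu counters, indexed by a program-internal "reason" code */
struct bpf_map_def SEC("maps") dropcnt = {
        .type = BPF_MAP_TYPE_PERCPU_ARRAY,
        .key_size = sizeof(__u32),
        .value_size = sizeof(long),
        .max_entries = 16,
};

SEC("phys_dev")
int drop_filter(struct bpf_phys_dev_md *ctx)
{
        __u32 reason = 0;       /* 0 == keep the packet */
        long *cnt;

        /* example internal policy: count runt frames as reason 1 */
        if (ctx->len < 60)
                reason = 1;

        if (!reason)
                return BPF_PHYS_DEV_OK;

        cnt = bpf_map_lookup_elem(&dropcnt, &reason);
        if (cnt)
                (*cnt)++;
        return BPF_PHYS_DEV_DROP;
}

char _license[] SEC("license") = "GPL";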

> One other API issue is how to deal with encapsulation. In this case a
> header may be prepended to the packet, I assume there are BPF helper
> functions and we don't need to return a new length or start?

A bit of history:
for tc+bpf we've been trying to come up with clean helpers to do
header push/pop and it was very difficult, since the skb keeps a ton
of metadata about header offsets, csum offsets, the encap flag, etc.
We've lost count of the number of different approaches we've
implemented and discarded.
For XDP there is no such issue.
Likely we'll have a single bpf_packet_change(ctx, off, len) helper
that will grow(len) or trim(-len) bytes at offset(off) in the packet.
ctx->len will be adjusted automatically by the helper.
The headroom and tailroom will not be exposed and will never be known
to the bpf side. It's up to the helper and the driver to decide how to
insert N bytes at offset M. If the driver reserved headroom in the dma
buffer, it can grow into it; if not, it can grow the tail and move
the whole packet. For performance reasons we obviously want some
headroom in the dma buffer, but it's not exposed to bpf.

But it could be that directly adjusting ctx->len and ctx->data is faster.
For cls_bpf ctx->data is hidden and packet access is done via
special instructions and helpers. For XDP we can hopefully do better
and do packet access with direct loads. I outlined that plan
in the previous thread.
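
Purely as a sketch of how the proposed bpf_packet_change() helper might
look from the program side: the declaration below does not exist
anywhere, its name and arguments are taken from the paragraph above, and
the return convention plus the mechanism for writing the new header are
assumptions.

#include <uapi/linux/bpf.h>
#include "bpf_helpers.h"

/* proposed, not implemented: grow (len > 0) or trim (len < 0) bytes at
 * offset off; ctx->len is adjusted by the helper
 */
int bpf_packet_change(void *ctx, int off, int len);

#define ENCAP_HLEN 8    /* made-up outer header size */

SEC("phys_dev")
int encap(struct bpf_phys_dev_md *ctx)
{
        /* ask for ENCAP_HLEN extra bytes at the front; whether they come
         * from driver headroom or a tail grow + move is hidden from us
         */
        if (bpf_packet_change(ctx, 0, ENCAP_HLEN) < 0)
                return BPF_PHYS_DEV_DROP;

        /* the new bytes would be filled in here, with whatever store
         * helper or direct-write mechanism the final API ends up with
         */
        return BPF_PHYS_DEV_OK;
}

char _license[] SEC("license") = "GPL";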

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-09 15:29             ` Jamal Hadi Salim
@ 2016-04-09 17:26               ` Alexei Starovoitov
  2016-04-10  7:55                 ` Thomas Graf
  2016-04-10 13:07                 ` Jamal Hadi Salim
  0 siblings, 2 replies; 34+ messages in thread
From: Alexei Starovoitov @ 2016-04-09 17:26 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: Tom Herbert, Jesper Dangaard Brouer, Brenden Blanco,
	David S. Miller, Linux Kernel Network Developers, Or Gerlitz,
	Daniel Borkmann, Eric Dumazet, Edward Cree, john fastabend,
	Thomas Graf, Johannes Berg, eranlinuxmellanox, Lorenzo Colitti,
	linux-mm

On Sat, Apr 09, 2016 at 11:29:18AM -0400, Jamal Hadi Salim wrote:
> On 16-04-09 07:29 AM, Tom Herbert wrote:
> 
> >+1. Forwarding which will be a common application almost always
> >requires modification (decrement TTL), and header data split has
> >always been a weak feature since the device has to have some arbitrary
> >rules about what headers need to be split out (either implements
> >protocol specific parsing or some fixed length).
> 
> Then this is sensible. I was cruising the threads and
> confused by your earlier emails Tom because you talked
> about XPS etc. It sounded like the idea evolved into putting
> the whole freaking stack on bpf.

yeah, no stack, no queues in bpf.

> If this is _forwarding only_ it may be useful to look at
> Alexey's old code, in particular the DMA bits;
> he built his own lookup algorithm, but it sounds like bpf is
> a much better fit today.

a link to these old bits?

Just to be clear: this rfc is not the only thing we're considering.
In particular huawei guys did a monster effort to improve performance
in this area as well. We'll try to blend all the code together and
pick what's the best.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 5/5] Add sample for adding simple drop program to link
  2016-04-09 16:43     ` Brenden Blanco
@ 2016-04-09 17:27       ` Jamal Hadi Salim
  2016-04-10 18:38         ` Brenden Blanco
  0 siblings, 1 reply; 34+ messages in thread
From: Jamal Hadi Salim @ 2016-04-09 17:27 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel, brouer,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo

On 16-04-09 12:43 PM, Brenden Blanco wrote:
> On Sat, Apr 09, 2016 at 10:48:05AM -0400, Jamal Hadi Salim wrote:


>> Ok, sorry - should have looked this far before sending earlier email.
>> So when you run concurrently you see about 5Mpps per core but if you
>> shoot all traffic at a single core you see 20Mpps?
> No, only sender is multiple, receiver is still single core. The flow is
> the same in all 4 of the send threads. Note that only ksoftirqd/6 is
> active.

Got it.
The sender was limited to 20Mpps and you are able to keep up,
if I understand correctly.


>>
>> Devil's advocate question:
>> If the bottleneck is the driver - is there an advantage in adding the
>> bpf code at all in the driver?
> Only by adding this hook into the driver has it become the bottleneck.
 >
> Prior to this, the bottleneck was later in the codepath, primarily in
> allocations.
>

Maybe useful in your commit log to show the before and after.
Looking at both your and Daniel's profile you show in this email
mlx4_en_process_rx_cq() seems to be where the action is on both, no?

> If a packet is to be dropped, and a determination can be made with fewer
> cpu cycles spent, then there is more time for the goodput.
>

Agreed.

> Beyond that, even if the skb allocation gets 10x or 100x or whatever
> improvement, there is still a non-zero cost associated, and dropping bad
> packets with minimal time spent has value. The same argument holds for
> physical nic forwarding decisions.
>

I always go for the lowest hanging fruit.
It seemed it was the driver path in your case. When we removed
the driver overhead (as demoed at the tc workshop in netdev11) we saw
__netif_receive_skb_core() at the top of the profile.
So in this case it seems it was mlx4_en_process_rx_cq() - that's why I
was saying the bottleneck is the driver.
Having said that: I agree that early drop is useful, if for nothing else
than to avoid the longer code path (but I was worried after reading the
thread that this was going to get into a messy stack-in-the-driver, and
I am not sure it is avoidable either, given a new ops interface is
showing up).

> >>I am more curious than before to see the comparison for the same bpf code
>> running at tc level vs in the driver..
> Here is a perf report for drop in the clsact qdisc with direct-action,
> which Daniel earlier showed to have the best performance to-date. On my
> machine, this gets about 6.5Mpps drop single core. Drop due to failed
> IP lookup (not shown here) is worse @4.5Mpps.
>

Nice.
However, still, for this to be an orange/orange comparison you have to
run it on the _same receiver machine_, as opposed to Daniel doing
it on his for the one case. And two different kernels booted up,
one patched with your changes and another virgin without them.

cheers,
jamal

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-09 17:26               ` Alexei Starovoitov
@ 2016-04-10  7:55                 ` Thomas Graf
  2016-04-10 16:53                   ` Tom Herbert
  2016-04-10 13:07                 ` Jamal Hadi Salim
  1 sibling, 1 reply; 34+ messages in thread
From: Thomas Graf @ 2016-04-10  7:55 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Jamal Hadi Salim, Tom Herbert, Jesper Dangaard Brouer,
	Brenden Blanco, David S. Miller, Linux Kernel Network Developers,
	Or Gerlitz, Daniel Borkmann, Eric Dumazet, Edward Cree,
	john fastabend, Johannes Berg, eranlinuxmellanox,
	Lorenzo Colitti, linux-mm

On 04/09/16 at 10:26am, Alexei Starovoitov wrote:
> On Sat, Apr 09, 2016 at 11:29:18AM -0400, Jamal Hadi Salim wrote:
> > If this is _forwarding only_ it may be useful to look at
> > Alexey's old code, in particular the DMA bits;
> > he built his own lookup algorithm, but it sounds like bpf is
> > a much better fit today.
> 
> a link to these old bits?
> 
> Just to be clear: this rfc is not the only thing we're considering.
> In particular huawei guys did a monster effort to improve performance
> in this area as well. We'll try to blend all the code together and
> pick what's the best.

What's the plan on opening the discussion on this? Can we get a peek?
Is it an alternative to XDP and the driver hook? Different architecture
or just different implementation? I understood it as another pseudo
skb model with a path on converting to real skbs for stack processing.

I really like the current proposal by Brenden for its simplicity and
targeted compatibility with cls_bpf.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-09 17:26               ` Alexei Starovoitov
  2016-04-10  7:55                 ` Thomas Graf
@ 2016-04-10 13:07                 ` Jamal Hadi Salim
  1 sibling, 0 replies; 34+ messages in thread
From: Jamal Hadi Salim @ 2016-04-10 13:07 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Tom Herbert, Jesper Dangaard Brouer, Brenden Blanco,
	David S. Miller, Linux Kernel Network Developers, Or Gerlitz,
	Daniel Borkmann, Eric Dumazet, Edward Cree, john fastabend,
	Thomas Graf, Johannes Berg, eranlinuxmellanox, Lorenzo Colitti,
	linux-mm, Robert Olsson, kuznet

On 16-04-09 01:26 PM, Alexei Starovoitov wrote:

>
> yeah, no stack, no queues in bpf.

Thanks.

>
>> If this is _forwarding only_ it may be useful to look at
>> Alexey's old code, in particular the DMA bits;
>> he built his own lookup algorithm, but it sounds like bpf is
>> a much better fit today.
>
> a link to these old bits?
>

Dang. Trying to remember the exact name (I think it has been gone for at
least 10 years now). I know it is not CONFIG_NET_FASTROUTE, although
it could have been that depending on the driver (tulip had some
nice DMA properties - which by today's standards would be considered
primitive ;->).
+Cc Robert and Alexey (trying to figure out the name of the driver-based
routing code that DMAed from the ingress to the egress port)

> Just to be clear: this rfc is not the only thing we're considering.
> In particular huawei guys did a monster effort to improve performance
> in this area as well. We'll try to blend all the code together and
> pick what's the best.
>

Sounds very interesting.

cheers,
jamal


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-10  7:55                 ` Thomas Graf
@ 2016-04-10 16:53                   ` Tom Herbert
  2016-04-10 18:09                     ` Jamal Hadi Salim
  0 siblings, 1 reply; 34+ messages in thread
From: Tom Herbert @ 2016-04-10 16:53 UTC (permalink / raw)
  To: Thomas Graf
  Cc: Alexei Starovoitov, Jamal Hadi Salim, Jesper Dangaard Brouer,
	Brenden Blanco, David S. Miller, Linux Kernel Network Developers,
	Or Gerlitz, Daniel Borkmann, Eric Dumazet, Edward Cree,
	john fastabend, Johannes Berg, eranlinuxmellanox,
	Lorenzo Colitti, linux-mm

On Sun, Apr 10, 2016 at 12:55 AM, Thomas Graf <tgraf@suug.ch> wrote:
> On 04/09/16 at 10:26am, Alexei Starovoitov wrote:
>> On Sat, Apr 09, 2016 at 11:29:18AM -0400, Jamal Hadi Salim wrote:
>> > If this is _forwarding only_ it may be useful to look at
>> > Alexey's old code, in particular the DMA bits;
>> > he built his own lookup algorithm, but it sounds like bpf is
>> > a much better fit today.
>>
>> a link to these old bits?
>>
>> Just to be clear: this rfc is not the only thing we're considering.
>> In particular huawei guys did a monster effort to improve performance
>> in this area as well. We'll try to blend all the code together and
>> pick what's the best.
>
> What's the plan on opening the discussion on this? Can we get a peek?
> Is it an alternative to XDP and the driver hook? Different architecture
> or just different implementation? I understood it as another pseudo
> skb model with a path on converting to real skbs for stack processing.
>
We started discussions about this in IOvisor. The Huawei project is
called ceth (Common Ethernet). It is essentially a layer called
directly from drivers intended for fast path forwarding and network
virtualization. They have put quite a bit of effort into buffer
management and other parts of the infrastructure, much of which we
would like to leverage in XDP. The code is currently in github, will
ask them to make it generally accessible.

The programmability part, essentially BPF, should be part of a common
solution. We can define the necessary interfaces independently of the
underlying infrastructure, which is really the only way we can do this
if we want the BPF programs to be portable across different
platforms -- in Linux, userspace, HW, etc.

Tom

> I really like the current proposal by Brenden for its simplicity and
> targeted compatibility with cls_bpf.


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter
  2016-04-10 16:53                   ` Tom Herbert
@ 2016-04-10 18:09                     ` Jamal Hadi Salim
  0 siblings, 0 replies; 34+ messages in thread
From: Jamal Hadi Salim @ 2016-04-10 18:09 UTC (permalink / raw)
  To: Tom Herbert, Thomas Graf
  Cc: Alexei Starovoitov, Jesper Dangaard Brouer, Brenden Blanco,
	David S. Miller, Linux Kernel Network Developers, Or Gerlitz,
	Daniel Borkmann, Eric Dumazet, Edward Cree, john fastabend,
	Johannes Berg, eranlinuxmellanox, Lorenzo Colitti, linux-mm

On 16-04-10 12:53 PM, Tom Herbert wrote:

> We started discussions about this in IOvisor. The Huawei project is
> called ceth (Common Ethernet). It is essentially a layer called
> directly from drivers intended for fast path forwarding and network
> virtualization. They have put quite a bit of effort into buffer
> management and other parts of the infrastructure, much of which we
> would like to leverage in XDP. The code is currently in github, will
> ask them to make it generally accessible.
>

Can't seem to find any info on it on the googles.
If it is forwarding, then it should at least make use of the Linux
control APIs, I hope.

cheers,
jamal


^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 5/5] Add sample for adding simple drop program to link
  2016-04-09 17:27       ` Jamal Hadi Salim
@ 2016-04-10 18:38         ` Brenden Blanco
  2016-04-13 10:40           ` Jamal Hadi Salim
  0 siblings, 1 reply; 34+ messages in thread
From: Brenden Blanco @ 2016-04-10 18:38 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel, brouer,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo

On Sat, Apr 09, 2016 at 01:27:03PM -0400, Jamal Hadi Salim wrote:
> On 16-04-09 12:43 PM, Brenden Blanco wrote:
> >On Sat, Apr 09, 2016 at 10:48:05AM -0400, Jamal Hadi Salim wrote:
> 
> 
> >>Ok, sorry - should have looked this far before sending earlier email.
> >>So when you run concurrently you see about 5Mpps per core but if you
> >>shoot all traffic at a single core you see 20Mpps?
> >No, only sender is multiple, receiver is still single core. The flow is
> >the same in all 4 of the send threads. Note that only ksoftirqd/6 is
> >active.
> 
> Got it.
> The sender was limited to 20Mpps and you are able to keep up,
> if I understand correctly.
Perhaps, though I can't say 100%. The sender is able to do about 21/22
Mpps when pause frames are disabled. The sender is likely CPU limited as
it is an older Xeon.
> 
> 
> >>
> >>Devil's advocate question:
> >>If the bottleneck is the driver - is there an advantage in adding the
> >>bpf code at all in the driver?
> >Only by adding this hook into the driver has it become the bottleneck.
> >
> >Prior to this, the bottleneck was later in the codepath, primarily in
> >allocations.
> >
> 
> Maybe useful in your commit log to show the before and after.
I can add this, sure.
> Looking at both your and Daniel's profile you show in this email
> mlx4_en_process_rx_cq() seems to be where the action is on both, no?
I don't draw this conclusion. With the phys_dev drop,
mlx4_en_process_rx_cq is the majority time consumer. In the perf output
showing drop in tc, functions such as dev_gro_receive,
kmem_cache_free, napi_gro_frags, inet_gro_receive, __build_skb, etc.
combined add up to 60% of the time spent. None of these are called when
early drop occurs. Just because mlx4_en_process_rx_cq is at the top of
the list doesn't mean it is the lowest hanging fruit.
> 
> >If a packet is to be dropped, and a determination can be made with fewer
> >cpu cycles spent, then there is more time for the goodput.
> >
> 
> Agreed.
> 
> >Beyond that, even if the skb allocation gets 10x or 100x or whatever
> >improvement, there is still a non-zero cost associated, and dropping bad
> >packets with minimal time spent has value. The same argument holds for
> >physical nic forwarding decisions.
> >
> 
> I always go for the lowest hanging fruit.
Which to me is the 60% of time spent above the driver level, as shown above.
> It seemed it was the driver path in your case. When we removed
> the driver overhead (as demoed at the tc workshop in netdev11) we saw
> __netif_receive_skb_core() at the top of the profile.
> So in this case it seems it was mlx4_en_process_rx_cq() - that's why I
> was saying the bottleneck is the driver.
I wouldn't call it a bottleneck when the time spent is additive,
aka run-to-completion.
> Having said that: I agree that early drop is useful, if for nothing else
> than to avoid the longer code path (but I was worried after reading the
> thread that this was going to get into a messy stack-in-the-driver, and
> I am not sure it is avoidable either, given a new ops interface is
> showing up).
> 
> >>I am more curious than before to see the comparison for the same bpf code
> >>running at tc level vs in the driver..
> >Here is a perf report for drop in the clsact qdisc with direct-action,
> >which Daniel earlier showed to have the best performance to-date. On my
> >machine, this gets about 6.5Mpps drop single core. Drop due to failed
> >IP lookup (not shown here) is worse @4.5Mpps.
> >
> 
> Nice.
> However, still, for this to be an orange/orange comparison you have to
> run it on the _same receiver machine_, as opposed to Daniel doing
> it on his for the one case. And two different kernels booted up,
> one patched with your changes and another virgin without them.
Of course the second perf report is on the same machine as the commit
message. That was generated fresh for this email thread. All of the
numbers I've quoted come from the same single-sender/single-receiver
setup. I did also revert the change in the mlx4 driver and there was no
change in the tc numbers.
> 
> cheers,
> jamal

^ permalink raw reply	[flat|nested] 34+ messages in thread

* Re: [RFC PATCH v2 5/5] Add sample for adding simple drop program to link
  2016-04-10 18:38         ` Brenden Blanco
@ 2016-04-13 10:40           ` Jamal Hadi Salim
  0 siblings, 0 replies; 34+ messages in thread
From: Jamal Hadi Salim @ 2016-04-13 10:40 UTC (permalink / raw)
  To: Brenden Blanco
  Cc: davem, netdev, tom, alexei.starovoitov, ogerlitz, daniel, brouer,
	eric.dumazet, ecree, john.fastabend, tgraf, johannes,
	eranlinuxmellanox, lorenzo

On 16-04-10 02:38 PM, Brenden Blanco wrote:

>> I always go for the lowest hanging fruit.
> Which to me is the 60% time spent above the driver level as shown above.
[..]
>> It seemed it was the driver path in your case. When we removed
>> the driver overhead (as demoed at the tc workshop in netdev11) we saw
>> __netif_receive_skb_core() at the top of the profile.
>> So in this case it seems it was mlx4_en_process_rx_cq() - that's why I
>> was saying the bottleneck is the driver.
> I wouldn't call it a bottleneck when the time spent is additive,
> aka run-to-completion.


The driver is a bottleneck regardless. It is probably the DMA interfaces
and lots of cacheline misses. So the first thing to
fix is what's at the top of the profile, if you want to improve things.
The fact you are dropping earlier is in itself an improvement
as long as you don't try to be too fancy.

> Of course the second perf report is on the same machine as the commit
> message. That was generated fresh for this email thread. All of the
> numbers I've quoted come from the same single-sender/single-receiver
> setup. I did also revert the change in the mlx4 driver and there was no
> change in the tc numbers.

Ok, I misunderstood then, because you hinted Daniel had seen those
numbers. Please also add that to the numbers in your commit message.

cheers,
jamal

^ permalink raw reply	[flat|nested] 34+ messages in thread

end of thread, other threads:[~2016-04-13 10:40 UTC | newest]

Thread overview: 34+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-04-08  4:48 [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Brenden Blanco
2016-04-08  4:48 ` [RFC PATCH v2 2/5] net: add ndo to set bpf prog in adapter rx Brenden Blanco
2016-04-08  9:38   ` Jesper Dangaard Brouer
2016-04-08 16:39     ` Brenden Blanco
2016-04-08  4:48 ` [RFC PATCH v2 3/5] rtnl: add option for setting link bpf prog Brenden Blanco
2016-04-08  4:48 ` [RFC PATCH v2 4/5] mlx4: add support for fast rx drop bpf program Brenden Blanco
2016-04-08 11:41   ` Jesper Dangaard Brouer
2016-04-08 17:04     ` Brenden Blanco
2016-04-08  4:48 ` [RFC PATCH v2 5/5] Add sample for adding simple drop program to link Brenden Blanco
2016-04-09 14:48   ` Jamal Hadi Salim
2016-04-09 16:43     ` Brenden Blanco
2016-04-09 17:27       ` Jamal Hadi Salim
2016-04-10 18:38         ` Brenden Blanco
2016-04-13 10:40           ` Jamal Hadi Salim
2016-04-08 10:36 ` [RFC PATCH v2 1/5] bpf: add PHYS_DEV prog type for early driver filter Jesper Dangaard Brouer
2016-04-08 11:09   ` Daniel Borkmann
2016-04-08 16:48     ` Brenden Blanco
2016-04-08 12:33   ` Jesper Dangaard Brouer
2016-04-08 17:02     ` Brenden Blanco
2016-04-08 19:05       ` Jesper Dangaard Brouer
2016-04-08 17:26     ` Alexei Starovoitov
2016-04-08 20:08       ` Jesper Dangaard Brouer
2016-04-08 21:34         ` Alexei Starovoitov
2016-04-09 11:29           ` Tom Herbert
2016-04-09 15:29             ` Jamal Hadi Salim
2016-04-09 17:26               ` Alexei Starovoitov
2016-04-10  7:55                 ` Thomas Graf
2016-04-10 16:53                   ` Tom Herbert
2016-04-10 18:09                     ` Jamal Hadi Salim
2016-04-10 13:07                 ` Jamal Hadi Salim
2016-04-09 11:17 ` Tom Herbert
2016-04-09 12:27   ` Jesper Dangaard Brouer
2016-04-09 13:17     ` Tom Herbert
2016-04-09 17:00   ` Alexei Starovoitov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.