* [bpf-next PATCH 1/4] bpf: devmap introduce dev_map_enqueue
2018-05-09 13:12 [bpf-next PATCH 0/4] xdp: introduce bulking for ndo_xdp_xmit API Jesper Dangaard Brouer
@ 2018-05-09 13:12 ` Jesper Dangaard Brouer
2018-05-10 9:10 ` kbuild test robot
2018-05-10 9:10 ` kbuild test robot
2018-05-09 13:13 ` [bpf-next PATCH 2/4] bpf: devmap prepare xdp frames for bulking Jesper Dangaard Brouer
` (2 subsequent siblings)
3 siblings, 2 replies; 9+ messages in thread
From: Jesper Dangaard Brouer @ 2018-05-09 13:12 UTC (permalink / raw)
To: netdev, Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson
Functionality is the same, but the ndo_xdp_xmit call is now
simply invoked from inside the devmap.c code.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
include/linux/bpf.h | 13 ++++++++++---
include/trace/events/xdp.h | 9 ++++++++-
kernel/bpf/devmap.c | 37 +++++++++++++++++++++++++++++++------
net/core/filter.c | 15 ++-------------
4 files changed, 51 insertions(+), 23 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 321969da67b7..6457343f6dfa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -485,14 +485,15 @@ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr);
void bpf_patch_call_args(struct bpf_insn *insn, u32 stack_depth);
/* Map specifics */
-struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct xdp_buff;
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
void __dev_map_flush(struct bpf_map *map);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
void __cpu_map_flush(struct bpf_map *map);
-struct xdp_buff;
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
struct net_device *dev_rx);
@@ -571,6 +572,13 @@ static inline void __dev_map_flush(struct bpf_map *map)
{
}
+struct xdp_buff;
+static inline
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+ return 0;
+}
+
static inline
struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
{
@@ -585,7 +593,6 @@ static inline void __cpu_map_flush(struct bpf_map *map)
{
}
-struct xdp_buff;
static inline int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu,
struct xdp_buff *xdp,
struct net_device *dev_rx)
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 8989a92c571a..96104610d40e 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -138,11 +138,18 @@ DEFINE_EVENT_PRINT(xdp_redirect_template, xdp_redirect_map_err,
__entry->map_id, __entry->map_index)
);
+#ifndef __DEVMAP_OBJ_TYPE
+#define __DEVMAP_OBJ_TYPE
+struct _bpf_dtab_netdev {
+ struct net_device *dev;
+};
+#endif /* __DEVMAP_OBJ_TYPE */
+
#define devmap_ifindex(fwd, map) \
(!fwd ? 0 : \
(!map ? 0 : \
((map->map_type == BPF_MAP_TYPE_DEVMAP) ? \
- ((struct net_device *)fwd)->ifindex : 0)))
+ ((struct _bpf_dtab_netdev *)fwd)->dev->ifindex : 0)))
#define _trace_xdp_redirect_map(dev, xdp, fwd, map, idx) \
trace_xdp_redirect_map(dev, xdp, devmap_ifindex(fwd, map), \
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 565f9ece9115..808808bf2bf2 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -48,18 +48,21 @@
* calls will fail at this point.
*/
#include <linux/bpf.h>
+#include <net/xdp.h>
#include <linux/filter.h>
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+/* objects in the map */
struct bpf_dtab_netdev {
- struct net_device *dev;
+ struct net_device *dev; /* must be first member, due to tracepoint */
struct bpf_dtab *dtab;
unsigned int bit;
struct rcu_head rcu;
};
+/* bpf map container */
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map;
@@ -240,21 +243,43 @@ void __dev_map_flush(struct bpf_map *map)
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
*/
-struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
+struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
- struct bpf_dtab_netdev *dev;
+ struct bpf_dtab_netdev *obj;
if (key >= map->max_entries)
return NULL;
- dev = READ_ONCE(dtab->netdev_map[key]);
- return dev ? dev->dev : NULL;
+ obj = READ_ONCE(dtab->netdev_map[key]);
+ return obj;
+}
+
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+{
+ struct net_device *dev = dst->dev;
+ struct xdp_frame *xdpf;
+ int err;
+
+ if (!dev->netdev_ops->ndo_xdp_xmit)
+ return -EOPNOTSUPP;
+
+ xdpf = convert_to_xdp_frame(xdp);
+ if (unlikely(!xdpf))
+ return -EOVERFLOW;
+
+ /* TODO: implement a bulking/enqueue step later */
+ err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+ if (err)
+ return err;
+
+ return 0;
}
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
- struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
+ struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
+ struct net_device *dev = dev = obj ? obj->dev : NULL;
return dev ? &dev->ifindex : NULL;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 6877426c23a6..258d243099e1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3020,20 +3020,9 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
switch (map->map_type) {
case BPF_MAP_TYPE_DEVMAP: {
- struct net_device *dev = fwd;
- struct xdp_frame *xdpf;
+ struct bpf_dtab_netdev *dst = fwd;
- if (!dev->netdev_ops->ndo_xdp_xmit)
- return -EOPNOTSUPP;
-
- xdpf = convert_to_xdp_frame(xdp);
- if (unlikely(!xdpf))
- return -EOVERFLOW;
-
- /* TODO: move to inside map code instead, for bulk support
- * err = dev_map_enqueue(dev, xdp);
- */
- err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+ err = dev_map_enqueue(dst, xdp);
if (err)
return err;
__dev_map_insert_ctx(map, index);
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [bpf-next PATCH 2/4] bpf: devmap prepare xdp frames for bulking
2018-05-09 13:12 [bpf-next PATCH 0/4] xdp: introduce bulking for ndo_xdp_xmit API Jesper Dangaard Brouer
2018-05-09 13:12 ` [bpf-next PATCH 1/4] bpf: devmap introduce dev_map_enqueue Jesper Dangaard Brouer
@ 2018-05-09 13:13 ` Jesper Dangaard Brouer
2018-05-09 13:13 ` [bpf-next PATCH 3/4] xdp: add tracepoint for devmap like cpumap have Jesper Dangaard Brouer
2018-05-09 13:13 ` [bpf-next PATCH 4/4] xdp: change ndo_xdp_xmit API to support bulking Jesper Dangaard Brouer
3 siblings, 0 replies; 9+ messages in thread
From: Jesper Dangaard Brouer @ 2018-05-09 13:13 UTC (permalink / raw)
To: netdev, Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson
Like cpumap create queue for xdp frames that will be bulked. For now,
this patch simply invoke ndo_xdp_xmit foreach frame. This happens,
either when the map flush operation is envoked, or when the limit
DEV_MAP_BULK_SIZE is reached.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
kernel/bpf/devmap.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++++---
1 file changed, 73 insertions(+), 4 deletions(-)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 808808bf2bf2..cab72c100bb5 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -54,11 +54,18 @@
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define DEV_MAP_BULK_SIZE 16
+struct xdp_bulk_queue {
+ struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+ unsigned int count;
+};
+
/* objects in the map */
struct bpf_dtab_netdev {
struct net_device *dev; /* must be first member, due to tracepoint */
struct bpf_dtab *dtab;
unsigned int bit;
+ struct xdp_bulk_queue __percpu *bulkq;
struct rcu_head rcu;
};
@@ -209,6 +216,38 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
__set_bit(bit, bitmap);
}
+static int bq_xmit_all(struct bpf_dtab_netdev *obj,
+ struct xdp_bulk_queue *bq)
+{
+ unsigned int processed = 0, drops = 0;
+ struct net_device *dev = obj->dev;
+ int i;
+
+ if (unlikely(!bq->count))
+ return 0;
+
+ for (i = 0; i < bq->count; i++) {
+ struct xdp_frame *xdpf = bq->q[i];
+
+ prefetch(xdpf);
+ }
+
+ for (i = 0; i < bq->count; i++) {
+ struct xdp_frame *xdpf = bq->q[i];
+ int err;
+
+ err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+ if (err) {
+ drops++;
+ xdp_return_frame(xdpf);
+ }
+ processed++;
+ }
+ bq->count = 0;
+
+ return 0;
+}
+
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
@@ -224,6 +263,7 @@ void __dev_map_flush(struct bpf_map *map)
for_each_set_bit(bit, bitmap, map->max_entries) {
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+ struct xdp_bulk_queue *bq;
struct net_device *netdev;
/* This is possible if the dev entry is removed by user space
@@ -233,6 +273,9 @@ void __dev_map_flush(struct bpf_map *map)
continue;
__clear_bit(bit, bitmap);
+
+ bq = this_cpu_ptr(dev->bulkq);
+ bq_xmit_all(dev, bq);
netdev = dev->dev;
if (likely(netdev->netdev_ops->ndo_xdp_flush))
netdev->netdev_ops->ndo_xdp_flush(netdev);
@@ -255,6 +298,20 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
return obj;
}
+/* Runs under RCU-read-side, plus in softirq under NAPI protection.
+ * Thus, safe percpu variable access.
+ */
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+{
+ struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
+
+ if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
+ bq_xmit_all(obj, bq);
+
+ bq->q[bq->count++] = xdpf;
+ return 0;
+}
+
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
{
struct net_device *dev = dst->dev;
@@ -268,8 +325,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
if (unlikely(!xdpf))
return -EOVERFLOW;
- /* TODO: implement a bulking/enqueue step later */
- err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
+ err = bq_enqueue(dst, xdpf);
if (err)
return err;
@@ -288,13 +344,18 @@ static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
if (dev->dev->netdev_ops->ndo_xdp_flush) {
struct net_device *fl = dev->dev;
+ struct xdp_bulk_queue *bq;
unsigned long *bitmap;
+
int cpu;
for_each_online_cpu(cpu) {
bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
__clear_bit(dev->bit, bitmap);
+ bq = per_cpu_ptr(dev->bulkq, cpu);
+ bq_xmit_all(dev, bq);
+
fl->netdev_ops->ndo_xdp_flush(dev->dev);
}
}
@@ -306,6 +367,7 @@ static void __dev_map_entry_free(struct rcu_head *rcu)
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
dev_map_flush_old(dev);
+ free_percpu(dev->bulkq);
dev_put(dev->dev);
kfree(dev);
}
@@ -338,6 +400,7 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct net *net = current->nsproxy->net_ns;
+ gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
struct bpf_dtab_netdev *dev, *old_dev;
u32 i = *(u32 *)key;
u32 ifindex = *(u32 *)value;
@@ -352,11 +415,17 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
if (!ifindex) {
dev = NULL;
} else {
- dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
- map->numa_node);
+ dev = kmalloc_node(sizeof(*dev), gfp, map->numa_node);
if (!dev)
return -ENOMEM;
+ dev->bulkq = __alloc_percpu_gfp(sizeof(*dev->bulkq),
+ sizeof(void *), gfp);
+ if (!dev->bulkq) {
+ kfree(dev);
+ return -ENOMEM;
+ }
+
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
kfree(dev);
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [bpf-next PATCH 3/4] xdp: add tracepoint for devmap like cpumap have
2018-05-09 13:12 [bpf-next PATCH 0/4] xdp: introduce bulking for ndo_xdp_xmit API Jesper Dangaard Brouer
2018-05-09 13:12 ` [bpf-next PATCH 1/4] bpf: devmap introduce dev_map_enqueue Jesper Dangaard Brouer
2018-05-09 13:13 ` [bpf-next PATCH 2/4] bpf: devmap prepare xdp frames for bulking Jesper Dangaard Brouer
@ 2018-05-09 13:13 ` Jesper Dangaard Brouer
2018-05-09 13:13 ` [bpf-next PATCH 4/4] xdp: change ndo_xdp_xmit API to support bulking Jesper Dangaard Brouer
3 siblings, 0 replies; 9+ messages in thread
From: Jesper Dangaard Brouer @ 2018-05-09 13:13 UTC (permalink / raw)
To: netdev, Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson
Notice how this allow us get XDP statistic without affecting the XDP
performance, as tracepoint is no-longer activated on a per packet basis.
The xdp_monitor sample/tool is updated to use this new tracepoint.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
include/linux/bpf.h | 6 ++++-
include/trace/events/xdp.h | 39 +++++++++++++++++++++++++++++++++++
kernel/bpf/devmap.c | 25 ++++++++++++++++++-----
net/core/filter.c | 2 +-
samples/bpf/xdp_monitor_kern.c | 39 +++++++++++++++++++++++++++++++++++
samples/bpf/xdp_monitor_user.c | 44 +++++++++++++++++++++++++++++++++++++++-
6 files changed, 146 insertions(+), 9 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 6457343f6dfa..066b5c67c71f 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -489,7 +489,8 @@ struct xdp_buff;
struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
void __dev_map_flush(struct bpf_map *map);
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp);
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx);
struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
void __cpu_map_insert_ctx(struct bpf_map *map, u32 index);
@@ -574,7 +575,8 @@ static inline void __dev_map_flush(struct bpf_map *map)
struct xdp_buff;
static inline
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
{
return 0;
}
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 96104610d40e..2e9ef0650144 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -229,6 +229,45 @@ TRACE_EVENT(xdp_cpumap_enqueue,
__entry->to_cpu)
);
+TRACE_EVENT(xdp_devmap_xmit,
+
+ TP_PROTO(const struct bpf_map *map, u32 map_index,
+ int sent, int drops,
+ const struct net_device *from_dev,
+ const struct net_device *to_dev),
+
+ TP_ARGS(map, map_index, sent, drops, from_dev, to_dev),
+
+ TP_STRUCT__entry(
+ __field(int, map_id)
+ __field(u32, act)
+ __field(u32, map_index)
+ __field(int, drops)
+ __field(int, sent)
+ __field(int, from_ifindex)
+ __field(int, to_ifindex)
+ ),
+
+ TP_fast_assign(
+ __entry->map_id = map->id;
+ __entry->act = XDP_REDIRECT;
+ __entry->map_index = map_index;
+ __entry->drops = drops;
+ __entry->sent = sent;
+ __entry->from_ifindex = from_dev->ifindex;
+ __entry->to_ifindex = to_dev->ifindex;
+ ),
+
+ TP_printk("ndo_xdp_xmit"
+ " map_id=%d map_index=%d action=%s"
+ " sent=%d drops=%d"
+ " from_ifindex=%d to_ifindex=%d",
+ __entry->map_id, __entry->map_index,
+ __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
+ __entry->sent, __entry->drops,
+ __entry->from_ifindex, __entry->to_ifindex)
+);
+
#endif /* _TRACE_XDP_H */
#include <trace/define_trace.h>
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index cab72c100bb5..6f84100723b0 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -50,6 +50,7 @@
#include <linux/bpf.h>
#include <net/xdp.h>
#include <linux/filter.h>
+#include <trace/events/xdp.h>
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
@@ -57,6 +58,7 @@
#define DEV_MAP_BULK_SIZE 16
struct xdp_bulk_queue {
struct xdp_frame *q[DEV_MAP_BULK_SIZE];
+ struct net_device *dev_rx;
unsigned int count;
};
@@ -219,8 +221,8 @@ void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
static int bq_xmit_all(struct bpf_dtab_netdev *obj,
struct xdp_bulk_queue *bq)
{
- unsigned int processed = 0, drops = 0;
struct net_device *dev = obj->dev;
+ int sent = 0, drops = 0;
int i;
if (unlikely(!bq->count))
@@ -241,10 +243,13 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
drops++;
xdp_return_frame(xdpf);
}
- processed++;
+ sent++;
}
bq->count = 0;
+ trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
+ sent, drops, bq->dev_rx, dev);
+ bq->dev_rx = NULL;
return 0;
}
@@ -301,18 +306,28 @@ struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
/* Runs under RCU-read-side, plus in softirq under NAPI protection.
* Thus, safe percpu variable access.
*/
-static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
+static int bq_enqueue(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf,
+ struct net_device *dev_rx)
+
{
struct xdp_bulk_queue *bq = this_cpu_ptr(obj->bulkq);
if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
bq_xmit_all(obj, bq);
+ /* Ingress dev_rx will be the same for all xdp_frame's in
+ * bulk_queue, because bq stored per-CPU and must be flushed
+ * from net_device drivers NAPI func end.
+ */
+ if (!bq->dev_rx)
+ bq->dev_rx = dev_rx;
+
bq->q[bq->count++] = xdpf;
return 0;
}
-int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
+int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
+ struct net_device *dev_rx)
{
struct net_device *dev = dst->dev;
struct xdp_frame *xdpf;
@@ -325,7 +340,7 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp)
if (unlikely(!xdpf))
return -EOVERFLOW;
- err = bq_enqueue(dst, xdpf);
+ err = bq_enqueue(dst, xdpf, dev_rx);
if (err)
return err;
diff --git a/net/core/filter.c b/net/core/filter.c
index 258d243099e1..8b7924368dc1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3022,7 +3022,7 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
case BPF_MAP_TYPE_DEVMAP: {
struct bpf_dtab_netdev *dst = fwd;
- err = dev_map_enqueue(dst, xdp);
+ err = dev_map_enqueue(dst, xdp, dev_rx);
if (err)
return err;
__dev_map_insert_ctx(map, index);
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 211db8ded0de..2854aa0665ea 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -208,3 +208,42 @@ int trace_xdp_cpumap_kthread(struct cpumap_kthread_ctx *ctx)
return 0;
}
+
+struct bpf_map_def SEC("maps") devmap_xmit_cnt = {
+ .type = BPF_MAP_TYPE_PERCPU_ARRAY,
+ .key_size = sizeof(u32),
+ .value_size = sizeof(struct datarec),
+ .max_entries = 1,
+};
+
+/* Tracepoint: /sys/kernel/debug/tracing/events/xdp/xdp_devmap_xmit/format
+ * Code in: kernel/include/trace/events/xdp.h
+ */
+struct devmap_xmit_ctx {
+ u64 __pad; // First 8 bytes are not accessible by bpf code
+ int map_id; // offset:8; size:4; signed:1;
+ u32 act; // offset:12; size:4; signed:0;
+ u32 map_index; // offset:16; size:4; signed:0;
+ int drops; // offset:20; size:4; signed:1;
+ int sent; // offset:24; size:4; signed:1;
+ int from_ifindex; // offset:28; size:4; signed:1;
+ int to_ifindex; // offset:32; size:4; signed:1;
+};
+
+SEC("tracepoint/xdp/xdp_devmap_xmit")
+int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
+{
+ struct datarec *rec;
+ u32 key = 0;
+
+ rec = bpf_map_lookup_elem(&devmap_xmit_cnt, &key);
+ if (!rec)
+ return 0;
+ rec->processed += ctx->sent;
+ rec->dropped += ctx->drops;
+
+ /* Record bulk events, then userspace can calc average bulk size */
+ rec->info += 1;
+
+ return 1;
+}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index 894bc64c2cac..4aaf1ab1927d 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -141,6 +141,7 @@ struct stats_record {
struct record_u64 xdp_exception[XDP_ACTION_MAX];
struct record xdp_cpumap_kthread;
struct record xdp_cpumap_enqueue[MAX_CPUS];
+ struct record xdp_devmap_xmit;
};
static bool map_collect_record(int fd, __u32 key, struct record *rec)
@@ -397,7 +398,7 @@ static void stats_print(struct stats_record *stats_rec,
info = calc_info(r, p, t);
if (info > 0)
i_str = "sched";
- if (pps > 0)
+ if (pps > 0 || drop > 0)
printf(fmt1, "cpumap-kthread",
i, pps, drop, info, i_str);
}
@@ -409,6 +410,42 @@ static void stats_print(struct stats_record *stats_rec,
printf(fmt2, "cpumap-kthread", "total", pps, drop, info, i_str);
}
+ /* devmap ndo_xdp_xmit stats */
+ {
+ char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s\n";
+ char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s\n";
+ struct record *rec, *prev;
+ double drop, info;
+ char *i_str = "";
+
+ rec = &stats_rec->xdp_devmap_xmit;
+ prev = &stats_prev->xdp_devmap_xmit;
+ t = calc_period(rec, prev);
+ for (i = 0; i < nr_cpus; i++) {
+ struct datarec *r = &rec->cpu[i];
+ struct datarec *p = &prev->cpu[i];
+
+ pps = calc_pps(r, p, t);
+ drop = calc_drop(r, p, t);
+ info = calc_info(r, p, t);
+ if (info > 0) {
+ i_str = "bulk-average";
+ info = (pps+drop) / info; /* calc avg bulk */
+ }
+ if (pps > 0 || drop > 0)
+ printf(fmt1, "devmap-xmit",
+ i, pps, drop, info, i_str);
+ }
+ pps = calc_pps(&rec->total, &prev->total, t);
+ drop = calc_drop(&rec->total, &prev->total, t);
+ info = calc_info(&rec->total, &prev->total, t);
+ if (info > 0) {
+ i_str = "bulk-average";
+ info = (pps+drop) / info; /* calc avg bulk */
+ }
+ printf(fmt2, "devmap-xmit", "total", pps, drop, info, i_str);
+ }
+
printf("\n");
}
@@ -437,6 +474,9 @@ static bool stats_collect(struct stats_record *rec)
fd = map_data[3].fd; /* map3: cpumap_kthread_cnt */
map_collect_record(fd, 0, &rec->xdp_cpumap_kthread);
+ fd = map_data[4].fd; /* map4: devmap_xmit_cnt */
+ map_collect_record(fd, 0, &rec->xdp_devmap_xmit);
+
return true;
}
@@ -480,6 +520,7 @@ static struct stats_record *alloc_stats_record(void)
rec_sz = sizeof(struct datarec);
rec->xdp_cpumap_kthread.cpu = alloc_rec_per_cpu(rec_sz);
+ rec->xdp_devmap_xmit.cpu = alloc_rec_per_cpu(rec_sz);
for (i = 0; i < MAX_CPUS; i++)
rec->xdp_cpumap_enqueue[i].cpu = alloc_rec_per_cpu(rec_sz);
@@ -498,6 +539,7 @@ static void free_stats_record(struct stats_record *r)
free(r->xdp_exception[i].cpu);
free(r->xdp_cpumap_kthread.cpu);
+ free(r->xdp_devmap_xmit.cpu);
for (i = 0; i < MAX_CPUS; i++)
free(r->xdp_cpumap_enqueue[i].cpu);
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [bpf-next PATCH 4/4] xdp: change ndo_xdp_xmit API to support bulking
2018-05-09 13:12 [bpf-next PATCH 0/4] xdp: introduce bulking for ndo_xdp_xmit API Jesper Dangaard Brouer
` (2 preceding siblings ...)
2018-05-09 13:13 ` [bpf-next PATCH 3/4] xdp: add tracepoint for devmap like cpumap have Jesper Dangaard Brouer
@ 2018-05-09 13:13 ` Jesper Dangaard Brouer
2018-05-10 8:10 ` kbuild test robot
2018-05-10 9:03 ` kbuild test robot
3 siblings, 2 replies; 9+ messages in thread
From: Jesper Dangaard Brouer @ 2018-05-09 13:13 UTC (permalink / raw)
To: netdev, Daniel Borkmann, Alexei Starovoitov, Jesper Dangaard Brouer
Cc: Christoph Hellwig, BjörnTöpel, Magnus Karlsson
This patch change the API for ndo_xdp_xmit to support bulking
xdp_frames.
When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown.
Most of the slowdown is caused by DMA API indirect function calls, but
also the net_device->ndo_xdp_xmit() call.
Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with
single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed
performance improved:
for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps
for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps
With frames avail as a bulk inside the driver ndo_xdp_xmit call,
further optimizations are possible, like bulk DMA-mapping for TX.
Testing without CONFIG_RETPOLINE show the same performance for
physical NIC drivers.
The virtual NIC driver tun sees a huge performance boost, as it can
avoid doing per frame producer locking, but instead amortize the
locking cost over the bulk.
Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
---
drivers/net/ethernet/intel/i40e/i40e_txrx.c | 26 +++++++---
drivers/net/ethernet/intel/i40e/i40e_txrx.h | 2 -
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 21 ++++++--
drivers/net/tun.c | 37 +++++++++-----
drivers/net/virtio_net.c | 66 +++++++++++++++++++------
include/linux/netdevice.h | 14 +++--
include/net/xdp.h | 1
include/trace/events/xdp.h | 10 ++--
kernel/bpf/devmap.c | 33 ++++++++-----
net/core/filter.c | 4 +-
net/core/xdp.c | 14 ++++-
samples/bpf/xdp_monitor_kern.c | 10 ++++
samples/bpf/xdp_monitor_user.c | 35 +++++++++++--
13 files changed, 197 insertions(+), 76 deletions(-)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 87fb27ab9c24..27e69fdd0a11 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -3686,14 +3686,19 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
* @dev: netdev
* @xdp: XDP buffer
*
- * Returns Zero if sent, else an error code
+ * Returns number of frames successfully sent. Frames that fail are
+ * free'ed via XDP return API.
+ *
+ * For error cases, a negative errno code is returned and no-frames
+ * are transmitted (caller must handle freeing frames).
**/
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
{
struct i40e_netdev_priv *np = netdev_priv(dev);
unsigned int queue_index = smp_processor_id();
struct i40e_vsi *vsi = np->vsi;
- int err;
+ int drops = 0;
+ int i;
if (test_bit(__I40E_VSI_DOWN, vsi->state))
return -ENETDOWN;
@@ -3701,11 +3706,18 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
return -ENXIO;
- err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
- if (err != I40E_XDP_TX)
- return -ENOSPC;
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ int err;
- return 0;
+ err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
+ if (err != I40E_XDP_TX) {
+ xdp_return_frame_rx_napi(xdpf);
+ drops++;
+ }
+ }
+
+ return n - drops;
}
/**
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.h b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
index 4bf318b8be85..da6697c0407f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.h
@@ -511,7 +511,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
void i40e_detect_recover_hung(struct i40e_vsi *vsi);
int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
+int i40e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames);
void i40e_xdp_flush(struct net_device *dev);
/**
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index b6e5cea84949..e64f71bc04c2 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -10042,11 +10042,13 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
}
}
-static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int ixgbe_xdp_xmit(struct net_device *dev, int n,
+ struct xdp_frame **frames)
{
struct ixgbe_adapter *adapter = netdev_priv(dev);
struct ixgbe_ring *ring;
- int err;
+ int drops = 0;
+ int i;
if (unlikely(test_bit(__IXGBE_DOWN, &adapter->state)))
return -ENETDOWN;
@@ -10058,11 +10060,18 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
if (unlikely(!ring))
return -ENXIO;
- err = ixgbe_xmit_xdp_ring(adapter, xdpf);
- if (err != IXGBE_XDP_TX)
- return -ENOSPC;
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+ int err;
- return 0;
+ err = ixgbe_xmit_xdp_ring(adapter, xdpf);
+ if (err != IXGBE_XDP_TX) {
+ xdp_return_frame_rx_napi(xdpf);
+ drops++;
+ }
+ }
+
+ return n - drops;
}
static void ixgbe_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d3c04ab9752a..4fe0c75c5e0b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -70,6 +70,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
+#include <net/xdp.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
@@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = {
.ndo_get_stats64 = tun_net_get_stats64,
};
-static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
+static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
{
struct tun_struct *tun = netdev_priv(dev);
struct tun_file *tfile;
u32 numqueues;
- int ret = 0;
+ int drops = 0;
+ int cnt = n;
+ int i;
rcu_read_lock();
numqueues = READ_ONCE(tun->numqueues);
if (!numqueues) {
- ret = -ENOSPC;
- goto out;
+ rcu_read_unlock();
+ return -ENXIO; /* Caller will free/return all frames */
}
tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
numqueues]);
- /* Encode the XDP flag into lowest bit for consumer to differ
- * XDP buffer from sk_buff.
- */
- if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
- this_cpu_inc(tun->pcpu_stats->tx_dropped);
- ret = -ENOSPC;
+
+ spin_lock(&tfile->tx_ring.producer_lock);
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdp = frames[i];
+ /* Encode the XDP flag into lowest bit for consumer to differ
+ * XDP buffer from sk_buff.
+ */
+ void *frame = tun_xdp_to_ptr(xdp);
+
+ if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
+ this_cpu_inc(tun->pcpu_stats->tx_dropped);
+ xdp_return_frame_rx_napi(xdp);
+ drops++;
+ }
}
+ spin_unlock(&tfile->tx_ring.producer_lock);
-out:
rcu_read_unlock();
- return ret;
+ return cnt - drops;
}
static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
@@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
if (unlikely(!frame))
return -EOVERFLOW;
- return tun_xdp_xmit(dev, frame);
+ return tun_xdp_xmit(dev, 1, &frame);
}
static void tun_xdp_flush(struct net_device *dev)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index f34794a76c4d..39a0783d1cde 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -419,23 +419,13 @@ static void virtnet_xdp_flush(struct net_device *dev)
virtqueue_kick(sq->vq);
}
-static int __virtnet_xdp_xmit(struct virtnet_info *vi,
- struct xdp_frame *xdpf)
+static int __virtnet_xdp_xmit_one(struct virtnet_info *vi,
+ struct send_queue *sq,
+ struct xdp_frame *xdpf)
{
struct virtio_net_hdr_mrg_rxbuf *hdr;
- struct xdp_frame *xdpf_sent;
- struct send_queue *sq;
- unsigned int len;
- unsigned int qp;
int err;
- qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
- sq = &vi->sq[qp];
-
- /* Free up any pending old buffers before queueing new ones. */
- while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
- xdp_return_frame(xdpf_sent);
-
/* virtqueue want to use data area in-front of packet */
if (unlikely(xdpf->metasize > 0))
return -EOPNOTSUPP;
@@ -459,11 +449,40 @@ static int __virtnet_xdp_xmit(struct virtnet_info *vi,
return 0;
}
-static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
+static int __virtnet_xdp_tx_xmit(struct virtnet_info *vi,
+ struct xdp_frame *xdpf)
+{
+ struct xdp_frame *xdpf_sent;
+ struct send_queue *sq;
+ unsigned int len;
+ unsigned int qp;
+
+ qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+ sq = &vi->sq[qp];
+
+ /* Free up any pending old buffers before queueing new ones. */
+ while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+ xdp_return_frame(xdpf_sent);
+
+ return __virtnet_xdp_xmit_one(vi, sq, xdpf);
+}
+
+static int virtnet_xdp_xmit(struct net_device *dev,
+ int n, struct xdp_frame **frames)
{
struct virtnet_info *vi = netdev_priv(dev);
struct receive_queue *rq = vi->rq;
+ struct xdp_frame *xdpf_sent;
struct bpf_prog *xdp_prog;
+ struct send_queue *sq;
+ unsigned int len;
+ unsigned int qp;
+ int drops = 0;
+ int err;
+ int i;
+
+ qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
+ sq = &vi->sq[qp];
/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
* indicate XDP resources have been successfully allocated.
@@ -472,7 +491,20 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
if (!xdp_prog)
return -ENXIO;
- return __virtnet_xdp_xmit(vi, xdpf);
+ /* Free up any pending old buffers before queueing new ones. */
+ while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+ xdp_return_frame(xdpf_sent);
+
+ for (i = 0; i < n; i++) {
+ struct xdp_frame *xdpf = frames[i];
+
+ err = __virtnet_xdp_xmit_one(vi, sq, xdpf);
+ if (err) {
+ xdp_return_frame_rx_napi(xdpf);
+ drops++;
+ }
+ }
+ return n - drops;
}
static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
@@ -616,7 +648,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
xdpf = convert_to_xdp_frame(&xdp);
if (unlikely(!xdpf))
goto err_xdp;
- err = __virtnet_xdp_xmit(vi, xdpf);
+ err = __virtnet_xdp_tx_xmit(vi, xdpf);
if (unlikely(err)) {
trace_xdp_exception(vi->dev, xdp_prog, act);
goto err_xdp;
@@ -779,7 +811,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
xdpf = convert_to_xdp_frame(&xdp);
if (unlikely(!xdpf))
goto err_xdp;
- err = __virtnet_xdp_xmit(vi, xdpf);
+ err = __virtnet_xdp_tx_xmit(vi, xdpf);
if (unlikely(err)) {
trace_xdp_exception(vi->dev, xdp_prog, act);
if (unlikely(xdp_page != page))
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index a30435118530..1c92f3b63c70 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1165,9 +1165,13 @@ struct dev_ifalias {
* This function is used to set or query state related to XDP on the
* netdevice and manage BPF offload. See definition of
* enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
- * This function is used to submit a XDP packet for transmit on a
- * netdevice.
+ * int (*ndo_xdp_xmit)(struct net_device *dev, int n, struct xdp_frame **xdp);
+ * This function is used to submit @n XDP packets for transmit on a
+ * netdevice. Returns number of frames successfully transmitted, frames
+ * that got dropped are freed/returned via xdp_return_frame().
+ * Returns negative number, means general error invoking ndo, meaning
+ * no frames were xmit'ed and core-caller will free all frames.
+ * TODO: Consider add flag to allow sending flush operation.
* void (*ndo_xdp_flush)(struct net_device *dev);
* This function is used to inform the driver to flush a particular
* xdp tx queue. Must be called on same CPU as xdp_xmit.
@@ -1355,8 +1359,8 @@ struct net_device_ops {
int needed_headroom);
int (*ndo_bpf)(struct net_device *dev,
struct netdev_bpf *bpf);
- int (*ndo_xdp_xmit)(struct net_device *dev,
- struct xdp_frame *xdp);
+ int (*ndo_xdp_xmit)(struct net_device *dev, int n,
+ struct xdp_frame **xdp);
void (*ndo_xdp_flush)(struct net_device *dev);
};
diff --git a/include/net/xdp.h b/include/net/xdp.h
index 0b689cf561c7..7ad779237ae8 100644
--- a/include/net/xdp.h
+++ b/include/net/xdp.h
@@ -104,6 +104,7 @@ struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
}
void xdp_return_frame(struct xdp_frame *xdpf);
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf);
void xdp_return_buff(struct xdp_buff *xdp);
int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
diff --git a/include/trace/events/xdp.h b/include/trace/events/xdp.h
index 2e9ef0650144..1ecf4c67fcf7 100644
--- a/include/trace/events/xdp.h
+++ b/include/trace/events/xdp.h
@@ -234,9 +234,9 @@ TRACE_EVENT(xdp_devmap_xmit,
TP_PROTO(const struct bpf_map *map, u32 map_index,
int sent, int drops,
const struct net_device *from_dev,
- const struct net_device *to_dev),
+ const struct net_device *to_dev, int err),
- TP_ARGS(map, map_index, sent, drops, from_dev, to_dev),
+ TP_ARGS(map, map_index, sent, drops, from_dev, to_dev, err),
TP_STRUCT__entry(
__field(int, map_id)
@@ -246,6 +246,7 @@ TRACE_EVENT(xdp_devmap_xmit,
__field(int, sent)
__field(int, from_ifindex)
__field(int, to_ifindex)
+ __field(int, err)
),
TP_fast_assign(
@@ -256,16 +257,17 @@ TRACE_EVENT(xdp_devmap_xmit,
__entry->sent = sent;
__entry->from_ifindex = from_dev->ifindex;
__entry->to_ifindex = to_dev->ifindex;
+ __entry->err = err;
),
TP_printk("ndo_xdp_xmit"
" map_id=%d map_index=%d action=%s"
" sent=%d drops=%d"
- " from_ifindex=%d to_ifindex=%d",
+ " from_ifindex=%d to_ifindex=%d err=%d",
__entry->map_id, __entry->map_index,
__print_symbolic(__entry->act, __XDP_ACT_SYM_TAB),
__entry->sent, __entry->drops,
- __entry->from_ifindex, __entry->to_ifindex)
+ __entry->from_ifindex, __entry->to_ifindex, __entry->err)
);
#endif /* _TRACE_XDP_H */
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 6f84100723b0..4dd8f0e3a8d9 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -222,7 +222,7 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
struct xdp_bulk_queue *bq)
{
struct net_device *dev = obj->dev;
- int sent = 0, drops = 0;
+ int sent = 0, drops = 0, err = 0;
int i;
if (unlikely(!bq->count))
@@ -234,23 +234,32 @@ static int bq_xmit_all(struct bpf_dtab_netdev *obj,
prefetch(xdpf);
}
- for (i = 0; i < bq->count; i++) {
- struct xdp_frame *xdpf = bq->q[i];
- int err;
-
- err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
- if (err) {
- drops++;
- xdp_return_frame(xdpf);
- }
- sent++;
+ sent = dev->netdev_ops->ndo_xdp_xmit(dev, bq->count, bq->q);
+ if (sent < 0) {
+ err = sent;
+ sent = 0;
+ goto error;
}
+ drops = bq->count - sent;
+out:
bq->count = 0;
trace_xdp_devmap_xmit(&obj->dtab->map, obj->bit,
- sent, drops, bq->dev_rx, dev);
+ sent, drops, bq->dev_rx, dev, err);
bq->dev_rx = NULL;
return 0;
+error:
+ /* If ndo_xdp_xmit fails with an errno, no frames have been
+ * xmit'ed and it's our responsibility to them free all.
+ */
+ for (i = 0; i < bq->count; i++) {
+ struct xdp_frame *xdpf = bq->q[i];
+
+ /* RX path under NAPI protection, can return frames faster */
+ xdp_return_frame_rx_napi(xdpf);
+ drops++;
+ }
+ goto out;
}
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
diff --git a/net/core/filter.c b/net/core/filter.c
index 8b7924368dc1..547174d37f66 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -3004,8 +3004,8 @@ static int __bpf_tx_xdp(struct net_device *dev,
if (unlikely(!xdpf))
return -EOVERFLOW;
- err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
- if (err)
+ err = dev->netdev_ops->ndo_xdp_xmit(dev, 1, &xdpf);
+ if (err <= 0)
return err;
dev->netdev_ops->ndo_xdp_flush(dev);
return 0;
diff --git a/net/core/xdp.c b/net/core/xdp.c
index bf6758f74339..237d374e6ef7 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -308,7 +308,7 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
-static void xdp_return(void *data, struct xdp_mem_info *mem)
+static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct)
{
struct xdp_mem_allocator *xa;
struct page *page;
@@ -320,7 +320,7 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
if (xa)
- page_pool_put_page(xa->page_pool, page);
+ __page_pool_put_page(xa->page_pool, page, napi_direct);
else
put_page(page);
rcu_read_unlock();
@@ -340,12 +340,18 @@ static void xdp_return(void *data, struct xdp_mem_info *mem)
void xdp_return_frame(struct xdp_frame *xdpf)
{
- xdp_return(xdpf->data, &xdpf->mem);
+ __xdp_return(xdpf->data, &xdpf->mem, false);
}
EXPORT_SYMBOL_GPL(xdp_return_frame);
+void xdp_return_frame_rx_napi(struct xdp_frame *xdpf)
+{
+ __xdp_return(xdpf->data, &xdpf->mem, true);
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame_rx_napi);
+
void xdp_return_buff(struct xdp_buff *xdp)
{
- xdp_return(xdp->data, &xdp->rxq->mem);
+ __xdp_return(xdp->data, &xdp->rxq->mem, true);
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
diff --git a/samples/bpf/xdp_monitor_kern.c b/samples/bpf/xdp_monitor_kern.c
index 2854aa0665ea..ad10fe700d7d 100644
--- a/samples/bpf/xdp_monitor_kern.c
+++ b/samples/bpf/xdp_monitor_kern.c
@@ -125,6 +125,7 @@ struct datarec {
u64 processed;
u64 dropped;
u64 info;
+ u64 err;
};
#define MAX_CPUS 64
@@ -228,6 +229,7 @@ struct devmap_xmit_ctx {
int sent; // offset:24; size:4; signed:1;
int from_ifindex; // offset:28; size:4; signed:1;
int to_ifindex; // offset:32; size:4; signed:1;
+ int err; // offset:36; size:4; signed:1;
};
SEC("tracepoint/xdp/xdp_devmap_xmit")
@@ -245,5 +247,13 @@ int trace_xdp_devmap_xmit(struct devmap_xmit_ctx *ctx)
/* Record bulk events, then userspace can calc average bulk size */
rec->info += 1;
+ /* Record error cases, where no frame were sent */
+ if (ctx->err)
+ rec->err++;
+
+ /* Catch API error of drv ndo_xdp_xmit sent more than count */
+ if (ctx->drops < 0)
+ rec->err++;
+
return 1;
}
diff --git a/samples/bpf/xdp_monitor_user.c b/samples/bpf/xdp_monitor_user.c
index 4aaf1ab1927d..0ec7967dd32a 100644
--- a/samples/bpf/xdp_monitor_user.c
+++ b/samples/bpf/xdp_monitor_user.c
@@ -117,6 +117,7 @@ struct datarec {
__u64 processed;
__u64 dropped;
__u64 info;
+ __u64 err;
};
#define MAX_CPUS 64
@@ -152,6 +153,7 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
__u64 sum_processed = 0;
__u64 sum_dropped = 0;
__u64 sum_info = 0;
+ __u64 sum_err = 0;
int i;
if ((bpf_map_lookup_elem(fd, &key, values)) != 0) {
@@ -170,10 +172,13 @@ static bool map_collect_record(int fd, __u32 key, struct record *rec)
sum_dropped += values[i].dropped;
rec->cpu[i].info = values[i].info;
sum_info += values[i].info;
+ rec->cpu[i].err = values[i].err;
+ sum_err += values[i].err;
}
rec->total.processed = sum_processed;
rec->total.dropped = sum_dropped;
rec->total.info = sum_info;
+ rec->total.err = sum_err;
return true;
}
@@ -274,6 +279,18 @@ static double calc_info(struct datarec *r, struct datarec *p, double period)
return pps;
}
+static double calc_err(struct datarec *r, struct datarec *p, double period)
+{
+ __u64 packets = 0;
+ double pps = 0;
+
+ if (period > 0) {
+ packets = r->err - p->err;
+ pps = packets / period;
+ }
+ return pps;
+}
+
static void stats_print(struct stats_record *stats_rec,
struct stats_record *stats_prev,
bool err_only)
@@ -412,11 +429,12 @@ static void stats_print(struct stats_record *stats_rec,
/* devmap ndo_xdp_xmit stats */
{
- char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s\n";
- char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s\n";
+ char *fmt1 = "%-15s %-7d %'-12.0f %'-12.0f %'-10.2f %s %s\n";
+ char *fmt2 = "%-15s %-7s %'-12.0f %'-12.0f %'-10.2f %s %s\n";
struct record *rec, *prev;
- double drop, info;
+ double drop, info, err;
char *i_str = "";
+ char *err_str = "";
rec = &stats_rec->xdp_devmap_xmit;
prev = &stats_prev->xdp_devmap_xmit;
@@ -428,22 +446,29 @@ static void stats_print(struct stats_record *stats_rec,
pps = calc_pps(r, p, t);
drop = calc_drop(r, p, t);
info = calc_info(r, p, t);
+ err = calc_err(r, p, t);
if (info > 0) {
i_str = "bulk-average";
info = (pps+drop) / info; /* calc avg bulk */
}
+ if (err > 0)
+ err_str = "drv-err";
if (pps > 0 || drop > 0)
printf(fmt1, "devmap-xmit",
- i, pps, drop, info, i_str);
+ i, pps, drop, info, i_str, err_str);
}
pps = calc_pps(&rec->total, &prev->total, t);
drop = calc_drop(&rec->total, &prev->total, t);
info = calc_info(&rec->total, &prev->total, t);
+ err = calc_err(&rec->total, &prev->total, t);
if (info > 0) {
i_str = "bulk-average";
info = (pps+drop) / info; /* calc avg bulk */
}
- printf(fmt2, "devmap-xmit", "total", pps, drop, info, i_str);
+ if (err > 0)
+ err_str = "drv-err";
+ printf(fmt2, "devmap-xmit", "total", pps, drop,
+ info, i_str, err_str);
}
printf("\n");
^ permalink raw reply related [flat|nested] 9+ messages in thread