* [PATCH v4 bpf-next 1/6] netdev_priv_flags: add missing IFF_PHONY_HEADROOM self-definition
2021-02-16 11:38 [PATCH v4 bpf-next 0/6] xsk: build skb by page (aka generic zerocopy xmit) Alexander Lobakin
@ 2021-02-16 11:38 ` Alexander Lobakin
2021-02-16 11:38 ` [PATCH v4 bpf-next 2/6] netdevice: check for net_device::priv_flags bitfield overflow Alexander Lobakin
` (4 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Alexander Lobakin @ 2021-02-16 11:38 UTC (permalink / raw)
To: Magnus Karlsson, Björn Töpel
Cc: Michael S. Tsirkin, Jason Wang, David S. Miller, Jakub Kicinski,
Jonathan Lemon, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh, Paolo Abeni,
Eric Dumazet, Xuan Zhuo, Dust Li, Alexander Lobakin,
virtualization, netdev, linux-kernel, bpf
This is harmless for now, but becomes fatal for the subsequent patch.
Fixes: 871b642adebe3 ("netdev: introduce ndo_set_rx_headroom")
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
---
include/linux/netdevice.h | 1 +
1 file changed, 1 insertion(+)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b9bcbfde7849..b895973390ee 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1584,6 +1584,7 @@ enum netdev_priv_flags {
#define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE
#define IFF_TEAM IFF_TEAM
#define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
+#define IFF_PHONY_HEADROOM IFF_PHONY_HEADROOM
#define IFF_MACSEC IFF_MACSEC
#define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
#define IFF_FAILOVER IFF_FAILOVER
--
2.30.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 bpf-next 2/6] netdevice: check for net_device::priv_flags bitfield overflow
2021-02-16 11:38 [PATCH v4 bpf-next 0/6] xsk: build skb by page (aka generic zerocopy xmit) Alexander Lobakin
2021-02-16 11:38 ` [PATCH v4 bpf-next 1/6] netdev_priv_flags: add missing IFF_PHONY_HEADROOM self-definition Alexander Lobakin
@ 2021-02-16 11:38 ` Alexander Lobakin
2021-02-16 14:18 ` kernel test robot
2021-02-16 11:38 ` [PATCH v4 bpf-next 3/6] net: add priv_flags for allow tx skb without linear Alexander Lobakin
` (3 subsequent siblings)
5 siblings, 1 reply; 11+ messages in thread
From: Alexander Lobakin @ 2021-02-16 11:38 UTC (permalink / raw)
To: Magnus Karlsson, Björn Töpel
Cc: Michael S. Tsirkin, Jason Wang, David S. Miller, Jakub Kicinski,
Jonathan Lemon, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh, Paolo Abeni,
Eric Dumazet, Xuan Zhuo, Dust Li, Alexander Lobakin,
virtualization, netdev, linux-kernel, bpf
We almost ran out of unsigned int bitwidth. Define priv flags and
check for potential overflow in the fashion of netdev_features_t.
Defined this way, priv_flags can be easily expanded later by
just changing its typedef.
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
---
include/linux/netdevice.h | 135 ++++++++++++++++++++------------------
1 file changed, 72 insertions(+), 63 deletions(-)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index b895973390ee..fa4ab77ce81e 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1527,70 +1527,79 @@ struct net_device_ops {
* @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
*/
enum netdev_priv_flags {
- IFF_802_1Q_VLAN = 1<<0,
- IFF_EBRIDGE = 1<<1,
- IFF_BONDING = 1<<2,
- IFF_ISATAP = 1<<3,
- IFF_WAN_HDLC = 1<<4,
- IFF_XMIT_DST_RELEASE = 1<<5,
- IFF_DONT_BRIDGE = 1<<6,
- IFF_DISABLE_NETPOLL = 1<<7,
- IFF_MACVLAN_PORT = 1<<8,
- IFF_BRIDGE_PORT = 1<<9,
- IFF_OVS_DATAPATH = 1<<10,
- IFF_TX_SKB_SHARING = 1<<11,
- IFF_UNICAST_FLT = 1<<12,
- IFF_TEAM_PORT = 1<<13,
- IFF_SUPP_NOFCS = 1<<14,
- IFF_LIVE_ADDR_CHANGE = 1<<15,
- IFF_MACVLAN = 1<<16,
- IFF_XMIT_DST_RELEASE_PERM = 1<<17,
- IFF_L3MDEV_MASTER = 1<<18,
- IFF_NO_QUEUE = 1<<19,
- IFF_OPENVSWITCH = 1<<20,
- IFF_L3MDEV_SLAVE = 1<<21,
- IFF_TEAM = 1<<22,
- IFF_RXFH_CONFIGURED = 1<<23,
- IFF_PHONY_HEADROOM = 1<<24,
- IFF_MACSEC = 1<<25,
- IFF_NO_RX_HANDLER = 1<<26,
- IFF_FAILOVER = 1<<27,
- IFF_FAILOVER_SLAVE = 1<<28,
- IFF_L3MDEV_RX_HANDLER = 1<<29,
- IFF_LIVE_RENAME_OK = 1<<30,
+ IFF_802_1Q_VLAN_BIT,
+ IFF_EBRIDGE_BIT,
+ IFF_BONDING_BIT,
+ IFF_ISATAP_BIT,
+ IFF_WAN_HDLC_BIT,
+ IFF_XMIT_DST_RELEASE_BIT,
+ IFF_DONT_BRIDGE_BIT,
+ IFF_DISABLE_NETPOLL_BIT,
+ IFF_MACVLAN_PORT_BIT,
+ IFF_BRIDGE_PORT_BIT,
+ IFF_OVS_DATAPATH_BIT,
+ IFF_TX_SKB_SHARING_BIT,
+ IFF_UNICAST_FLT_BIT,
+ IFF_TEAM_PORT_BIT,
+ IFF_SUPP_NOFCS_BIT,
+ IFF_LIVE_ADDR_CHANGE_BIT,
+ IFF_MACVLAN_BIT,
+ IFF_XMIT_DST_RELEASE_PERM_BIT,
+ IFF_L3MDEV_MASTER_BIT,
+ IFF_NO_QUEUE_BIT,
+ IFF_OPENVSWITCH_BIT,
+ IFF_L3MDEV_SLAVE_BIT,
+ IFF_TEAM_BIT,
+ IFF_RXFH_CONFIGURED_BIT,
+ IFF_PHONY_HEADROOM_BIT,
+ IFF_MACSEC_BIT,
+ IFF_NO_RX_HANDLER_BIT,
+ IFF_FAILOVER_BIT,
+ IFF_FAILOVER_SLAVE_BIT,
+ IFF_L3MDEV_RX_HANDLER_BIT,
+ IFF_LIVE_RENAME_OK_BIT,
+
+ NETDEV_PRIV_FLAG_COUNT,
};
-#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
-#define IFF_EBRIDGE IFF_EBRIDGE
-#define IFF_BONDING IFF_BONDING
-#define IFF_ISATAP IFF_ISATAP
-#define IFF_WAN_HDLC IFF_WAN_HDLC
-#define IFF_XMIT_DST_RELEASE IFF_XMIT_DST_RELEASE
-#define IFF_DONT_BRIDGE IFF_DONT_BRIDGE
-#define IFF_DISABLE_NETPOLL IFF_DISABLE_NETPOLL
-#define IFF_MACVLAN_PORT IFF_MACVLAN_PORT
-#define IFF_BRIDGE_PORT IFF_BRIDGE_PORT
-#define IFF_OVS_DATAPATH IFF_OVS_DATAPATH
-#define IFF_TX_SKB_SHARING IFF_TX_SKB_SHARING
-#define IFF_UNICAST_FLT IFF_UNICAST_FLT
-#define IFF_TEAM_PORT IFF_TEAM_PORT
-#define IFF_SUPP_NOFCS IFF_SUPP_NOFCS
-#define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE
-#define IFF_MACVLAN IFF_MACVLAN
-#define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM
-#define IFF_L3MDEV_MASTER IFF_L3MDEV_MASTER
-#define IFF_NO_QUEUE IFF_NO_QUEUE
-#define IFF_OPENVSWITCH IFF_OPENVSWITCH
-#define IFF_L3MDEV_SLAVE IFF_L3MDEV_SLAVE
-#define IFF_TEAM IFF_TEAM
-#define IFF_RXFH_CONFIGURED IFF_RXFH_CONFIGURED
-#define IFF_PHONY_HEADROOM IFF_PHONY_HEADROOM
-#define IFF_MACSEC IFF_MACSEC
-#define IFF_NO_RX_HANDLER IFF_NO_RX_HANDLER
-#define IFF_FAILOVER IFF_FAILOVER
-#define IFF_FAILOVER_SLAVE IFF_FAILOVER_SLAVE
-#define IFF_L3MDEV_RX_HANDLER IFF_L3MDEV_RX_HANDLER
-#define IFF_LIVE_RENAME_OK IFF_LIVE_RENAME_OK
+typedef u32 netdev_priv_flags_t;
+static_assert(sizeof(netdev_priv_flags_t) * BITS_PER_BYTE <=
+ NETDEV_PRIV_FLAG_COUNT);
+
+#define __IFF_BIT(bit) ((netdev_priv_flags_t)1 << (bit))
+#define __IFF(name) __IFF_BIT(IFF_##name##_BIT)
+
+#define IFF_802_1Q_VLAN __IFF(802_1Q_VLAN)
+#define IFF_EBRIDGE __IFF(EBRIDGE)
+#define IFF_BONDING __IFF(BONDING)
+#define IFF_ISATAP __IFF(ISATAP)
+#define IFF_WAN_HDLC __IFF(WAN_HDLC)
+#define IFF_XMIT_DST_RELEASE __IFF(XMIT_DST_RELEASE)
+#define IFF_DONT_BRIDGE __IFF(DONT_BRIDGE)
+#define IFF_DISABLE_NETPOLL __IFF(DISABLE_NETPOLL)
+#define IFF_MACVLAN_PORT __IFF(MACVLAN_PORT)
+#define IFF_BRIDGE_PORT __IFF(BRIDGE_PORT)
+#define IFF_OVS_DATAPATH __IFF(OVS_DATAPATH)
+#define IFF_TX_SKB_SHARING __IFF(TX_SKB_SHARING)
+#define IFF_UNICAST_FLT __IFF(UNICAST_FLT)
+#define IFF_TEAM_PORT __IFF(TEAM_PORT)
+#define IFF_SUPP_NOFCS __IFF(SUPP_NOFCS)
+#define IFF_LIVE_ADDR_CHANGE __IFF(LIVE_ADDR_CHANGE)
+#define IFF_MACVLAN __IFF(MACVLAN)
+#define IFF_XMIT_DST_RELEASE_PERM __IFF(XMIT_DST_RELEASE_PERM)
+#define IFF_L3MDEV_MASTER __IFF(L3MDEV_MASTER)
+#define IFF_NO_QUEUE __IFF(NO_QUEUE)
+#define IFF_OPENVSWITCH __IFF(OPENVSWITCH)
+#define IFF_L3MDEV_SLAVE __IFF(L3MDEV_SLAVE)
+#define IFF_TEAM __IFF(TEAM)
+#define IFF_RXFH_CONFIGURED __IFF(RXFH_CONFIGURED)
+#define IFF_PHONY_HEADROOM __IFF(PHONY_HEADROOM)
+#define IFF_MACSEC __IFF(MACSEC)
+#define IFF_NO_RX_HANDLER __IFF(NO_RX_HANDLER)
+#define IFF_FAILOVER __IFF(FAILOVER)
+#define IFF_FAILOVER_SLAVE __IFF(FAILOVER_SLAVE)
+#define IFF_L3MDEV_RX_HANDLER __IFF(L3MDEV_RX_HANDLER)
+#define IFF_LIVE_RENAME_OK __IFF(LIVE_RENAME_OK)
/**
* struct net_device - The DEVICE structure.
@@ -1925,7 +1934,7 @@ struct net_device {
const struct header_ops *header_ops;
unsigned int flags;
- unsigned int priv_flags;
+ netdev_priv_flags_t priv_flags;
unsigned short gflags;
unsigned short padded;
--
2.30.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH v4 bpf-next 2/6] netdevice: check for net_device::priv_flags bitfield overflow
2021-02-16 11:38 ` [PATCH v4 bpf-next 2/6] netdevice: check for net_device::priv_flags bitfield overflow Alexander Lobakin
@ 2021-02-16 14:18 ` kernel test robot
0 siblings, 0 replies; 11+ messages in thread
From: kernel test robot @ 2021-02-16 14:18 UTC (permalink / raw)
To: kbuild-all
[-- Attachment #1: Type: text/plain, Size: 4522 bytes --]
Hi Alexander,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Alexander-Lobakin/xsk-build-skb-by-page-aka-generic-zerocopy-xmit/20210216-194634
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: riscv-randconfig-r014-20210216 (attached as .config)
compiler: clang version 12.0.0 (https://github.com/llvm/llvm-project c9439ca36342fb6013187d0a69aef92736951476)
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# install riscv cross compiling tool for clang build
# apt-get install binutils-riscv64-linux-gnu
# https://github.com/0day-ci/linux/commit/ab71cd66e7a92a19ed9d78686970702fbbf0cd09
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Alexander-Lobakin/xsk-build-skb-by-page-aka-generic-zerocopy-xmit/20210216-194634
git checkout ab71cd66e7a92a19ed9d78686970702fbbf0cd09
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=riscv
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
In file included from net/8021q/vlanproc.c:26:
include/linux/netdevice.h:1566:1: error: static_assert failed due to requirement 'sizeof(unsigned int) * 8 <= NETDEV_PRIV_FLAG_COUNT' "sizeof(netdev_priv_flags_t) * BITS_PER_BYTE <= NETDEV_PRIV_FLAG_COUNT"
static_assert(sizeof(netdev_priv_flags_t) * BITS_PER_BYTE <=
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/build_bug.h:77:34: note: expanded from macro 'static_assert'
#define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr)
^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
include/linux/build_bug.h:78:41: note: expanded from macro '__static_assert'
#define __static_assert(expr, msg, ...) _Static_assert(expr, msg)
^ ~~~~
>> net/8021q/vlanproc.c:257:30: warning: format specifies type 'unsigned short' but the argument has type 'netdev_priv_flags_t' (aka 'unsigned int') [-Wformat]
(int)(vlan->flags & 1), vlandev->priv_flags);
^~~~~~~~~~~~~~~~~~~
net/8021q/vlanproc.c:284:22: warning: format specifies type 'unsigned short' but the argument has type 'int' [-Wformat]
mp->priority, ((mp->vlan_qos >> 13) & 0x7));
^~~~~~~~~~~~~~~~~~~~~~~~~~~~
2 warnings and 1 error generated.
vim +257 net/8021q/vlanproc.c
^1da177e4c3f41 Linus Torvalds 2005-04-16 240
^1da177e4c3f41 Linus Torvalds 2005-04-16 241 static int vlandev_seq_show(struct seq_file *seq, void *offset)
^1da177e4c3f41 Linus Torvalds 2005-04-16 242 {
^1da177e4c3f41 Linus Torvalds 2005-04-16 243 struct net_device *vlandev = (struct net_device *) seq->private;
7da82c06ded105 Jiri Pirko 2011-12-08 244 const struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
28172739f0a276 Eric Dumazet 2010-07-07 245 struct rtnl_link_stats64 temp;
be1f3c2c027cc5 Ben Hutchings 2010-06-08 246 const struct rtnl_link_stats64 *stats;
be1f3c2c027cc5 Ben Hutchings 2010-06-08 247 static const char fmt64[] = "%30s %12llu\n";
^1da177e4c3f41 Linus Torvalds 2005-04-16 248 int i;
^1da177e4c3f41 Linus Torvalds 2005-04-16 249
26a25239d7a660 Joonwoo Park 2008-07-08 250 if (!is_vlan_dev(vlandev))
^1da177e4c3f41 Linus Torvalds 2005-04-16 251 return 0;
^1da177e4c3f41 Linus Torvalds 2005-04-16 252
28172739f0a276 Eric Dumazet 2010-07-07 253 stats = dev_get_stats(vlandev, &temp);
2029cc2c84fb11 Patrick McHardy 2008-01-21 254 seq_printf(seq,
2029cc2c84fb11 Patrick McHardy 2008-01-21 255 "%s VID: %d REORDER_HDR: %i dev->priv_flags: %hx\n",
7da82c06ded105 Jiri Pirko 2011-12-08 256 vlandev->name, vlan->vlan_id,
7da82c06ded105 Jiri Pirko 2011-12-08 @257 (int)(vlan->flags & 1), vlandev->priv_flags);
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org
[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 44327 bytes --]
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH v4 bpf-next 3/6] net: add priv_flags for allow tx skb without linear
2021-02-16 11:38 [PATCH v4 bpf-next 0/6] xsk: build skb by page (aka generic zerocopy xmit) Alexander Lobakin
2021-02-16 11:38 ` [PATCH v4 bpf-next 1/6] netdev_priv_flags: add missing IFF_PHONY_HEADROOM self-definition Alexander Lobakin
2021-02-16 11:38 ` [PATCH v4 bpf-next 2/6] netdevice: check for net_device::priv_flags bitfield overflow Alexander Lobakin
@ 2021-02-16 11:38 ` Alexander Lobakin
2021-02-16 11:38 ` [PATCH v4 bpf-next 4/6] virtio-net: support IFF_TX_SKB_NO_LINEAR Alexander Lobakin
` (2 subsequent siblings)
5 siblings, 0 replies; 11+ messages in thread
From: Alexander Lobakin @ 2021-02-16 11:38 UTC (permalink / raw)
To: Magnus Karlsson, Björn Töpel
Cc: Michael S. Tsirkin, Jason Wang, David S. Miller, Jakub Kicinski,
Jonathan Lemon, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh, Paolo Abeni,
Eric Dumazet, Xuan Zhuo, Dust Li, Alexander Lobakin,
virtualization, netdev, linux-kernel, bpf
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
In some cases, we hope to construct skb directly based on the existing
memory without copying data. In this case, the page will be placed
directly in the skb, and the linear space of skb is empty. But
unfortunately, many network cards do not support this operation.
For example, Mellanox Technologies MT27710 Family [ConnectX-4 Lx] will
get the following error message:
mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8,
qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
So a priv_flag is added here to indicate whether the network card
supports this feature.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Suggested-by: Alexander Lobakin <alobakin@pm.me>
[ alobakin: give a new flag more detailed description ]
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
---
include/linux/netdevice.h | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index fa4ab77ce81e..86e19f62f978 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1525,6 +1525,8 @@ struct net_device_ops {
* @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
* @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
* @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
+ * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
+ * skb_headlen(skb) == 0 (data starts from frag0)
*/
enum netdev_priv_flags {
IFF_802_1Q_VLAN_BIT,
@@ -1558,6 +1560,7 @@ enum netdev_priv_flags {
IFF_FAILOVER_SLAVE_BIT,
IFF_L3MDEV_RX_HANDLER_BIT,
IFF_LIVE_RENAME_OK_BIT,
+ IFF_TX_SKB_NO_LINEAR_BIT,
NETDEV_PRIV_FLAG_COUNT,
};
@@ -1600,6 +1603,7 @@ static_assert(sizeof(netdev_priv_flags_t) * BITS_PER_BYTE <=
#define IFF_FAILOVER_SLAVE __IFF(FAILOVER_SLAVE)
#define IFF_L3MDEV_RX_HANDLER __IFF(L3MDEV_RX_HANDLER)
#define IFF_LIVE_RENAME_OK __IFF(LIVE_RENAME_OK)
+#define IFF_TX_SKB_NO_LINEAR __IFF(TX_SKB_NO_LINEAR)
/**
* struct net_device - The DEVICE structure.
--
2.30.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 bpf-next 4/6] virtio-net: support IFF_TX_SKB_NO_LINEAR
2021-02-16 11:38 [PATCH v4 bpf-next 0/6] xsk: build skb by page (aka generic zerocopy xmit) Alexander Lobakin
` (2 preceding siblings ...)
2021-02-16 11:38 ` [PATCH v4 bpf-next 3/6] net: add priv_flags for allow tx skb without linear Alexander Lobakin
@ 2021-02-16 11:38 ` Alexander Lobakin
2021-02-16 11:39 ` [PATCH v4 bpf-next 5/6] xsk: respect device's headroom and tailroom on generic xmit path Alexander Lobakin
2021-02-16 11:39 ` [PATCH v4 bpf-next 6/6] xsk: build skb by page (aka generic zerocopy xmit) Alexander Lobakin
5 siblings, 0 replies; 11+ messages in thread
From: Alexander Lobakin @ 2021-02-16 11:38 UTC (permalink / raw)
To: Magnus Karlsson, Björn Töpel
Cc: Michael S. Tsirkin, Jason Wang, David S. Miller, Jakub Kicinski,
Jonathan Lemon, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh, Paolo Abeni,
Eric Dumazet, Xuan Zhuo, Dust Li, Alexander Lobakin,
virtualization, netdev, linux-kernel, bpf
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Virtio net supports the case where the skb linear space is empty, so set
the IFF_TX_SKB_NO_LINEAR priv flag for it.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
---
drivers/net/virtio_net.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index ba8e63792549..f2ff6c3906c1 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2972,7 +2972,8 @@ static int virtnet_probe(struct virtio_device *vdev)
return -ENOMEM;
/* Set up network device as normal. */
- dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
+ dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE |
+ IFF_TX_SKB_NO_LINEAR;
dev->netdev_ops = &virtnet_netdev;
dev->features = NETIF_F_HIGHDMA;
--
2.30.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH v4 bpf-next 5/6] xsk: respect device's headroom and tailroom on generic xmit path
2021-02-16 11:38 [PATCH v4 bpf-next 0/6] xsk: build skb by page (aka generic zerocopy xmit) Alexander Lobakin
` (3 preceding siblings ...)
2021-02-16 11:38 ` [PATCH v4 bpf-next 4/6] virtio-net: support IFF_TX_SKB_NO_LINEAR Alexander Lobakin
@ 2021-02-16 11:39 ` Alexander Lobakin
2021-02-16 14:08 ` Magnus Karlsson
2021-02-16 11:39 ` [PATCH v4 bpf-next 6/6] xsk: build skb by page (aka generic zerocopy xmit) Alexander Lobakin
5 siblings, 1 reply; 11+ messages in thread
From: Alexander Lobakin @ 2021-02-16 11:39 UTC (permalink / raw)
To: Magnus Karlsson, Björn Töpel
Cc: Michael S. Tsirkin, Jason Wang, David S. Miller, Jakub Kicinski,
Jonathan Lemon, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh, Paolo Abeni,
Eric Dumazet, Xuan Zhuo, Dust Li, Alexander Lobakin,
virtualization, netdev, linux-kernel, bpf
xsk_generic_xmit() allocates a new skb and then queues it for
xmitting. The size of the new skb's linear area is desc->len, so it comes
to the driver/device with no reserved headroom and/or tailroom.
Lots of drivers need some headroom (and sometimes tailroom) to
prepend (and/or append) some headers or data, e.g. CPU tags,
device-specific headers/descriptors (LSO, TLS etc.), and in case
of no available space skb_cow_head() will reallocate the skb.
Reallocations are unwanted on fast-path, especially when it comes
to XDP, so generic XSK xmit should reserve the spaces declared in
dev->needed_headroom and dev->needed_tailroom to avoid them.
Note on max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)):
Usually, output functions reserve LL_RESERVED_SPACE(dev), which
consists of dev->hard_header_len + dev->needed_headroom, aligned
by 16.
However, on XSK xmit hard header is already here in the chunk, so
hard_header_len is not needed. But it'd still be better to align
data up to cacheline, while reserving no less than driver requests
for headroom. NET_SKB_PAD here is to doubly ensure there will be
no reallocations even when the driver advertises no needed_headroom,
but in fact needs it (a not so rare case).
Fixes: 35fcde7f8deb ("xsk: support for Tx")
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
---
net/xdp/xsk.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 4faabd1ecfd1..143979ea4165 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -454,12 +454,16 @@ static int xsk_generic_xmit(struct sock *sk)
struct sk_buff *skb;
unsigned long flags;
int err = 0;
+ u32 hr, tr;
mutex_lock(&xs->mutex);
if (xs->queue_id >= xs->dev->real_num_tx_queues)
goto out;
+ hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
+ tr = xs->dev->needed_tailroom;
+
while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
char *buffer;
u64 addr;
@@ -471,11 +475,13 @@ static int xsk_generic_xmit(struct sock *sk)
}
len = desc.len;
- skb = sock_alloc_send_skb(sk, len, 1, &err);
+ skb = sock_alloc_send_skb(sk, hr + len + tr, 1, &err);
if (unlikely(!skb))
goto out;
+ skb_reserve(skb, hr);
skb_put(skb, len);
+
addr = desc.addr;
buffer = xsk_buff_raw_get_data(xs->pool, addr);
err = skb_store_bits(skb, 0, buffer, len);
--
2.30.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH v4 bpf-next 5/6] xsk: respect device's headroom and tailroom on generic xmit path
2021-02-16 11:39 ` [PATCH v4 bpf-next 5/6] xsk: respect device's headroom and tailroom on generic xmit path Alexander Lobakin
@ 2021-02-16 14:08 ` Magnus Karlsson
0 siblings, 0 replies; 11+ messages in thread
From: Magnus Karlsson @ 2021-02-16 14:08 UTC (permalink / raw)
To: Alexander Lobakin
Cc: Magnus Karlsson, Björn Töpel, Michael S. Tsirkin,
Jason Wang, David S. Miller, Jakub Kicinski, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
Yonghong Song, KP Singh, Paolo Abeni, Eric Dumazet, Xuan Zhuo,
Dust Li, virtualization, Network Development, open list, bpf
On Tue, Feb 16, 2021 at 12:44 PM Alexander Lobakin <alobakin@pm.me> wrote:
>
> xsk_generic_xmit() allocates a new skb and then queues it for
> xmitting. The size of new skb's headroom is desc->len, so it comes
> to the driver/device with no reserved headroom and/or tailroom.
> Lots of drivers need some headroom (and sometimes tailroom) to
> prepend (and/or append) some headers or data, e.g. CPU tags,
> device-specific headers/descriptors (LSO, TLS etc.), and if case
> of no available space skb_cow_head() will reallocate the skb.
> Reallocations are unwanted on fast-path, especially when it comes
> to XDP, so generic XSK xmit should reserve the spaces declared in
> dev->needed_headroom and dev->needed tailroom to avoid them.
>
> Note on max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom)):
>
> Usually, output functions reserve LL_RESERVED_SPACE(dev), which
> consists of dev->hard_header_len + dev->needed_headroom, aligned
> by 16.
> However, on XSK xmit hard header is already here in the chunk, so
> hard_header_len is not needed. But it'd still be better to align
> data up to cacheline, while reserving no less than driver requests
> for headroom. NET_SKB_PAD here is to double-insure there will be
> no reallocations even when the driver advertises no needed_headroom,
> but in fact need it (not so rare case).
>
> Fixes: 35fcde7f8deb ("xsk: support for Tx")
> Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> ---
> net/xdp/xsk.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 4faabd1ecfd1..143979ea4165 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -454,12 +454,16 @@ static int xsk_generic_xmit(struct sock *sk)
> struct sk_buff *skb;
> unsigned long flags;
> int err = 0;
> + u32 hr, tr;
>
> mutex_lock(&xs->mutex);
>
> if (xs->queue_id >= xs->dev->real_num_tx_queues)
> goto out;
>
> + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
> + tr = xs->dev->needed_tailroom;
> +
> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> char *buffer;
> u64 addr;
> @@ -471,11 +475,13 @@ static int xsk_generic_xmit(struct sock *sk)
> }
>
> len = desc.len;
> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> + skb = sock_alloc_send_skb(sk, hr + len + tr, 1, &err);
> if (unlikely(!skb))
> goto out;
>
> + skb_reserve(skb, hr);
> skb_put(skb, len);
> +
> addr = desc.addr;
> buffer = xsk_buff_raw_get_data(xs->pool, addr);
> err = skb_store_bits(skb, 0, buffer, len);
> --
> 2.30.1
>
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH v4 bpf-next 6/6] xsk: build skb by page (aka generic zerocopy xmit)
2021-02-16 11:38 [PATCH v4 bpf-next 0/6] xsk: build skb by page (aka generic zerocopy xmit) Alexander Lobakin
` (4 preceding siblings ...)
2021-02-16 11:39 ` [PATCH v4 bpf-next 5/6] xsk: respect device's headroom and tailroom on generic xmit path Alexander Lobakin
@ 2021-02-16 11:39 ` Alexander Lobakin
2021-02-16 14:08 ` Magnus Karlsson
5 siblings, 1 reply; 11+ messages in thread
From: Alexander Lobakin @ 2021-02-16 11:39 UTC (permalink / raw)
To: Magnus Karlsson, Björn Töpel
Cc: Michael S. Tsirkin, Jason Wang, David S. Miller, Jakub Kicinski,
Jonathan Lemon, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh, Paolo Abeni,
Eric Dumazet, Xuan Zhuo, Dust Li, Alexander Lobakin,
virtualization, netdev, linux-kernel, bpf
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
This patch constructs skbs directly from pages to save memory copy
overhead.
This function is implemented based on IFF_TX_SKB_NO_LINEAR. Only
network cards whose priv_flags include IFF_TX_SKB_NO_LINEAR will use pages
to directly construct the skb. If this feature is not supported, it is still
necessary to copy data to construct the skb.
---------------- Performance Testing ------------
The test environment is Aliyun ECS server.
Test cmd:
```
xdpsock -i eth0 -t -S -s <msg size>
```
Test result data:
size 64 512 1024 1500
copy 1916747 1775988 1600203 1440054
page 1974058 1953655 1945463 1904478
percent 3.0% 10.0% 21.58% 32.3%
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
[ alobakin:
- expand subject to make it clearer;
- improve skb->truesize calculation;
- reserve some headroom in skb for drivers;
- tailroom is not needed as skb is non-linear ]
Signed-off-by: Alexander Lobakin <alobakin@pm.me>
---
net/xdp/xsk.c | 119 ++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 95 insertions(+), 24 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 143979ea4165..ff7bd06e1241 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -445,6 +445,96 @@ static void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
+static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
+ struct xdp_desc *desc)
+{
+ struct xsk_buff_pool *pool = xs->pool;
+ u32 hr, len, offset, copy, copied;
+ struct sk_buff *skb;
+ struct page *page;
+ void *buffer;
+ int err, i;
+ u64 addr;
+
+ hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
+
+ skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
+ if (unlikely(!skb))
+ return ERR_PTR(err);
+
+ skb_reserve(skb, hr);
+
+ addr = desc->addr;
+ len = desc->len;
+
+ buffer = xsk_buff_raw_get_data(pool, addr);
+ offset = offset_in_page(buffer);
+ addr = buffer - pool->addrs;
+
+ for (copied = 0, i = 0; copied < len; i++) {
+ page = pool->umem->pgs[addr >> PAGE_SHIFT];
+ get_page(page);
+
+ copy = min_t(u32, PAGE_SIZE - offset, len - copied);
+ skb_fill_page_desc(skb, i, page, offset, copy);
+
+ copied += copy;
+ addr += copy;
+ offset = 0;
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ skb->truesize += pool->unaligned ? len : pool->chunk_size;
+
+ refcount_add(skb->truesize, &xs->sk.sk_wmem_alloc);
+
+ return skb;
+}
+
+static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
+ struct xdp_desc *desc)
+{
+ struct net_device *dev = xs->dev;
+ struct sk_buff *skb;
+
+ if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
+ skb = xsk_build_skb_zerocopy(xs, desc);
+ if (IS_ERR(skb))
+ return skb;
+ } else {
+ u32 hr, tr, len;
+ void *buffer;
+ int err;
+
+ hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
+ tr = dev->needed_tailroom;
+ len = desc->len;
+
+ skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
+ if (unlikely(!skb))
+ return ERR_PTR(err);
+
+ skb_reserve(skb, hr);
+ skb_put(skb, len);
+
+ buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
+ err = skb_store_bits(skb, 0, buffer, len);
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+ }
+
+ skb->dev = dev;
+ skb->priority = xs->sk.sk_priority;
+ skb->mark = xs->sk.sk_mark;
+ skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
+ skb->destructor = xsk_destruct_skb;
+
+ return skb;
+}
+
static int xsk_generic_xmit(struct sock *sk)
{
struct xdp_sock *xs = xdp_sk(sk);
@@ -454,56 +544,37 @@ static int xsk_generic_xmit(struct sock *sk)
struct sk_buff *skb;
unsigned long flags;
int err = 0;
- u32 hr, tr;
mutex_lock(&xs->mutex);
if (xs->queue_id >= xs->dev->real_num_tx_queues)
goto out;
- hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
- tr = xs->dev->needed_tailroom;
-
while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
- char *buffer;
- u64 addr;
- u32 len;
-
if (max_batch-- == 0) {
err = -EAGAIN;
goto out;
}
- len = desc.len;
- skb = sock_alloc_send_skb(sk, hr + len + tr, 1, &err);
- if (unlikely(!skb))
+ skb = xsk_build_skb(xs, &desc);
+ if (IS_ERR(skb)) {
+ err = PTR_ERR(skb);
goto out;
+ }
- skb_reserve(skb, hr);
- skb_put(skb, len);
-
- addr = desc.addr;
- buffer = xsk_buff_raw_get_data(xs->pool, addr);
- err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
spin_lock_irqsave(&xs->pool->cq_lock, flags);
- if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+ if (xskq_prod_reserve(xs->pool->cq)) {
spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
kfree_skb(skb);
goto out;
}
spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
- skb->dev = xs->dev;
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
- skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
- skb->destructor = xsk_destruct_skb;
-
err = __dev_direct_xmit(skb, xs->queue_id);
if (err == NETDEV_TX_BUSY) {
/* Tell user-space to retry the send */
--
2.30.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH v4 bpf-next 6/6] xsk: build skb by page (aka generic zerocopy xmit)
2021-02-16 11:39 ` [PATCH v4 bpf-next 6/6] xsk: build skb by page (aka generic zerocopy xmit) Alexander Lobakin
@ 2021-02-16 14:08 ` Magnus Karlsson
2021-02-16 14:15 ` Alexander Lobakin
0 siblings, 1 reply; 11+ messages in thread
From: Magnus Karlsson @ 2021-02-16 14:08 UTC (permalink / raw)
To: Alexander Lobakin
Cc: Magnus Karlsson, Björn Töpel, Michael S. Tsirkin,
Jason Wang, David S. Miller, Jakub Kicinski, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
Yonghong Song, KP Singh, Paolo Abeni, Eric Dumazet, Xuan Zhuo,
Dust Li, virtualization, Network Development, open list, bpf
On Tue, Feb 16, 2021 at 12:44 PM Alexander Lobakin <alobakin@pm.me> wrote:
>
> From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>
> This patch constructs the skb directly from pages in order to save the
> memory copy overhead.
>
> The feature is implemented on top of IFF_TX_SKB_NO_LINEAR. Only network
> cards whose priv_flags include IFF_TX_SKB_NO_LINEAR will have the skb
> built directly from pages. If the flag is not set, the data still has
> to be copied in order to construct the skb.
>
> ---------------- Performance Testing ------------
>
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t -S -s <msg size>
> ```
>
> Test result data:
>
> size 64 512 1024 1500
> copy 1916747 1775988 1600203 1440054
> page 1974058 1953655 1945463 1904478
> percent 3.0% 10.0% 21.58% 32.3%
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> [ alobakin:
> - expand subject to make it clearer;
> - improve skb->truesize calculation;
> - reserve some headroom in skb for drivers;
> - tailroom is not needed as skb is non-linear ]
> Signed-off-by: Alexander Lobakin <alobakin@pm.me>
Thank you Alexander!
Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
> ---
> net/xdp/xsk.c | 119 ++++++++++++++++++++++++++++++++++++++++----------
> 1 file changed, 95 insertions(+), 24 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 143979ea4165..ff7bd06e1241 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -445,6 +445,96 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> + struct xdp_desc *desc)
> +{
> + struct xsk_buff_pool *pool = xs->pool;
> + u32 hr, len, offset, copy, copied;
> + struct sk_buff *skb;
> + struct page *page;
> + void *buffer;
> + int err, i;
> + u64 addr;
> +
> + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
> +
> + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
> + if (unlikely(!skb))
> + return ERR_PTR(err);
> +
> + skb_reserve(skb, hr);
> +
> + addr = desc->addr;
> + len = desc->len;
> +
> + buffer = xsk_buff_raw_get_data(pool, addr);
> + offset = offset_in_page(buffer);
> + addr = buffer - pool->addrs;
> +
> + for (copied = 0, i = 0; copied < len; i++) {
> + page = pool->umem->pgs[addr >> PAGE_SHIFT];
> + get_page(page);
> +
> + copy = min_t(u32, PAGE_SIZE - offset, len - copied);
> + skb_fill_page_desc(skb, i, page, offset, copy);
> +
> + copied += copy;
> + addr += copy;
> + offset = 0;
> + }
> +
> + skb->len += len;
> + skb->data_len += len;
> + skb->truesize += pool->unaligned ? len : pool->chunk_size;
> +
> + refcount_add(skb->truesize, &xs->sk.sk_wmem_alloc);
> +
> + return skb;
> +}
> +
> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> + struct xdp_desc *desc)
> +{
> + struct net_device *dev = xs->dev;
> + struct sk_buff *skb;
> +
> + if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
> + skb = xsk_build_skb_zerocopy(xs, desc);
> + if (IS_ERR(skb))
> + return skb;
> + } else {
> + u32 hr, tr, len;
> + void *buffer;
> + int err;
> +
> + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
> + tr = dev->needed_tailroom;
> + len = desc->len;
> +
> + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
> + if (unlikely(!skb))
> + return ERR_PTR(err);
> +
> + skb_reserve(skb, hr);
> + skb_put(skb, len);
> +
> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> + err = skb_store_bits(skb, 0, buffer, len);
> + if (unlikely(err)) {
> + kfree_skb(skb);
> + return ERR_PTR(err);
> + }
> + }
> +
> + skb->dev = dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
> +}
> +
> static int xsk_generic_xmit(struct sock *sk)
> {
> struct xdp_sock *xs = xdp_sk(sk);
> @@ -454,56 +544,37 @@ static int xsk_generic_xmit(struct sock *sk)
> struct sk_buff *skb;
> unsigned long flags;
> int err = 0;
> - u32 hr, tr;
>
> mutex_lock(&xs->mutex);
>
> if (xs->queue_id >= xs->dev->real_num_tx_queues)
> goto out;
>
> - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
> - tr = xs->dev->needed_tailroom;
> -
> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> - char *buffer;
> - u64 addr;
> - u32 len;
> -
> if (max_batch-- == 0) {
> err = -EAGAIN;
> goto out;
> }
>
> - len = desc.len;
> - skb = sock_alloc_send_skb(sk, hr + len + tr, 1, &err);
> - if (unlikely(!skb))
> + skb = xsk_build_skb(xs, &desc);
> + if (IS_ERR(skb)) {
> + err = PTR_ERR(skb);
> goto out;
> + }
>
> - skb_reserve(skb, hr);
> - skb_put(skb, len);
> -
> - addr = desc.addr;
> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> - err = skb_store_bits(skb, 0, buffer, len);
> /* This is the backpressure mechanism for the Tx path.
> * Reserve space in the completion queue and only proceed
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> + if (xskq_prod_reserve(xs->pool->cq)) {
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> kfree_skb(skb);
> goto out;
> }
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>
> - skb->dev = xs->dev;
> - skb->priority = sk->sk_priority;
> - skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> - skb->destructor = xsk_destruct_skb;
> -
> err = __dev_direct_xmit(skb, xs->queue_id);
> if (err == NETDEV_TX_BUSY) {
> /* Tell user-space to retry the send */
> --
> 2.30.1
>
>
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH v4 bpf-next 6/6] xsk: build skb by page (aka generic zerocopy xmit)
2021-02-16 14:08 ` Magnus Karlsson
@ 2021-02-16 14:15 ` Alexander Lobakin
0 siblings, 0 replies; 11+ messages in thread
From: Alexander Lobakin @ 2021-02-16 14:15 UTC (permalink / raw)
To: Magnus Karlsson
Cc: Alexander Lobakin, Björn Töpel, Michael S. Tsirkin,
Jason Wang, David S. Miller, Jakub Kicinski, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
Yonghong Song, KP Singh, Paolo Abeni, Eric Dumazet, Xuan Zhuo,
Dust Li, virtualization, Network Development, open list, bpf
From: Magnus Karlsson <magnus.karlsson@gmail.com>
Date: Tue, 16 Feb 2021 15:08:26 +0100
> On Tue, Feb 16, 2021 at 12:44 PM Alexander Lobakin <alobakin@pm.me> wrote:
> >
> > From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> >
> > This patch constructs the skb directly from pages in order to save the
> > memory copy overhead.
> >
> > The feature is implemented on top of IFF_TX_SKB_NO_LINEAR. Only network
> > cards whose priv_flags include IFF_TX_SKB_NO_LINEAR will have the skb
> > built directly from pages. If the flag is not set, the data still has
> > to be copied in order to construct the skb.
> >
> > ---------------- Performance Testing ------------
> >
> > The test environment is Aliyun ECS server.
> > Test cmd:
> > ```
> > xdpsock -i eth0 -t -S -s <msg size>
> > ```
> >
> > Test result data:
> >
> > size 64 512 1024 1500
> > copy 1916747 1775988 1600203 1440054
> > page 1974058 1953655 1945463 1904478
> > percent 3.0% 10.0% 21.58% 32.3%
> >
> > Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> > [ alobakin:
> > - expand subject to make it clearer;
> > - improve skb->truesize calculation;
> > - reserve some headroom in skb for drivers;
> > - tailroom is not needed as skb is non-linear ]
> > Signed-off-by: Alexander Lobakin <alobakin@pm.me>
>
> Thank you Alexander!
>
> Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
Thanks!
I have one more generic zerocopy implementation to offer (inspired by
this series) that wouldn't require IFF_TX_SKB_NO_LINEAR, only the ability
to xmit S/G packets, which almost every NIC has. I'll publish an RFC once
this series and your upcoming changes get merged.
> > ---
> > net/xdp/xsk.c | 119 ++++++++++++++++++++++++++++++++++++++++----------
> > 1 file changed, 95 insertions(+), 24 deletions(-)
> >
> > diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > index 143979ea4165..ff7bd06e1241 100644
> > --- a/net/xdp/xsk.c
> > +++ b/net/xdp/xsk.c
> > @@ -445,6 +445,96 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > sock_wfree(skb);
> > }
> >
> > +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > + struct xdp_desc *desc)
> > +{
> > + struct xsk_buff_pool *pool = xs->pool;
> > + u32 hr, len, offset, copy, copied;
> > + struct sk_buff *skb;
> > + struct page *page;
> > + void *buffer;
> > + int err, i;
> > + u64 addr;
> > +
> > + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
> > +
> > + skb = sock_alloc_send_skb(&xs->sk, hr, 1, &err);
> > + if (unlikely(!skb))
> > + return ERR_PTR(err);
> > +
> > + skb_reserve(skb, hr);
> > +
> > + addr = desc->addr;
> > + len = desc->len;
> > +
> > + buffer = xsk_buff_raw_get_data(pool, addr);
> > + offset = offset_in_page(buffer);
> > + addr = buffer - pool->addrs;
> > +
> > + for (copied = 0, i = 0; copied < len; i++) {
> > + page = pool->umem->pgs[addr >> PAGE_SHIFT];
> > + get_page(page);
> > +
> > + copy = min_t(u32, PAGE_SIZE - offset, len - copied);
> > + skb_fill_page_desc(skb, i, page, offset, copy);
> > +
> > + copied += copy;
> > + addr += copy;
> > + offset = 0;
> > + }
> > +
> > + skb->len += len;
> > + skb->data_len += len;
> > + skb->truesize += pool->unaligned ? len : pool->chunk_size;
> > +
> > + refcount_add(skb->truesize, &xs->sk.sk_wmem_alloc);
> > +
> > + return skb;
> > +}
> > +
> > +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > + struct xdp_desc *desc)
> > +{
> > + struct net_device *dev = xs->dev;
> > + struct sk_buff *skb;
> > +
> > + if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
> > + skb = xsk_build_skb_zerocopy(xs, desc);
> > + if (IS_ERR(skb))
> > + return skb;
> > + } else {
> > + u32 hr, tr, len;
> > + void *buffer;
> > + int err;
> > +
> > + hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(dev->needed_headroom));
> > + tr = dev->needed_tailroom;
> > + len = desc->len;
> > +
> > + skb = sock_alloc_send_skb(&xs->sk, hr + len + tr, 1, &err);
> > + if (unlikely(!skb))
> > + return ERR_PTR(err);
> > +
> > + skb_reserve(skb, hr);
> > + skb_put(skb, len);
> > +
> > + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> > + err = skb_store_bits(skb, 0, buffer, len);
> > + if (unlikely(err)) {
> > + kfree_skb(skb);
> > + return ERR_PTR(err);
> > + }
> > + }
> > +
> > + skb->dev = dev;
> > + skb->priority = xs->sk.sk_priority;
> > + skb->mark = xs->sk.sk_mark;
> > + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> > + skb->destructor = xsk_destruct_skb;
> > +
> > + return skb;
> > +}
> > +
> > static int xsk_generic_xmit(struct sock *sk)
> > {
> > struct xdp_sock *xs = xdp_sk(sk);
> > @@ -454,56 +544,37 @@ static int xsk_generic_xmit(struct sock *sk)
> > struct sk_buff *skb;
> > unsigned long flags;
> > int err = 0;
> > - u32 hr, tr;
> >
> > mutex_lock(&xs->mutex);
> >
> > if (xs->queue_id >= xs->dev->real_num_tx_queues)
> > goto out;
> >
> > - hr = max(NET_SKB_PAD, L1_CACHE_ALIGN(xs->dev->needed_headroom));
> > - tr = xs->dev->needed_tailroom;
> > -
> > while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> > - char *buffer;
> > - u64 addr;
> > - u32 len;
> > -
> > if (max_batch-- == 0) {
> > err = -EAGAIN;
> > goto out;
> > }
> >
> > - len = desc.len;
> > - skb = sock_alloc_send_skb(sk, hr + len + tr, 1, &err);
> > - if (unlikely(!skb))
> > + skb = xsk_build_skb(xs, &desc);
> > + if (IS_ERR(skb)) {
> > + err = PTR_ERR(skb);
> > goto out;
> > + }
> >
> > - skb_reserve(skb, hr);
> > - skb_put(skb, len);
> > -
> > - addr = desc.addr;
> > - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > - err = skb_store_bits(skb, 0, buffer, len);
> > /* This is the backpressure mechanism for the Tx path.
> > * Reserve space in the completion queue and only proceed
> > * if there is space in it. This avoids having to implement
> > * any buffering in the Tx path.
> > */
> > spin_lock_irqsave(&xs->pool->cq_lock, flags);
> > - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> > + if (xskq_prod_reserve(xs->pool->cq)) {
> > spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> > kfree_skb(skb);
> > goto out;
> > }
> > spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> >
> > - skb->dev = xs->dev;
> > - skb->priority = sk->sk_priority;
> > - skb->mark = sk->sk_mark;
> > - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> > - skb->destructor = xsk_destruct_skb;
> > -
> > err = __dev_direct_xmit(skb, xs->queue_id);
> > if (err == NETDEV_TX_BUSY) {
> > /* Tell user-space to retry the send */
> > --
> > 2.30.1
Al
^ permalink raw reply [flat|nested] 11+ messages in thread