* [PATCH bpf-next] xsk: build skb by page
@ 2020-12-23 8:56 Xuan Zhuo
2020-12-23 10:04 ` Magnus Karlsson
2020-12-31 16:29 ` John Fastabend
0 siblings, 2 replies; 23+ messages in thread
From: Xuan Zhuo @ 2020-12-23 8:56 UTC (permalink / raw)
To: magnus.karlsson
Cc: Björn Töpel, Jonathan Lemon, David S. Miller,
Jakub Kicinski, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh,
open list:XDP SOCKETS (AF_XDP), open list:XDP SOCKETS (AF_XDP),
open list
This patch constructs the skb directly from the umem pages, saving the
memory copy overhead.
It takes into account unaligned addrs, and the possibility of frame
sizes greater than a page in the future.
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
net/xdp/xsk.c | 68 ++++++++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 51 insertions(+), 17 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ac4a317..7cab40f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -430,6 +430,55 @@ static void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
+static struct sk_buff *xsk_build_skb_bypage(struct xdp_sock *xs, struct xdp_desc *desc)
+{
+ char *buffer;
+ u64 addr;
+ u32 len, offset, copy, copied;
+ int err, i;
+ struct page *page;
+ struct sk_buff *skb;
+
+ skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
+ if (unlikely(!skb))
+ return NULL;
+
+ addr = desc->addr;
+ len = desc->len;
+
+ buffer = xsk_buff_raw_get_data(xs->pool, addr);
+ offset = offset_in_page(buffer);
+ addr = buffer - (char *)xs->pool->addrs;
+
+ for (copied = 0, i = 0; copied < len; ++i) {
+ page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
+
+ get_page(page);
+
+ copy = min((u32)(PAGE_SIZE - offset), len - copied);
+
+ skb_fill_page_desc(skb, i, page, offset, copy);
+
+ copied += copy;
+ addr += copy;
+ offset = 0;
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ skb->truesize += len;
+
+ refcount_add(len, &xs->sk.sk_wmem_alloc);
+
+ skb->dev = xs->dev;
+ skb->priority = xs->sk.sk_priority;
+ skb->mark = xs->sk.sk_mark;
+ skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
+ skb->destructor = xsk_destruct_skb;
+
+ return skb;
+}
+
static int xsk_generic_xmit(struct sock *sk)
{
struct xdp_sock *xs = xdp_sk(sk);
@@ -445,40 +494,25 @@ static int xsk_generic_xmit(struct sock *sk)
goto out;
while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
- char *buffer;
- u64 addr;
- u32 len;
-
if (max_batch-- == 0) {
err = -EAGAIN;
goto out;
}
- len = desc.len;
- skb = sock_alloc_send_skb(sk, len, 1, &err);
+ skb = xsk_build_skb_bypage(xs, &desc);
if (unlikely(!skb))
goto out;
- skb_put(skb, len);
- addr = desc.addr;
- buffer = xsk_buff_raw_get_data(xs->pool, addr);
- err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+ if (xskq_prod_reserve(xs->pool->cq)) {
kfree_skb(skb);
goto out;
}
- skb->dev = xs->dev;
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
- skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
- skb->destructor = xsk_destruct_skb;
-
err = __dev_direct_xmit(skb, xs->queue_id);
if (err == NETDEV_TX_BUSY) {
/* Tell user-space to retry the send */
--
1.8.3.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2020-12-23 8:56 [PATCH bpf-next] xsk: build skb by page Xuan Zhuo
@ 2020-12-23 10:04 ` Magnus Karlsson
2020-12-29 8:32 ` Xuan Zhuo
2020-12-31 16:29 ` John Fastabend
1 sibling, 1 reply; 23+ messages in thread
From: Magnus Karlsson @ 2020-12-23 10:04 UTC (permalink / raw)
To: Xuan Zhuo
Cc: Karlsson, Magnus, Björn Töpel, Jonathan Lemon,
David S. Miller, Jakub Kicinski, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
KP Singh, open list:XDP SOCKETS (AF_XDP),
open list:XDP SOCKETS (AF_XDP),
open list
On Wed, Dec 23, 2020 at 9:57 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> Taking into account the problem of addr unaligned, and the
> possibility of frame size greater than page in the future.
Thanks Xuan for the patch set. Could you please share performance
numbers so we know how much this buys us? Would be good if you could
produce them for 64 bytes, 1500 bytes and something in the middle so
we can judge the benefits of this.
Please note that responses will be delayed this week and next due to
the Christmas and New Year's holidays over here.
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
> net/xdp/xsk.c | 68 ++++++++++++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 51 insertions(+), 17 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index ac4a317..7cab40f 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,55 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +static struct sk_buff *xsk_build_skb_bypage(struct xdp_sock *xs, struct xdp_desc *desc)
> +{
> + char *buffer;
> + u64 addr;
> + u32 len, offset, copy, copied;
> + int err, i;
> + struct page *page;
> + struct sk_buff *skb;
> +
> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> + if (unlikely(!skb))
> + return NULL;
> +
> + addr = desc->addr;
> + len = desc->len;
> +
> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> + offset = offset_in_page(buffer);
> + addr = buffer - (char *)xs->pool->addrs;
> +
> + for (copied = 0, i = 0; copied < len; ++i) {
> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + get_page(page);
> +
> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> +
> + skb_fill_page_desc(skb, i, page, offset, copy);
> +
> + copied += copy;
> + addr += copy;
> + offset = 0;
> + }
> +
> + skb->len += len;
> + skb->data_len += len;
> + skb->truesize += len;
> +
> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
> +}
> +
> static int xsk_generic_xmit(struct sock *sk)
> {
> struct xdp_sock *xs = xdp_sk(sk);
> @@ -445,40 +494,25 @@ static int xsk_generic_xmit(struct sock *sk)
> goto out;
>
> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> - char *buffer;
> - u64 addr;
> - u32 len;
> -
> if (max_batch-- == 0) {
> err = -EAGAIN;
> goto out;
> }
>
> - len = desc.len;
> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> + skb = xsk_build_skb_bypage(xs, &desc);
> if (unlikely(!skb))
> goto out;
>
> - skb_put(skb, len);
> - addr = desc.addr;
> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> - err = skb_store_bits(skb, 0, buffer, len);
> /* This is the backpressure mechanism for the Tx path.
> * Reserve space in the completion queue and only proceed
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> + if (xskq_prod_reserve(xs->pool->cq)) {
> kfree_skb(skb);
> goto out;
> }
>
> - skb->dev = xs->dev;
> - skb->priority = sk->sk_priority;
> - skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> - skb->destructor = xsk_destruct_skb;
> -
> err = __dev_direct_xmit(skb, xs->queue_id);
> if (err == NETDEV_TX_BUSY) {
> /* Tell user-space to retry the send */
> --
> 1.8.3.1
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH bpf-next] xsk: build skb by page
2020-12-23 10:04 ` Magnus Karlsson
@ 2020-12-29 8:32 ` Xuan Zhuo
0 siblings, 0 replies; 23+ messages in thread
From: Xuan Zhuo @ 2020-12-29 8:32 UTC (permalink / raw)
To: magnus.karlsson
Cc: Björn Töpel, Jonathan Lemon, David S. Miller,
Jakub Kicinski, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh,
open list:XDP SOCKETS (AF_XDP), open list:XDP SOCKETS (AF_XDP),
open list
This patch constructs the skb directly from the umem pages, saving the
memory copy overhead.
It takes into account unaligned addrs, and the possibility of frame
sizes greater than a page in the future.
The test environment is Aliyun ECS server.
Test cmd:
```
xdpsock -i eth0 -t -S -s <msg size>
```
Test result data:
size 64 512 1024 1500
copy 1916747 1775988 1600203 1440054
page 1974058 1953655 1945463 1904478
percent 3.0% 10.0% 21.58% 32.3%
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
---
net/xdp/xsk.c | 68 ++++++++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 51 insertions(+), 17 deletions(-)
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index ac4a317..7cab40f 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -430,6 +430,55 @@ static void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
+static struct sk_buff *xsk_build_skb_bypage(struct xdp_sock *xs, struct xdp_desc *desc)
+{
+ char *buffer;
+ u64 addr;
+ u32 len, offset, copy, copied;
+ int err, i;
+ struct page *page;
+ struct sk_buff *skb;
+
+ skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
+ if (unlikely(!skb))
+ return NULL;
+
+ addr = desc->addr;
+ len = desc->len;
+
+ buffer = xsk_buff_raw_get_data(xs->pool, addr);
+ offset = offset_in_page(buffer);
+ addr = buffer - (char *)xs->pool->addrs;
+
+ for (copied = 0, i = 0; copied < len; ++i) {
+ page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
+
+ get_page(page);
+
+ copy = min((u32)(PAGE_SIZE - offset), len - copied);
+
+ skb_fill_page_desc(skb, i, page, offset, copy);
+
+ copied += copy;
+ addr += copy;
+ offset = 0;
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ skb->truesize += len;
+
+ refcount_add(len, &xs->sk.sk_wmem_alloc);
+
+ skb->dev = xs->dev;
+ skb->priority = xs->sk.sk_priority;
+ skb->mark = xs->sk.sk_mark;
+ skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
+ skb->destructor = xsk_destruct_skb;
+
+ return skb;
+}
+
static int xsk_generic_xmit(struct sock *sk)
{
struct xdp_sock *xs = xdp_sk(sk);
@@ -445,40 +494,25 @@ static int xsk_generic_xmit(struct sock *sk)
goto out;
while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
- char *buffer;
- u64 addr;
- u32 len;
-
if (max_batch-- == 0) {
err = -EAGAIN;
goto out;
}
- len = desc.len;
- skb = sock_alloc_send_skb(sk, len, 1, &err);
+ skb = xsk_build_skb_bypage(xs, &desc);
if (unlikely(!skb))
goto out;
- skb_put(skb, len);
- addr = desc.addr;
- buffer = xsk_buff_raw_get_data(xs->pool, addr);
- err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
- if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+ if (xskq_prod_reserve(xs->pool->cq)) {
kfree_skb(skb);
goto out;
}
- skb->dev = xs->dev;
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
- skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
- skb->destructor = xsk_destruct_skb;
-
err = __dev_direct_xmit(skb, xs->queue_id);
if (err == NETDEV_TX_BUSY) {
/* Tell user-space to retry the send */
--
1.8.3.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
* RE: [PATCH bpf-next] xsk: build skb by page
2020-12-23 8:56 [PATCH bpf-next] xsk: build skb by page Xuan Zhuo
2020-12-23 10:04 ` Magnus Karlsson
@ 2020-12-31 16:29 ` John Fastabend
1 sibling, 0 replies; 23+ messages in thread
From: John Fastabend @ 2020-12-31 16:29 UTC (permalink / raw)
To: Xuan Zhuo, magnus.karlsson
Cc: Björn Töpel, Jonathan Lemon, David S. Miller,
Jakub Kicinski, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh,
(open list:XDP SOCKETS \(AF_XDP\)),
open list:XDP SOCKETS (AF_XDP),
(open list:XDP SOCKETS \(AF_XDP\) open list)
Xuan Zhuo wrote:
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> Taking into account the problem of addr unaligned, and the
> possibility of frame size greater than page in the future.
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> ---
> net/xdp/xsk.c | 68 ++++++++++++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 51 insertions(+), 17 deletions(-)
>
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index ac4a317..7cab40f 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,55 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +static struct sk_buff *xsk_build_skb_bypage(struct xdp_sock *xs, struct xdp_desc *desc)
> +{
> + char *buffer;
> + u64 addr;
> + u32 len, offset, copy, copied;
> + int err, i;
> + struct page *page;
> + struct sk_buff *skb;
> +
> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
Because this is just grabbing an skb did you consider build_skb?
> + if (unlikely(!skb))
> + return NULL;
I think it would be best to push err back to caller here with ERR_PTR().
> +
> + addr = desc->addr;
> + len = desc->len;
> +
> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> + offset = offset_in_page(buffer);
> + addr = buffer - (char *)xs->pool->addrs;
> +
> + for (copied = 0, i = 0; copied < len; ++i) {
> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + get_page(page);
Is it obvious why this get_page() is needed? Maybe a small comment would
be nice. Something like, "we need to inc refcnt on page to ensure skb
does not release page from pool".
> +
> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> +
nit: take it or leave it, seems like a lot of new lines imo. I would
just put all these together. Not really important though.
> + skb_fill_page_desc(skb, i, page, offset, copy);
> +
> + copied += copy;
> + addr += copy;
> + offset = 0;
> + }
> +
> + skb->len += len;
> + skb->data_len += len;
> + skb->truesize += len;
> +
> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
> +}
> +
> static int xsk_generic_xmit(struct sock *sk)
> {
> struct xdp_sock *xs = xdp_sk(sk);
> @@ -445,40 +494,25 @@ static int xsk_generic_xmit(struct sock *sk)
> goto out;
>
> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> - char *buffer;
> - u64 addr;
> - u32 len;
> -
> if (max_batch-- == 0) {
> err = -EAGAIN;
> goto out;
> }
>
> - len = desc.len;
> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> + skb = xsk_build_skb_bypage(xs, &desc);
> if (unlikely(!skb))
Is err set here? Either way if skb is an ERR_PTR we can use that
here for better error handling.
> goto out;
>
> - skb_put(skb, len);
> - addr = desc.addr;
> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> - err = skb_store_bits(skb, 0, buffer, len);
> /* This is the backpressure mechanism for the Tx path.
> * Reserve space in the completion queue and only proceed
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> + if (xskq_prod_reserve(xs->pool->cq)) {
> kfree_skb(skb);
Same here, do we need to set err now that it's not explicit above in
err = skb_store_bits...
> goto out;
> }
>
> - skb->dev = xs->dev;
> - skb->priority = sk->sk_priority;
> - skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> - skb->destructor = xsk_destruct_skb;
> -
> err = __dev_direct_xmit(skb, xs->queue_id);
> if (err == NETDEV_TX_BUSY) {
> /* Tell user-space to retry the send */
> --
> 1.8.3.1
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-16 2:44 Xuan Zhuo
` (6 preceding siblings ...)
2021-01-18 12:40 ` Yunsheng Lin
@ 2021-01-19 12:44 ` Alexander Lobakin
7 siblings, 0 replies; 23+ messages in thread
From: Alexander Lobakin @ 2021-01-19 12:44 UTC (permalink / raw)
To: Xuan Zhuo
Cc: Alexander Lobakin, Michael S. Tsirkin, Jason Wang,
David S. Miller, Jakub Kicinski, bjorn.topel, Magnus Karlsson,
Jonathan Lemon, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh,
Willem de Bruijn, Steffen Klassert, Miaohe Lin,
Mauro Carvalho Chehab, Antoine Tenart, Michal Kubecek,
Andrew Lunn, Florian Fainelli, Meir Lichtinger, virtualization,
bpf, netdev, linux-kernel
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Sat, 16 Jan 2021 10:44:53 +0800
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> This has one problem:
>
> We construct the skb by fill the data page as a frag into the skb. In
> this way, the linear space is empty, and the header information is also
> in the frag, not in the linear space, which is not allowed for some
> network cards. For example, Mellanox Technologies MT27710 Family
> [ConnectX-4 Lx] will get the following error message:
>
> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>
> I also tried to use build_skb to construct skb, but because of the
> existence of skb_shinfo, it must be behind the linear space, so this
> method is not working. We can't put skb_shinfo on desc->addr, it will be
> exposed to users, this is not safe.
>
> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> network card supports the header information of the packet in the frag
> and not in the linear space.
>
> ---------------- Performance Testing ------------
>
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t -S -s <msg size>
> ```
>
> Test result data:
>
> size 64 512 1024 1500
> copy 1916747 1775988 1600203 1440054
> page 1974058 1953655 1945463 1904478
> percent 3.0% 10.0% 21.58% 32.3%
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
> drivers/net/virtio_net.c | 2 +-
> include/linux/netdev_features.h | 5 +-
> net/ethtool/common.c | 1 +
> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
> 4 files changed, 97 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 4ecccb8..841a331 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> /* Set up network device as normal. */
> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> dev->netdev_ops = &virtnet_netdev;
> - dev->features = NETIF_F_HIGHDMA;
> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>
> dev->ethtool_ops = &virtnet_ethtool_ops;
> SET_NETDEV_DEV(dev, &vdev->dev);
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 934de56..8dd28e2 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -85,9 +85,11 @@ enum {
>
> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
>
> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
> +
> /*
> * Add your fresh new feature above and remember to update
> - * netdev_features_strings[] in net/core/ethtool.c and maybe
> + * netdev_features_strings[] in net/ethtool/common.c and maybe
> * some feature mask #defines below. Please also describe it
> * in Documentation/networking/netdev-features.rst.
> */
> @@ -157,6 +159,7 @@ enum {
> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
>
> /* Finds the next feature with the highest number of the range of start till 0.
> */
> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> index 24036e3..2f3d309 100644
> --- a/net/ethtool/common.c
> +++ b/net/ethtool/common.c
> @@ -68,6 +68,7 @@
> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
> };
>
> const char
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 8037b04..94d17dc 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> + struct xdp_desc *desc)
> +{
> + u32 len, offset, copy, copied;
> + struct sk_buff *skb;
> + struct page *page;
> + char *buffer;
> + int err, i;
> + u64 addr;
> +
> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> + if (unlikely(!skb))
> + return NULL;
> +
> + addr = desc->addr;
> + len = desc->len;
> +
> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> + offset = offset_in_page(buffer);
> + addr = buffer - (char *)xs->pool->addrs;
> +
> + for (copied = 0, i = 0; copied < len; ++i) {
Just noticed. i++ would be less confusing here. You start to fill
skb frags from frag 0 anyway.
> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + get_page(page);
> +
> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
Also. It's better to use min_t() in this case:
copy = min_t(u32, PAGE_SIZE - offset, len - copied);
instead of manual casting.
> + skb_fill_page_desc(skb, i, page, offset, copy);
> +
> + copied += copy;
> + addr += copy;
> + offset = 0;
> + }
> +
> + skb->len += len;
> + skb->data_len += len;
> + skb->truesize += len;
> +
> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> +
> + return skb;
> +}
> +
> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> + struct xdp_desc *desc, int *err)
> +{
> + struct sk_buff *skb;
> +
> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> + skb = xsk_build_skb_zerocopy(xs, desc);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
> + return NULL;
> + }
> + } else {
> + char *buffer;
> + u64 addr;
> + u32 len;
> + int err;
> +
> + len = desc->len;
> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
> + return NULL;
> + }
> +
> + skb_put(skb, len);
> + addr = desc->addr;
> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> + err = skb_store_bits(skb, 0, buffer, len);
> +
> + if (unlikely(err)) {
> + kfree_skb(skb);
> + *err = -EINVAL;
> + return NULL;
> + }
> + }
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
> +}
> +
> static int xsk_generic_xmit(struct sock *sk)
> {
> struct xdp_sock *xs = xdp_sk(sk);
> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
> goto out;
>
> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> - char *buffer;
> - u64 addr;
> - u32 len;
> -
> if (max_batch-- == 0) {
> err = -EAGAIN;
> goto out;
> }
>
> - len = desc.len;
> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> + skb = xsk_build_skb(xs, &desc, &err);
> if (unlikely(!skb))
> goto out;
>
> - skb_put(skb, len);
> - addr = desc.addr;
> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> - err = skb_store_bits(skb, 0, buffer, len);
> /* This is the backpressure mechanism for the Tx path.
> * Reserve space in the completion queue and only proceed
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> + if (xskq_prod_reserve(xs->pool->cq)) {
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> kfree_skb(skb);
> goto out;
> }
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>
> - skb->dev = xs->dev;
> - skb->priority = sk->sk_priority;
> - skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> - skb->destructor = xsk_destruct_skb;
> -
> err = __dev_direct_xmit(skb, xs->queue_id);
> if (err == NETDEV_TX_BUSY) {
> /* Tell user-space to retry the send */
> --
> 1.8.3.1
Al
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-18 16:38 ` Alexander Lobakin
@ 2021-01-19 7:01 ` Magnus Karlsson
0 siblings, 0 replies; 23+ messages in thread
From: Magnus Karlsson @ 2021-01-19 7:01 UTC (permalink / raw)
To: Alexander Lobakin
Cc: Yunsheng Lin, Xuan Zhuo, Michael S. Tsirkin, Jason Wang,
David S. Miller, Jakub Kicinski, Björn Töpel,
Magnus Karlsson, Jonathan Lemon, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
KP Singh, Willem de Bruijn, Steffen Klassert, Miaohe Lin,
Mauro Carvalho Chehab, Antoine Tenart, Michal Kubecek,
Andrew Lunn, Florian Fainelli, Meir Lichtinger, virtualization,
bpf, Network Development, open list
On Mon, Jan 18, 2021 at 5:38 PM Alexander Lobakin <alobakin@pm.me> wrote:
>
> > From: Magnus Karlsson <magnus.karlsson@gmail.com>
> > Date: Mon, 18 Jan 2021 16:10:40 +0100
> >
> > On Mon, Jan 18, 2021 at 3:47 PM Alexander Lobakin <alobakin@pm.me> wrote:
> > >
> > > From: Alexander Lobakin <alobakin@pm.me>
> > > Date: Mon, 18 Jan 2021 13:00:17 +0000
> > >
> > > > From: Yunsheng Lin <linyunsheng@huawei.com>
> > > > Date: Mon, 18 Jan 2021 20:40:52 +0800
> > > >
> > > >> On 2021/1/16 10:44, Xuan Zhuo wrote:
> > > >>> This patch is used to construct skb based on page to save memory copy
> > > >>> overhead.
> > > >>>
> > > >>> This has one problem:
> > > >>>
> > > >>> We construct the skb by fill the data page as a frag into the skb. In
> > > >>> this way, the linear space is empty, and the header information is also
> > > >>> in the frag, not in the linear space, which is not allowed for some
> > > >>> network cards. For example, Mellanox Technologies MT27710 Family
> > > >>> [ConnectX-4 Lx] will get the following error message:
> > > >>>
> > > >>> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> > > >>> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > > >>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > > >>> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > > >>> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> > > >>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> > > >>> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> > > >>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > > >>> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> > > >>> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > > >>> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
> > > >>>
> > > >>> I also tried to use build_skb to construct skb, but because of the
> > > >>> existence of skb_shinfo, it must be behind the linear space, so this
> > > >>> method is not working. We can't put skb_shinfo on desc->addr, it will be
> > > >>> exposed to users, this is not safe.
> > > >>>
> > > >>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> > > >>
> > > >> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
> > > >> configure if the data is copied or not?
> > > >
> > > > As far as I can grep, only mlx4 supports this, and it has a different
> > > > meaning in that driver.
> > > > So I guess a new netdev_feature would be a better solution.
> > > >
> > > >>> network card supports the header information of the packet in the frag
> > > >>> and not in the linear space.
> > > >>>
> > > >>> ---------------- Performance Testing ------------
> > > >>>
> > > >>> The test environment is Aliyun ECS server.
> > > >>> Test cmd:
> > > >>> ```
> > > >>> xdpsock -i eth0 -t -S -s <msg size>
> > > >>> ```
> > > >>>
> > > >>> Test result data:
> > > >>>
> > > >>> size 64 512 1024 1500
> > > >>> copy 1916747 1775988 1600203 1440054
> > > >>> page 1974058 1953655 1945463 1904478
> > > >>> percent 3.0% 10.0% 21.58% 32.3%
> > > >>>
> > > >>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > > >>> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> > > >>> ---
> > > >>> drivers/net/virtio_net.c | 2 +-
> > > >>> include/linux/netdev_features.h | 5 +-
> > > >>> net/ethtool/common.c | 1 +
> > > >>> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
> > > >>> 4 files changed, 97 insertions(+), 19 deletions(-)
> > > >>>
> > > >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > > >>> index 4ecccb8..841a331 100644
> > > >>> --- a/drivers/net/virtio_net.c
> > > >>> +++ b/drivers/net/virtio_net.c
> > > >>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> > > >>> /* Set up network device as normal. */
> > > >>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> > > >>> dev->netdev_ops = &virtnet_netdev;
> > > >>> - dev->features = NETIF_F_HIGHDMA;
> > > >>> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
> > > >>>
> > > >>> dev->ethtool_ops = &virtnet_ethtool_ops;
> > > >>> SET_NETDEV_DEV(dev, &vdev->dev);
> > > >>> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> > > >>> index 934de56..8dd28e2 100644
> > > >>> --- a/include/linux/netdev_features.h
> > > >>> +++ b/include/linux/netdev_features.h
> > > >>> @@ -85,9 +85,11 @@ enum {
> > > >>>
> > > >>> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
> > > >>>
> > > >>> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
> > > >>> +
> > > >>> /*
> > > >>> * Add your fresh new feature above and remember to update
> > > >>> - * netdev_features_strings[] in net/core/ethtool.c and maybe
> > > >>> + * netdev_features_strings[] in net/ethtool/common.c and maybe
> > > >>> * some feature mask #defines below. Please also describe it
> > > >>> * in Documentation/networking/netdev-features.rst.
> > > >>> */
> > > >>> @@ -157,6 +159,7 @@ enum {
> > > >>> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
> > > >>> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
> > > >>> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> > > >>> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
> > > >>>
> > > >>> /* Finds the next feature with the highest number of the range of start till 0.
> > > >>> */
> > > >>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> > > >>> index 24036e3..2f3d309 100644
> > > >>> --- a/net/ethtool/common.c
> > > >>> +++ b/net/ethtool/common.c
> > > >>> @@ -68,6 +68,7 @@
> > > >>> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
> > > >>> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
> > > >>> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
> > > >>> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
> > > >
> > > > I completely forgot to add that you'd better to mention in both
> > > > enumeration/feature and its Ethtool string that the feature applies
> > > > to Tx path.
> > > > Smth like:
> > > >
> > > > NETIF_F_SKB_TX_NO_LINEAR{,_BIT}, "skb-tx-no-linear"
> > > > or
> > > > NETIF_F_TX_SKB_NO_LINEAR{,_BIT}, "tx-skb-no-linear"
> > > >
> > > > Otherwise, it may be confusing for users and developers.
> >
> > I prefer one of these names for the property as they clearly describe
> > a feature that the driver supports.
> >
> > > OR, I think we may tight the feature with the new approach to build
> > > skbs by page as it makes no sense for anything else.
> > > So, if we define something like:
> > >
> > > NETIF_F_XSK_TX_GENERIC_ZC{,_BIT}, "xsk-tx-generic-zerocopy",
> >
> > This one I misunderstood first. I thought: "this is not zerocopy", but
> > you are right it is. It is zero-copy implemented with skb:s. But in my
> > mind, the NO_LINEAR version that you suggested are clearer.
> >
> > > then user can toggle your new XSK Tx path on/off via Ethtool for
> > > drivers that will support it (don't forget to add it to hw_features
> > > for virtio_net then).
>
> User don't need to enable manually this, drivers usually enable most
> of their features on netdevice creation. This way we just could have
> an option to turn it off.
>
> If the feature is not about to be exposed to user at all, only to
> indicate if a particular driver supports skbs with skb_headlen == 0
> on its .ndo_start_xmit() path, then it might be better to introduce
> a private flag (netdev_priv_flags) instead of netdev_feature. Private
> flags are kernel-only and can't be toggled on/off after netdev is
> registered.
> E.g.
>
> IFF_TX_SKB_NO_LINEAR
>
> and test it like
>
> if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
> /* new generic zerocopy path */
> } else {
> /* current code */
> }
>
This sounds like a good idea. I would go with this. Thank you Alexander!
> > > >>> };
> > > >>>
> > > >>> const char
> > > >>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > > >>> index 8037b04..94d17dc 100644
> > > >>> --- a/net/xdp/xsk.c
> > > >>> +++ b/net/xdp/xsk.c
> > > >>> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > > >>> sock_wfree(skb);
> > > >>> }
> > > >>>
> > > >>> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > > >>> + struct xdp_desc *desc)
> > > >>> +{
> > > >>> + u32 len, offset, copy, copied;
> > > >>> + struct sk_buff *skb;
> > > >>> + struct page *page;
> > > >>> + char *buffer;
> > > >>> + int err, i;
> > > >>> + u64 addr;
> > > >>> +
> > > >>> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> > > >>> + if (unlikely(!skb))
> > > >>> + return NULL;
> > > >>> +
> > > >>> + addr = desc->addr;
> > > >>> + len = desc->len;
> > > >>> +
> > > >>> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > > >>> + offset = offset_in_page(buffer);
> > > >>> + addr = buffer - (char *)xs->pool->addrs;
> > > >>> +
> > > >>> + for (copied = 0, i = 0; copied < len; ++i) {
> > > >>> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> > > >>> +
> > > >>> + get_page(page);
> > > >>> +
> > > >>> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> > > >>> +
> > > >>> + skb_fill_page_desc(skb, i, page, offset, copy);
> > > >>> +
> > > >>> + copied += copy;
> > > >>> + addr += copy;
> > > >>> + offset = 0;
> > > >>> + }
> > > >>> +
> > > >>> + skb->len += len;
> > > >>> + skb->data_len += len;
> > > >>> + skb->truesize += len;
> > > >>> +
> > > >>> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> > > >>> +
> > > >>> + return skb;
> > > >>> +}
> > > >>> +
> > > >>> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > > >>> + struct xdp_desc *desc, int *err)
> > > >>> +{
> > > >>> + struct sk_buff *skb;
> > > >>> +
> > > >>> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> > > >>> + skb = xsk_build_skb_zerocopy(xs, desc);
> > > >>> + if (unlikely(!skb)) {
> > > >>> + *err = -ENOMEM;
> > > >>> + return NULL;
> > > >>> + }
> > > >>> + } else {
> > > >>> + char *buffer;
> > > >>> + u64 addr;
> > > >>> + u32 len;
> > > >>> + int err;
> > > >>> +
> > > >>> + len = desc->len;
> > > >>> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> > > >>> + if (unlikely(!skb)) {
> > > >>> + *err = -ENOMEM;
> > > >>> + return NULL;
> > > >>> + }
> > > >>> +
> > > >>> + skb_put(skb, len);
> > > >>> + addr = desc->addr;
> > > >>> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> > > >>> + err = skb_store_bits(skb, 0, buffer, len);
> > > >>> +
> > > >>> + if (unlikely(err)) {
> > > >>> + kfree_skb(skb);
> > > >>> + *err = -EINVAL;
> > > >>> + return NULL;
> > > >>> + }
> > > >>> + }
> > > >>> +
> > > >>> + skb->dev = xs->dev;
> > > >>> + skb->priority = xs->sk.sk_priority;
> > > >>> + skb->mark = xs->sk.sk_mark;
> > > >>> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> > > >>> + skb->destructor = xsk_destruct_skb;
> > > >>> +
> > > >>> + return skb;
> > > >>> +}
> > > >>> +
> > > >>> static int xsk_generic_xmit(struct sock *sk)
> > > >>> {
> > > >>> struct xdp_sock *xs = xdp_sk(sk);
> > > >>> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
> > > >>> goto out;
> > > >>>
> > > >>> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> > > >>> - char *buffer;
> > > >>> - u64 addr;
> > > >>> - u32 len;
> > > >>> -
> > > >>> if (max_batch-- == 0) {
> > > >>> err = -EAGAIN;
> > > >>> goto out;
> > > >>> }
> > > >>>
> > > >>> - len = desc.len;
> > > >>> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> > > >>> + skb = xsk_build_skb(xs, &desc, &err);
> > > >>> if (unlikely(!skb))
> > > >>> goto out;
> > > >>>
> > > >>> - skb_put(skb, len);
> > > >>> - addr = desc.addr;
> > > >>> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > > >>> - err = skb_store_bits(skb, 0, buffer, len);
> > > >>> /* This is the backpressure mechanism for the Tx path.
> > > >>> * Reserve space in the completion queue and only proceed
> > > >>> * if there is space in it. This avoids having to implement
> > > >>> * any buffering in the Tx path.
> > > >>> */
> > > >>> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> > > >>> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> > > >>> + if (xskq_prod_reserve(xs->pool->cq)) {
> > > >>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> > > >>> kfree_skb(skb);
> > > >>> goto out;
> > > >>> }
> > > >>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> > > >>>
> > > >>> - skb->dev = xs->dev;
> > > >>> - skb->priority = sk->sk_priority;
> > > >>> - skb->mark = sk->sk_mark;
> > > >>> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> > > >>> - skb->destructor = xsk_destruct_skb;
> > > >>> -
> > > >>> err = __dev_direct_xmit(skb, xs->queue_id);
> > > >>> if (err == NETDEV_TX_BUSY) {
> > > >>> /* Tell user-space to retry the send */
> > > >>>
> > > >
> > > > Al
> > >
> > > Al
> >
>
> Al
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-18 15:10 ` Magnus Karlsson
@ 2021-01-18 16:38 ` Alexander Lobakin
2021-01-19 7:01 ` Magnus Karlsson
0 siblings, 1 reply; 23+ messages in thread
From: Alexander Lobakin @ 2021-01-18 16:38 UTC (permalink / raw)
To: Magnus Karlsson
Cc: Alexander Lobakin, Yunsheng Lin, Xuan Zhuo, Michael S. Tsirkin,
Jason Wang, David S. Miller, Jakub Kicinski, bjorn.topel,
Magnus Karlsson, Jonathan Lemon, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
KP Singh, Willem de Bruijn, Steffen Klassert, Miaohe Lin,
Mauro Carvalho Chehab, Antoine Tenart, Michal Kubecek,
Andrew Lunn, Florian Fainelli, Meir Lichtinger, virtualization,
bpf, Network Development, open list
> From: Magnus Karlsson <magnus.karlsson@gmail.com>
> Date: Mon, 18 Jan 2021 16:10:40 +0100
>
> On Mon, Jan 18, 2021 at 3:47 PM Alexander Lobakin <alobakin@pm.me> wrote:
> >
> > From: Alexander Lobakin <alobakin@pm.me>
> > Date: Mon, 18 Jan 2021 13:00:17 +0000
> >
> > > From: Yunsheng Lin <linyunsheng@huawei.com>
> > > Date: Mon, 18 Jan 2021 20:40:52 +0800
> > >
> > >> On 2021/1/16 10:44, Xuan Zhuo wrote:
> > >>> This patch is used to construct skb based on page to save memory copy
> > >>> overhead.
> > >>>
> > >>> This has one problem:
> > >>>
> > >>> We construct the skb by fill the data page as a frag into the skb. In
> > >>> this way, the linear space is empty, and the header information is also
> > >>> in the frag, not in the linear space, which is not allowed for some
> > >>> network cards. For example, Mellanox Technologies MT27710 Family
> > >>> [ConnectX-4 Lx] will get the following error message:
> > >>>
> > >>> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> > >>> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > >>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > >>> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > >>> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> > >>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> > >>> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> > >>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > >>> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> > >>> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> > >>> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
> > >>>
> > >>> I also tried to use build_skb to construct skb, but because of the
> > >>> existence of skb_shinfo, it must be behind the linear space, so this
> > >>> method is not working. We can't put skb_shinfo on desc->addr, it will be
> > >>> exposed to users, this is not safe.
> > >>>
> > >>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> > >>
> > >> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
> > >> configure if the data is copied or not?
> > >
> > > As far as I can grep, only mlx4 supports this, and it has a different
> > > meaning in that driver.
> > > So I guess a new netdev_feature would be a better solution.
> > >
> > >>> network card supports the header information of the packet in the frag
> > >>> and not in the linear space.
> > >>>
> > >>> ---------------- Performance Testing ------------
> > >>>
> > >>> The test environment is Aliyun ECS server.
> > >>> Test cmd:
> > >>> ```
> > >>> xdpsock -i eth0 -t -S -s <msg size>
> > >>> ```
> > >>>
> > >>> Test result data:
> > >>>
> > >>> size 64 512 1024 1500
> > >>> copy 1916747 1775988 1600203 1440054
> > >>> page 1974058 1953655 1945463 1904478
> > >>> percent 3.0% 10.0% 21.58% 32.3%
> > >>>
> > >>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> > >>> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> > >>> ---
> > >>> drivers/net/virtio_net.c | 2 +-
> > >>> include/linux/netdev_features.h | 5 +-
> > >>> net/ethtool/common.c | 1 +
> > >>> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
> > >>> 4 files changed, 97 insertions(+), 19 deletions(-)
> > >>>
> > >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > >>> index 4ecccb8..841a331 100644
> > >>> --- a/drivers/net/virtio_net.c
> > >>> +++ b/drivers/net/virtio_net.c
> > >>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> > >>> /* Set up network device as normal. */
> > >>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> > >>> dev->netdev_ops = &virtnet_netdev;
> > >>> - dev->features = NETIF_F_HIGHDMA;
> > >>> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
> > >>>
> > >>> dev->ethtool_ops = &virtnet_ethtool_ops;
> > >>> SET_NETDEV_DEV(dev, &vdev->dev);
> > >>> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> > >>> index 934de56..8dd28e2 100644
> > >>> --- a/include/linux/netdev_features.h
> > >>> +++ b/include/linux/netdev_features.h
> > >>> @@ -85,9 +85,11 @@ enum {
> > >>>
> > >>> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
> > >>>
> > >>> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
> > >>> +
> > >>> /*
> > >>> * Add your fresh new feature above and remember to update
> > >>> - * netdev_features_strings[] in net/core/ethtool.c and maybe
> > >>> + * netdev_features_strings[] in net/ethtool/common.c and maybe
> > >>> * some feature mask #defines below. Please also describe it
> > >>> * in Documentation/networking/netdev-features.rst.
> > >>> */
> > >>> @@ -157,6 +159,7 @@ enum {
> > >>> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
> > >>> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
> > >>> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> > >>> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
> > >>>
> > >>> /* Finds the next feature with the highest number of the range of start till 0.
> > >>> */
> > >>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> > >>> index 24036e3..2f3d309 100644
> > >>> --- a/net/ethtool/common.c
> > >>> +++ b/net/ethtool/common.c
> > >>> @@ -68,6 +68,7 @@
> > >>> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
> > >>> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
> > >>> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
> > >>> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
> > >
> > > I completely forgot to add that you'd better to mention in both
> > > enumeration/feature and its Ethtool string that the feature applies
> > > to Tx path.
> > > Smth like:
> > >
> > > NETIF_F_SKB_TX_NO_LINEAR{,_BIT}, "skb-tx-no-linear"
> > > or
> > > NETIF_F_TX_SKB_NO_LINEAR{,_BIT}, "tx-skb-no-linear"
> > >
> > > Otherwise, it may be confusing for users and developers.
>
> I prefer one of these names for the property as they clearly describe
> a feature that the driver supports.
>
> > OR, I think we may tight the feature with the new approach to build
> > skbs by page as it makes no sense for anything else.
> > So, if we define something like:
> >
> > NETIF_F_XSK_TX_GENERIC_ZC{,_BIT}, "xsk-tx-generic-zerocopy",
>
> This one I misunderstood first. I thought: "this is not zerocopy", but
> you are right it is. It is zero-copy implemented with skb:s. But in my
> mind, the NO_LINEAR version that you suggested are clearer.
>
> > then user can toggle your new XSK Tx path on/off via Ethtool for
> > drivers that will support it (don't forget to add it to hw_features
> > for virtio_net then).
Users don't need to enable this manually; drivers usually enable most
of their features on netdevice creation. This way we could just have
an option to turn it off.
If the feature is not going to be exposed to the user at all, and only
indicates whether a particular driver supports skbs with skb_headlen == 0
on its .ndo_start_xmit() path, then it might be better to introduce
a private flag (netdev_priv_flags) instead of a netdev_feature. Private
flags are kernel-only and can't be toggled on/off after the netdev is
registered.
E.g.
IFF_TX_SKB_NO_LINEAR
and test it like
if (dev->priv_flags & IFF_TX_SKB_NO_LINEAR) {
/* new generic zerocopy path */
} else {
/* current code */
}
> > >>> };
> > >>>
> > >>> const char
> > >>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> > >>> index 8037b04..94d17dc 100644
> > >>> --- a/net/xdp/xsk.c
> > >>> +++ b/net/xdp/xsk.c
> > >>> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> > >>> sock_wfree(skb);
> > >>> }
> > >>>
> > >>> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> > >>> + struct xdp_desc *desc)
> > >>> +{
> > >>> + u32 len, offset, copy, copied;
> > >>> + struct sk_buff *skb;
> > >>> + struct page *page;
> > >>> + char *buffer;
> > >>> + int err, i;
> > >>> + u64 addr;
> > >>> +
> > >>> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> > >>> + if (unlikely(!skb))
> > >>> + return NULL;
> > >>> +
> > >>> + addr = desc->addr;
> > >>> + len = desc->len;
> > >>> +
> > >>> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > >>> + offset = offset_in_page(buffer);
> > >>> + addr = buffer - (char *)xs->pool->addrs;
> > >>> +
> > >>> + for (copied = 0, i = 0; copied < len; ++i) {
> > >>> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> > >>> +
> > >>> + get_page(page);
> > >>> +
> > >>> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> > >>> +
> > >>> + skb_fill_page_desc(skb, i, page, offset, copy);
> > >>> +
> > >>> + copied += copy;
> > >>> + addr += copy;
> > >>> + offset = 0;
> > >>> + }
> > >>> +
> > >>> + skb->len += len;
> > >>> + skb->data_len += len;
> > >>> + skb->truesize += len;
> > >>> +
> > >>> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> > >>> +
> > >>> + return skb;
> > >>> +}
> > >>> +
> > >>> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> > >>> + struct xdp_desc *desc, int *err)
> > >>> +{
> > >>> + struct sk_buff *skb;
> > >>> +
> > >>> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> > >>> + skb = xsk_build_skb_zerocopy(xs, desc);
> > >>> + if (unlikely(!skb)) {
> > >>> + *err = -ENOMEM;
> > >>> + return NULL;
> > >>> + }
> > >>> + } else {
> > >>> + char *buffer;
> > >>> + u64 addr;
> > >>> + u32 len;
> > >>> + int err;
> > >>> +
> > >>> + len = desc->len;
> > >>> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> > >>> + if (unlikely(!skb)) {
> > >>> + *err = -ENOMEM;
> > >>> + return NULL;
> > >>> + }
> > >>> +
> > >>> + skb_put(skb, len);
> > >>> + addr = desc->addr;
> > >>> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> > >>> + err = skb_store_bits(skb, 0, buffer, len);
> > >>> +
> > >>> + if (unlikely(err)) {
> > >>> + kfree_skb(skb);
> > >>> + *err = -EINVAL;
> > >>> + return NULL;
> > >>> + }
> > >>> + }
> > >>> +
> > >>> + skb->dev = xs->dev;
> > >>> + skb->priority = xs->sk.sk_priority;
> > >>> + skb->mark = xs->sk.sk_mark;
> > >>> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> > >>> + skb->destructor = xsk_destruct_skb;
> > >>> +
> > >>> + return skb;
> > >>> +}
> > >>> +
> > >>> static int xsk_generic_xmit(struct sock *sk)
> > >>> {
> > >>> struct xdp_sock *xs = xdp_sk(sk);
> > >>> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
> > >>> goto out;
> > >>>
> > >>> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> > >>> - char *buffer;
> > >>> - u64 addr;
> > >>> - u32 len;
> > >>> -
> > >>> if (max_batch-- == 0) {
> > >>> err = -EAGAIN;
> > >>> goto out;
> > >>> }
> > >>>
> > >>> - len = desc.len;
> > >>> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> > >>> + skb = xsk_build_skb(xs, &desc, &err);
> > >>> if (unlikely(!skb))
> > >>> goto out;
> > >>>
> > >>> - skb_put(skb, len);
> > >>> - addr = desc.addr;
> > >>> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> > >>> - err = skb_store_bits(skb, 0, buffer, len);
> > >>> /* This is the backpressure mechanism for the Tx path.
> > >>> * Reserve space in the completion queue and only proceed
> > >>> * if there is space in it. This avoids having to implement
> > >>> * any buffering in the Tx path.
> > >>> */
> > >>> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> > >>> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> > >>> + if (xskq_prod_reserve(xs->pool->cq)) {
> > >>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> > >>> kfree_skb(skb);
> > >>> goto out;
> > >>> }
> > >>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> > >>>
> > >>> - skb->dev = xs->dev;
> > >>> - skb->priority = sk->sk_priority;
> > >>> - skb->mark = sk->sk_mark;
> > >>> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> > >>> - skb->destructor = xsk_destruct_skb;
> > >>> -
> > >>> err = __dev_direct_xmit(skb, xs->queue_id);
> > >>> if (err == NETDEV_TX_BUSY) {
> > >>> /* Tell user-space to retry the send */
> > >>>
> > >
> > > Al
> >
> > Al
>
Al
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-18 14:40 ` Alexander Lobakin
2021-01-18 15:03 ` Magnus Karlsson
@ 2021-01-18 15:10 ` Magnus Karlsson
2021-01-18 16:38 ` Alexander Lobakin
1 sibling, 1 reply; 23+ messages in thread
From: Magnus Karlsson @ 2021-01-18 15:10 UTC (permalink / raw)
To: Alexander Lobakin
Cc: Yunsheng Lin, Xuan Zhuo, Michael S. Tsirkin, Jason Wang,
David S. Miller, Jakub Kicinski, Björn Töpel,
Magnus Karlsson, Jonathan Lemon, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
KP Singh, Willem de Bruijn, Steffen Klassert, Miaohe Lin,
Mauro Carvalho Chehab, Antoine Tenart, Michal Kubecek,
Andrew Lunn, Florian Fainelli, Meir Lichtinger, virtualization,
bpf, Network Development, open list
On Mon, Jan 18, 2021 at 3:47 PM Alexander Lobakin <alobakin@pm.me> wrote:
>
> From: Alexander Lobakin <alobakin@pm.me>
> Date: Mon, 18 Jan 2021 13:00:17 +0000
>
> > From: Yunsheng Lin <linyunsheng@huawei.com>
> > Date: Mon, 18 Jan 2021 20:40:52 +0800
> >
> >> On 2021/1/16 10:44, Xuan Zhuo wrote:
> >>> This patch is used to construct skb based on page to save memory copy
> >>> overhead.
> >>>
> >>> This has one problem:
> >>>
> >>> We construct the skb by fill the data page as a frag into the skb. In
> >>> this way, the linear space is empty, and the header information is also
> >>> in the frag, not in the linear space, which is not allowed for some
> >>> network cards. For example, Mellanox Technologies MT27710 Family
> >>> [ConnectX-4 Lx] will get the following error message:
> >>>
> >>> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> >>> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> >>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> >>> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> >>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> >>> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
> >>>
> >>> I also tried to use build_skb to construct skb, but because of the
> >>> existence of skb_shinfo, it must be behind the linear space, so this
> >>> method is not working. We can't put skb_shinfo on desc->addr, it will be
> >>> exposed to users, this is not safe.
> >>>
> >>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> >>
> >> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
> >> configure if the data is copied or not?
> >
> > As far as I can grep, only mlx4 supports this, and it has a different
> > meaning in that driver.
> > So I guess a new netdev_feature would be a better solution.
> >
> >>> network card supports the header information of the packet in the frag
> >>> and not in the linear space.
> >>>
> >>> ---------------- Performance Testing ------------
> >>>
> >>> The test environment is Aliyun ECS server.
> >>> Test cmd:
> >>> ```
> >>> xdpsock -i eth0 -t -S -s <msg size>
> >>> ```
> >>>
> >>> Test result data:
> >>>
> >>> size 64 512 1024 1500
> >>> copy 1916747 1775988 1600203 1440054
> >>> page 1974058 1953655 1945463 1904478
> >>> percent 3.0% 10.0% 21.58% 32.3%
> >>>
> >>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> >>> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> >>> ---
> >>> drivers/net/virtio_net.c | 2 +-
> >>> include/linux/netdev_features.h | 5 +-
> >>> net/ethtool/common.c | 1 +
> >>> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
> >>> 4 files changed, 97 insertions(+), 19 deletions(-)
> >>>
> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >>> index 4ecccb8..841a331 100644
> >>> --- a/drivers/net/virtio_net.c
> >>> +++ b/drivers/net/virtio_net.c
> >>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> >>> /* Set up network device as normal. */
> >>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> >>> dev->netdev_ops = &virtnet_netdev;
> >>> - dev->features = NETIF_F_HIGHDMA;
> >>> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
> >>>
> >>> dev->ethtool_ops = &virtnet_ethtool_ops;
> >>> SET_NETDEV_DEV(dev, &vdev->dev);
> >>> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> >>> index 934de56..8dd28e2 100644
> >>> --- a/include/linux/netdev_features.h
> >>> +++ b/include/linux/netdev_features.h
> >>> @@ -85,9 +85,11 @@ enum {
> >>>
> >>> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
> >>>
> >>> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
> >>> +
> >>> /*
> >>> * Add your fresh new feature above and remember to update
> >>> - * netdev_features_strings[] in net/core/ethtool.c and maybe
> >>> + * netdev_features_strings[] in net/ethtool/common.c and maybe
> >>> * some feature mask #defines below. Please also describe it
> >>> * in Documentation/networking/netdev-features.rst.
> >>> */
> >>> @@ -157,6 +159,7 @@ enum {
> >>> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
> >>> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
> >>> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> >>> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
> >>>
> >>> /* Finds the next feature with the highest number of the range of start till 0.
> >>> */
> >>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> >>> index 24036e3..2f3d309 100644
> >>> --- a/net/ethtool/common.c
> >>> +++ b/net/ethtool/common.c
> >>> @@ -68,6 +68,7 @@
> >>> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
> >>> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
> >>> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
> >>> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
> >
> > I completely forgot to add that you'd better to mention in both
> > enumeration/feature and its Ethtool string that the feature applies
> > to Tx path.
> > Smth like:
> >
> > NETIF_F_SKB_TX_NO_LINEAR{,_BIT}, "skb-tx-no-linear"
> > or
> > NETIF_F_TX_SKB_NO_LINEAR{,_BIT}, "tx-skb-no-linear"
> >
> > Otherwise, it may be confusing for users and developers.
I prefer one of these names for the property as they clearly describe
a feature that the driver supports.
> OR, I think we may tight the feature with the new approach to build
> skbs by page as it makes no sense for anything else.
> So, if we define something like:
>
> NETIF_F_XSK_TX_GENERIC_ZC{,_BIT}, "xsk-tx-generic-zerocopy",
This one I misunderstood at first. I thought: "this is not zerocopy", but
you are right, it is. It is zero-copy implemented with skbs. But in my
mind, the NO_LINEAR versions that you suggested are clearer.
> then user can toggle your new XSK Tx path on/off via Ethtool for
> drivers that will support it (don't forget to add it to hw_features
> for virtio_net then).
>
> >>> };
> >>>
> >>> const char
> >>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> >>> index 8037b04..94d17dc 100644
> >>> --- a/net/xdp/xsk.c
> >>> +++ b/net/xdp/xsk.c
> >>> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> >>> sock_wfree(skb);
> >>> }
> >>>
> >>> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> >>> + struct xdp_desc *desc)
> >>> +{
> >>> + u32 len, offset, copy, copied;
> >>> + struct sk_buff *skb;
> >>> + struct page *page;
> >>> + char *buffer;
> >>> + int err, i;
> >>> + u64 addr;
> >>> +
> >>> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> >>> + if (unlikely(!skb))
> >>> + return NULL;
> >>> +
> >>> + addr = desc->addr;
> >>> + len = desc->len;
> >>> +
> >>> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> >>> + offset = offset_in_page(buffer);
> >>> + addr = buffer - (char *)xs->pool->addrs;
> >>> +
> >>> + for (copied = 0, i = 0; copied < len; ++i) {
> >>> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> >>> +
> >>> + get_page(page);
> >>> +
> >>> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> >>> +
> >>> + skb_fill_page_desc(skb, i, page, offset, copy);
> >>> +
> >>> + copied += copy;
> >>> + addr += copy;
> >>> + offset = 0;
> >>> + }
> >>> +
> >>> + skb->len += len;
> >>> + skb->data_len += len;
> >>> + skb->truesize += len;
> >>> +
> >>> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> >>> +
> >>> + return skb;
> >>> +}
> >>> +
> >>> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> >>> + struct xdp_desc *desc, int *err)
> >>> +{
> >>> + struct sk_buff *skb;
> >>> +
> >>> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> >>> + skb = xsk_build_skb_zerocopy(xs, desc);
> >>> + if (unlikely(!skb)) {
> >>> + *err = -ENOMEM;
> >>> + return NULL;
> >>> + }
> >>> + } else {
> >>> + char *buffer;
> >>> + u64 addr;
> >>> + u32 len;
> >>> + int err;
> >>> +
> >>> + len = desc->len;
> >>> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> >>> + if (unlikely(!skb)) {
> >>> + *err = -ENOMEM;
> >>> + return NULL;
> >>> + }
> >>> +
> >>> + skb_put(skb, len);
> >>> + addr = desc->addr;
> >>> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> >>> + err = skb_store_bits(skb, 0, buffer, len);
> >>> +
> >>> + if (unlikely(err)) {
> >>> + kfree_skb(skb);
> >>> + *err = -EINVAL;
> >>> + return NULL;
> >>> + }
> >>> + }
> >>> +
> >>> + skb->dev = xs->dev;
> >>> + skb->priority = xs->sk.sk_priority;
> >>> + skb->mark = xs->sk.sk_mark;
> >>> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> >>> + skb->destructor = xsk_destruct_skb;
> >>> +
> >>> + return skb;
> >>> +}
> >>> +
> >>> static int xsk_generic_xmit(struct sock *sk)
> >>> {
> >>> struct xdp_sock *xs = xdp_sk(sk);
> >>> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
> >>> goto out;
> >>>
> >>> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> >>> - char *buffer;
> >>> - u64 addr;
> >>> - u32 len;
> >>> -
> >>> if (max_batch-- == 0) {
> >>> err = -EAGAIN;
> >>> goto out;
> >>> }
> >>>
> >>> - len = desc.len;
> >>> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> >>> + skb = xsk_build_skb(xs, &desc, &err);
> >>> if (unlikely(!skb))
> >>> goto out;
> >>>
> >>> - skb_put(skb, len);
> >>> - addr = desc.addr;
> >>> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> >>> - err = skb_store_bits(skb, 0, buffer, len);
> >>> /* This is the backpressure mechanism for the Tx path.
> >>> * Reserve space in the completion queue and only proceed
> >>> * if there is space in it. This avoids having to implement
> >>> * any buffering in the Tx path.
> >>> */
> >>> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> >>> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> >>> + if (xskq_prod_reserve(xs->pool->cq)) {
> >>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> >>> kfree_skb(skb);
> >>> goto out;
> >>> }
> >>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> >>>
> >>> - skb->dev = xs->dev;
> >>> - skb->priority = sk->sk_priority;
> >>> - skb->mark = sk->sk_mark;
> >>> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> >>> - skb->destructor = xsk_destruct_skb;
> >>> -
> >>> err = __dev_direct_xmit(skb, xs->queue_id);
> >>> if (err == NETDEV_TX_BUSY) {
> >>> /* Tell user-space to retry the send */
> >>>
> >
> > Al
>
> Al
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-18 14:40 ` Alexander Lobakin
@ 2021-01-18 15:03 ` Magnus Karlsson
2021-01-18 15:10 ` Magnus Karlsson
1 sibling, 0 replies; 23+ messages in thread
From: Magnus Karlsson @ 2021-01-18 15:03 UTC (permalink / raw)
To: Alexander Lobakin
Cc: Yunsheng Lin, Xuan Zhuo, Michael S. Tsirkin, Jason Wang,
David S. Miller, Jakub Kicinski, Björn Töpel,
Magnus Karlsson, Jonathan Lemon, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
KP Singh, Willem de Bruijn, Steffen Klassert, Miaohe Lin,
Mauro Carvalho Chehab, Antoine Tenart, Michal Kubecek,
Andrew Lunn, Florian Fainelli, Meir Lichtinger, virtualization,
bpf, Network Development, open list
On Mon, Jan 18, 2021 at 3:47 PM Alexander Lobakin <alobakin@pm.me> wrote:
>
> From: Alexander Lobakin <alobakin@pm.me>
> Date: Mon, 18 Jan 2021 13:00:17 +0000
>
> > From: Yunsheng Lin <linyunsheng@huawei.com>
> > Date: Mon, 18 Jan 2021 20:40:52 +0800
> >
> >> On 2021/1/16 10:44, Xuan Zhuo wrote:
> >>> This patch is used to construct skb based on page to save memory copy
> >>> overhead.
> >>>
> >>> This has one problem:
> >>>
> >>> We construct the skb by fill the data page as a frag into the skb. In
> >>> this way, the linear space is empty, and the header information is also
> >>> in the frag, not in the linear space, which is not allowed for some
> >>> network cards. For example, Mellanox Technologies MT27710 Family
> >>> [ConnectX-4 Lx] will get the following error message:
> >>>
> >>> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> >>> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> >>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> >>> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> >>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> >>> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >>> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
> >>>
> >>> I also tried to use build_skb to construct skb, but because of the
> >>> existence of skb_shinfo, it must be behind the linear space, so this
> >>> method is not working. We can't put skb_shinfo on desc->addr, it will be
> >>> exposed to users, this is not safe.
> >>>
> >>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> >>
> >> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
> >> configure if the data is copied or not?
> >
> > As far as I can grep, only mlx4 supports this, and it has a different
> > meaning in that driver.
> > So I guess a new netdev_feature would be a better solution.
> >
> >>> network card supports the header information of the packet in the frag
> >>> and not in the linear space.
> >>>
> >>> ---------------- Performance Testing ------------
> >>>
> >>> The test environment is Aliyun ECS server.
> >>> Test cmd:
> >>> ```
> >>> xdpsock -i eth0 -t -S -s <msg size>
> >>> ```
> >>>
> >>> Test result data:
> >>>
> >>> size 64 512 1024 1500
> >>> copy 1916747 1775988 1600203 1440054
> >>> page 1974058 1953655 1945463 1904478
> >>> percent 3.0% 10.0% 21.58% 32.3%
> >>>
> >>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> >>> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> >>> ---
> >>> drivers/net/virtio_net.c | 2 +-
> >>> include/linux/netdev_features.h | 5 +-
> >>> net/ethtool/common.c | 1 +
> >>> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
> >>> 4 files changed, 97 insertions(+), 19 deletions(-)
> >>>
> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >>> index 4ecccb8..841a331 100644
> >>> --- a/drivers/net/virtio_net.c
> >>> +++ b/drivers/net/virtio_net.c
> >>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> >>> /* Set up network device as normal. */
> >>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> >>> dev->netdev_ops = &virtnet_netdev;
> >>> - dev->features = NETIF_F_HIGHDMA;
> >>> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
> >>>
> >>> dev->ethtool_ops = &virtnet_ethtool_ops;
> >>> SET_NETDEV_DEV(dev, &vdev->dev);
> >>> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> >>> index 934de56..8dd28e2 100644
> >>> --- a/include/linux/netdev_features.h
> >>> +++ b/include/linux/netdev_features.h
> >>> @@ -85,9 +85,11 @@ enum {
> >>>
> >>> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
> >>>
> >>> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
> >>> +
> >>> /*
> >>> * Add your fresh new feature above and remember to update
> >>> - * netdev_features_strings[] in net/core/ethtool.c and maybe
> >>> + * netdev_features_strings[] in net/ethtool/common.c and maybe
> >>> * some feature mask #defines below. Please also describe it
> >>> * in Documentation/networking/netdev-features.rst.
> >>> */
> >>> @@ -157,6 +159,7 @@ enum {
> >>> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
> >>> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
> >>> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> >>> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
> >>>
> >>> /* Finds the next feature with the highest number of the range of start till 0.
> >>> */
> >>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> >>> index 24036e3..2f3d309 100644
> >>> --- a/net/ethtool/common.c
> >>> +++ b/net/ethtool/common.c
> >>> @@ -68,6 +68,7 @@
> >>> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
> >>> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
> >>> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
> >>> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
> >
> > I completely forgot to add that you'd better to mention in both
> > enumeration/feature and its Ethtool string that the feature applies
> > to Tx path.
> > Smth like:
> >
> > NETIF_F_SKB_TX_NO_LINEAR{,_BIT}, "skb-tx-no-linear"
> > or
> > NETIF_F_TX_SKB_NO_LINEAR{,_BIT}, "tx-skb-no-linear"
> >
> > Otherwise, it may be confusing for users and developers.
>
> OR, I think we may tight the feature with the new approach to build
> skbs by page as it makes no sense for anything else.
> So, if we define something like:
>
> NETIF_F_XSK_TX_GENERIC_ZC{,_BIT}, "xsk-tx-generic-zerocopy",
>
> then user can toggle your new XSK Tx path on/off via Ethtool for
> drivers that will support it (don't forget to add it to hw_features
> for virtio_net then).
Is there ever a reason not to use this path if it is available? If
not, then it would IMO be better if the xsk base code could probe for
this bit and always use this faster path when the feature is supported
by the driver. If it is not supported by the driver, we will just fall
back on the old approach. This way, the user would get better
performance without having to know it needs to toggle some special bit
using Ethtool.
> >>> };
> >>>
> >>> const char
> >>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> >>> index 8037b04..94d17dc 100644
> >>> --- a/net/xdp/xsk.c
> >>> +++ b/net/xdp/xsk.c
> >>> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> >>> sock_wfree(skb);
> >>> }
> >>>
> >>> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> >>> + struct xdp_desc *desc)
> >>> +{
> >>> + u32 len, offset, copy, copied;
> >>> + struct sk_buff *skb;
> >>> + struct page *page;
> >>> + char *buffer;
> >>> + int err, i;
> >>> + u64 addr;
> >>> +
> >>> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> >>> + if (unlikely(!skb))
> >>> + return NULL;
> >>> +
> >>> + addr = desc->addr;
> >>> + len = desc->len;
> >>> +
> >>> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> >>> + offset = offset_in_page(buffer);
> >>> + addr = buffer - (char *)xs->pool->addrs;
> >>> +
> >>> + for (copied = 0, i = 0; copied < len; ++i) {
> >>> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> >>> +
> >>> + get_page(page);
> >>> +
> >>> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> >>> +
> >>> + skb_fill_page_desc(skb, i, page, offset, copy);
> >>> +
> >>> + copied += copy;
> >>> + addr += copy;
> >>> + offset = 0;
> >>> + }
> >>> +
> >>> + skb->len += len;
> >>> + skb->data_len += len;
> >>> + skb->truesize += len;
> >>> +
> >>> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> >>> +
> >>> + return skb;
> >>> +}
> >>> +
> >>> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> >>> + struct xdp_desc *desc, int *err)
> >>> +{
> >>> + struct sk_buff *skb;
> >>> +
> >>> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> >>> + skb = xsk_build_skb_zerocopy(xs, desc);
> >>> + if (unlikely(!skb)) {
> >>> + *err = -ENOMEM;
> >>> + return NULL;
> >>> + }
> >>> + } else {
> >>> + char *buffer;
> >>> + u64 addr;
> >>> + u32 len;
> >>> + int err;
> >>> +
> >>> + len = desc->len;
> >>> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> >>> + if (unlikely(!skb)) {
> >>> + *err = -ENOMEM;
> >>> + return NULL;
> >>> + }
> >>> +
> >>> + skb_put(skb, len);
> >>> + addr = desc->addr;
> >>> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> >>> + err = skb_store_bits(skb, 0, buffer, len);
> >>> +
> >>> + if (unlikely(err)) {
> >>> + kfree_skb(skb);
> >>> + *err = -EINVAL;
> >>> + return NULL;
> >>> + }
> >>> + }
> >>> +
> >>> + skb->dev = xs->dev;
> >>> + skb->priority = xs->sk.sk_priority;
> >>> + skb->mark = xs->sk.sk_mark;
> >>> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> >>> + skb->destructor = xsk_destruct_skb;
> >>> +
> >>> + return skb;
> >>> +}
> >>> +
> >>> static int xsk_generic_xmit(struct sock *sk)
> >>> {
> >>> struct xdp_sock *xs = xdp_sk(sk);
> >>> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
> >>> goto out;
> >>>
> >>> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> >>> - char *buffer;
> >>> - u64 addr;
> >>> - u32 len;
> >>> -
> >>> if (max_batch-- == 0) {
> >>> err = -EAGAIN;
> >>> goto out;
> >>> }
> >>>
> >>> - len = desc.len;
> >>> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> >>> + skb = xsk_build_skb(xs, &desc, &err);
> >>> if (unlikely(!skb))
> >>> goto out;
> >>>
> >>> - skb_put(skb, len);
> >>> - addr = desc.addr;
> >>> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> >>> - err = skb_store_bits(skb, 0, buffer, len);
> >>> /* This is the backpressure mechanism for the Tx path.
> >>> * Reserve space in the completion queue and only proceed
> >>> * if there is space in it. This avoids having to implement
> >>> * any buffering in the Tx path.
> >>> */
> >>> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> >>> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> >>> + if (xskq_prod_reserve(xs->pool->cq)) {
> >>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> >>> kfree_skb(skb);
> >>> goto out;
> >>> }
> >>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> >>>
> >>> - skb->dev = xs->dev;
> >>> - skb->priority = sk->sk_priority;
> >>> - skb->mark = sk->sk_mark;
> >>> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> >>> - skb->destructor = xsk_destruct_skb;
> >>> -
> >>> err = __dev_direct_xmit(skb, xs->queue_id);
> >>> if (err == NETDEV_TX_BUSY) {
> >>> /* Tell user-space to retry the send */
> >>>
> >
> > Al
>
> Al
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-18 13:00 ` Alexander Lobakin
@ 2021-01-18 14:40 ` Alexander Lobakin
2021-01-18 15:03 ` Magnus Karlsson
2021-01-18 15:10 ` Magnus Karlsson
0 siblings, 2 replies; 23+ messages in thread
From: Alexander Lobakin @ 2021-01-18 14:40 UTC (permalink / raw)
To: Yunsheng Lin
Cc: Alexander Lobakin, Xuan Zhuo, Michael S. Tsirkin, Jason Wang,
David S. Miller, Jakub Kicinski, bjorn.topel, Magnus Karlsson,
Jonathan Lemon, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh,
Willem de Bruijn, Steffen Klassert, Miaohe Lin,
Mauro Carvalho Chehab, Antoine Tenart, Michal Kubecek,
Andrew Lunn, Florian Fainelli, Meir Lichtinger, virtualization,
bpf, netdev, linux-kernel
From: Alexander Lobakin <alobakin@pm.me>
Date: Mon, 18 Jan 2021 13:00:17 +0000
> From: Yunsheng Lin <linyunsheng@huawei.com>
> Date: Mon, 18 Jan 2021 20:40:52 +0800
>
>> On 2021/1/16 10:44, Xuan Zhuo wrote:
>>> This patch is used to construct skb based on page to save memory copy
>>> overhead.
>>>
>>> This has one problem:
>>>
>>> We construct the skb by fill the data page as a frag into the skb. In
>>> this way, the linear space is empty, and the header information is also
>>> in the frag, not in the linear space, which is not allowed for some
>>> network cards. For example, Mellanox Technologies MT27710 Family
>>> [ConnectX-4 Lx] will get the following error message:
>>>
>>> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
>>> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
>>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
>>> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
>>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
>>> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>>> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>>>
>>> I also tried to use build_skb to construct skb, but because of the
>>> existence of skb_shinfo, it must be behind the linear space, so this
>>> method is not working. We can't put skb_shinfo on desc->addr, it will be
>>> exposed to users, this is not safe.
>>>
>>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
>>
>> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
>> configure if the data is copied or not?
>
> As far as I can grep, only mlx4 supports this, and it has a different
> meaning in that driver.
> So I guess a new netdev_feature would be a better solution.
>
>>> network card supports the header information of the packet in the frag
>>> and not in the linear space.
>>>
>>> ---------------- Performance Testing ------------
>>>
>>> The test environment is Aliyun ECS server.
>>> Test cmd:
>>> ```
>>> xdpsock -i eth0 -t -S -s <msg size>
>>> ```
>>>
>>> Test result data:
>>>
>>> size 64 512 1024 1500
>>> copy 1916747 1775988 1600203 1440054
>>> page 1974058 1953655 1945463 1904478
>>> percent 3.0% 10.0% 21.58% 32.3%
>>>
>>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>>> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
>>> ---
>>> drivers/net/virtio_net.c | 2 +-
>>> include/linux/netdev_features.h | 5 +-
>>> net/ethtool/common.c | 1 +
>>> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
>>> 4 files changed, 97 insertions(+), 19 deletions(-)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index 4ecccb8..841a331 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>>> /* Set up network device as normal. */
>>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
>>> dev->netdev_ops = &virtnet_netdev;
>>> - dev->features = NETIF_F_HIGHDMA;
>>> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>>>
>>> dev->ethtool_ops = &virtnet_ethtool_ops;
>>> SET_NETDEV_DEV(dev, &vdev->dev);
>>> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
>>> index 934de56..8dd28e2 100644
>>> --- a/include/linux/netdev_features.h
>>> +++ b/include/linux/netdev_features.h
>>> @@ -85,9 +85,11 @@ enum {
>>>
>>> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
>>>
>>> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
>>> +
>>> /*
>>> * Add your fresh new feature above and remember to update
>>> - * netdev_features_strings[] in net/core/ethtool.c and maybe
>>> + * netdev_features_strings[] in net/ethtool/common.c and maybe
>>> * some feature mask #defines below. Please also describe it
>>> * in Documentation/networking/netdev-features.rst.
>>> */
>>> @@ -157,6 +159,7 @@ enum {
>>> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
>>> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
>>> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
>>> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
>>>
>>> /* Finds the next feature with the highest number of the range of start till 0.
>>> */
>>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
>>> index 24036e3..2f3d309 100644
>>> --- a/net/ethtool/common.c
>>> +++ b/net/ethtool/common.c
>>> @@ -68,6 +68,7 @@
>>> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
>>> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
>>> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
>>> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
>
> I completely forgot to add that you'd better to mention in both
> enumeration/feature and its Ethtool string that the feature applies
> to Tx path.
> Smth like:
>
> NETIF_F_SKB_TX_NO_LINEAR{,_BIT}, "skb-tx-no-linear"
> or
> NETIF_F_TX_SKB_NO_LINEAR{,_BIT}, "tx-skb-no-linear"
>
> Otherwise, it may be confusing for users and developers.
OR, I think we may tie the feature to the new approach of building
skbs by page, as it makes no sense for anything else.
So, if we define something like:
NETIF_F_XSK_TX_GENERIC_ZC{,_BIT}, "xsk-tx-generic-zerocopy",
then the user can toggle your new XSK Tx path on/off via Ethtool for
drivers that will support it (don't forget to add it to hw_features
for virtio_net in that case).
>>> };
>>>
>>> const char
>>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>>> index 8037b04..94d17dc 100644
>>> --- a/net/xdp/xsk.c
>>> +++ b/net/xdp/xsk.c
>>> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>>> sock_wfree(skb);
>>> }
>>>
>>> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>>> + struct xdp_desc *desc)
>>> +{
>>> + u32 len, offset, copy, copied;
>>> + struct sk_buff *skb;
>>> + struct page *page;
>>> + char *buffer;
>>> + int err, i;
>>> + u64 addr;
>>> +
>>> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
>>> + if (unlikely(!skb))
>>> + return NULL;
>>> +
>>> + addr = desc->addr;
>>> + len = desc->len;
>>> +
>>> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
>>> + offset = offset_in_page(buffer);
>>> + addr = buffer - (char *)xs->pool->addrs;
>>> +
>>> + for (copied = 0, i = 0; copied < len; ++i) {
>>> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
>>> +
>>> + get_page(page);
>>> +
>>> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
>>> +
>>> + skb_fill_page_desc(skb, i, page, offset, copy);
>>> +
>>> + copied += copy;
>>> + addr += copy;
>>> + offset = 0;
>>> + }
>>> +
>>> + skb->len += len;
>>> + skb->data_len += len;
>>> + skb->truesize += len;
>>> +
>>> + refcount_add(len, &xs->sk.sk_wmem_alloc);
>>> +
>>> + return skb;
>>> +}
>>> +
>>> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>>> + struct xdp_desc *desc, int *err)
>>> +{
>>> + struct sk_buff *skb;
>>> +
>>> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
>>> + skb = xsk_build_skb_zerocopy(xs, desc);
>>> + if (unlikely(!skb)) {
>>> + *err = -ENOMEM;
>>> + return NULL;
>>> + }
>>> + } else {
>>> + char *buffer;
>>> + u64 addr;
>>> + u32 len;
>>> + int err;
>>> +
>>> + len = desc->len;
>>> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
>>> + if (unlikely(!skb)) {
>>> + *err = -ENOMEM;
>>> + return NULL;
>>> + }
>>> +
>>> + skb_put(skb, len);
>>> + addr = desc->addr;
>>> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
>>> + err = skb_store_bits(skb, 0, buffer, len);
>>> +
>>> + if (unlikely(err)) {
>>> + kfree_skb(skb);
>>> + *err = -EINVAL;
>>> + return NULL;
>>> + }
>>> + }
>>> +
>>> + skb->dev = xs->dev;
>>> + skb->priority = xs->sk.sk_priority;
>>> + skb->mark = xs->sk.sk_mark;
>>> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
>>> + skb->destructor = xsk_destruct_skb;
>>> +
>>> + return skb;
>>> +}
>>> +
>>> static int xsk_generic_xmit(struct sock *sk)
>>> {
>>> struct xdp_sock *xs = xdp_sk(sk);
>>> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
>>> goto out;
>>>
>>> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
>>> - char *buffer;
>>> - u64 addr;
>>> - u32 len;
>>> -
>>> if (max_batch-- == 0) {
>>> err = -EAGAIN;
>>> goto out;
>>> }
>>>
>>> - len = desc.len;
>>> - skb = sock_alloc_send_skb(sk, len, 1, &err);
>>> + skb = xsk_build_skb(xs, &desc, &err);
>>> if (unlikely(!skb))
>>> goto out;
>>>
>>> - skb_put(skb, len);
>>> - addr = desc.addr;
>>> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
>>> - err = skb_store_bits(skb, 0, buffer, len);
>>> /* This is the backpressure mechanism for the Tx path.
>>> * Reserve space in the completion queue and only proceed
>>> * if there is space in it. This avoids having to implement
>>> * any buffering in the Tx path.
>>> */
>>> spin_lock_irqsave(&xs->pool->cq_lock, flags);
>>> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
>>> + if (xskq_prod_reserve(xs->pool->cq)) {
>>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>>> kfree_skb(skb);
>>> goto out;
>>> }
>>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>>>
>>> - skb->dev = xs->dev;
>>> - skb->priority = sk->sk_priority;
>>> - skb->mark = sk->sk_mark;
>>> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
>>> - skb->destructor = xsk_destruct_skb;
>>> -
>>> err = __dev_direct_xmit(skb, xs->queue_id);
>>> if (err == NETDEV_TX_BUSY) {
>>> /* Tell user-space to retry the send */
>>>
>
> Al
Al
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-18 12:40 ` Yunsheng Lin
@ 2021-01-18 13:00 ` Alexander Lobakin
2021-01-18 14:40 ` Alexander Lobakin
0 siblings, 1 reply; 23+ messages in thread
From: Alexander Lobakin @ 2021-01-18 13:00 UTC (permalink / raw)
To: Yunsheng Lin
Cc: Alexander Lobakin, Xuan Zhuo, Michael S. Tsirkin, Jason Wang,
David S. Miller, Jakub Kicinski, bjorn.topel, Magnus Karlsson,
Jonathan Lemon, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh,
Willem de Bruijn, Steffen Klassert, Miaohe Lin,
Mauro Carvalho Chehab, Antoine Tenart, Michal Kubecek,
Andrew Lunn, Florian Fainelli, Meir Lichtinger, virtualization,
bpf, netdev, linux-kernel
From: Yunsheng Lin <linyunsheng@huawei.com>
Date: Mon, 18 Jan 2021 20:40:52 +0800
> On 2021/1/16 10:44, Xuan Zhuo wrote:
>> This patch is used to construct skb based on page to save memory copy
>> overhead.
>>
>> This has one problem:
>>
>> We construct the skb by fill the data page as a frag into the skb. In
>> this way, the linear space is empty, and the header information is also
>> in the frag, not in the linear space, which is not allowed for some
>> network cards. For example, Mellanox Technologies MT27710 Family
>> [ConnectX-4 Lx] will get the following error message:
>>
>> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
>> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
>> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
>> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
>> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
>> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>>
>> I also tried to use build_skb to construct skb, but because of the
>> existence of skb_shinfo, it must be behind the linear space, so this
>> method is not working. We can't put skb_shinfo on desc->addr, it will be
>> exposed to users, this is not safe.
>>
>> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
>
> Does it make sense to use ETHTOOL_TX_COPYBREAK tunable in ethtool to
> configure if the data is copied or not?
As far as I can grep, only mlx4 supports this, and it has a different
meaning in that driver.
So I guess a new netdev_feature would be a better solution.
>> network card supports the header information of the packet in the frag
>> and not in the linear space.
>>
>> ---------------- Performance Testing ------------
>>
>> The test environment is Aliyun ECS server.
>> Test cmd:
>> ```
>> xdpsock -i eth0 -t -S -s <msg size>
>> ```
>>
>> Test result data:
>>
>> size 64 512 1024 1500
>> copy 1916747 1775988 1600203 1440054
>> page 1974058 1953655 1945463 1904478
>> percent 3.0% 10.0% 21.58% 32.3%
>>
>> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
>> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
>> ---
>> drivers/net/virtio_net.c | 2 +-
>> include/linux/netdev_features.h | 5 +-
>> net/ethtool/common.c | 1 +
>> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
>> 4 files changed, 97 insertions(+), 19 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 4ecccb8..841a331 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
>> /* Set up network device as normal. */
>> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
>> dev->netdev_ops = &virtnet_netdev;
>> - dev->features = NETIF_F_HIGHDMA;
>> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>>
>> dev->ethtool_ops = &virtnet_ethtool_ops;
>> SET_NETDEV_DEV(dev, &vdev->dev);
>> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
>> index 934de56..8dd28e2 100644
>> --- a/include/linux/netdev_features.h
>> +++ b/include/linux/netdev_features.h
>> @@ -85,9 +85,11 @@ enum {
>>
>> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
>>
>> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
>> +
>> /*
>> * Add your fresh new feature above and remember to update
>> - * netdev_features_strings[] in net/core/ethtool.c and maybe
>> + * netdev_features_strings[] in net/ethtool/common.c and maybe
>> * some feature mask #defines below. Please also describe it
>> * in Documentation/networking/netdev-features.rst.
>> */
>> @@ -157,6 +159,7 @@ enum {
>> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
>> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
>> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
>> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
>>
>> /* Finds the next feature with the highest number of the range of start till 0.
>> */
>> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
>> index 24036e3..2f3d309 100644
>> --- a/net/ethtool/common.c
>> +++ b/net/ethtool/common.c
>> @@ -68,6 +68,7 @@
>> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
>> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
>> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
>> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
I completely forgot to add that you'd better mention in both the
enumeration/feature name and its Ethtool string that the feature
applies to the Tx path.
Something like:
NETIF_F_SKB_TX_NO_LINEAR{,_BIT}, "skb-tx-no-linear"
or
NETIF_F_TX_SKB_NO_LINEAR{,_BIT}, "tx-skb-no-linear"
Otherwise, it may be confusing for users and developers.
>> };
>>
>> const char
>> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
>> index 8037b04..94d17dc 100644
>> --- a/net/xdp/xsk.c
>> +++ b/net/xdp/xsk.c
>> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
>> sock_wfree(skb);
>> }
>>
>> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
>> + struct xdp_desc *desc)
>> +{
>> + u32 len, offset, copy, copied;
>> + struct sk_buff *skb;
>> + struct page *page;
>> + char *buffer;
>> + int err, i;
>> + u64 addr;
>> +
>> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
>> + if (unlikely(!skb))
>> + return NULL;
>> +
>> + addr = desc->addr;
>> + len = desc->len;
>> +
>> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
>> + offset = offset_in_page(buffer);
>> + addr = buffer - (char *)xs->pool->addrs;
>> +
>> + for (copied = 0, i = 0; copied < len; ++i) {
>> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
>> +
>> + get_page(page);
>> +
>> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
>> +
>> + skb_fill_page_desc(skb, i, page, offset, copy);
>> +
>> + copied += copy;
>> + addr += copy;
>> + offset = 0;
>> + }
>> +
>> + skb->len += len;
>> + skb->data_len += len;
>> + skb->truesize += len;
>> +
>> + refcount_add(len, &xs->sk.sk_wmem_alloc);
>> +
>> + return skb;
>> +}
>> +
>> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
>> + struct xdp_desc *desc, int *err)
>> +{
>> + struct sk_buff *skb;
>> +
>> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
>> + skb = xsk_build_skb_zerocopy(xs, desc);
>> + if (unlikely(!skb)) {
>> + *err = -ENOMEM;
>> + return NULL;
>> + }
>> + } else {
>> + char *buffer;
>> + u64 addr;
>> + u32 len;
>> + int err;
>> +
>> + len = desc->len;
>> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
>> + if (unlikely(!skb)) {
>> + *err = -ENOMEM;
>> + return NULL;
>> + }
>> +
>> + skb_put(skb, len);
>> + addr = desc->addr;
>> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
>> + err = skb_store_bits(skb, 0, buffer, len);
>> +
>> + if (unlikely(err)) {
>> + kfree_skb(skb);
>> + *err = -EINVAL;
>> + return NULL;
>> + }
>> + }
>> +
>> + skb->dev = xs->dev;
>> + skb->priority = xs->sk.sk_priority;
>> + skb->mark = xs->sk.sk_mark;
>> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
>> + skb->destructor = xsk_destruct_skb;
>> +
>> + return skb;
>> +}
>> +
>> static int xsk_generic_xmit(struct sock *sk)
>> {
>> struct xdp_sock *xs = xdp_sk(sk);
>> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
>> goto out;
>>
>> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
>> - char *buffer;
>> - u64 addr;
>> - u32 len;
>> -
>> if (max_batch-- == 0) {
>> err = -EAGAIN;
>> goto out;
>> }
>>
>> - len = desc.len;
>> - skb = sock_alloc_send_skb(sk, len, 1, &err);
>> + skb = xsk_build_skb(xs, &desc, &err);
>> if (unlikely(!skb))
>> goto out;
>>
>> - skb_put(skb, len);
>> - addr = desc.addr;
>> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
>> - err = skb_store_bits(skb, 0, buffer, len);
>> /* This is the backpressure mechanism for the Tx path.
>> * Reserve space in the completion queue and only proceed
>> * if there is space in it. This avoids having to implement
>> * any buffering in the Tx path.
>> */
>> spin_lock_irqsave(&xs->pool->cq_lock, flags);
>> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
>> + if (xskq_prod_reserve(xs->pool->cq)) {
>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>> kfree_skb(skb);
>> goto out;
>> }
>> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>>
>> - skb->dev = xs->dev;
>> - skb->priority = sk->sk_priority;
>> - skb->mark = sk->sk_mark;
>> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
>> - skb->destructor = xsk_destruct_skb;
>> -
>> err = __dev_direct_xmit(skb, xs->queue_id);
>> if (err == NETDEV_TX_BUSY) {
>> /* Tell user-space to retry the send */
>>
Al
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-16 2:44 Xuan Zhuo
` (5 preceding siblings ...)
2021-01-18 12:37 ` Alexander Lobakin
@ 2021-01-18 12:40 ` Yunsheng Lin
2021-01-18 13:00 ` Alexander Lobakin
2021-01-19 12:44 ` Alexander Lobakin
7 siblings, 1 reply; 23+ messages in thread
From: Yunsheng Lin @ 2021-01-18 12:40 UTC (permalink / raw)
To: Xuan Zhuo, netdev
Cc: Michael S. Tsirkin, Jason Wang, David S. Miller, Jakub Kicinski,
Björn Töpel, Magnus Karlsson, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
Yonghong Song, KP Singh, Willem de Bruijn, Steffen Klassert,
Alexander Lobakin, Miaohe Lin, Mauro Carvalho Chehab,
Antoine Tenart, Michal Kubecek, Andrew Lunn, Florian Fainelli,
Meir Lichtinger, virtualization, bpf
On 2021/1/16 10:44, Xuan Zhuo wrote:
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> This has one problem:
>
> We construct the skb by fill the data page as a frag into the skb. In
> this way, the linear space is empty, and the header information is also
> in the frag, not in the linear space, which is not allowed for some
> network cards. For example, Mellanox Technologies MT27710 Family
> [ConnectX-4 Lx] will get the following error message:
>
> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>
> I also tried to use build_skb to construct skb, but because of the
> existence of skb_shinfo, it must be behind the linear space, so this
> method is not working. We can't put skb_shinfo on desc->addr, it will be
> exposed to users, this is not safe.
>
> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
Does it make sense to use the ETHTOOL_TX_COPYBREAK tunable in ethtool to
configure whether the data is copied or not?
> network card supports the header information of the packet in the frag
> and not in the linear space.
>
> ---------------- Performance Testing ------------
>
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t -S -s <msg size>
> ```
>
> Test result data:
>
> size 64 512 1024 1500
> copy 1916747 1775988 1600203 1440054
> page 1974058 1953655 1945463 1904478
> percent 3.0% 10.0% 21.58% 32.3%
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
> drivers/net/virtio_net.c | 2 +-
> include/linux/netdev_features.h | 5 +-
> net/ethtool/common.c | 1 +
> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
> 4 files changed, 97 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 4ecccb8..841a331 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> /* Set up network device as normal. */
> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> dev->netdev_ops = &virtnet_netdev;
> - dev->features = NETIF_F_HIGHDMA;
> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>
> dev->ethtool_ops = &virtnet_ethtool_ops;
> SET_NETDEV_DEV(dev, &vdev->dev);
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 934de56..8dd28e2 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -85,9 +85,11 @@ enum {
>
> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
>
> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
> +
> /*
> * Add your fresh new feature above and remember to update
> - * netdev_features_strings[] in net/core/ethtool.c and maybe
> + * netdev_features_strings[] in net/ethtool/common.c and maybe
> * some feature mask #defines below. Please also describe it
> * in Documentation/networking/netdev-features.rst.
> */
> @@ -157,6 +159,7 @@ enum {
> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
>
> /* Finds the next feature with the highest number of the range of start till 0.
> */
> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> index 24036e3..2f3d309 100644
> --- a/net/ethtool/common.c
> +++ b/net/ethtool/common.c
> @@ -68,6 +68,7 @@
> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
> };
>
> const char
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 8037b04..94d17dc 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> + struct xdp_desc *desc)
> +{
> + u32 len, offset, copy, copied;
> + struct sk_buff *skb;
> + struct page *page;
> + char *buffer;
> + int err, i;
> + u64 addr;
> +
> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> + if (unlikely(!skb))
> + return NULL;
> +
> + addr = desc->addr;
> + len = desc->len;
> +
> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> + offset = offset_in_page(buffer);
> + addr = buffer - (char *)xs->pool->addrs;
> +
> + for (copied = 0, i = 0; copied < len; ++i) {
> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + get_page(page);
> +
> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> +
> + skb_fill_page_desc(skb, i, page, offset, copy);
> +
> + copied += copy;
> + addr += copy;
> + offset = 0;
> + }
> +
> + skb->len += len;
> + skb->data_len += len;
> + skb->truesize += len;
> +
> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> +
> + return skb;
> +}
> +
> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> + struct xdp_desc *desc, int *err)
> +{
> + struct sk_buff *skb;
> +
> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> + skb = xsk_build_skb_zerocopy(xs, desc);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
> + return NULL;
> + }
> + } else {
> + char *buffer;
> + u64 addr;
> + u32 len;
> + int err;
> +
> + len = desc->len;
> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
> + return NULL;
> + }
> +
> + skb_put(skb, len);
> + addr = desc->addr;
> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> + err = skb_store_bits(skb, 0, buffer, len);
> +
> + if (unlikely(err)) {
> + kfree_skb(skb);
> + *err = -EINVAL;
> + return NULL;
> + }
> + }
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
> +}
> +
> static int xsk_generic_xmit(struct sock *sk)
> {
> struct xdp_sock *xs = xdp_sk(sk);
> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
> goto out;
>
> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> - char *buffer;
> - u64 addr;
> - u32 len;
> -
> if (max_batch-- == 0) {
> err = -EAGAIN;
> goto out;
> }
>
> - len = desc.len;
> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> + skb = xsk_build_skb(xs, &desc, &err);
> if (unlikely(!skb))
> goto out;
>
> - skb_put(skb, len);
> - addr = desc.addr;
> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> - err = skb_store_bits(skb, 0, buffer, len);
> /* This is the backpressure mechanism for the Tx path.
> * Reserve space in the completion queue and only proceed
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> + if (xskq_prod_reserve(xs->pool->cq)) {
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> kfree_skb(skb);
> goto out;
> }
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>
> - skb->dev = xs->dev;
> - skb->priority = sk->sk_priority;
> - skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> - skb->destructor = xsk_destruct_skb;
> -
> err = __dev_direct_xmit(skb, xs->queue_id);
> if (err == NETDEV_TX_BUSY) {
> /* Tell user-space to retry the send */
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-16 2:44 Xuan Zhuo
` (4 preceding siblings ...)
2021-01-18 11:57 ` Michael S. Tsirkin
@ 2021-01-18 12:37 ` Alexander Lobakin
2021-01-18 12:40 ` Yunsheng Lin
2021-01-19 12:44 ` Alexander Lobakin
7 siblings, 0 replies; 23+ messages in thread
From: Alexander Lobakin @ 2021-01-18 12:37 UTC (permalink / raw)
To: Xuan Zhuo
Cc: Alexander Lobakin, Michael S. Tsirkin, Jason Wang,
David S. Miller, Jakub Kicinski, bjorn.topel, Magnus Karlsson,
Jonathan Lemon, Alexei Starovoitov, Daniel Borkmann,
Jesper Dangaard Brouer, John Fastabend, Andrii Nakryiko,
Martin KaFai Lau, Song Liu, Yonghong Song, KP Singh,
Willem de Bruijn, Steffen Klassert, Miaohe Lin,
Mauro Carvalho Chehab, Antoine Tenart, Michal Kubecek,
Andrew Lunn, Florian Fainelli, Meir Lichtinger, virtualization,
bpf, netdev, linux-kernel
From: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Date: Sat, 16 Jan 2021 10:44:53 +0800
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> This has one problem:
>
> We construct the skb by fill the data page as a frag into the skb. In
> this way, the linear space is empty, and the header information is also
> in the frag, not in the linear space, which is not allowed for some
> network cards. For example, Mellanox Technologies MT27710 Family
> [ConnectX-4 Lx] will get the following error message:
>
> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>
> I also tried to use build_skb to construct skb, but because of the
> existence of skb_shinfo, it must be behind the linear space, so this
> method is not working. We can't put skb_shinfo on desc->addr, it will be
> exposed to users, this is not safe.
>
> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> network card supports the header information of the packet in the frag
> and not in the linear space.
>
> ---------------- Performance Testing ------------
>
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t -S -s <msg size>
> ```
>
> Test result data:
>
> size 64 512 1024 1500
> copy 1916747 1775988 1600203 1440054
> page 1974058 1953655 1945463 1904478
> percent 3.0% 10.0% 21.58% 32.3%
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
> drivers/net/virtio_net.c | 2 +-
> include/linux/netdev_features.h | 5 +-
> net/ethtool/common.c | 1 +
> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
> 4 files changed, 97 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 4ecccb8..841a331 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> /* Set up network device as normal. */
> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> dev->netdev_ops = &virtnet_netdev;
> - dev->features = NETIF_F_HIGHDMA;
> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>
> dev->ethtool_ops = &virtnet_ethtool_ops;
> SET_NETDEV_DEV(dev, &vdev->dev);
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 934de56..8dd28e2 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -85,9 +85,11 @@ enum {
>
> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
>
> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
> +
> /*
> * Add your fresh new feature above and remember to update
> - * netdev_features_strings[] in net/core/ethtool.c and maybe
> + * netdev_features_strings[] in net/ethtool/common.c and maybe
> * some feature mask #defines below. Please also describe it
> * in Documentation/networking/netdev-features.rst.
> */
> @@ -157,6 +159,7 @@ enum {
> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
>
> /* Finds the next feature with the highest number of the range of start till 0.
> */
> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> index 24036e3..2f3d309 100644
> --- a/net/ethtool/common.c
> +++ b/net/ethtool/common.c
> @@ -68,6 +68,7 @@
> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
> };
>
> const char
I think it would be best to split this patch into three:
- the first one will introduce NETIF_F_SKB_NO_LINEAR;
- the second will add this feature to virtio_net;
- the third will do the rest.
Also, it would be nice if you mentioned (in the cover letter or
in the third patch) that, in order to get a nice boost on non-ZC
XSK xmit, developers can add support for completely non-linear
skbs and advertise this new feature in their drivers. I think
there'll be enough folks wanting to do this.
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 8037b04..94d17dc 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> + struct xdp_desc *desc)
> +{
> + u32 len, offset, copy, copied;
> + struct sk_buff *skb;
> + struct page *page;
> + char *buffer;
> + int err, i;
> + u64 addr;
> +
> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> + if (unlikely(!skb))
> + return NULL;
> +
> + addr = desc->addr;
> + len = desc->len;
> +
> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> + offset = offset_in_page(buffer);
> + addr = buffer - (char *)xs->pool->addrs;
> +
> + for (copied = 0, i = 0; copied < len; ++i) {
> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + get_page(page);
> +
> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> +
> + skb_fill_page_desc(skb, i, page, offset, copy);
> +
> + copied += copy;
> + addr += copy;
> + offset = 0;
> + }
> +
> + skb->len += len;
> + skb->data_len += len;
> + skb->truesize += len;
> +
> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> +
> + return skb;
> +}
> +
> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> + struct xdp_desc *desc, int *err)
As the others said, just use ERR_PTR() and PTR_ERR().
You should also have received the emails from the kernel test robot
saying that the current version does not compile at all.
> + struct sk_buff *skb;
> +
> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> + skb = xsk_build_skb_zerocopy(xs, desc);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
> + return NULL;
> + }
> + } else {
> + char *buffer;
> + u64 addr;
> + u32 len;
> + int err;
> +
> + len = desc->len;
> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
> + return NULL;
> + }
> +
> + skb_put(skb, len);
> + addr = desc->addr;
> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> + err = skb_store_bits(skb, 0, buffer, len);
> +
> + if (unlikely(err)) {
> + kfree_skb(skb);
> + *err = -EINVAL;
> + return NULL;
> + }
> + }
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
> +}
> +
> static int xsk_generic_xmit(struct sock *sk)
> {
> struct xdp_sock *xs = xdp_sk(sk);
> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
> goto out;
>
> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> - char *buffer;
> - u64 addr;
> - u32 len;
> -
> if (max_batch-- == 0) {
> err = -EAGAIN;
> goto out;
> }
>
> - len = desc.len;
> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> + skb = xsk_build_skb(xs, &desc, &err);
> if (unlikely(!skb))
> goto out;
>
> - skb_put(skb, len);
> - addr = desc.addr;
> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> - err = skb_store_bits(skb, 0, buffer, len);
> /* This is the backpressure mechanism for the Tx path.
> * Reserve space in the completion queue and only proceed
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> + if (xskq_prod_reserve(xs->pool->cq)) {
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> kfree_skb(skb);
> goto out;
> }
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>
> - skb->dev = xs->dev;
> - skb->priority = sk->sk_priority;
> - skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> - skb->destructor = xsk_destruct_skb;
> -
> err = __dev_direct_xmit(skb, xs->queue_id);
> if (err == NETDEV_TX_BUSY) {
> /* Tell user-space to retry the send */
> --
> 1.8.3.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-16 2:44 Xuan Zhuo
@ 2021-01-18 11:57 ` Michael S. Tsirkin
2021-01-16 8:15 ` kernel test robot
` (6 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Michael S. Tsirkin @ 2021-01-18 11:57 UTC (permalink / raw)
To: Xuan Zhuo
Cc: netdev, Jason Wang, David S. Miller, Jakub Kicinski,
Björn Töpel, Magnus Karlsson, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
Yonghong Song, KP Singh, Willem de Bruijn, Steffen Klassert,
Alexander Lobakin, Miaohe Lin, Mauro Carvalho Chehab,
Antoine Tenart, Michal Kubecek, Andrew Lunn, Florian Fainelli,
Meir Lichtinger, virtualization, bpf
On Sat, Jan 16, 2021 at 10:44:53AM +0800, Xuan Zhuo wrote:
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> This has one problem:
>
> We construct the skb by fill the data page as a frag into the skb. In
> this way, the linear space is empty, and the header information is also
> in the frag, not in the linear space, which is not allowed for some
> network cards. For example, Mellanox Technologies MT27710 Family
> [ConnectX-4 Lx] will get the following error message:
>
> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>
> I also tried to use build_skb to construct skb, but because of the
> existence of skb_shinfo, it must be behind the linear space, so this
> method is not working. We can't put skb_shinfo on desc->addr, it will be
> exposed to users, this is not safe.
>
> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> network card supports the header information of the packet in the frag
> and not in the linear space.
>
> ---------------- Performance Testing ------------
>
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t -S -s <msg size>
> ```
>
> Test result data:
>
> size 64 512 1024 1500
> copy 1916747 1775988 1600203 1440054
> page 1974058 1953655 1945463 1904478
> percent 3.0% 10.0% 21.58% 32.3%
Nice, but it looks like the patch presented wouldn't compile.
It's worth retesting after you actually make it compile.
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
> drivers/net/virtio_net.c | 2 +-
> include/linux/netdev_features.h | 5 +-
> net/ethtool/common.c | 1 +
> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
> 4 files changed, 97 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 4ecccb8..841a331 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> /* Set up network device as normal. */
> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> dev->netdev_ops = &virtnet_netdev;
> - dev->features = NETIF_F_HIGHDMA;
> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>
> dev->ethtool_ops = &virtnet_ethtool_ops;
> SET_NETDEV_DEV(dev, &vdev->dev);
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 934de56..8dd28e2 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -85,9 +85,11 @@ enum {
>
> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
>
> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
> +
> /*
> * Add your fresh new feature above and remember to update
> - * netdev_features_strings[] in net/core/ethtool.c and maybe
> + * netdev_features_strings[] in net/ethtool/common.c and maybe
> * some feature mask #defines below. Please also describe it
> * in Documentation/networking/netdev-features.rst.
> */
> @@ -157,6 +159,7 @@ enum {
> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
>
> /* Finds the next feature with the highest number of the range of start till 0.
> */
> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> index 24036e3..2f3d309 100644
> --- a/net/ethtool/common.c
> +++ b/net/ethtool/common.c
> @@ -68,6 +68,7 @@
> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
> };
>
> const char
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 8037b04..94d17dc 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> + struct xdp_desc *desc)
> +{
> + u32 len, offset, copy, copied;
> + struct sk_buff *skb;
> + struct page *page;
> + char *buffer;
> + int err, i;
> + u64 addr;
> +
> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> + if (unlikely(!skb))
> + return NULL;
> +
> + addr = desc->addr;
> + len = desc->len;
> +
> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> + offset = offset_in_page(buffer);
> + addr = buffer - (char *)xs->pool->addrs;
> +
> + for (copied = 0, i = 0; copied < len; ++i) {
> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + get_page(page);
> +
> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> +
> + skb_fill_page_desc(skb, i, page, offset, copy);
> +
> + copied += copy;
> + addr += copy;
> + offset = 0;
> + }
> +
> + skb->len += len;
> + skb->data_len += len;
> + skb->truesize += len;
> +
> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> +
> + return skb;
> +}
> +
> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> + struct xdp_desc *desc, int *err)
Rather than passing int *err, you can return ERR_PTR(err) and let the
caller check it with IS_ERR()/PTR_ERR(). Seems cleaner ...
> +{
> + struct sk_buff *skb;
> +
> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> + skb = xsk_build_skb_zerocopy(xs, desc);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
> + return NULL;
> + }
> + } else {
> + char *buffer;
> + u64 addr;
> + u32 len;
> + int err;
So err is int here
> +
> + len = desc->len;
> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
.. and you dereference it here
> + return NULL;
> + }
> +
> + skb_put(skb, len);
> + addr = desc->addr;
> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> + err = skb_store_bits(skb, 0, buffer, len);
> +
> + if (unlikely(err)) {
> + kfree_skb(skb);
> + *err = -EINVAL;
Same thing here ... how does it compile?
> + return NULL;
> + }
> + }
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
> +}
> +
> static int xsk_generic_xmit(struct sock *sk)
> {
> struct xdp_sock *xs = xdp_sk(sk);
> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
> goto out;
>
> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> - char *buffer;
> - u64 addr;
> - u32 len;
> -
> if (max_batch-- == 0) {
> err = -EAGAIN;
> goto out;
> }
>
> - len = desc.len;
> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> + skb = xsk_build_skb(xs, &desc, &err);
> if (unlikely(!skb))
> goto out;
>
> - skb_put(skb, len);
> - addr = desc.addr;
> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> - err = skb_store_bits(skb, 0, buffer, len);
> /* This is the backpressure mechanism for the Tx path.
> * Reserve space in the completion queue and only proceed
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> + if (xskq_prod_reserve(xs->pool->cq)) {
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> kfree_skb(skb);
> goto out;
> }
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>
> - skb->dev = xs->dev;
> - skb->priority = sk->sk_priority;
> - skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> - skb->destructor = xsk_destruct_skb;
> -
> err = __dev_direct_xmit(skb, xs->queue_id);
> if (err == NETDEV_TX_BUSY) {
> /* Tell user-space to retry the send */
> --
> 1.8.3.1
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
@ 2021-01-18 11:57 ` Michael S. Tsirkin
0 siblings, 0 replies; 23+ messages in thread
From: Michael S. Tsirkin @ 2021-01-18 11:57 UTC (permalink / raw)
To: Xuan Zhuo
Cc: Miaohe Lin, Michal Kubecek, Song Liu, Andrew Lunn,
Alexander Lobakin, Alexei Starovoitov, virtualization,
Meir Lichtinger, Steffen Klassert, Florian Fainelli,
Daniel Borkmann, Mauro Carvalho Chehab, John Fastabend,
Andrii Nakryiko, Yonghong Song, Björn Töpel,
Jesper Dangaard Brouer, KP Singh, Jakub Kicinski,
Magnus Karlsson, Willem de Bruijn, netdev, Antoine Tenart,
David S. Miller, Jonathan Lemon, bpf, Martin KaFai Lau
On Sat, Jan 16, 2021 at 10:44:53AM +0800, Xuan Zhuo wrote:
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> This has one problem:
>
> We construct the skb by fill the data page as a frag into the skb. In
> this way, the linear space is empty, and the header information is also
> in the frag, not in the linear space, which is not allowed for some
> network cards. For example, Mellanox Technologies MT27710 Family
> [ConnectX-4 Lx] will get the following error message:
>
> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>
> I also tried to use build_skb to construct skb, but because of the
> existence of skb_shinfo, it must be behind the linear space, so this
> method is not working. We can't put skb_shinfo on desc->addr, it will be
> exposed to users, this is not safe.
>
> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> network card supports the header information of the packet in the frag
> and not in the linear space.
>
> ---------------- Performance Testing ------------
>
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t -S -s <msg size>
> ```
>
> Test result data:
>
> size 64 512 1024 1500
> copy 1916747 1775988 1600203 1440054
> page 1974058 1953655 1945463 1904478
> percent 3.0% 10.0% 21.58% 32.3%
Nice, but it looks like the patch presented wouldn't compile.
It's worth retesting after you actually make it compile.
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
> drivers/net/virtio_net.c | 2 +-
> include/linux/netdev_features.h | 5 +-
> net/ethtool/common.c | 1 +
> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
> 4 files changed, 97 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 4ecccb8..841a331 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> /* Set up network device as normal. */
> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> dev->netdev_ops = &virtnet_netdev;
> - dev->features = NETIF_F_HIGHDMA;
> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>
> dev->ethtool_ops = &virtnet_ethtool_ops;
> SET_NETDEV_DEV(dev, &vdev->dev);
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 934de56..8dd28e2 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -85,9 +85,11 @@ enum {
>
> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
>
> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
> +
> /*
> * Add your fresh new feature above and remember to update
> - * netdev_features_strings[] in net/core/ethtool.c and maybe
> + * netdev_features_strings[] in net/ethtool/common.c and maybe
> * some feature mask #defines below. Please also describe it
> * in Documentation/networking/netdev-features.rst.
> */
> @@ -157,6 +159,7 @@ enum {
> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
>
> /* Finds the next feature with the highest number of the range of start till 0.
> */
> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> index 24036e3..2f3d309 100644
> --- a/net/ethtool/common.c
> +++ b/net/ethtool/common.c
> @@ -68,6 +68,7 @@
> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
> };
>
> const char
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 8037b04..94d17dc 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> + struct xdp_desc *desc)
> +{
> + u32 len, offset, copy, copied;
> + struct sk_buff *skb;
> + struct page *page;
> + char *buffer;
> + int err, i;
> + u64 addr;
> +
> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> + if (unlikely(!skb))
> + return NULL;
> +
> + addr = desc->addr;
> + len = desc->len;
> +
> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> + offset = offset_in_page(buffer);
> + addr = buffer - (char *)xs->pool->addrs;
> +
> + for (copied = 0, i = 0; copied < len; ++i) {
> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + get_page(page);
> +
> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> +
> + skb_fill_page_desc(skb, i, page, offset, copy);
> +
> + copied += copy;
> + addr += copy;
> + offset = 0;
> + }
> +
> + skb->len += len;
> + skb->data_len += len;
> + skb->truesize += len;
> +
> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> +
> + return skb;
> +}
> +
> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> + struct xdp_desc *desc, int *err)
Rather than passing int *err, you can return ERR_PTR(err) and let the
caller check it with IS_ERR()/PTR_ERR(). Seems cleaner ...
> +{
> + struct sk_buff *skb;
> +
> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> + skb = xsk_build_skb_zerocopy(xs, desc);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
> + return NULL;
> + }
> + } else {
> + char *buffer;
> + u64 addr;
> + u32 len;
> + int err;
So err is int here
> +
> + len = desc->len;
> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
.. and you dereference it here
> + return NULL;
> + }
> +
> + skb_put(skb, len);
> + addr = desc->addr;
> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> + err = skb_store_bits(skb, 0, buffer, len);
> +
> + if (unlikely(err)) {
> + kfree_skb(skb);
> + *err = -EINVAL;
Same thing here ... how does it compile?
> + return NULL;
> + }
> + }
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
> +}
> +
> static int xsk_generic_xmit(struct sock *sk)
> {
> struct xdp_sock *xs = xdp_sk(sk);
> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
> goto out;
>
> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> - char *buffer;
> - u64 addr;
> - u32 len;
> -
> if (max_batch-- == 0) {
> err = -EAGAIN;
> goto out;
> }
>
> - len = desc.len;
> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> + skb = xsk_build_skb(xs, &desc, &err);
> if (unlikely(!skb))
> goto out;
>
> - skb_put(skb, len);
> - addr = desc.addr;
> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> - err = skb_store_bits(skb, 0, buffer, len);
> /* This is the backpressure mechanism for the Tx path.
> * Reserve space in the completion queue and only proceed
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> + if (xskq_prod_reserve(xs->pool->cq)) {
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> kfree_skb(skb);
> goto out;
> }
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>
> - skb->dev = xs->dev;
> - skb->priority = sk->sk_priority;
> - skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> - skb->destructor = xsk_destruct_skb;
> -
> err = __dev_direct_xmit(skb, xs->queue_id);
> if (err == NETDEV_TX_BUSY) {
> /* Tell user-space to retry the send */
> --
> 1.8.3.1
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-16 2:44 Xuan Zhuo
` (2 preceding siblings ...)
2021-01-17 21:55 ` John Fastabend
@ 2021-01-18 9:25 ` Magnus Karlsson
2021-01-18 11:57 ` Michael S. Tsirkin
` (3 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: Magnus Karlsson @ 2021-01-18 9:25 UTC (permalink / raw)
To: Xuan Zhuo
Cc: Network Development, Michael S. Tsirkin, Jason Wang,
David S. Miller, Jakub Kicinski, Björn Töpel,
Magnus Karlsson, Jonathan Lemon, Alexei Starovoitov,
Daniel Borkmann, Jesper Dangaard Brouer, John Fastabend,
Andrii Nakryiko, Martin KaFai Lau, Song Liu, Yonghong Song,
KP Singh, Willem de Bruijn, Steffen Klassert, Alexander Lobakin,
Miaohe Lin, Mauro Carvalho Chehab, Antoine Tenart,
Michal Kubecek, Andrew Lunn, Florian Fainelli, Meir Lichtinger,
virtualization, bpf
On Sat, Jan 16, 2021 at 3:47 AM Xuan Zhuo <xuanzhuo@linux.alibaba.com> wrote:
>
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> This has one problem:
>
> We construct the skb by fill the data page as a frag into the skb. In
> this way, the linear space is empty, and the header information is also
> in the frag, not in the linear space, which is not allowed for some
> network cards. For example, Mellanox Technologies MT27710 Family
> [ConnectX-4 Lx] will get the following error message:
>
> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>
> I also tried to use build_skb to construct skb, but because of the
> existence of skb_shinfo, it must be behind the linear space, so this
> method is not working. We can't put skb_shinfo on desc->addr, it will be
> exposed to users, this is not safe.
>
> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> network card supports the header information of the packet in the frag
> and not in the linear space.
Could you please break this new feature bit into its own patch and
produce a patch set. Patch 1 being the NETIF_F_SKB_NO_LINEAR bit
addition and patch 2 being the xsk part. I can then ack patch 2 since
I am one of the maintainers of that, and then the owner of the netdev
bits can ack that part (whoever that is?). The bit addition looks fine
to me, but I am not the owner of those feature bits, so I would feel
somewhat awkward to ack that.
Otherwise, good performance boost. I like it. Thank you!
> ---------------- Performance Testing ------------
>
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t -S -s <msg size>
> ```
>
> Test result data:
>
> size 64 512 1024 1500
> copy 1916747 1775988 1600203 1440054
> page 1974058 1953655 1945463 1904478
> percent 3.0% 10.0% 21.58% 32.3%
>
> Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
> Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
> ---
> drivers/net/virtio_net.c | 2 +-
> include/linux/netdev_features.h | 5 +-
> net/ethtool/common.c | 1 +
> net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
> 4 files changed, 97 insertions(+), 19 deletions(-)
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 4ecccb8..841a331 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
> /* Set up network device as normal. */
> dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
> dev->netdev_ops = &virtnet_netdev;
> - dev->features = NETIF_F_HIGHDMA;
> + dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
>
> dev->ethtool_ops = &virtnet_ethtool_ops;
> SET_NETDEV_DEV(dev, &vdev->dev);
> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
> index 934de56..8dd28e2 100644
> --- a/include/linux/netdev_features.h
> +++ b/include/linux/netdev_features.h
> @@ -85,9 +85,11 @@ enum {
>
> NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
>
> + NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
> +
> /*
> * Add your fresh new feature above and remember to update
> - * netdev_features_strings[] in net/core/ethtool.c and maybe
> + * netdev_features_strings[] in net/ethtool/common.c and maybe
> * some feature mask #defines below. Please also describe it
> * in Documentation/networking/netdev-features.rst.
> */
> @@ -157,6 +159,7 @@ enum {
> #define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
> #define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
> #define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
> +#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
>
> /* Finds the next feature with the highest number of the range of start till 0.
> */
> diff --git a/net/ethtool/common.c b/net/ethtool/common.c
> index 24036e3..2f3d309 100644
> --- a/net/ethtool/common.c
> +++ b/net/ethtool/common.c
> @@ -68,6 +68,7 @@
> [NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
> [NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
> [NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
> + [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
> };
>
> const char
> diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
> index 8037b04..94d17dc 100644
> --- a/net/xdp/xsk.c
> +++ b/net/xdp/xsk.c
> @@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
> sock_wfree(skb);
> }
>
> +static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
> + struct xdp_desc *desc)
> +{
> + u32 len, offset, copy, copied;
> + struct sk_buff *skb;
> + struct page *page;
> + char *buffer;
> + int err, i;
> + u64 addr;
> +
> + skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
> + if (unlikely(!skb))
> + return NULL;
> +
> + addr = desc->addr;
> + len = desc->len;
> +
> + buffer = xsk_buff_raw_get_data(xs->pool, addr);
> + offset = offset_in_page(buffer);
> + addr = buffer - (char *)xs->pool->addrs;
> +
> + for (copied = 0, i = 0; copied < len; ++i) {
> + page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
> +
> + get_page(page);
> +
> + copy = min((u32)(PAGE_SIZE - offset), len - copied);
> +
> + skb_fill_page_desc(skb, i, page, offset, copy);
> +
> + copied += copy;
> + addr += copy;
> + offset = 0;
> + }
> +
> + skb->len += len;
> + skb->data_len += len;
> + skb->truesize += len;
> +
> + refcount_add(len, &xs->sk.sk_wmem_alloc);
> +
> + return skb;
> +}
> +
> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> + struct xdp_desc *desc, int *err)
> +{
> + struct sk_buff *skb;
> +
> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> + skb = xsk_build_skb_zerocopy(xs, desc);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
> + return NULL;
> + }
> + } else {
> + char *buffer;
> + u64 addr;
> + u32 len;
> + int err;
> +
> + len = desc->len;
> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> + if (unlikely(!skb)) {
> + *err = -ENOMEM;
> + return NULL;
> + }
> +
> + skb_put(skb, len);
> + addr = desc->addr;
> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> + err = skb_store_bits(skb, 0, buffer, len);
> +
> + if (unlikely(err)) {
> + kfree_skb(skb);
> + *err = -EINVAL;
> + return NULL;
> + }
> + }
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
> +}
> +
> static int xsk_generic_xmit(struct sock *sk)
> {
> struct xdp_sock *xs = xdp_sk(sk);
> @@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
> goto out;
>
> while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
> - char *buffer;
> - u64 addr;
> - u32 len;
> -
> if (max_batch-- == 0) {
> err = -EAGAIN;
> goto out;
> }
>
> - len = desc.len;
> - skb = sock_alloc_send_skb(sk, len, 1, &err);
> + skb = xsk_build_skb(xs, &desc, &err);
> if (unlikely(!skb))
> goto out;
>
> - skb_put(skb, len);
> - addr = desc.addr;
> - buffer = xsk_buff_raw_get_data(xs->pool, addr);
> - err = skb_store_bits(skb, 0, buffer, len);
> /* This is the backpressure mechanism for the Tx path.
> * Reserve space in the completion queue and only proceed
> * if there is space in it. This avoids having to implement
> * any buffering in the Tx path.
> */
> spin_lock_irqsave(&xs->pool->cq_lock, flags);
> - if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
> + if (xskq_prod_reserve(xs->pool->cq)) {
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
> kfree_skb(skb);
> goto out;
> }
> spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
>
> - skb->dev = xs->dev;
> - skb->priority = sk->sk_priority;
> - skb->mark = sk->sk_mark;
> - skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
> - skb->destructor = xsk_destruct_skb;
> -
> err = __dev_direct_xmit(skb, xs->queue_id);
> if (err == NETDEV_TX_BUSY) {
> /* Tell user-space to retry the send */
> --
> 1.8.3.1
>
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [PATCH bpf-next] xsk: build skb by page
2021-01-16 2:44 Xuan Zhuo
@ 2021-01-17 21:55 ` John Fastabend
2021-01-16 8:15 ` kernel test robot
` (6 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: John Fastabend @ 2021-01-17 21:55 UTC (permalink / raw)
To: Xuan Zhuo, netdev
Cc: Michael S. Tsirkin, Jason Wang, David S. Miller, Jakub Kicinski,
Björn Töpel, Magnus Karlsson, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
Yonghong Song, KP Singh, Willem de Bruijn, Steffen Klassert,
Alexander Lobakin, Miaohe Lin, Xuan Zhuo, Mauro Carvalho Chehab,
Antoine Tenart, Michal Kubecek, Andrew Lunn, Florian Fainelli,
Meir Lichtinger, virtualization, bpf
Xuan Zhuo wrote:
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> This has one problem:
>
> We construct the skb by fill the data page as a frag into the skb. In
> this way, the linear space is empty, and the header information is also
> in the frag, not in the linear space, which is not allowed for some
> network cards. For example, Mellanox Technologies MT27710 Family
> [ConnectX-4 Lx] will get the following error message:
>
> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>
> I also tried to use build_skb to construct skb, but because of the
> existence of skb_shinfo, it must be behind the linear space, so this
> method is not working. We can't put skb_shinfo on desc->addr, it will be
> exposed to users, this is not safe.
>
> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> network card supports the header information of the packet in the frag
> and not in the linear space.
>
> ---------------- Performance Testing ------------
>
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t -S -s <msg size>
> ```
>
> Test result data:
>
> size 64 512 1024 1500
> copy 1916747 1775988 1600203 1440054
> page 1974058 1953655 1945463 1904478
> percent 3.0% 10.0% 21.58% 32.3%
Looks like a good perf bump. Some easy suggestions below
> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> + struct xdp_desc *desc, int *err)
> +{
Passing an 'int *err' here is ugly IMO; use the ERR_PTR/PTR_ERR macros
and roll it into the return value.
Or maybe use the out: pattern used in the kernel, but just doing direct
returns like now, but with ERR_PTR(), would also be fine.
> + struct sk_buff *skb ;
struct sk_buff *skb = NULL;
err = -ENOMEM;
> +
> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> + skb = xsk_build_skb_zerocopy(xs, desc);
> + if (unlikely(!skb)) {
goto out
> + *err = -ENOMEM;
> + return NULL;
> + }
> + } else {
> + char *buffer;
> + u64 addr;
> + u32 len;
> + int err;
> +
> + len = desc->len;
> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> + if (unlikely(!skb)) {
goto out;
> + *err = -ENOMEM;
> + return NULL;
> + }
> +
> + skb_put(skb, len);
> + addr = desc->addr;
> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> + err = skb_store_bits(skb, 0, buffer, len);
> +
> + if (unlikely(err)) {
> + kfree_skb(skb);
err = -EINVAL;
goto out
> + *err = -EINVAL;
> + return NULL;
> + }
> + }
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
out:
kfree_skb(skb)
return ERR_PTR(err);
> +}
> +
Otherwise looks good thanks.
^ permalink raw reply [flat|nested] 23+ messages in thread
* RE: [PATCH bpf-next] xsk: build skb by page
@ 2021-01-17 21:55 ` John Fastabend
0 siblings, 0 replies; 23+ messages in thread
From: John Fastabend @ 2021-01-17 21:55 UTC (permalink / raw)
To: Xuan Zhuo, netdev
Cc: Miaohe Lin, Michal Kubecek, Song Liu, Michael S. Tsirkin,
Alexander Lobakin, Alexei Starovoitov, virtualization,
Meir Lichtinger, Andrew Lunn, Steffen Klassert, Xuan Zhuo,
Florian Fainelli, Daniel Borkmann, Mauro Carvalho Chehab,
John Fastabend, Andrii Nakryiko, Yonghong Song,
Björn Töpel, Jesper Dangaard Brouer, KP Singh,
Jakub Kicinski, Magnus Karlsson, Willem de Bruijn,
Antoine Tenart, David S. Miller, Jonathan Lemon, bpf,
Martin KaFai Lau
Xuan Zhuo wrote:
> This patch is used to construct skb based on page to save memory copy
> overhead.
>
> This has one problem:
>
> We construct the skb by fill the data page as a frag into the skb. In
> this way, the linear space is empty, and the header information is also
> in the frag, not in the linear space, which is not allowed for some
> network cards. For example, Mellanox Technologies MT27710 Family
> [ConnectX-4 Lx] will get the following error message:
>
> mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
> 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
> WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
> 00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
> 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> 00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
> 00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
>
> I also tried to use build_skb to construct skb, but because of the
> existence of skb_shinfo, it must be behind the linear space, so this
> method is not working. We can't put skb_shinfo on desc->addr, it will be
> exposed to users, this is not safe.
>
> Finally, I added a feature NETIF_F_SKB_NO_LINEAR to identify whether the
> network card supports the header information of the packet in the frag
> and not in the linear space.
>
> ---------------- Performance Testing ------------
>
> The test environment is Aliyun ECS server.
> Test cmd:
> ```
> xdpsock -i eth0 -t -S -s <msg size>
> ```
>
> Test result data:
>
> size 64 512 1024 1500
> copy 1916747 1775988 1600203 1440054
> page 1974058 1953655 1945463 1904478
> percent 3.0% 10.0% 21.58% 32.3%
Looks like a good perf bump. Some easy suggestions below
> +static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
> + struct xdp_desc *desc, int *err)
> +{
Passing an 'int *err' here is ugly IMO; use the ERR_PTR/PTR_ERR macros
and roll it into the return value.
Or maybe use the out: pattern used in the kernel, but just doing direct
returns like now, but with ERR_PTR(), would also be fine.
> + struct sk_buff *skb ;
struct sk_buff *skb = NULL;
err = -ENOMEM;
> +
> + if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
> + skb = xsk_build_skb_zerocopy(xs, desc);
> + if (unlikely(!skb)) {
goto out
> + *err = -ENOMEM;
> + return NULL;
> + }
> + } else {
> + char *buffer;
> + u64 addr;
> + u32 len;
> + int err;
> +
> + len = desc->len;
> + skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
> + if (unlikely(!skb)) {
goto out;
> + *err = -ENOMEM;
> + return NULL;
> + }
> +
> + skb_put(skb, len);
> + addr = desc->addr;
> + buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
> + err = skb_store_bits(skb, 0, buffer, len);
> +
> + if (unlikely(err)) {
> + kfree_skb(skb);
err = -EINVAL;
goto out
> + *err = -EINVAL;
> + return NULL;
> + }
> + }
> +
> + skb->dev = xs->dev;
> + skb->priority = xs->sk.sk_priority;
> + skb->mark = xs->sk.sk_mark;
> + skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
> + skb->destructor = xsk_destruct_skb;
> +
> + return skb;
out:
kfree_skb(skb)
return ERR_PTR(err);
> +}
> +
Otherwise looks good thanks.
_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-16 2:44 Xuan Zhuo
@ 2021-01-16 8:15 ` kernel test robot
2021-01-16 8:15 ` kernel test robot
` (6 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: kernel test robot @ 2021-01-16 8:15 UTC (permalink / raw)
To: Xuan Zhuo, netdev
Cc: kbuild-all, Michael S. Tsirkin, Jason Wang, Jakub Kicinski,
Björn Töpel, Magnus Karlsson, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer
[-- Attachment #1: Type: text/plain, Size: 3312 bytes --]
Hi Xuan,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Xuan-Zhuo/xsk-build-skb-by-page/20210116-105116
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/ee139d2988e5c5945108889a7c95c751910c1877
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Xuan-Zhuo/xsk-build-skb-by-page/20210116-105116
git checkout ee139d2988e5c5945108889a7c95c751910c1877
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=sh
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
net/xdp/xsk.c: In function 'xsk_build_skb':
>> net/xdp/xsk.c:497:4: error: invalid type argument of unary '*' (have 'int')
497 | *err = -ENOMEM;
| ^~~~
net/xdp/xsk.c:508:4: error: invalid type argument of unary '*' (have 'int')
508 | *err = -EINVAL;
| ^~~~
net/xdp/xsk.c:490:7: warning: variable 'addr' set but not used [-Wunused-but-set-variable]
490 | u64 addr;
| ^~~~
Kconfig warnings: (for reference only)
WARNING: unmet direct dependencies detected for SND_ATMEL_SOC_PDC
Depends on SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC && HAS_DMA
Selected by
- SND_ATMEL_SOC_SSC && SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC
- SND_ATMEL_SOC_SSC_PDC && SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC && ATMEL_SSC
vim +497 net/xdp/xsk.c
476
477 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
478 struct xdp_desc *desc, int *err)
479 {
480 struct sk_buff *skb;
481
482 if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
483 skb = xsk_build_skb_zerocopy(xs, desc);
484 if (unlikely(!skb)) {
485 *err = -ENOMEM;
486 return NULL;
487 }
488 } else {
489 char *buffer;
490 u64 addr;
491 u32 len;
492 int err;
493
494 len = desc->len;
495 skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
496 if (unlikely(!skb)) {
> 497 *err = -ENOMEM;
498 return NULL;
499 }
500
501 skb_put(skb, len);
502 addr = desc->addr;
503 buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
504 err = skb_store_bits(skb, 0, buffer, len);
505
506 if (unlikely(err)) {
507 kfree_skb(skb);
508 *err = -EINVAL;
509 return NULL;
510 }
511 }
512
513 skb->dev = xs->dev;
514 skb->priority = xs->sk.sk_priority;
515 skb->mark = xs->sk.sk_mark;
516 skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
517 skb->destructor = xsk_destruct_skb;
518
519 return skb;
520 }
521
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 53952 bytes --]
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
@ 2021-01-16 8:15 ` kernel test robot
0 siblings, 0 replies; 23+ messages in thread
From: kernel test robot @ 2021-01-16 8:15 UTC (permalink / raw)
To: kbuild-all
[-- Attachment #1: Type: text/plain, Size: 3410 bytes --]
Hi Xuan,
Thank you for the patch! Yet something to improve:
[auto build test ERROR on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Xuan-Zhuo/xsk-build-skb-by-page/20210116-105116
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/ee139d2988e5c5945108889a7c95c751910c1877
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Xuan-Zhuo/xsk-build-skb-by-page/20210116-105116
git checkout ee139d2988e5c5945108889a7c95c751910c1877
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=sh
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All errors (new ones prefixed by >>):
net/xdp/xsk.c: In function 'xsk_build_skb':
>> net/xdp/xsk.c:497:4: error: invalid type argument of unary '*' (have 'int')
497 | *err = -ENOMEM;
| ^~~~
net/xdp/xsk.c:508:4: error: invalid type argument of unary '*' (have 'int')
508 | *err = -EINVAL;
| ^~~~
net/xdp/xsk.c:490:7: warning: variable 'addr' set but not used [-Wunused-but-set-variable]
490 | u64 addr;
| ^~~~
Kconfig warnings: (for reference only)
WARNING: unmet direct dependencies detected for SND_ATMEL_SOC_PDC
Depends on SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC && HAS_DMA
Selected by
- SND_ATMEL_SOC_SSC && SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC
- SND_ATMEL_SOC_SSC_PDC && SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC && ATMEL_SSC
vim +497 net/xdp/xsk.c
476
477 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
478 struct xdp_desc *desc, int *err)
479 {
480 struct sk_buff *skb;
481
482 if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
483 skb = xsk_build_skb_zerocopy(xs, desc);
484 if (unlikely(!skb)) {
485 *err = -ENOMEM;
486 return NULL;
487 }
488 } else {
489 char *buffer;
490 u64 addr;
491 u32 len;
492 int err;
493
494 len = desc->len;
495 skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
496 if (unlikely(!skb)) {
> 497 *err = -ENOMEM;
498 return NULL;
499 }
500
501 skb_put(skb, len);
502 addr = desc->addr;
503 buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
504 err = skb_store_bits(skb, 0, buffer, len);
505
506 if (unlikely(err)) {
507 kfree_skb(skb);
508 *err = -EINVAL;
509 return NULL;
510 }
511 }
512
513 skb->dev = xs->dev;
514 skb->priority = xs->sk.sk_priority;
515 skb->mark = xs->sk.sk_mark;
516 skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
517 skb->destructor = xsk_destruct_skb;
518
519 return skb;
520 }
521
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org
[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 53952 bytes --]
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
2021-01-16 2:44 Xuan Zhuo
@ 2021-01-16 5:24 ` kernel test robot
2021-01-16 8:15 ` kernel test robot
` (6 subsequent siblings)
7 siblings, 0 replies; 23+ messages in thread
From: kernel test robot @ 2021-01-16 5:24 UTC (permalink / raw)
To: Xuan Zhuo, netdev
Cc: kbuild-all, Michael S. Tsirkin, Jason Wang, Jakub Kicinski,
Björn Töpel, Magnus Karlsson, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer
[-- Attachment #1: Type: text/plain, Size: 3327 bytes --]
Hi Xuan,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Xuan-Zhuo/xsk-build-skb-by-page/20210116-105116
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/ee139d2988e5c5945108889a7c95c751910c1877
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Xuan-Zhuo/xsk-build-skb-by-page/20210116-105116
git checkout ee139d2988e5c5945108889a7c95c751910c1877
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=sh
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
net/xdp/xsk.c: In function 'xsk_build_skb':
net/xdp/xsk.c:497:4: error: invalid type argument of unary '*' (have 'int')
497 | *err = -ENOMEM;
| ^~~~
net/xdp/xsk.c:508:4: error: invalid type argument of unary '*' (have 'int')
508 | *err = -EINVAL;
| ^~~~
>> net/xdp/xsk.c:490:7: warning: variable 'addr' set but not used [-Wunused-but-set-variable]
490 | u64 addr;
| ^~~~
Kconfig warnings: (for reference only)
WARNING: unmet direct dependencies detected for SND_ATMEL_SOC_PDC
Depends on SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC && HAS_DMA
Selected by
- SND_ATMEL_SOC_SSC && SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC
- SND_ATMEL_SOC_SSC_PDC && SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC && ATMEL_SSC
vim +/addr +490 net/xdp/xsk.c
476
477 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
478 struct xdp_desc *desc, int *err)
479 {
480 struct sk_buff *skb;
481
482 if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
483 skb = xsk_build_skb_zerocopy(xs, desc);
484 if (unlikely(!skb)) {
485 *err = -ENOMEM;
486 return NULL;
487 }
488 } else {
489 char *buffer;
> 490 u64 addr;
491 u32 len;
492 int err;
493
494 len = desc->len;
495 skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
496 if (unlikely(!skb)) {
> 497 *err = -ENOMEM;
498 return NULL;
499 }
500
501 skb_put(skb, len);
502 addr = desc->addr;
503 buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
504 err = skb_store_bits(skb, 0, buffer, len);
505
506 if (unlikely(err)) {
507 kfree_skb(skb);
508 *err = -EINVAL;
509 return NULL;
510 }
511 }
512
513 skb->dev = xs->dev;
514 skb->priority = xs->sk.sk_priority;
515 skb->mark = xs->sk.sk_mark;
516 skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
517 skb->destructor = xsk_destruct_skb;
518
519 return skb;
520 }
521
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all@lists.01.org
[-- Attachment #2: .config.gz --]
[-- Type: application/gzip, Size: 53952 bytes --]
^ permalink raw reply [flat|nested] 23+ messages in thread
* Re: [PATCH bpf-next] xsk: build skb by page
@ 2021-01-16 5:24 ` kernel test robot
0 siblings, 0 replies; 23+ messages in thread
From: kernel test robot @ 2021-01-16 5:24 UTC (permalink / raw)
To: kbuild-all
[-- Attachment #1: Type: text/plain, Size: 3425 bytes --]
Hi Xuan,
Thank you for the patch! Perhaps something to improve:
[auto build test WARNING on bpf-next/master]
url: https://github.com/0day-ci/linux/commits/Xuan-Zhuo/xsk-build-skb-by-page/20210116-105116
base: https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git master
config: sh-allmodconfig (attached as .config)
compiler: sh4-linux-gcc (GCC) 9.3.0
reproduce (this is a W=1 build):
wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
chmod +x ~/bin/make.cross
# https://github.com/0day-ci/linux/commit/ee139d2988e5c5945108889a7c95c751910c1877
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review Xuan-Zhuo/xsk-build-skb-by-page/20210116-105116
git checkout ee139d2988e5c5945108889a7c95c751910c1877
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=sh
If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot <lkp@intel.com>
All warnings (new ones prefixed by >>):
net/xdp/xsk.c: In function 'xsk_build_skb':
net/xdp/xsk.c:497:4: error: invalid type argument of unary '*' (have 'int')
497 | *err = -ENOMEM;
| ^~~~
net/xdp/xsk.c:508:4: error: invalid type argument of unary '*' (have 'int')
508 | *err = -EINVAL;
| ^~~~
>> net/xdp/xsk.c:490:7: warning: variable 'addr' set but not used [-Wunused-but-set-variable]
490 | u64 addr;
| ^~~~
Kconfig warnings: (for reference only)
WARNING: unmet direct dependencies detected for SND_ATMEL_SOC_PDC
Depends on SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC && HAS_DMA
Selected by
- SND_ATMEL_SOC_SSC && SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC
- SND_ATMEL_SOC_SSC_PDC && SOUND && !UML && SND && SND_SOC && SND_ATMEL_SOC && ATMEL_SSC
vim +/addr +490 net/xdp/xsk.c
476
477 static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
478 struct xdp_desc *desc, int *err)
479 {
480 struct sk_buff *skb;
481
482 if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
483 skb = xsk_build_skb_zerocopy(xs, desc);
484 if (unlikely(!skb)) {
485 *err = -ENOMEM;
486 return NULL;
487 }
488 } else {
489 char *buffer;
> 490 u64 addr;
491 u32 len;
492 int err;
493
494 len = desc->len;
495 skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
496 if (unlikely(!skb)) {
> 497 *err = -ENOMEM;
498 return NULL;
499 }
500
501 skb_put(skb, len);
502 addr = desc->addr;
503 buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
504 err = skb_store_bits(skb, 0, buffer, len);
505
506 if (unlikely(err)) {
507 kfree_skb(skb);
508 *err = -EINVAL;
509 return NULL;
510 }
511 }
512
513 skb->dev = xs->dev;
514 skb->priority = xs->sk.sk_priority;
515 skb->mark = xs->sk.sk_mark;
516 skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
517 skb->destructor = xsk_destruct_skb;
518
519 return skb;
520 }
521
---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-all(a)lists.01.org
[-- Attachment #2: config.gz --]
[-- Type: application/gzip, Size: 53952 bytes --]
^ permalink raw reply [flat|nested] 23+ messages in thread
* [PATCH bpf-next] xsk: build skb by page
@ 2021-01-16 2:44 Xuan Zhuo
2021-01-16 5:24 ` kernel test robot
` (7 more replies)
0 siblings, 8 replies; 23+ messages in thread
From: Xuan Zhuo @ 2021-01-16 2:44 UTC (permalink / raw)
To: netdev
Cc: Michael S. Tsirkin, Jason Wang, David S. Miller, Jakub Kicinski,
Björn Töpel, Magnus Karlsson, Jonathan Lemon,
Alexei Starovoitov, Daniel Borkmann, Jesper Dangaard Brouer,
John Fastabend, Andrii Nakryiko, Martin KaFai Lau, Song Liu,
Yonghong Song, KP Singh, Willem de Bruijn, Steffen Klassert,
Alexander Lobakin, Miaohe Lin, Xuan Zhuo, Mauro Carvalho Chehab,
Antoine Tenart, Michal Kubecek, Andrew Lunn, Florian Fainelli,
Meir Lichtinger, virtualization, bpf
This patch constructs the skb based on pages to save the memory copy
overhead.
This has one problem:
We construct the skb by filling the data page as a frag into the skb. In
this way, the linear space is empty, and the header information is also
in the frag, not in the linear space, which is not allowed for some
network cards. For example, Mellanox Technologies MT27710 Family
[ConnectX-4 Lx] will get the following error message:
mlx5_core 0000:3b:00.1 eth1: Error cqe on cqn 0x817, ci 0x8, qn 0x1dbb, opcode 0xd, syndrome 0x1, vendor syndrome 0x68
00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00000020: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00000030: 00 00 00 00 60 10 68 01 0a 00 1d bb 00 0f 9f d2
WQE DUMP: WQ size 1024 WQ cur size 0, WQE index 0xf, len: 64
00000000: 00 00 0f 0a 00 1d bb 03 00 00 00 08 00 00 00 00
00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00000020: 00 00 00 2b 00 08 00 00 00 00 00 05 9e e3 08 00
00000030: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
mlx5_core 0000:3b:00.1 eth1: ERR CQE on SQ: 0x1dbb
I also tried to use build_skb to construct the skb, but skb_shinfo must
sit right behind the linear space, so this method does not work: we
can't put skb_shinfo at desc->addr, because it would be exposed to
users, which is not safe.
Finally, I added a feature NETIF_F_SKB_NO_LINEAR to indicate whether the
network card supports packets whose header information is in a frag
rather than in the linear space.
---------------- Performance Testing ------------
The test environment is Aliyun ECS server.
Test cmd:
```
xdpsock -i eth0 -t -S -s <msg size>
```
Test result data:
size 64 512 1024 1500
copy 1916747 1775988 1600203 1440054
page 1974058 1953655 1945463 1904478
percent 3.0% 10.0% 21.58% 32.3%
Signed-off-by: Xuan Zhuo <xuanzhuo@linux.alibaba.com>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
---
drivers/net/virtio_net.c | 2 +-
include/linux/netdev_features.h | 5 +-
net/ethtool/common.c | 1 +
net/xdp/xsk.c | 108 +++++++++++++++++++++++++++++++++-------
4 files changed, 97 insertions(+), 19 deletions(-)
diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4ecccb8..841a331 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2985,7 +2985,7 @@ static int virtnet_probe(struct virtio_device *vdev)
/* Set up network device as normal. */
dev->priv_flags |= IFF_UNICAST_FLT | IFF_LIVE_ADDR_CHANGE;
dev->netdev_ops = &virtnet_netdev;
- dev->features = NETIF_F_HIGHDMA;
+ dev->features = NETIF_F_HIGHDMA | NETIF_F_SKB_NO_LINEAR;
dev->ethtool_ops = &virtnet_ethtool_ops;
SET_NETDEV_DEV(dev, &vdev->dev);
diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
index 934de56..8dd28e2 100644
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -85,9 +85,11 @@ enum {
NETIF_F_HW_MACSEC_BIT, /* Offload MACsec operations */
+ NETIF_F_SKB_NO_LINEAR_BIT, /* Allow skb linear is empty */
+
/*
* Add your fresh new feature above and remember to update
- * netdev_features_strings[] in net/core/ethtool.c and maybe
+ * netdev_features_strings[] in net/ethtool/common.c and maybe
* some feature mask #defines below. Please also describe it
* in Documentation/networking/netdev-features.rst.
*/
@@ -157,6 +159,7 @@ enum {
#define NETIF_F_GRO_FRAGLIST __NETIF_F(GRO_FRAGLIST)
#define NETIF_F_GSO_FRAGLIST __NETIF_F(GSO_FRAGLIST)
#define NETIF_F_HW_MACSEC __NETIF_F(HW_MACSEC)
+#define NETIF_F_SKB_NO_LINEAR __NETIF_F(SKB_NO_LINEAR)
/* Finds the next feature with the highest number of the range of start till 0.
*/
diff --git a/net/ethtool/common.c b/net/ethtool/common.c
index 24036e3..2f3d309 100644
--- a/net/ethtool/common.c
+++ b/net/ethtool/common.c
@@ -68,6 +68,7 @@
[NETIF_F_HW_TLS_RX_BIT] = "tls-hw-rx-offload",
[NETIF_F_GRO_FRAGLIST_BIT] = "rx-gro-list",
[NETIF_F_HW_MACSEC_BIT] = "macsec-hw-offload",
+ [NETIF_F_SKB_NO_LINEAR_BIT] = "skb-no-linear",
};
const char
diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
index 8037b04..94d17dc 100644
--- a/net/xdp/xsk.c
+++ b/net/xdp/xsk.c
@@ -430,6 +430,95 @@ static void xsk_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
+static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
+ struct xdp_desc *desc)
+{
+ u32 len, offset, copy, copied;
+ struct sk_buff *skb;
+ struct page *page;
+ char *buffer;
+ int err, i;
+ u64 addr;
+
+ skb = sock_alloc_send_skb(&xs->sk, 0, 1, &err);
+ if (unlikely(!skb))
+ return NULL;
+
+ addr = desc->addr;
+ len = desc->len;
+
+ buffer = xsk_buff_raw_get_data(xs->pool, addr);
+ offset = offset_in_page(buffer);
+ addr = buffer - (char *)xs->pool->addrs;
+
+ for (copied = 0, i = 0; copied < len; ++i) {
+ page = xs->pool->umem->pgs[addr >> PAGE_SHIFT];
+
+ get_page(page);
+
+ copy = min((u32)(PAGE_SIZE - offset), len - copied);
+
+ skb_fill_page_desc(skb, i, page, offset, copy);
+
+ copied += copy;
+ addr += copy;
+ offset = 0;
+ }
+
+ skb->len += len;
+ skb->data_len += len;
+ skb->truesize += len;
+
+ refcount_add(len, &xs->sk.sk_wmem_alloc);
+
+ return skb;
+}
+
+static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
+ struct xdp_desc *desc, int *err)
+{
+ struct sk_buff *skb;
+
+ if (xs->dev->features & NETIF_F_SKB_NO_LINEAR) {
+ skb = xsk_build_skb_zerocopy(xs, desc);
+ if (unlikely(!skb)) {
+ *err = -ENOMEM;
+ return NULL;
+ }
+ } else {
+ char *buffer;
+ u64 addr;
+ u32 len;
+ int err;
+
+ len = desc->len;
+ skb = sock_alloc_send_skb(&xs->sk, len, 1, &err);
+ if (unlikely(!skb)) {
+ *err = -ENOMEM;
+ return NULL;
+ }
+
+ skb_put(skb, len);
+ addr = desc->addr;
+ buffer = xsk_buff_raw_get_data(xs->pool, desc->addr);
+ err = skb_store_bits(skb, 0, buffer, len);
+
+ if (unlikely(err)) {
+ kfree_skb(skb);
+ *err = -EINVAL;
+ return NULL;
+ }
+ }
+
+ skb->dev = xs->dev;
+ skb->priority = xs->sk.sk_priority;
+ skb->mark = xs->sk.sk_mark;
+ skb_shinfo(skb)->destructor_arg = (void *)(long)desc->addr;
+ skb->destructor = xsk_destruct_skb;
+
+ return skb;
+}
+
static int xsk_generic_xmit(struct sock *sk)
{
struct xdp_sock *xs = xdp_sk(sk);
@@ -446,43 +535,28 @@ static int xsk_generic_xmit(struct sock *sk)
goto out;
while (xskq_cons_peek_desc(xs->tx, &desc, xs->pool)) {
- char *buffer;
- u64 addr;
- u32 len;
-
if (max_batch-- == 0) {
err = -EAGAIN;
goto out;
}
- len = desc.len;
- skb = sock_alloc_send_skb(sk, len, 1, &err);
+ skb = xsk_build_skb(xs, &desc, &err);
if (unlikely(!skb))
goto out;
- skb_put(skb, len);
- addr = desc.addr;
- buffer = xsk_buff_raw_get_data(xs->pool, addr);
- err = skb_store_bits(skb, 0, buffer, len);
/* This is the backpressure mechanism for the Tx path.
* Reserve space in the completion queue and only proceed
* if there is space in it. This avoids having to implement
* any buffering in the Tx path.
*/
spin_lock_irqsave(&xs->pool->cq_lock, flags);
- if (unlikely(err) || xskq_prod_reserve(xs->pool->cq)) {
+ if (xskq_prod_reserve(xs->pool->cq)) {
spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
kfree_skb(skb);
goto out;
}
spin_unlock_irqrestore(&xs->pool->cq_lock, flags);
- skb->dev = xs->dev;
- skb->priority = sk->sk_priority;
- skb->mark = sk->sk_mark;
- skb_shinfo(skb)->destructor_arg = (void *)(long)desc.addr;
- skb->destructor = xsk_destruct_skb;
-
err = __dev_direct_xmit(skb, xs->queue_id);
if (err == NETDEV_TX_BUSY) {
/* Tell user-space to retry the send */
--
1.8.3.1
^ permalink raw reply related [flat|nested] 23+ messages in thread
end of thread, other threads:[~2021-01-19 13:03 UTC | newest]
Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-12-23 8:56 [PATCH bpf-next] xsk: build skb by page Xuan Zhuo
2020-12-23 10:04 ` Magnus Karlsson
2020-12-29 8:32 ` Xuan Zhuo
2020-12-31 16:29 ` John Fastabend
2021-01-16 2:44 Xuan Zhuo
2021-01-16 5:24 ` kernel test robot
2021-01-16 5:24 ` kernel test robot
2021-01-16 8:15 ` kernel test robot
2021-01-16 8:15 ` kernel test robot
2021-01-17 21:55 ` John Fastabend
2021-01-17 21:55 ` John Fastabend
2021-01-18 9:25 ` Magnus Karlsson
2021-01-18 11:57 ` Michael S. Tsirkin
2021-01-18 11:57 ` Michael S. Tsirkin
2021-01-18 12:37 ` Alexander Lobakin
2021-01-18 12:40 ` Yunsheng Lin
2021-01-18 13:00 ` Alexander Lobakin
2021-01-18 14:40 ` Alexander Lobakin
2021-01-18 15:03 ` Magnus Karlsson
2021-01-18 15:10 ` Magnus Karlsson
2021-01-18 16:38 ` Alexander Lobakin
2021-01-19 7:01 ` Magnus Karlsson
2021-01-19 12:44 ` Alexander Lobakin
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.