All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
@ 2016-10-22  4:07 Shrijeet Mukherjee
  2016-10-23 16:38 ` Stephen Hemminger
                   ` (2 more replies)
  0 siblings, 3 replies; 42+ messages in thread
From: Shrijeet Mukherjee @ 2016-10-22  4:07 UTC (permalink / raw)
  To: mst, tom; +Cc: netdev, shm, roopa, nikolay

This patch adds support for xdp ndo and also inserts the xdp program
call into the merged RX buffers and big buffers paths

* The small packet skb receive is skipped for now
* No TX for now

Signed-off-by: Shrijeet Mukherjee <shrijeet@gmail.com>
---
 drivers/net/virtio_net.c | 133 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 127 insertions(+), 6 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index fad84f3..d5af3f7 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/virtio.h>
 #include <linux/virtio_net.h>
+#include <linux/bpf.h>
 #include <linux/scatterlist.h>
 #include <linux/if_vlan.h>
 #include <linux/slab.h>
@@ -81,6 +82,8 @@ struct receive_queue {
 
 	struct napi_struct napi;
 
+	struct bpf_prog *xdp_prog;
+
 	/* Chain pages by the private ptr. */
 	struct page *pages;
 
@@ -324,6 +327,51 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 	return skb;
 }
 
+/* this function is not called from the receive_buf path directly as
+ * we want to use the page model for rx merge buffer and big buffers
+ * and not use the fast path for driving skb's around
+ */
+static inline u32 do_xdp_prog(struct virtnet_info *vi,
+			      struct receive_queue *rq,
+			      void *buf, int offset, int len)
+{
+	struct bpf_prog *xdp_prog;
+	struct xdp_buff xdp;
+	int hdr_padded_len;
+	u32 act;
+
+	/* A bpf program gets first chance to drop the packet. It may
+	 * read bytes but not past the end of the frag.
+	 */
+
+	xdp_prog = rcu_dereference(rq->xdp_prog);
+	if (xdp_prog) {
+		if (vi->mergeable_rx_bufs)
+			hdr_padded_len = sizeof(
+				struct virtio_net_hdr_mrg_rxbuf);
+		else
+			hdr_padded_len = sizeof(struct padded_vnet_hdr);
+
+		buf = (void *)((unsigned long)buf + offset + hdr_padded_len);
+
+		xdp.data = buf;
+		xdp.data_end = xdp.data + len;
+
+		act = bpf_prog_run_xdp(xdp_prog, &xdp);
+		switch (act) {
+		case XDP_PASS:
+			return XDP_PASS;
+		case XDP_TX:
+		case XDP_ABORTED:
+		case XDP_DROP:
+			return XDP_DROP;
+		default:
+			bpf_warn_invalid_xdp_action(act);
+		}
+	}
+	return XDP_PASS;
+}
+
 static struct sk_buff *receive_small(struct virtnet_info *vi, void *buf, unsigned int len)
 {
 	struct sk_buff * skb = buf;
@@ -341,8 +389,14 @@ static struct sk_buff *receive_big(struct net_device *dev,
 				   unsigned int len)
 {
 	struct page *page = buf;
-	struct sk_buff *skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
+	struct sk_buff *skb;
+	u32 act;
 
+	act = do_xdp_prog(vi, rq, buf, 0, len);
+	if (act == XDP_DROP)
+		goto err;
+
+	skb = page_to_skb(vi, rq, page, 0, len, PAGE_SIZE);
 	if (unlikely(!skb))
 		goto err;
 
@@ -366,13 +420,22 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	struct page *page = virt_to_head_page(buf);
 	int offset = buf - page_address(page);
 	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+	u32 act;
+	struct sk_buff *head_skb, *curr_skb;
 
-	struct sk_buff *head_skb = page_to_skb(vi, rq, page, offset, len,
-					       truesize);
-	struct sk_buff *curr_skb = head_skb;
+	act = do_xdp_prog(vi, rq, buf, offset, len);
+	/* controversial, but alternative is to create an SKB anyway then */
+	if (act == XDP_DROP) {
+		put_page(page);
+		return NULL;
+	}
+
+	head_skb = page_to_skb(vi, rq, page, offset, len, truesize);
+	curr_skb = head_skb;
 
 	if (unlikely(!curr_skb))
 		goto err_skb;
+
 	while (--num_buf) {
 		int num_skb_frags;
 
@@ -388,6 +451,13 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 
 		buf = mergeable_ctx_to_buf_address(ctx);
 		page = virt_to_head_page(buf);
+		offset = buf - page_address(page);
+
+		act = do_xdp_prog(vi, rq, buf, offset, len);
+		if (act != XDP_PASS) {
+			put_page(page);
+			continue;
+		}
 
 		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
 		if (unlikely(num_skb_frags == MAX_SKB_FRAGS)) {
@@ -409,7 +479,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			head_skb->len += len;
 			head_skb->truesize += truesize;
 		}
-		offset = buf - page_address(page);
 		if (skb_can_coalesce(curr_skb, num_skb_frags, page, offset)) {
 			put_page(page);
 			skb_coalesce_rx_frag(curr_skb, num_skb_frags - 1,
@@ -1430,6 +1499,52 @@ static int virtnet_change_mtu(struct net_device *dev, int new_mtu)
 	return 0;
 }
 
+static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	struct bpf_prog *old_prog;
+	int i;
+
+	if (prog) {
+		prog = bpf_prog_add(prog, vi->max_queue_pairs - 1);
+		if (IS_ERR(prog))
+			return PTR_ERR(prog);
+	}
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		old_prog = rcu_dereference(vi->rq[i].xdp_prog);
+		rcu_assign_pointer(vi->rq[i].xdp_prog, prog);
+		if (old_prog)
+			bpf_prog_put(old_prog);
+	}
+
+	return 0;
+}
+
+static int virtnet_xdp_query(struct net_device *dev)
+{
+	struct virtnet_info *vi = netdev_priv(dev);
+	int i;
+
+	for (i = 0; i < vi->max_queue_pairs; i++) {
+		if (vi->rq[i].xdp_prog)
+			return 1;
+	}
+	return 0;
+}
+
+static int virtnet_xdp(struct net_device *dev, struct netdev_xdp *xdp)
+{
+	switch (xdp->command) {
+	case XDP_SETUP_PROG:
+		return virtnet_xdp_set(dev, xdp->prog);
+	case XDP_QUERY_PROG:
+		return virtnet_xdp_query(dev);
+	default:
+		return -EINVAL;
+	}
+}
+
 static const struct net_device_ops virtnet_netdev = {
 	.ndo_open            = virtnet_open,
 	.ndo_stop   	     = virtnet_close,
@@ -1447,6 +1562,7 @@ static const struct net_device_ops virtnet_netdev = {
 #ifdef CONFIG_NET_RX_BUSY_POLL
 	.ndo_busy_poll		= virtnet_busy_poll,
 #endif
+	.ndo_xdp		= virtnet_xdp,
 };
 
 static void virtnet_config_changed_work(struct work_struct *work)
@@ -1503,11 +1619,17 @@ static void virtnet_free_queues(struct virtnet_info *vi)
 
 static void free_receive_bufs(struct virtnet_info *vi)
 {
+	struct bpf_prog *old_prog;
 	int i;
 
 	for (i = 0; i < vi->max_queue_pairs; i++) {
 		while (vi->rq[i].pages)
 			__free_pages(get_a_page(&vi->rq[i], GFP_KERNEL), 0);
+
+		old_prog = rcu_dereference(vi->rq[i].xdp_prog);
+		RCU_INIT_POINTER(vi->rq[i].xdp_prog, NULL);
+		if (old_prog)
+			bpf_prog_put(old_prog);
 	}
 }
 
@@ -1878,7 +2000,6 @@ static int virtnet_probe(struct virtio_device *vdev)
 		if (virtnet_change_mtu(dev, mtu))
 			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
 	}
-
 	if (vi->any_header_sg)
 		dev->needed_headroom = vi->hdr_len;
 
-- 
2.1.4

^ permalink raw reply related	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-22  4:07 [PATCH net-next RFC WIP] Patch for XDP support for virtio_net Shrijeet Mukherjee
@ 2016-10-23 16:38 ` Stephen Hemminger
  2016-10-24  1:51   ` Shrijeet Mukherjee
  2016-10-25 17:36 ` Jakub Kicinski
  2016-10-26 13:52 ` Jesper Dangaard Brouer
  2 siblings, 1 reply; 42+ messages in thread
From: Stephen Hemminger @ 2016-10-23 16:38 UTC (permalink / raw)
  To: Shrijeet Mukherjee; +Cc: mst, tom, netdev, shm, roopa, nikolay

Overall, I am glad to see XDP support more widely available. Minor stuff
in implementation.

>  
> +/* this function is not called from the receive_buf path directly as
> + * we want to use the page model for rx merge buffer and big buffers
> + * and not use the fast path for driving skb's around
> + */
> +static inline u32 do_xdp_prog(struct virtnet_info *vi,
> +			      struct receive_queue *rq,
> +			      void *buf, int offset, int len)
> +{

Do not mark non-trivial static functions as inline. The compiler will
do it anyway if it thinks it is appropriate.

+static int virtnet_xdp_query(struct net_device *dev)

Use bool here.

@@ -366,13 +420,22 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
...

 	if (unlikely(!curr_skb))
 		goto err_skb;
+
 	while (--num_buf) {
 		int num_skb_frags;

@@ -1878,7 +2000,6 @@ static int virtnet_probe(struct virtio_device *vdev)
 		if (virtnet_change_mtu(dev, mtu))
 			__virtio_clear_bit(vdev, VIRTIO_NET_F_MTU);
 	}
-
 	if (vi->any_header_sg)
 		dev->needed_headroom = vi->hdr_len;

This parts of the patch is white space creep. I.e other edits you did
that stayed around.

Do you have any performance numbers? Does the receive into pages hurt
the non XDP performance?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-23 16:38 ` Stephen Hemminger
@ 2016-10-24  1:51   ` Shrijeet Mukherjee
  2016-10-25  1:10     ` Alexei Starovoitov
  0 siblings, 1 reply; 42+ messages in thread
From: Shrijeet Mukherjee @ 2016-10-24  1:51 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Shrijeet Mukherjee, mst, Tom Herbert, Netdev, Roopa Prabhu,
	Nikolay Aleksandrov

On Sun, Oct 23, 2016 at 9:38 AM, Stephen Hemminger
<stephen@networkplumber.org> wrote:
> Overall, I am glad to see XDP support more widely available. Minor stuff
> in implementation.
>
>>
>> +/* this function is not called from the receive_buf path directly as
>> + * we want to use the page model for rx merge buffer and big buffers
>> + * and not use the fast path for driving skb's around
>> + */
>> +static inline u32 do_xdp_prog(struct virtnet_info *vi,
>> +                           struct receive_queue *rq,
>> +                           void *buf, int offset, int len)
>> +{
>
> Do not mark non-trivial static functions as inline. The compiler will
> do it anyway if it thinks it is appropriate.
>
> +static int virtnet_xdp_query(struct net_device *dev)
>
> Use bool here.
>

Ack on the bool.

the inline was my feeble attempt to minimize the overhead in the
"normal" path as currently I need a rcu deref followed by a
conditional. Wanted to ensure that we don't incur a call stack
overhead as well. But I will trust the compiler :)


> @@ -366,13 +420,22 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
hdr_len;
>
> This parts of the patch is white space creep. I.e other edits you did
> that stayed around.

Will clean up, thought I did .. but something must have come through.

>
> Do you have any performance numbers? Does the receive into pages hurt
> the non XDP performance?
>

No perf yet, on my limited env (laptop) I see higher packet drop rates
using the xdp_drop example, which is no surprise .. but I did not
change anything to recv into pages and instead opted to not support
the skb direct recv at all. And the other two modes were receiving
into pages and then forming skb's which is what I skipped if anything
other than XDP_PASS was returned.

The main goal of this patch was to start that discussion. My v2 patch
rejects the ndo op if neither of rx_mergeable or big_buffers are set.
Does that sound like a good tradeoff ? Don't know enough about who
turns these features off and why.

I can say that virtualbox always has the device features enabled .. so
seems like a good tradeoff ?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-24  1:51   ` Shrijeet Mukherjee
@ 2016-10-25  1:10     ` Alexei Starovoitov
  0 siblings, 0 replies; 42+ messages in thread
From: Alexei Starovoitov @ 2016-10-25  1:10 UTC (permalink / raw)
  To: Shrijeet Mukherjee
  Cc: Stephen Hemminger, Shrijeet Mukherjee, mst, Tom Herbert, Netdev,
	Roopa Prabhu, Nikolay Aleksandrov

On Sun, Oct 23, 2016 at 06:51:53PM -0700, Shrijeet Mukherjee wrote:
> 
> The main goal of this patch was to start that discussion. My v2 patch
> rejects the ndo op if neither of rx_mergeable or big_buffers are set.
> Does that sound like a good tradeoff ? Don't know enough about who
> turns these features off and why.
> 
> I can say that virtualbox always has the device features enabled .. so
> seems like a good tradeoff ?

If virtio can be taught to work with xdp that would be awesome.
I've looked at it from xdp prog debugging point of view, but amount of
complexity related to mergeable/big/etc was too much, so I went with e1k+xdp.
Are you sure that if mergeable/big disabled than buf is contiguous?
Also my understanding that buf is not writeable?
I don't see how to do TX either... May be it's all solvable somehow.

There was a discussion to convert raw dma buffer in mlx/intel directly
into vhost to avoid skb. This will allow host to send packets into
VMs quickly. Then if we can have fast virtio in the guest then
even more interesting use cases will be solved.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-22  4:07 [PATCH net-next RFC WIP] Patch for XDP support for virtio_net Shrijeet Mukherjee
  2016-10-23 16:38 ` Stephen Hemminger
@ 2016-10-25 17:36 ` Jakub Kicinski
  2016-10-26 13:52 ` Jesper Dangaard Brouer
  2 siblings, 0 replies; 42+ messages in thread
From: Jakub Kicinski @ 2016-10-25 17:36 UTC (permalink / raw)
  To: Shrijeet Mukherjee; +Cc: mst, tom, netdev, shm, roopa, nikolay

On Sat, 22 Oct 2016 04:07:23 +0000, Shrijeet Mukherjee wrote:
> +		act = bpf_prog_run_xdp(xdp_prog, &xdp);
> +		switch (act) {
> +		case XDP_PASS:
> +			return XDP_PASS;
> +		case XDP_TX:
> +		case XDP_ABORTED:
> +		case XDP_DROP:
> +			return XDP_DROP;
> +		default:
> +			bpf_warn_invalid_xdp_action(act);
> +		}
> +	}
> +	return XDP_PASS;

FWIW you may want to move the default label before XDP_TX/XDP_ABORT,
to get the behaviour to be drop on unknown ret code.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-22  4:07 [PATCH net-next RFC WIP] Patch for XDP support for virtio_net Shrijeet Mukherjee
  2016-10-23 16:38 ` Stephen Hemminger
  2016-10-25 17:36 ` Jakub Kicinski
@ 2016-10-26 13:52 ` Jesper Dangaard Brouer
  2016-10-26 16:36   ` Michael S. Tsirkin
  2 siblings, 1 reply; 42+ messages in thread
From: Jesper Dangaard Brouer @ 2016-10-26 13:52 UTC (permalink / raw)
  To: Shrijeet Mukherjee; +Cc: brouer, mst, tom, netdev, shm, roopa, nikolay

On Sat, 22 Oct 2016 04:07:23 +0000
Shrijeet Mukherjee <shrijeet@gmail.com> wrote:

> This patch adds support for xdp ndo and also inserts the xdp program
> call into the merged RX buffers and big buffers paths

I really appreciate you are doing this for virtio_net.

My first question is: Is the (packet) page data writable?
(MST might be able to answer?)

As this is currently an XDP requirement[1].  


> * The small packet skb receive is skipped for now
> * No TX for now

I do see more and more valid use-cases for only implementing XDP_DROP
and not necessarily also implementing XDP_TX.  This does requires that
we implement some kind of feature/capabilities negotiation mechanism[2].


[1] https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/design/requirements.html#write-access-to-packet-data

[2] https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/design/design.html#capabilities-negotiation

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-26 13:52 ` Jesper Dangaard Brouer
@ 2016-10-26 16:36   ` Michael S. Tsirkin
  2016-10-26 16:52     ` David Miller
  0 siblings, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-10-26 16:36 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: Shrijeet Mukherjee, tom, netdev, shm, roopa, nikolay

On Wed, Oct 26, 2016 at 03:52:02PM +0200, Jesper Dangaard Brouer wrote:
> On Sat, 22 Oct 2016 04:07:23 +0000
> Shrijeet Mukherjee <shrijeet@gmail.com> wrote:
> 
> > This patch adds support for xdp ndo and also inserts the xdp program
> > call into the merged RX buffers and big buffers paths
> 
> I really appreciate you are doing this for virtio_net.
> 
> My first question is: Is the (packet) page data writable?
> (MST might be able to answer?)
> 
> As this is currently an XDP requirement[1].  

I'm not sure I understand what does writable mean.
Could you explain a bit more pls?
We do copy data into skb ATM but I plan to change that.


> 
> > * The small packet skb receive is skipped for now
> > * No TX for now
> 
> I do see more and more valid use-cases for only implementing XDP_DROP
> and not necessarily also implementing XDP_TX.  This does requires that
> we implement some kind of feature/capabilities negotiation mechanism[2].
> 
> 
> [1] https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/design/requirements.html#write-access-to-packet-data
> 
> [2] https://prototype-kernel.readthedocs.io/en/latest/networking/XDP/design/design.html#capabilities-negotiation
> 
> -- 
> Best regards,
>   Jesper Dangaard Brouer
>   MSc.CS, Principal Kernel Engineer at Red Hat
>   Author of http://www.iptv-analyzer.org
>   LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-26 16:36   ` Michael S. Tsirkin
@ 2016-10-26 16:52     ` David Miller
  2016-10-26 17:07       ` Michael S. Tsirkin
  0 siblings, 1 reply; 42+ messages in thread
From: David Miller @ 2016-10-26 16:52 UTC (permalink / raw)
  To: mst; +Cc: brouer, shrijeet, tom, netdev, shm, roopa, nikolay

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 26 Oct 2016 19:36:45 +0300

> On Wed, Oct 26, 2016 at 03:52:02PM +0200, Jesper Dangaard Brouer wrote:
>> On Sat, 22 Oct 2016 04:07:23 +0000
>> Shrijeet Mukherjee <shrijeet@gmail.com> wrote:
>> 
>> > This patch adds support for xdp ndo and also inserts the xdp program
>> > call into the merged RX buffers and big buffers paths
>> 
>> I really appreciate you are doing this for virtio_net.
>> 
>> My first question is: Is the (packet) page data writable?
>> (MST might be able to answer?)
>> 
>> As this is currently an XDP requirement[1].  
> 
> I'm not sure I understand what does writable mean.
> Could you explain a bit more pls?
> We do copy data into skb ATM but I plan to change that.

The packet data area must be writable, and the page it lives in must
not be shared with any other entity in the system.

This is because the eBPF program that executes via XDP must be able
to modify and read arbitrary parts of the packet area.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-26 16:52     ` David Miller
@ 2016-10-26 17:07       ` Michael S. Tsirkin
  2016-10-26 17:11         ` David Miller
  0 siblings, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-10-26 17:07 UTC (permalink / raw)
  To: David Miller; +Cc: brouer, shrijeet, tom, netdev, shm, roopa, nikolay

On Wed, Oct 26, 2016 at 12:52:45PM -0400, David Miller wrote:
> From: "Michael S. Tsirkin" <mst@redhat.com>
> Date: Wed, 26 Oct 2016 19:36:45 +0300
> 
> > On Wed, Oct 26, 2016 at 03:52:02PM +0200, Jesper Dangaard Brouer wrote:
> >> On Sat, 22 Oct 2016 04:07:23 +0000
> >> Shrijeet Mukherjee <shrijeet@gmail.com> wrote:
> >> 
> >> > This patch adds support for xdp ndo and also inserts the xdp program
> >> > call into the merged RX buffers and big buffers paths
> >> 
> >> I really appreciate you are doing this for virtio_net.
> >> 
> >> My first question is: Is the (packet) page data writable?
> >> (MST might be able to answer?)
> >> 
> >> As this is currently an XDP requirement[1].  
> > 
> > I'm not sure I understand what does writable mean.
> > Could you explain a bit more pls?
> > We do copy data into skb ATM but I plan to change that.
> 
> The packet data area must be writable,

This is the part I don't fully understand.
It's in RAM so of course it's writeable.

> and the page it lives in must
> not be shared with any other entity in the system.

We share pages between arbitrary multiple packets. I think that's
OK - or is there an assumption that multiple programs
will be attached with different priveledges?

> This is because the eBPF program that executes via XDP must be able
> to modify and read arbitrary parts of the packet area.

-- 
MST

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-26 17:07       ` Michael S. Tsirkin
@ 2016-10-26 17:11         ` David Miller
  2016-10-27  8:55           ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 42+ messages in thread
From: David Miller @ 2016-10-26 17:11 UTC (permalink / raw)
  To: mst; +Cc: brouer, shrijeet, tom, netdev, shm, roopa, nikolay

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Wed, 26 Oct 2016 20:07:19 +0300

> On Wed, Oct 26, 2016 at 12:52:45PM -0400, David Miller wrote:
>> From: "Michael S. Tsirkin" <mst@redhat.com>
>> Date: Wed, 26 Oct 2016 19:36:45 +0300
>> 
>> > On Wed, Oct 26, 2016 at 03:52:02PM +0200, Jesper Dangaard Brouer wrote:
>> >> On Sat, 22 Oct 2016 04:07:23 +0000
>> >> Shrijeet Mukherjee <shrijeet@gmail.com> wrote:
>> >> 
>> >> > This patch adds support for xdp ndo and also inserts the xdp program
>> >> > call into the merged RX buffers and big buffers paths
>> >> 
>> >> I really appreciate you are doing this for virtio_net.
>> >> 
>> >> My first question is: Is the (packet) page data writable?
>> >> (MST might be able to answer?)
>> >> 
>> >> As this is currently an XDP requirement[1].  
>> > 
>> > I'm not sure I understand what does writable mean.
>> > Could you explain a bit more pls?
>> > We do copy data into skb ATM but I plan to change that.
>> 
>> The packet data area must be writable,
> 
> This is the part I don't fully understand.
> It's in RAM so of course it's writeable.

Pages in SKB frag lists are not usually writable, because they share
space with data from other packets the way drivers usually carve up
pages to receive packets into.

It is therefore illegal for the networking code to write into SKB frag
pages.

Pages used for XDP processed packets must not have this restriction.

> We share pages between arbitrary multiple packets. I think that's
> OK

That's exactly what is not allowed with XDP.

Each packet must be the sole user of a page, otherwise the semantics
required by XDP are not met.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-26 17:11         ` David Miller
@ 2016-10-27  8:55           ` Jesper Dangaard Brouer
  2016-10-27 21:09             ` John Fastabend
  0 siblings, 1 reply; 42+ messages in thread
From: Jesper Dangaard Brouer @ 2016-10-27  8:55 UTC (permalink / raw)
  To: David Miller; +Cc: mst, shrijeet, tom, netdev, shm, roopa, nikolay, brouer

On Wed, 26 Oct 2016 13:11:22 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: "Michael S. Tsirkin" <mst@redhat.com>
> Date: Wed, 26 Oct 2016 20:07:19 +0300
> 
> > On Wed, Oct 26, 2016 at 12:52:45PM -0400, David Miller wrote:  
> >> From: "Michael S. Tsirkin" <mst@redhat.com>
> >> Date: Wed, 26 Oct 2016 19:36:45 +0300
> >>   
> >> > On Wed, Oct 26, 2016 at 03:52:02PM +0200, Jesper Dangaard Brouer wrote:  
> >> >> On Sat, 22 Oct 2016 04:07:23 +0000
> >> >> Shrijeet Mukherjee <shrijeet@gmail.com> wrote:
> >> >>   
> >> >> > This patch adds support for xdp ndo and also inserts the xdp program
> >> >> > call into the merged RX buffers and big buffers paths  
> >> >> 
> >> >> I really appreciate you are doing this for virtio_net.
> >> >> 
> >> >> My first question is: Is the (packet) page data writable?
> >> >> (MST might be able to answer?)
> >> >> 
> >> >> As this is currently an XDP requirement[1].    
> >> > 
> >> > I'm not sure I understand what does writable mean.
> >> > Could you explain a bit more pls?
> >> > We do copy data into skb ATM but I plan to change that.  
> >> 
> >> The packet data area must be writable,  
> > 
> > This is the part I don't fully understand.
> > It's in RAM so of course it's writeable.  
> 
> Pages in SKB frag lists are not usually writable, because they share
> space with data from other packets the way drivers usually carve up
> pages to receive packets into.
> 
> It is therefore illegal for the networking code to write into SKB frag
> pages.
> 
> Pages used for XDP processed packets must not have this restriction.
> 
> > We share pages between arbitrary multiple packets. I think that's
> > OK  
> 
> That's exactly what is not allowed with XDP.
> 
> Each packet must be the sole user of a page, otherwise the semantics
> required by XDP are not met.

Looking at the virtio_net.c code, the function call receive_big() might
actually be okay for XDP, unless the incoming packet is larger than
PAGE_SIZE and thus uses several pages (via a linked list in page->private).

The receive_mergeable() does not look compatible with XDP.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-27  8:55           ` Jesper Dangaard Brouer
@ 2016-10-27 21:09             ` John Fastabend
  2016-10-27 21:30               ` Michael S. Tsirkin
  2016-10-28  0:02               ` Shrijeet Mukherjee
  0 siblings, 2 replies; 42+ messages in thread
From: John Fastabend @ 2016-10-27 21:09 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, David Miller
  Cc: mst, shrijeet, tom, netdev, shm, roopa, nikolay

On 16-10-27 01:55 AM, Jesper Dangaard Brouer wrote:
> On Wed, 26 Oct 2016 13:11:22 -0400 (EDT)
> David Miller <davem@davemloft.net> wrote:
> 
>> From: "Michael S. Tsirkin" <mst@redhat.com>
>> Date: Wed, 26 Oct 2016 20:07:19 +0300
>>
>>> On Wed, Oct 26, 2016 at 12:52:45PM -0400, David Miller wrote:  
>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
>>>> Date: Wed, 26 Oct 2016 19:36:45 +0300
>>>>   
>>>>> On Wed, Oct 26, 2016 at 03:52:02PM +0200, Jesper Dangaard Brouer wrote:  
>>>>>> On Sat, 22 Oct 2016 04:07:23 +0000
>>>>>> Shrijeet Mukherjee <shrijeet@gmail.com> wrote:
>>>>>>   
>>>>>>> This patch adds support for xdp ndo and also inserts the xdp program
>>>>>>> call into the merged RX buffers and big buffers paths  
>>>>>>
>>>>>> I really appreciate you are doing this for virtio_net.
>>>>>>
>>>>>> My first question is: Is the (packet) page data writable?
>>>>>> (MST might be able to answer?)
>>>>>>
>>>>>> As this is currently an XDP requirement[1].    
>>>>>
>>>>> I'm not sure I understand what does writable mean.
>>>>> Could you explain a bit more pls?
>>>>> We do copy data into skb ATM but I plan to change that.  
>>>>
>>>> The packet data area must be writable,  
>>>
>>> This is the part I don't fully understand.
>>> It's in RAM so of course it's writeable.  
>>
>> Pages in SKB frag lists are not usually writable, because they share
>> space with data from other packets the way drivers usually carve up
>> pages to receive packets into.
>>
>> It is therefore illegal for the networking code to write into SKB frag
>> pages.
>>
>> Pages used for XDP processed packets must not have this restriction.
>>
>>> We share pages between arbitrary multiple packets. I think that's
>>> OK  
>>
>> That's exactly what is not allowed with XDP.
>>
>> Each packet must be the sole user of a page, otherwise the semantics
>> required by XDP are not met.
> 
> Looking at the virtio_net.c code, the function call receive_big() might
> actually be okay for XDP, unless the incoming packet is larger than
> PAGE_SIZE and thus uses several pages (via a linked list in page->private).
> 
> The receive_mergeable() does not look compatible with XDP.
> 

Looks to me the correct conditions can be met by getting the correct
feature negotiation to happen, specifically no VIRTIO_NET_F_MRG_RXBUF
and one of the TSO/ECN/GSO feature bits set the big_packets flag as
Jesper notes.

Srijeet, are you going to give this a try? I'm trying to get the device
side working by the way on the vhost interface.

Thanks,
John

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-27 21:09             ` John Fastabend
@ 2016-10-27 21:30               ` Michael S. Tsirkin
  2016-10-27 21:42                 ` David Miller
  2016-10-28  0:02               ` Shrijeet Mukherjee
  1 sibling, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-10-27 21:30 UTC (permalink / raw)
  To: John Fastabend
  Cc: Jesper Dangaard Brouer, David Miller, shrijeet, tom, netdev, shm,
	roopa, nikolay

On Thu, Oct 27, 2016 at 02:09:08PM -0700, John Fastabend wrote:
> On 16-10-27 01:55 AM, Jesper Dangaard Brouer wrote:
> > On Wed, 26 Oct 2016 13:11:22 -0400 (EDT)
> > David Miller <davem@davemloft.net> wrote:
> > 
> >> From: "Michael S. Tsirkin" <mst@redhat.com>
> >> Date: Wed, 26 Oct 2016 20:07:19 +0300
> >>
> >>> On Wed, Oct 26, 2016 at 12:52:45PM -0400, David Miller wrote:  
> >>>> From: "Michael S. Tsirkin" <mst@redhat.com>
> >>>> Date: Wed, 26 Oct 2016 19:36:45 +0300
> >>>>   
> >>>>> On Wed, Oct 26, 2016 at 03:52:02PM +0200, Jesper Dangaard Brouer wrote:  
> >>>>>> On Sat, 22 Oct 2016 04:07:23 +0000
> >>>>>> Shrijeet Mukherjee <shrijeet@gmail.com> wrote:
> >>>>>>   
> >>>>>>> This patch adds support for xdp ndo and also inserts the xdp program
> >>>>>>> call into the merged RX buffers and big buffers paths  
> >>>>>>
> >>>>>> I really appreciate you are doing this for virtio_net.
> >>>>>>
> >>>>>> My first question is: Is the (packet) page data writable?
> >>>>>> (MST might be able to answer?)
> >>>>>>
> >>>>>> As this is currently an XDP requirement[1].    
> >>>>>
> >>>>> I'm not sure I understand what does writable mean.
> >>>>> Could you explain a bit more pls?
> >>>>> We do copy data into skb ATM but I plan to change that.  
> >>>>
> >>>> The packet data area must be writable,  
> >>>
> >>> This is the part I don't fully understand.
> >>> It's in RAM so of course it's writeable.  
> >>
> >> Pages in SKB frag lists are not usually writable, because they share
> >> space with data from other packets the way drivers usually carve up
> >> pages to receive packets into.
> >>
> >> It is therefore illegal for the networking code to write into SKB frag
> >> pages.
> >>
> >> Pages used for XDP processed packets must not have this restriction.
> >>
> >>> We share pages between arbitrary multiple packets. I think that's
> >>> OK  
> >>
> >> That's exactly what is not allowed with XDP.
> >>
> >> Each packet must be the sole user of a page, otherwise the semantics
> >> required by XDP are not met.
> > 
> > Looking at the virtio_net.c code, the function call receive_big() might
> > actually be okay for XDP, unless the incoming packet is larger than
> > PAGE_SIZE and thus uses several pages (via a linked list in page->private).
> > 
> > The receive_mergeable() does not look compatible with XDP.
> > 
> 
> Looks to me the correct conditions can be met by getting the correct
> feature negotiation to happen, specifically no VIRTIO_NET_F_MRG_RXBUF
> and one of the TSO/ECN/GSO feature bits set the big_packets flag as
> Jesper notes.
> 
> Srijeet, are you going to give this a try? I'm trying to get the device
> side working by the way on the vhost interface.
> 
> Thanks,
> John

Something I'd like to understand is how does XDP address the
problem that 100Byte packets are consuming 4K of memory now.

-- 
MST

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-27 21:30               ` Michael S. Tsirkin
@ 2016-10-27 21:42                 ` David Miller
  2016-10-27 22:25                   ` Michael S. Tsirkin
  0 siblings, 1 reply; 42+ messages in thread
From: David Miller @ 2016-10-27 21:42 UTC (permalink / raw)
  To: mst; +Cc: john.fastabend, brouer, shrijeet, tom, netdev, shm, roopa, nikolay

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 28 Oct 2016 00:30:35 +0300

> Something I'd like to understand is how does XDP address the
> problem that 100Byte packets are consuming 4K of memory now.

Via page pools.  We're going to make a generic one, but right now
each and every driver implements a quick list of pages to allocate
from (and thus avoid the DMA man/unmap overhead, etc.)

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-27 21:42                 ` David Miller
@ 2016-10-27 22:25                   ` Michael S. Tsirkin
  2016-10-28  1:35                     ` David Miller
  0 siblings, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-10-27 22:25 UTC (permalink / raw)
  To: David Miller
  Cc: john.fastabend, brouer, shrijeet, tom, netdev, shm, roopa, nikolay

On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:
> From: "Michael S. Tsirkin" <mst@redhat.com>
> Date: Fri, 28 Oct 2016 00:30:35 +0300
> 
> > Something I'd like to understand is how does XDP address the
> > problem that 100Byte packets are consuming 4K of memory now.
> 
> Via page pools.  We're going to make a generic one, but right now
> each and every driver implements a quick list of pages to allocate
> from (and thus avoid the DMA man/unmap overhead, etc.)

So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
so there should be no issue with that even when using sub/page
regions, assuming DMA APIs support sub-page map/unmap correctly.

The only reason virtio attempts to use sub-page fragments is to conserve
memory so that truesize for a 100byte packets won't be 4K. Are you
saying pools will somehow mean we won't need to worry about that so
effectively truesize=100byte even though the rest of the 4k page goes
unused? If so we'll happily go back to allocate memory in 4K chunks like
we used to before 2613af0ed18a, and remove a bunch of complexity.

-- 
MST

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-27 21:09             ` John Fastabend
  2016-10-27 21:30               ` Michael S. Tsirkin
@ 2016-10-28  0:02               ` Shrijeet Mukherjee
  2016-10-28  0:46                 ` Shrijeet Mukherjee
  1 sibling, 1 reply; 42+ messages in thread
From: Shrijeet Mukherjee @ 2016-10-28  0:02 UTC (permalink / raw)
  To: John Fastabend, Jesper Dangaard Brouer, David Miller
  Cc: mst, shrijeet, tom, netdev, Roopa Prabhu, Nikolay Aleksandrov

> >
> > Looking at the virtio_net.c code, the function call receive_big()
> > might actually be okay for XDP, unless the incoming packet is larger
> > than PAGE_SIZE and thus uses several pages (via a linked list in page-
> >private).
> >
> > The receive_mergeable() does not look compatible with XDP.
> >
>
> Looks to me the correct conditions can be met by getting the correct
> feature negotiation to happen, specifically no VIRTIO_NET_F_MRG_RXBUF
> and one of the TSO/ECN/GSO feature bits set the big_packets flag as
Jesper
> notes.
>
> Srijeet, are you going to give this a try? I'm trying to get the device
side
> working by the way on the vhost interface.
>

Yup, I did try it .. but it looked like the device gets to make that call
(Unless someone tells me I did something wrong) If I turned that feature
off in the driver it crashed (did not chase too far down that hole) but
let me try that again.

Wanted to consider a path, which looks like has been considered and
discarded, but for the mergable buffers case, if we disallow edit's (no
writes) would that be an acceptable answer ? sorry in advance for the
repeat question, but the benefit of mergeable seems like it can be real
for all packets that are not being XDP'ed .. so maybe apps can make a
choice of doing a copy into a local buffer and XDP_TX from there ?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28  0:02               ` Shrijeet Mukherjee
@ 2016-10-28  0:46                 ` Shrijeet Mukherjee
  0 siblings, 0 replies; 42+ messages in thread
From: Shrijeet Mukherjee @ 2016-10-28  0:46 UTC (permalink / raw)
  To: John Fastabend, Jesper Dangaard Brouer, David Miller
  Cc: mst, shrijeet, tom, netdev, Roopa Prabhu, Nikolay Aleksandrov

> -----Original Message-----
> From: Shrijeet Mukherjee [mailto:shm@cumulusnetworks.com]
> Sent: Thursday, October 27, 2016 5:02 PM
> To: John Fastabend <john.fastabend@gmail.com>; Jesper Dangaard Brouer
> <brouer@redhat.com>; David Miller <davem@davemloft.net>
> Cc: mst@redhat.com; shrijeet@gmail.com; tom@herbertland.com;
> netdev@vger.kernel.org; Roopa Prabhu <roopa@cumulusnetworks.com>;
> Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
> Subject: RE: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
>
> > >
> > > Looking at the virtio_net.c code, the function call receive_big()
> > >might actually be okay for XDP, unless the incoming packet is larger
> > >than PAGE_SIZE and thus uses several pages (via a linked list in
> > >page- private).
> > >
> > > The receive_mergeable() does not look compatible with XDP.
> > >
> >
> > Looks to me the correct conditions can be met by getting the correct
> > feature negotiation to happen, specifically no
> VIRTIO_NET_F_MRG_RXBUF
> > and one of the TSO/ECN/GSO feature bits set the big_packets flag as
> Jesper
> > notes.
> >
> > Srijeet, are you going to give this a try? I'm trying to get the
> > device
> side
> > working by the way on the vhost interface.
> >
>
> Yup, I did try it .. but it looked like the device gets to make that call
> (Unless
> someone tells me I did something wrong) If I turned that feature off in
> the
> driver it crashed (did not chase too far down that hole) but let me try
> that
> again.

Nevermind, I must have had some other issue. This does seem to work using
TSO as the function. Will spin something up to have the device export an XDP
capability which would block MERGEABLE for now and we can sort out alternate
plans later ?

>
> Wanted to consider a path, which looks like has been considered and
> discarded, but for the mergable buffers case, if we disallow edit's (no
> writes) would that be an acceptable answer ? sorry in advance for the
> repeat question, but the benefit of mergeable seems like it can be real
> for
> all packets that are not being XDP'ed .. so maybe apps can make a choice
> of
> doing a copy into a local buffer and XDP_TX from there ?

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-27 22:25                   ` Michael S. Tsirkin
@ 2016-10-28  1:35                     ` David Miller
  2016-10-28  1:43                       ` Alexander Duyck
  0 siblings, 1 reply; 42+ messages in thread
From: David Miller @ 2016-10-28  1:35 UTC (permalink / raw)
  To: mst; +Cc: john.fastabend, brouer, shrijeet, tom, netdev, shm, roopa, nikolay

From: "Michael S. Tsirkin" <mst@redhat.com>
Date: Fri, 28 Oct 2016 01:25:48 +0300

> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:
>> From: "Michael S. Tsirkin" <mst@redhat.com>
>> Date: Fri, 28 Oct 2016 00:30:35 +0300
>> 
>> > Something I'd like to understand is how does XDP address the
>> > problem that 100Byte packets are consuming 4K of memory now.
>> 
>> Via page pools.  We're going to make a generic one, but right now
>> each and every driver implements a quick list of pages to allocate
>> from (and thus avoid the DMA man/unmap overhead, etc.)
> 
> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
> so there should be no issue with that even when using sub/page
> regions, assuming DMA APIs support sub-page map/unmap correctly.

That's not what I said.

The page pools are meant to address the performance degradation from
going to having one packet per page for the sake of XDP's
requirements.

You still need to have one packet per page for correct XDP operation
whether you do page pools or not, and whether you have DMA mapping
(or it's equivalent virutalization operation) or not.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28  1:35                     ` David Miller
@ 2016-10-28  1:43                       ` Alexander Duyck
  2016-10-28  2:10                         ` David Miller
  0 siblings, 1 reply; 42+ messages in thread
From: Alexander Duyck @ 2016-10-28  1:43 UTC (permalink / raw)
  To: David Miller
  Cc: Michael S. Tsirkin, John Fastabend, Jesper Dangaard Brouer,
	shrijeet, Tom Herbert, Netdev, Shrijeet Mukherjee, roopa,
	nikolay

On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:
> From: "Michael S. Tsirkin" <mst@redhat.com>
> Date: Fri, 28 Oct 2016 01:25:48 +0300
>
>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:
>>> From: "Michael S. Tsirkin" <mst@redhat.com>
>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
>>>
>>> > Something I'd like to understand is how does XDP address the
>>> > problem that 100Byte packets are consuming 4K of memory now.
>>>
>>> Via page pools.  We're going to make a generic one, but right now
>>> each and every driver implements a quick list of pages to allocate
>>> from (and thus avoid the DMA man/unmap overhead, etc.)
>>
>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
>> so there should be no issue with that even when using sub/page
>> regions, assuming DMA APIs support sub-page map/unmap correctly.
>
> That's not what I said.
>
> The page pools are meant to address the performance degradation from
> going to having one packet per page for the sake of XDP's
> requirements.
>
> You still need to have one packet per page for correct XDP operation
> whether you do page pools or not, and whether you have DMA mapping
> (or it's equivalent virutalization operation) or not.

Maybe I am missing something here, but why do you need to limit things
to one packet per page for correct XDP operation?  Most of the drivers
out there now are usually storing something closer to at least 2
packets per page, and with the DMA API fixes I am working on there
should be no issue with changing the contents inside those pages since
we won't invalidate or overwrite the data after the DMA buffer has
been synchronized for use by the CPU.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28  1:43                       ` Alexander Duyck
@ 2016-10-28  2:10                         ` David Miller
  2016-10-28 15:56                           ` John Fastabend
  0 siblings, 1 reply; 42+ messages in thread
From: David Miller @ 2016-10-28  2:10 UTC (permalink / raw)
  To: alexander.duyck
  Cc: mst, john.fastabend, brouer, shrijeet, tom, netdev, shm, roopa, nikolay

From: Alexander Duyck <alexander.duyck@gmail.com>
Date: Thu, 27 Oct 2016 18:43:59 -0700

> On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:
>> From: "Michael S. Tsirkin" <mst@redhat.com>
>> Date: Fri, 28 Oct 2016 01:25:48 +0300
>>
>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:
>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
>>>>
>>>> > Something I'd like to understand is how does XDP address the
>>>> > problem that 100Byte packets are consuming 4K of memory now.
>>>>
>>>> Via page pools.  We're going to make a generic one, but right now
>>>> each and every driver implements a quick list of pages to allocate
>>>> from (and thus avoid the DMA man/unmap overhead, etc.)
>>>
>>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
>>> so there should be no issue with that even when using sub/page
>>> regions, assuming DMA APIs support sub-page map/unmap correctly.
>>
>> That's not what I said.
>>
>> The page pools are meant to address the performance degradation from
>> going to having one packet per page for the sake of XDP's
>> requirements.
>>
>> You still need to have one packet per page for correct XDP operation
>> whether you do page pools or not, and whether you have DMA mapping
>> (or it's equivalent virutalization operation) or not.
> 
> Maybe I am missing something here, but why do you need to limit things
> to one packet per page for correct XDP operation?  Most of the drivers
> out there now are usually storing something closer to at least 2
> packets per page, and with the DMA API fixes I am working on there
> should be no issue with changing the contents inside those pages since
> we won't invalidate or overwrite the data after the DMA buffer has
> been synchronized for use by the CPU.

Because with SKB's you can share the page with other packets.

With XDP you simply cannot.

It's software semantics that are the issue.  SKB frag list pages
are read only, XDP packets are writable.

This has nothing to do with "writability" of the pages wrt. DMA
mapping or cpu mappings.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28  2:10                         ` David Miller
@ 2016-10-28 15:56                           ` John Fastabend
  2016-10-28 16:18                             ` Jakub Kicinski
  2016-10-28 17:11                             ` David Miller
  0 siblings, 2 replies; 42+ messages in thread
From: John Fastabend @ 2016-10-28 15:56 UTC (permalink / raw)
  To: David Miller, alexander.duyck
  Cc: mst, brouer, shrijeet, tom, netdev, shm, roopa, nikolay

On 16-10-27 07:10 PM, David Miller wrote:
> From: Alexander Duyck <alexander.duyck@gmail.com>
> Date: Thu, 27 Oct 2016 18:43:59 -0700
> 
>> On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:
>>> From: "Michael S. Tsirkin" <mst@redhat.com>
>>> Date: Fri, 28 Oct 2016 01:25:48 +0300
>>>
>>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:
>>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
>>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
>>>>>
>>>>>> Something I'd like to understand is how does XDP address the
>>>>>> problem that 100Byte packets are consuming 4K of memory now.
>>>>>
>>>>> Via page pools.  We're going to make a generic one, but right now
>>>>> each and every driver implements a quick list of pages to allocate
>>>>> from (and thus avoid the DMA man/unmap overhead, etc.)
>>>>
>>>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
>>>> so there should be no issue with that even when using sub/page
>>>> regions, assuming DMA APIs support sub-page map/unmap correctly.
>>>
>>> That's not what I said.
>>>
>>> The page pools are meant to address the performance degradation from
>>> going to having one packet per page for the sake of XDP's
>>> requirements.
>>>
>>> You still need to have one packet per page for correct XDP operation
>>> whether you do page pools or not, and whether you have DMA mapping
>>> (or it's equivalent virutalization operation) or not.
>>
>> Maybe I am missing something here, but why do you need to limit things
>> to one packet per page for correct XDP operation?  Most of the drivers
>> out there now are usually storing something closer to at least 2
>> packets per page, and with the DMA API fixes I am working on there
>> should be no issue with changing the contents inside those pages since
>> we won't invalidate or overwrite the data after the DMA buffer has
>> been synchronized for use by the CPU.
> 
> Because with SKB's you can share the page with other packets.
> 
> With XDP you simply cannot.
> 
> It's software semantics that are the issue.  SKB frag list pages
> are read only, XDP packets are writable.
> 
> This has nothing to do with "writability" of the pages wrt. DMA
> mapping or cpu mappings.
> 

Sorry I'm not seeing it either. The current xdp_buff is defined
by,

  struct xdp_buff {
	void *data;
	void *data_end;
  };

The verifier has an xdp_is_valid_access() check to ensure we don't go
past data_end. The page for now at least never leaves the driver. For
the work to get xmit to other devices working I'm still not sure I see
any issue.

.John

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28 15:56                           ` John Fastabend
@ 2016-10-28 16:18                             ` Jakub Kicinski
  2016-10-28 18:22                               ` Alexei Starovoitov
  2016-10-28 17:11                             ` David Miller
  1 sibling, 1 reply; 42+ messages in thread
From: Jakub Kicinski @ 2016-10-28 16:18 UTC (permalink / raw)
  To: John Fastabend
  Cc: David Miller, alexander.duyck, mst, brouer, shrijeet, tom,
	netdev, shm, roopa, nikolay

On Fri, 28 Oct 2016 08:56:35 -0700, John Fastabend wrote:
> On 16-10-27 07:10 PM, David Miller wrote:
> > From: Alexander Duyck <alexander.duyck@gmail.com>
> > Date: Thu, 27 Oct 2016 18:43:59 -0700
> >   
> >> On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:  
> >>> From: "Michael S. Tsirkin" <mst@redhat.com>
> >>> Date: Fri, 28 Oct 2016 01:25:48 +0300
> >>>  
> >>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:  
> >>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
> >>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
> >>>>>  
> >>>>>> Something I'd like to understand is how does XDP address the
> >>>>>> problem that 100Byte packets are consuming 4K of memory now.  
> >>>>>
> >>>>> Via page pools.  We're going to make a generic one, but right now
> >>>>> each and every driver implements a quick list of pages to allocate
> >>>>> from (and thus avoid the DMA man/unmap overhead, etc.)  
> >>>>
> >>>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
> >>>> so there should be no issue with that even when using sub/page
> >>>> regions, assuming DMA APIs support sub-page map/unmap correctly.  
> >>>
> >>> That's not what I said.
> >>>
> >>> The page pools are meant to address the performance degradation from
> >>> going to having one packet per page for the sake of XDP's
> >>> requirements.
> >>>
> >>> You still need to have one packet per page for correct XDP operation
> >>> whether you do page pools or not, and whether you have DMA mapping
> >>> (or it's equivalent virutalization operation) or not.  
> >>
> >> Maybe I am missing something here, but why do you need to limit things
> >> to one packet per page for correct XDP operation?  Most of the drivers
> >> out there now are usually storing something closer to at least 2
> >> packets per page, and with the DMA API fixes I am working on there
> >> should be no issue with changing the contents inside those pages since
> >> we won't invalidate or overwrite the data after the DMA buffer has
> >> been synchronized for use by the CPU.  
> > 
> > Because with SKB's you can share the page with other packets.
> > 
> > With XDP you simply cannot.
> > 
> > It's software semantics that are the issue.  SKB frag list pages
> > are read only, XDP packets are writable.
> > 
> > This has nothing to do with "writability" of the pages wrt. DMA
> > mapping or cpu mappings.
> >   
> 
> Sorry I'm not seeing it either. The current xdp_buff is defined
> by,
> 
>   struct xdp_buff {
> 	void *data;
> 	void *data_end;
>   };
> 
> The verifier has an xdp_is_valid_access() check to ensure we don't go
> past data_end. The page for now at least never leaves the driver. For
> the work to get xmit to other devices working I'm still not sure I see
> any issue.

+1

Do we want to make the packet-per-page a requirement because it could
be useful in the future from architectural standpoint?  I guess there
is a trade-off here between having the comfort of people following this
requirement today and making driver support for XDP even more complex.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28 15:56                           ` John Fastabend
  2016-10-28 16:18                             ` Jakub Kicinski
@ 2016-10-28 17:11                             ` David Miller
  2016-10-30 22:53                               ` Michael S. Tsirkin
  2016-11-02 14:01                               ` Jesper Dangaard Brouer
  1 sibling, 2 replies; 42+ messages in thread
From: David Miller @ 2016-10-28 17:11 UTC (permalink / raw)
  To: john.fastabend
  Cc: alexander.duyck, mst, brouer, shrijeet, tom, netdev, shm, roopa, nikolay

From: John Fastabend <john.fastabend@gmail.com>
Date: Fri, 28 Oct 2016 08:56:35 -0700

> On 16-10-27 07:10 PM, David Miller wrote:
>> From: Alexander Duyck <alexander.duyck@gmail.com>
>> Date: Thu, 27 Oct 2016 18:43:59 -0700
>> 
>>> On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:
>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
>>>> Date: Fri, 28 Oct 2016 01:25:48 +0300
>>>>
>>>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:
>>>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
>>>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
>>>>>>
>>>>>>> Something I'd like to understand is how does XDP address the
>>>>>>> problem that 100Byte packets are consuming 4K of memory now.
>>>>>>
>>>>>> Via page pools.  We're going to make a generic one, but right now
>>>>>> each and every driver implements a quick list of pages to allocate
>>>>>> from (and thus avoid the DMA man/unmap overhead, etc.)
>>>>>
>>>>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
>>>>> so there should be no issue with that even when using sub/page
>>>>> regions, assuming DMA APIs support sub-page map/unmap correctly.
>>>>
>>>> That's not what I said.
>>>>
>>>> The page pools are meant to address the performance degradation from
>>>> going to having one packet per page for the sake of XDP's
>>>> requirements.
>>>>
>>>> You still need to have one packet per page for correct XDP operation
>>>> whether you do page pools or not, and whether you have DMA mapping
>>>> (or it's equivalent virutalization operation) or not.
>>>
>>> Maybe I am missing something here, but why do you need to limit things
>>> to one packet per page for correct XDP operation?  Most of the drivers
>>> out there now are usually storing something closer to at least 2
>>> packets per page, and with the DMA API fixes I am working on there
>>> should be no issue with changing the contents inside those pages since
>>> we won't invalidate or overwrite the data after the DMA buffer has
>>> been synchronized for use by the CPU.
>> 
>> Because with SKB's you can share the page with other packets.
>> 
>> With XDP you simply cannot.
>> 
>> It's software semantics that are the issue.  SKB frag list pages
>> are read only, XDP packets are writable.
>> 
>> This has nothing to do with "writability" of the pages wrt. DMA
>> mapping or cpu mappings.
>> 
> 
> Sorry I'm not seeing it either. The current xdp_buff is defined
> by,
> 
>   struct xdp_buff {
> 	void *data;
> 	void *data_end;
>   };
> 
> The verifier has an xdp_is_valid_access() check to ensure we don't go
> past data_end. The page for now at least never leaves the driver. For
> the work to get xmit to other devices working I'm still not sure I see
> any issue.

I guess I can say that the packets must be "writable" until I'm blue
in the face but I'll say it again, semantically writable pages are a
requirement.  And if multiple packets share a page this requirement
is not satisfied.

Also, we want to do several things in the future:

1) Allow push/pop of headers via eBPF code, which needs we need
   headroom.

2) Transparently zero-copy pass packets into userspace, basically
   the user will have a semi-permanently mapped ring of all the
   packet pages sitting in the RX queue of the device and the
   page pool associated with it.  This way we avoid all of the
   TLB flush/map overhead for the user's mapping of the packets
   just as we avoid the DMA map/unmap overhead.

And that's just the beginninng.

I'm sure others can come up with more reasons why we have this
requirement.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28 16:18                             ` Jakub Kicinski
@ 2016-10-28 18:22                               ` Alexei Starovoitov
  2016-10-28 20:35                                 ` Alexander Duyck
                                                   ` (2 more replies)
  0 siblings, 3 replies; 42+ messages in thread
From: Alexei Starovoitov @ 2016-10-28 18:22 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: John Fastabend, David Miller, alexander.duyck, mst, brouer,
	shrijeet, tom, netdev, shm, roopa, nikolay

On Fri, Oct 28, 2016 at 05:18:12PM +0100, Jakub Kicinski wrote:
> On Fri, 28 Oct 2016 08:56:35 -0700, John Fastabend wrote:
> > On 16-10-27 07:10 PM, David Miller wrote:
> > > From: Alexander Duyck <alexander.duyck@gmail.com>
> > > Date: Thu, 27 Oct 2016 18:43:59 -0700
> > >   
> > >> On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:  
> > >>> From: "Michael S. Tsirkin" <mst@redhat.com>
> > >>> Date: Fri, 28 Oct 2016 01:25:48 +0300
> > >>>  
> > >>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:  
> > >>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
> > >>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
> > >>>>>  
> > >>>>>> Something I'd like to understand is how does XDP address the
> > >>>>>> problem that 100Byte packets are consuming 4K of memory now.  
> > >>>>>
> > >>>>> Via page pools.  We're going to make a generic one, but right now
> > >>>>> each and every driver implements a quick list of pages to allocate
> > >>>>> from (and thus avoid the DMA man/unmap overhead, etc.)  
> > >>>>
> > >>>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
> > >>>> so there should be no issue with that even when using sub/page
> > >>>> regions, assuming DMA APIs support sub-page map/unmap correctly.  
> > >>>
> > >>> That's not what I said.
> > >>>
> > >>> The page pools are meant to address the performance degradation from
> > >>> going to having one packet per page for the sake of XDP's
> > >>> requirements.
> > >>>
> > >>> You still need to have one packet per page for correct XDP operation
> > >>> whether you do page pools or not, and whether you have DMA mapping
> > >>> (or it's equivalent virutalization operation) or not.  
> > >>
> > >> Maybe I am missing something here, but why do you need to limit things
> > >> to one packet per page for correct XDP operation?  Most of the drivers
> > >> out there now are usually storing something closer to at least 2
> > >> packets per page, and with the DMA API fixes I am working on there
> > >> should be no issue with changing the contents inside those pages since
> > >> we won't invalidate or overwrite the data after the DMA buffer has
> > >> been synchronized for use by the CPU.  
> > > 
> > > Because with SKB's you can share the page with other packets.
> > > 
> > > With XDP you simply cannot.
> > > 
> > > It's software semantics that are the issue.  SKB frag list pages
> > > are read only, XDP packets are writable.
> > > 
> > > This has nothing to do with "writability" of the pages wrt. DMA
> > > mapping or cpu mappings.
> > >   
> > 
> > Sorry I'm not seeing it either. The current xdp_buff is defined
> > by,
> > 
> >   struct xdp_buff {
> > 	void *data;
> > 	void *data_end;
> >   };
> > 
> > The verifier has an xdp_is_valid_access() check to ensure we don't go
> > past data_end. The page for now at least never leaves the driver. For
> > the work to get xmit to other devices working I'm still not sure I see
> > any issue.
> 
> +1
> 
> Do we want to make the packet-per-page a requirement because it could
> be useful in the future from architectural standpoint?  I guess there
> is a trade-off here between having the comfort of people following this
> requirement today and making driver support for XDP even more complex.

It looks to me that packet per page makes drivers simpler instead of complex.
ixgbe split-page and mlx* order-3/5 tricks are definitely complex.
The skb truesize concerns come from the host when data is delivered to user
space and we need to have precise memory accounting for different applications
and different users. XDP is all root and imo it's far away from the days when
multi-user non-root issues start to pop up.
At the same time XDP doesn't require to use 4k buffer in something like Netronome.
If xdp bpf program can be offloaded into HW with 1800 byte buffers, great!
For x86 cpu the 4k byte is a natural allocation chunk. Anything lower requires
delicate dma tricks paired with even more complex slab allocator and atomic recnts.
All of that can only drive the performance down.
Comparing to kernel bypass xdp is using 4k pages whereas dpdk has
to use huge pages, so xdp is saving a ton of memory already!

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28 18:22                               ` Alexei Starovoitov
@ 2016-10-28 20:35                                 ` Alexander Duyck
  2016-10-28 20:42                                   ` Jakub Kicinski
  2016-10-28 20:36                                 ` Jakub Kicinski
  2016-10-29  3:51                                 ` Shrijeet Mukherjee
  2 siblings, 1 reply; 42+ messages in thread
From: Alexander Duyck @ 2016-10-28 20:35 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: Jakub Kicinski, John Fastabend, David Miller, Michael S. Tsirkin,
	Jesper Dangaard Brouer, shrijeet, Tom Herbert, Netdev,
	Shrijeet Mukherjee, roopa, nikolay

On Fri, Oct 28, 2016 at 11:22 AM, Alexei Starovoitov
<alexei.starovoitov@gmail.com> wrote:
> On Fri, Oct 28, 2016 at 05:18:12PM +0100, Jakub Kicinski wrote:
>> On Fri, 28 Oct 2016 08:56:35 -0700, John Fastabend wrote:
>> > On 16-10-27 07:10 PM, David Miller wrote:
>> > > From: Alexander Duyck <alexander.duyck@gmail.com>
>> > > Date: Thu, 27 Oct 2016 18:43:59 -0700
>> > >
>> > >> On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:
>> > >>> From: "Michael S. Tsirkin" <mst@redhat.com>
>> > >>> Date: Fri, 28 Oct 2016 01:25:48 +0300
>> > >>>
>> > >>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:
>> > >>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
>> > >>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
>> > >>>>>
>> > >>>>>> Something I'd like to understand is how does XDP address the
>> > >>>>>> problem that 100Byte packets are consuming 4K of memory now.
>> > >>>>>
>> > >>>>> Via page pools.  We're going to make a generic one, but right now
>> > >>>>> each and every driver implements a quick list of pages to allocate
>> > >>>>> from (and thus avoid the DMA man/unmap overhead, etc.)
>> > >>>>
>> > >>>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
>> > >>>> so there should be no issue with that even when using sub/page
>> > >>>> regions, assuming DMA APIs support sub-page map/unmap correctly.
>> > >>>
>> > >>> That's not what I said.
>> > >>>
>> > >>> The page pools are meant to address the performance degradation from
>> > >>> going to having one packet per page for the sake of XDP's
>> > >>> requirements.
>> > >>>
>> > >>> You still need to have one packet per page for correct XDP operation
>> > >>> whether you do page pools or not, and whether you have DMA mapping
>> > >>> (or it's equivalent virutalization operation) or not.
>> > >>
>> > >> Maybe I am missing something here, but why do you need to limit things
>> > >> to one packet per page for correct XDP operation?  Most of the drivers
>> > >> out there now are usually storing something closer to at least 2
>> > >> packets per page, and with the DMA API fixes I am working on there
>> > >> should be no issue with changing the contents inside those pages since
>> > >> we won't invalidate or overwrite the data after the DMA buffer has
>> > >> been synchronized for use by the CPU.
>> > >
>> > > Because with SKB's you can share the page with other packets.
>> > >
>> > > With XDP you simply cannot.
>> > >
>> > > It's software semantics that are the issue.  SKB frag list pages
>> > > are read only, XDP packets are writable.
>> > >
>> > > This has nothing to do with "writability" of the pages wrt. DMA
>> > > mapping or cpu mappings.
>> > >
>> >
>> > Sorry I'm not seeing it either. The current xdp_buff is defined
>> > by,
>> >
>> >   struct xdp_buff {
>> >     void *data;
>> >     void *data_end;
>> >   };
>> >
>> > The verifier has an xdp_is_valid_access() check to ensure we don't go
>> > past data_end. The page for now at least never leaves the driver. For
>> > the work to get xmit to other devices working I'm still not sure I see
>> > any issue.
>>
>> +1
>>
>> Do we want to make the packet-per-page a requirement because it could
>> be useful in the future from architectural standpoint?  I guess there
>> is a trade-off here between having the comfort of people following this
>> requirement today and making driver support for XDP even more complex.
>
> It looks to me that packet per page makes drivers simpler instead of complex.
> ixgbe split-page and mlx* order-3/5 tricks are definitely complex.
> The skb truesize concerns come from the host when data is delivered to user
> space and we need to have precise memory accounting for different applications
> and different users. XDP is all root and imo it's far away from the days when
> multi-user non-root issues start to pop up.

Right but having XDP require 4K pages is going to hurt performance for
user space when we are using sockets.  We cannot justify killing
application performance just because we want to support XDP, and
having to alloc new memory and memcpy out of the buffers isn't going
to work as a viable workaround for this either.

> At the same time XDP doesn't require to use 4k buffer in something like Netronome.
> If xdp bpf program can be offloaded into HW with 1800 byte buffers, great!

So are you saying this is only really meant to be used with a full bpf
hardware offload then?

> For x86 cpu the 4k byte is a natural allocation chunk. Anything lower requires
> delicate dma tricks paired with even more complex slab allocator and atomic recnts.
> All of that can only drive the performance down.
> Comparing to kernel bypass xdp is using 4k pages whereas dpdk has
> to use huge pages, so xdp is saving a ton of memory already!

Do you really think the page pool Jesper is talking about doing is any
different?  Half the reason why I have to implement the DMA API
changes that I am are so that the page pool can unmap a page if the
device decides to cut it from the pool without invalidating the data
written by the CPU.

If anything I think we end up needing to add two more data members to
xdp_buff so that we can define the bounds of the sandbox it gets to
play in.  Otherwise on platforms such as PowerPC, that can use pages
larger than 4K, this is going to quickly get ridiculous.

- Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28 18:22                               ` Alexei Starovoitov
  2016-10-28 20:35                                 ` Alexander Duyck
@ 2016-10-28 20:36                                 ` Jakub Kicinski
  2016-10-29  3:51                                 ` Shrijeet Mukherjee
  2 siblings, 0 replies; 42+ messages in thread
From: Jakub Kicinski @ 2016-10-28 20:36 UTC (permalink / raw)
  To: Alexei Starovoitov
  Cc: John Fastabend, David Miller, alexander.duyck, mst, brouer,
	shrijeet, tom, netdev, shm, roopa, nikolay

On Fri, 28 Oct 2016 11:22:25 -0700, Alexei Starovoitov wrote:
> On Fri, Oct 28, 2016 at 05:18:12PM +0100, Jakub Kicinski wrote:
> > On Fri, 28 Oct 2016 08:56:35 -0700, John Fastabend wrote:  
> > > On 16-10-27 07:10 PM, David Miller wrote:  
> > > > From: Alexander Duyck <alexander.duyck@gmail.com>
> > > > Date: Thu, 27 Oct 2016 18:43:59 -0700
> > > >     
> > > >> On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:    
> > > >>> From: "Michael S. Tsirkin" <mst@redhat.com>
> > > >>> Date: Fri, 28 Oct 2016 01:25:48 +0300
> > > >>>    
> > > >>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:    
> > > >>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
> > > >>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
> > > >>>>>    
> > > >>>>>> Something I'd like to understand is how does XDP address the
> > > >>>>>> problem that 100Byte packets are consuming 4K of memory now.    
> > > >>>>>
> > > >>>>> Via page pools.  We're going to make a generic one, but right now
> > > >>>>> each and every driver implements a quick list of pages to allocate
> > > >>>>> from (and thus avoid the DMA man/unmap overhead, etc.)    
> > > >>>>
> > > >>>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
> > > >>>> so there should be no issue with that even when using sub/page
> > > >>>> regions, assuming DMA APIs support sub-page map/unmap correctly.    
> > > >>>
> > > >>> That's not what I said.
> > > >>>
> > > >>> The page pools are meant to address the performance degradation from
> > > >>> going to having one packet per page for the sake of XDP's
> > > >>> requirements.
> > > >>>
> > > >>> You still need to have one packet per page for correct XDP operation
> > > >>> whether you do page pools or not, and whether you have DMA mapping
> > > >>> (or it's equivalent virutalization operation) or not.    
> > > >>
> > > >> Maybe I am missing something here, but why do you need to limit things
> > > >> to one packet per page for correct XDP operation?  Most of the drivers
> > > >> out there now are usually storing something closer to at least 2
> > > >> packets per page, and with the DMA API fixes I am working on there
> > > >> should be no issue with changing the contents inside those pages since
> > > >> we won't invalidate or overwrite the data after the DMA buffer has
> > > >> been synchronized for use by the CPU.    
> > > > 
> > > > Because with SKB's you can share the page with other packets.
> > > > 
> > > > With XDP you simply cannot.
> > > > 
> > > > It's software semantics that are the issue.  SKB frag list pages
> > > > are read only, XDP packets are writable.
> > > > 
> > > > This has nothing to do with "writability" of the pages wrt. DMA
> > > > mapping or cpu mappings.
> > > >     
> > > 
> > > Sorry I'm not seeing it either. The current xdp_buff is defined
> > > by,
> > > 
> > >   struct xdp_buff {
> > > 	void *data;
> > > 	void *data_end;
> > >   };
> > > 
> > > The verifier has an xdp_is_valid_access() check to ensure we don't go
> > > past data_end. The page for now at least never leaves the driver. For
> > > the work to get xmit to other devices working I'm still not sure I see
> > > any issue.  
> > 
> > +1
> > 
> > Do we want to make the packet-per-page a requirement because it could
> > be useful in the future from architectural standpoint?  I guess there
> > is a trade-off here between having the comfort of people following this
> > requirement today and making driver support for XDP even more complex.  
>
> It looks to me that packet per page makes drivers simpler instead of complex.
> ixgbe split-page and mlx* order-3/5 tricks are definitely complex.
> The skb truesize concerns come from the host when data is delivered to user
> space and we need to have precise memory accounting for different applications
> and different users. XDP is all root and imo it's far away from the days when
> multi-user non-root issues start to pop up.
> At the same time XDP doesn't require to use 4k buffer in something like Netronome.
> If xdp bpf program can be offloaded into HW with 1800 byte buffers, great!
> For x86 cpu the 4k byte is a natural allocation chunk. Anything lower requires
> delicate dma tricks paired with even more complex slab allocator and atomic recnts.
> All of that can only drive the performance down.
> Comparing to kernel bypass xdp is using 4k pages whereas dpdk has
> to use huge pages, so xdp is saving a ton of memory already!
 
Thanks a lot for explaining!  I was not aware of the user space
delivery and precise memory accounting requirements.

My understanding was that we would be driving toward some common
implementation of the recycle tricks either as part of the frag
allocator or some new mechanism of handing out mapped pages.  Basically
moving the allocation strategy decision further up the stack.

I'm not actually worried about the offload, full disclosure, I have a
XDP implementation based on the frag allocator :) (still pending
internal review), so if packet-per-page is the way forward then I'll
have revise to flip between 4k pages and frag allocator based on !!xdp.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28 20:35                                 ` Alexander Duyck
@ 2016-10-28 20:42                                   ` Jakub Kicinski
  0 siblings, 0 replies; 42+ messages in thread
From: Jakub Kicinski @ 2016-10-28 20:42 UTC (permalink / raw)
  To: Alexander Duyck
  Cc: Alexei Starovoitov, John Fastabend, David Miller,
	Michael S. Tsirkin, Jesper Dangaard Brouer, shrijeet,
	Tom Herbert, Netdev, Shrijeet Mukherjee, roopa, nikolay

On Fri, 28 Oct 2016 13:35:02 -0700, Alexander Duyck wrote:
> > At the same time XDP doesn't require to use 4k buffer in something like Netronome.
> > If xdp bpf program can be offloaded into HW with 1800 byte buffers, great!  
> 
> So are you saying this is only really meant to be used with a full bpf
> hardware offload then?

I think Alexei just meant to say I don't have to worry about providing
4k buffers when program is offloaded but on the host a page is the
memory granularity for XDP...

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28 18:22                               ` Alexei Starovoitov
  2016-10-28 20:35                                 ` Alexander Duyck
  2016-10-28 20:36                                 ` Jakub Kicinski
@ 2016-10-29  3:51                                 ` Shrijeet Mukherjee
  2016-10-29 11:25                                   ` Thomas Graf
  2 siblings, 1 reply; 42+ messages in thread
From: Shrijeet Mukherjee @ 2016-10-29  3:51 UTC (permalink / raw)
  To: Alexei Starovoitov, Jakub Kicinski
  Cc: John Fastabend, David Miller, alexander.duyck, mst, brouer,
	shrijeet, tom, netdev, Roopa Prabhu, Nikolay Aleksandrov

> -----Original Message-----
> From: Alexei Starovoitov [mailto:alexei.starovoitov@gmail.com]
> Sent: Friday, October 28, 2016 11:22 AM
> To: Jakub Kicinski <kubakici@wp.pl>
> Cc: John Fastabend <john.fastabend@gmail.com>; David Miller
> <davem@davemloft.net>; alexander.duyck@gmail.com; mst@redhat.com;
> brouer@redhat.com; shrijeet@gmail.com; tom@herbertland.com;
> netdev@vger.kernel.org; shm@cumulusnetworks.com;
> roopa@cumulusnetworks.com; nikolay@cumulusnetworks.com
> Subject: Re: [PATCH net-next RFC WIP] Patch for XDP support for
virtio_net
>
> On Fri, Oct 28, 2016 at 05:18:12PM +0100, Jakub Kicinski wrote:
> > On Fri, 28 Oct 2016 08:56:35 -0700, John Fastabend wrote:
> > > On 16-10-27 07:10 PM, David Miller wrote:
> > > > From: Alexander Duyck <alexander.duyck@gmail.com>
> > > > Date: Thu, 27 Oct 2016 18:43:59 -0700
> > > >
> > > >> On Thu, Oct 27, 2016 at 6:35 PM, David Miller
> <davem@davemloft.net> wrote:
> > > >>> From: "Michael S. Tsirkin" <mst@redhat.com>
> > > >>> Date: Fri, 28 Oct 2016 01:25:48 +0300
> > > >>>
> > > >>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:
> > > >>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
> > > >>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
> > > >>>>>
> > > >>>>>> Something I'd like to understand is how does XDP address the
> > > >>>>>> problem that 100Byte packets are consuming 4K of memory
> now.
> > > >>>>>
> > > >>>>> Via page pools.  We're going to make a generic one, but right
> > > >>>>> now each and every driver implements a quick list of pages to
> > > >>>>> allocate from (and thus avoid the DMA man/unmap overhead,
> > > >>>>> etc.)
> > > >>>>
> > > >>>> So to clarify, ATM virtio doesn't attempt to avoid dma
> > > >>>> map/unmap so there should be no issue with that even when using
> > > >>>> sub/page regions, assuming DMA APIs support sub-page
> map/unmap correctly.
> > > >>>
> > > >>> That's not what I said.
> > > >>>
> > > >>> The page pools are meant to address the performance degradation
> > > >>> from going to having one packet per page for the sake of XDP's
> > > >>> requirements.
> > > >>>
> > > >>> You still need to have one packet per page for correct XDP
> > > >>> operation whether you do page pools or not, and whether you have
> > > >>> DMA mapping (or it's equivalent virutalization operation) or
not.
> > > >>
> > > >> Maybe I am missing something here, but why do you need to limit
> > > >> things to one packet per page for correct XDP operation?  Most of
> > > >> the drivers out there now are usually storing something closer to
> > > >> at least 2 packets per page, and with the DMA API fixes I am
> > > >> working on there should be no issue with changing the contents
> > > >> inside those pages since we won't invalidate or overwrite the
> > > >> data after the DMA buffer has been synchronized for use by the
CPU.
> > > >
> > > > Because with SKB's you can share the page with other packets.
> > > >
> > > > With XDP you simply cannot.
> > > >
> > > > It's software semantics that are the issue.  SKB frag list pages
> > > > are read only, XDP packets are writable.
> > > >
> > > > This has nothing to do with "writability" of the pages wrt. DMA
> > > > mapping or cpu mappings.
> > > >
> > >
> > > Sorry I'm not seeing it either. The current xdp_buff is defined by,
> > >
> > >   struct xdp_buff {
> > > 	void *data;
> > > 	void *data_end;
> > >   };
> > >
> > > The verifier has an xdp_is_valid_access() check to ensure we don't
> > > go past data_end. The page for now at least never leaves the driver.
> > > For the work to get xmit to other devices working I'm still not sure
> > > I see any issue.
> >
> > +1
> >
> > Do we want to make the packet-per-page a requirement because it could
> > be useful in the future from architectural standpoint?  I guess there
> > is a trade-off here between having the comfort of people following
> > this requirement today and making driver support for XDP even more
> complex.
>
> It looks to me that packet per page makes drivers simpler instead of
> complex.
> ixgbe split-page and mlx* order-3/5 tricks are definitely complex.
> The skb truesize concerns come from the host when data is delivered to
> user space and we need to have precise memory accounting for different
> applications and different users. XDP is all root and imo it's far away
from
> the days when multi-user non-root issues start to pop up.
> At the same time XDP doesn't require to use 4k buffer in something like
> Netronome.
> If xdp bpf program can be offloaded into HW with 1800 byte buffers,
great!
> For x86 cpu the 4k byte is a natural allocation chunk. Anything lower
> requires delicate dma tricks paired with even more complex slab
allocator
> and atomic recnts.
> All of that can only drive the performance down.
> Comparing to kernel bypass xdp is using 4k pages whereas dpdk has to use
> huge pages, so xdp is saving a ton of memory already!

Generally agree, but SRIOV nics with multiple queues can end up in a bad
spot if each buffer was 4K right ? I see a specific page pool to be used
by queues which are enabled for XDP as the easiest to swing solution that
way the memory overhead can be restricted to enabled queues and shared
access issues can be restricted to skb's using that pool no ?

Clearly as said later this will not apply to ebpf offload devices ..

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-29  3:51                                 ` Shrijeet Mukherjee
@ 2016-10-29 11:25                                   ` Thomas Graf
  2016-11-02 14:27                                     ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 42+ messages in thread
From: Thomas Graf @ 2016-10-29 11:25 UTC (permalink / raw)
  To: Shrijeet Mukherjee
  Cc: Alexei Starovoitov, Jakub Kicinski, John Fastabend, David Miller,
	alexander.duyck, mst, brouer, shrijeet, tom, netdev,
	Roopa Prabhu, Nikolay Aleksandrov

On 10/28/16 at 08:51pm, Shrijeet Mukherjee wrote:
> Generally agree, but SRIOV nics with multiple queues can end up in a bad
> spot if each buffer was 4K right ? I see a specific page pool to be used
> by queues which are enabled for XDP as the easiest to swing solution that
> way the memory overhead can be restricted to enabled queues and shared
> access issues can be restricted to skb's using that pool no ?

Isn't this clearly a must anyway? I may be missing something
fundamental here so please enlighten me :-)

If we dedicate a page per packet, that could translate to 14M*4K worth
of memory being mapped per second for just a 10G NIC under DoS attack.
How can one protect such as system? Is the assumption that we can always
drop such packets quickly enough before we start dropping randomly due
to memory pressure? If a handshake is required to determine validity
of a packet then that is going to be difficult.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28 17:11                             ` David Miller
@ 2016-10-30 22:53                               ` Michael S. Tsirkin
  2016-11-02 14:01                               ` Jesper Dangaard Brouer
  1 sibling, 0 replies; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-10-30 22:53 UTC (permalink / raw)
  To: David Miller
  Cc: john.fastabend, alexander.duyck, brouer, shrijeet, tom, netdev,
	shm, roopa, nikolay

On Fri, Oct 28, 2016 at 01:11:01PM -0400, David Miller wrote:
> From: John Fastabend <john.fastabend@gmail.com>
> Date: Fri, 28 Oct 2016 08:56:35 -0700
> 
> > On 16-10-27 07:10 PM, David Miller wrote:
> >> From: Alexander Duyck <alexander.duyck@gmail.com>
> >> Date: Thu, 27 Oct 2016 18:43:59 -0700
> >> 
> >>> On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:
> >>>> From: "Michael S. Tsirkin" <mst@redhat.com>
> >>>> Date: Fri, 28 Oct 2016 01:25:48 +0300
> >>>>
> >>>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:
> >>>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
> >>>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
> >>>>>>
> >>>>>>> Something I'd like to understand is how does XDP address the
> >>>>>>> problem that 100Byte packets are consuming 4K of memory now.
> >>>>>>
> >>>>>> Via page pools.  We're going to make a generic one, but right now
> >>>>>> each and every driver implements a quick list of pages to allocate
> >>>>>> from (and thus avoid the DMA man/unmap overhead, etc.)
> >>>>>
> >>>>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
> >>>>> so there should be no issue with that even when using sub/page
> >>>>> regions, assuming DMA APIs support sub-page map/unmap correctly.
> >>>>
> >>>> That's not what I said.
> >>>>
> >>>> The page pools are meant to address the performance degradation from
> >>>> going to having one packet per page for the sake of XDP's
> >>>> requirements.
> >>>>
> >>>> You still need to have one packet per page for correct XDP operation
> >>>> whether you do page pools or not, and whether you have DMA mapping
> >>>> (or it's equivalent virutalization operation) or not.
> >>>
> >>> Maybe I am missing something here, but why do you need to limit things
> >>> to one packet per page for correct XDP operation?  Most of the drivers
> >>> out there now are usually storing something closer to at least 2
> >>> packets per page, and with the DMA API fixes I am working on there
> >>> should be no issue with changing the contents inside those pages since
> >>> we won't invalidate or overwrite the data after the DMA buffer has
> >>> been synchronized for use by the CPU.
> >> 
> >> Because with SKB's you can share the page with other packets.
> >> 
> >> With XDP you simply cannot.
> >> 
> >> It's software semantics that are the issue.  SKB frag list pages
> >> are read only, XDP packets are writable.
> >> 
> >> This has nothing to do with "writability" of the pages wrt. DMA
> >> mapping or cpu mappings.
> >> 
> > 
> > Sorry I'm not seeing it either. The current xdp_buff is defined
> > by,
> > 
> >   struct xdp_buff {
> > 	void *data;
> > 	void *data_end;
> >   };
> > 
> > The verifier has an xdp_is_valid_access() check to ensure we don't go
> > past data_end. The page for now at least never leaves the driver. For
> > the work to get xmit to other devices working I'm still not sure I see
> > any issue.
> 
> I guess I can say that the packets must be "writable" until I'm blue
> in the face but I'll say it again, semantically writable pages are a
> requirement.  And if multiple packets share a page this requirement
> is not satisfied.
> 
> Also, we want to do several things in the future:
> 
> 1) Allow push/pop of headers via eBPF code, which needs we need
>    headroom.

I think that with e.g. LRO or a large MTU page per packet
does not guarantee headroom.

> 2) Transparently zero-copy pass packets into userspace, basically
>    the user will have a semi-permanently mapped ring of all the
>    packet pages sitting in the RX queue of the device and the
>    page pool associated with it.  This way we avoid all of the
>    TLB flush/map overhead for the user's mapping of the packets
>    just as we avoid the DMA map/unmap overhead.

Looks like you can share pages between packets as long as
they all come from the same pool so accessible
to the same userspace.

> And that's just the beginninng.
> 
> I'm sure others can come up with more reasons why we have this
> requirement.

I'm still a bit confused :( Is this a requirement of the current code or
to enable future extensions?

-- 
MST

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-28 17:11                             ` David Miller
  2016-10-30 22:53                               ` Michael S. Tsirkin
@ 2016-11-02 14:01                               ` Jesper Dangaard Brouer
  2016-11-02 16:06                                 ` Alexander Duyck
  1 sibling, 1 reply; 42+ messages in thread
From: Jesper Dangaard Brouer @ 2016-11-02 14:01 UTC (permalink / raw)
  To: David Miller
  Cc: john.fastabend, alexander.duyck, mst, shrijeet, tom, netdev, shm,
	roopa, nikolay, brouer

On Fri, 28 Oct 2016 13:11:01 -0400 (EDT)
David Miller <davem@davemloft.net> wrote:

> From: John Fastabend <john.fastabend@gmail.com>
> Date: Fri, 28 Oct 2016 08:56:35 -0700
> 
> > On 16-10-27 07:10 PM, David Miller wrote:  
> >> From: Alexander Duyck <alexander.duyck@gmail.com>
> >> Date: Thu, 27 Oct 2016 18:43:59 -0700
> >>   
> >>> On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:  
> >>>> From: "Michael S. Tsirkin" <mst@redhat.com>
> >>>> Date: Fri, 28 Oct 2016 01:25:48 +0300
> >>>>  
> >>>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:  
> >>>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
> >>>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
> >>>>>>  
> >>>>>>> Something I'd like to understand is how does XDP address the
> >>>>>>> problem that 100Byte packets are consuming 4K of memory now.  
> >>>>>>
> >>>>>> Via page pools.  We're going to make a generic one, but right now
> >>>>>> each and every driver implements a quick list of pages to allocate
> >>>>>> from (and thus avoid the DMA man/unmap overhead, etc.)  
> >>>>>
> >>>>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
> >>>>> so there should be no issue with that even when using sub/page
> >>>>> regions, assuming DMA APIs support sub-page map/unmap correctly.  
> >>>>
> >>>> That's not what I said.
> >>>>
> >>>> The page pools are meant to address the performance degradation from
> >>>> going to having one packet per page for the sake of XDP's
> >>>> requirements.
> >>>>
> >>>> You still need to have one packet per page for correct XDP operation
> >>>> whether you do page pools or not, and whether you have DMA mapping
> >>>> (or it's equivalent virutalization operation) or not.  
> >>>
> >>> Maybe I am missing something here, but why do you need to limit things
> >>> to one packet per page for correct XDP operation?  Most of the drivers
> >>> out there now are usually storing something closer to at least 2
> >>> packets per page, and with the DMA API fixes I am working on there
> >>> should be no issue with changing the contents inside those pages since
> >>> we won't invalidate or overwrite the data after the DMA buffer has
> >>> been synchronized for use by the CPU.  
> >> 
> >> Because with SKB's you can share the page with other packets.
> >> 
> >> With XDP you simply cannot.
> >> 
> >> It's software semantics that are the issue.  SKB frag list pages
> >> are read only, XDP packets are writable.
> >> 
> >> This has nothing to do with "writability" of the pages wrt. DMA
> >> mapping or cpu mappings.
> >>   
> > 
> > Sorry I'm not seeing it either. The current xdp_buff is defined
> > by,
> > 
> >   struct xdp_buff {
> > 	void *data;
> > 	void *data_end;
> >   };
> > 
> > The verifier has an xdp_is_valid_access() check to ensure we don't go
> > past data_end. The page for now at least never leaves the driver. For
> > the work to get xmit to other devices working I'm still not sure I see
> > any issue.  
> 
> I guess I can say that the packets must be "writable" until I'm blue
> in the face but I'll say it again, semantically writable pages are a
> requirement.  And if multiple packets share a page this requirement
> is not satisfied.
> 
> Also, we want to do several things in the future:
> 
> 1) Allow push/pop of headers via eBPF code, which needs we need
>    headroom.
> 
> 2) Transparently zero-copy pass packets into userspace, basically
>    the user will have a semi-permanently mapped ring of all the
>    packet pages sitting in the RX queue of the device and the
>    page pool associated with it.  This way we avoid all of the
>    TLB flush/map overhead for the user's mapping of the packets
>    just as we avoid the DMA map/unmap overhead.
> 
> And that's just the beginninng.
> 
> I'm sure others can come up with more reasons why we have this
> requirement.

I've tried to update the XDP documentation about the "Page per packet"
requirement[1], fell free to correct below text:

Page per packet
===============

On RX many NIC drivers splitup a memory page, to share it for multiple
packets, in-order to conserve memory.  Doing so complicates handling
and accounting of these memory pages, which affects performance.
Particularly the extra atomic refcnt handling needed for the page can
hurt performance.

XDP defines upfront a memory model where there is only one packet per
page.  This simplifies page handling and open up for future
extensions.

This requirement also (upfront) result in choosing not to support
things like, jumpo-frames, LRO and generally packets split over
multiple pages.

In the future, this strict memory model might be relaxed, but for now
it is a strict requirement.  With a more flexible
:ref:`ref_prog_negotiation` is might be possible to negotiate another
memory model. Given some specific XDP use-case might not require this
strict memory model.



Online here:
 [1] http://prototype-kernel.readthedocs.io/en/latest/networking/XDP/design/requirements.html#page-per-packet

Commit:
 https://github.com/netoptimizer/prototype-kernel/commit/27ece059011e6d5c8a1cb4bdb2ab361cd7faa6dd

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-10-29 11:25                                   ` Thomas Graf
@ 2016-11-02 14:27                                     ` Jesper Dangaard Brouer
  2016-11-03  1:28                                       ` Shrijeet Mukherjee
  0 siblings, 1 reply; 42+ messages in thread
From: Jesper Dangaard Brouer @ 2016-11-02 14:27 UTC (permalink / raw)
  To: Thomas Graf
  Cc: Shrijeet Mukherjee, Alexei Starovoitov, Jakub Kicinski,
	John Fastabend, David Miller, alexander.duyck, mst, shrijeet,
	tom, netdev, Roopa Prabhu, Nikolay Aleksandrov, brouer

On Sat, 29 Oct 2016 13:25:14 +0200
Thomas Graf <tgraf@suug.ch> wrote:

> On 10/28/16 at 08:51pm, Shrijeet Mukherjee wrote:
> > Generally agree, but SRIOV nics with multiple queues can end up in a bad
> > spot if each buffer was 4K right ? I see a specific page pool to be used
> > by queues which are enabled for XDP as the easiest to swing solution that
> > way the memory overhead can be restricted to enabled queues and shared
> > access issues can be restricted to skb's using that pool no ?

Yes, that is why that I've been arguing so strongly for having the
flexibility to attach a XDP program per RX queue, as this only change
the memory model for this one queue.

 
> Isn't this clearly a must anyway? I may be missing something
> fundamental here so please enlighten me :-)
> 
> If we dedicate a page per packet, that could translate to 14M*4K worth
> of memory being mapped per second for just a 10G NIC under DoS attack.
> How can one protect such as system? Is the assumption that we can always
> drop such packets quickly enough before we start dropping randomly due
> to memory pressure? If a handshake is required to determine validity
> of a packet then that is going to be difficult.

Under DoS attacks you don't run out of memory, because a diverse set of
socket memory limits/accounting avoids that situation.  What does
happen is the maximum achievable PPS rate is directly dependent on the
time you spend on each packet.   This use of CPU resources (and
hitting mem-limits-safe-guards) push-back on the drivers speed to
process the RX ring.  In effect, packets are dropped in the NIC HW as
RX-ring queue is not emptied fast-enough.

Given you don't control what HW drops, the attacker will "successfully"
cause your good traffic to be among the dropped packets.

This is where XDP change the picture. If you can express (by eBPF) a
filter that can separate "bad" vs "good" traffic, then you can take
back control.  Almost like controlling what traffic the HW should drop.
Given the cost of XDP-eBPF filter + serving regular traffic does not
use all of your CPU resources, you have overcome the attack.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-11-02 14:01                               ` Jesper Dangaard Brouer
@ 2016-11-02 16:06                                 ` Alexander Duyck
  0 siblings, 0 replies; 42+ messages in thread
From: Alexander Duyck @ 2016-11-02 16:06 UTC (permalink / raw)
  To: Jesper Dangaard Brouer
  Cc: David Miller, John Fastabend, Michael S. Tsirkin,
	Shrijeet Mukherjee, Tom Herbert, Netdev, Shrijeet Mukherjee,
	roopa, nikolay

On Wed, Nov 2, 2016 at 7:01 AM, Jesper Dangaard Brouer
<brouer@redhat.com> wrote:
> On Fri, 28 Oct 2016 13:11:01 -0400 (EDT)
> David Miller <davem@davemloft.net> wrote:
>
>> From: John Fastabend <john.fastabend@gmail.com>
>> Date: Fri, 28 Oct 2016 08:56:35 -0700
>>
>> > On 16-10-27 07:10 PM, David Miller wrote:
>> >> From: Alexander Duyck <alexander.duyck@gmail.com>
>> >> Date: Thu, 27 Oct 2016 18:43:59 -0700
>> >>
>> >>> On Thu, Oct 27, 2016 at 6:35 PM, David Miller <davem@davemloft.net> wrote:
>> >>>> From: "Michael S. Tsirkin" <mst@redhat.com>
>> >>>> Date: Fri, 28 Oct 2016 01:25:48 +0300
>> >>>>
>> >>>>> On Thu, Oct 27, 2016 at 05:42:18PM -0400, David Miller wrote:
>> >>>>>> From: "Michael S. Tsirkin" <mst@redhat.com>
>> >>>>>> Date: Fri, 28 Oct 2016 00:30:35 +0300
>> >>>>>>
>> >>>>>>> Something I'd like to understand is how does XDP address the
>> >>>>>>> problem that 100Byte packets are consuming 4K of memory now.
>> >>>>>>
>> >>>>>> Via page pools.  We're going to make a generic one, but right now
>> >>>>>> each and every driver implements a quick list of pages to allocate
>> >>>>>> from (and thus avoid the DMA man/unmap overhead, etc.)
>> >>>>>
>> >>>>> So to clarify, ATM virtio doesn't attempt to avoid dma map/unmap
>> >>>>> so there should be no issue with that even when using sub/page
>> >>>>> regions, assuming DMA APIs support sub-page map/unmap correctly.
>> >>>>
>> >>>> That's not what I said.
>> >>>>
>> >>>> The page pools are meant to address the performance degradation from
>> >>>> going to having one packet per page for the sake of XDP's
>> >>>> requirements.
>> >>>>
>> >>>> You still need to have one packet per page for correct XDP operation
>> >>>> whether you do page pools or not, and whether you have DMA mapping
>> >>>> (or it's equivalent virutalization operation) or not.
>> >>>
>> >>> Maybe I am missing something here, but why do you need to limit things
>> >>> to one packet per page for correct XDP operation?  Most of the drivers
>> >>> out there now are usually storing something closer to at least 2
>> >>> packets per page, and with the DMA API fixes I am working on there
>> >>> should be no issue with changing the contents inside those pages since
>> >>> we won't invalidate or overwrite the data after the DMA buffer has
>> >>> been synchronized for use by the CPU.
>> >>
>> >> Because with SKB's you can share the page with other packets.
>> >>
>> >> With XDP you simply cannot.
>> >>
>> >> It's software semantics that are the issue.  SKB frag list pages
>> >> are read only, XDP packets are writable.
>> >>
>> >> This has nothing to do with "writability" of the pages wrt. DMA
>> >> mapping or cpu mappings.
>> >>
>> >
>> > Sorry I'm not seeing it either. The current xdp_buff is defined
>> > by,
>> >
>> >   struct xdp_buff {
>> >     void *data;
>> >     void *data_end;
>> >   };
>> >
>> > The verifier has an xdp_is_valid_access() check to ensure we don't go
>> > past data_end. The page for now at least never leaves the driver. For
>> > the work to get xmit to other devices working I'm still not sure I see
>> > any issue.
>>
>> I guess I can say that the packets must be "writable" until I'm blue
>> in the face but I'll say it again, semantically writable pages are a
>> requirement.  And if multiple packets share a page this requirement
>> is not satisfied.
>>
>> Also, we want to do several things in the future:
>>
>> 1) Allow push/pop of headers via eBPF code, which needs we need
>>    headroom.
>>
>> 2) Transparently zero-copy pass packets into userspace, basically
>>    the user will have a semi-permanently mapped ring of all the
>>    packet pages sitting in the RX queue of the device and the
>>    page pool associated with it.  This way we avoid all of the
>>    TLB flush/map overhead for the user's mapping of the packets
>>    just as we avoid the DMA map/unmap overhead.
>>
>> And that's just the beginninng.
>>
>> I'm sure others can come up with more reasons why we have this
>> requirement.
>
> I've tried to update the XDP documentation about the "Page per packet"
> requirement[1], fell free to correct below text:
>
> Page per packet
> ===============
>
> On RX many NIC drivers splitup a memory page, to share it for multiple
> packets, in-order to conserve memory.  Doing so complicates handling
> and accounting of these memory pages, which affects performance.
> Particularly the extra atomic refcnt handling needed for the page can
> hurt performance.

The atomic refcnt handling isn't a big deal.  It is easily worked
around by doing bulk updates.  The allocating and freeing of pages on
the other hand is quite expensive by comparison.

> XDP defines upfront a memory model where there is only one packet per
> page.  This simplifies page handling and open up for future
> extensions.

This is blatantly wrong.  It complicates page handling especially
since you are implying you aren't using the page count which implies a
new memory model like SLUB but not SLUB and hopefully including DMA
pools of some sort.  The fact is the page count tricks are much
cheaper than having to come up with your own page allocator.  In
addition having to share these with the stack means the network stack
and file system has to be updated to handle the special pages you are
sending that aren't using the page count.

> This requirement also (upfront) result in choosing not to support
> things like, jumpo-frames, LRO and generally packets split over
> multiple pages.

Same here.  Page per packet is only limiting if the final frame size
is larger than a page.  So for example on a 4K page I could setup an
82599 to support doing hardware LRO into a single 3K block which would
give you 2 frames per packet.  In the case of pages larger than 4k it
wouldn't be hard to support jumbo frames.  Where it gets messier is
the headroom requirements since I have yet to hear how that is going
to be communicated to the XPS programs.

> In the future, this strict memory model might be relaxed, but for now
> it is a strict requirement.  With a more flexible
> :ref:`ref_prog_negotiation` is might be possible to negotiate another
> memory model. Given some specific XDP use-case might not require this
> strict memory model.

Honestly, I think making this a hard requirement is going to hurt
overall performance for XDP.  I get that it is needed if you are
planning to support a use case where userspace is somehow sharing
buffers with the ring but I really think that it is getting the cart
ahead of the horse since in that case you should have a ring that is
completely reserved for that userspace application anyway and we
haven't figured out a good way to address that.  All of this page per
packet stuff just seems like premature optimization for a use case
that doesn't yet exist.

In the cases where packets are just getting sent into the network
stack with XDP as a bump in the wire I don't see the point in having
the entire page reserved for a single packet.  It seems like a fast
way to start leaking memory and eating up resources since every frame
going into the stack is going to end up with a truesize of over 4K(64K
in some cases) unless we introduce some sort of copy-break.

Anyway that is my $.02 on all this.

- Alex

^ permalink raw reply	[flat|nested] 42+ messages in thread

* RE: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-11-02 14:27                                     ` Jesper Dangaard Brouer
@ 2016-11-03  1:28                                       ` Shrijeet Mukherjee
  2016-11-03  4:11                                         ` Michael S. Tsirkin
  0 siblings, 1 reply; 42+ messages in thread
From: Shrijeet Mukherjee @ 2016-11-03  1:28 UTC (permalink / raw)
  To: Jesper Dangaard Brouer, Thomas Graf
  Cc: Alexei Starovoitov, Jakub Kicinski, John Fastabend, David Miller,
	alexander.duyck, mst, shrijeet, tom, netdev, Roopa Prabhu,
	Nikolay Aleksandrov

> -----Original Message-----
> From: Jesper Dangaard Brouer [mailto:brouer@redhat.com]
> Sent: Wednesday, November 2, 2016 7:27 AM
> To: Thomas Graf <tgraf@suug.ch>
> Cc: Shrijeet Mukherjee <shm@cumulusnetworks.com>; Alexei Starovoitov
> <alexei.starovoitov@gmail.com>; Jakub Kicinski <kubakici@wp.pl>; John
> Fastabend <john.fastabend@gmail.com>; David Miller
> <davem@davemloft.net>; alexander.duyck@gmail.com; mst@redhat.com;
> shrijeet@gmail.com; tom@herbertland.com; netdev@vger.kernel.org;
> Roopa Prabhu <roopa@cumulusnetworks.com>; Nikolay Aleksandrov
> <nikolay@cumulusnetworks.com>; brouer@redhat.com
> Subject: Re: [PATCH net-next RFC WIP] Patch for XDP support for
virtio_net
>
> On Sat, 29 Oct 2016 13:25:14 +0200
> Thomas Graf <tgraf@suug.ch> wrote:
>
> > On 10/28/16 at 08:51pm, Shrijeet Mukherjee wrote:
> > > Generally agree, but SRIOV nics with multiple queues can end up in a
> > > bad spot if each buffer was 4K right ? I see a specific page pool to
> > > be used by queues which are enabled for XDP as the easiest to swing
> > > solution that way the memory overhead can be restricted to enabled
> > > queues and shared access issues can be restricted to skb's using
that
> pool no ?
>
> Yes, that is why that I've been arguing so strongly for having the
flexibility to
> attach a XDP program per RX queue, as this only change the memory model
> for this one queue.
>
>
> > Isn't this clearly a must anyway? I may be missing something
> > fundamental here so please enlighten me :-)
> >
> > If we dedicate a page per packet, that could translate to 14M*4K worth
> > of memory being mapped per second for just a 10G NIC under DoS attack.
> > How can one protect such as system? Is the assumption that we can
> > always drop such packets quickly enough before we start dropping
> > randomly due to memory pressure? If a handshake is required to
> > determine validity of a packet then that is going to be difficult.
>
> Under DoS attacks you don't run out of memory, because a diverse set of
> socket memory limits/accounting avoids that situation.  What does happen
> is the maximum achievable PPS rate is directly dependent on the
> time you spend on each packet.   This use of CPU resources (and
> hitting mem-limits-safe-guards) push-back on the drivers speed to
process
> the RX ring.  In effect, packets are dropped in the NIC HW as RX-ring
queue
> is not emptied fast-enough.
>
> Given you don't control what HW drops, the attacker will "successfully"
> cause your good traffic to be among the dropped packets.
>
> This is where XDP change the picture. If you can express (by eBPF) a
filter
> that can separate "bad" vs "good" traffic, then you can take back
control.
> Almost like controlling what traffic the HW should drop.
> Given the cost of XDP-eBPF filter + serving regular traffic does not use
all of
> your CPU resources, you have overcome the attack.
>
> --
Jesper,  John et al .. to make this a little concrete I am going to spin
up a v2 which has only bigbuffers mode enabled for xdp acceleration, all
other modes will reject the xdp ndo ..

Do we have agreement on that model ?

It will need that all vhost implementations will need to start with
mergeable buffers disabled to get xdp goodness, but that sounds like a
safe thing to do for now ..

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-11-03  1:28                                       ` Shrijeet Mukherjee
@ 2016-11-03  4:11                                         ` Michael S. Tsirkin
  2016-11-03  6:44                                           ` John Fastabend
  0 siblings, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-11-03  4:11 UTC (permalink / raw)
  To: Shrijeet Mukherjee
  Cc: Jesper Dangaard Brouer, Thomas Graf, Alexei Starovoitov,
	Jakub Kicinski, John Fastabend, David Miller, alexander.duyck,
	shrijeet, tom, netdev, Roopa Prabhu, Nikolay Aleksandrov

On Wed, Nov 02, 2016 at 06:28:34PM -0700, Shrijeet Mukherjee wrote:
> > -----Original Message-----
> > From: Jesper Dangaard Brouer [mailto:brouer@redhat.com]
> > Sent: Wednesday, November 2, 2016 7:27 AM
> > To: Thomas Graf <tgraf@suug.ch>
> > Cc: Shrijeet Mukherjee <shm@cumulusnetworks.com>; Alexei Starovoitov
> > <alexei.starovoitov@gmail.com>; Jakub Kicinski <kubakici@wp.pl>; John
> > Fastabend <john.fastabend@gmail.com>; David Miller
> > <davem@davemloft.net>; alexander.duyck@gmail.com; mst@redhat.com;
> > shrijeet@gmail.com; tom@herbertland.com; netdev@vger.kernel.org;
> > Roopa Prabhu <roopa@cumulusnetworks.com>; Nikolay Aleksandrov
> > <nikolay@cumulusnetworks.com>; brouer@redhat.com
> > Subject: Re: [PATCH net-next RFC WIP] Patch for XDP support for
> virtio_net
> >
> > On Sat, 29 Oct 2016 13:25:14 +0200
> > Thomas Graf <tgraf@suug.ch> wrote:
> >
> > > On 10/28/16 at 08:51pm, Shrijeet Mukherjee wrote:
> > > > Generally agree, but SRIOV nics with multiple queues can end up in a
> > > > bad spot if each buffer was 4K right ? I see a specific page pool to
> > > > be used by queues which are enabled for XDP as the easiest to swing
> > > > solution that way the memory overhead can be restricted to enabled
> > > > queues and shared access issues can be restricted to skb's using
> that
> > pool no ?
> >
> > Yes, that is why that I've been arguing so strongly for having the
> flexibility to
> > attach a XDP program per RX queue, as this only change the memory model
> > for this one queue.
> >
> >
> > > Isn't this clearly a must anyway? I may be missing something
> > > fundamental here so please enlighten me :-)
> > >
> > > If we dedicate a page per packet, that could translate to 14M*4K worth
> > > of memory being mapped per second for just a 10G NIC under DoS attack.
> > > How can one protect such as system? Is the assumption that we can
> > > always drop such packets quickly enough before we start dropping
> > > randomly due to memory pressure? If a handshake is required to
> > > determine validity of a packet then that is going to be difficult.
> >
> > Under DoS attacks you don't run out of memory, because a diverse set of
> > socket memory limits/accounting avoids that situation.  What does happen
> > is the maximum achievable PPS rate is directly dependent on the
> > time you spend on each packet.   This use of CPU resources (and
> > hitting mem-limits-safe-guards) push-back on the drivers speed to
> process
> > the RX ring.  In effect, packets are dropped in the NIC HW as RX-ring
> queue
> > is not emptied fast-enough.
> >
> > Given you don't control what HW drops, the attacker will "successfully"
> > cause your good traffic to be among the dropped packets.
> >
> > This is where XDP change the picture. If you can express (by eBPF) a
> filter
> > that can separate "bad" vs "good" traffic, then you can take back
> control.
> > Almost like controlling what traffic the HW should drop.
> > Given the cost of XDP-eBPF filter + serving regular traffic does not use
> all of
> > your CPU resources, you have overcome the attack.
> >
> > --
> Jesper,  John et al .. to make this a little concrete I am going to spin
> up a v2 which has only bigbuffers mode enabled for xdp acceleration, all
> other modes will reject the xdp ndo ..
> 
> Do we have agreement on that model ?
> 
> It will need that all vhost implementations will need to start with
> mergeable buffers disabled to get xdp goodness, but that sounds like a
> safe thing to do for now ..

It's ok for experimentation, but really after speaking with Alexei it's
clear to me that xdp should have a separate code path in the driver,
e.g. the separation between modes is something that does not
make sense for xdp.

The way I imagine it working:

- when XDP is attached disable all LRO using VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
  (not used by driver so far, designed to allow dynamic LRO control with
   ethtool)
- start adding page-sized buffers
- do something with non-page-sized buffers added previously - what
  exactly? copy I guess? What about LRO packets that are too large -
  can we drop or can we split them up?

I'm fine with disabling XDP for some configurations as the first step,
and we can add that support later.

Ideas about mergeable buffers (optional):

At the moment mergeable buffers can't be disabled dynamically.
They do bring a small benefit for XDP if host MTU is large (see below)
and aren't hard to support:
- if header is by itself skip 1st page
- otherwise copy all data into first page
and it's nicer not to add random limitations that require guest reboot.
It might make sense to add a command that disables/enabled
mergeable buffers dynamically but that's for newer hosts.

Spec does not require it but in practice most hosts put all data
in the 1st page or all in the 2nd page so the copy will be nop
for these cases.

Large host MTU - newer hosts report the host MTU, older ones don't.
Using mergeable buffers we can at least detect this case
(and then what? drop I guess).




-- 
MST

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-11-03  4:11                                         ` Michael S. Tsirkin
@ 2016-11-03  6:44                                           ` John Fastabend
  2016-11-03 22:20                                             ` John Fastabend
  2016-11-03 22:42                                             ` Michael S. Tsirkin
  0 siblings, 2 replies; 42+ messages in thread
From: John Fastabend @ 2016-11-03  6:44 UTC (permalink / raw)
  To: Michael S. Tsirkin, Shrijeet Mukherjee
  Cc: Jesper Dangaard Brouer, Thomas Graf, Alexei Starovoitov,
	Jakub Kicinski, David Miller, alexander.duyck, shrijeet, tom,
	netdev, Roopa Prabhu, Nikolay Aleksandrov

On 16-11-02 09:11 PM, Michael S. Tsirkin wrote:
> On Wed, Nov 02, 2016 at 06:28:34PM -0700, Shrijeet Mukherjee wrote:
>>> -----Original Message-----
>>> From: Jesper Dangaard Brouer [mailto:brouer@redhat.com]
>>> Sent: Wednesday, November 2, 2016 7:27 AM
>>> To: Thomas Graf <tgraf@suug.ch>
>>> Cc: Shrijeet Mukherjee <shm@cumulusnetworks.com>; Alexei Starovoitov
>>> <alexei.starovoitov@gmail.com>; Jakub Kicinski <kubakici@wp.pl>; John
>>> Fastabend <john.fastabend@gmail.com>; David Miller
>>> <davem@davemloft.net>; alexander.duyck@gmail.com; mst@redhat.com;
>>> shrijeet@gmail.com; tom@herbertland.com; netdev@vger.kernel.org;
>>> Roopa Prabhu <roopa@cumulusnetworks.com>; Nikolay Aleksandrov
>>> <nikolay@cumulusnetworks.com>; brouer@redhat.com
>>> Subject: Re: [PATCH net-next RFC WIP] Patch for XDP support for
>> virtio_net
>>>
>>> On Sat, 29 Oct 2016 13:25:14 +0200
>>> Thomas Graf <tgraf@suug.ch> wrote:
>>>
>>>> On 10/28/16 at 08:51pm, Shrijeet Mukherjee wrote:
>>>>> Generally agree, but SRIOV nics with multiple queues can end up in a
>>>>> bad spot if each buffer was 4K right ? I see a specific page pool to
>>>>> be used by queues which are enabled for XDP as the easiest to swing
>>>>> solution that way the memory overhead can be restricted to enabled
>>>>> queues and shared access issues can be restricted to skb's using
>> that
>>> pool no ?
>>>
>>> Yes, that is why that I've been arguing so strongly for having the
>> flexibility to
>>> attach a XDP program per RX queue, as this only change the memory model
>>> for this one queue.
>>>
>>>
>>>> Isn't this clearly a must anyway? I may be missing something
>>>> fundamental here so please enlighten me :-)
>>>>
>>>> If we dedicate a page per packet, that could translate to 14M*4K worth
>>>> of memory being mapped per second for just a 10G NIC under DoS attack.
>>>> How can one protect such as system? Is the assumption that we can
>>>> always drop such packets quickly enough before we start dropping
>>>> randomly due to memory pressure? If a handshake is required to
>>>> determine validity of a packet then that is going to be difficult.
>>>
>>> Under DoS attacks you don't run out of memory, because a diverse set of
>>> socket memory limits/accounting avoids that situation.  What does happen
>>> is the maximum achievable PPS rate is directly dependent on the
>>> time you spend on each packet.   This use of CPU resources (and
>>> hitting mem-limits-safe-guards) push-back on the drivers speed to
>> process
>>> the RX ring.  In effect, packets are dropped in the NIC HW as RX-ring
>> queue
>>> is not emptied fast-enough.
>>>
>>> Given you don't control what HW drops, the attacker will "successfully"
>>> cause your good traffic to be among the dropped packets.
>>>
>>> This is where XDP change the picture. If you can express (by eBPF) a
>> filter
>>> that can separate "bad" vs "good" traffic, then you can take back
>> control.
>>> Almost like controlling what traffic the HW should drop.
>>> Given the cost of XDP-eBPF filter + serving regular traffic does not use
>> all of
>>> your CPU resources, you have overcome the attack.
>>>
>>> --
>> Jesper,  John et al .. to make this a little concrete I am going to spin
>> up a v2 which has only bigbuffers mode enabled for xdp acceleration, all
>> other modes will reject the xdp ndo ..
>>
>> Do we have agreement on that model ?
>>
>> It will need that all vhost implementations will need to start with
>> mergeable buffers disabled to get xdp goodness, but that sounds like a
>> safe thing to do for now ..
> 
> It's ok for experimentation, but really after speaking with Alexei it's
> clear to me that xdp should have a separate code path in the driver,
> e.g. the separation between modes is something that does not
> make sense for xdp.
> 
> The way I imagine it working:

OK I tried to make some sense out of this and get it working,

> 
> - when XDP is attached disable all LRO using VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
>   (not used by driver so far, designed to allow dynamic LRO control with
>    ethtool)

I see there is a UAPI bit for this but I guess we also need to add
support to vhost as well? Seems otherwise we may just drop a bunch
of packets on the floor out of handle_rx() when recvmsg returns larger
than a page size. Or did I read this wrong...

> - start adding page-sized buffers

I started to mangle add_recvbuf_big() and receive_big() here and this
didn't seem too bad.

> - do something with non-page-sized buffers added previously - what
>   exactly? copy I guess? What about LRO packets that are too large -
>   can we drop or can we split them up?

hmm not sure I understand this here. With LRO disabled and mergeable
buffers disabled all packets should fit in a page correct?

With LRO enabled case I guess to start with we block XDP from being
loaded for the same reason we don't allow jumbo frames on physical
nics.

> 
> I'm fine with disabling XDP for some configurations as the first step,
> and we can add that support later.
> 

In order for this to work though I guess we need to be able to
dynamically disable mergeable buffers at the moment I just commented
it out of the features list and fixed up virtio_has_features so it
wont bug_on.

> Ideas about mergeable buffers (optional):
> 
> At the moment mergeable buffers can't be disabled dynamically.
> They do bring a small benefit for XDP if host MTU is large (see below)
> and aren't hard to support:
> - if header is by itself skip 1st page
> - otherwise copy all data into first page
> and it's nicer not to add random limitations that require guest reboot.
> It might make sense to add a command that disables/enabled
> mergeable buffers dynamically but that's for newer hosts.

Yep it seems disabling mergeable buffers solves this but didn't look at
it too closely. I'll look closer tomorrow.

> 
> Spec does not require it but in practice most hosts put all data
> in the 1st page or all in the 2nd page so the copy will be nop
> for these cases.
> 
> Large host MTU - newer hosts report the host MTU, older ones don't.
> Using mergeable buffers we can at least detect this case
> (and then what? drop I guess).
> 

The physical nics just refuse to load XDP with large MTU. Any reason
not to negotiate the mtu with the guest so that the guest can force
this?

> 
> 
> 

Thanks,
John

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-11-03  6:44                                           ` John Fastabend
@ 2016-11-03 22:20                                             ` John Fastabend
  2016-11-03 22:42                                             ` Michael S. Tsirkin
  1 sibling, 0 replies; 42+ messages in thread
From: John Fastabend @ 2016-11-03 22:20 UTC (permalink / raw)
  To: Michael S. Tsirkin, Shrijeet Mukherjee
  Cc: Jesper Dangaard Brouer, Thomas Graf, Alexei Starovoitov,
	Jakub Kicinski, David Miller, alexander.duyck, shrijeet, tom,
	netdev, Roopa Prabhu, Nikolay Aleksandrov

On 16-11-02 11:44 PM, John Fastabend wrote:
> On 16-11-02 09:11 PM, Michael S. Tsirkin wrote:
>> On Wed, Nov 02, 2016 at 06:28:34PM -0700, Shrijeet Mukherjee wrote:
>>>> -----Original Message-----
>>>> From: Jesper Dangaard Brouer [mailto:brouer@redhat.com]
>>>> Sent: Wednesday, November 2, 2016 7:27 AM
>>>> To: Thomas Graf <tgraf@suug.ch>
>>>> Cc: Shrijeet Mukherjee <shm@cumulusnetworks.com>; Alexei Starovoitov
>>>> <alexei.starovoitov@gmail.com>; Jakub Kicinski <kubakici@wp.pl>; John
>>>> Fastabend <john.fastabend@gmail.com>; David Miller
>>>> <davem@davemloft.net>; alexander.duyck@gmail.com; mst@redhat.com;
>>>> shrijeet@gmail.com; tom@herbertland.com; netdev@vger.kernel.org;
>>>> Roopa Prabhu <roopa@cumulusnetworks.com>; Nikolay Aleksandrov
>>>> <nikolay@cumulusnetworks.com>; brouer@redhat.com
>>>> Subject: Re: [PATCH net-next RFC WIP] Patch for XDP support for
>>> virtio_net
>>>>
>>>> On Sat, 29 Oct 2016 13:25:14 +0200
>>>> Thomas Graf <tgraf@suug.ch> wrote:
>>>>
>>>>> On 10/28/16 at 08:51pm, Shrijeet Mukherjee wrote:
>>>>>> Generally agree, but SRIOV nics with multiple queues can end up in a
>>>>>> bad spot if each buffer was 4K right ? I see a specific page pool to
>>>>>> be used by queues which are enabled for XDP as the easiest to swing
>>>>>> solution that way the memory overhead can be restricted to enabled
>>>>>> queues and shared access issues can be restricted to skb's using
>>> that
>>>> pool no ?
>>>>
>>>> Yes, that is why that I've been arguing so strongly for having the
>>> flexibility to
>>>> attach a XDP program per RX queue, as this only change the memory model
>>>> for this one queue.
>>>>
>>>>
>>>>> Isn't this clearly a must anyway? I may be missing something
>>>>> fundamental here so please enlighten me :-)
>>>>>
>>>>> If we dedicate a page per packet, that could translate to 14M*4K worth
>>>>> of memory being mapped per second for just a 10G NIC under DoS attack.
>>>>> How can one protect such as system? Is the assumption that we can
>>>>> always drop such packets quickly enough before we start dropping
>>>>> randomly due to memory pressure? If a handshake is required to
>>>>> determine validity of a packet then that is going to be difficult.
>>>>
>>>> Under DoS attacks you don't run out of memory, because a diverse set of
>>>> socket memory limits/accounting avoids that situation.  What does happen
>>>> is the maximum achievable PPS rate is directly dependent on the
>>>> time you spend on each packet.   This use of CPU resources (and
>>>> hitting mem-limits-safe-guards) push-back on the drivers speed to
>>> process
>>>> the RX ring.  In effect, packets are dropped in the NIC HW as RX-ring
>>> queue
>>>> is not emptied fast-enough.
>>>>
>>>> Given you don't control what HW drops, the attacker will "successfully"
>>>> cause your good traffic to be among the dropped packets.
>>>>
>>>> This is where XDP change the picture. If you can express (by eBPF) a
>>> filter
>>>> that can separate "bad" vs "good" traffic, then you can take back
>>> control.
>>>> Almost like controlling what traffic the HW should drop.
>>>> Given the cost of XDP-eBPF filter + serving regular traffic does not use
>>> all of
>>>> your CPU resources, you have overcome the attack.
>>>>
>>>> --
>>> Jesper,  John et al .. to make this a little concrete I am going to spin
>>> up a v2 which has only bigbuffers mode enabled for xdp acceleration, all
>>> other modes will reject the xdp ndo ..
>>>
>>> Do we have agreement on that model ?
>>>
>>> It will need that all vhost implementations will need to start with
>>> mergeable buffers disabled to get xdp goodness, but that sounds like a
>>> safe thing to do for now ..
>>
>> It's ok for experimentation, but really after speaking with Alexei it's
>> clear to me that xdp should have a separate code path in the driver,
>> e.g. the separation between modes is something that does not
>> make sense for xdp.
>>
>> The way I imagine it working:
> 
> OK I tried to make some sense out of this and get it working,
> 
>>
>> - when XDP is attached disable all LRO using VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
>>   (not used by driver so far, designed to allow dynamic LRO control with
>>    ethtool)
> 
> I see there is a UAPI bit for this but I guess we also need to add
> support to vhost as well? Seems otherwise we may just drop a bunch
> of packets on the floor out of handle_rx() when recvmsg returns larger
> than a page size. Or did I read this wrong...
> 
>> - start adding page-sized buffers
> 
> I started to mangle add_recvbuf_big() and receive_big() here and this
> didn't seem too bad.
> 
>> - do something with non-page-sized buffers added previously - what
>>   exactly? copy I guess? What about LRO packets that are too large -
>>   can we drop or can we split them up?
> 
> hmm not sure I understand this here. With LRO disabled and mergeable
> buffers disabled all packets should fit in a page correct?
> 
> With LRO enabled case I guess to start with we block XDP from being
> loaded for the same reason we don't allow jumbo frames on physical
> nics.
> 
>>
>> I'm fine with disabling XDP for some configurations as the first step,
>> and we can add that support later.
>>
> 
> In order for this to work though I guess we need to be able to
> dynamically disable mergeable buffers at the moment I just commented
> it out of the features list and fixed up virtio_has_features so it
> wont bug_on.
> 
>> Ideas about mergeable buffers (optional):
>>
>> At the moment mergeable buffers can't be disabled dynamically.
>> They do bring a small benefit for XDP if host MTU is large (see below)
>> and aren't hard to support:
>> - if header is by itself skip 1st page
>> - otherwise copy all data into first page
>> and it's nicer not to add random limitations that require guest reboot.
>> It might make sense to add a command that disables/enabled
>> mergeable buffers dynamically but that's for newer hosts.
> 
> Yep it seems disabling mergeable buffers solves this but didn't look at
> it too closely. I'll look closer tomorrow.
> 

Actually after some more research I can't see why with LRO off using
mergeable buffers causes any issues? For XDP all we need is a pointer
to contiguous data and the length. Seems all conditions are met here
assuming I'm reading the code correctly.

Any more hints on this comment "otherwise copy all data into first
page" is required?

>>
>> Spec does not require it but in practice most hosts put all data
>> in the 1st page or all in the 2nd page so the copy will be nop
>> for these cases.
>>
>> Large host MTU - newer hosts report the host MTU, older ones don't.
>> Using mergeable buffers we can at least detect this case
>> (and then what? drop I guess).
>>
> 
> The physical nics just refuse to load XDP with large MTU. Any reason
> not to negotiate the mtu with the guest so that the guest can force
> this?
> 
>>
>>
>>
> 
> Thanks,
> John
> 

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-11-03  6:44                                           ` John Fastabend
  2016-11-03 22:20                                             ` John Fastabend
@ 2016-11-03 22:42                                             ` Michael S. Tsirkin
  2016-11-03 23:29                                               ` John Fastabend
  1 sibling, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-11-03 22:42 UTC (permalink / raw)
  To: John Fastabend
  Cc: Shrijeet Mukherjee, Jesper Dangaard Brouer, Thomas Graf,
	Alexei Starovoitov, Jakub Kicinski, David Miller,
	alexander.duyck, shrijeet, tom, netdev, Roopa Prabhu,
	Nikolay Aleksandrov, aconole

On Wed, Nov 02, 2016 at 11:44:33PM -0700, John Fastabend wrote:
> On 16-11-02 09:11 PM, Michael S. Tsirkin wrote:
> > On Wed, Nov 02, 2016 at 06:28:34PM -0700, Shrijeet Mukherjee wrote:
> >>> -----Original Message-----
> >>> From: Jesper Dangaard Brouer [mailto:brouer@redhat.com]
> >>> Sent: Wednesday, November 2, 2016 7:27 AM
> >>> To: Thomas Graf <tgraf@suug.ch>
> >>> Cc: Shrijeet Mukherjee <shm@cumulusnetworks.com>; Alexei Starovoitov
> >>> <alexei.starovoitov@gmail.com>; Jakub Kicinski <kubakici@wp.pl>; John
> >>> Fastabend <john.fastabend@gmail.com>; David Miller
> >>> <davem@davemloft.net>; alexander.duyck@gmail.com; mst@redhat.com;
> >>> shrijeet@gmail.com; tom@herbertland.com; netdev@vger.kernel.org;
> >>> Roopa Prabhu <roopa@cumulusnetworks.com>; Nikolay Aleksandrov
> >>> <nikolay@cumulusnetworks.com>; brouer@redhat.com
> >>> Subject: Re: [PATCH net-next RFC WIP] Patch for XDP support for
> >> virtio_net
> >>>
> >>> On Sat, 29 Oct 2016 13:25:14 +0200
> >>> Thomas Graf <tgraf@suug.ch> wrote:
> >>>
> >>>> On 10/28/16 at 08:51pm, Shrijeet Mukherjee wrote:
> >>>>> Generally agree, but SRIOV nics with multiple queues can end up in a
> >>>>> bad spot if each buffer was 4K right ? I see a specific page pool to
> >>>>> be used by queues which are enabled for XDP as the easiest to swing
> >>>>> solution that way the memory overhead can be restricted to enabled
> >>>>> queues and shared access issues can be restricted to skb's using
> >> that
> >>> pool no ?
> >>>
> >>> Yes, that is why that I've been arguing so strongly for having the
> >> flexibility to
> >>> attach a XDP program per RX queue, as this only change the memory model
> >>> for this one queue.
> >>>
> >>>
> >>>> Isn't this clearly a must anyway? I may be missing something
> >>>> fundamental here so please enlighten me :-)
> >>>>
> >>>> If we dedicate a page per packet, that could translate to 14M*4K worth
> >>>> of memory being mapped per second for just a 10G NIC under DoS attack.
> >>>> How can one protect such as system? Is the assumption that we can
> >>>> always drop such packets quickly enough before we start dropping
> >>>> randomly due to memory pressure? If a handshake is required to
> >>>> determine validity of a packet then that is going to be difficult.
> >>>
> >>> Under DoS attacks you don't run out of memory, because a diverse set of
> >>> socket memory limits/accounting avoids that situation.  What does happen
> >>> is the maximum achievable PPS rate is directly dependent on the
> >>> time you spend on each packet.   This use of CPU resources (and
> >>> hitting mem-limits-safe-guards) push-back on the drivers speed to
> >> process
> >>> the RX ring.  In effect, packets are dropped in the NIC HW as RX-ring
> >> queue
> >>> is not emptied fast-enough.
> >>>
> >>> Given you don't control what HW drops, the attacker will "successfully"
> >>> cause your good traffic to be among the dropped packets.
> >>>
> >>> This is where XDP change the picture. If you can express (by eBPF) a
> >> filter
> >>> that can separate "bad" vs "good" traffic, then you can take back
> >> control.
> >>> Almost like controlling what traffic the HW should drop.
> >>> Given the cost of XDP-eBPF filter + serving regular traffic does not use
> >> all of
> >>> your CPU resources, you have overcome the attack.
> >>>
> >>> --
> >> Jesper,  John et al .. to make this a little concrete I am going to spin
> >> up a v2 which has only bigbuffers mode enabled for xdp acceleration, all
> >> other modes will reject the xdp ndo ..
> >>
> >> Do we have agreement on that model ?
> >>
> >> It will need that all vhost implementations will need to start with
> >> mergeable buffers disabled to get xdp goodness, but that sounds like a
> >> safe thing to do for now ..
> > 
> > It's ok for experimentation, but really after speaking with Alexei it's
> > clear to me that xdp should have a separate code path in the driver,
> > e.g. the separation between modes is something that does not
> > make sense for xdp.
> > 
> > The way I imagine it working:
> 
> OK I tried to make some sense out of this and get it working,
> 
> > 
> > - when XDP is attached disable all LRO using VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
> >   (not used by driver so far, designed to allow dynamic LRO control with
> >    ethtool)
> 
> I see there is a UAPI bit for this but I guess we also need to add
> support to vhost as well? Seems otherwise we may just drop a bunch
> of packets on the floor out of handle_rx() when recvmsg returns larger
> than a page size. Or did I read this wrong...

It's already supported host side. However you might
get some packets that were in flight when you attached.

> > - start adding page-sized buffers
> 
> I started to mangle add_recvbuf_big() and receive_big() here and this
> didn't seem too bad.

I imagine it won't be ATM but I think we'll need to support
mrg buffers with time and then it will be messy.
Besides, it's not an architectural thing that receive_big
uses page sized buffers, it could use any size.
So a separate path just for xdp would be better imho.

> > - do something with non-page-sized buffers added previously - what
> >   exactly? copy I guess? What about LRO packets that are too large -
> >   can we drop or can we split them up?
> 
> hmm not sure I understand this here. With LRO disabled and mergeable
> buffers disabled all packets should fit in a page correct?

Assuing F_MTU is negotiated and MTU field is small enough, yes.
But if you disable mrg buffers dynamically you will get some packets
in buffers that were added before the disable.
Similarly for disabling LRO dynamically.

> With LRO enabled case I guess to start with we block XDP from being
> loaded for the same reason we don't allow jumbo frames on physical
> nics.

If you ask that host disables the capability, then yes, it's easy.
Let's do that for now, it's a start.


> > 
> > I'm fine with disabling XDP for some configurations as the first step,
> > and we can add that support later.
> > 
> 
> In order for this to work though I guess we need to be able to
> dynamically disable mergeable buffers at the moment I just commented
> it out of the features list and fixed up virtio_has_features so it
> wont bug_on.

For now we can just set mrg_rxbuf=off on qemu command line, and
fail XDP attach if not there. I think we'll be able to support it
long term but you will need host side changes, or fully reset
device and reconfigure it.

> > Ideas about mergeable buffers (optional):
> > 
> > At the moment mergeable buffers can't be disabled dynamically.
> > They do bring a small benefit for XDP if host MTU is large (see below)
> > and aren't hard to support:
> > - if header is by itself skip 1st page
> > - otherwise copy all data into first page
> > and it's nicer not to add random limitations that require guest reboot.
> > It might make sense to add a command that disables/enabled
> > mergeable buffers dynamically but that's for newer hosts.
> 
> Yep it seems disabling mergeable buffers solves this but didn't look at
> it too closely. I'll look closer tomorrow.
> 
> > 
> > Spec does not require it but in practice most hosts put all data
> > in the 1st page or all in the 2nd page so the copy will be nop
> > for these cases.
> > 
> > Large host MTU - newer hosts report the host MTU, older ones don't.
> > Using mergeable buffers we can at least detect this case
> > (and then what? drop I guess).
> > 
> 
> The physical nics just refuse to load XDP with large MTU.

So let's do the same for now, unfortunately you don't know
the MTU unless _F_MTU is negitiated and QEMU does not
implement that yet, but it's easy to add.
In fact I suspect Aaron (cc) has an implementation since
he posted a patch implementing that.
Aaron could you post it pls?

> Any reason
> not to negotiate the mtu with the guest so that the guest can force
> this?

There are generally many guests and many NICs on the host.
A big packet arrives, what do you want to do with it?
We probably want to build propagating MTU across all VMs and NICs
but let's get a basic thing merged first.

> > 
> > 
> > 
> 
> Thanks,
> John

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-11-03 22:42                                             ` Michael S. Tsirkin
@ 2016-11-03 23:29                                               ` John Fastabend
  2016-11-04  0:34                                                 ` Michael S. Tsirkin
  0 siblings, 1 reply; 42+ messages in thread
From: John Fastabend @ 2016-11-03 23:29 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Shrijeet Mukherjee, Jesper Dangaard Brouer, Thomas Graf,
	Alexei Starovoitov, Jakub Kicinski, David Miller,
	alexander.duyck, shrijeet, tom, netdev, Roopa Prabhu,
	Nikolay Aleksandrov, aconole

[...]

>>> - when XDP is attached disable all LRO using VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
>>>   (not used by driver so far, designed to allow dynamic LRO control with
>>>    ethtool)
>>
>> I see there is a UAPI bit for this but I guess we also need to add
>> support to vhost as well? Seems otherwise we may just drop a bunch
>> of packets on the floor out of handle_rx() when recvmsg returns larger
>> than a page size. Or did I read this wrong...
> 
> It's already supported host side. However you might
> get some packets that were in flight when you attached.
> 

Really I must have missed it I don't see any *GUEST_FEATURES* flag in
./drivers/vhost/?

>>> - start adding page-sized buffers
>>
>> I started to mangle add_recvbuf_big() and receive_big() here and this
>> didn't seem too bad.
> 
> I imagine it won't be ATM but I think we'll need to support
> mrg buffers with time and then it will be messy.
> Besides, it's not an architectural thing that receive_big
> uses page sized buffers, it could use any size.
> So a separate path just for xdp would be better imho.
> 
>>> - do something with non-page-sized buffers added previously - what
>>>   exactly? copy I guess? What about LRO packets that are too large -
>>>   can we drop or can we split them up?
>>
>> hmm not sure I understand this here. With LRO disabled and mergeable
>> buffers disabled all packets should fit in a page correct?
> 
> Assuing F_MTU is negotiated and MTU field is small enough, yes.
> But if you disable mrg buffers dynamically you will get some packets
> in buffers that were added before the disable.
> Similarly for disabling LRO dynamically.
> 
>> With LRO enabled case I guess to start with we block XDP from being
>> loaded for the same reason we don't allow jumbo frames on physical
>> nics.
> 
> If you ask that host disables the capability, then yes, it's easy.
> Let's do that for now, it's a start.
> 
> 
>>>
>>> I'm fine with disabling XDP for some configurations as the first step,
>>> and we can add that support later.
>>>
>>
>> In order for this to work though I guess we need to be able to
>> dynamically disable mergeable buffers at the moment I just commented
>> it out of the features list and fixed up virtio_has_features so it
>> wont bug_on.
> 
> For now we can just set mrg_rxbuf=off on qemu command line, and
> fail XDP attach if not there. I think we'll be able to support it
> long term but you will need host side changes, or fully reset
> device and reconfigure it.

see question below. I agree disabling mrg_rxbuff=off lro=off and an
xdp receive path makes this relatively straight forward and clean with
the MTU patch noted below as well.

> 
>>> Ideas about mergeable buffers (optional):
>>>
>>> At the moment mergeable buffers can't be disabled dynamically.
>>> They do bring a small benefit for XDP if host MTU is large (see below)
>>> and aren't hard to support:
>>> - if header is by itself skip 1st page
>>> - otherwise copy all data into first page
>>> and it's nicer not to add random limitations that require guest reboot.
>>> It might make sense to add a command that disables/enabled
>>> mergeable buffers dynamically but that's for newer hosts.
>>
>> Yep it seems disabling mergeable buffers solves this but didn't look at
>> it too closely. I'll look closer tomorrow.
>>
>>>
>>> Spec does not require it but in practice most hosts put all data
>>> in the 1st page or all in the 2nd page so the copy will be nop
>>> for these cases.
>>>
>>> Large host MTU - newer hosts report the host MTU, older ones don't.
>>> Using mergeable buffers we can at least detect this case
>>> (and then what? drop I guess).
>>>
>>
>> The physical nics just refuse to load XDP with large MTU.
> 
> So let's do the same for now, unfortunately you don't know
> the MTU unless _F_MTU is negitiated and QEMU does not
> implement that yet, but it's easy to add.
> In fact I suspect Aaron (cc) has an implementation since
> he posted a patch implementing that.
> Aaron could you post it pls?
> 

Great! Aaron if you want me to review/test at all let me know I have
a few systems setup running this now so can help if needed.

>> Any reason
>> not to negotiate the mtu with the guest so that the guest can force
>> this?
> 
> There are generally many guests and many NICs on the host.
> A big packet arrives, what do you want to do with it?

Drop it just like a physical nic would do if packet is larger
than MTU. Maybe splat a message in the log so user has some
clue something got misconfigured.

> We probably want to build propagating MTU across all VMs and NICs
> but let's get a basic thing merged first.

That feels like an orchestration/QEMU type problem to me. Just because
some NIC has jumbo frames enabled doesn't necessarily mean they would
ever get to any specific VM based on forwarding configuration.

And if I try to merge the last email I sent out here. In mergeable and
big_packets modes if LRO is off and MTU < PAGE_SIZE it seems we should
always get physically contiguous data on a single page correct? It
may be at some offset in a page however. But the offset should not
matter to XDP. If I read this right we wouldn't need to add a new
XDP mode and could just use the existing merge or big modes. This would
seem cleaner to me than adding a new mode and requiring a qemu option.

Thanks for all the pointers by the way its very helpful.

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-11-03 23:29                                               ` John Fastabend
@ 2016-11-04  0:34                                                 ` Michael S. Tsirkin
  2016-11-04 23:05                                                   ` John Fastabend
  0 siblings, 1 reply; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-11-04  0:34 UTC (permalink / raw)
  To: John Fastabend
  Cc: Shrijeet Mukherjee, Jesper Dangaard Brouer, Thomas Graf,
	Alexei Starovoitov, Jakub Kicinski, David Miller,
	alexander.duyck, shrijeet, tom, netdev, Roopa Prabhu,
	Nikolay Aleksandrov, aconole

On Thu, Nov 03, 2016 at 04:29:22PM -0700, John Fastabend wrote:
> [...]
> 
> >>> - when XDP is attached disable all LRO using VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
> >>>   (not used by driver so far, designed to allow dynamic LRO control with
> >>>    ethtool)
> >>
> >> I see there is a UAPI bit for this but I guess we also need to add
> >> support to vhost as well? Seems otherwise we may just drop a bunch
> >> of packets on the floor out of handle_rx() when recvmsg returns larger
> >> than a page size. Or did I read this wrong...
> > 
> > It's already supported host side. However you might
> > get some packets that were in flight when you attached.
> > 
> 
> Really I must have missed it I don't see any *GUEST_FEATURES* flag in
> ./drivers/vhost/?

It's all done by QEMU catching these commands and calling
ioctls on the tun/macvtap/packet socket.

> >>> - start adding page-sized buffers
> >>
> >> I started to mangle add_recvbuf_big() and receive_big() here and this
> >> didn't seem too bad.
> > 
> > I imagine it won't be ATM but I think we'll need to support
> > mrg buffers with time and then it will be messy.
> > Besides, it's not an architectural thing that receive_big
> > uses page sized buffers, it could use any size.
> > So a separate path just for xdp would be better imho.
> > 
> >>> - do something with non-page-sized buffers added previously - what
> >>>   exactly? copy I guess? What about LRO packets that are too large -
> >>>   can we drop or can we split them up?
> >>
> >> hmm not sure I understand this here. With LRO disabled and mergeable
> >> buffers disabled all packets should fit in a page correct?
> > 
> > Assuing F_MTU is negotiated and MTU field is small enough, yes.
> > But if you disable mrg buffers dynamically you will get some packets
> > in buffers that were added before the disable.
> > Similarly for disabling LRO dynamically.
> > 
> >> With LRO enabled case I guess to start with we block XDP from being
> >> loaded for the same reason we don't allow jumbo frames on physical
> >> nics.
> > 
> > If you ask that host disables the capability, then yes, it's easy.
> > Let's do that for now, it's a start.
> > 
> > 
> >>>
> >>> I'm fine with disabling XDP for some configurations as the first step,
> >>> and we can add that support later.
> >>>
> >>
> >> In order for this to work though I guess we need to be able to
> >> dynamically disable mergeable buffers at the moment I just commented
> >> it out of the features list and fixed up virtio_has_features so it
> >> wont bug_on.
> > 
> > For now we can just set mrg_rxbuf=off on qemu command line, and
> > fail XDP attach if not there. I think we'll be able to support it
> > long term but you will need host side changes, or fully reset
> > device and reconfigure it.
> 
> see question below. I agree disabling mrg_rxbuff=off lro=off and an
> xdp receive path makes this relatively straight forward and clean with
> the MTU patch noted below as well.
> 
> > 
> >>> Ideas about mergeable buffers (optional):
> >>>
> >>> At the moment mergeable buffers can't be disabled dynamically.
> >>> They do bring a small benefit for XDP if host MTU is large (see below)
> >>> and aren't hard to support:
> >>> - if header is by itself skip 1st page
> >>> - otherwise copy all data into first page
> >>> and it's nicer not to add random limitations that require guest reboot.
> >>> It might make sense to add a command that disables/enabled
> >>> mergeable buffers dynamically but that's for newer hosts.
> >>
> >> Yep it seems disabling mergeable buffers solves this but didn't look at
> >> it too closely. I'll look closer tomorrow.
> >>
> >>>
> >>> Spec does not require it but in practice most hosts put all data
> >>> in the 1st page or all in the 2nd page so the copy will be nop
> >>> for these cases.
> >>>
> >>> Large host MTU - newer hosts report the host MTU, older ones don't.
> >>> Using mergeable buffers we can at least detect this case
> >>> (and then what? drop I guess).
> >>>
> >>
> >> The physical nics just refuse to load XDP with large MTU.
> > 
> > So let's do the same for now, unfortunately you don't know
> > the MTU unless _F_MTU is negitiated and QEMU does not
> > implement that yet, but it's easy to add.
> > In fact I suspect Aaron (cc) has an implementation since
> > he posted a patch implementing that.
> > Aaron could you post it pls?
> > 
> 
> Great! Aaron if you want me to review/test at all let me know I have
> a few systems setup running this now so can help if needed.
> 
> >> Any reason
> >> not to negotiate the mtu with the guest so that the guest can force
> >> this?
> > 
> > There are generally many guests and many NICs on the host.
> > A big packet arrives, what do you want to do with it?
> 
> Drop it just like a physical nic would do if packet is larger
> than MTU. Maybe splat a message in the log so user has some
> clue something got misconfigured.
> 
> > We probably want to build propagating MTU across all VMs and NICs
> > but let's get a basic thing merged first.
> 
> That feels like an orchestration/QEMU type problem to me. Just because
> some NIC has jumbo frames enabled doesn't necessarily mean they would
> ever get to any specific VM based on forwarding configuration.
> 
> And if I try to merge the last email I sent out here. In mergeable and
> big_packets modes if LRO is off and MTU < PAGE_SIZE it seems we should
> always get physically contiguous data on a single page correct?

Unfortunately not in the mergeable buffer case according to spec, even though
linux hosts will do that, so it's fine to optimize for that
but need to somehow work in other cases e.g. by doing a data copy.


> It
> may be at some offset in a page however. But the offset should not
> matter to XDP. If I read this right we wouldn't need to add a new
> XDP mode and could just use the existing merge or big modes. This would
> seem cleaner to me than adding a new mode and requiring a qemu option.
> 
> Thanks for all the pointers by the way its very helpful.

So for mergeable we spend cycles trying to make buffers as small
as possible and I have a patch to avoid copies for that too,
I'll post it next week hopefully.

-- 
MST

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-11-04  0:34                                                 ` Michael S. Tsirkin
@ 2016-11-04 23:05                                                   ` John Fastabend
  2016-11-06  6:50                                                     ` Michael S. Tsirkin
  0 siblings, 1 reply; 42+ messages in thread
From: John Fastabend @ 2016-11-04 23:05 UTC (permalink / raw)
  To: Michael S. Tsirkin
  Cc: Shrijeet Mukherjee, Jesper Dangaard Brouer, Thomas Graf,
	Alexei Starovoitov, Jakub Kicinski, David Miller,
	alexander.duyck, shrijeet, tom, netdev, Roopa Prabhu,
	Nikolay Aleksandrov, aconole

On 16-11-03 05:34 PM, Michael S. Tsirkin wrote:
> On Thu, Nov 03, 2016 at 04:29:22PM -0700, John Fastabend wrote:
>> [...]
>>
>>>>> - when XDP is attached disable all LRO using VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
>>>>>   (not used by driver so far, designed to allow dynamic LRO control with
>>>>>    ethtool)
>>>>
>>>> I see there is a UAPI bit for this but I guess we also need to add
>>>> support to vhost as well? Seems otherwise we may just drop a bunch
>>>> of packets on the floor out of handle_rx() when recvmsg returns larger
>>>> than a page size. Or did I read this wrong...
>>>
>>> It's already supported host side. However you might
>>> get some packets that were in flight when you attached.
>>>
>>
>> Really I must have missed it I don't see any *GUEST_FEATURES* flag in
>> ./drivers/vhost/?
> 
> It's all done by QEMU catching these commands and calling
> ioctls on the tun/macvtap/packet socket.
> 

Well at least for the tap vhost backend in linux that I found here,

 ./qemu/net/tap-linux.c

there is no LRO feature flag but that is OK I can get it working next
week looks fairly straight forward.

[...]

>> And if I try to merge the last email I sent out here. In mergeable and
>> big_packets modes if LRO is off and MTU < PAGE_SIZE it seems we should
>> always get physically contiguous data on a single page correct?
> 
> Unfortunately not in the mergeable buffer case according to spec, even though
> linux hosts will do that, so it's fine to optimize for that
> but need to somehow work in other cases e.g. by doing a data copy.
> 

ah OK this makes sense I was looking at vhost implementation in Linux.

> 
>> It
>> may be at some offset in a page however. But the offset should not
>> matter to XDP. If I read this right we wouldn't need to add a new
>> XDP mode and could just use the existing merge or big modes. This would
>> seem cleaner to me than adding a new mode and requiring a qemu option.
>>
>> Thanks for all the pointers by the way its very helpful.
> 
> So for mergeable we spend cycles trying to make buffers as small
> as possible and I have a patch to avoid copies for that too,
> I'll post it next week hopefully.
> 

Good to know. I'll get the XDP stuff wrapped up next week or see
if Srijeet wants to do it.

Thanks,
John

^ permalink raw reply	[flat|nested] 42+ messages in thread

* Re: [PATCH net-next RFC WIP] Patch for XDP support for virtio_net
  2016-11-04 23:05                                                   ` John Fastabend
@ 2016-11-06  6:50                                                     ` Michael S. Tsirkin
  0 siblings, 0 replies; 42+ messages in thread
From: Michael S. Tsirkin @ 2016-11-06  6:50 UTC (permalink / raw)
  To: John Fastabend
  Cc: Shrijeet Mukherjee, Jesper Dangaard Brouer, Thomas Graf,
	Alexei Starovoitov, Jakub Kicinski, David Miller,
	alexander.duyck, shrijeet, tom, netdev, Roopa Prabhu,
	Nikolay Aleksandrov, aconole

On Fri, Nov 04, 2016 at 04:05:51PM -0700, John Fastabend wrote:
> On 16-11-03 05:34 PM, Michael S. Tsirkin wrote:
> > On Thu, Nov 03, 2016 at 04:29:22PM -0700, John Fastabend wrote:
> >> [...]
> >>
> >>>>> - when XDP is attached disable all LRO using VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET
> >>>>>   (not used by driver so far, designed to allow dynamic LRO control with
> >>>>>    ethtool)
> >>>>
> >>>> I see there is a UAPI bit for this but I guess we also need to add
> >>>> support to vhost as well? Seems otherwise we may just drop a bunch
> >>>> of packets on the floor out of handle_rx() when recvmsg returns larger
> >>>> than a page size. Or did I read this wrong...
> >>>
> >>> It's already supported host side. However you might
> >>> get some packets that were in flight when you attached.
> >>>
> >>
> >> Really I must have missed it I don't see any *GUEST_FEATURES* flag in
> >> ./drivers/vhost/?
> > 
> > It's all done by QEMU catching these commands and calling
> > ioctls on the tun/macvtap/packet socket.
> > 
> 
> Well at least for the tap vhost backend in linux that I found here,
> 
>  ./qemu/net/tap-linux.c
> 
> there is no LRO feature flag but that is OK I can get it working next
> week looks fairly straight forward.
> 
> [...]

This is because tun/tap is the reverse of virtio. LRO in virtio
maps to TSO in tun.
The relevant function is tap_fd_set_offload in QEMU.


> >> And if I try to merge the last email I sent out here. In mergeable and
> >> big_packets modes if LRO is off and MTU < PAGE_SIZE it seems we should
> >> always get physically contiguous data on a single page correct?
> > 
> > Unfortunately not in the mergeable buffer case according to spec, even though
> > linux hosts will do that, so it's fine to optimize for that
> > but need to somehow work in other cases e.g. by doing a data copy.
> > 
> 
> ah OK this makes sense I was looking at vhost implementation in Linux.
> 
> > 
> >> It
> >> may be at some offset in a page however. But the offset should not
> >> matter to XDP. If I read this right we wouldn't need to add a new
> >> XDP mode and could just use the existing merge or big modes. This would
> >> seem cleaner to me than adding a new mode and requiring a qemu option.
> >>
> >> Thanks for all the pointers by the way its very helpful.
> > 
> > So for mergeable we spend cycles trying to make buffers as small
> > as possible and I have a patch to avoid copies for that too,
> > I'll post it next week hopefully.
> > 
> 
> Good to know. I'll get the XDP stuff wrapped up next week or see
> if Srijeet wants to do it.
> 
> Thanks,
> John

^ permalink raw reply	[flat|nested] 42+ messages in thread

end of thread, other threads:[~2016-11-06  6:50 UTC | newest]

Thread overview: 42+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-10-22  4:07 [PATCH net-next RFC WIP] Patch for XDP support for virtio_net Shrijeet Mukherjee
2016-10-23 16:38 ` Stephen Hemminger
2016-10-24  1:51   ` Shrijeet Mukherjee
2016-10-25  1:10     ` Alexei Starovoitov
2016-10-25 17:36 ` Jakub Kicinski
2016-10-26 13:52 ` Jesper Dangaard Brouer
2016-10-26 16:36   ` Michael S. Tsirkin
2016-10-26 16:52     ` David Miller
2016-10-26 17:07       ` Michael S. Tsirkin
2016-10-26 17:11         ` David Miller
2016-10-27  8:55           ` Jesper Dangaard Brouer
2016-10-27 21:09             ` John Fastabend
2016-10-27 21:30               ` Michael S. Tsirkin
2016-10-27 21:42                 ` David Miller
2016-10-27 22:25                   ` Michael S. Tsirkin
2016-10-28  1:35                     ` David Miller
2016-10-28  1:43                       ` Alexander Duyck
2016-10-28  2:10                         ` David Miller
2016-10-28 15:56                           ` John Fastabend
2016-10-28 16:18                             ` Jakub Kicinski
2016-10-28 18:22                               ` Alexei Starovoitov
2016-10-28 20:35                                 ` Alexander Duyck
2016-10-28 20:42                                   ` Jakub Kicinski
2016-10-28 20:36                                 ` Jakub Kicinski
2016-10-29  3:51                                 ` Shrijeet Mukherjee
2016-10-29 11:25                                   ` Thomas Graf
2016-11-02 14:27                                     ` Jesper Dangaard Brouer
2016-11-03  1:28                                       ` Shrijeet Mukherjee
2016-11-03  4:11                                         ` Michael S. Tsirkin
2016-11-03  6:44                                           ` John Fastabend
2016-11-03 22:20                                             ` John Fastabend
2016-11-03 22:42                                             ` Michael S. Tsirkin
2016-11-03 23:29                                               ` John Fastabend
2016-11-04  0:34                                                 ` Michael S. Tsirkin
2016-11-04 23:05                                                   ` John Fastabend
2016-11-06  6:50                                                     ` Michael S. Tsirkin
2016-10-28 17:11                             ` David Miller
2016-10-30 22:53                               ` Michael S. Tsirkin
2016-11-02 14:01                               ` Jesper Dangaard Brouer
2016-11-02 16:06                                 ` Alexander Duyck
2016-10-28  0:02               ` Shrijeet Mukherjee
2016-10-28  0:46                 ` Shrijeet Mukherjee

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.