[net-next] virtio-net: switch to use build_skb() for small buffer
diff mbox series

Message ID 1487666788-9658-1-git-send-email-jasowang@redhat.com
State New, archived
Headers show
Series
  • [net-next] virtio-net: switch to use build_skb() for small buffer
Related show

Commit Message

Jason Wang Feb. 21, 2017, 8:46 a.m. UTC
This patch switches to using build_skb() for small buffers, which can give
better performance for both TCP and XDP (since we can work at the page
level before skb creation). It also removes a lot of XDP code since both
mergeable and small buffers use page frags during refill now.

                       Before   | After
XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps

Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.

Signed-off-by: Jason Wang <jasowang@redhat.com>
---
 drivers/net/virtio_net.c | 138 ++++++++++++++++++++++-------------------------
 1 file changed, 63 insertions(+), 75 deletions(-)

Comments

Michael S. Tsirkin Feb. 21, 2017, 2:37 p.m. UTC | #1
On Tue, Feb 21, 2017 at 04:46:28PM +0800, Jason Wang wrote:
> This patch switch to use build_skb() for small buffer which can have
> better performance for both TCP and XDP (since we can work at page
> before skb creation). It also remove lots of XDP codes since both
> mergeable and small buffer use page frag during refill now.
> 
>                        Before   | After
> XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps
> 
> Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Thanks!
I had a similar patch for mergeable too, though it's trickier there
as host has a lot of flexibility in sizing buffers.
Looks like a good intermediate step to me.


Acked-by: Michael S. Tsirkin <mst@redhat.com>

> ---
>  drivers/net/virtio_net.c | 138 ++++++++++++++++++++++-------------------------
>  1 file changed, 63 insertions(+), 75 deletions(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index ca489e0..bf95016 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -41,6 +41,8 @@ module_param(gso, bool, 0444);
>  #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
>  #define GOOD_COPY_LEN	128
>  
> +#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
> +
>  /* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
>  #define VIRTIO_XDP_HEADROOM 256
>  
> @@ -343,11 +345,10 @@ static struct sk_buff *page_to_skb(struct virtnet_info *vi,
>  
>  static bool virtnet_xdp_xmit(struct virtnet_info *vi,
>  			     struct receive_queue *rq,
> -			     struct xdp_buff *xdp,
> -			     void *data)
> +			     struct xdp_buff *xdp)
>  {
>  	struct virtio_net_hdr_mrg_rxbuf *hdr;
> -	unsigned int num_sg, len;
> +	unsigned int len;
>  	struct send_queue *sq;
>  	unsigned int qp;
>  	void *xdp_sent;
> @@ -358,49 +359,23 @@ static bool virtnet_xdp_xmit(struct virtnet_info *vi,
>  
>  	/* Free up any pending old buffers before queueing new ones. */
>  	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
> -		if (vi->mergeable_rx_bufs) {
> -			struct page *sent_page = virt_to_head_page(xdp_sent);
> +		struct page *sent_page = virt_to_head_page(xdp_sent);
>  
> -			put_page(sent_page);
> -		} else { /* small buffer */
> -			struct sk_buff *skb = xdp_sent;
> -
> -			kfree_skb(skb);
> -		}
> +		put_page(sent_page);
>  	}
>  
> -	if (vi->mergeable_rx_bufs) {
> -		xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
> -		/* Zero header and leave csum up to XDP layers */
> -		hdr = xdp->data;
> -		memset(hdr, 0, vi->hdr_len);
> -
> -		num_sg = 1;
> -		sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
> -	} else { /* small buffer */
> -		struct sk_buff *skb = data;
> +	xdp->data -= vi->hdr_len;
> +	/* Zero header and leave csum up to XDP layers */
> +	hdr = xdp->data;
> +	memset(hdr, 0, vi->hdr_len);
>  
> -		/* Zero header and leave csum up to XDP layers */
> -		hdr = skb_vnet_hdr(skb);
> -		memset(hdr, 0, vi->hdr_len);
> +	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
>  
> -		num_sg = 2;
> -		sg_init_table(sq->sg, 2);
> -		sg_set_buf(sq->sg, hdr, vi->hdr_len);
> -		skb_to_sgvec(skb, sq->sg + 1,
> -			     xdp->data - xdp->data_hard_start,
> -			     xdp->data_end - xdp->data);
> -	}
> -	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
> -				   data, GFP_ATOMIC);
> +	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
>  	if (unlikely(err)) {
> -		if (vi->mergeable_rx_bufs) {
> -			struct page *page = virt_to_head_page(xdp->data);
> +		struct page *page = virt_to_head_page(xdp->data);
>  
> -			put_page(page);
> -		} else /* small buffer */
> -			kfree_skb(data);
> -		/* On error abort to avoid unnecessary kick */
> +		put_page(page);
>  		return false;
>  	}
>  
> @@ -408,39 +383,50 @@ static bool virtnet_xdp_xmit(struct virtnet_info *vi,
>  	return true;
>  }
>  
> +static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
> +{
> +	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
> +}
> +
>  static struct sk_buff *receive_small(struct net_device *dev,
>  				     struct virtnet_info *vi,
>  				     struct receive_queue *rq,
>  				     void *buf, unsigned int len)
>  {
> -	struct sk_buff * skb = buf;
> +	struct sk_buff *skb;
>  	struct bpf_prog *xdp_prog;
> -
> +	unsigned int xdp_headroom = virtnet_get_headroom(vi);
> +	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
> +	unsigned int headroom = vi->hdr_len + header_offset;
> +	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
> +			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +	unsigned int delta = 0;
>  	len -= vi->hdr_len;
>  
>  	rcu_read_lock();
>  	xdp_prog = rcu_dereference(rq->xdp_prog);
>  	if (xdp_prog) {
> -		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
> +		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
>  		struct xdp_buff xdp;
> +		void *orig_data;
>  		u32 act;
>  
>  		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
>  			goto err_xdp;
>  
> -		xdp.data_hard_start = skb->data;
> -		xdp.data = skb->data + VIRTIO_XDP_HEADROOM;
> +		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
> +		xdp.data = xdp.data_hard_start + xdp_headroom;
>  		xdp.data_end = xdp.data + len;
> +		orig_data = xdp.data;
>  		act = bpf_prog_run_xdp(xdp_prog, &xdp);
>  
>  		switch (act) {
>  		case XDP_PASS:
>  			/* Recalculate length in case bpf program changed it */
> -			__skb_pull(skb, xdp.data - xdp.data_hard_start);
> -			len = xdp.data_end - xdp.data;
> +			delta = orig_data - xdp.data;
>  			break;
>  		case XDP_TX:
> -			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp, skb)))
> +			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
>  				trace_xdp_exception(vi->dev, xdp_prog, act);
>  			rcu_read_unlock();
>  			goto xdp_xmit;
> @@ -454,13 +440,25 @@ static struct sk_buff *receive_small(struct net_device *dev,
>  	}
>  	rcu_read_unlock();
>  
> -	skb_trim(skb, len);
> +	skb = build_skb(buf, buflen);
> +	if (!skb) {
> +		put_page(virt_to_head_page(buf));
> +		goto err;
> +	}
> +	skb_reserve(skb, headroom - delta);
> +	skb_put(skb, len + delta);
> +	if (!delta) {
> +		buf += header_offset;
> +		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
> +	} /* keep zeroed vnet hdr since packet was changed by bpf */
> +
> +err:
>  	return skb;
>  
>  err_xdp:
>  	rcu_read_unlock();
>  	dev->stats.rx_dropped++;
> -	kfree_skb(skb);
> +	put_page(virt_to_head_page(buf));
>  xdp_xmit:
>  	return NULL;
>  }
> @@ -621,7 +619,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
>  			}
>  			break;
>  		case XDP_TX:
> -			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp, data)))
> +			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
>  				trace_xdp_exception(vi->dev, xdp_prog, act);
>  			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
>  			if (unlikely(xdp_page != page))
> @@ -737,7 +735,7 @@ static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
>  		} else if (vi->big_packets) {
>  			give_pages(rq, buf);
>  		} else {
> -			dev_kfree_skb(buf);
> +			put_page(virt_to_head_page(buf));
>  		}
>  		return 0;
>  	}
> @@ -780,34 +778,28 @@ static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
>  	return 0;
>  }
>  
> -static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
> -{
> -	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
> -}
> -
>  static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
>  			     gfp_t gfp)
>  {
> -	int headroom = GOOD_PACKET_LEN + virtnet_get_headroom(vi);
> +	struct page_frag *alloc_frag = &rq->alloc_frag;
> +	char *buf;
>  	unsigned int xdp_headroom = virtnet_get_headroom(vi);
> -	struct sk_buff *skb;
> -	struct virtio_net_hdr_mrg_rxbuf *hdr;
> +	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
>  	int err;
>  
> -	skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
> -	if (unlikely(!skb))
> +	len = SKB_DATA_ALIGN(len) +
> +	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
>  		return -ENOMEM;
>  
> -	skb_put(skb, headroom);
> -
> -	hdr = skb_vnet_hdr(skb);
> -	sg_init_table(rq->sg, 2);
> -	sg_set_buf(rq->sg, hdr, vi->hdr_len);
> -	skb_to_sgvec(skb, rq->sg + 1, xdp_headroom, skb->len - xdp_headroom);
> -
> -	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
> +	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +	get_page(alloc_frag->page);
> +	alloc_frag->offset += len;
> +	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> +		    vi->hdr_len + GOOD_PACKET_LEN);
> +	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
>  	if (err < 0)
> -		dev_kfree_skb(skb);
> +		put_page(virt_to_head_page(buf));
>  
>  	return err;
>  }
> @@ -1994,10 +1986,6 @@ static void free_receive_page_frags(struct virtnet_info *vi)
>  
>  static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
>  {
> -	/* For small receive mode always use kfree_skb variants */
> -	if (!vi->mergeable_rx_bufs)
> -		return false;
> -
>  	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
>  		return false;
>  	else if (q < vi->curr_queue_pairs)
> @@ -2032,7 +2020,7 @@ static void free_unused_bufs(struct virtnet_info *vi)
>  			} else if (vi->big_packets) {
>  				give_pages(&vi->rq[i], buf);
>  			} else {
> -				dev_kfree_skb(buf);
> +				put_page(virt_to_head_page(buf));
>  			}
>  		}
>  	}
> -- 
> 2.7.4
David Miller Feb. 21, 2017, 5:26 p.m. UTC | #2
From: Jason Wang <jasowang@redhat.com>
Date: Tue, 21 Feb 2017 16:46:28 +0800

> This patch switch to use build_skb() for small buffer which can have
> better performance for both TCP and XDP (since we can work at page
> before skb creation). It also remove lots of XDP codes since both
> mergeable and small buffer use page frag during refill now.
> 
>                        Before   | After
> XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps
> 
> Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.
> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>

Applied.
Jason Wang Feb. 22, 2017, 2:58 a.m. UTC | #3
On 2017年02月21日 22:37, Michael S. Tsirkin wrote:
> On Tue, Feb 21, 2017 at 04:46:28PM +0800, Jason Wang wrote:
>> This patch switch to use build_skb() for small buffer which can have
>> better performance for both TCP and XDP (since we can work at page
>> before skb creation). It also remove lots of XDP codes since both
>> mergeable and small buffer use page frag during refill now.
>>
>>                         Before   | After
>> XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps
>>
>> Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.
>>
>> Signed-off-by: Jason Wang<jasowang@redhat.com>
> Thanks!
> I had a similar patch for mergeable too, though it's trickier there
> as host has a lot of flexibility in sizing buffers.
> Looks like a good intermediate step to me.

Yes, I think it's more tricky for the case of mergeable buffers:

1) we need to reserve NET_SKB_PAD + NET_IP_ALIGN for each buffer, which will 
break rx frag coalescing
2) we need tailroom for skb_shinfo, so it won't work for all packet sizes

Thanks

>
>
> Acked-by: Michael S. Tsirkin<mst@redhat.com>
>
Michael S. Tsirkin Feb. 22, 2017, 3:06 a.m. UTC | #4
On Wed, Feb 22, 2017 at 10:58:08AM +0800, Jason Wang wrote:
> 
> 
> On 2017年02月21日 22:37, Michael S. Tsirkin wrote:
> > On Tue, Feb 21, 2017 at 04:46:28PM +0800, Jason Wang wrote:
> > > This patch switch to use build_skb() for small buffer which can have
> > > better performance for both TCP and XDP (since we can work at page
> > > before skb creation). It also remove lots of XDP codes since both
> > > mergeable and small buffer use page frag during refill now.
> > > 
> > >                         Before   | After
> > > XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps
> > > 
> > > Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.
> > > 
> > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > Thanks!
> > I had a similar patch for mergeable too, though it's trickier there
> > as host has a lot of flexibility in sizing buffers.
> > Looks like a good intermediate step to me.
> 
> Yes, I think it's more tricky for the case of mergeable buffer:
> 
> 1) we need reserve NET_SKB_PAD + NET_IP_ALIGN for each buffer, this will
> break rx frag coalescing
> 2) need tailroom for skb_shinfo, so it won't work for all size of packet
> 
> Thanks

Have you seen my prototype? It works with qemu in practice,
just needs to cover a bunch of corner cases.


> > 
> > 
> > Acked-by: Michael S. Tsirkin<mst@redhat.com>
> >
Jason Wang Feb. 22, 2017, 3:17 a.m. UTC | #5
On 2017年02月22日 11:06, Michael S. Tsirkin wrote:
> On Wed, Feb 22, 2017 at 10:58:08AM +0800, Jason Wang wrote:
>>
>> On 2017年02月21日 22:37, Michael S. Tsirkin wrote:
>>> On Tue, Feb 21, 2017 at 04:46:28PM +0800, Jason Wang wrote:
>>>> This patch switch to use build_skb() for small buffer which can have
>>>> better performance for both TCP and XDP (since we can work at page
>>>> before skb creation). It also remove lots of XDP codes since both
>>>> mergeable and small buffer use page frag during refill now.
>>>>
>>>>                          Before   | After
>>>> XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps
>>>>
>>>> Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.
>>>>
>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>> Thanks!
>>> I had a similar patch for mergeable too, though it's trickier there
>>> as host has a lot of flexibility in sizing buffers.
>>> Looks like a good intermediate step to me.
>> Yes, I think it's more tricky for the case of mergeable buffer:
>>
>> 1) we need reserve NET_SKB_PAD + NET_IP_ALIGN for each buffer, this will
>> break rx frag coalescing
>> 2) need tailroom for skb_shinfo, so it won't work for all size of packet
>>
>> Thanks
> Have you seen my prototype?

Not yet, please give me a pointer.

>   It works with qemu in practice,
> just needs to cover a bunch of corner cases.

Sounds good, if you wish I can help to finalize it.

Thanks

>
>>>
>>> Acked-by: Michael S. Tsirkin<mst@redhat.com>
>>>
Jason Wang Feb. 22, 2017, 3:38 a.m. UTC | #6
On 2017年02月22日 11:17, Jason Wang wrote:
>
>
> On 2017年02月22日 11:06, Michael S. Tsirkin wrote:
>> On Wed, Feb 22, 2017 at 10:58:08AM +0800, Jason Wang wrote:
>>>
>>> On 2017年02月21日 22:37, Michael S. Tsirkin wrote:
>>>> On Tue, Feb 21, 2017 at 04:46:28PM +0800, Jason Wang wrote:
>>>>> This patch switch to use build_skb() for small buffer which can have
>>>>> better performance for both TCP and XDP (since we can work at page
>>>>> before skb creation). It also remove lots of XDP codes since both
>>>>> mergeable and small buffer use page frag during refill now.
>>>>>
>>>>>                          Before   | After
>>>>> XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps
>>>>>
>>>>> Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.
>>>>>
>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>> Thanks!
>>>> I had a similar patch for mergeable too, though it's trickier there
>>>> as host has a lot of flexibility in sizing buffers.
>>>> Looks like a good intermediate step to me.
>>> Yes, I think it's more tricky for the case of mergeable buffer:
>>>
>>> 1) we need reserve NET_SKB_PAD + NET_IP_ALIGN for each buffer, this 
>>> will
>>> break rx frag coalescing
>>> 2) need tailroom for skb_shinfo, so it won't work for all size of 
>>> packet
>>>
>>> Thanks
>> Have you seen my prototype?
>
> Not yet, please give me a pointer.

FYI, I posted a prototype during EWMA discussion in 2013:

https://lists.linuxfoundation.org/pipermail/virtualization/2013-October/025556.html
Michael S. Tsirkin Feb. 22, 2017, 3:42 a.m. UTC | #7
On Wed, Feb 22, 2017 at 11:17:50AM +0800, Jason Wang wrote:
> 
> 
> On 2017年02月22日 11:06, Michael S. Tsirkin wrote:
> > On Wed, Feb 22, 2017 at 10:58:08AM +0800, Jason Wang wrote:
> > > 
> > > On 2017年02月21日 22:37, Michael S. Tsirkin wrote:
> > > > On Tue, Feb 21, 2017 at 04:46:28PM +0800, Jason Wang wrote:
> > > > > This patch switch to use build_skb() for small buffer which can have
> > > > > better performance for both TCP and XDP (since we can work at page
> > > > > before skb creation). It also remove lots of XDP codes since both
> > > > > mergeable and small buffer use page frag during refill now.
> > > > > 
> > > > >                          Before   | After
> > > > > XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps
> > > > > 
> > > > > Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.
> > > > > 
> > > > > Signed-off-by: Jason Wang<jasowang@redhat.com>
> > > > Thanks!
> > > > I had a similar patch for mergeable too, though it's trickier there
> > > > as host has a lot of flexibility in sizing buffers.
> > > > Looks like a good intermediate step to me.
> > > Yes, I think it's more tricky for the case of mergeable buffer:
> > > 
> > > 1) we need reserve NET_SKB_PAD + NET_IP_ALIGN for each buffer, this will
> > > break rx frag coalescing
> > > 2) need tailroom for skb_shinfo, so it won't work for all size of packet
> > > 
> > > Thanks
> > Have you seen my prototype?
> 
> Not yet, please give me a pointer.
> 
> >   It works with qemu in practice,
> > just needs to cover a bunch of corner cases.
> 
> Sounds good, if you wish I can help to finalize it.
> 
> Thanks
> 
> > 
> > > > 
> > > > Acked-by: Michael S. Tsirkin<mst@redhat.com>
> > > > 


Great! Here it is I think it's on top of 4.9 or so. Issues to address:
- when ring is very small this might cause us not to
  be able to fit even a single packet in the whole queue.
  Limit logic to when ring is big enough?
- If e.g. header is split across many buffers, this won't work
  correctly. Detect and copy?

Could be more issues that I forgot. It might be a good idea to
first finish my buffer ctx hack. Will simplify the logic.


commit 77c177091a7c8473e3f0ed148afa2b4765b8b0f7
Author: Michael S. Tsirkin <mst@redhat.com>
Date:   Sun Feb 21 20:22:31 2016 +0200

    virtio_net: switch to build_skb for mrg_rxbuf
    
    For small packets data copy was observed to
    take up about 15% CPU time. Switch to build_skb
    and avoid the copy when using mergeable rx buffers.
    
    As a bonus, medium-size skbs that fit in a page will be
    completely linear.
    
    Of course, we now need to lower the lower bound on packet size,
    to make sure a sane number of skbs fits in rx socket buffer.
    By how much? I don't know yet.
    
    It might also be useful to prefetch the packet buffer since
    net stack will likely use it soon.

    TODO: it appears that Linux won't handle correctly the case of first
    buffer being very small (or consisting exclusively of virtio header).
    This is already the case for current code, so don't bother
    testing yet.
    TODO: might be unfair to the last packet in a fragment as we include
    remaining space if any in its truesize.
    
    Signed-off-by: Michael S. Tsirkin <mst@redhat.com>

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b425fa1..a6b996f 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -38,6 +38,8 @@ module_param(gso, bool, 0444);
 
 /* FIXME: MTU in config. */
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
+//#define MIN_PACKET_ALLOC GOOD_PACKET_LEN
+#define MIN_PACKET_ALLOC 128
 #define GOOD_COPY_LEN	128
 
 /* RX packet size EWMA. The average packet size is used to determine the packet
@@ -246,6 +248,9 @@ static void *mergeable_ctx_to_buf_address(unsigned long mrg_ctx)
 static unsigned long mergeable_buf_to_ctx(void *buf, unsigned int truesize)
 {
 	unsigned int size = truesize / MERGEABLE_BUFFER_ALIGN;
+
+	BUG_ON((unsigned long)buf & (MERGEABLE_BUFFER_ALIGN - 1));
+	BUG_ON(size - 1 >= MERGEABLE_BUFFER_ALIGN);
 	return (unsigned long)buf | (size - 1);
 }
 
@@ -354,25 +359,54 @@ static struct sk_buff *receive_big(struct net_device *dev,
 	return NULL;
 }
 
+#define VNET_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN)
+#define VNET_SKB_BUG (VNET_SKB_PAD < sizeof(struct virtio_net_hdr_mrg_rxbuf))
+#define VNET_SKB_LEN(len) ((len) - sizeof(struct virtio_net_hdr_mrg_rxbuf))
+#define VNET_SKB_OFF VNET_SKB_LEN(VNET_SKB_PAD)
+
+static struct sk_buff *vnet_build_skb(struct virtnet_info *vi,
+				      void *buf,
+				      unsigned int len, unsigned int truesize)
+{
+	struct sk_buff *skb = build_skb(buf, truesize);
+
+	if (!skb)
+		return NULL;
+
+	skb_reserve(skb, VNET_SKB_PAD);
+	skb_put(skb, VNET_SKB_LEN(len));
+
+	return skb;
+}
+
 static struct sk_buff *receive_mergeable(struct net_device *dev,
 					 struct virtnet_info *vi,
 					 struct receive_queue *rq,
 					 unsigned long ctx,
-					 unsigned int len)
+					 unsigned int len,
+					 struct virtio_net_hdr_mrg_rxbuf *hdr)
 {
 	void *buf = mergeable_ctx_to_buf_address(ctx);
-	struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
-	u16 num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
+	u16 num_buf;
 	struct page *page = virt_to_head_page(buf);
-	int offset = buf - page_address(page);
-	unsigned int truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+	unsigned int truesize = mergeable_ctx_to_buf_truesize(ctx);
+	int offset;
+	struct sk_buff *head_skb;
+	struct sk_buff *curr_skb;
+
+	BUG_ON(len > truesize);
 
-	struct sk_buff *head_skb = page_to_skb(vi, rq, page, offset, len,
-					       truesize);
-	struct sk_buff *curr_skb = head_skb;
+	/* copy header: build_skb will overwrite it */
+	memcpy(hdr, buf + VNET_SKB_OFF, sizeof *hdr);
+
+	head_skb = vnet_build_skb(vi, buf, len, truesize);
+	curr_skb = head_skb;
+
+	num_buf = virtio16_to_cpu(vi->vdev, hdr->num_buffers);
 
 	if (unlikely(!curr_skb))
 		goto err_skb;
+
 	while (--num_buf) {
 		int num_skb_frags;
 
@@ -386,7 +420,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			goto err_buf;
 		}
 
-		buf = mergeable_ctx_to_buf_address(ctx);
+		buf = mergeable_ctx_to_buf_address(ctx) + VNET_SKB_OFF;
 		page = virt_to_head_page(buf);
 
 		num_skb_frags = skb_shinfo(curr_skb)->nr_frags;
@@ -403,7 +437,8 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			head_skb->truesize += nskb->truesize;
 			num_skb_frags = 0;
 		}
-		truesize = max(len, mergeable_ctx_to_buf_truesize(ctx));
+		truesize = mergeable_ctx_to_buf_truesize(ctx);
+		BUG_ON(len > truesize);
 		if (curr_skb != head_skb) {
 			head_skb->data_len += len;
 			head_skb->len += len;
@@ -449,6 +484,7 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 	struct virtnet_stats *stats = this_cpu_ptr(vi->stats);
 	struct sk_buff *skb;
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
+	struct virtio_net_hdr_mrg_rxbuf hdr0;
 
 	if (unlikely(len < vi->hdr_len + ETH_HLEN)) {
 		pr_debug("%s: short packet %i\n", dev->name, len);
@@ -465,17 +501,24 @@ static void receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 		return;
 	}
 
-	if (vi->mergeable_rx_bufs)
-		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len);
-	else if (vi->big_packets)
+	if (vi->mergeable_rx_bufs) {
+		skb = receive_mergeable(dev, vi, rq, (unsigned long)buf, len,
+					&hdr0);
+		if (unlikely(!skb))
+			return;
+		hdr = &hdr0;
+	} else if (vi->big_packets) {
 		skb = receive_big(dev, vi, rq, buf, len);
-	else
+		if (unlikely(!skb))
+			return;
+		hdr = skb_vnet_hdr(skb);
+	} else {
 		skb = receive_small(vi, buf, len);
+		if (unlikely(!skb))
+			return;
+		hdr = skb_vnet_hdr(skb);
+	}
 
-	if (unlikely(!skb))
-		return;
-
-	hdr = skb_vnet_hdr(skb);
 
 	u64_stats_update_begin(&stats->rx_syncp);
 	stats->rx_bytes += skb->len;
@@ -581,11 +624,14 @@ static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
 
 static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
 {
-	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+	unsigned int hdr;
 	unsigned int len;
 
-	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
-			GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
+	hdr = ALIGN(VNET_SKB_PAD + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
+		    MERGEABLE_BUFFER_ALIGN);
+
+	len = hdr + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
+			    MIN_PACKET_ALLOC, PAGE_SIZE - hdr);
 	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
 }
 
@@ -601,8 +647,11 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
 		return -ENOMEM;
 
-	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
-	ctx = mergeable_buf_to_ctx(buf, len);
+	BUILD_BUG_ON(VNET_SKB_BUG);
+
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset +
+		VNET_SKB_OFF;
+	//ctx = mergeable_buf_to_ctx(buf - VNET_SKB_OFF, len);
 	get_page(alloc_frag->page);
 	alloc_frag->offset += len;
 	hole = alloc_frag->size - alloc_frag->offset;
@@ -615,8 +664,10 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
 		len += hole;
 		alloc_frag->offset += hole;
 	}
+	ctx = mergeable_buf_to_ctx(buf - VNET_SKB_OFF, len);
 
-	sg_init_one(rq->sg, buf, len);
+	sg_init_one(rq->sg, buf,
+		    len - VNET_SKB_OFF - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
 	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
 	if (err < 0)
 		put_page(virt_to_head_page(buf));
Jason Wang Feb. 22, 2017, 10:36 a.m. UTC | #8
On 2017年02月22日 11:42, Michael S. Tsirkin wrote:
> On Wed, Feb 22, 2017 at 11:17:50AM +0800, Jason Wang wrote:
>>
>> On 2017年02月22日 11:06, Michael S. Tsirkin wrote:
>>> On Wed, Feb 22, 2017 at 10:58:08AM +0800, Jason Wang wrote:
>>>> On 2017年02月21日 22:37, Michael S. Tsirkin wrote:
>>>>> On Tue, Feb 21, 2017 at 04:46:28PM +0800, Jason Wang wrote:
>>>>>> This patch switch to use build_skb() for small buffer which can have
>>>>>> better performance for both TCP and XDP (since we can work at page
>>>>>> before skb creation). It also remove lots of XDP codes since both
>>>>>> mergeable and small buffer use page frag during refill now.
>>>>>>
>>>>>>                           Before   | After
>>>>>> XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps
>>>>>>
>>>>>> Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.
>>>>>>
>>>>>> Signed-off-by: Jason Wang<jasowang@redhat.com>
>>>>> Thanks!
>>>>> I had a similar patch for mergeable too, though it's trickier there
>>>>> as host has a lot of flexibility in sizing buffers.
>>>>> Looks like a good intermediate step to me.
>>>> Yes, I think it's more tricky for the case of mergeable buffer:
>>>>
>>>> 1) we need reserve NET_SKB_PAD + NET_IP_ALIGN for each buffer, this will
>>>> break rx frag coalescing
>>>> 2) need tailroom for skb_shinfo, so it won't work for all size of packet
>>>>
>>>> Thanks
>>> Have you seen my prototype?
>> Not yet, please give me a pointer.
>>
>>>    It works with qemu in practice,
>>> just needs to cover a bunch of corner cases.
>> Sounds good, if you wish I can help to finalize it.
>>
>> Thanks
>>
>>>>> Acked-by: Michael S. Tsirkin<mst@redhat.com>
>>>>>
>
> Great! Here it is I think it's on top of 4.9 or so. Issues to address:
> - when ring is very small this might cause us not to
>    be able to fit even a single packet in the whole queue.
>    Limit logic to when ring is big enough?
> - If e.g. header is split across many buffers, this won't work
>    correctly. Detect and copy?
>
> Could be more issues that I forgot. It might be a good idea to
> first finish my buffer ctx hack. Will simplify the logic.
>
>
> commit 77c177091a7c8473e3f0ed148afa2b4765b8b0f7
> Author: Michael S. Tsirkin <mst@redhat.com>
> Date:   Sun Feb 21 20:22:31 2016 +0200
>
>      virtio_net: switch to build_skb for mrg_rxbuf
>      
>      For small packets data copy was observed to
>      take up about 15% CPU time. Switch to build_skb
>      and avoid the copy when using mergeable rx buffers.
>      
>      As a bonus, medium-size skbs that fit in a page will be
>      completely linear.
>      
>      Of course, we now need to lower the lower bound on packet size,
>      to make sure a sane number of skbs fits in rx socket buffer.
>      By how much? I don't know yet.
>      
>      It might also be useful to prefetch the packet buffer since
>      net stack will likely use it soon.
>
>      TODO: it appears that Linux won't handle correctly the case of first
>      buffer being very small (or consisting exclusively of virtio header).
>      This is already the case for current code, so don't bother
>      testing yet.
>      TODO: might be unfair to the last packet in a fragment as we include
>      remaining space if any in its truesize.
>      
>      Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index b425fa1..a6b996f 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -38,6 +38,8 @@ module_param(gso, bool, 0444);
>   

[...]

> -	if (unlikely(!skb))
> -		return;
> -
> -	hdr = skb_vnet_hdr(skb);
>   
>   	u64_stats_update_begin(&stats->rx_syncp);
>   	stats->rx_bytes += skb->len;
> @@ -581,11 +624,14 @@ static int add_recvbuf_big(struct virtnet_info *vi, struct receive_queue *rq,
>   
>   static unsigned int get_mergeable_buf_len(struct ewma_pkt_len *avg_pkt_len)
>   {
> -	const size_t hdr_len = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> +	unsigned int hdr;
>   	unsigned int len;
>   
> -	len = hdr_len + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
> -			GOOD_PACKET_LEN, PAGE_SIZE - hdr_len);
> +	hdr = ALIGN(VNET_SKB_PAD + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)),
> +		    MERGEABLE_BUFFER_ALIGN);

So we need to reserve sizeof(struct skb_shared_info) for each buffer. If 
we have several 4K buffers for a single large packet, about 8% of the 
truesize would be wasted?

> +
> +	len = hdr + clamp_t(unsigned int, ewma_pkt_len_read(avg_pkt_len),
> +			    MIN_PACKET_ALLOC, PAGE_SIZE - hdr);
>   	return ALIGN(len, MERGEABLE_BUFFER_ALIGN);
>   }
>   
> @@ -601,8 +647,11 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>   	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
>   		return -ENOMEM;
>   
> -	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> -	ctx = mergeable_buf_to_ctx(buf, len);
> +	BUILD_BUG_ON(VNET_SKB_BUG);
> +
> +	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset +
> +		VNET_SKB_OFF;

Rx frag coalescing won't work any more and we will end up with more 
frag lists for large packets.

Thanks

> +	//ctx = mergeable_buf_to_ctx(buf - VNET_SKB_OFF, len);
>   	get_page(alloc_frag->page);
>   	alloc_frag->offset += len;
>   	hole = alloc_frag->size - alloc_frag->offset;
> @@ -615,8 +664,10 @@ static int add_recvbuf_mergeable(struct receive_queue *rq, gfp_t gfp)
>   		len += hole;
>   		alloc_frag->offset += hole;
>   	}
> +	ctx = mergeable_buf_to_ctx(buf - VNET_SKB_OFF, len);
>   
> -	sg_init_one(rq->sg, buf, len);
> +	sg_init_one(rq->sg, buf,
> +		    len - VNET_SKB_OFF - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)));
>   	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, (void *)ctx, gfp);
>   	if (err < 0)
>   		put_page(virt_to_head_page(buf));
John Fastabend Feb. 22, 2017, 5:17 p.m. UTC | #9
On 17-02-21 12:46 AM, Jason Wang wrote:
> This patch switch to use build_skb() for small buffer which can have
> better performance for both TCP and XDP (since we can work at page
> before skb creation). It also remove lots of XDP codes since both
> mergeable and small buffer use page frag during refill now.
> 
>                        Before   | After
> XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps
> 
> Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.

When you do the XDP tests, are you generating packets with pktgen on the
corresponding tap devices?

Also, another thought: have you looked at using some of the buffer recycling
techniques used in hardware drivers such as ixgbe and, with Eric's latest
patches, mlx? I have seen significant performance increases for some
workloads doing this. I wanted to try something like this out on virtio
but haven't had time yet.

> 
> Signed-off-by: Jason Wang <jasowang@redhat.com>
> ---

[...]

>  static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
>  			     gfp_t gfp)
>  {
> -	int headroom = GOOD_PACKET_LEN + virtnet_get_headroom(vi);
> +	struct page_frag *alloc_frag = &rq->alloc_frag;
> +	char *buf;
>  	unsigned int xdp_headroom = virtnet_get_headroom(vi);
> -	struct sk_buff *skb;
> -	struct virtio_net_hdr_mrg_rxbuf *hdr;
> +	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
>  	int err;
>  
> -	skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
> -	if (unlikely(!skb))
> +	len = SKB_DATA_ALIGN(len) +
> +	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
> +	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
>  		return -ENOMEM;
>  
> -	skb_put(skb, headroom);
> -
> -	hdr = skb_vnet_hdr(skb);
> -	sg_init_table(rq->sg, 2);
> -	sg_set_buf(rq->sg, hdr, vi->hdr_len);
> -	skb_to_sgvec(skb, rq->sg + 1, xdp_headroom, skb->len - xdp_headroom);
> -
> -	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
> +	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
> +	get_page(alloc_frag->page);
> +	alloc_frag->offset += len;
> +	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
> +		    vi->hdr_len + GOOD_PACKET_LEN);
> +	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);

Nice this cleans up a lot of the branching code. Thanks.

Acked-by: John Fastabend <john.r.fastabend@intel.com>
Jason Wang Feb. 23, 2017, 2:44 a.m. UTC | #10
On 2017年02月23日 01:17, John Fastabend wrote:
> On 17-02-21 12:46 AM, Jason Wang wrote:
>> This patch switch to use build_skb() for small buffer which can have
>> better performance for both TCP and XDP (since we can work at page
>> before skb creation). It also remove lots of XDP codes since both
>> mergeable and small buffer use page frag during refill now.
>>
>>                         Before   | After
>> XDP_DROP(xdp1) 64B  :  11.1Mpps | 14.4Mpps
>>
>> Tested with xdp1/xdp2/xdp_ip_tx_tunnel and netperf.
> When you do the xdp tests are you generating packets with pktgen on the
> corresponding tap devices?

Yes, pktgen on the tap directly.

>
> Also another thought, have you looked at using some of the buffer recycling
> techniques used in the hardware drivers such as ixgbe and with Eric's latest
> patches mlx? I have seen significant performance increases for some
> workloads doing this. I wanted to try something like this out on virtio
> but haven't had time yet.

Yes, this is in TODO list. Will pick some time to do this.

Thanks

>
>> Signed-off-by: Jason Wang <jasowang@redhat.com>
>> ---
> [...]
>
>>   static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
>>   			     gfp_t gfp)
>>   {
>> -	int headroom = GOOD_PACKET_LEN + virtnet_get_headroom(vi);
>> +	struct page_frag *alloc_frag = &rq->alloc_frag;
>> +	char *buf;
>>   	unsigned int xdp_headroom = virtnet_get_headroom(vi);
>> -	struct sk_buff *skb;
>> -	struct virtio_net_hdr_mrg_rxbuf *hdr;
>> +	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
>>   	int err;
>>   
>> -	skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
>> -	if (unlikely(!skb))
>> +	len = SKB_DATA_ALIGN(len) +
>> +	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
>> +	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
>>   		return -ENOMEM;
>>   
>> -	skb_put(skb, headroom);
>> -
>> -	hdr = skb_vnet_hdr(skb);
>> -	sg_init_table(rq->sg, 2);
>> -	sg_set_buf(rq->sg, hdr, vi->hdr_len);
>> -	skb_to_sgvec(skb, rq->sg + 1, xdp_headroom, skb->len - xdp_headroom);
>> -
>> -	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
>> +	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
>> +	get_page(alloc_frag->page);
>> +	alloc_frag->offset += len;
>> +	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
>> +		    vi->hdr_len + GOOD_PACKET_LEN);
>> +	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
> Nice this cleans up a lot of the branching code. Thanks.
>
> Acked-by: John Fastabend <john.r.fastabend@intel.com>

Patch
diff mbox series

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index ca489e0..bf95016 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -41,6 +41,8 @@  module_param(gso, bool, 0444);
 #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN)
 #define GOOD_COPY_LEN	128
 
+#define VIRTNET_RX_PAD (NET_IP_ALIGN + NET_SKB_PAD)
+
 /* Amount of XDP headroom to prepend to packets for use by xdp_adjust_head */
 #define VIRTIO_XDP_HEADROOM 256
 
@@ -343,11 +345,10 @@  static struct sk_buff *page_to_skb(struct virtnet_info *vi,
 
 static bool virtnet_xdp_xmit(struct virtnet_info *vi,
 			     struct receive_queue *rq,
-			     struct xdp_buff *xdp,
-			     void *data)
+			     struct xdp_buff *xdp)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
-	unsigned int num_sg, len;
+	unsigned int len;
 	struct send_queue *sq;
 	unsigned int qp;
 	void *xdp_sent;
@@ -358,49 +359,23 @@  static bool virtnet_xdp_xmit(struct virtnet_info *vi,
 
 	/* Free up any pending old buffers before queueing new ones. */
 	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		if (vi->mergeable_rx_bufs) {
-			struct page *sent_page = virt_to_head_page(xdp_sent);
+		struct page *sent_page = virt_to_head_page(xdp_sent);
 
-			put_page(sent_page);
-		} else { /* small buffer */
-			struct sk_buff *skb = xdp_sent;
-
-			kfree_skb(skb);
-		}
+		put_page(sent_page);
 	}
 
-	if (vi->mergeable_rx_bufs) {
-		xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
-		/* Zero header and leave csum up to XDP layers */
-		hdr = xdp->data;
-		memset(hdr, 0, vi->hdr_len);
-
-		num_sg = 1;
-		sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
-	} else { /* small buffer */
-		struct sk_buff *skb = data;
+	xdp->data -= vi->hdr_len;
+	/* Zero header and leave csum up to XDP layers */
+	hdr = xdp->data;
+	memset(hdr, 0, vi->hdr_len);
 
-		/* Zero header and leave csum up to XDP layers */
-		hdr = skb_vnet_hdr(skb);
-		memset(hdr, 0, vi->hdr_len);
+	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
 
-		num_sg = 2;
-		sg_init_table(sq->sg, 2);
-		sg_set_buf(sq->sg, hdr, vi->hdr_len);
-		skb_to_sgvec(skb, sq->sg + 1,
-			     xdp->data - xdp->data_hard_start,
-			     xdp->data_end - xdp->data);
-	}
-	err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
-				   data, GFP_ATOMIC);
+	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
 	if (unlikely(err)) {
-		if (vi->mergeable_rx_bufs) {
-			struct page *page = virt_to_head_page(xdp->data);
+		struct page *page = virt_to_head_page(xdp->data);
 
-			put_page(page);
-		} else /* small buffer */
-			kfree_skb(data);
-		/* On error abort to avoid unnecessary kick */
+		put_page(page);
 		return false;
 	}
 
@@ -408,39 +383,50 @@  static bool virtnet_xdp_xmit(struct virtnet_info *vi,
 	return true;
 }
 
+static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
+{
+	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
+}
+
 static struct sk_buff *receive_small(struct net_device *dev,
 				     struct virtnet_info *vi,
 				     struct receive_queue *rq,
 				     void *buf, unsigned int len)
 {
-	struct sk_buff * skb = buf;
+	struct sk_buff *skb;
 	struct bpf_prog *xdp_prog;
-
+	unsigned int xdp_headroom = virtnet_get_headroom(vi);
+	unsigned int header_offset = VIRTNET_RX_PAD + xdp_headroom;
+	unsigned int headroom = vi->hdr_len + header_offset;
+	unsigned int buflen = SKB_DATA_ALIGN(GOOD_PACKET_LEN + headroom) +
+			      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	unsigned int delta = 0;
 	len -= vi->hdr_len;
 
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(rq->xdp_prog);
 	if (xdp_prog) {
-		struct virtio_net_hdr_mrg_rxbuf *hdr = buf;
+		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
 		struct xdp_buff xdp;
+		void *orig_data;
 		u32 act;
 
 		if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
 			goto err_xdp;
 
-		xdp.data_hard_start = skb->data;
-		xdp.data = skb->data + VIRTIO_XDP_HEADROOM;
+		xdp.data_hard_start = buf + VIRTNET_RX_PAD + vi->hdr_len;
+		xdp.data = xdp.data_hard_start + xdp_headroom;
 		xdp.data_end = xdp.data + len;
+		orig_data = xdp.data;
 		act = bpf_prog_run_xdp(xdp_prog, &xdp);
 
 		switch (act) {
 		case XDP_PASS:
 			/* Recalculate length in case bpf program changed it */
-			__skb_pull(skb, xdp.data - xdp.data_hard_start);
-			len = xdp.data_end - xdp.data;
+			delta = orig_data - xdp.data;
 			break;
 		case XDP_TX:
-			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp, skb)))
+			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 			rcu_read_unlock();
 			goto xdp_xmit;
@@ -454,13 +440,25 @@  static struct sk_buff *receive_small(struct net_device *dev,
 	}
 	rcu_read_unlock();
 
-	skb_trim(skb, len);
+	skb = build_skb(buf, buflen);
+	if (!skb) {
+		put_page(virt_to_head_page(buf));
+		goto err;
+	}
+	skb_reserve(skb, headroom - delta);
+	skb_put(skb, len + delta);
+	if (!delta) {
+		buf += header_offset;
+		memcpy(skb_vnet_hdr(skb), buf, vi->hdr_len);
+	} /* keep zeroed vnet hdr since packet was changed by bpf */
+
+err:
 	return skb;
 
 err_xdp:
 	rcu_read_unlock();
 	dev->stats.rx_dropped++;
-	kfree_skb(skb);
+	put_page(virt_to_head_page(buf));
 xdp_xmit:
 	return NULL;
 }
@@ -621,7 +619,7 @@  static struct sk_buff *receive_mergeable(struct net_device *dev,
 			}
 			break;
 		case XDP_TX:
-			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp, data)))
+			if (unlikely(!virtnet_xdp_xmit(vi, rq, &xdp)))
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 			ewma_pkt_len_add(&rq->mrg_avg_pkt_len, len);
 			if (unlikely(xdp_page != page))
@@ -737,7 +735,7 @@  static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 		} else if (vi->big_packets) {
 			give_pages(rq, buf);
 		} else {
-			dev_kfree_skb(buf);
+			put_page(virt_to_head_page(buf));
 		}
 		return 0;
 	}
@@ -780,34 +778,28 @@  static int receive_buf(struct virtnet_info *vi, struct receive_queue *rq,
 	return 0;
 }
 
-static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
-{
-	return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0;
-}
-
 static int add_recvbuf_small(struct virtnet_info *vi, struct receive_queue *rq,
 			     gfp_t gfp)
 {
-	int headroom = GOOD_PACKET_LEN + virtnet_get_headroom(vi);
+	struct page_frag *alloc_frag = &rq->alloc_frag;
+	char *buf;
 	unsigned int xdp_headroom = virtnet_get_headroom(vi);
-	struct sk_buff *skb;
-	struct virtio_net_hdr_mrg_rxbuf *hdr;
+	int len = vi->hdr_len + VIRTNET_RX_PAD + GOOD_PACKET_LEN + xdp_headroom;
 	int err;
 
-	skb = __netdev_alloc_skb_ip_align(vi->dev, headroom, gfp);
-	if (unlikely(!skb))
+	len = SKB_DATA_ALIGN(len) +
+	      SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+	if (unlikely(!skb_page_frag_refill(len, alloc_frag, gfp)))
 		return -ENOMEM;
 
-	skb_put(skb, headroom);
-
-	hdr = skb_vnet_hdr(skb);
-	sg_init_table(rq->sg, 2);
-	sg_set_buf(rq->sg, hdr, vi->hdr_len);
-	skb_to_sgvec(skb, rq->sg + 1, xdp_headroom, skb->len - xdp_headroom);
-
-	err = virtqueue_add_inbuf(rq->vq, rq->sg, 2, skb, gfp);
+	buf = (char *)page_address(alloc_frag->page) + alloc_frag->offset;
+	get_page(alloc_frag->page);
+	alloc_frag->offset += len;
+	sg_init_one(rq->sg, buf + VIRTNET_RX_PAD + xdp_headroom,
+		    vi->hdr_len + GOOD_PACKET_LEN);
+	err = virtqueue_add_inbuf(rq->vq, rq->sg, 1, buf, gfp);
 	if (err < 0)
-		dev_kfree_skb(skb);
+		put_page(virt_to_head_page(buf));
 
 	return err;
 }
@@ -1994,10 +1986,6 @@  static void free_receive_page_frags(struct virtnet_info *vi)
 
 static bool is_xdp_raw_buffer_queue(struct virtnet_info *vi, int q)
 {
-	/* For small receive mode always use kfree_skb variants */
-	if (!vi->mergeable_rx_bufs)
-		return false;
-
 	if (q < (vi->curr_queue_pairs - vi->xdp_queue_pairs))
 		return false;
 	else if (q < vi->curr_queue_pairs)
@@ -2032,7 +2020,7 @@  static void free_unused_bufs(struct virtnet_info *vi)
 			} else if (vi->big_packets) {
 				give_pages(&vi->rq[i], buf);
 			} else {
-				dev_kfree_skb(buf);
+				put_page(virt_to_head_page(buf));
 			}
 		}
 	}