From mboxrd@z Thu Jan 1 00:00:00 1970 From: Jason Wang Subject: Re: [PATCH] net/packet: support vhost mrg_rxbuf Date: Mon, 29 Oct 2018 10:54:19 +0800 Message-ID: <11ee5374-2df6-1d73-2d99-932b6117ccea@redhat.com> References: <20181027120445.21552-1-jianfeng.tan@linux.alibaba.com> Mime-Version: 1.0 Content-Type: text/plain; charset=utf-8; format=flowed Content-Transfer-Encoding: 8bit Cc: davem@davemloft.net, mst@redhat.com To: Jianfeng Tan , netdev@vger.kernel.org Return-path: Received: from mx1.redhat.com ([209.132.183.28]:33142 "EHLO mx1.redhat.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1729117AbeJ2Ll0 (ORCPT ); Mon, 29 Oct 2018 07:41:26 -0400 In-Reply-To: <20181027120445.21552-1-jianfeng.tan@linux.alibaba.com> Content-Language: en-US Sender: netdev-owner@vger.kernel.org List-ID: On 2018/10/27 下午8:04, Jianfeng Tan wrote: > Previously, the virtio net header size was hardcoded to 10, which made > the mrg_rxbuf feature unavailable. > > We redefine the PACKET_VNET_HDR ioctl, which used to treat user input as a > boolean, to treat it as an int: 0, 10, or 12; any other value is treated as 10. > > There is one case that is treated differently: if the user input is > 12, the header size was previously 10, but now it is 12. > > Signed-off-by: Jianfeng Tan This should go to net-next, which is currently closed. Please consider resubmitting once it reopens. 
> --- > net/packet/af_packet.c | 97 ++++++++++++++++++++++++++---------------- > net/packet/diag.c | 2 +- > net/packet/internal.h | 2 +- > 3 files changed, 63 insertions(+), 38 deletions(-) > > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c > index ec3095f13aae..1bd7f4cdcc80 100644 > --- a/net/packet/af_packet.c > +++ b/net/packet/af_packet.c > @@ -1999,18 +1999,24 @@ static unsigned int run_filter(struct sk_buff *skb, > } > > static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb, > - size_t *len) > + size_t *len, int vnet_hdr_len) > { > + int res; > struct virtio_net_hdr vnet_hdr; > > - if (*len < sizeof(vnet_hdr)) > + if (*len < vnet_hdr_len) > return -EINVAL; > - *len -= sizeof(vnet_hdr); > + *len -= vnet_hdr_len; > > if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0)) > return -EINVAL; > > - return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); > + res = memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr)); > + if (res == 0) > + iov_iter_advance(&msg->msg_iter, > + vnet_hdr_len - sizeof(vnet_hdr)); > + > + return res; > } > > /* > @@ -2206,11 +2212,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, > po->tp_reserve; > } else { > unsigned int maclen = skb_network_offset(skb); > + int vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); > + > netoff = TPACKET_ALIGN(po->tp_hdrlen + > (maclen < 16 ? 
16 : maclen)) + > po->tp_reserve; > - if (po->has_vnet_hdr) { > - netoff += sizeof(struct virtio_net_hdr); > + if (vnet_hdr_sz) { > + netoff += vnet_hdr_sz; > do_vnet = true; > } > macoff = netoff - maclen; > @@ -2429,19 +2437,6 @@ static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len) > return 0; > } > > -static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len, > - struct virtio_net_hdr *vnet_hdr) > -{ > - if (*len < sizeof(*vnet_hdr)) > - return -EINVAL; > - *len -= sizeof(*vnet_hdr); > - > - if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter)) > - return -EFAULT; > - > - return __packet_snd_vnet_parse(vnet_hdr, *len); > -} > - > static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, > void *frame, struct net_device *dev, void *data, int tp_len, > __be16 proto, unsigned char *addr, int hlen, int copylen, > @@ -2609,6 +2604,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) > int len_sum = 0; > int status = TP_STATUS_AVAILABLE; > int hlen, tlen, copylen = 0; > + int vnet_hdr_sz; > > mutex_lock(&po->pg_vec_lock); > > @@ -2648,7 +2644,8 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) > size_max = po->tx_ring.frame_size > - (po->tp_hdrlen - sizeof(struct sockaddr_ll)); > > - if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr) > + vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); > + if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !vnet_hdr_sz) > size_max = dev->mtu + reserve + VLAN_HLEN; > > do { > @@ -2668,10 +2665,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) > status = TP_STATUS_SEND_REQUEST; > hlen = LL_RESERVED_SPACE(dev); > tlen = dev->needed_tailroom; > - if (po->has_vnet_hdr) { > + if (vnet_hdr_sz) { > vnet_hdr = data; > - data += sizeof(*vnet_hdr); > - tp_len -= sizeof(*vnet_hdr); > + data += vnet_hdr_sz; > + tp_len -= vnet_hdr_sz; > if (tp_len < 0 || > __packet_snd_vnet_parse(vnet_hdr, tp_len)) { > tp_len = 
-EINVAL; > @@ -2696,7 +2693,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) > addr, hlen, copylen, &sockc); > if (likely(tp_len >= 0) && > tp_len > dev->mtu + reserve && > - !po->has_vnet_hdr && > + !vnet_hdr_sz && > !packet_extra_vlan_len_allowed(dev, skb)) > tp_len = -EMSGSIZE; > > @@ -2715,7 +2712,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) > } > } > > - if (po->has_vnet_hdr) { > + if (vnet_hdr_sz) { > if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) { > tp_len = -EINVAL; > goto tpacket_error; > @@ -2802,9 +2799,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) > int err, reserve = 0; > struct sockcm_cookie sockc; > struct virtio_net_hdr vnet_hdr = { 0 }; > + int vnet_hdr_sz; > int offset = 0; > struct packet_sock *po = pkt_sk(sk); > - bool has_vnet_hdr = false; > int hlen, tlen, linear; > int extra_len = 0; > > @@ -2844,11 +2841,29 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) > > if (sock->type == SOCK_RAW) > reserve = dev->hard_header_len; > - if (po->has_vnet_hdr) { > - err = packet_snd_vnet_parse(msg, &len, &vnet_hdr); > - if (err) > + > + vnet_hdr_sz = READ_ONCE(po->vnet_hdr_sz); > + if (vnet_hdr_sz) { > + if (len < vnet_hdr_sz) { > + err = -EINVAL; > goto out_unlock; > - has_vnet_hdr = true; > + } > + len -= vnet_hdr_sz; > + > + if (!copy_from_iter_full(&vnet_hdr, sizeof(vnet_hdr), > + &msg->msg_iter)) { > + err = -EFAULT; > + goto out_unlock; > + } > + > + if (__packet_snd_vnet_parse(&vnet_hdr, len)) { > + err = -EINVAL; > + goto out_unlock; > + } Any reason to open code packet_snd_vnet_parse() here? > + > + /* TODO: check hdr_len with len? 
*/ > + > + iov_iter_advance(&msg->msg_iter, vnet_hdr_sz - sizeof(vnet_hdr)); > } > > if (unlikely(sock_flag(sk, SOCK_NOFCS))) { > @@ -2912,7 +2927,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) > skb->mark = sockc.mark; > skb->tstamp = sockc.transmit_time; > > - if (has_vnet_hdr) { > + if (vnet_hdr_sz) { > err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le()); > if (err) > goto out_free; > @@ -3307,11 +3322,11 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, > if (pkt_sk(sk)->pressure) > packet_rcv_has_room(pkt_sk(sk), NULL); > > - if (pkt_sk(sk)->has_vnet_hdr) { > - err = packet_rcv_vnet(msg, skb, &len); > + vnet_hdr_len = READ_ONCE(pkt_sk(sk)->vnet_hdr_sz); > + if (vnet_hdr_len) { > + err = packet_rcv_vnet(msg, skb, &len, vnet_hdr_len); > if (err) > goto out_free; > - vnet_hdr_len = sizeof(struct virtio_net_hdr); > } > > /* You lose any data beyond the buffer you gave. If it worries > @@ -3772,7 +3787,17 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv > if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) { > ret = -EBUSY; > } else { > - po->has_vnet_hdr = !!val; > + /* Previouly we treat user input as boolean (!!val), > + * now we treat it as int. After the below correction, > + * the only violation case is 12, which results in > + * vnet header size of 12 instead of 10. > + */ > + if (val && > + val != sizeof(struct virtio_net_hdr) && > + val != sizeof(struct virtio_net_hdr_mrg_rxbuf)) > + val = sizeof(struct virtio_net_hdr); > + > + po->vnet_hdr_sz = val; > ret = 0; > } > release_sock(sk); > @@ -3903,7 +3928,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, > val = po->origdev; > break; > case PACKET_VNET_HDR: > - val = po->has_vnet_hdr; > + val = po->vnet_hdr_sz; So the change here is noticeable by userspace. Maybe we need a new opt for this? 
Thanks > break; > case PACKET_VERSION: > val = po->tp_version; > diff --git a/net/packet/diag.c b/net/packet/diag.c > index 7ef1c881ae74..950015b6704f 100644 > --- a/net/packet/diag.c > +++ b/net/packet/diag.c > @@ -26,7 +26,7 @@ static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb) > pinfo.pdi_flags |= PDI_AUXDATA; > if (po->origdev) > pinfo.pdi_flags |= PDI_ORIGDEV; > - if (po->has_vnet_hdr) > + if (po->vnet_hdr_sz) > pinfo.pdi_flags |= PDI_VNETHDR; > if (po->tp_loss) > pinfo.pdi_flags |= PDI_LOSS; > diff --git a/net/packet/internal.h b/net/packet/internal.h > index 3bb7c5fb3bff..11bc75950f28 100644 > --- a/net/packet/internal.h > +++ b/net/packet/internal.h > @@ -115,9 +115,9 @@ struct packet_sock { > unsigned int running; /* bind_lock must be held */ > unsigned int auxdata:1, /* writer must hold sock lock */ > origdev:1, > - has_vnet_hdr:1, > tp_loss:1, > tp_tx_has_off:1; > + int vnet_hdr_sz; > int pressure; > int ifindex; /* bound device */ > __be16 num;