netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Stefano Garzarella <sgarzare@redhat.com>
To: Arseniy Krasnov <avkrasnov@sberdevices.ru>
Cc: Stefan Hajnoczi <stefanha@redhat.com>,
	 "David S. Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	 Jakub Kicinski <kuba@kernel.org>,
	Paolo Abeni <pabeni@redhat.com>,
	 "Michael S. Tsirkin" <mst@redhat.com>,
	Jason Wang <jasowang@redhat.com>,
	 Bobby Eshleman <bobby.eshleman@bytedance.com>,
	kvm@vger.kernel.org, virtualization@lists.linux-foundation.org,
	 netdev@vger.kernel.org, linux-kernel@vger.kernel.org,
	kernel@sberdevices.ru,  oxffffaa@gmail.com
Subject: Re: [RFC PATCH v4 05/17] vsock/virtio: MSG_ZEROCOPY flag support
Date: Tue, 27 Jun 2023 09:50:50 +0200	[thread overview]
Message-ID: <m5q3fqqvur4pcvkcxx36ivoqu77tsrjd4xna6zszmzq34dbqq5@6wfrhllk6tsq> (raw)
In-Reply-To: <b25f0c4d-828c-ffe8-2780-2c954ed8770a@sberdevices.ru>

On Tue, Jun 27, 2023 at 07:41:51AM +0300, Arseniy Krasnov wrote:
>
>
>On 26.06.2023 19:03, Stefano Garzarella wrote:
>> On Sat, Jun 03, 2023 at 11:49:27PM +0300, Arseniy Krasnov wrote:
>>> This adds handling of MSG_ZEROCOPY flag on transmission path: if this
>>> flag is set and zerocopy transmission is possible, then non-linear skb
>>> will be created and filled with the pages of user's buffer. Pages of
>>> user's buffer are locked in memory by 'get_user_pages()'.
>>>
>>> Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>>> ---
>>> net/vmw_vsock/virtio_transport_common.c | 270 ++++++++++++++++++------
>>> 1 file changed, 208 insertions(+), 62 deletions(-)
>>>
>>> diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
>>> index 0de562c1dc4b..f1ec38c72db7 100644
>>> --- a/net/vmw_vsock/virtio_transport_common.c
>>> +++ b/net/vmw_vsock/virtio_transport_common.c
>>> @@ -37,27 +37,100 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
>>>     return container_of(t, struct virtio_transport, transport);
>>> }
>>>
>>> -/* Returns a new packet on success, otherwise returns NULL.
>>> - *
>>> - * If NULL is returned, errp is set to a negative errno.
>>> - */
>>> -static struct sk_buff *
>>> -virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>> -               size_t len,
>>> -               u32 src_cid,
>>> -               u32 src_port,
>>> -               u32 dst_cid,
>>> -               u32 dst_port)
>>> -{
>>> -    const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
>>> -    struct virtio_vsock_hdr *hdr;
>>> -    struct sk_buff *skb;
>>> +static bool virtio_transport_can_zcopy(struct virtio_vsock_pkt_info *info,
>>> +                       size_t max_to_send)
>>> +{
>>> +    struct iov_iter *iov_iter;
>>> +
>>> +    if (!info->msg)
>>> +        return false;
>>> +
>>> +    iov_iter = &info->msg->msg_iter;
>>> +
>>> +    /* Data is simple buffer. */
>>> +    if (iter_is_ubuf(iov_iter))
>>> +        return true;
>>> +
>>> +    if (!iter_is_iovec(iov_iter))
>>> +        return false;
>>> +
>>> +    if (iov_iter->iov_offset)
>>> +        return false;
>>> +
>>> +    /* We can't send whole iov. */
>>> +    if (iov_iter->count > max_to_send)
>>> +        return false;
>>> +
>>> +    return true;
>>> +}
>>> +
>>> +static int virtio_transport_init_zcopy_skb(struct vsock_sock *vsk,
>>> +                       struct sk_buff *skb,
>>> +                       struct msghdr *msg,
>>> +                       bool zerocopy)
>>> +{
>>> +    struct ubuf_info *uarg;
>>> +
>>> +    if (msg->msg_ubuf) {
>>> +        uarg = msg->msg_ubuf;
>>> +        net_zcopy_get(uarg);
>>> +    } else {
>>> +        struct iov_iter *iter = &msg->msg_iter;
>>> +        struct ubuf_info_msgzc *uarg_zc;
>>> +        int len;
>>> +
>>> +        /* Only ITER_IOVEC or ITER_UBUF are allowed and
>>> +         * checked before.
>>> +         */
>>> +        if (iter_is_iovec(iter))
>>> +            len = iov_length(iter->__iov, iter->nr_segs);
>>> +        else
>>> +            len = iter->count;
>>> +
>>> +        uarg = msg_zerocopy_realloc(sk_vsock(vsk),
>>> +                        len,
>>> +                        NULL);
>>> +
>>> +        if (!uarg)
>>> +            return -1;
>>> +
>>> +        uarg_zc = uarg_to_msgzc(uarg);
>>> +        uarg_zc->zerocopy = zerocopy ? 1 : 0;
>>> +    }
>>> +
>>> +    skb_zcopy_init(skb, uarg);
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static int virtio_transport_fill_linear_skb(struct sk_buff *skb,
>>> +                        struct vsock_sock *vsk,
>>
>> `vsk` seems unused
>>
>>> +                        struct virtio_vsock_pkt_info *info,
>>> +                        size_t len)
>>> +{
>>>     void *payload;
>>>     int err;
>>>
>>> -    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>>> -    if (!skb)
>>> -        return NULL;
>>> +    payload = skb_put(skb, len);
>>> +    err = memcpy_from_msg(payload, info->msg, len);
>>> +    if (err)
>>> +        return -1;
>>> +
>>> +    if (msg_data_left(info->msg))
>>> +        return 0;
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> +static void virtio_transport_init_hdr(struct sk_buff *skb,
>>> +                      struct virtio_vsock_pkt_info *info,
>>> +                      u32 src_cid,
>>> +                      u32 src_port,
>>> +                      u32 dst_cid,
>>> +                      u32 dst_port,
>>> +                      size_t len)
>>> +{
>>> +    struct virtio_vsock_hdr *hdr;
>>>
>>>     hdr = virtio_vsock_hdr(skb);
>>>     hdr->type    = cpu_to_le16(info->type);
>>> @@ -68,42 +141,6 @@ virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
>>>     hdr->dst_port    = cpu_to_le32(dst_port);
>>>     hdr->flags    = cpu_to_le32(info->flags);
>>>     hdr->len    = cpu_to_le32(len);
>>> -
>>> -    if (info->msg && len > 0) {
>>> -        payload = skb_put(skb, len);
>>> -        err = memcpy_from_msg(payload, info->msg, len);
>>> -        if (err)
>>> -            goto out;
>>> -
>>> -        if (msg_data_left(info->msg) == 0 &&
>>> -            info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>>> -            hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>>> -
>>> -            if (info->msg->msg_flags & MSG_EOR)
>>> -                hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>>> -        }
>>> -    }
>>> -
>>> -    if (info->reply)
>>> -        virtio_vsock_skb_set_reply(skb);
>>> -
>>> -    trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>> -                     dst_cid, dst_port,
>>> -                     len,
>>> -                     info->type,
>>> -                     info->op,
>>> -                     info->flags);
>>> -
>>> -    if (info->vsk && !skb_set_owner_sk_safe(skb, sk_vsock(info->vsk))) {
>>> -        WARN_ONCE(1, "failed to allocate skb on vsock socket with sk_refcnt == 0\n");
>>> -        goto out;
>>> -    }
>>> -
>>> -    return skb;
>>> -
>>> -out:
>>> -    kfree_skb(skb);
>>> -    return NULL;
>>> }
>>>
>>> static void virtio_transport_copy_nonlinear_skb(struct sk_buff *skb,
>>> @@ -214,6 +251,85 @@ static u16 virtio_transport_get_type(struct sock *sk)
>>>         return VIRTIO_VSOCK_TYPE_SEQPACKET;
>>> }
>>>
>>> +/* Returns a new packet on success, otherwise returns NULL.
>>> + *
>>> + * If NULL is returned, errp is set to a negative errno.
>>
>> I had noticed this in Bobby's patches; I think it's a stale comment we
>> left around (the function has no `errp` parameter).
>>
>>> + */
>>> +static struct sk_buff *virtio_transport_alloc_skb(struct vsock_sock *vsk,
>>> +                          struct virtio_vsock_pkt_info *info,
>>> +                          size_t payload_len,
>>> +                          bool zcopy,
>>> +                          u32 dst_cid,
>>> +                          u32 dst_port,
>>> +                          u32 src_cid,
>>> +                          u32 src_port)
>>> +{
>>> +    struct sk_buff *skb;
>>> +    size_t skb_len;
>>> +
>>> +    skb_len = VIRTIO_VSOCK_SKB_HEADROOM;
>>> +
>>> +    if (!zcopy)
>>> +        skb_len += payload_len;
>>> +
>>> +    skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
>>> +    if (!skb)
>>> +        return NULL;
>>> +
>>> +    virtio_transport_init_hdr(skb, info, src_cid, src_port,
>>> +                  dst_cid, dst_port,
>>> +                  payload_len);
>>> +
>>> +    /* Set owner here, because '__zerocopy_sg_from_iter()' uses
>>> +     * owner of skb without check to update 'sk_wmem_alloc'.
>>> +     */
>>> +    if (vsk)
>>> +        skb_set_owner_w(skb, sk_vsock(vsk));
>>
>> Why are we moving from skb_set_owner_sk_safe() to skb_set_owner_w()?
>>
>> We should mention this in the commit description.
>>
>>> +
>>> +    if (info->msg && payload_len > 0) {
>>> +        int err;
>>> +
>>> +        if (zcopy) {
>>> +            err = __zerocopy_sg_from_iter(info->msg, NULL, skb,
>>> +                              &info->msg->msg_iter,
>>> +                              payload_len);
>>> +        } else {
>>> +            err = virtio_transport_fill_linear_skb(skb, vsk, info, payload_len);
>>> +        }
>>> +
>>> +        if (err)
>>> +            goto out;
>>> +
>>> +        VIRTIO_VSOCK_SKB_CB(skb)->frag_off = 0;
>>> +
>>> +        if (info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
>>> +            struct virtio_vsock_hdr *hdr;
>>> +
>>> +            hdr = virtio_vsock_hdr(skb);
>>
>> Just `struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);` should be
>> fine.
>>
>>> +
>>> +            hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
>>> +
>>> +            if (info->msg->msg_flags & MSG_EOR)
>>> +                hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
>>> +        }
>>> +    }
>>> +
>>> +    if (info->reply)
>>> +        virtio_vsock_skb_set_reply(skb);
>>> +
>>> +    trace_virtio_transport_alloc_pkt(src_cid, src_port,
>>> +                     dst_cid, dst_port,
>>> +                     payload_len,
>>> +                     info->type,
>>> +                     info->op,
>>> +                     info->flags);
>>> +
>>> +    return skb;
>>> +out:
>>> +    kfree_skb(skb);
>>> +    return NULL;
>>> +}
>>> +
>>> /* This function can only be used on connecting/connected sockets,
>>>  * since a socket assigned to a transport is required.
>>>  *
>>> @@ -226,6 +342,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>>     const struct virtio_transport *t_ops;
>>>     struct virtio_vsock_sock *vvs;
>>>     u32 pkt_len = info->pkt_len;
>>> +    bool can_zcopy = false;
>>> +    u32 max_skb_cap;
>>>     u32 rest_len;
>>>     int ret;
>>>
>>> @@ -254,22 +372,49 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
>>>     if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
>>>         return pkt_len;
>>>
>>> +    /* If zerocopy is not enabled by 'setsockopt()', we behave as
>>> +     * there is no MSG_ZEROCOPY flag set.
>>> +     */
>>> +    if (!sock_flag(sk_vsock(vsk), SOCK_ZEROCOPY))
>>> +        info->flags &= ~MSG_ZEROCOPY;
>>> +
>>> +    if (info->flags & MSG_ZEROCOPY)
>>> +        can_zcopy = virtio_transport_can_zcopy(info, pkt_len);
>>> +
>>> +    if (can_zcopy)
>>> +        max_skb_cap = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE,
>>> +                    (MAX_SKB_FRAGS * PAGE_SIZE));
>>> +    else
>>> +        max_skb_cap = VIRTIO_VSOCK_MAX_PKT_BUF_SIZE;
>>> +
>>
>> We use `len` very often, what about `max_skb_len`?
>>
>>>     rest_len = pkt_len;
>>>
>>>     do {
>>>         struct sk_buff *skb;
>>>         size_t skb_len;
>>>
>>> -        skb_len = min_t(u32, VIRTIO_VSOCK_MAX_PKT_BUF_SIZE, rest_len);
>>> +        skb_len = min(max_skb_cap, rest_len);
>>>
>>> -        skb = virtio_transport_alloc_skb(info, skb_len,
>>> -                         src_cid, src_port,
>>> -                         dst_cid, dst_port);
>>> +        skb = virtio_transport_alloc_skb(vsk, info, skb_len, can_zcopy,
>>> +                         dst_cid, dst_port,
>>> +                         src_cid, src_port);
>>>         if (!skb) {
>>>             ret = -ENOMEM;
>>>             break;
>>>         }
>>>
>>> +        /* This is last skb to send this portion of data. */
>>> +        if (skb_len == rest_len &&
>>> +            info->flags & MSG_ZEROCOPY &&
>>> +            info->op == VIRTIO_VSOCK_OP_RW) {
>>> +            if (virtio_transport_init_zcopy_skb(vsk, skb,
>>> +                                info->msg,
>>> +                                can_zcopy)) {
>>> +                ret = -ENOMEM;
>>> +                break;
>>> +            }
>>> +        }
>>> +
>>>         virtio_transport_inc_tx_pkt(vvs, skb);
>>>
>>>         ret = t_ops->send_pkt(skb);
>>> @@ -884,6 +1029,7 @@ virtio_transport_stream_enqueue(struct vsock_sock *vsk,
>>>         .msg = msg,
>>>         .pkt_len = len,
>>>         .vsk = vsk,
>>> +        .flags = msg->msg_flags,
>>
>> These flags then get copied into the virtio_vsock_hdr, which I don't
>> think is a good idea.
>>
>> Why not use info->msg->msg_flags directly?
>
>Oops, yes, it's a bug. You're right, this is really wrong, as there are two different
>sets of flags - the MSG_XXX flags passed to the syscall and the flags in the packet header.

Yep.

What about the move from skb_set_owner_sk_safe() to skb_set_owner_w()?
Was it intentional? If so, can you explain why?


Thanks,
Stefano


  reply	other threads:[~2023-06-27  7:51 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-06-03 20:49 [RFC PATCH v4 00/17] vsock: MSG_ZEROCOPY flag support Arseniy Krasnov
2023-06-03 20:49 ` [RFC PATCH v4 01/17] vsock/virtio: read data from non-linear skb Arseniy Krasnov
2023-06-12 17:43   ` Bobby Eshleman
2023-06-26 15:20   ` Stefano Garzarella
2023-06-03 20:49 ` [RFC PATCH v4 02/17] vhost/vsock: " Arseniy Krasnov
2023-06-12 17:53   ` Bobby Eshleman
2023-06-26 15:24   ` Stefano Garzarella
2023-06-03 20:49 ` [RFC PATCH v4 03/17] vsock/virtio: support to send " Arseniy Krasnov
2023-06-12 18:30   ` Bobby Eshleman
2023-06-26 15:36   ` Stefano Garzarella
2023-06-27  4:39     ` Arseniy Krasnov
2023-06-27  7:49       ` Stefano Garzarella
2023-06-03 20:49 ` [RFC PATCH v4 04/17] vsock/virtio: non-linear skb handling for tap Arseniy Krasnov
2023-06-26 15:43   ` Stefano Garzarella
2023-06-03 20:49 ` [RFC PATCH v4 05/17] vsock/virtio: MSG_ZEROCOPY flag support Arseniy Krasnov
2023-06-26 16:03   ` Stefano Garzarella
2023-06-27  4:41     ` Arseniy Krasnov
2023-06-27  7:50       ` Stefano Garzarella [this message]
2023-06-27  8:22         ` Arseniy Krasnov
2023-06-29 12:32           ` Stefano Garzarella
2023-06-03 20:49 ` [RFC PATCH v4 06/17] vsock: check error queue to set EPOLLERR Arseniy Krasnov
2023-06-26 16:04   ` Stefano Garzarella
2023-06-27  4:44     ` Arseniy Krasnov
2023-06-27  7:53       ` Stefano Garzarella
2023-06-03 20:49 ` [RFC PATCH v4 07/17] vsock: read from socket's error queue Arseniy Krasnov
2023-06-26 16:08   ` Stefano Garzarella
2023-06-27  4:49     ` Arseniy Krasnov
2023-06-27  7:58       ` Stefano Garzarella
2023-06-03 20:49 ` [RFC PATCH v4 08/17] vsock: check for MSG_ZEROCOPY support on send Arseniy Krasnov
2023-06-03 20:49 ` [RFC PATCH v4 09/17] vsock: enable SOCK_SUPPORT_ZC bit Arseniy Krasnov
2023-06-03 20:49 ` [RFC PATCH v4 10/17] vhost/vsock: support MSG_ZEROCOPY for transport Arseniy Krasnov
2023-06-26 16:10   ` Stefano Garzarella
2023-06-03 20:49 ` [RFC PATCH v4 11/17] vsock/virtio: " Arseniy Krasnov
2023-06-26 16:11   ` Stefano Garzarella
2023-06-03 20:49 ` [RFC PATCH v4 12/17] vsock/loopback: " Arseniy Krasnov
2023-06-26 16:14   ` Stefano Garzarella
2023-06-03 20:49 ` [RFC PATCH v4 13/17] net/sock: enable setting SO_ZEROCOPY for PF_VSOCK Arseniy Krasnov
2023-06-03 20:49 ` [RFC PATCH v4 14/17] docs: net: description of MSG_ZEROCOPY for AF_VSOCK Arseniy Krasnov
2023-06-03 20:49 ` [RFC PATCH v4 15/17] test/vsock: MSG_ZEROCOPY flag tests Arseniy Krasnov
2023-06-03 20:49 ` [RFC PATCH v4 16/17] test/vsock: MSG_ZEROCOPY support for vsock_perf Arseniy Krasnov
2023-06-03 20:49 ` [RFC PATCH v4 17/17] test/vsock: io_uring rx/tx tests Arseniy Krasnov
2023-06-12 17:20 ` [RFC PATCH v4 00/17] vsock: MSG_ZEROCOPY flag support Bobby Eshleman
2023-06-14  5:39   ` Arseniy Krasnov
2023-06-26 16:15 ` Stefano Garzarella
2023-06-27  4:55   ` Arseniy Krasnov
2023-06-27  8:01     ` Stefano Garzarella

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=m5q3fqqvur4pcvkcxx36ivoqu77tsrjd4xna6zszmzq34dbqq5@6wfrhllk6tsq \
    --to=sgarzare@redhat.com \
    --cc=avkrasnov@sberdevices.ru \
    --cc=bobby.eshleman@bytedance.com \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=jasowang@redhat.com \
    --cc=kernel@sberdevices.ru \
    --cc=kuba@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=oxffffaa@gmail.com \
    --cc=pabeni@redhat.com \
    --cc=stefanha@redhat.com \
    --cc=virtualization@lists.linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).