All of lore.kernel.org
 help / color / mirror / Atom feed
From: Stefano Garzarella <sgarzare@redhat.com>
To: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
Cc: Stefan Hajnoczi <stefanha@redhat.com>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	Jason Wang <jasowang@redhat.com>,
	"David S. Miller" <davem@davemloft.net>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
	"virtualization@lists.linux-foundation.org" 
	<virtualization@lists.linux-foundation.org>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	kernel <kernel@sberdevices.ru>,
	Krasnov Arseniy <oxffffaa@gmail.com>
Subject: Re: [RFC PATCH v2 3/8] af_vsock: add zerocopy receive logic
Date: Mon, 13 Jun 2022 10:50:46 +0200	[thread overview]
Message-ID: <20220613085046.ee7cb2ye5yq5cbfo@sgarzare-redhat> (raw)
In-Reply-To: <204f5bc4-987e-a1ff-71e2-e51343e13f24@sberdevices.ru>

On Thu, Jun 09, 2022 at 12:20:22PM +0000, Arseniy Krasnov wrote:
>On 09.06.2022 11:39, Stefano Garzarella wrote:
>> On Fri, Jun 03, 2022 at 05:35:48AM +0000, Arseniy Krasnov wrote:
>>> This:
>>> 1) Adds callback for 'mmap()' call on socket. It checks vm
>>>   area flags and sets vm area ops.
>>> 2) Adds special 'getsockopt()' case which calls transport
>>>   zerocopy callback. Input argument is vm area address.
>>> 3) Adds 'getsockopt()/setsockopt()' for switching on/off rx
>>>   zerocopy mode.
>>>
>>> Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>>> ---
>>> include/net/af_vsock.h          |   7 +++
>>> include/uapi/linux/vm_sockets.h |   3 +
>>> net/vmw_vsock/af_vsock.c        | 100 ++++++++++++++++++++++++++++++++
>>> 3 files changed, 110 insertions(+)
>>>
>>> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>>> index f742e50207fb..f15f84c648ff 100644
>>> --- a/include/net/af_vsock.h
>>> +++ b/include/net/af_vsock.h
>>> @@ -135,6 +135,13 @@ struct vsock_transport {
>>>     bool (*stream_is_active)(struct vsock_sock *);
>>>     bool (*stream_allow)(u32 cid, u32 port);
>>>
>>> +    int (*rx_zerocopy_set)(struct vsock_sock *vsk,
>>> +                   bool enable);
>>> +    int (*rx_zerocopy_get)(struct vsock_sock *vsk);
>>> +    int (*zerocopy_dequeue)(struct vsock_sock *vsk,
>>> +                struct vm_area_struct *vma,
>>> +                unsigned long addr);
>>> +
>>>     /* SEQ_PACKET. */
>>>     ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
>>>                      int flags);
>>> diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
>>> index c60ca33eac59..d1f792bed1a7 100644
>>> --- a/include/uapi/linux/vm_sockets.h
>>> +++ b/include/uapi/linux/vm_sockets.h
>>> @@ -83,6 +83,9 @@
>>>
>>> #define SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW 8
>>>
>>> +#define SO_VM_SOCKETS_MAP_RX 9
>>> +#define SO_VM_SOCKETS_ZEROCOPY 10
>>> +
>>> #if !defined(__KERNEL__)
>>> #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
>>> #define SO_VM_SOCKETS_CONNECT_TIMEOUT SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD
>>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>>> index f04abf662ec6..10061ef21730 100644
>>> --- a/net/vmw_vsock/af_vsock.c
>>> +++ b/net/vmw_vsock/af_vsock.c
>>> @@ -1644,6 +1644,17 @@ static int vsock_connectible_setsockopt(struct socket *sock,
>>>         }
>>>         break;
>>>     }
>>> +    case SO_VM_SOCKETS_ZEROCOPY: {
>>> +        if (!transport || !transport->rx_zerocopy_set) {
>>> +            err = -EOPNOTSUPP;
>>> +        } else {
>>> +            COPY_IN(val);
>>> +
>>> +            if (transport->rx_zerocopy_set(vsk, val))
>>> +                err = -EINVAL;
>>> +        }
>>> +        break;
>>> +    }
>>>
>>>     default:
>>>         err = -ENOPROTOOPT;
>>> @@ -1657,6 +1668,48 @@ static int vsock_connectible_setsockopt(struct socket *sock,
>>>     return err;
>>> }
>>>
>>> +static const struct vm_operations_struct afvsock_vm_ops = {
>>> +};
>>> +
>>> +static int vsock_recv_zerocopy(struct socket *sock,
>>> +                   unsigned long address)
>>> +{
>>> +    struct sock *sk = sock->sk;
>>> +    struct vsock_sock *vsk = vsock_sk(sk);
>>> +    struct vm_area_struct *vma;
>>> +    const struct vsock_transport *transport;
>>> +    int res;
>>> +
>>> +    transport = vsk->transport;
>>> +
>>> +    if (!transport->rx_zerocopy_get)
>>> +        return -EOPNOTSUPP;
>>> +
>>> +    if (!transport->rx_zerocopy_get(vsk))
>>> +        return -EOPNOTSUPP;
>>
>> Maybe we can merge in
>>         if (!transport->rx_zerocopy_get ||
>>             !transport->rx_zerocopy_get(vsk))
>>                 return -EOPNOTSUPP;
>>
>>> +
>>> +    if (!transport->zerocopy_dequeue)
>>> +        return -EOPNOTSUPP;
>>> +
>>> +    lock_sock(sk);
>>> +    mmap_write_lock(current->mm);
>>
>> So, multiple threads using different sockets are serialized if they use zero-copy?
>>
>> IIUC this is necessary because the callback calls vm_insert_page().
>>
>> At this point I think it's better not to do this in every transport, but to have the callback return an array of pages to map, and then map them here, keeping the critical section protected by mmap_write_lock() as short as possible.
>
>Yes, it will be easy to return array of pages by transport callback,
>
>>
>>> +
>>> +    vma = vma_lookup(current->mm, address);
>>> +
>>> +    if (!vma || vma->vm_ops != &afvsock_vm_ops) {
>>> +        mmap_write_unlock(current->mm);
>>> +        release_sock(sk);
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    res = transport->zerocopy_dequeue(vsk, vma, address);
>>> +
>>> +    mmap_write_unlock(current->mm);
>>> +    release_sock(sk);
>>> +
>>> +    return res;
>>> +}
>>> +
>>> static int vsock_connectible_getsockopt(struct socket *sock,
>>>                     int level, int optname,
>>>                     char __user *optval,
>>> @@ -1701,6 +1754,39 @@ static int vsock_connectible_getsockopt(struct socket *sock,
>>>         lv = sock_get_timeout(vsk->connect_timeout, &v,
>>>                       optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
>>>         break;
>>> +    case SO_VM_SOCKETS_ZEROCOPY: {
>>> +        const struct vsock_transport *transport;
>>> +        int res;
>>> +
>>> +        transport = vsk->transport;
>>> +
>>> +        if (!transport->rx_zerocopy_get)
>>> +            return -EOPNOTSUPP;
>>> +
>>> +        lock_sock(sk);
>>
>> I think we should call lock_sock() before reading the transport to avoid races and we should check if it is assigned.
>>
>> At that point I think it is better to store this info in vsock_sock and not in the transport.
>You mean to store flag that zerocopy is enabled in 'vsock_sock', just reading it here, without touching transport?

Yep.

Thanks,
Stefano


WARNING: multiple messages have this Message-ID (diff)
From: Stefano Garzarella <sgarzare@redhat.com>
To: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
Cc: Krasnov Arseniy <oxffffaa@gmail.com>,
	"kvm@vger.kernel.org" <kvm@vger.kernel.org>,
	"Michael S. Tsirkin" <mst@redhat.com>,
	"netdev@vger.kernel.org" <netdev@vger.kernel.org>,
	"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>,
	"virtualization@lists.linux-foundation.org"
	<virtualization@lists.linux-foundation.org>,
	Stefan Hajnoczi <stefanha@redhat.com>,
	kernel <kernel@sberdevices.ru>, Jakub Kicinski <kuba@kernel.org>,
	Paolo Abeni <pabeni@redhat.com>,
	"David S. Miller" <davem@davemloft.net>
Subject: Re: [RFC PATCH v2 3/8] af_vsock: add zerocopy receive logic
Date: Mon, 13 Jun 2022 10:50:46 +0200	[thread overview]
Message-ID: <20220613085046.ee7cb2ye5yq5cbfo@sgarzare-redhat> (raw)
In-Reply-To: <204f5bc4-987e-a1ff-71e2-e51343e13f24@sberdevices.ru>

On Thu, Jun 09, 2022 at 12:20:22PM +0000, Arseniy Krasnov wrote:
>On 09.06.2022 11:39, Stefano Garzarella wrote:
>> On Fri, Jun 03, 2022 at 05:35:48AM +0000, Arseniy Krasnov wrote:
>>> This:
>>> 1) Adds callback for 'mmap()' call on socket. It checks vm
>>>   area flags and sets vm area ops.
>>> 2) Adds special 'getsockopt()' case which calls transport
>>>   zerocopy callback. Input argument is vm area address.
>>> 3) Adds 'getsockopt()/setsockopt()' for switching on/off rx
>>>   zerocopy mode.
>>>
>>> Signed-off-by: Arseniy Krasnov <AVKrasnov@sberdevices.ru>
>>> ---
>>> include/net/af_vsock.h          |   7 +++
>>> include/uapi/linux/vm_sockets.h |   3 +
>>> net/vmw_vsock/af_vsock.c        | 100 ++++++++++++++++++++++++++++++++
>>> 3 files changed, 110 insertions(+)
>>>
>>> diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
>>> index f742e50207fb..f15f84c648ff 100644
>>> --- a/include/net/af_vsock.h
>>> +++ b/include/net/af_vsock.h
>>> @@ -135,6 +135,13 @@ struct vsock_transport {
>>>     bool (*stream_is_active)(struct vsock_sock *);
>>>     bool (*stream_allow)(u32 cid, u32 port);
>>>
>>> +    int (*rx_zerocopy_set)(struct vsock_sock *vsk,
>>> +                   bool enable);
>>> +    int (*rx_zerocopy_get)(struct vsock_sock *vsk);
>>> +    int (*zerocopy_dequeue)(struct vsock_sock *vsk,
>>> +                struct vm_area_struct *vma,
>>> +                unsigned long addr);
>>> +
>>>     /* SEQ_PACKET. */
>>>     ssize_t (*seqpacket_dequeue)(struct vsock_sock *vsk, struct msghdr *msg,
>>>                      int flags);
>>> diff --git a/include/uapi/linux/vm_sockets.h b/include/uapi/linux/vm_sockets.h
>>> index c60ca33eac59..d1f792bed1a7 100644
>>> --- a/include/uapi/linux/vm_sockets.h
>>> +++ b/include/uapi/linux/vm_sockets.h
>>> @@ -83,6 +83,9 @@
>>>
>>> #define SO_VM_SOCKETS_CONNECT_TIMEOUT_NEW 8
>>>
>>> +#define SO_VM_SOCKETS_MAP_RX 9
>>> +#define SO_VM_SOCKETS_ZEROCOPY 10
>>> +
>>> #if !defined(__KERNEL__)
>>> #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
>>> #define SO_VM_SOCKETS_CONNECT_TIMEOUT SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD
>>> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
>>> index f04abf662ec6..10061ef21730 100644
>>> --- a/net/vmw_vsock/af_vsock.c
>>> +++ b/net/vmw_vsock/af_vsock.c
>>> @@ -1644,6 +1644,17 @@ static int vsock_connectible_setsockopt(struct socket *sock,
>>>         }
>>>         break;
>>>     }
>>> +    case SO_VM_SOCKETS_ZEROCOPY: {
>>> +        if (!transport || !transport->rx_zerocopy_set) {
>>> +            err = -EOPNOTSUPP;
>>> +        } else {
>>> +            COPY_IN(val);
>>> +
>>> +            if (transport->rx_zerocopy_set(vsk, val))
>>> +                err = -EINVAL;
>>> +        }
>>> +        break;
>>> +    }
>>>
>>>     default:
>>>         err = -ENOPROTOOPT;
>>> @@ -1657,6 +1668,48 @@ static int vsock_connectible_setsockopt(struct socket *sock,
>>>     return err;
>>> }
>>>
>>> +static const struct vm_operations_struct afvsock_vm_ops = {
>>> +};
>>> +
>>> +static int vsock_recv_zerocopy(struct socket *sock,
>>> +                   unsigned long address)
>>> +{
>>> +    struct sock *sk = sock->sk;
>>> +    struct vsock_sock *vsk = vsock_sk(sk);
>>> +    struct vm_area_struct *vma;
>>> +    const struct vsock_transport *transport;
>>> +    int res;
>>> +
>>> +    transport = vsk->transport;
>>> +
>>> +    if (!transport->rx_zerocopy_get)
>>> +        return -EOPNOTSUPP;
>>> +
>>> +    if (!transport->rx_zerocopy_get(vsk))
>>> +        return -EOPNOTSUPP;
>>
>> Maybe we can merge in
>>         if (!transport->rx_zerocopy_get ||
>>             !transport->rx_zerocopy_get(vsk))
>>                 return -EOPNOTSUPP;
>>
>>> +
>>> +    if (!transport->zerocopy_dequeue)
>>> +        return -EOPNOTSUPP;
>>> +
>>> +    lock_sock(sk);
>>> +    mmap_write_lock(current->mm);
>>
>> So, multiple threads using different sockets are serialized if they use zero-copy?
>>
>> IIUC this is necessary because the callback calls vm_insert_page().
>>
>> At this point I think it's better not to do this in every transport, but to have the callback return an array of pages to map, and then map them here, keeping the critical section protected by mmap_write_lock() as short as possible.
>
>Yes, it will be easy to return array of pages by transport callback,
>
>>
>>> +
>>> +    vma = vma_lookup(current->mm, address);
>>> +
>>> +    if (!vma || vma->vm_ops != &afvsock_vm_ops) {
>>> +        mmap_write_unlock(current->mm);
>>> +        release_sock(sk);
>>> +        return -EINVAL;
>>> +    }
>>> +
>>> +    res = transport->zerocopy_dequeue(vsk, vma, address);
>>> +
>>> +    mmap_write_unlock(current->mm);
>>> +    release_sock(sk);
>>> +
>>> +    return res;
>>> +}
>>> +
>>> static int vsock_connectible_getsockopt(struct socket *sock,
>>>                     int level, int optname,
>>>                     char __user *optval,
>>> @@ -1701,6 +1754,39 @@ static int vsock_connectible_getsockopt(struct socket *sock,
>>>         lv = sock_get_timeout(vsk->connect_timeout, &v,
>>>                       optname == SO_VM_SOCKETS_CONNECT_TIMEOUT_OLD);
>>>         break;
>>> +    case SO_VM_SOCKETS_ZEROCOPY: {
>>> +        const struct vsock_transport *transport;
>>> +        int res;
>>> +
>>> +        transport = vsk->transport;
>>> +
>>> +        if (!transport->rx_zerocopy_get)
>>> +            return -EOPNOTSUPP;
>>> +
>>> +        lock_sock(sk);
>>
>> I think we should call lock_sock() before reading the transport to avoid races and we should check if it is assigned.
>>
>> At that point I think it is better to store this info in vsock_sock and not in the transport.
>You mean to store flag that zerocopy is enabled in 'vsock_sock', just reading it here, without touching transport?

Yep.

Thanks,
Stefano

_______________________________________________
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

  reply	other threads:[~2022-06-13  8:51 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-06-03  5:27 [RFC PATCH v2 0/8] virtio/vsock: experimental zerocopy receive Arseniy Krasnov
2022-06-03  5:31 ` [RFC PATCH v2 1/8] virtio/vsock: rework packet allocation logic Arseniy Krasnov
2022-06-09  8:37   ` Stefano Garzarella
2022-06-09  8:37     ` Stefano Garzarella
2022-06-03  5:33 ` [RFC PATCH v2 2/8] vhost/vsock: " Arseniy Krasnov
2022-06-09  8:38   ` Stefano Garzarella
2022-06-09  8:38     ` Stefano Garzarella
2022-06-09 12:09     ` Arseniy Krasnov
2022-06-03  5:35 ` [RFC PATCH v2 3/8] af_vsock: add zerocopy receive logic Arseniy Krasnov
2022-06-09  8:39   ` Stefano Garzarella
2022-06-09  8:39     ` Stefano Garzarella
2022-06-09 12:20     ` Arseniy Krasnov
2022-06-13  8:50       ` Stefano Garzarella [this message]
2022-06-13  8:50         ` Stefano Garzarella
2022-06-03  5:37 ` [RFC PATCH v2 4/8] virtio/vsock: add transport zerocopy callback Arseniy Krasnov
2022-06-09  8:41   ` Stefano Garzarella
2022-06-09  8:41     ` Stefano Garzarella
2022-06-09 12:23     ` Arseniy Krasnov
2022-06-03  5:39 ` [RFC PATCH v2 5/8] vhost/vsock: enable " Arseniy Krasnov
2022-06-03  5:41 ` [RFC PATCH v2 6/8] virtio/vsock: " Arseniy Krasnov
2022-06-03  5:43 ` [RFC PATCH v2 7/8] test/vsock: add receive zerocopy tests Arseniy Krasnov
2022-06-03  5:44 ` [RFC PATCH v2 8/8] test/vsock: vsock rx zerocopy utility Arseniy Krasnov
2022-06-08 15:59 ` [RFC PATCH v2 0/8] virtio/vsock: experimental zerocopy receive Zhao Liu
2022-06-09  8:54 ` Stefano Garzarella
2022-06-09  8:54   ` Stefano Garzarella
2022-06-09 12:33   ` Arseniy Krasnov
2022-06-13  8:54     ` Stefano Garzarella
2022-06-13  8:54       ` Stefano Garzarella
2022-06-14  4:23       ` Arseniy Krasnov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220613085046.ee7cb2ye5yq5cbfo@sgarzare-redhat \
    --to=sgarzare@redhat.com \
    --cc=AVKrasnov@sberdevices.ru \
    --cc=davem@davemloft.net \
    --cc=jasowang@redhat.com \
    --cc=kernel@sberdevices.ru \
    --cc=kuba@kernel.org \
    --cc=kvm@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=mst@redhat.com \
    --cc=netdev@vger.kernel.org \
    --cc=oxffffaa@gmail.com \
    --cc=pabeni@redhat.com \
    --cc=stefanha@redhat.com \
    --cc=virtualization@lists.linux-foundation.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.